How do I define a variable as a scraped element using Puppeteer - javascript

const puppeteer = require('puppeteer');
(async () => {
const browser = await puppeteer.launch({
headless: false,
defaultViewport: null
})
const page = await browser.newPage()
await page.goto('https://www.supremenewyork.com/shop/sweatshirts/ftq968f24/lhrblx1z5')
var productName = await page.evaluate(() => {
document.querySelector('div[id="details"] > p[itemprop="model"]').innerText;
})
console.log(productName);
})()
When I run my code that is supposed to grab the name of the supreme item, it says undefined when it's supposed to log it in the console.

You are neither returning anything from the page.evaluate nor are you setting the value of productName. Try something like this instead that uses $eval to return the innerText of the matching element:
const puppeteer = require("puppeteer");
(async () => {
const browser = await puppeteer.launch({
headless: false,
defaultViewport: null,
});
const page = await browser.newPage();
await page.goto(
"https://www.supremenewyork.com/shop/sweatshirts/ftq968f24/lhrblx1z5"
);
const productName = await page.$eval(
'div[id="details"] > p[itemprop="model"]',
(el) => el.innerText
);
console.log(productName);
})();
If you prefer to use evaluate it would look like:
const puppeteer = require("puppeteer");
(async () => {
const browser = await puppeteer.launch({
headless: false,
defaultViewport: null,
});
const page = await browser.newPage();
await page.goto(
"https://www.supremenewyork.com/shop/sweatshirts/ftq968f24/lhrblx1z5"
);
const productName = await page.evaluate(() => {
// notice the return
return document.querySelector('div[id="details"] > p[itemprop="model"]').innerText;
});
console.log(productName);
})();
If innerText doesn't return anything you may instead need to use something like textContent.
Hopefully that helps!

Related

Having trouble scraping a particular element on a website using Puppeteer

I am trying to scrape the key features part of the website with the URL of: "https://www.alpinestars.com/products/stella-missile-v2-1-piece-suit-1" using puppeteer - however, whenever I try to use a selector that works on the chrome console for the website the output for my code is always an empty array or object. For example both document.querySelectorAll("#key\ features > p") and document.getElementById('key features') both return as empty arrays or objects when I output it through my code but work via chrome console.
I have attached my code below:
const puppeteer = require('puppeteer');
async function getDescripData(url) {
const browser = await puppeteer.launch({headless: true});
const page = await browser.newPage();
await page.goto(url);
const descripFeatures = await page.evaluate(() => {
const tds = Array.from(document.getElementById('key features'))
console.log(tds)
return tds.map(td => td.innerText)
});
console.log(descripFeatures)
await browser.close();
return {
features: descripFeatures
}
}
How should I go about overcoming this issue?
Thanks in advance!
Your problem is in Array.from you are passing a non-iterable object and return null.
This works for me:
const puppeteer = require('puppeteer');
const url = 'https://www.alpinestars.com/products/stella-missile-v2-1-piece-suit-1';
(async () => {
const browser = await puppeteer.launch({
headless: false,
defaultViewport: null,
args: ['--start-maximized'],
devtools: true
});
const page = (await browser.pages())[0];
await page.goto(url);
const descripFeatures = await page.evaluate(() => {
const tds = document.getElementById('key features').innerText;
return tds.split('• ');
});
console.log(descripFeatures)
await browser.close();
})();

Puppeteer: line of code being executed before others

I have this code:
const puppeteer = require("puppeteer");
(async () => {
const browser = await puppeteer.launch({ headless: false });
const page = await browser.newPage();
await page.goto("https://www.sisal.it/scommesse-matchpoint/quote/calcio/serie-a");
const [button1] = await
page.$x('//div[#class="marketBar_changeMarketLabel__l0vzl"]/p');
button1.click();
const [button2] = await page.$x('//div[#class="listItem_container__2IdVR white
marketList_listItemHeight__1aiAJ marketList_bgColorGrey__VdrVK"]/p[text()="1X2
ESITO FINALE"]');
button2.click();
})();
The proble is that after clicking button1 the page change and puppeteer executes immediately the following line of code, instead I want it to wait for the new page to be loaded becuase otherwise It will throw an error since It can't find button2.
I found this solution on stackoverflow:
const puppeteer = require("puppeteer");
function delay(time) {
return new Promise(function (resolve) {
setTimeout(resolve, time);
});
}
(async () => {
const browser = await puppeteer.launch({ headless: false });
const page = await browser.newPage();
await page.goto("https://www.sisal.it/scommesse-matchpoint/quote/calcio/serie-a");
const [button1] = await
page.$x('//div[#class="marketBar_changeMarketLabel__l0vzl"]/p');
button1.click();
await delay(4000);
const [button2] = await page.$x('//div[#class="listItem_container__2IdVR white
marketList_listItemHeight__1aiAJ
marketList_bgColorGrey__VdrVK"]/p[text()="1X2
ESITO FINALE"]');
button2.click();
})();
But of course this in't the best solution.
I think you have to modify a bit in your code:
await button1.click();
await page.waitForNavigation({waitUntil: 'networkidle2'});
For reference, see the documentation.
I found a solution, here's the code:
const puppeteer = require("puppeteer");
(async () => {
const browser = await puppeteer.launch({ headless: false });
const page = await browser.newPage();
await page.goto("https://www.sisal.it/scommesse
matchpoint/quote/calcio/serie-a");
await page.waitForXPath('//div[#class="marketBar_changeMarketLabel__l0vzl"]/p');
const [button1] = await page.$x('//div[#class="marketBar_changeMarketLabel__l0vzl"]/p');
await button1.click();
await page.waitForXPath('//div[#class="listItem_container__2IdVR white marketList_listItemHeight__1aiAJ marketList_bgColorGrey__VdrVK"]/p[text()="1X2 ESITO FINALE"]');
const [button2] = await page.$x('//div[#class="listItem_container__2IdVR white marketList_listItemHeight__1aiAJ marketList_bgColorGrey__VdrVK"]/p[text()="1X2 ESITO FINALE"]');
button2.click();
})();

Overriding showOpenFilePicker with Puppeteer

As illustrated in here here, Puppeteer allows to override Javascript functions. I want to override showOpenFilePicker function. That is, when the showOpenFilePicker invoked by the web page. I want to run another function before the showOpenFilePicker.
const puppeteer = require("puppeteer");
(async () => {
const browser = await puppeteer.launch({ headless: false });
const page = await browser.newPage();
await page.evaluateOnNewDocument(() => {
Object.defineProperty(HTMLCanvasElement.prototype, "toBlob", {
value: () => {
console.log("Hey there");
},
});
});
await page.goto("https://example.com");
await page.evaluate(() => {
console.log(HTMLCanvasElement.prototype.toBlob.toString());
});
// await browser.close();
})();
You can override built-in functions in Puppeteer like in the code sample below. This replaces the original function with an override that logs the arguments to the console.
const puppeteer = require("puppeteer");
(async () => {
const browser = await puppeteer.launch({ headless: false });
const page = await browser.newPage();
await page.evaluateOnNewDocument(() => {
const originalShowOpenFilePicker = window.showOpenFilePicker;
window.showOpenFilePicker = (...args) => {
console.log('Modified `showOpenFilePicker` called with these arguments:', args);
return originalShowOpenFilePicker(...args);
};
});
await page.goto("https://example.com");
await page.evaluate(() => {
console.log(showOpenFilePicker());
});
// await browser.close();
})();

Puppeteer unable to use get property

Cannot read property 'getProperty' of undefined is the error that I get.
const puppeteer = require('puppeteer');
async function scrapeUdemy(url) {
try {
const browser = await puppeteer.launch({headless: false, slowmo: 250});
const page = await browser.newPage()
await page.goto(url)
const [el] = await page.$x('//*[#id="udemy"]/div[1]/div[4]/div/div/div[2]/div/div/div[1]/a/div[1]/div[1]');
const txt = await el.getProperty('textContent');
const rawTxt = await src.jsonValue();
console.log({srcTxt});
browser.close();
}
catch(err) {
console.log(err.message);
}
}
scrapeUdemy('https://www.udemy.com/user/eren-cem-salta/')
I tried using other versions but does not work. It is not working with the catch block too.
The element that you want to get is loaded with AJAX after the page started and you have to wait until it appears in the DOM:
await page.waitForSelector('[data-purpose="course-card-container"] div.udlite-heading-sm');
And why not use the same selector to get all of the cards:
const titles = await page.evaluate(() => {
const nodes = document.querySelectorAll(
'[data-purpose="course-card-container"] div.udlite-heading-sm'
);
return [...nodes].map((node) => node.textContent);
})

How to iterate async function in puppeteer with NodeJS

I want to take a screenshot with puppeteer and it's working for one post. But I want to make it iterate.
If it's normal function I can just wrote the function name in the last side of the code so that it can iterate. But this is async function so I don't know how to iterate it.
const puppeteer = require('puppeteer');
let postNumber = 1;
let by;
(async () => {
const browser = await puppeteer.launch({
executablePath: 'C:\\Program Files (x86)\\Google\\Chrome\\Application\\chrome.exe',
userDataDir: 'C:\\Users\\{computerName}\\AppData\\Local\\Google\\Chrome\\User Data',
headless: false
}); // default is true
const page = await browser.newPage();
await page.goto(`https://band.us/band/{someNumbers}/post/${postNumber}`, {
waitUntil: 'networkidle2'
});
let element = await page.$('.boardList');
by = await page.evaluate(() => document.getElementsByClassName('text')[0].textContent);
console.log(by);
await element.screenshot({
path: `./image/${postNumber}-${by}.png`
});
console.log(`SAVED : ${postNumber}-${by}.png`)
postNumber++;
await browser.close();
})();
After the function is finished, the postNumber variable should be increase by one. And then run the code again by new URLs.
As you want to run the code one iteration after another, a normal for (or while) loop can be used. async/await code works fine with these.
You can use a for in your case like this:
(async () => {
const browser = await puppeteer.launch(/* ... */);
const page = await browser.newPage();
for (let postNumber = 1; postNumber < 10; postNumber++) {
await page.goto(/* ... */);
let element = await page.$('.boardList');
// ...
}
await browser.close();
})();
You can use any appropriate loop, like while-loop:
'use strict';
const puppeteer = require('puppeteer');
(async () => {
const browser = await puppeteer.launch({
executablePath: 'C:\\Program Files (x86)\\Google\\Chrome\\Application\\chrome.exe',
userDataDir: 'C:\\Users\\{computerName}\\AppData\\Local\\Google\\Chrome\\User Data',
headless: false
}); // default is true
const page = await browser.newPage();
let postNumber = 1;
while (postNumber <= 10) {
await page.goto(`https://band.us/band/{someNumbers}/post/${postNumber}`, {
waitUntil: 'networkidle2'
});
const element = await page.$('.boardList');
const by = await page.evaluate(() => document.getElementsByClassName('text')[0].textContent);
console.log(by);
await element.screenshot({
path: `./image/${postNumber}-${by}.png`
});
console.log(`SAVED : ${postNumber}-${by}.png`)
postNumber++;
}
await browser.close();
})();

Categories