How to iterate async function in puppeteer with NodeJS - javascript

I want to take a screenshot with puppeteer and it's working for one post. But I want to make it iterate.
If it's normal function I can just wrote the function name in the last side of the code so that it can iterate. But this is async function so I don't know how to iterate it.
const puppeteer = require('puppeteer');
let postNumber = 1;
let by;
(async () => {
const browser = await puppeteer.launch({
executablePath: 'C:\\Program Files (x86)\\Google\\Chrome\\Application\\chrome.exe',
userDataDir: 'C:\\Users\\{computerName}\\AppData\\Local\\Google\\Chrome\\User Data',
headless: false
}); // default is true
const page = await browser.newPage();
await page.goto(`https://band.us/band/{someNumbers}/post/${postNumber}`, {
waitUntil: 'networkidle2'
});
let element = await page.$('.boardList');
by = await page.evaluate(() => document.getElementsByClassName('text')[0].textContent);
console.log(by);
await element.screenshot({
path: `./image/${postNumber}-${by}.png`
});
console.log(`SAVED : ${postNumber}-${by}.png`)
postNumber++;
await browser.close();
})();
After the function is finished, the postNumber variable should be increase by one. And then run the code again by new URLs.

As you want to run the code one iteration after another, a normal for (or while) loop can be used. async/await code works fine with these.
You can use a for in your case like this:
(async () => {
const browser = await puppeteer.launch(/* ... */);
const page = await browser.newPage();
for (let postNumber = 1; postNumber < 10; postNumber++) {
await page.goto(/* ... */);
let element = await page.$('.boardList');
// ...
}
await browser.close();
})();

You can use any appropriate loop, like while-loop:
'use strict';
const puppeteer = require('puppeteer');
(async () => {
const browser = await puppeteer.launch({
executablePath: 'C:\\Program Files (x86)\\Google\\Chrome\\Application\\chrome.exe',
userDataDir: 'C:\\Users\\{computerName}\\AppData\\Local\\Google\\Chrome\\User Data',
headless: false
}); // default is true
const page = await browser.newPage();
let postNumber = 1;
while (postNumber <= 10) {
await page.goto(`https://band.us/band/{someNumbers}/post/${postNumber}`, {
waitUntil: 'networkidle2'
});
const element = await page.$('.boardList');
const by = await page.evaluate(() => document.getElementsByClassName('text')[0].textContent);
console.log(by);
await element.screenshot({
path: `./image/${postNumber}-${by}.png`
});
console.log(`SAVED : ${postNumber}-${by}.png`)
postNumber++;
}
await browser.close();
})();

Related

Puppeteer querySelector returns empty object [duplicate]

Recently I started to crawl the web using Puppeteer. Below is a code for extracting a specific product name from the shopping mall.
const puppeteer = require('puppeteer');
(async () => {
const width = 1600, height = 1040;
const option = { headless: false, slowMo: true, args: [`--window-size=${width},${height}`] };
const browser = await puppeteer.launch(option);
const page = await browser.newPage();
const vp = {width: width, height: height};
await page.setViewport(vp);
const navigationPromise = page.waitForNavigation();
await page.goto('https://shopping.naver.com/home/p/index.nhn');
await navigationPromise;
await page.waitFor(2000);
const textBoxId = 'co_srh_input';
await page.type('.' + textBoxId, '양말', {delay: 100});
await page.keyboard.press('Enter');
await page.waitFor(5000);
await page.waitForSelector('div.info > a.tit');
const stores = await page.evaluate(() => {
const links = Array.from(document.querySelectorAll('div.info > a.tit'));
return links.map(link => link.innerText).slice(0, 10) // 10개 제품만 가져오기
});
console.log(stores);
await browser.close();
})();
I have a question. How can I output the crawled results to an HTML document (without using the database)? Please use sample code to explain it.
I used what was seen on blog.kowalczyk.info
const puppeteer = require("puppeteer");
const fs = require("fs");
async function run() {
const browser = await puppeteer.launch();
const page = await browser.newPage();
await page.goto("https://www.google.com/", { waitUntil: "networkidle2" });
// hacky defensive move but I don't know a better way:
// wait a bit so that the browser finishes executing JavaScript
await page.waitFor(1 * 1000);
const html = await page.content();
fs.writeFileSync("index.html", html);
await browser.close();
}
run();
fs.writeFile()
You can use the following write_file function that returns a Promise that resolves or rejects when fs.writeFile() succeeds or fails.
Then, you can await the Promise from within your anonymous, asynchronous function and check whether or not the data was written to the file:
'use strict';
const fs = require('fs');
const puppeteer = require('puppeteer');
const write_file = (file, data) => new Promise((resolve, reject) => {
fs.writeFile(file, data, 'utf8', error => {
if (error) {
console.error(error);
reject(false);
} else {
resolve(true);
}
});
});
(async () => {
// ...
const stores = await page.evaluate(() => {
return Array.from(document.querySelectorAll('div.info > a.tit'), link => link.innerText).slice(0, 10); // 10개 제품만 가져오기
});
if (await write_file('example.html', stores.toString()) === false) {
console.error('Error: Unable to write stores to example.html.');
}
// ...
});

puppeteer - export JSON file from loop

The exported file contains only one url. The rest of the urls are not found in the exported file. How can I generate a file with all the entries in the loop?
const puppeteer = require("puppeteer");
const fs = require('fs');
let browser;
(async () => {
const browser = await puppeteer.launch({
headless: true,
args: ['--no-sandbox']
});
const [page] = await browser.pages();
await page.goto('https://old.reddit.com/',{"waitUntil" : "networkidle0"});
const a_elems = await page.$$('.thumbnail');
for (var i=0; i<a_elems.length && i<3; i++) {
const elem = a_elems[i];
const href = await page.evaluate(e => e.href, elem);
const newPage = await browser.newPage();
await newPage.goto(href,{"waitUntil" : "networkidle0"});
const url = await newPage.evaluate(() => document.location.href);
console.log(url);
fs.writeFileSync('export.json', JSON.stringify(url));
}
await browser.close();
})()
;
Thanks!
Create an array, push each url onto it in the loop, then move your writeFile call to the end.
const puppeteer = require("puppeteer");
const fs = require('fs').promises;
let browser;
(async () => {
browser = await puppeteer.launch({
headless: true,
args: ['--no-sandbox']
});
const [page] = await browser.pages();
await page.goto('https://old.reddit.com/', {
"waitUntil": "networkidle0"
});
const aElems = await page.$$('.thumbnail');
const urls = [];
for (let i = 0; i < aElems.length && i < 3; i++) {
const href = await aElems[i].evaluate(e => e.href);
const newPage = await browser.newPage();
await newPage.goto(href, {waitUntil: "networkidle0"});
const url = await newPage.evaluate(() => document.location.href);
console.log(url);
urls.push(url);
}
await fs.writeFile('export.json', JSON.stringify(urls));
})()
.catch(err => console.error(err))
.finally(() => browser?.close())
;
Tips:
You're already in async code, so writeFileSync seems suboptimal here relative to the async version.
Use let instead of var so you don't get bit by i breaking scope and popping up with a stale value outside (or inside) the loop block.
Consider newPage.close(); at the end of the loop. You're only doing 3 pages now, but if this is temporary and you're going to make it 800, then it's a great idea.
"waitUntil": "networkidle0" is really slow. Since all you're doing is accessing document.location.href you can probably speed things up with waitUntil: "domcontentloaded".
JS uses camelCase, not snake_case.
If you have an ElementHandle, you can just elementHandle.evaluate(...) rather than page.evaluate(..., elementHandle).
Catch errors with catch and clean up the browser resource with finally.
let browser; was pointless in your original code.

Puppeteer: line of code being executed before others

I have this code:
const puppeteer = require("puppeteer");
(async () => {
const browser = await puppeteer.launch({ headless: false });
const page = await browser.newPage();
await page.goto("https://www.sisal.it/scommesse-matchpoint/quote/calcio/serie-a");
const [button1] = await
page.$x('//div[#class="marketBar_changeMarketLabel__l0vzl"]/p');
button1.click();
const [button2] = await page.$x('//div[#class="listItem_container__2IdVR white
marketList_listItemHeight__1aiAJ marketList_bgColorGrey__VdrVK"]/p[text()="1X2
ESITO FINALE"]');
button2.click();
})();
The proble is that after clicking button1 the page change and puppeteer executes immediately the following line of code, instead I want it to wait for the new page to be loaded becuase otherwise It will throw an error since It can't find button2.
I found this solution on stackoverflow:
const puppeteer = require("puppeteer");
function delay(time) {
return new Promise(function (resolve) {
setTimeout(resolve, time);
});
}
(async () => {
const browser = await puppeteer.launch({ headless: false });
const page = await browser.newPage();
await page.goto("https://www.sisal.it/scommesse-matchpoint/quote/calcio/serie-a");
const [button1] = await
page.$x('//div[#class="marketBar_changeMarketLabel__l0vzl"]/p');
button1.click();
await delay(4000);
const [button2] = await page.$x('//div[#class="listItem_container__2IdVR white
marketList_listItemHeight__1aiAJ
marketList_bgColorGrey__VdrVK"]/p[text()="1X2
ESITO FINALE"]');
button2.click();
})();
But of course this in't the best solution.
I think you have to modify a bit in your code:
await button1.click();
await page.waitForNavigation({waitUntil: 'networkidle2'});
For reference, see the documentation.
I found a solution, here's the code:
const puppeteer = require("puppeteer");
(async () => {
const browser = await puppeteer.launch({ headless: false });
const page = await browser.newPage();
await page.goto("https://www.sisal.it/scommesse
matchpoint/quote/calcio/serie-a");
await page.waitForXPath('//div[#class="marketBar_changeMarketLabel__l0vzl"]/p');
const [button1] = await page.$x('//div[#class="marketBar_changeMarketLabel__l0vzl"]/p');
await button1.click();
await page.waitForXPath('//div[#class="listItem_container__2IdVR white marketList_listItemHeight__1aiAJ marketList_bgColorGrey__VdrVK"]/p[text()="1X2 ESITO FINALE"]');
const [button2] = await page.$x('//div[#class="listItem_container__2IdVR white marketList_listItemHeight__1aiAJ marketList_bgColorGrey__VdrVK"]/p[text()="1X2 ESITO FINALE"]');
button2.click();
})();

Puppeteer not working as expected when clicking button

My problem is that I need to set the comment selector to "all comments" whit puppeteer but the comments don't render after that puppeteer clicks on the correct button, "all the comments", the comment section just disappears, I will provide the code and a video of the browser in action.
const $ = require('cheerio');
const puppeteer = require('puppeteer');
const url = 'https://www.facebook.com/pg/SamsungGlobal/posts/';
const main = async () => {
const browser = await puppeteer.launch({
headless: false,
args: ['--no-sandbox', '--disable-setuid-sandbox']
});
const page = await browser.newPage();
await page.setViewport({
width: 1920,
height: 1080
});
await page.goto(url, {
waitUntil: 'networkidle2',
timeout: 0
});
page.mouse.click(50, 540, {});
for (var a = 0; a < 18; a++) {
setTimeout(() => {}, 16);
await page.keyboard.press('ArrowDown');
}
let bodyHTML = await page.evaluate(() => document.body.innerHTML);
var id = "#" + $("._427x ._4-u2.mbm._4mrt", bodyHTML).attr('id'); // selects id of first post
try {
var exp = await page.$(`${id} a._21q1`); // clicks on "most relevant" from the first post
await exp.evaluate(exp => exp.click());
await page.click('div[data-ordering="RANKED_UNFILTERED"]'); // selects "all the comments"
var exp = await page.$(`${id} a._42ft`); // should click on "more comments" but it doesn't load
await exp.evaluate(exp => exp.click());
await page.waitForSelector(`${id} a._5v47.fss`); // wait for the "others" in facebook comments
var exp = await page.$$(`${id} a._5v47.fss`);
await exp.evaluate(exp => exp.click());
await page.screenshot({
path: "./srn4.png"
});
// var post = await page.$eval(id + " .userContentWrapper", el => el.innerHTML);
// console.log("that's the post " + post);
} catch (e) {
console.log(e);
}
setTimeout(async function() {
await browser.close(); //close after some time
}, 1500);
};
main();
That's the video of the full execution process: https://youtu.be/jXpSOBfVskg
That's a slow motion of the moment it click on the menu: https://youtu.be/1OgfFNokxsA
You can try a variant with selectors:
'use strict';
const puppeteer = require('puppeteer');
(async function main() {
try {
const browser = await puppeteer.launch({ headless: false });
const [page] = await browser.pages();
await page.goto('https://www.facebook.com/pg/SamsungGlobal/posts/');
await page.waitForSelector('[data-ordering="RANKED_THREADED"]');
await page.click('[data-ordering="RANKED_THREADED"]');
await page.waitForSelector('[data-ordering="RANKED_UNFILTERED"]');
await page.click('[data-ordering="RANKED_UNFILTERED"]');
} catch (err) {
console.error(err);
}
})();
page.mouse.click(50, 540, {});
This is not going to work necessarily. What are you trying to click? You need to use CSS selectors to find elements that you want to click.
Also, dynamic elements might not appear in the page right away. You should use waitForSelector as needed.

How to print an HTML document using Puppeteer?

Recently I started to crawl the web using Puppeteer. Below is a code for extracting a specific product name from the shopping mall.
const puppeteer = require('puppeteer');
(async () => {
const width = 1600, height = 1040;
const option = { headless: false, slowMo: true, args: [`--window-size=${width},${height}`] };
const browser = await puppeteer.launch(option);
const page = await browser.newPage();
const vp = {width: width, height: height};
await page.setViewport(vp);
const navigationPromise = page.waitForNavigation();
await page.goto('https://shopping.naver.com/home/p/index.nhn');
await navigationPromise;
await page.waitFor(2000);
const textBoxId = 'co_srh_input';
await page.type('.' + textBoxId, '양말', {delay: 100});
await page.keyboard.press('Enter');
await page.waitFor(5000);
await page.waitForSelector('div.info > a.tit');
const stores = await page.evaluate(() => {
const links = Array.from(document.querySelectorAll('div.info > a.tit'));
return links.map(link => link.innerText).slice(0, 10) // 10개 제품만 가져오기
});
console.log(stores);
await browser.close();
})();
I have a question. How can I output the crawled results to an HTML document (without using the database)? Please use sample code to explain it.
I used what was seen on blog.kowalczyk.info
const puppeteer = require("puppeteer");
const fs = require("fs");
async function run() {
const browser = await puppeteer.launch();
const page = await browser.newPage();
await page.goto("https://www.google.com/", { waitUntil: "networkidle2" });
// hacky defensive move but I don't know a better way:
// wait a bit so that the browser finishes executing JavaScript
await page.waitFor(1 * 1000);
const html = await page.content();
fs.writeFileSync("index.html", html);
await browser.close();
}
run();
fs.writeFile()
You can use the following write_file function that returns a Promise that resolves or rejects when fs.writeFile() succeeds or fails.
Then, you can await the Promise from within your anonymous, asynchronous function and check whether or not the data was written to the file:
'use strict';
const fs = require('fs');
const puppeteer = require('puppeteer');
const write_file = (file, data) => new Promise((resolve, reject) => {
fs.writeFile(file, data, 'utf8', error => {
if (error) {
console.error(error);
reject(false);
} else {
resolve(true);
}
});
});
(async () => {
// ...
const stores = await page.evaluate(() => {
return Array.from(document.querySelectorAll('div.info > a.tit'), link => link.innerText).slice(0, 10); // 10개 제품만 가져오기
});
if (await write_file('example.html', stores.toString()) === false) {
console.error('Error: Unable to write stores to example.html.');
}
// ...
});

Categories