I'm trying to write a simple web scraper using the Puppeteer library.
When I load a page by URL via page.goto, I need to retry if the navigation failed, i.e. the response status code is >= 400.
My snippet:
'use strict';
const puppeteer = require('puppeteer');
(async () => {
const browser = await puppeteer.launch({headless: false});
const page = await browser.newPage();
await page.setViewport({width: 1024, height: 768});
await page.setDefaultNavigationTimeout(0);
await page.goto('https://google.com');
await browser.close();
process.exit();
})();
I need to implement a retry strategy for when the response status is >= 400.
I need the delay between attempts to equal retryNumber * 1000 ms:
1000 ms for the first attempt;
2000 ms for the second attempt;
3000 ms for the third attempt, and so on.
The Promise should be rejected if retryNumber exceeds maxRetryNumber.
How can I implement this? Are there any ready-to-use packages or snippets that achieve this?
You can use a simple for loop to execute your retries (exit the loop when the request succeeds):
'use strict';
const puppeteer = require('puppeteer');
const delay = (ms) => {
return new Promise(resolve => setTimeout(resolve, ms));
};
(async () => {
const browser = await puppeteer.launch({headless: false});
const page = await browser.newPage();
await page.setViewport({width: 1024, height: 768});
await page.setDefaultNavigationTimeout(0);
const maxRetryNumber = 10;
let success = false;
for (let retryNumber = 1; retryNumber <= maxRetryNumber; retryNumber++) {
const response = await page.goto('https://google.com');
if (response.status() < 400) {
success = true;
break;
}
await delay(1000 * retryNumber);
}
if (!success) {
// do something
}
await browser.close();
process.exit();
})();
Source of delay function.
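The question also asks for the promise to be rejected once maxRetryNumber is exceeded. One way to get that (a minimal sketch building on the loop above; gotoWithRetries is a hypothetical helper name, not a Puppeteer API) is to wrap the loop in an async function that throws after the last failed attempt, which rejects the promise it returns:

// Sketch: navigate with retries; the returned promise rejects
// once maxRetryNumber is exceeded. Reuses the delay() helper above.
const gotoWithRetries = async (page, url, maxRetryNumber) => {
    for (let retryNumber = 1; retryNumber <= maxRetryNumber; retryNumber++) {
        const response = await page.goto(url);
        if (response !== null && response.status() < 400) {
            return response; // success: resolve with the response
        }
        await delay(1000 * retryNumber); // 1000 ms, 2000 ms, 3000 ms, ...
    }
    throw new Error(`Unable to load ${url} within ${maxRetryNumber} attempts.`);
};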
const puppeteer = require('puppeteer');

const maxRetryNumber = 10;
let retryNumber = 0;

scrape();

async function scrape() {
    retryNumber++;
    if (retryNumber > maxRetryNumber) {
        console.log('retryNumber exceeded maxRetryNumber!');
        return;
    }
    try {
        const browser = await puppeteer.launch({headless: false});
        const page = await browser.newPage();
        await page.setViewport({width: 1024, height: 768});
        await page.setDefaultNavigationTimeout(0);
        await page.waitFor(retryNumber * 1000); // delay grows with each retry
        const response = await page.goto('https://google.com');
        await browser.close();
        if (response.status() >= 400) {
            return scrape(); // retry on an error response
        }
        console.log('ALL OK');
    } catch (e) {
        return scrape(); // retry on a navigation error
    }
}
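A note on the trade-off between the two approaches: the for loop above reuses one browser instance across attempts, while this recursive version launches and closes a fresh browser on every attempt, which is slower but gives each retry a clean session. Returning the recursive scrape() call also keeps the promise chain intact, so a caller awaiting scrape() settles only after the final attempt.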
I can't handle the error that occurs if the proxy server is down. Here is the code:
const puppeteer = require('puppeteer');
const proxyChain = require('proxy-chain');
async function getPic() {
const proxiesList = [
'http://208.70.77.222:1994',
];
const oldProxyUrl = proxiesList[Math.floor(Math.random() * (proxiesList.length))];
const newProxyUrl = await proxyChain.anonymizeProxy(oldProxyUrl);
const browser = await puppeteer.launch({
headless: false,
ignoreHTTPSErrors: true,
args: [
`--proxy-server=${newProxyUrl}`,
`--ignore-certificate-errors`,
`--no-sandbox`,
`--disable-setuid-sandbox`
]
});
const page = await browser.newPage();
await page.setUserAgent('Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.43 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36 OPR/94.0.0.0');
await page.goto('https://siteURL.com/',{
waitUntil: "domcontentloaded"
});
await page.waitForSelector('input[type="search"]');
await page.type('input[type="search"]', 'pc programs', {delay: 500});
await page.click('button[type="submit"]');
await page.waitForSelector('.footer-wrap');
await page.evaluate(() => new Promise((resolve) => {
let scrollTop = -1;
const interval = setInterval(() => {
window.scrollBy(0, 100);
if(document.documentElement.scrollTop !== scrollTop) {
scrollTop = document.documentElement.scrollTop;
return;
}
clearInterval(interval);
resolve();
}, 500);
}));
await page.screenshot({path: 'scr.png'});
await browser.close();
console.log('1');
};
setInterval(getPic,50000);
An error is thrown: Error: net::ERR_TIMED_OUT at ...
I tried with try-catch:
async function restartableFunction() {
try {
getPic()
} catch (error) {
if (error.message === "Error: net::ERR_TIMED_OUT") {
console.error(error);
// wait for a set amount of time before restarting the function
await new Promise(resolve => setTimeout(resolve, 5000));
// restart the function
await restartableFunction();
} else {
throw error;
}
}
}
This doesn't solve the problem. I would like to restart the function when an error occurs, so that a working proxy is set up and the code keeps running. I will be very grateful for your advice!
restartableFunction never catches the error because getPic() is async and evaluates to a Promise; the rejection happens after the try block has already exited.
You need to await getPic() in the try block of restartableFunction:
try {
await getPic();
}
Read more about it here https://itnext.io/error-handling-with-async-await-in-js-26c3f20bc06a
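Putting it together, a corrected restartableFunction might look like this (a sketch based on the answer; the error check is loosened to a substring match, since the actual message usually looks like "net::ERR_TIMED_OUT at https://..." rather than exactly "Error: net::ERR_TIMED_OUT"):

async function restartableFunction() {
    try {
        await getPic(); // await, so a rejected promise is caught below
    } catch (error) {
        // Assumption: match on a substring of the message instead of
        // strict equality, since the message includes the failing URL.
        if (error.message.includes('net::ERR_TIMED_OUT')) {
            console.error(error);
            // wait for a set amount of time before restarting the function
            await new Promise(resolve => setTimeout(resolve, 5000));
            // restart the function
            await restartableFunction();
        } else {
            throw error;
        }
    }
}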
Recently I started crawling the web using Puppeteer. Below is code for extracting a specific product name from a shopping mall.
const puppeteer = require('puppeteer');
(async () => {
const width = 1600, height = 1040;
const option = { headless: false, slowMo: true, args: [`--window-size=${width},${height}`] };
const browser = await puppeteer.launch(option);
const page = await browser.newPage();
const vp = {width: width, height: height};
await page.setViewport(vp);
const navigationPromise = page.waitForNavigation();
await page.goto('https://shopping.naver.com/home/p/index.nhn');
await navigationPromise;
await page.waitFor(2000);
const textBoxId = 'co_srh_input';
await page.type('.' + textBoxId, '양말', {delay: 100});
await page.keyboard.press('Enter');
await page.waitFor(5000);
await page.waitForSelector('div.info > a.tit');
const stores = await page.evaluate(() => {
const links = Array.from(document.querySelectorAll('div.info > a.tit'));
return links.map(link => link.innerText).slice(0, 10); // take only the first 10 products
});
console.log(stores);
await browser.close();
})();
I have a question: how can I output the crawled results to an HTML document (without using a database)? Please explain with sample code.
I used what I found on blog.kowalczyk.info:
const puppeteer = require("puppeteer");
const fs = require("fs");
async function run() {
const browser = await puppeteer.launch();
const page = await browser.newPage();
await page.goto("https://www.google.com/", { waitUntil: "networkidle2" });
// hacky defensive move but I don't know a better way:
// wait a bit so that the browser finishes executing JavaScript
await page.waitFor(1 * 1000);
const html = await page.content();
fs.writeFileSync("index.html", html);
await browser.close();
}
run();
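One caveat: page.waitFor, used above, was deprecated and later removed in newer Puppeteer releases. If you are on a recent version, a plain timeout promise does the same job:

// Replacement for the deprecated page.waitFor(ms) in newer Puppeteer versions
await new Promise(resolve => setTimeout(resolve, 1000));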
You can wrap fs.writeFile() in a write_file function that returns a Promise, resolving with true when the write succeeds, and logging the error and resolving with false when it fails.
Then, you can await the Promise from within your anonymous, asynchronous function and check whether or not the data was written to the file:
'use strict';
const fs = require('fs');
const puppeteer = require('puppeteer');
const write_file = (file, data) => new Promise(resolve => {
fs.writeFile(file, data, 'utf8', error => {
if (error) {
console.error(error);
resolve(false); // resolve with false so the caller can check the result
} else {
resolve(true);
}
});
});
(async () => {
// ...
const stores = await page.evaluate(() => {
return Array.from(document.querySelectorAll('div.info > a.tit'), link => link.innerText).slice(0, 10); // take only the first 10 products
});
if (await write_file('example.html', stores.toString()) === false) {
console.error('Error: Unable to write stores to example.html.');
}
// ...
})();
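Since the goal is an HTML document rather than a plain text dump, you could also wrap the scraped names in minimal markup before writing. A sketch using the built-in fs.promises API instead of the callback wrapper (the markup and file name are illustrative, and real product names should be HTML-escaped first):

// Inside the async IIFE, after `stores` has been scraped:
const html = '<!DOCTYPE html>\n<html><body><ul>'
    + stores.map(name => `<li>${name}</li>`).join('\n')
    + '</ul></body></html>';
// fs.promises.writeFile already returns a Promise, so no wrapper is needed.
await fs.promises.writeFile('example.html', html, 'utf8');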
I'm trying to use the code from this question, but it doesn't work.
The waitForFunction seems to get skipped and is not evaluated.
chrome://downloads/ is shown, the file still downloads and the script ends.
const puppeteer = require('puppeteer');
(async () => {
const browser = await puppeteer.launch({
headless: false, slowMo: 100, // Uncomment to visualize test
});
const page = await browser.newPage();
await page.goto('https://speed.hetzner.de/');
// Resize window to 1588 x 901 and await navigation
await page.setViewport({ width: 1588, height: 901 });
// Click on <a> "10GB.bin"
await page.waitForSelector('[href="10GB.bin"]');
await page.click('[href="10GB.bin"]');
const dmPage = await browser.newPage()
await dmPage.goto('chrome://downloads/')
await dmPage.bringToFront()
await dmPage.waitForFunction(() => {
// monitoring the state of the first download item
// if finish than return true; if fail click
const dm = document.querySelector('downloads-manager').shadowRoot
const firstItem = dm.querySelector('#frb0')
if (firstItem) {
const thatArea = firstItem.shadowRoot.querySelector('.controls')
const atag = thatArea.querySelector('a')
if (atag && atag.textContent === 'Show in folder') {
return true
}
const btn = thatArea.querySelector('cr-button')
if (btn && btn.textContent === 'Retry') {
btn.click()
}
}
},
{ polling: 'raf', timeout: 0 }, // poll on every animation frame; 'mutation' polling is also available
)
console.log('finish')
// await browser.close();
})();
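An alternative that avoids scraping chrome://downloads entirely (a sketch, not from the original question: it assumes the CDP Page.setDownloadBehavior command, and the downloads folder name is a placeholder) is to direct downloads to a known directory and poll the filesystem until Chromium's .crdownload temp file disappears:

const fs = require('fs');
const path = require('path');

// Inside the async IIFE from the question:
// route downloads to a known folder via the DevTools protocol.
const downloadPath = path.resolve(__dirname, 'downloads'); // placeholder folder
const client = await page.target().createCDPSession();
await client.send('Page.setDownloadBehavior', {
    behavior: 'allow',
    downloadPath,
});
await page.click('[href="10GB.bin"]');
// Poll until no partial downloads remain (naive but simple).
while (fs.readdirSync(downloadPath).some(f => f.endsWith('.crdownload'))) {
    await new Promise(resolve => setTimeout(resolve, 1000));
}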
My problem is that I need to set the comment selector to "all comments" with Puppeteer, but the comments don't render: after Puppeteer clicks on the correct button, "all the comments", the comment section just disappears. I will provide the code and a video of the browser in action.
const $ = require('cheerio');
const puppeteer = require('puppeteer');
const url = 'https://www.facebook.com/pg/SamsungGlobal/posts/';
const main = async () => {
const browser = await puppeteer.launch({
headless: false,
args: ['--no-sandbox', '--disable-setuid-sandbox']
});
const page = await browser.newPage();
await page.setViewport({
width: 1920,
height: 1080
});
await page.goto(url, {
waitUntil: 'networkidle2',
timeout: 0
});
page.mouse.click(50, 540, {});
for (var a = 0; a < 18; a++) {
setTimeout(() => {}, 16);
await page.keyboard.press('ArrowDown');
}
let bodyHTML = await page.evaluate(() => document.body.innerHTML);
var id = "#" + $("._427x ._4-u2.mbm._4mrt", bodyHTML).attr('id'); // selects id of first post
try {
var exp = await page.$(`${id} a._21q1`); // clicks on "most relevant" from the first post
await exp.evaluate(exp => exp.click());
await page.click('div[data-ordering="RANKED_UNFILTERED"]'); // selects "all the comments"
var exp = await page.$(`${id} a._42ft`); // should click on "more comments" but it doesn't load
await exp.evaluate(exp => exp.click());
await page.waitForSelector(`${id} a._5v47.fss`); // wait for the "others" in facebook comments
var exp = await page.$$(`${id} a._5v47.fss`);
await exp.evaluate(exp => exp.click());
await page.screenshot({
path: "./srn4.png"
});
// var post = await page.$eval(id + " .userContentWrapper", el => el.innerHTML);
// console.log("that's the post " + post);
} catch (e) {
console.log(e);
}
setTimeout(async function() {
await browser.close(); //close after some time
}, 1500);
};
main();
That's the video of the full execution process: https://youtu.be/jXpSOBfVskg
That's a slow motion of the moment it click on the menu: https://youtu.be/1OgfFNokxsA
You can try a variant with selectors:
'use strict';
const puppeteer = require('puppeteer');
(async function main() {
try {
const browser = await puppeteer.launch({ headless: false });
const [page] = await browser.pages();
await page.goto('https://www.facebook.com/pg/SamsungGlobal/posts/');
await page.waitForSelector('[data-ordering="RANKED_THREADED"]');
await page.click('[data-ordering="RANKED_THREADED"]');
await page.waitForSelector('[data-ordering="RANKED_UNFILTERED"]');
await page.click('[data-ordering="RANKED_UNFILTERED"]');
} catch (err) {
console.error(err);
}
})();
page.mouse.click(50, 540, {});
This is not necessarily going to work, and the click is not awaited. What are you trying to click? You need to use CSS selectors to find the elements that you want to click.
Also, dynamic elements might not appear in the page right away. You should use waitForSelector as needed.
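For example (a generic sketch; '.some-button' is a placeholder selector, not taken from the page in question):

// Wait for a dynamically rendered element, then click it via its selector.
await page.waitForSelector('.some-button');
await page.click('.some-button');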