Node.js and Puppeteer Error: net::ERR_TIMED_OUT

I can't handle the error that occurs if the proxy server is down. Here is the code:
const puppeteer = require('puppeteer');
const proxyChain = require('proxy-chain');

async function getPic() {
    const proxiesList = [
        'http://208.70.77.222:1994',
    ];
    const oldProxyUrl = proxiesList[Math.floor(Math.random() * proxiesList.length)];
    const newProxyUrl = await proxyChain.anonymizeProxy(oldProxyUrl);
    const browser = await puppeteer.launch({
        headless: false,
        ignoreHTTPSErrors: true,
        args: [
            `--proxy-server=${newProxyUrl}`,
            `--ignore-certificate-errors`,
            `--no-sandbox`,
            `--disable-setuid-sandbox`
        ]
    });
    const page = await browser.newPage();
    await page.setUserAgent('Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.43 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36 OPR/94.0.0.0');
    await page.goto('https://siteURL.com/', {
        waitUntil: 'domcontentloaded'
    });
    await page.waitForSelector('input[type="search"]');
    await page.type('input[type="search"]', 'pc programs', { delay: 500 });
    await page.click('button[type="submit"]');
    await page.waitForSelector('.footer-wrap');
    // scroll to the bottom of the page in 100px steps
    await page.evaluate(() => new Promise((resolve) => {
        let scrollTop = -1;
        const interval = setInterval(() => {
            window.scrollBy(0, 100);
            if (document.documentElement.scrollTop !== scrollTop) {
                scrollTop = document.documentElement.scrollTop;
                return;
            }
            clearInterval(interval);
            resolve();
        }, 500);
    }));
    await page.screenshot({ path: 'scr.png' });
    await browser.close();
    console.log('1');
}

setInterval(getPic, 50000);
An error is thrown: Error: net::ERR_TIMED_OUT
I tried it with try-catch:
async function restartableFunction() {
    try {
        getPic()
    } catch (error) {
        if (error.message === "Error: net::ERR_TIMED_OUT") {
            console.error(error);
            // wait for a set amount of time before restarting the function
            await new Promise(resolve => setTimeout(resolve, 5000));
            // restart the function
            await restartableFunction();
        } else {
            throw error;
        }
    }
}
This doesn't solve the problem. I would like to restart the function when an error occurs, so that a working proxy gets set up and the code keeps running. I will be very grateful for your advice!

restartableFunction never catches the error because getPic() is async and returns a Promise.
You need to await getPic() in the try block of restartableFunction:
try {
    await getPic();
}
Read more about error handling with async/await here: https://itnext.io/error-handling-with-async-await-in-js-26c3f20bc06a
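Putting it together, a corrected wrapper might look like the sketch below. The 5-second delay and the message check come from the question; note that a real Puppeteer error message usually reads "net::ERR_TIMED_OUT at <url>", so matching with includes is safer than strict equality:
// Sketch of the corrected wrapper: getPic() is awaited, so rejections land in catch.
async function restartableFunction() {
    try {
        await getPic();
    } catch (error) {
        // includes() tolerates the trailing "at <url>" part of the message
        if (error.message.includes('net::ERR_TIMED_OUT')) {
            console.error(error);
            // wait before retrying, presumably picking another proxy on the next run
            await new Promise(resolve => setTimeout(resolve, 5000));
            await restartableFunction();
        } else {
            throw error;
        }
    }
}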

How to handle concurrent requests with Puppeteer?

I have a problem with an "EventEmitter memory leak detected" warning.
I have a function that starts the browser like this:
async function startBrowser() {
    // assumes: const browserFetcher = puppeteer.createBrowserFetcher();
    const revisionInfo = await browserFetcher.download("901912");
    const browser = await puppeteer.launch({
        executablePath: revisionInfo.executablePath,
        args: [
            "--no-sandbox",
            "--disable-gpu",
            "--disable-dev-shm-usage",
            "--disable-setuid-sandbox",
            "--no-first-run",
            "--no-zygote",
            "--single-process",
        ],
    });
    const page = await browser.newPage();
    return { browser, page };
}
One of my functions maps over a number of items like this:
const work = async (stuff) => {
    const workTodo = stuff.map((item) => new Promise((resolve) => {
        setTimeout(() => {
            resolve(anotherFunction(/* some params */));
        }, /* some time variable, all different seconds */);
    }));
    await Promise.all(workTodo);
};
and in anotherFunction():
async function anotherFunction(someParams) {
    const { browser, page } = await startBrowser();
    try {
        await browser.newPage();
        await page.goto('stuff');
        await page.waitForSelector('stuff');
        await page.click('stuff');
        await page.close();
        await browser.close();
    } catch (error) {
        console.log('error', error);
        return;
    }
}
The logic in anotherFunction() works about 70-80% of the time, but sometimes the waitForSelector calls exceed their timeout and sometimes I get memory-leak errors.
Am I creating too many browsers and not closing them properly?
Can someone who knows Puppeteer tell me what I'm doing wrong?
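For reference, a common pattern that avoids both symptoms is to launch one shared browser, give each task its own page, and close the page in a finally block; a minimal sketch (the function name mirrors the question, the rest is illustrative):
// Sketch: one shared browser, one page per task, cleanup guaranteed by finally.
async function anotherFunction(browser, someParams) {
    const page = await browser.newPage();
    try {
        await page.goto('stuff');
        await page.waitForSelector('stuff');
        await page.click('stuff');
    } catch (error) {
        console.log('error', error);
    } finally {
        await page.close(); // always close the page, even when waitForSelector times out
    }
}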

DOM Selection with Puppeteer [duplicate]

This question already has answers here:
puppeteer: how to wait until an element is visible?
(8 answers)
Closed 3 years ago.
I always get an error while trying to scrape something from coinfarm.online. I want the last price. When I try it in the console inside the browser it works perfectly, but with this script I always get an error or null.
const puppeteer = require("puppeteer");

(async () => {
    const browser = await puppeteer.launch({ headless: true });
    const page = await browser.newPage();
    await page.goto("https://coinfarm.online", { waitUntil: "load", timeout: 0 });
    const example = await page.evaluate(
        () => document.querySelector("#xbt_last").innerText
    );
    console.log("Price: " + example);
    await browser.close();
})();
I also tried it with XPath, but that didn't work either.
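As the linked duplicate suggests, the usual fix is to wait until the element exists before reading it; a minimal change to the script above (the selector comes from the question):
// Wait for the element to appear before evaluating; #xbt_last is filled in by script after load.
await page.waitForSelector("#xbt_last");
const example = await page.$eval("#xbt_last", el => el.innerText);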
I've made this for you. Rather than scraping coinfarm.online itself, it opens the TradingView ticker widget that the site embeds and reads the last price from there:
const puppeteer = require('puppeteer')
const uaString = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3882.0 Safari/537.36'

;(async () => {
    const browser = await puppeteer.launch({
        headless: true,
        devtools: false
    })
    const [page] = await browser.pages()
    page.setDefaultNavigationTimeout(0)
    await page.setUserAgent(uaString)
    await page.setRequestInterception(true)
    page.on('request', request => {
        // skip heavy resources to speed up loading
        if (request.resourceType() === 'image' || request.resourceType() === 'font' || request.resourceType() === 'media') {
            request.abort()
        } else {
            request.continue()
        }
    })
    const open = await page.goto('https://s.tradingview.com/embed-widget/tickers/?locale=en#%7B%22symbols%22%3A%5B%7B%22description%22%3A%22BitMex%20XBT%22%2C%22proName%22%3A%22BITMEX%3AXBTUSD%22%7D%2C%7B%22description%22%3A%22Binance%20USDT%22%2C%22proName%22%3A%22BINANCE%3ABTCUSDT%22%7D%2C%7B%22description%22%3A%22BitFinex%20USDT%22%2C%22proName%22%3A%22BITFINEX%3ABTCUSD%22%7D%2C%7B%22description%22%3A%22BitFlyer%20JPY%22%2C%22proName%22%3A%22BITFLYER%3ABTCJPY%22%7D%5D%2C%22width%22%3A%22100%25%22%2C%22height%22%3A72%2C%22utm_source%22%3A%22coinfarm.online%22%2C%22utm_medium%22%3A%22widget%22%2C%22utm_campaign%22%3A%22tickers%22%7D', { timeout: 0, waitUntil: 'networkidle0' })
    const wait = await page.waitForSelector('.tv-ticker-item-change__last')
    const eVal = await page.evaluate(() => document.querySelectorAll('.tv-ticker-item-change__last')[0].innerText)
    console.log(parseFloat(eVal))
    const exit = await browser.close()
})()

Puppeteer how to retry url fetch with delay if it failed

I'm trying to write a simple web scraper using the Puppeteer library.
When I load a page by URL via page.goto, I need to retry if it fails, i.e. the response code is >= 400.
My snippet:
'use strict';

const puppeteer = require('puppeteer');

(async () => {
    const browser = await puppeteer.launch({headless: false});
    const page = await browser.newPage();
    await page.setViewport({width: 1024, height: 768});
    await page.setDefaultNavigationTimeout(0);
    await page.goto('https://google.com');
    await browser.close();
    process.exit();
})();
I need to implement a retry strategy for the URL if the response code is >= 400, with a delay between attempts equal to retryNumber * 1000 ms:
1000 ms for the first attempt;
2000 ms for the second attempt;
3000 ms for the third attempt, and so on.
The promise should be rejected if retryNumber exceeds maxRetryNumber.
Does anyone know how to implement this? Are there any ready-to-use packages or snippets that achieve this?
You can use a delay helper together with a simple for loop to execute your retries (exit the loop when the request succeeds):
'use strict';

const puppeteer = require('puppeteer');

const delay = (ms) => {
    return new Promise(resolve => setTimeout(resolve, ms));
};

(async () => {
    const browser = await puppeteer.launch({headless: false});
    const page = await browser.newPage();
    await page.setViewport({width: 1024, height: 768});
    await page.setDefaultNavigationTimeout(0);

    const maxRetryNumber = 10;
    let success = false;
    for (let retryNumber = 1; retryNumber <= maxRetryNumber; retryNumber++) {
        const response = await page.goto('https://google.com');
        if (response.status() < 400) {
            success = true;
            break;
        }
        await delay(1000 * retryNumber);
    }
    if (!success) {
        // do something
    }
    await browser.close();
    process.exit();
})();
Source of delay function.
var maxRetryNumber = 10;
var retryNumber = 0;

scrape();

async function scrape() {
    retryNumber++;
    if (retryNumber >= maxRetryNumber) {
        console.log('retryNumber exceeded maxRetryNumber!');
        return;
    }
    try {
        const browser = await puppeteer.launch({headless: false});
        const page = await browser.newPage();
        await page.setViewport({width: 1024, height: 768});
        await page.setDefaultNavigationTimeout(0);
        await page.waitFor(retryNumber * 1000); // deprecated in newer Puppeteer; use page.waitForTimeout
        let response = await page.goto('https://google.com');
        await browser.close();
        if (response.status() >= 400) {
            scrape();
        } else {
            console.log('ALL OK');
        }
    } catch (e) {
        scrape();
    }
}
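Neither snippet rejects the promise once retries are exhausted, which the question asked for; a small helper that does might look like this sketch (retryGoto is a hypothetical name, not from either answer):
// Hypothetical helper: retries page.goto with a growing delay and rejects when exhausted.
async function retryGoto(page, url, maxRetryNumber) {
    for (let retryNumber = 1; retryNumber <= maxRetryNumber; retryNumber++) {
        const response = await page.goto(url);
        if (response !== null && response.status() < 400) {
            return response; // success: resolve with the response
        }
        // delay grows with the attempt number: 1000 ms, 2000 ms, 3000 ms, ...
        await new Promise(resolve => setTimeout(resolve, 1000 * retryNumber));
    }
    throw new Error(`retryNumber exceeded maxRetryNumber (${maxRetryNumber}) for ${url}`);
}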

Puppeteer doesn't close browser

I'm running Puppeteer on Express/Node/Ubuntu as follows:
var puppeteer = require('puppeteer');
var express = require('express');
var router = express.Router();

/* GET home page. */
router.get('/', function(req, res, next) {
    (async () => {
        headless = true;
        const browser = await puppeteer.launch({headless: true, args: ['--no-sandbox']});
        const page = await browser.newPage();
        url = req.query.url;
        await page.goto(url);
        let bodyHTML = await page.evaluate(() => document.body.innerHTML);
        res.send(bodyHTML);
        await browser.close();
    })();
});
Running this script multiple times leaves hundreds of zombie processes:
$ pgrep chrome | wc -l
133
which clogs the server.
How do I fix this?
Could running kill from the Express script solve it?
Is there a better way to get the same result other than Puppeteer and headless Chrome?
Ahhh! This is a simple oversight. What if an error occurs and your await browser.close() never executes, leaving you with zombies?
Using shell.js seems like a hacky way of solving this issue.
The better practice is to use try..catch..finally, the reason being that you want the browser to be closed irrespective of a happy flow or an error being thrown.
And unlike the other code snippet, you don't have to try to close the browser in both the try block and the finally block. The finally block is always executed, whether or not an error is thrown.
So, your code should look like this:
const puppeteer = require('puppeteer');
const express = require('express');
const router = express.Router();

/* GET home page. */
router.get('/', function(req, res, next) {
    (async () => {
        const browser = await puppeteer.launch({
            headless: true,
            args: ['--no-sandbox'],
        });
        try {
            const page = await browser.newPage();
            url = req.query.url;
            await page.goto(url);
            const bodyHTML = await page.evaluate(() => document.body.innerHTML);
            res.send(bodyHTML);
        } catch (e) {
            console.log(e);
        } finally {
            await browser.close();
        }
    })();
});
Hope this helps!
Wrap your code in a try-catch like this and see if it helps:
headless = true;
const browser = await puppeteer.launch({headless: true, args: ['--no-sandbox']});
try {
    const page = await browser.newPage();
    url = req.query.url;
    await page.goto(url);
    let bodyHTML = await page.evaluate(() => document.body.innerHTML);
    res.send(bodyHTML);
    await browser.close();
} catch (error) {
    console.log(error);
} finally {
    await browser.close();
}
From my experience, the browser-closing process may take some time after close is called. Anyway, you can check the browser's process property to see whether it is still running and force-kill it:
if (browser && browser.process() != null) browser.process().kill('SIGINT');
I'm also posting the full code of my Puppeteer resource manager below. Take a look at bw.on('disconnected', ...):
const puppeteer = require('puppeteer-extra')
const randomUseragent = require('random-useragent');
const StealthPlugin = require('puppeteer-extra-plugin-stealth')

const USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.75 Safari/537.36';

puppeteer.use(StealthPlugin())

function ResourceManager(loadImages) {
    let browser = null;
    const _this = this;
    let retries = 0;
    let isReleased = false;

    this.init = async () => {
        isReleased = false;
        retries = 0;
        browser = await runBrowser();
    };

    this.release = async () => {
        isReleased = true;
        if (browser) await browser.close();
    }

    this.createPage = async (url) => {
        if (!browser) browser = await runBrowser();
        return await createPage(browser, url);
    }

    async function runBrowser() {
        const bw = await puppeteer.launch({
            headless: true,
            devtools: false,
            ignoreHTTPSErrors: true,
            slowMo: 0,
            args: ['--disable-gpu', '--no-sandbox', '--no-zygote', '--disable-setuid-sandbox', '--disable-accelerated-2d-canvas', '--disable-dev-shm-usage', "--proxy-server='direct://'", "--proxy-bypass-list=*"]
        });
        // restart the browser if it crashes, up to 3 times
        bw.on('disconnected', async () => {
            if (isReleased) return;
            console.log("BROWSER CRASH");
            if (retries <= 3) {
                retries += 1;
                if (browser && browser.process() != null) browser.process().kill('SIGINT');
                await _this.init();
            } else {
                throw "===================== BROWSER crashed more than 3 times";
            }
        });
        return bw;
    }

    async function createPage(browser, url) {
        const userAgent = randomUseragent.getRandom();
        const UA = userAgent || USER_AGENT;
        const page = await browser.newPage();
        await page.setViewport({
            width: 1920 + Math.floor(Math.random() * 100),
            height: 3000 + Math.floor(Math.random() * 100),
            deviceScaleFactor: 1,
            hasTouch: false,
            isLandscape: false,
            isMobile: false,
        });
        await page.setUserAgent(UA);
        await page.setJavaScriptEnabled(true);
        await page.setDefaultNavigationTimeout(0);
        if (!loadImages) {
            await page.setRequestInterception(true);
            page.on('request', (req) => {
                if (req.resourceType() == 'stylesheet' || req.resourceType() == 'font' || req.resourceType() == 'image') {
                    req.abort();
                } else {
                    req.continue();
                }
            });
        }
        await page.evaluateOnNewDocument(() => {
            // pass webdriver check
            Object.defineProperty(navigator, 'webdriver', {
                get: () => false,
            });
        });
        await page.evaluateOnNewDocument(() => {
            // pass chrome check
            window.chrome = {
                runtime: {},
                // etc.
            };
        });
        await page.evaluateOnNewDocument(() => {
            // pass permissions check
            const originalQuery = window.navigator.permissions.query;
            return window.navigator.permissions.query = (parameters) => (
                parameters.name === 'notifications' ?
                    Promise.resolve({ state: Notification.permission }) :
                    originalQuery(parameters)
            );
        });
        await page.evaluateOnNewDocument(() => {
            // Overwrite the `plugins` property to use a custom getter.
            Object.defineProperty(navigator, 'plugins', {
                // This just needs to have `length > 0` for the current test,
                // but we could mock the plugins too if necessary.
                get: () => [1, 2, 3, 4, 5],
            });
        });
        await page.evaluateOnNewDocument(() => {
            // Overwrite the `languages` property to use a custom getter.
            Object.defineProperty(navigator, 'languages', {
                get: () => ['en-US', 'en'],
            });
        });
        await page.goto(url, { waitUntil: 'networkidle2', timeout: 0 });
        return page;
    }
}

module.exports = { ResourceManager }
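Usage might look like this sketch (the require path is hypothetical):
// Hypothetical usage of the ResourceManager above.
const { ResourceManager } = require('./resource-manager');

(async () => {
    const manager = new ResourceManager(false); // false = block stylesheets/fonts/images
    await manager.init();
    const page = await manager.createPage('https://example.com');
    console.log(await page.title());
    await manager.release(); // closes the browser and disables the crash-restart handler
})();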
I solved it with https://www.npmjs.com/package/shelljs:
var shell = require('shelljs');
shell.exec('pkill chrome');
Try closing the browser before sending the response:
var puppeteer = require('puppeteer');
var express = require('express');
var router = express.Router();

router.get('/', function(req, res, next) {
    (async () => {
        headless = true;
        const browser = await puppeteer.launch({headless: true});
        const page = await browser.newPage();
        url = req.query.url;
        await page.goto(url);
        let bodyHTML = await page.evaluate(() => document.body.innerHTML);
        await browser.close();
        res.send(bodyHTML);
    })();
});
I ran into the same issue, and while your shelljs solution does work, it kills all Chrome processes, which might interrupt one that is still handling a request. Here is a better solution that should work:
var puppeteer = require('puppeteer');
var express = require('express');
var router = express.Router();

router.get('/', function (req, res, next) {
    (async () => {
        await puppeteer.launch({ headless: true }).then(async browser => {
            const page = await browser.newPage();
            url = req.query.url;
            await page.goto(url);
            let bodyHTML = await page.evaluate(() => document.body.innerHTML);
            await browser.close();
            res.send(bodyHTML);
        });
    })();
});
Use
(await browser).close()
This happens when the browser variable holds a promise that you have to resolve first. I suffered a lot because of this; I hope it helps.
I use the following basic setup for running Puppeteer:
const puppeteer = require("puppeteer");

let browser;
(async () => {
    browser = await puppeteer.launch();
    const [page] = await browser.pages();
    /* use the page */
})()
    .catch(err => console.error(err))
    .finally(() => browser?.close());
Here, the finally block guarantees the browser will close correctly regardless of whether an error was thrown. Errors are logged (if desired). I like .catch and .finally as chained calls because the mainline Puppeteer code is one level flatter, but this accomplishes the same thing:
const puppeteer = require("puppeteer");

(async () => {
    let browser;
    try {
        browser = await puppeteer.launch();
        const [page] = await browser.pages();
        /* use the page */
    }
    catch (err) {
        console.error(err);
    }
    finally {
        await browser?.close();
    }
})();
There's no reason to call newPage because Puppeteer starts with a page open.
As for Express, you need only place the entire code above, including let browser; and excluding require("puppeteer"), into your route, and you're good to go, although you might want to use an async middleware error handler.
You ask:
Is there a better way to get the same result other than puppeteer and headless chrome?
That depends on what you're doing and what you mean by "better". If your goal is to get document.body.innerHTML and the page content you're interested in is baked into the static HTML, you can dump Puppeteer entirely and just make a request to get the resource, then use Cheerio to extract the desired information.
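For example, a request-plus-Cheerio version of the route's core might look like this sketch (it assumes the axios and cheerio packages and a page whose relevant content is in the static HTML):
// Sketch: fetch static HTML and extract content without launching a browser.
const axios = require("axios");
const cheerio = require("cheerio");

async function getBodyHTML(url) {
    const { data: html } = await axios.get(url); // plain HTTP request, no Chrome process
    const $ = cheerio.load(html);
    return $("body").html(); // or target specific elements, e.g. $("h1").text()
}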
Another consideration is that you may not need to load and close a whole browser per request. If you can use one new page per request, consider the following strategy:
const express = require("express");
const puppeteer = require("puppeteer");

const asyncHandler = fn => (req, res, next) =>
    Promise.resolve(fn(req, res, next)).catch(next);

const browserReady = puppeteer.launch({
    args: ["--no-sandbox", "--disable-setuid-sandbox"]
});

const app = express();
app
    .set("port", process.env.PORT || 5000)
    .get("/", asyncHandler(async (req, res) => {
        const browser = await browserReady;
        const page = await browser.newPage();
        try {
            await page.goto(req.query.url || "http://www.example.com");
            return res.send(await page.content());
        }
        catch (err) {
            return res.status(400).send(err.message);
        }
        finally {
            await page.close();
        }
    }))
    .use((err, req, res, next) => res.sendStatus(500))
    .listen(app.get("port"), () =>
        console.log("listening on port", app.get("port"))
    );
Finally, make sure never to set any timeouts to 0 (for example, page.setDefaultNavigationTimeout(0);), which introduces the potential for the script to hang forever. If you need a generous timeout, set it to a few minutes at most, long enough not to trigger false positives.
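For instance, a bounded navigation timeout could look like:
// Two minutes instead of 0 (which disables the timeout entirely and can hang forever).
page.setDefaultNavigationTimeout(120000);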
See also:
Parallelism of Puppeteer with Express Router Node JS. How to pass page between routes while maintaining concurrency
Puppeteer unable to run on heroku

Node.js Return variable from event handler to parent function

I need the function scrape to return the value obtained in the page.on("request") event handler.
async function scrape(url) {
    const page = await browser.newPage();
    await page.setRequestInterception(true);
    page.on("request", async (request) => {
        return "fish";
    });
    await page.goto(url);
}
Currently:
const ans = await scrape(url)
console.log(ans)
"undefined'
Expected:
const ans = await scrape(url)
console.log(ans)
"fish"
You'll need to return a promise that is resolved when you see the event you are waiting for:
const matchRequest = request => request.method() === 'GET'; // your filter

function scrape(url) {
    return new Promise(async (resolve) => {
        const page = await browser.newPage();
        // not sure what your logic is, but if you don't need to cancel or modify
        // requests/responses you probably don't need interception
        // await page.setRequestInterception(true);
        page.on("response", async (response) => {
            if (matchRequest(response.request())) {
                resolve(await response.buffer());
            }
        });
        await page.goto(url);
    });
}

// inside an async function:
const body = await scrape('https://example.com');
Try something like the following:
async function scrape(url) {
    let sendRequest = [];
    const browser = await puppeteer.launch();
    const page = await browser.newPage();
    await page.setRequestInterception(true);
    page.on('request', request => {
        request.continue();
        sendRequest.push('fish');
    });
    await page.goto(url);
    return sendRequest;
}
request.continue() continues the request, with optional request overrides. To use it, request interception must be enabled with page.setRequestInterception(); an exception is thrown immediately if it is not.
