I can't handle the error that occurs if the proxy server is down. Here is the code:
const puppeteer = require('puppeteer');
const proxyChain = require('proxy-chain');
async function getPic() {
const proxiesList = [
'http://208.70.77.222:1994',
];
const oldProxyUrl = proxiesList[Math.floor(Math.random() * (proxiesList.length))];
const newProxyUrl = await proxyChain.anonymizeProxy(oldProxyUrl);
const browser = await puppeteer.launch({
headless: false,
ignoreHTTPSErrors: true,
args: [
`--proxy-server=${newProxyUrl}`,
`--ignore-certificate-errors`,
`--no-sandbox`,
`--disable-setuid-sandbox`
]
});
const page = await browser.newPage();
await page.setUserAgent('Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.43 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36 OPR/94.0.0.0');
await page.goto('https://siteURL.com/',{
waitUntil: "domcontentloaded"
});
await page.waitForSelector('input[type="search"]');
await page.type('input[type="search"]','pc programs', {delay:500} )
await page.click('button[type="submit"]');
await page.waitForSelector('.footer-wrap');
await page.evaluate(() => new Promise((resolve) => {
let scrollTop = -1;
const interval = setInterval(() => {
window.scrollBy(0, 100);
if(document.documentElement.scrollTop !== scrollTop) {
scrollTop = document.documentElement.scrollTop;
return;
}
clearInterval(interval);
resolve();
}, 500);
}));
await page.screenshot({path: 'scr.png'});
await browser.close();
console.log('1');
};
setInterval(getPic,50000);
An error is thrown: Error: net::ERR_TIMED_OUT at ...
I tried it with try-catch:
async function restartableFunction() {
try {
getPic()
} catch (error) {
if (error.message === "Error: net::ERR_TIMED_OUT") {
console.error(error);
// wait for a set amount of time before restarting the function
await new Promise(resolve => setTimeout(resolve, 5000));
// restart the function
await restartableFunction();
} else {
throw error;
}
}
}
This doesn't solve the problem. I would like to restart the function if an error occurs, so that a working proxy gets set up and the code continues to run. I will be very grateful for your advice!
restartableFunction never catches the error because getPic() is async and evaluates to a Promise.
You need to await getPic() in the try block of restartableFunction:
try {
await getPic();
}
Read more about it here https://itnext.io/error-handling-with-async-await-in-js-26c3f20bc06a
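Putting both fixes together, the wrapper might look like this (a sketch; note the substring match, since Puppeteer's actual error message usually includes the failing URL after "net::ERR_TIMED_OUT", so strict equality won't match):
async function restartableFunction() {
  try {
    await getPic();
  } catch (error) {
    // Match a substring: the real message looks like
    // "net::ERR_TIMED_OUT at https://..." rather than the bare code
    if (error.message.includes("net::ERR_TIMED_OUT")) {
      console.error(error);
      // wait before retrying, giving a chance to pick another proxy
      await new Promise(resolve => setTimeout(resolve, 5000));
      await restartableFunction();
    } else {
      throw error;
    }
  }
}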
I'm trying to get the response headers and cookies for
url= 'https://www.asr.pima.gov/Parcel/GetParcel'
I have the following code using node and puppeteer:
const browser = await puppeteer.launch({
headless: false,
executablePath: executablePath(),
});
let req;
const page = await browser.newPage();
await page.goto(url);
const finalResponse = await page.waitForResponse(response =>
response.url() === 'https://www.asr.pima.gov/Parcel/GetParcel' && response.status() === 200
);
.......
As I step through the code, there is a delay at the final line (starting at const finalResponse = ...). The error is in the title.
What am I doing wrong?
You may be overthinking it. page.goto returns the response you seem to be looking for:
const puppeteer = require("puppeteer"); // ^19.1.0
let browser;
(async () => {
browser = await puppeteer.launch();
const [page] = await browser.pages();
const url = "https://www.asr.pima.gov/Parcel/GetParcel";
const finalResponse = await page.goto(url);
console.log(finalResponse.headers());
console.log(await page.cookies());
})()
.catch(err => console.error(err))
.finally(() => browser?.close());
Now, let's say you're looking for some other response that's kicked off by the initial page load. A way to do it is by planting the promise without await before goto, then beginning navigation.
let browser;
(async () => {
browser = await puppeteer.launch();
const [page] = await browser.pages();
const url = "https://www.asr.pima.gov/Parcel/GetParcel";
const finalResponseP = page.waitForResponse(res => res.url() === url);
await page.goto(url, {waitUntil: "domcontentloaded"});
const finalResponse = await finalResponseP;
console.log(finalResponse.headers());
console.log(await page.cookies());
})()
.catch(err => console.error(err))
.finally(() => browser?.close());
This works for the base URL as shown above, but it isn't limited to that response, so it's a more general solution than the first code block. If you have multiple responses, you can try Promise.all or add a listener with page.on("response", res => {}).
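For example, a listener-based sketch (reusing page and url from the snippet above):
page.on("response", res => {
  // Log every response the navigation kicks off
  console.log(res.status(), res.url());
  // res.headers() is available here as well
});
await page.goto(url, {waitUntil: "domcontentloaded"});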
Based on the follow-up, it sounds like you were expecting cookies to be set on this request, but I don't see any on a visit to the page in my browser.
If you're looking for the cookie that's returned after you enter a search, you can access it as follows:
let browser;
(async () => {
browser = await puppeteer.launch();
const [page] = await browser.pages();
const ua =
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.181 Safari/537.36";
await page.setUserAgent(ua);
const url = "https://www.asr.pima.gov/Parcel/GetParcel";
const finalResponse = await page.goto(url, {waitUntil: "domcontentloaded"});
console.log(finalResponse.headers());
await page.type("#parcel", "123-45-678A");
await Promise.all([
page.waitForResponse(res => res.url().includes("GetSearchResults")),
page.click('[type="submit"]'),
]);
console.log(await page.cookies());
})()
.catch(err => console.error(err))
.finally(() => browser?.close());
I always get an error while trying to scrape something from coinfarm.online. I want the last price. When I try it in the console inside the browser it works perfectly, but with this script I always get an error or null.
const puppeteer = require("puppeteer");
(async () => {
const browser = await puppeteer.launch({ headless: true });
const page = await browser.newPage();
await page.goto("https://coinfarm.online", { waitUntil: "load", timeout: 0 });
const example = await page.evaluate(
() => document.querySelector("#xbt_last").innerText
);
console.log("Price: " + example);
await browser.close();
})();
I also tried it with XPath, but that didn't work either.
I've made this for you
const puppeteer = require('puppeteer')
const uaString = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3882.0 Safari/537.36'

;(async () => {
  const browser = await puppeteer.launch({
    headless: true,
    devtools: false
  })
  const [page] = await browser.pages()
  page.setDefaultNavigationTimeout(0)
  await page.setUserAgent(uaString)
  await page.setRequestInterception(true)
  // Skip heavy resources that aren't needed to read the ticker text
  page.on('request', request => {
    if (['image', 'font', 'media'].includes(request.resourceType())) {
      request.abort()
    } else {
      request.continue()
    }
  })
  await page.goto('https://s.tradingview.com/embed-widget/tickers/?locale=en#%7B%22symbols%22%3A%5B%7B%22description%22%3A%22BitMex%20XBT%22%2C%22proName%22%3A%22BITMEX%3AXBTUSD%22%7D%2C%7B%22description%22%3A%22Binance%20USDT%22%2C%22proName%22%3A%22BINANCE%3ABTCUSDT%22%7D%2C%7B%22description%22%3A%22BitFinex%20USDT%22%2C%22proName%22%3A%22BITFINEX%3ABTCUSD%22%7D%2C%7B%22description%22%3A%22BitFlyer%20JPY%22%2C%22proName%22%3A%22BITFLYER%3ABTCJPY%22%7D%5D%2C%22width%22%3A%22100%25%22%2C%22height%22%3A72%2C%22utm_source%22%3A%22coinfarm.online%22%2C%22utm_medium%22%3A%22widget%22%2C%22utm_campaign%22%3A%22tickers%22%7D', {timeout: 0, waitUntil: 'networkidle0'})
  await page.waitForSelector('.tv-ticker-item-change__last')
  const eVal = await page.evaluate(() => document.querySelectorAll('.tv-ticker-item-change__last')[0].innerText)
  console.log(parseFloat(eVal))
  await browser.close()
})()
I'm running puppeteer on express/node/ubuntu as follow:
var puppeteer = require('puppeteer');
var express = require('express');
var router = express.Router();
/* GET home page. */
router.get('/', function(req, res, next) {
(async () => {
headless = true;
const browser = await puppeteer.launch({headless: true, args:['--no-sandbox']});
const page = await browser.newPage();
url = req.query.url;
await page.goto(url);
let bodyHTML = await page.evaluate(() => document.body.innerHTML);
res.send(bodyHTML)
await browser.close();
})();
});
Running this script multiple times leaves hundreds of zombie processes:
$ pgrep chrome | wc -l
133
which clogs the server.
How do I fix this?
Could running kill from an Express JS script solve it?
Is there a better way to get the same result other than puppeteer and headless chrome?
Ahhh! This is a simple oversight: if an error occurs, your await browser.close() never executes, leaving you with zombies.
Using shelljs seems like a hacky way of solving this issue.
The better practice is to use try..catch..finally, because you want the browser to be closed regardless of whether the flow succeeds or an error is thrown.
And unlike the other code snippet, you don't have to close the browser in both the catch block and the finally block; the finally block is always executed, whether or not an error is thrown.
So, your code should look like,
const puppeteer = require('puppeteer');
const express = require('express');
const router = express.Router();
/* GET home page. */
router.get('/', function(req, res, next) {
(async () => {
const browser = await puppeteer.launch({
headless: true,
args: ['--no-sandbox'],
});
try {
const page = await browser.newPage();
const url = req.query.url;
await page.goto(url);
const bodyHTML = await page.evaluate(() => document.body.innerHTML);
res.send(bodyHTML);
} catch (e) {
console.log(e);
} finally {
await browser.close();
}
})();
});
Hope this helps!
wrap your code in try-catch like this and see if it helps
const browser = await puppeteer.launch({headless: true, args: ['--no-sandbox']});
try {
const page = await browser.newPage();
const url = req.query.url;
await page.goto(url);
let bodyHTML = await page.evaluate(() => document.body.innerHTML);
res.send(bodyHTML);
await browser.close();
} catch (error) {
console.log(error);
} finally {
await browser.close();
}
In my experience, the browser process may take some time to exit after close is called. In any case, you can check the browser's process property to see whether it's still running and force kill it.
if (browser && browser.process() != null) browser.process().kill('SIGINT');
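Combined with a graceful close, that check might be used like this (a sketch):
try {
  await browser.close();
} finally {
  // Force kill if the Chromium child process is still around after close()
  if (browser && browser.process() != null) {
    browser.process().kill('SIGINT');
  }
}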
I'm also posting the full code of my Puppeteer resource manager below. Take a look at the bw.on('disconnected', ...) handler.
const puppeteer = require('puppeteer-extra')
const randomUseragent = require('random-useragent');
const StealthPlugin = require('puppeteer-extra-plugin-stealth')
const USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.75 Safari/537.36';
puppeteer.use(StealthPlugin())
function ResourceManager(loadImages) {
let browser = null;
const _this = this;
let retries = 0;
let isReleased = false;
this.init = async () => {
isReleased = false;
retries = 0;
browser = await runBrowser();
};
this.release = async () => {
isReleased = true;
if (browser) await browser.close();
}
this.createPage = async (url) => {
if (!browser) browser = await runBrowser();
return await createPage(browser,url);
}
async function runBrowser () {
const bw = await puppeteer.launch({
headless: true,
devtools: false,
ignoreHTTPSErrors: true,
slowMo: 0,
args: ['--disable-gpu','--no-sandbox','--no-zygote','--disable-setuid-sandbox','--disable-accelerated-2d-canvas','--disable-dev-shm-usage', "--proxy-server='direct://'", "--proxy-bypass-list=*"]
});
bw.on('disconnected', async () => {
if (isReleased) return;
console.log("BROWSER CRASH");
if (retries <= 3) {
retries += 1;
if (browser && browser.process() != null) browser.process().kill('SIGINT');
await _this.init();
} else {
throw "===================== BROWSER crashed more than 3 times";
}
});
return bw;
}
async function createPage (browser,url) {
const userAgent = randomUseragent.getRandom();
const UA = userAgent || USER_AGENT;
const page = await browser.newPage();
await page.setViewport({
width: 1920 + Math.floor(Math.random() * 100),
height: 3000 + Math.floor(Math.random() * 100),
deviceScaleFactor: 1,
hasTouch: false,
isLandscape: false,
isMobile: false,
});
await page.setUserAgent(UA);
await page.setJavaScriptEnabled(true);
await page.setDefaultNavigationTimeout(0);
if (!loadImages) {
await page.setRequestInterception(true);
page.on('request', (req) => {
if(req.resourceType() == 'stylesheet' || req.resourceType() == 'font' || req.resourceType() == 'image'){
req.abort();
} else {
req.continue();
}
});
}
await page.evaluateOnNewDocument(() => {
//pass webdriver check
Object.defineProperty(navigator, 'webdriver', {
get: () => false,
});
});
await page.evaluateOnNewDocument(() => {
//pass chrome check
window.chrome = {
runtime: {},
// etc.
};
});
await page.evaluateOnNewDocument(() => {
//pass plugins check
const originalQuery = window.navigator.permissions.query;
return window.navigator.permissions.query = (parameters) => (
parameters.name === 'notifications' ?
Promise.resolve({ state: Notification.permission }) :
originalQuery(parameters)
);
});
await page.evaluateOnNewDocument(() => {
// Overwrite the `plugins` property to use a custom getter.
Object.defineProperty(navigator, 'plugins', {
// This just needs to have `length > 0` for the current test,
// but we could mock the plugins too if necessary.
get: () => [1, 2, 3, 4, 5],
});
});
await page.evaluateOnNewDocument(() => {
// Overwrite the `plugins` property to use a custom getter.
Object.defineProperty(navigator, 'languages', {
get: () => ['en-US', 'en'],
});
});
await page.goto(url, { waitUntil: 'networkidle2',timeout: 0 } );
return page;
}
}
module.exports = {ResourceManager}
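Usage might look like this (a sketch; the require path is hypothetical):
const { ResourceManager } = require('./resource-manager'); // hypothetical path

(async () => {
  const rm = new ResourceManager(false); // false = don't load images
  await rm.init();
  const page = await rm.createPage('https://www.example.com');
  // ...scrape the page here...
  await rm.release();
})();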
I solved it with https://www.npmjs.com/package/shelljs:
var shell = require('shelljs');
shell.exec('pkill chrome')
try to close the browser before sending the response
var puppeteer = require('puppeteer');
var express = require('express');
var router = express.Router();
router.get('/', function(req, res, next) {
(async () => {
const browser = await puppeteer.launch({headless: true});
const page = await browser.newPage();
const url = req.query.url;
await page.goto(url);
let bodyHTML = await page.evaluate(() => document.body.innerHTML);
await browser.close();
res.send(bodyHTML);
})();
});
I ran into the same issue and while your shelljs solution did work, it kills all chrome processes, which might interrupt one that is still processing a request. Here is a better solution that should work.
var puppeteer = require('puppeteer');
var express = require('express');
var router = express.Router();
router.get('/', function (req, res, next) {
(async () => {
await puppeteer.launch({ headless: true }).then(async browser => {
const page = await browser.newPage();
const url = req.query.url;
await page.goto(url);
let bodyHTML = await page.evaluate(() => document.body.innerHTML);
await browser.close();
res.send(bodyHTML);
});
})();
});
use
(await browser).close()
That happens because what browser contains is a promise, so you have to resolve it first. I suffered a lot over this; I hope it helps.
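In other words, this applies when you stored the promise from launch() instead of awaiting it, e.g. (a sketch):
const puppeteer = require('puppeteer');

(async () => {
  // `browser` holds a promise because launch() was not awaited
  const browser = puppeteer.launch({ headless: true });
  // ...use (await browser) wherever a Browser instance is needed...
  // resolve the promise before calling close()
  (await browser).close();
})();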
I use the following basic setup for running Puppeteer:
const puppeteer = require("puppeteer");
let browser;
(async () => {
browser = await puppeteer.launch();
const [page] = await browser.pages();
/* use the page */
})()
.catch(err => console.error(err))
.finally(() => browser?.close())
;
Here, the finally block guarantees the browser will close correctly regardless of whether an error was thrown. Errors are logged (if desired). I like .catch and .finally as chained calls because the mainline Puppeteer code is one level flatter, but this accomplishes the same thing:
const puppeteer = require("puppeteer");
(async () => {
let browser;
try {
browser = await puppeteer.launch();
const [page] = await browser.pages();
/* use the page */
}
catch (err) {
console.error(err);
}
finally {
await browser?.close();
}
})();
There's no reason to call newPage because Puppeteer starts with a page open.
As for Express, you need only place the entire code above, including let browser; and excluding require("puppeteer"), into your route, and you're good to go, although you might want to use an async middleware error handler.
You ask:
Is there a better way to get the same result other than puppeteer and headless chrome?
That depends on what you're doing and what you mean by "better". If your goal is to get document.body.innerHTML and the page content you're interested in is baked into the static HTML, you can dump Puppeteer entirely and just make a request to get the resource, then use Cheerio to extract the desired information.
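A minimal sketch of that approach (axios here is illustrative; any HTTP client works):
const axios = require("axios");
const cheerio = require("cheerio");

(async () => {
  const { data: html } = await axios.get("http://www.example.com");
  const $ = cheerio.load(html);
  console.log($("body").html()); // extract what you need with selectors
})();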
Another consideration is that you may not need to load and close a whole browser per request. If you can use one new page per request, consider the following strategy:
const express = require("express");
const puppeteer = require("puppeteer");
const asyncHandler = fn => (req, res, next) =>
Promise.resolve(fn(req, res, next)).catch(next)
;
const browserReady = puppeteer.launch({
args: ["--no-sandbox", "--disable-setuid-sandbox"]
});
const app = express();
app
.set("port", process.env.PORT || 5000)
.get("/", asyncHandler(async (req, res) => {
const browser = await browserReady;
const page = await browser.newPage();
try {
await page.goto(req.query.url || "http://www.example.com");
return res.send(await page.content());
}
catch (err) {
return res.status(400).send(err.message);
}
finally {
await page.close();
}
}))
.use((err, req, res, next) => res.sendStatus(500))
.listen(app.get("port"), () =>
console.log("listening on port", app.get("port"))
)
;
Finally, make sure to never set any timeouts to 0 (for example, page.setDefaultNavigationTimeout(0);), which introduces the potential for the script to hang forever. If you need a generous timeout, at most set it to a few minutes, long enough not to trigger false positives.
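For example (the exact value is up to you):
// A generous but finite navigation timeout (3 minutes) instead of 0
page.setDefaultNavigationTimeout(3 * 60 * 1000);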
See also:
Parallelism of Puppeteer with Express Router Node JS. How to pass page between routes while maintaining concurrency
Puppeteer unable to run on heroku
I'm new to working with async programming, so there may be something simple I'm missing out on here.
I have an express project, I'm passing an array in the body of my request.
Inside my function, I validate the body then parse the array and use a promise as I map over the array.
const games = JSON.parse(JSON.stringify(req.body.games));
const gamesMap = games.map((game) => gameSearch(game));
return Promise.all(gamesMap)
.then(function(g) {
// async is still running here, I want to wait until it returns
console.log(g); // returns [ undefined, undefined, ... ]
});
The gameSearch function uses Puppeteer to drive a headless browser and return prices for each game passed in the array. However, Promise.all doesn't wait for the records, so the console.log(g); above logs an array of undefined. I assume it has something to do with using async/await inside the gameSearch function, although I'm not sure what I'm supposed to do here. Any help would be greatly appreciated.
function gameSearch(game) {
(async () => {
const url = '.....' + game;
try {
const browser = await puppeteer.launch();
const page = await browser.newPage();
await page.setUserAgent('Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36');
await page.goto(url);
const selector = '.searchRcrd';
await page.waitForSelector(selector);
const searchRcrds = await page.$$(selector);
const records = [];
for (let i = 0; i < searchRcrds.length; i++) {
const searchRcrd = searchRcrds[i];
const title = await searchRcrd.$eval('h1', (h1) => h1.innerText.trim());
const buyFor = await searchRcrd.$eval('.desc .prodPrice div:nth-child(2) .priceTxt:nth-child(1)', (buy) => buy.innerText.trim());
const inStoreFor = await searchRcrd.$eval('.desc .priceTxt:nth-child(2)', (inStore) => inStore.innerText.trim());
const imgSrc = await searchRcrd.$eval('div.thumb > a > img', (img) => img.src.trim());
records.push({
'title': title,
'buyFor': buyFor,
'inStoreFor': inStoreFor,
'imgSrc': imgSrc
});
}
await browser.close();
return records;
} catch (err) {
next(err);
}
})();
}
The return records statement returns from the inner (async () => {…})(); IIFE, not from gameSearch, so gameSearch itself returns undefined. Drop the IIFE and make gameSearch an async function that returns (a promise for) the array.
async function gameSearch(game) {
const url = '.....' + game;
const browser = await puppeteer.launch();
const page = await browser.newPage();
await page.setUserAgent('Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36');
await page.goto(url);
const selector = '.searchRcrd';
await page.waitForSelector(selector);
const searchRcrds = await page.$$(selector);
const records = [];
for (let i = 0; i < searchRcrds.length; i++) {
const searchRcrd = searchRcrds[i];
const title = await searchRcrd.$eval('h1', (h1) => h1.innerText.trim());
const buyFor = await searchRcrd.$eval('.desc .prodPrice div:nth-child(2) .priceTxt:nth-child(1)', (buy) => buy.innerText.trim());
const inStoreFor = await searchRcrd.$eval('.desc .priceTxt:nth-child(2)', (inStore) => inStore.innerText.trim());
const imgSrc = await searchRcrd.$eval('div.thumb > a > img', (img) => img.src.trim());
records.push({
'title': title,
'buyFor': buyFor,
'inStoreFor': inStoreFor,
'imgSrc': imgSrc
});
}
await browser.close();
return records;
}
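With that change, the call site from the question receives the actual arrays:
const games = JSON.parse(JSON.stringify(req.body.games));
const gamesMap = games.map((game) => gameSearch(game));
return Promise.all(gamesMap)
  .then(function(g) {
    console.log(g); // [[{title, buyFor, inStoreFor, imgSrc}, ...], ...]
  });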