Here is the code in my data scraping file:
const puppeteer = require('puppeteer');
const db = require('../db');
const Job = require('../models/job');
// Question snippet: launches a visible browser, opens a LinkedIn job search,
// and tries to log every job posting after scrolling to the bottom of the page.
(async() => {
try {
const browser = await puppeteer.launch({
headless: false,
defaultViewport: null,
// args: ['--no-zygote', '--no-sandbox']
});
const url = 'https://www.linkedin.com/jobs/search?keywords=Junior%20Software%20Developer&location=Indianapolis%2C%20IN&geoId=&trk=homepage-jobseeker_jobs-search-bar_search-submit&position=1&pageNum=0';
// Open browser instance
// NOTE(review): newPage() is presumably not the place for `waitUntil`; it likely
// belongs on page.goto() below - confirm against the Puppeteer docs.
const page = await browser.newPage({
waitUntil: 'networkidle0'
});
console.log(`Navigating to ${url}`);
await page.goto(url);
// Scroll to bottom of page, click on 'See More Jobs' and repeat
let lastHeight = await page.evaluate('document.body.scrollHeight');
const scroll = async() => {
while (true) {
await page.evaluate('window.scrollTo(0, document.body.scrollHeight)');
await page.waitForTimeout(2000);
let newHeight = await page.evaluate('document.body.scrollHeight');
// Stop once a scroll no longer changes the document height.
if (newHeight === lastHeight) {
console.log('Done scrolling!');
break;
}
lastHeight = newHeight;
// Not awaited: the loop continues without waiting for the click to complete.
seeMoreJobs();
}
// Logs the array captured by the `data` expression below; the page is not read again here.
console.log(data);
}
// Click on 'See More Jobs'
const seeMoreJobs = async() => {
await page.evaluate(() => {
document.querySelector('button[data-tracking-control-name="infinite-scroller_show-more"]').click();
});
}
// Collect data
// This evaluate is awaited here, before scroll() is called further down, so it
// only sees the postings that are in the document at this point.
const data = await page.evaluate(() => {
const allJobsArr = Array.from(document.querySelectorAll('a[data-tracking-control-name="public_jobs_jserp-result_search-card"]'));
const namesAndUrls = allJobsArr.map(job => {
return {
name: job.innerText,
url: job.href,
path: job.pathname
}
});
return namesAndUrls;
});
// Not awaited, so a rejection inside scroll() is not handled by the surrounding try/catch.
scroll();
} catch (err) {
console.log(err);
}
})();
So the above code is designed to navigate to the variable url and then to scroll until the scroll function "breaks"/finishes, i.e., to the very bottom of the page. Once these actions have finished, I want to then log some data in the form of an array with three properties from each job posting: name, href, and path. When I run the IIFE as shown I am able to grab the first 24-25 job postings with my data function, which are the first to be displayed on page load (before any of the scrolling takes place).
For whatever reason, this data function is unable to evaluate the entire page or document after all the scrolling has occurred.
I have tried various things and have really analyzed what the code is doing, but alas, I am at a loss for a solution. My end goal here is to comb through every job posting that has displayed with my scrolling function and then to log everything (not just the first 24-25 results) returned with the desired data properties to the console.
Thanks, all.
OK, I have now figured out why it was only pulling out the first 25 results. I originally believed it was a problem of scope, along the lines I outlined in the question, with the two pieces of code looking at two different instances of the 'page'. That is not quite accurate: both snippets use the same 'page' object. The real issue is ordering. In the original code the 'data' expression was awaited before scroll() was ever called (and scroll() itself was not awaited), so the page was evaluated while only the initial postings were loaded. I ended up housing the 'data' function expression within the scroll() function, after the loop, so that the page is evaluated only once the scrolling has finished. If someone would like to articulate this better, that would be awesome. Here is the simple solution to the simple problem that I was having. Thanks.
const puppeteer = require('puppeteer');
const db = require('../db');
const Job = require('../models/job');
/**
 * Decides whether a scraped posting is a junior-level job worth keeping.
 *
 * A posting qualifies when its title mentions one of the junior keywords AND it
 * has both a URL and a path. (The original expression relied on `||` / `&&`
 * precedence, so the URL/path check only applied to the 'Entry' keyword.)
 *
 * @param {{name: string, url: string, path: string}} job - one scraped posting.
 * @returns {boolean} true when the posting should be kept.
 */
function isJuniorJob(job) {
  const juniorKeywords = ['Junior', 'Jr', 'Entry'];
  const mentionsJunior = juniorKeywords.some((keyword) => job.name.includes(keyword));
  return mentionsJunior && Boolean(job.url) && Boolean(job.path);
}

// Opens the LinkedIn job search, scrolls until no more postings load, then logs
// the junior postings. The browser is closed even when a step fails.
(async () => {
  const SHOW_MORE_SELECTOR = 'button[data-tracking-control-name="infinite-scroller_show-more"]';
  const JOB_LINK_SELECTOR = 'a[data-tracking-control-name="public_jobs_jserp-result_search-card"]';
  const SCROLL_PAUSE_MS = 2000;
  let browser;
  try {
    browser = await puppeteer.launch({
      headless: false,
      defaultViewport: null,
      // args: ['--no-zygote', '--no-sandbox']
    });
    const url = 'https://www.linkedin.com/jobs/search?keywords=Junior%20Software%20Developer&location=Indianapolis%2C%20IN&geoId=&trk=homepage-jobseeker_jobs-search-bar_search-submit&position=1&pageNum=0';
    // Open browser instance
    const page = await browser.newPage();
    console.log(`Navigating to ${url}`);
    // `waitUntil` is a navigation option, so it is passed to goto() rather than newPage().
    await page.goto(url, { waitUntil: 'networkidle0' });

    // Click on 'See More Jobs' when the button is present; do nothing otherwise.
    const seeMoreJobs = async () => {
      await page.evaluate((selector) => {
        const button = document.querySelector(selector);
        if (button) {
          button.click();
        }
      }, SHOW_MORE_SELECTOR);
    };

    // Scroll to bottom of page, click on 'See More Jobs' and repeat, then scrape.
    const scroll = async () => {
      let lastHeight = await page.evaluate('document.body.scrollHeight');
      while (true) {
        await page.evaluate('window.scrollTo(0, document.body.scrollHeight)');
        // Plain timer instead of page.waitForTimeout(), which newer Puppeteer releases no longer provide.
        await new Promise((resolve) => setTimeout(resolve, SCROLL_PAUSE_MS));
        const newHeight = await page.evaluate('document.body.scrollHeight');
        if (newHeight === lastHeight) {
          break;
        }
        lastHeight = newHeight;
        // Awaited so the click has been dispatched before the next scroll.
        await seeMoreJobs();
      }
      // Scrape all job links only after scrolling has finished.
      const namesAndUrls = await page.evaluate((selector) => {
        return Array.from(document.querySelectorAll(selector), (job) => ({
          name: job.innerText,
          url: job.href,
          path: job.pathname,
        }));
      }, JOB_LINK_SELECTOR);
      // Filtering happens on the Node side so it can share isJuniorJob().
      const data = namesAndUrls.filter(isJuniorJob);
      console.log(data);
    };

    // Awaited so any failure inside scroll() reaches the catch below.
    await scroll();
  } catch (err) {
    console.log(err);
  } finally {
    // `browser` is still undefined when launch() itself failed.
    await browser?.close();
  }
})();
I would like to know how to scrape data located in nested pages. Here's an example I tried to build but couldn't get to work. The idea is to go to https://dev.to/, click the question and grab its title, then go back and repeat the process for the next question.
const puppeteer = require("puppeteer");
// Question snippet: opens dev.to, collects the story links and, for each one,
// tries to click it and navigate back.
(async () => {
const browser = await puppeteer.launch();
const page = await browser.newPage();
await page.goto("https://dev.to/");
try {
// Element handles for every link matching the selector at this moment.
const selectors = await page.$$(".crayons-story > a");
for (const post of selectors) {
// All three calls are started together; none of them waits for another to finish first.
await Promise.all([
page.waitForNavigation(),
post.click(),
page.goBack(),
]);
}
} catch (error) {
console.log(error);
} finally {
// Runs whether or not the loop threw; the call itself is not awaited.
browser.close();
}
})();
When I run this code, I get
Error: Node is either not visible or not an HTMLElement
Edit: The code is missing the piece that grabs the title, but it is enough to demonstrate the problem.
What is happening is that the website doesn't have that node yet at the moment the page is opened. However, Puppeteer queries the page contents immediately after navigating to it. What you need is a delay, so that the website's "script" tags can run and inject the stories.
To wait, use this following command:
await page.waitForSelector(".crayons-story > a")
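This makes sure Puppeteer waits for an element matching that selector to appear in the page (pass { visible: true } as the second argument if it must also be visible), and only then starts scraping the contents.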
So your final code should look like this:
const puppeteer = require("puppeteer");
// Visits every story linked from the dev.to front page, one at a time, and
// returns to the front page after each visit. The browser is closed even when
// a step fails.
(async () => {
  const POST_LINK_SELECTOR = ".crayons-story > a";
  let browser;
  try {
    browser = await puppeteer.launch();
    const page = await browser.newPage();
    await page.goto("https://dev.to/");
    // The stories are injected by the page's scripts, so wait for them first.
    await page.waitForSelector(POST_LINK_SELECTOR);
    const postCount = (await page.$$(POST_LINK_SELECTOR)).length;
    for (let index = 0; index < postCount; index += 1) {
      // Element handles do not survive a navigation, so look the links up
      // again on every visit to the front page.
      const posts = await page.$$(POST_LINK_SELECTOR);
      const post = posts[index];
      if (!post) {
        break;
      }
      // Start waiting for the navigation before clicking so it cannot be missed.
      await Promise.all([
        page.waitForNavigation(),
        post.click(),
      ]);
      // Only go back once the story page has finished loading.
      await page.goBack();
      await page.waitForSelector(POST_LINK_SELECTOR);
    }
  } catch (error) {
    console.log(error);
  } finally {
    // `browser` is still undefined when launch() itself failed.
    await browser?.close();
  }
})();
The problem I'm facing here is very similar to this one.
Puppeteer Execution context was destroyed, most likely because of a navigation
The best solution I could come up with is to avoid using page.goBack() and rather use page.goto() so the references are not lost.
Solution 1 (this one uses map and resolves the scrapes asynchronously, so it is much quicker than the one below):
const puppeteer = require("puppeteer");
const SELECTOR_POSTS_LINK = ".article--post__title > a";
const SELECTOR_POST_TITLE = ".article-header--title";
/**
 * Collects the article links from the Smashing Magazine index and scrapes all
 * article titles in parallel, then logs the resulting array.
 *
 * Each article is loaded in its own page: a single page can only perform one
 * navigation at a time, so parallel goto() calls on a shared page would
 * interrupt each other.
 *
 * @returns {Promise<void>} settles once the browser has been closed; errors are logged, not rethrown.
 */
async function scrape() {
  let browser;
  try {
    browser = await puppeteer.launch();
    const page = await browser.newPage();
    await page.goto("https://www.smashingmagazine.com/articles/");
    const links = await page.$$eval(SELECTOR_POSTS_LINK, (anchors) => anchors.map((anchor) => anchor.href));
    // Opens a dedicated page per link and always closes it again.
    const resolver = async (link) => {
      const articlePage = await browser.newPage();
      try {
        await articlePage.goto(link);
        const title = await articlePage.$eval(SELECTOR_POST_TITLE, (el) => el.textContent);
        return { title };
      } finally {
        await articlePage.close();
      }
    };
    // NOTE: this opens one tab per link at once; batch the links if the list is long.
    const articles = await Promise.all(links.map((link) => resolver(link)));
    console.log(articles);
  } catch (error) {
    console.log(error);
  } finally {
    // `browser` is still undefined when launch() itself failed.
    await browser?.close();
  }
}
scrape();
Solution 2 (uses for...of, so the pages are visited one at a time, which makes it much slower than the previous one):
const puppeteer = require("puppeteer");
const SELECTOR_POSTS_LINK = ".article--post__title > a";
const SELECTOR_POST_TITLE = ".article-header--title";
/**
 * Collects the article links from the Smashing Magazine index and scrapes the
 * article titles one after another on a single page, then logs the array.
 *
 * @returns {Promise<void>} settles once the browser has been closed; errors are logged, not rethrown.
 */
async function scrape() {
  let browser;
  try {
    // Launch and the first navigation are inside the try so that a failure
    // here still reaches the finally block and the browser is not leaked.
    browser = await puppeteer.launch();
    const page = await browser.newPage();
    await page.goto("https://www.smashingmagazine.com/articles/");
    const links = await page.$$eval(SELECTOR_POSTS_LINK, (anchors) => anchors.map((anchor) => anchor.href));
    const articles = [];
    // Sequential on purpose: the one shared page can only be on one URL at a time.
    for (const link of links) {
      await page.goto(link);
      const title = await page.$eval(SELECTOR_POST_TITLE, (el) => el.textContent);
      articles.push({ title });
    }
    console.log(articles);
  } catch (error) {
    console.log(error);
  } finally {
    // `browser` is still undefined when launch() itself failed.
    await browser?.close();
  }
}
scrape();
So I'm trying to use puppeteer to parse through a bunch of pages. I'm able to do so successfully, but not when I try to do multiple pages at the same time. I understand what's happening - rather than executing a block of code one at a time per row, the code is just hammering the browser with asyncs. My code looks similar to:
const MY_USER = process.env.MY_USER;
const MY_PWD = process.env.MY_PWD;
const puppeteer = require('puppeteer');
const fs = require('fs')
var results = [];
// Question snippet: logs in, then tries to screenshot one page per CSV row.
// Only transcription errors are corrected here (arrow-function syntax, the
// missing `await` on launch(), unbalanced brackets, and the misspelled
// `waitUntil` / `fullPage` option names). The `.map(async ...)` call that the
// question is about is deliberately left as asked.
(async () => {
  const browser = await puppeteer.launch({
    headless: true,
    ignoreHTTPSErrors: true,
  });
  const page = await browser.newPage();
  await page.setViewport({
    width: 1920,
    height: 2200,
  });
  //Log into my site
  await page.goto('https://example.com', { waitUntil: 'networkidle0' });
  await page.type('input[name="username"]', MY_USER);
  await page.type('input[name="password"]', MY_PWD);
  await page.click('[type="submit"]');
  //Wait for it to load...
  await page.waitForTimeout(1*2000);
  //Here is when the problems begin
  // map() invokes every async callback right away and does not wait for the
  // promises they return, so all rows drive the single `page` at the same time.
  fs.readFileSync("myCSV.csv", {
    encoding: 'utf-8'
  })
    .split('\n')
    .map(async (row) => {
      // NOTE(review): `row` is a line of text here, so row[0] is its first
      // character, not its first column - confirm what was intended.
      await captureMyPage(row[0]);
    });

  // Navigates to one page, clicks through three elements and saves a full-page screenshot.
  async function captureMyPage(thisPage)
  {
    await page.goto('https://example.com/'+thisPage, { waitUntil: 'networkidle0' });
    await page.click('thisThing');
    await page.click('thisOtherThing');
    await page.click('thisThirdThing');
    await page.screenshot({
      path: 'files/'+thisPage+'.jpg',
      fullPage: true,
    });
  }
})().catch((err) => console.error(err));
So, the code works if I do it on one page, but what i'm asking is, how do I get
await captureMyPage(row[0])
To wait until that whole function is done executing until it goes back and does it for the same row?
Thanks!
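Use a for loop instead of map. Inside map, async/await will not work the way you expect: map does not wait for the promise returned by one callback before it invokes the next one.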
// Read the CSV once, normalise line endings and drop blank lines (a trailing
// newline would otherwise produce an empty row).
const rows = fs
  .readFileSync("myCSV.csv", { encoding: 'utf-8' })
  .split(/\r?\n/)
  .map((row) => row.trim())
  .filter((row) => row.length > 0);
// for...of awaits each capture before starting the next, so the shared page
// handles one row at a time.
for (const row of rows) {
  // `row` is a string, so row[0] would only be its first character.
  // NOTE(review): assumes the page name is the first comma-separated column - confirm against the CSV.
  const [pageName] = row.split(',');
  await captureMyPage(pageName);
}
Using Puppeteer to listen for map.on('load') from within Node.
// Question snippet: tries to call the Node-side nodeLog() from inside the
// page's map 'load' handler.
(async () => {
const browser = await puppeteer.launch({ headless: false, devtools: true });
const page = await browser.newPage();
// Declared in this Node script.
function nodeLog(msg) {
console.log(msg);
}
// Registered before goto() below, so the handler is in place when the page loads.
page.on('load', async () => {
// NOTE(review): the function passed to evaluate() presumably runs in the page,
// where the nodeLog declared above would not exist - confirm.
await page.evaluate(() => {
window.map.on('load', () => {
console.log("This runs on the index.html js but I do not need that");
nodeLog("WHY IS THIS NOT WORKING??")
})
})
});
await page.goto(`file:${__dirname + '/index.html'}`);
})();
waitForSelector should work, eg. when using a selector from the readily rendered map... or listen for the map.bounds_changed or the map.idle event, which are triggered once the map is fully loaded. The map.load event might happen too soon.
Here's a working example, which I've just put together:
const puppeteer = require('puppeteer');
const url = 'https://developers-dot-devsite-v2-prod.appspot.com/maps/documentation/javascript/examples/full/map-simple';
// run() now resolves only after the map has reported 'idle' (or rejects when
// any step fails), so the handlers below fire at the end of the work.
run().then(() => {
  console.log('asynchronous execution finished.');
}).catch(error => {
  console.log(error);
});

/**
 * Opens the sample map page, registers an 'idle' listener in the page that
 * appends a marker <div>, and waits for that marker to show up.
 *
 * Every step is awaited, so the returned promise reflects the real outcome and
 * errors reach the caller's .catch().
 *
 * @returns {Promise<void>} resolves once the marker element has been found.
 */
async function run() {
  const browser = await puppeteer.launch({devtools: true, headless: false});
  const page = await browser.newPage();
  await page.goto(url);
  await page.evaluate(() => {
    window.map.addListener('idle', function(){
      console.log('the map is idle now');
      var div = document.createElement('div');
      div.setAttribute('id', 'puppeteer-map-idle');
      window.document.body.append(div);
    });
  });
  // The marker only exists after the in-page 'idle' listener has run.
  await page.waitForSelector('#puppeteer-map-idle', {
    timeout: 5000
  });
  console.log('selector #puppeteer-map-idle has been found.');
  /* in here the map should be fully loaded. */
  // The browser is left open so the DOM change can be inspected.
  // await browser.close();
}
Admittedly that's kind of a workaround, but the DOM manipulation can be observed.
I also figured out how to return information. I reread the docs and got some understanding. I was not understanding the context.
// Logs a message from the Node side. The original returned the console.log
// function itself without calling it, so nothing was printed.
const nodeLog = (msg) => console.log(msg);
// The value returned by the function given to page.evaluate() becomes `msg` on the Node side.
const msg = await page.evaluate(() => { return 'this is working' });
// Hand the returned value to the Node-side logger.
nodeLog(msg);
I'm running puppeteer on express/node/ubuntu as follows:
var puppeteer = require('puppeteer');
var express = require('express');
var router = express.Router();
/* GET home page. */
// Question snippet: launches a new headless Chrome for every request, sends
// back the body HTML of req.query.url, then closes the browser.
router.get('/', function(req, res, next) {
(async () => {
// Assigned without const/let/var, and not read again in this handler.
headless = true;
const browser = await puppeteer.launch({headless: true, args:['--no-sandbox']});
const page = await browser.newPage();
// Also assigned without a declaration.
url = req.query.url;
await page.goto(url);
let bodyHTML = await page.evaluate(() => document.body.innerHTML);
res.send(bodyHTML)
// Only reached when every await above succeeded; there is no try/finally around it.
await browser.close();
})();
});
Running this script multiple times leaves over a hundred zombie Chrome processes:
$ pgrep chrome | wc -l
133
which clogs the server.
How do I fix this?
Could running kill from an Express.js script solve it?
Is there a better way to get the same result other than puppeteer and headless chrome?
Ahhh! This is a simple oversight. If an error occurs, your await browser.close() never executes, which leaves you with zombies.
Using shell.js seems to be a hacky way of solving this issue.
The better practice is to use try..catch..finally. The reason being you would want the browser to be closed irrespective of a happy flow or an error being thrown.
And unlike the other code snippet, you don't have to try to close the browser in both the catch block and the finally block. The finally block is always executed, irrespective of whether an error is thrown or not.
So, your code should look like,
const puppeteer = require('puppeteer');
const express = require('express');
const router = express.Router();
/* GET home page. */
// Renders req.query.url in a fresh headless Chrome and responds with the page's
// body HTML. The browser is closed on success and on failure, and failures are
// forwarded to Express so the request does not hang without a response.
// NOTE: req.query.url is caller-supplied, so this route will load whatever URL
// it is given; restrict the allowed targets before exposing it publicly.
router.get('/', function(req, res, next) {
  (async () => {
    let browser;
    try {
      // Launch inside the try so a failed launch is handled like any other error.
      browser = await puppeteer.launch({
        headless: true,
        args: ['--no-sandbox'],
      });
      const page = await browser.newPage();
      // Declared locally: an undeclared `url` would be one global shared by all concurrent requests.
      const url = req.query.url;
      await page.goto(url);
      const bodyHTML = await page.evaluate(() => document.body.innerHTML);
      res.send(bodyHTML);
    } catch (e) {
      console.log(e);
      // Let Express answer the request with its error handling.
      next(e);
    } finally {
      // `browser` is still undefined when launch() itself failed.
      await browser?.close();
    }
  })();
});
Hope this helps!
wrap your code in try-catch like this and see if it helps
// Fragment for the body of the async route handler (`req` and `res` come from
// the enclosing route). The browser is closed exactly once, in `finally`.
const browser = await puppeteer.launch({headless: true, args:['--no-sandbox']});
try {
  const page = await browser.newPage();
  // Declared locally instead of being assigned as an implicit global.
  const url = req.query.url;
  await page.goto(url);
  const bodyHTML = await page.evaluate(() => document.body.innerHTML);
  res.send(bodyHTML);
} catch (error) {
  console.log(error);
  // Answer the request on failure as well, unless a response already went out.
  if (!res.headersSent) {
    res.sendStatus(500);
  }
} finally {
  // Runs on success and on error, so no second close() is needed in the try block.
  await browser.close();
}
From my experience, the browser closing process may take some time after close is called. Anyway, you can check the browser process property to check if it's still not closed and force kill it.
// Force-kill the browser's child process when one is still attached.
const browserProcess = browser ? browser.process() : null;
if (browserProcess != null) {
  browserProcess.kill('SIGINT');
}
I'm also posting the full code of my puppeteer resources manager below. Take a look at bw.on('disconnected', async () => {
const puppeteer = require('puppeteer-extra')
const randomUseragent = require('random-useragent');
const StealthPlugin = require('puppeteer-extra-plugin-stealth')
const USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.75 Safari/537.36';
puppeteer.use(StealthPlugin())
/**
 * Owns a single Puppeteer browser and hands out pre-configured pages.
 *
 * When the browser disconnects unexpectedly it is relaunched, up to
 * MAX_RESTARTS times since the last init(). Previously the crash handler called
 * init(), which reset the counter on every restart, so the limit was never
 * reached.
 *
 * Public methods:
 *   init()          - resets the crash counter and launches the browser.
 *   release()       - marks the manager as released and closes the browser.
 *   createPage(url) - launches the browser if needed and returns a page already navigated to `url`.
 *
 * @param {boolean} loadImages - when falsy, stylesheet, font and image requests
 *   are aborted on every page this manager creates.
 */
function ResourceManager(loadImages) {
  // Relaunches allowed after unexpected disconnects before giving up.
  const MAX_RESTARTS = 3;
  const BLOCKED_RESOURCE_TYPES = new Set(['stylesheet', 'font', 'image']);

  let browser = null;
  let retries = 0;
  let isReleased = false;

  this.init = async () => {
    isReleased = false;
    retries = 0;
    browser = await runBrowser();
  };

  this.release = async () => {
    // Set first so the 'disconnected' event caused by close() is not treated as a crash.
    isReleased = true;
    if (browser) await browser.close();
  };

  this.createPage = async (url) => {
    if (!browser) browser = await runBrowser();
    return await createPage(browser, url);
  };

  // Launches a browser and wires up the crash-recovery handler.
  async function runBrowser () {
    const bw = await puppeteer.launch({
      headless: true,
      devtools: false,
      ignoreHTTPSErrors: true,
      slowMo: 0,
      args: ['--disable-gpu','--no-sandbox','--no-zygote','--disable-setuid-sandbox','--disable-accelerated-2d-canvas','--disable-dev-shm-usage', "--proxy-server='direct://'", "--proxy-bypass-list=*"]
    });
    bw.on('disconnected', async () => {
      if (isReleased) return;
      console.log("BROWSER CRASH");
      if (retries >= MAX_RESTARTS) {
        // Thrown from an event handler, so this surfaces as an unhandled rejection.
        throw new Error("===================== BROWSER crashed more than 3 times");
      }
      retries += 1;
      // Make sure the process behind the instance that just disconnected is gone.
      if (bw.process() != null) bw.process().kill('SIGINT');
      // Relaunch without going through init(), which would reset `retries`.
      browser = await runBrowser();
    });
    return bw;
  }

  // Opens a page, applies viewport / user-agent / detection-evasion tweaks and navigates to `url`.
  async function createPage (browser, url) {
    const userAgent = randomUseragent.getRandom();
    const UA = userAgent || USER_AGENT;
    const page = await browser.newPage();
    // Slightly randomised viewport size.
    await page.setViewport({
      width: 1920 + Math.floor(Math.random() * 100),
      height: 3000 + Math.floor(Math.random() * 100),
      deviceScaleFactor: 1,
      hasTouch: false,
      isLandscape: false,
      isMobile: false,
    });
    await page.setUserAgent(UA);
    await page.setJavaScriptEnabled(true);
    // NOTE(review): a timeout of 0 means a stuck navigation can hang forever;
    // consider a generous finite value instead.
    await page.setDefaultNavigationTimeout(0);
    if (!loadImages) {
      await page.setRequestInterception(true);
      page.on('request', (req) => {
        if (BLOCKED_RESOURCE_TYPES.has(req.resourceType())) {
          req.abort();
        } else {
          req.continue();
        }
      });
    }
    await page.evaluateOnNewDocument(() => {
      //pass webdriver check
      Object.defineProperty(navigator, 'webdriver', {
        get: () => false,
      });
    });
    await page.evaluateOnNewDocument(() => {
      //pass chrome check
      window.chrome = {
        runtime: {},
        // etc.
      };
    });
    await page.evaluateOnNewDocument(() => {
      //pass permissions check
      const originalQuery = window.navigator.permissions.query;
      return window.navigator.permissions.query = (parameters) => (
        parameters.name === 'notifications' ?
          Promise.resolve({ state: Notification.permission }) :
          originalQuery(parameters)
      );
    });
    await page.evaluateOnNewDocument(() => {
      // Overwrite the `plugins` property to use a custom getter.
      Object.defineProperty(navigator, 'plugins', {
        // This just needs to have `length > 0` for the current test,
        // but we could mock the plugins too if necessary.
        get: () => [1, 2, 3, 4, 5],
      });
    });
    await page.evaluateOnNewDocument(() => {
      // Overwrite the `languages` property to use a custom getter.
      Object.defineProperty(navigator, 'languages', {
        get: () => ['en-US', 'en'],
      });
    });
    await page.goto(url, { waitUntil: 'networkidle2',timeout: 0 } );
    return page;
  }
}
module.exports = {ResourceManager}
I solve it with https://www.npmjs.com/package/shelljs
// Runs `pkill chrome` through shelljs.
// NOTE(review): pkill matches by process name, so this presumably ends every
// Chrome process on the machine, including ones still serving a request -
// confirm before using it on a shared server.
var shell = require('shelljs');
shell.exec('pkill chrome')
try to close the browser before sending the response
var puppeteer = require('puppeteer');
var express = require('express');
var router = express.Router();
// Renders req.query.url in a fresh headless Chrome and closes the browser
// before the response is sent. The close happens in `finally`, so a failed
// navigation does not leave a Chrome process behind, and errors are forwarded
// to Express instead of becoming unhandled rejections.
router.get('/', function(req, res, next) {
  (async () => {
    const browser = await puppeteer.launch({headless: true});
    let bodyHTML;
    try {
      const page = await browser.newPage();
      // Declared locally instead of being assigned as an implicit global.
      const url = req.query.url;
      await page.goto(url);
      bodyHTML = await page.evaluate(() => document.body.innerHTML);
    } finally {
      await browser.close();
    }
    // Only reached on success, after the browser has been closed.
    res.send(bodyHTML);
  })().catch(next);
});
I ran into the same issue and while your shelljs solution did work, it kills all chrome processes, which might interrupt one that is still processing a request. Here is a better solution that should work.
var puppeteer = require('puppeteer');
var express = require('express');
var router = express.Router();
// Renders req.query.url in a fresh headless Chrome using a promise chain. The
// browser is closed in `finally` (also when navigation fails), the HTML is sent
// only after the close, and any error is forwarded to Express through next().
router.get('/', function (req, res, next) {
  puppeteer
    .launch({ headless: true })
    .then(async (browser) => {
      try {
        const page = await browser.newPage();
        // Declared locally instead of being assigned as an implicit global.
        const url = req.query.url;
        await page.goto(url);
        return await page.evaluate(() => document.body.innerHTML);
      } finally {
        await browser.close();
      }
    })
    .then((bodyHTML) => res.send(bodyHTML))
    .catch(next);
});
use
(await browser).close()
This is needed because what `browser` contains is a promise, which you have to resolve (await) before you can call close() on it. I struggled a lot with this; I hope it helps.
I use the following basic setup for running Puppeteer:
const puppeteer = require("puppeteer");
// Declared outside the async function so the .finally() handler below can reach it.
let browser;
(async () => {
browser = await puppeteer.launch();
// Take the first entry of browser.pages() instead of calling newPage().
const [page] = await browser.pages();
/* use the page */
})()
// Log anything thrown inside the async function.
.catch(err => console.error(err))
// `?.` skips close() when launch() failed and `browser` was never assigned.
.finally(() => browser?.close())
;
Here, the finally block guarantees the browser will close correctly regardless of whether an error was thrown. Errors are logged (if desired). I like .catch and .finally as chained calls because the mainline Puppeteer code is one level flatter, but this accomplishes the same thing:
const puppeteer = require("puppeteer");
// Same flow as a chained .catch/.finally, written with try/catch/finally.
(async () => {
// Declared before the try so the finally block can see it.
let browser;
try {
browser = await puppeteer.launch();
// Take the first entry of browser.pages() instead of calling newPage().
const [page] = await browser.pages();
/* use the page */
}
catch (err) {
console.error(err);
}
finally {
// `?.` skips close() when launch() failed and `browser` was never assigned.
await browser?.close();
}
})();
There's no reason to call newPage because Puppeteer starts with a page open.
As for Express, you need only place the entire code above, including let browser; and excluding require("puppeteer"), into your route, and you're good to go, although you might want to use an async middleware error handler.
You ask:
Is there a better way to get the same result other than puppeteer and headless chrome?
That depends on what you're doing and what you mean by "better". If your goal is to get document.body.innerHTML and the page content you're interested in is baked into the static HTML, you can dump Puppeteer entirely and just make a request to get the resource, then use Cheerio to extract the desired information.
Another consideration is that you may not need to load and close a whole browser per request. If you can use one new page per request, consider the following strategy:
const express = require("express");
const puppeteer = require("puppeteer");
/**
 * Wraps a route handler so that a rejected promise is passed to Express's
 * next() callback.
 *
 * @param {Function} fn - handler called as fn(req, res, next); may return a promise.
 * @returns {Function} middleware returning a promise that settles with the handler's result.
 */
function asyncHandler(fn) {
  return (req, res, next) => {
    const outcome = Promise.resolve(fn(req, res, next));
    return outcome.catch(next);
  };
}
// One browser is launched at startup; each request awaits this promise and
// opens its own page on the shared browser.
const launchOptions = { args: ["--no-sandbox", "--disable-setuid-sandbox"] };
const browserReady = puppeteer.launch(launchOptions);

const app = express();
app.set("port", process.env.PORT || 5000);

// GET /: load the requested URL (or the example fallback) in a new page and
// respond with its HTML; the page is closed whatever happens.
app.get("/", asyncHandler(async (req, res) => {
  const browser = await browserReady;
  const page = await browser.newPage();
  try {
    const target = req.query.url || "http://www.example.com";
    await page.goto(target);
    const html = await page.content();
    return res.send(html);
  }
  catch (err) {
    return res.status(400).send(err.message);
  }
  finally {
    await page.close();
  }
}));

// Fallback error handler for anything forwarded through next().
app.use((err, req, res, next) => res.sendStatus(500));

app.listen(app.get("port"), () => {
  console.log("listening on port", app.get("port"));
});
Finally, make sure to never set any timeouts to 0 (for example, page.setDefaultNavigationTimeout(0);), which introduces the potential for the script to hang forever. If you need a generous timeout, at most set it to a few minutes--long enough not to trigger false positives.
See also:
Parallelism of Puppeteer with Express Router Node JS. How to pass page between routes while maintaining concurrency
Puppeteer unable to run on heroku