I'm trying to implement an async each loop in Node.js.
I have a variable html which contains the page content. I want to iterate through all divs that have a particular class. Inside those divs there are links that I want to navigate to and extract content from as well. The problem is that each expects a synchronous callback, so it doesn't wait for the asynchronous code inside it to finish.
I tried to do it like this:
const browser = await puppeteer.launch({
    headless: true
});
const page = await browser.newPage();
const page2 = await browser.newPage();
const mainUrl = "http ... ";
const html = await page.goto(mainUrl)
    .then(function() {
        return page.content();
    });
await $('.data-row', html).each(function() => {
    const url = await $(this).find(".link-details a").attr("href");
    page2.goto(url)
        .then(function() {
            const title = await page.evaluate(el => el.innerHTML, await page.$('#title'));
            // do other things
        });
    // do other things
    // create a json with data add it to a list
});
But the title gives undefined and it's executed after the loop finishes executing ... What can I do here?
I've edited your code to show how Puppeteer is supposed to be used. Your main problems were using jQuery where it wasn't needed and awaiting things that aren't asynchronous, all while mixing in a promise chain.
(async () => {
    const browser = await puppeteer.launch({
        headless: true
    });
    const page = await browser.newPage();
    const page2 = await browser.newPage();
    const mainUrl = "http ... ";

    /*const html = await page.goto(mainUrl)
        .then(function() {
            return page.content();
        });*/

    await page.goto(mainUrl);
    await page.waitForSelector('.data-row');

    // Element handles keep references to the nodes in the page, unlike
    // page.evaluate(), which can only return serializable values.
    const dataRows = await page.$$('.data-row');

    /*await $('.data-row', html).each(function() => {
        const url = await $(this).find(".link-details a").attr("href");
        await page2.goto(url)
            .then(function() {
                const title = await page.evaluate(el => el.innerHTML, await page.$('#title'));
                // do other things
            });
        // do other things
        // create a json with data add it to a list
    });*/

    for (const row of dataRows) {
        const url = await row.$eval(".link-details a", a => a.href);
        await page2.goto(url);
        const title = await page2.evaluate(() => document.title);
        console.log(title);
    }
})();
You can't await jQuery's .each(), so you can try doing the following instead.
const rows = $('.data-row', html).toArray();
for (const row of rows) {
    const url = $(row).find(".link-details a").attr("href");

    await page2.goto(url);
    const title = await page2.evaluate(el => el.innerHTML, await page2.$('#title'));
    // do other things

    // do other things
    // create a json with data add it to a list
}
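Note that $ is never defined in either snippet; assuming it is meant to be cheerio (a jQuery-like HTML parser for Node), a minimal sketch of wiring it up to the html string from page.content() could look like this:
// A sketch, assuming the cheerio package; selectors are the ones from the question.
const cheerio = require('cheerio');

const $ = cheerio.load(html);
const rows = $('.data-row').toArray();

for (const row of rows) {
    // cheerio's find()/attr() behave like their jQuery counterparts here
    const url = $(row).find(".link-details a").attr("href");
    console.log(url);
}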
Related
Recently I started to crawl the web using Puppeteer. Below is code for extracting specific product names from a shopping mall.
const puppeteer = require('puppeteer');

(async () => {
    const width = 1600, height = 1040;
    const option = { headless: false, slowMo: true, args: [`--window-size=${width},${height}`] };
    const browser = await puppeteer.launch(option);
    const page = await browser.newPage();
    const vp = {width: width, height: height};
    await page.setViewport(vp);

    const navigationPromise = page.waitForNavigation();
    await page.goto('https://shopping.naver.com/home/p/index.nhn');
    await navigationPromise;
    await page.waitFor(2000);

    const textBoxId = 'co_srh_input';
    await page.type('.' + textBoxId, '양말', {delay: 100}); // '양말' means "socks"
    await page.keyboard.press('Enter');
    await page.waitFor(5000);

    await page.waitForSelector('div.info > a.tit');
    const stores = await page.evaluate(() => {
        const links = Array.from(document.querySelectorAll('div.info > a.tit'));
        return links.map(link => link.innerText).slice(0, 10) // take only the first 10 products
    });
    console.log(stores);

    await browser.close();
})();
I have a question: how can I output the crawled results to an HTML document (without using a database)? Please use sample code to explain.
I used what I found on blog.kowalczyk.info:
const puppeteer = require("puppeteer");
const fs = require("fs");

async function run() {
    const browser = await puppeteer.launch();
    const page = await browser.newPage();
    await page.goto("https://www.google.com/", { waitUntil: "networkidle2" });
    // hacky defensive move but I don't know a better way:
    // wait a bit so that the browser finishes executing JavaScript
    await page.waitFor(1 * 1000);
    const html = await page.content();
    fs.writeFileSync("index.html", html);
    await browser.close();
}

run();
fs.writeFile()
You can use the following write_file function, which returns a Promise that resolves to true when fs.writeFile() succeeds and to false when it fails.
Then, you can await the Promise from within your anonymous, asynchronous function and check whether or not the data was written to the file:
'use strict';

const fs = require('fs');
const puppeteer = require('puppeteer');

const write_file = (file, data) => new Promise(resolve => {
    fs.writeFile(file, data, 'utf8', error => {
        if (error) {
            console.error(error);
            resolve(false); // resolve with false so the caller can check the result without a try/catch
        } else {
            resolve(true);
        }
    });
});

(async () => {
    // ...
    const stores = await page.evaluate(() => {
        return Array.from(document.querySelectorAll('div.info > a.tit'), link => link.innerText).slice(0, 10); // take only the first 10 products
    });

    if (await write_file('example.html', stores.toString()) === false) {
        console.error('Error: Unable to write stores to example.html.');
    }
    // ...
})();
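Alternatively, on Node 10+ you can skip the hand-rolled wrapper and use the promise-based fs API directly; a minimal sketch (reusing the example.html file name from above):
// Using the built-in promise-based fs API instead of wrapping fs.writeFile manually.
const fs = require('fs').promises;

// inside the async IIFE, after collecting `stores`:
try {
    await fs.writeFile('example.html', stores.toString(), 'utf8');
} catch (error) {
    console.error('Error: Unable to write stores to example.html.', error);
}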
The exported file contains only one URL; the rest of the URLs are missing from it. How can I generate a file with all the entries from the loop?
const puppeteer = require("puppeteer");
const fs = require('fs');

let browser;

(async () => {
    const browser = await puppeteer.launch({
        headless: true,
        args: ['--no-sandbox']
    });
    const [page] = await browser.pages();
    await page.goto('https://old.reddit.com/', {"waitUntil" : "networkidle0"});
    const a_elems = await page.$$('.thumbnail');

    for (var i = 0; i < a_elems.length && i < 3; i++) {
        const elem = a_elems[i];
        const href = await page.evaluate(e => e.href, elem);
        const newPage = await browser.newPage();
        await newPage.goto(href, {"waitUntil" : "networkidle0"});
        const url = await newPage.evaluate(() => document.location.href);
        console.log(url);
        fs.writeFileSync('export.json', JSON.stringify(url));
    }

    await browser.close();
})();
Thanks!
Create an array, push each url onto it in the loop, then move your writeFile call to the end.
const puppeteer = require("puppeteer");
const fs = require('fs').promises;

let browser;

(async () => {
    browser = await puppeteer.launch({
        headless: true,
        args: ['--no-sandbox']
    });
    const [page] = await browser.pages();
    await page.goto('https://old.reddit.com/', {
        waitUntil: "networkidle0"
    });
    const aElems = await page.$$('.thumbnail');
    const urls = [];

    for (let i = 0; i < aElems.length && i < 3; i++) {
        const href = await aElems[i].evaluate(e => e.href);
        const newPage = await browser.newPage();
        await newPage.goto(href, {waitUntil: "networkidle0"});
        const url = await newPage.evaluate(() => document.location.href);
        console.log(url);
        urls.push(url);
    }

    await fs.writeFile('export.json', JSON.stringify(urls));
})()
    .catch(err => console.error(err))
    .finally(() => browser?.close());
Tips:
You're already in async code, so writeFileSync seems suboptimal here relative to the async version.
Use let instead of var so that i is scoped to the loop block and can't leak out with a stale value.
Consider calling newPage.close() at the end of each iteration (see the sketch after this list). You're only doing 3 pages now, but if this is temporary and you eventually crank it up to 800, closing each tab is a great idea.
"waitUntil": "networkidle0" is really slow. Since all you're doing is accessing document.location.href you can probably speed things up with waitUntil: "domcontentloaded".
JS uses camelCase, not snake_case.
If you have an ElementHandle, you can just elementHandle.evaluate(...) rather than page.evaluate(..., elementHandle).
Catch errors with catch and clean up the browser resource with finally.
let browser; was pointless in your original code.
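A minimal sketch of the loop body with a few of these tips applied (per-tab close, domcontentloaded, ElementHandle.evaluate); the selectors and the 3-page cap are the same as in the answer above:
for (let i = 0; i < aElems.length && i < 3; i++) {
    // ElementHandle.evaluate() instead of page.evaluate(..., elementHandle)
    const href = await aElems[i].evaluate(e => e.href);

    const newPage = await browser.newPage();
    // domcontentloaded is usually enough when all you need is document.location.href
    await newPage.goto(href, { waitUntil: "domcontentloaded" });
    const url = await newPage.evaluate(() => document.location.href);
    urls.push(url);

    // free the tab so hundreds of iterations don't pile up open pages
    await newPage.close();
}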
I would like to know how to scrape data located in nested pages. Here's an example I tried to build, but I couldn't make it work. The idea is to go to https://dev.to/, click a question and grab its title, then go back and repeat the process for the next question.
const puppeteer = require("puppeteer");

(async () => {
    const browser = await puppeteer.launch();
    const page = await browser.newPage();
    await page.goto("https://dev.to/");
    try {
        const selectors = await page.$$(".crayons-story > a");
        for (const post of selectors) {
            await Promise.all([
                page.waitForNavigation(),
                post.click(),
                page.goBack(),
            ]);
        }
    } catch (error) {
        console.log(error);
    } finally {
        browser.close();
    }
})();
When I run this code, I get
Error: Node is either not visible or not an HTMLElement
Edit: The code is missing the piece that grabs the title, but it is enough for the purpose of this question.
What is happening is that the website doesn't have that node yet when the page is first opened, but Puppeteer queries the page contents immediately after navigating to it. You need to wait until the site's scripts have run and injected the stories into the DOM.
To wait, use the following call:
await page.waitForSelector(".crayons-story > a")
This makes sure Puppeteer waits for that selector to appear in the DOM before it starts scraping the contents.
So your final code should look like this:
const puppeteer = require("puppeteer");

(async () => {
    const browser = await puppeteer.launch();
    const page = await browser.newPage();
    await page.goto("https://dev.to/");
    await page.waitForSelector(".crayons-story > a");
    try {
        const selectors = await page.$$(".crayons-story > a");
        for (const post of selectors) {
            await Promise.all([
                page.waitForNavigation(),
                post.click(),
                page.goBack(),
            ]);
        }
    } catch (error) {
        console.log(error);
    } finally {
        browser.close();
    }
})();
The problem I'm facing here is very similar to this one.
Puppeteer Execution context was destroyed, most likely because of a navigation
The best solution I could come up with is to avoid using page.goBack() and rather use page.goto() so the references are not lost.
Solution 1 (this one uses map, so the pages are scraped concurrently, which is much quicker than the one below):
const puppeteer = require("puppeteer");

const SELECTOR_POSTS_LINK = ".article--post__title > a";
const SELECTOR_POST_TITLE = ".article-header--title";

async function scrape() {
    const browser = await puppeteer.launch();
    const page = await browser.newPage();
    await page.goto("https://www.smashingmagazine.com/articles/");

    try {
        const links = await page.$$eval(SELECTOR_POSTS_LINK, (links) => links.map((link) => link.href));

        // Each link gets its own tab, so the navigations can run concurrently
        // instead of racing each other on a single page.
        const resolver = async (link) => {
            const tab = await browser.newPage();
            await tab.goto(link);
            const title = await tab.$eval(SELECTOR_POST_TITLE, (el) => el.textContent);
            await tab.close();
            return { title };
        };

        const promises = links.map((link) => resolver(link));
        const articles = await Promise.all(promises);

        console.log(articles);
    } catch (error) {
        console.log(error);
    } finally {
        browser.close();
    }
}

scrape();
Solution 2 (uses for...of, so the pages are scraped sequentially, which is slower than the previous one):
const puppeteer = require("puppeteer");

const SELECTOR_POSTS_LINK = ".article--post__title > a";
const SELECTOR_POST_TITLE = ".article-header--title";

async function scrape() {
    const browser = await puppeteer.launch();
    const page = await browser.newPage();
    await page.goto("https://www.smashingmagazine.com/articles/");

    try {
        const links = await page.$$eval(SELECTOR_POSTS_LINK, (links) => links.map((link) => link.href));
        const articles = [];

        for (const link of links) {
            await page.goto(link);
            const title = await page.$eval(SELECTOR_POST_TITLE, (el) => el.textContent);
            articles.push({ title });
        }

        console.log(articles);
    } catch (error) {
        console.log(error);
    } finally {
        browser.close();
    }
}

scrape();
Cannot read property 'getProperty' of undefined is the error that I get.
const puppeteer = require('puppeteer');

async function scrapeUdemy(url) {
    try {
        const browser = await puppeteer.launch({headless: false, slowmo: 250});
        const page = await browser.newPage()
        await page.goto(url)
        const [el] = await page.$x('//*[#id="udemy"]/div[1]/div[4]/div/div/div[2]/div/div/div[1]/a/div[1]/div[1]');
        const txt = await el.getProperty('textContent');
        const rawTxt = await src.jsonValue();
        console.log({srcTxt});
        browser.close();
    }
    catch(err) {
        console.log(err.message);
    }
}

scrapeUdemy('https://www.udemy.com/user/eren-cem-salta/')
I tried other versions, but they don't work either. The catch block doesn't help, either.
The element that you want to get is loaded via AJAX after the page has started, so you have to wait until it appears in the DOM:
await page.waitForSelector('[data-purpose="course-card-container"] div.udlite-heading-sm');
And why not use the same selector to get all of the cards:
const titles = await page.evaluate(() => {
    const nodes = document.querySelectorAll(
        '[data-purpose="course-card-container"] div.udlite-heading-sm'
    );
    return [...nodes].map((node) => node.textContent);
});
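Putting the wait and the evaluate together, a minimal sketch of the full function (the URL and launch options come from the question, the selectors from the snippets above):
const puppeteer = require('puppeteer');

async function scrapeUdemy(url) {
    const browser = await puppeteer.launch({ headless: false, slowMo: 250 });
    try {
        const page = await browser.newPage();
        await page.goto(url);

        // wait for the AJAX-loaded course cards before touching them
        await page.waitForSelector('[data-purpose="course-card-container"] div.udlite-heading-sm');

        const titles = await page.evaluate(() => {
            const nodes = document.querySelectorAll(
                '[data-purpose="course-card-container"] div.udlite-heading-sm'
            );
            return [...nodes].map((node) => node.textContent);
        });
        console.log(titles);
    } catch (err) {
        console.log(err.message);
    } finally {
        await browser.close();
    }
}

scrapeUdemy('https://www.udemy.com/user/eren-cem-salta/');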