Recently I started to crawl the web using Puppeteer. Below is the code I use to extract specific product names from a shopping mall website.
const puppeteer = require('puppeteer');
// Opens a shopping-mall page in a visible browser, types a search term and
// collects the text of the first ten matching product links.
// NOTE(review): the target URL and the leading part of the CSS selector are
// blank/truncated in this copy of the snippet; restore them before running.
(async () => {
  const width = 1600, height = 1040;
  const option = { headless: false, slowMo: true, args: [`--window-size=${width},${height}`] };
  const browser = await puppeteer.launch(option);
  try {
    const page = await browser.newPage();
    await page.setViewport({ width, height });
    // Start listening for the navigation before triggering it.
    const navigationPromise = page.waitForNavigation();
    await page.goto('');
    await navigationPromise;
    await page.waitFor(2000);
    const textBoxId = 'co_srh_input';
    await page.type('.' + textBoxId, '양말', {delay: 100});
    await page.waitFor(5000);
    await page.waitForSelector(' > a.tit');
    // The callback runs inside the page, so it may only return serializable data.
    const stores = await page.evaluate(() => {
      const links = Array.from(document.querySelectorAll(' > a.tit'));
      return links.map(link => link.innerText).slice(0, 10); // keep only the first 10 products
    });
  } finally {
    // Close the browser even when a step above throws, so no Chromium process is left behind.
    await browser.close();
  }
})();
I have a question. How can I output the crawled results to an HTML document (without using the database)? Please use sample code to explain it.
I used what was seen on
const puppeteer = require("puppeteer");
const fs = require("fs");
/**
 * Loads a page in a headless browser and saves its rendered HTML to
 * index.html in the current working directory.
 *
 * @returns {Promise<void>} Settles once the file is written and the browser is closed.
 */
async function run() {
  const browser = await puppeteer.launch();
  try {
    const page = await browser.newPage();
    await page.goto("", { waitUntil: "networkidle2" });
    // hacky defensive move but I don't know a better way:
    // wait a bit so that the browser finishes executing JavaScript
    await page.waitFor(1 * 1000);
    const html = await page.content();
    fs.writeFileSync("index.html", html);
  } finally {
    // Always release the browser, even if navigation or the write fails.
    await browser.close();
  }
}
You can use the following write_file function that returns a Promise that resolves or rejects when fs.writeFile() succeeds or fails.
Then, you can await the Promise from within your anonymous, asynchronous function and check whether or not the data was written to the file:
'use strict';
const fs = require('fs');
const puppeteer = require('puppeteer');
/**
 * Promise wrapper around fs.writeFile() that writes `data` to `file` as UTF-8.
 *
 * @param {string} file - Destination path.
 * @param {string} data - Text to write.
 * @returns {Promise<boolean>} Resolves with `true` once the file is written;
 *   rejects with the underlying error when fs.writeFile() fails.
 */
const write_file = (file, data) => new Promise((resolve, reject) => {
  fs.writeFile(file, data, 'utf8', error => {
    if (error) {
      // Reject with the real error so callers can inspect error.code.
      reject(error);
    } else {
      resolve(true);
    }
  });
});
// Collects the first ten product names from the page and writes them to
// example.html, reporting a failed write on stderr.
(async () => {
  // ...
  const stores = await page.evaluate(() => {
    return Array.from(document.querySelectorAll(' > a.tit'), link => link.innerText).slice(0, 10); // keep only the first 10 products
  });
  // A failed write may surface either as a rejected promise or as a `false`
  // result, so handle both instead of only comparing against `false`.
  let written = false;
  try {
    written = (await write_file('example.html', stores.toString())) !== false;
  } catch (error) {
    written = false;
  }
  if (!written) {
    console.error('Error: Unable to write stores to example.html.');
  }
  // ...
})();
As illustrated here, Puppeteer allows overriding JavaScript functions. I want to override the showOpenFilePicker function. That is, when showOpenFilePicker is invoked by the web page, I want to run another function before the original showOpenFilePicker.
const puppeteer = require("puppeteer");
// Launches a visible browser and replaces HTMLCanvasElement.prototype.toBlob
// with a stub that only logs, before any page script runs.
(async () => {
  const browser = await puppeteer.launch({ headless: false });
  const page = await browser.newPage();
  await page.evaluateOnNewDocument(() => {
    Object.defineProperty(HTMLCanvasElement.prototype, "toBlob", {
      value: () => {
        console.log("Hey there");
      },
    });
  });
  await page.goto("");
  await page.evaluate(() => {
    // NOTE(review): the page-side code that was here is not present in this excerpt.
  });
  // await browser.close();
})();
You can override built-in functions in Puppeteer like in the code sample below. This replaces the original function with an override that logs the arguments to the console.
const puppeteer = require("puppeteer");
// Launches a visible browser and wraps window.showOpenFilePicker so every
// call is logged before being forwarded to the browser's own implementation.
(async () => {
  const browser = await puppeteer.launch({ headless: false });
  const page = await browser.newPage();
  // evaluateOnNewDocument installs the wrapper before the page's own scripts run.
  await page.evaluateOnNewDocument(() => {
    const originalShowOpenFilePicker = window.showOpenFilePicker;
    // Where the API is unavailable, leave the page untouched so that feature
    // detection by the page still reports it as missing.
    if (typeof originalShowOpenFilePicker !== 'function') {
      return;
    }
    window.showOpenFilePicker = (...args) => {
      console.log('Modified `showOpenFilePicker` called with these arguments:', args);
      // Forward with `window` as the receiver, as in a direct call.
      return originalShowOpenFilePicker.apply(window, args);
    };
  });
  await page.goto("");
  await page.evaluate(() => {
    // NOTE(review): the page-side code that was here is not present in this excerpt.
  });
  // await browser.close();
})();
The page.on call is recognized by the async for loop at the bottom as finished and ready to run the function again, but it's not actually done. It still needs to run everything up to page.close. How do I let the async function know that it is done after page.close, not page.on? Let me know if you need any more info, thanks.
const puppeteer = require('puppeteer');
const fs = require('fs');
const req = require('request');
const got = require('got');
const NodeID3 = require('node-id3');
const readline = require('readline');
const selectors = require('./selectors');
// Opens a new page for `url`, allows downloads into ./Songs, and registers a
// 'request' listener that downloads and tags the song, aborts the request and
// closes the page.
// NOTE(review): closing braces for this function are not present in this excerpt.
const getDownloadUrl = async (url, browser) => {
const page = await browser.newPage();
await page.goto(url);
// NOTE(review): `_client` is underscore-prefixed, presumably an internal Puppeteer API — confirm it exists in the installed version.
await page._client.send('Page.setDownloadBehavior', {behavior: 'allow', downloadPath: './Songs'})
const baseUrl = '';
// page.on() only registers the listener; awaiting it does not wait for the
// handler below to run, so this function can finish before page.close().
await page.on('request', async (request) => {
const downloadUrl = fixUrl(request.url());
const info = await getSongInfo(page);
downloadSong(downloadUrl, info.title);
await tagSong(info);
await request.abort();
await page.close();
// NOTE(review): the `if` that this `else` belongs to is not present in this excerpt.
} else {
const fixUrl = (url) => {
const downloadSong = (url, title) => {
const getSongInfo = async (page) => {
const tagSong = async (info) => {
// Entry point: reads '../Song Urls.csv' line by line and, once the stream is
// closed, launches a visible browser and calls getDownloadUrl for each entry
// of `urls`, one at a time.
// NOTE(review): closing braces and the 'line' handler body are not present in this excerpt.
(() => {
const readInterface = readline.createInterface({
input: fs.createReadStream('../Song Urls.csv'),
output: process.stdout,
console: false,
terminal: false,
let urls = [];
// presumably each line is appended to `urls` here — handler body not shown
readInterface.on('line', function(line) {
}).on('close', async () => {
const browser = await puppeteer.launch({headless: false});
// Sequential on purpose: each URL is awaited before the next one starts.
for (let i = 0; i < urls.length; i++) {
const url = urls[i];
await getDownloadUrl(url, browser);
Issue: The loop treats the getDownloadUrl function as done even though it's
not, and continues anyway.
await only works with promises, and page.on looks to be a callback-based event listener, not something that returns a promise. If you want to be able to await it, you will need to create a promise around it.
// Wraps the callback-based 'request' listener in a Promise so the enclosing
// async function can await it.
// NOTE(review): the call to resolve() is not visible in this excerpt; presumably
// it belongs after page.close() so the await ends once the page is closed — confirm.
await new Promise((resolve) => {
page.on('request', async (request) => {
const downloadUrl = fixUrl(request.url());
const info = await getSongInfo(page);
downloadSong(downloadUrl, info.title);
await tagSong(info);
await request.abort();
await page.close();
// NOTE(review): the `if` that this `else` belongs to is not present in this excerpt.
} else {
Cannot read property 'getProperty' of undefined is the error that I get.
const puppeteer = require('puppeteer');
// Asker's attempt: opens `url` in a visible browser and tries to read the text
// content of a single element located by XPath.
// NOTE(review): closing braces for the try block and the function are not present in this excerpt.
async function scrapeUdemy(url) {
try {
// NOTE(review): the option key is lowercase `slowmo` here; Puppeteer's option is presumably spelled `slowMo` — confirm.
const browser = await puppeteer.launch({headless: false, slowmo: 250});
const page = await browser.newPage()
await page.goto(url)
// Destructuring takes the first match; when nothing matches, `el` is undefined
// and the getProperty call below throws the reported error.
// NOTE(review): XPath attribute tests are normally written `@id`, not `#id` — confirm against the original post.
const [el] = await page.$x('//*[#id="udemy"]/div[1]/div[4]/div/div/div[2]/div/div/div[1]/a/div[1]/div[1]');
const txt = await el.getProperty('textContent');
// NOTE(review): `src` is not declared in this block; presumably `txt` was intended.
const rawTxt = await src.jsonValue();
catch(err) {
I tried using other versions, but it does not work. It does not work with the catch block either.
The element that you want to get is loaded with AJAX after the page started and you have to wait until it appears in the DOM:
await page.waitForSelector('[data-purpose="course-card-container"] div.udlite-heading-sm');
And why not use the same selector to get all of the cards:
// Collect the heading text of every course card currently in the DOM.
// The callback runs inside the page, so it returns plain strings rather than nodes.
const titles = await page.evaluate(() => {
  const nodes = document.querySelectorAll(
    '[data-purpose="course-card-container"] div.udlite-heading-sm'
  );
  return [...nodes].map((node) => node.textContent);
});
I'm trying to implement an async on each loop on nodejs.
I have a variable html which contains the page content. There I want to iterate through all divs that have a particular class. Inside those divs, there are some links that I want to navigate to and get some content from them too. So basically, since each expects a synchronous function, it doesn't wait for the other code to be executed.
I tried to do it like this:
// Asker's attempt, kept as posted: load the main page, then visit the detail
// link of every `.data-row` and read a title from it.
// NOTE(review): several closing brackets, and the navigation call that the
// second `.then` was chained to, are not present in this excerpt.
const browser = await puppeteer.launch({
headless: true
const page = await browser.newPage();
const page2 = await browser.newPage();
const mainUrl = "http ... ";
const html = await page.goto(mainUrl)
.then(function() {
return page.content();
// .each() invokes its callback synchronously and does not wait for promises,
// which is why the loop finishes before the awaited work inside it.
// NOTE(review): `function() => {` mixes function-expression and arrow syntax and will not parse.
await $('.data-row', html).each(function() => {
const url = await $(this).find(".link-details a").attr("href");
.then(function() {
const title = await page.evaluate(el => el.innerHTML, await page.$('#title'));
// do other things
// do other things
// create a json with data add it to a list
But the title gives undefined and it's executed after the loop finishes executing ... What can I do here?
I've edited your code to show how Puppeteer was supposed to be used. Your main problem here was using jQuery where it was not needed and attempting to await things that were not asynchronous; while mixing in a promise chain.
// Visits the detail link of every `.data-row` on the main page, one at a
// time, and reads the document title of each linked page.
// No jQuery is needed: the links are read directly from the live DOM.
(async () => {
  const browser = await puppeteer.launch({
    headless: true
  });
  const page = await browser.newPage();
  const page2 = await browser.newPage();
  const mainUrl = "http ... ";
  await page.goto(mainUrl);
  await page.waitForSelector('.data-row');
  // DOM nodes cannot be returned from page.evaluate(), so extract plain href
  // strings inside the page: the first detail link of each row, skipping rows
  // that have none.
  const detailUrls = await page.evaluate(() =>
    Array.from(document.querySelectorAll('.data-row'))
      .map((row) => row.querySelector('.link-details a'))
      .filter((anchor) => anchor !== null)
      .map((anchor) => anchor.href)
  );
  // Sequential on purpose: page2 is reused for every detail page.
  for (const url of detailUrls) {
    await page2.goto(url);
    const title = await page2.evaluate(() => document.title);
    // do other things
    // create a json with data add it to a list
  }
})();
You can't await jQuery.each, so you can try doing the following.
// Iterate the matched rows with for...of so each navigation can be awaited in turn.
// The selection calls are synchronous, so they are not awaited.
const rows = $('.data-row', html).toArray();
for (const row of rows) {
  // Inside for...of there is no per-element `this`; wrap the loop variable instead.
  const url = $(row).find(".link-details a").attr("href");
  await page2.goto(url);
  // Read the title from page2, the page that was just navigated.
  const title = await page2.evaluate(el => el.innerHTML, await page2.$('#title'));
  // do other things
  // create a json with data add it to a list
}
Recently I started to crawl the web using Puppeteer. Below is the code I use to extract specific product names from a shopping mall website.
const puppeteer = require('puppeteer');
// Opens a shopping-mall page in a visible browser, types a search term and
// collects the text of the first ten matching product links.
// NOTE(review): the target URL and the leading part of the CSS selector are
// blank/truncated in this copy of the snippet; restore them before running.
(async () => {
  const width = 1600, height = 1040;
  const option = { headless: false, slowMo: true, args: [`--window-size=${width},${height}`] };
  const browser = await puppeteer.launch(option);
  try {
    const page = await browser.newPage();
    await page.setViewport({ width, height });
    // Start listening for the navigation before triggering it.
    const navigationPromise = page.waitForNavigation();
    await page.goto('');
    await navigationPromise;
    await page.waitFor(2000);
    const textBoxId = 'co_srh_input';
    await page.type('.' + textBoxId, '양말', {delay: 100});
    await page.waitFor(5000);
    await page.waitForSelector(' > a.tit');
    // The callback runs inside the page, so it may only return serializable data.
    const stores = await page.evaluate(() => {
      const links = Array.from(document.querySelectorAll(' > a.tit'));
      return links.map(link => link.innerText).slice(0, 10); // keep only the first 10 products
    });
  } finally {
    // Close the browser even when a step above throws, so no Chromium process is left behind.
    await browser.close();
  }
})();
I have a question. How can I output the crawled results to an HTML document (without using the database)? Please use sample code to explain it.
I used what was seen on
const puppeteer = require("puppeteer");
const fs = require("fs");
/**
 * Loads a page in a headless browser and saves its rendered HTML to
 * index.html in the current working directory.
 *
 * @returns {Promise<void>} Settles once the file is written and the browser is closed.
 */
async function run() {
  const browser = await puppeteer.launch();
  try {
    const page = await browser.newPage();
    await page.goto("", { waitUntil: "networkidle2" });
    // hacky defensive move but I don't know a better way:
    // wait a bit so that the browser finishes executing JavaScript
    await page.waitFor(1 * 1000);
    const html = await page.content();
    fs.writeFileSync("index.html", html);
  } finally {
    // Always release the browser, even if navigation or the write fails.
    await browser.close();
  }
}
You can use the following write_file function that returns a Promise that resolves or rejects when fs.writeFile() succeeds or fails.
Then, you can await the Promise from within your anonymous, asynchronous function and check whether or not the data was written to the file:
'use strict';
const fs = require('fs');
const puppeteer = require('puppeteer');
/**
 * Promise wrapper around fs.writeFile() that writes `data` to `file` as UTF-8.
 *
 * @param {string} file - Destination path.
 * @param {string} data - Text to write.
 * @returns {Promise<boolean>} Resolves with `true` once the file is written;
 *   rejects with the underlying error when fs.writeFile() fails.
 */
const write_file = (file, data) => new Promise((resolve, reject) => {
  fs.writeFile(file, data, 'utf8', error => {
    if (error) {
      // Reject with the real error so callers can inspect error.code.
      reject(error);
    } else {
      resolve(true);
    }
  });
});
// Collects the first ten product names from the page and writes them to
// example.html, reporting a failed write on stderr.
(async () => {
  // ...
  const stores = await page.evaluate(() => {
    return Array.from(document.querySelectorAll(' > a.tit'), link => link.innerText).slice(0, 10); // keep only the first 10 products
  });
  // A failed write may surface either as a rejected promise or as a `false`
  // result, so handle both instead of only comparing against `false`.
  let written = false;
  try {
    written = (await write_file('example.html', stores.toString())) !== false;
  } catch (error) {
    written = false;
  }
  if (!written) {
    console.error('Error: Unable to write stores to example.html.');
  }
  // ...
})();