As illustrated here, Puppeteer allows overriding JavaScript functions. I want to override the showOpenFilePicker function. That is, when showOpenFilePicker is invoked by the web page, I want to run another function before showOpenFilePicker.
const puppeteer = require("puppeteer");
// Demo: replace HTMLCanvasElement.prototype.toBlob in the page, then print
// the replacement's source text to the page console.
(async () => {
const browser = await puppeteer.launch({ headless: false });
const page = await browser.newPage();
// The callback redefines the `toBlob` property with a function that only logs.
// It is registered before navigation; per the Puppeteer API it is evaluated
// in each new document.
await page.evaluateOnNewDocument(() => {
Object.defineProperty(HTMLCanvasElement.prototype, "toBlob", {
value: () => {
console.log("Hey there");
},
});
});
await page.goto("https://example.com");
// Logs the source of whatever function is currently installed as toBlob.
await page.evaluate(() => {
console.log(HTMLCanvasElement.prototype.toBlob.toString());
});
// await browser.close();
})();
You can override built-in functions in Puppeteer like in the code sample below. This replaces the original function with an override that logs the arguments to the console.
const puppeteer = require("puppeteer");
// Wraps window.showOpenFilePicker so that every call is logged before being
// forwarded to the original implementation, then triggers one call.
(async () => {
  const launchOptions = { headless: false };
  const browser = await puppeteer.launch(launchOptions);
  const page = await browser.newPage();

  await page.evaluateOnNewDocument(() => {
    // Keep a handle on the original so the wrapper can delegate to it.
    const nativePicker = window.showOpenFilePicker;
    window.showOpenFilePicker = (...pickerArgs) => {
      console.log('Modified `showOpenFilePicker` called with these arguments:', pickerArgs);
      return nativePicker(...pickerArgs);
    };
  });

  await page.goto("https://example.com");

  await page.evaluate(() => {
    const pickerResult = showOpenFilePicker();
    console.log(pickerResult);
  });
  // await browser.close();
})();
Related
Recently I started crawling the web using Puppeteer. Below is the code for extracting specific product names from a shopping mall.
const puppeteer = require('puppeteer');
// Searches Naver Shopping for '양말' and prints the first ten product titles.
(async () => {
  // The browser window and the page viewport share the same dimensions.
  const width = 1600;
  const height = 1040;

  const browser = await puppeteer.launch({
    headless: false,
    slowMo: true,
    args: [`--window-size=${width},${height}`],
  });
  const page = await browser.newPage();
  await page.setViewport({ width, height });

  // Start waiting for the navigation before triggering it, then await both.
  const pendingNavigation = page.waitForNavigation();
  await page.goto('https://shopping.naver.com/home/p/index.nhn');
  await pendingNavigation;
  await page.waitFor(2000);

  // Type the query into the search box (matched by class name) and submit it.
  const textBoxId = 'co_srh_input';
  await page.type(`.${textBoxId}`, '양말', { delay: 100 });
  await page.keyboard.press('Enter');

  await page.waitFor(5000);
  await page.waitForSelector('div.info > a.tit');

  const stores = await page.evaluate(() => {
    const titles = [];
    for (const link of document.querySelectorAll('div.info > a.tit')) {
      titles.push(link.innerText);
    }
    return titles.slice(0, 10); // fetch only 10 products
  });
  console.log(stores);

  await browser.close();
})();
I have a question. How can I output the crawled results to an HTML document (without using the database)? Please use sample code to explain it.
I used what was seen on blog.kowalczyk.info
const puppeteer = require("puppeteer");
const fs = require("fs");
/**
 * Loads https://www.google.com/ and writes the page's HTML to index.html.
 *
 * The browser is closed in a `finally` block so that a failed navigation or
 * file write does not leave a browser process running.
 *
 * @returns {Promise<void>} Settles after the file is written and the browser is closed.
 */
async function run() {
  const browser = await puppeteer.launch();
  try {
    const page = await browser.newPage();
    await page.goto("https://www.google.com/", { waitUntil: "networkidle2" });
    // hacky defensive move but I don't know a better way:
    // wait a bit so that the browser finishes executing JavaScript
    await page.waitFor(1 * 1000);
    const html = await page.content();
    fs.writeFileSync("index.html", html);
  } finally {
    await browser.close();
  }
}

// Handle a rejection explicitly instead of leaving the promise floating.
run().catch((error) => {
  console.error(error);
  process.exitCode = 1;
});
fs.writeFile()
You can use the following write_file function that returns a Promise that resolves or rejects when fs.writeFile() succeeds or fails.
Then, you can await the Promise from within your anonymous, asynchronous function and check whether or not the data was written to the file:
'use strict';
const fs = require('fs');
const puppeteer = require('puppeteer');
/**
 * Promise wrapper around fs.writeFile using UTF-8 encoding.
 *
 * @param {string} file - Path of the file to write.
 * @param {string} data - Text to write.
 * @returns {Promise<boolean>} Resolves with `true` on success; on failure the
 *   error is logged with console.error and the promise rejects with `false`.
 */
const write_file = (file, data) => {
  return new Promise((onWritten, onFailed) => {
    const handleResult = (writeError) => {
      if (!writeError) {
        onWritten(true);
        return;
      }
      console.error(writeError);
      onFailed(false);
    };
    fs.writeFile(file, data, 'utf8', handleResult);
  });
};
// Collects up to ten product titles from the page and writes them to
// example.html. The elided parts ("// ...") are expected to create `page`.
(async () => {
  // ...
  const stores = await page.evaluate(() => {
    return Array.from(document.querySelectorAll('div.info > a.tit'), link => link.innerText).slice(0, 10); // fetch only 10 products
  });
  // write_file rejects on failure, and awaiting a rejected promise throws, so
  // the failure path must be a catch block rather than a `=== false` check.
  try {
    await write_file('example.html', stores.toString());
  } catch (error) {
    // write_file has already logged the underlying error.
    console.error('Error: Unable to write stores to example.html.');
  }
  // ...
})(); // invoke the function; the original expression was never called
const puppeteer = require('puppeteer');
// Question snippet: opens the product page and tries to log the text of the
// element matching div[id="details"] > p[itemprop="model"].
(async () => {
const browser = await puppeteer.launch({
headless: false,
defaultViewport: null
})
const page = await browser.newPage()
await page.goto('https://www.supremenewyork.com/shop/sweatshirts/ftq968f24/lhrblx1z5')
// NOTE: the arrow function below has a block body without a `return`, so the
// callback yields undefined and productName ends up undefined.
var productName = await page.evaluate(() => {
document.querySelector('div[id="details"] > p[itemprop="model"]').innerText;
})
console.log(productName);
})()
When I run my code, which is supposed to grab the name of the Supreme item, it logs undefined to the console instead of the name.
You are neither returning anything from the page.evaluate nor are you setting the value of productName. Try something like this instead that uses $eval to return the innerText of the matching element:
const puppeteer = require("puppeteer");
// Reads the product's model name with page.$eval and logs it.
(async () => {
  const launchOptions = { headless: false, defaultViewport: null };
  const productUrl = "https://www.supremenewyork.com/shop/sweatshirts/ftq968f24/lhrblx1z5";
  const modelSelector = 'div[id="details"] > p[itemprop="model"]';

  const browser = await puppeteer.launch(launchOptions);
  const page = await browser.newPage();
  await page.goto(productUrl);

  // The callback receives the element matched by the selector and its return
  // value becomes the result of $eval.
  const productName = await page.$eval(modelSelector, (modelElement) => {
    return modelElement.innerText;
  });
  console.log(productName);
})();
If you prefer to use evaluate it would look like:
const puppeteer = require("puppeteer");
// Same lookup as the $eval variant, written with page.evaluate.
(async () => {
  const launchOptions = { headless: false, defaultViewport: null };
  const productUrl = "https://www.supremenewyork.com/shop/sweatshirts/ftq968f24/lhrblx1z5";

  const browser = await puppeteer.launch(launchOptions);
  const page = await browser.newPage();
  await page.goto(productUrl);

  const productName = await page.evaluate(() => {
    const modelElement = document.querySelector('div[id="details"] > p[itemprop="model"]');
    // notice the return
    return modelElement.innerText;
  });
  console.log(productName);
})();
If innerText doesn't return anything you may instead need to use something like textContent.
Hopefully that helps!
Using Puppeteer to listen for map.on('load') from within Node.
// Question snippet: tries to call a Node-side logger from a map 'load'
// handler that is registered inside page.evaluate.
(async () => {
const browser = await puppeteer.launch({ headless: false, devtools: true });
const page = await browser.newPage();
// Declared in this Node script's scope.
function nodeLog(msg) {
console.log(msg);
}
// Registers the map listener whenever the page emits its 'load' event.
page.on('load', async () => {
await page.evaluate(() => {
window.map.on('load', () => {
console.log("This runs on the index.html js but I do not need that");
// NOTE(review): the page.evaluate callback presumably executes in the
// browser page, where the Node-side nodeLog above is not defined.
// Confirm against the Puppeteer docs.
nodeLog("WHY IS THIS NOT WORKING??")
})
})
});
await page.goto(`file:${__dirname + '/index.html'}`);
})();
waitForSelector should work, eg. when using a selector from the readily rendered map... or listen for the map.bounds_changed or the map.idle event, which are triggered once the map is fully loaded. The map.load event might happen too soon.
Here's a working example, which I've just put together:
const puppeteer = require('puppeteer');
const url = 'https://developers-dot-devsite-v2-prod.appspot.com/maps/documentation/javascript/examples/full/map-simple';

// Attach the rejection handler to the promise that run() actually returns, so
// launch, navigation and waitForSelector failures all reach it.
run().catch(error => {
  console.log(error)
});
// Logged right after run() has started, before any of its awaited steps finish.
console.log('entering asynchronous execution.')

/**
 * Opens the map sample page, registers an 'idle' listener on window.map that
 * appends a marker <div id="puppeteer-map-idle">, and waits for that marker.
 *
 * Every step is awaited, so the returned promise rejects if any step fails
 * (including the 5000 ms waitForSelector timeout).
 *
 * @returns {Promise<void>} Settles once the marker element has been found.
 */
async function run() {
  const browser = await puppeteer.launch({devtools: true, headless: false});
  const page = await browser.newPage();
  await page.goto(url);

  await page.evaluate(() => {
    window.map.addListener('idle', function(){
      console.log('the map is idle now');
      // The marker element is what the Node side waits for below.
      const div = document.createElement('div');
      div.setAttribute('id', 'puppeteer-map-idle');
      window.document.body.append(div);
    });
  });

  await page.waitForSelector('#puppeteer-map-idle', {
    timeout: 5000
  });
  console.log('selector #puppeteer-map-idle has been found.');
  /* in here the map should be fully loaded. */

  // await browser.close();
}
Admittedly that's kind of a workaround, but the DOM manipulation can be observed.
I also figured out how to return information. I reread the docs and got some understanding. I was not understanding the context.
// Logs a value on the Node side. The original returned the console.log
// function itself without calling it, so nothing was ever printed.
const nodeLog = msg => console.log(msg);
// The value produced by the page.evaluate callback is the awaited result here.
const msg = await page.evaluate(() => 'this is working');
nodeLog(msg);
I want to take a screenshot with puppeteer and it's working for one post. But I want to make it iterate.
If it were a normal function, I could just call the function by name at the end of the code so that it iterates. But this is an async function, so I don't know how to iterate it.
const puppeteer = require('puppeteer');
// Post counter and the text of the first `.text` element, kept at module scope.
let postNumber = 1;
let by;

// Opens one post, screenshots its `.boardList` element, then bumps the counter.
(async () => {
  const launchOptions = {
    executablePath: 'C:\\Program Files (x86)\\Google\\Chrome\\Application\\chrome.exe',
    userDataDir: 'C:\\Users\\{computerName}\\AppData\\Local\\Google\\Chrome\\User Data',
    headless: false, // default is true
  };
  const browser = await puppeteer.launch(launchOptions);
  const page = await browser.newPage();

  const postUrl = `https://band.us/band/{someNumbers}/post/${postNumber}`;
  await page.goto(postUrl, { waitUntil: 'networkidle2' });

  const boardList = await page.$('.boardList');
  by = await page.evaluate(() => {
    const firstText = document.getElementsByClassName('text')[0];
    return firstText.textContent;
  });
  console.log(by);

  // The same file name is used for the screenshot path and the log line.
  const fileName = `${postNumber}-${by}.png`;
  await boardList.screenshot({ path: `./image/${fileName}` });
  console.log(`SAVED : ${fileName}`)

  postNumber++;
  await browser.close();
})();
After the function finishes, the postNumber variable should be increased by one, and then the code should run again with the new URL.
As you want to run the code one iteration after another, a normal for (or while) loop can be used. async/await code works fine with these.
You can use a for in your case like this:
// Skeleton: one browser and one page are created once and reused for every
// post; each iteration awaits its steps before the next one starts.
(async () => {
const browser = await puppeteer.launch(/* ... */);
const page = await browser.newPage();
// postNumber runs from 1 through 9 (the condition is postNumber < 10).
for (let postNumber = 1; postNumber < 10; postNumber++) {
await page.goto(/* ... */);
let element = await page.$('.boardList');
// ...
}
// Closed once, after the loop has finished.
await browser.close();
})();
You can use any appropriate loop, such as a while loop:
'use strict';
const puppeteer = require('puppeteer');
// Visits posts 1 through 10 one after another and saves a screenshot of each
// post's `.boardList` element to ./image/<postNumber>-<text>.png.
(async () => {
  const browser = await puppeteer.launch({
    executablePath: 'C:\\Program Files (x86)\\Google\\Chrome\\Application\\chrome.exe',
    userDataDir: 'C:\\Users\\{computerName}\\AppData\\Local\\Google\\Chrome\\User Data',
    headless: false // default is true
  });

  // try/finally guarantees the browser is closed even if a post fails.
  try {
    const page = await browser.newPage();
    let postNumber = 1;
    while (postNumber <= 10) {
      await page.goto(`https://band.us/band/{someNumbers}/post/${postNumber}`, {
        waitUntil: 'networkidle2'
      });
      const element = await page.$('.boardList');
      if (element !== null) {
        const by = await page.evaluate(() => document.getElementsByClassName('text')[0].textContent);
        console.log(by);
        await element.screenshot({
          path: `./image/${postNumber}-${by}.png`
        });
        console.log(`SAVED : ${postNumber}-${by}.png`)
      } else {
        // page.$ resolves to null when nothing matches; skip this post
        // instead of crashing on element.screenshot.
        console.warn(`SKIPPED : ${postNumber} (no .boardList element found)`);
      }
      postNumber++;
    }
  } finally {
    await browser.close();
  }
})();
Recently I started crawling the web using Puppeteer. Below is the code for extracting specific product names from a shopping mall.
const puppeteer = require('puppeteer');
// Searches Naver Shopping for '양말' and prints the first ten product titles.
(async () => {
  // The browser window and the page viewport share the same dimensions.
  const width = 1600;
  const height = 1040;

  const browser = await puppeteer.launch({
    headless: false,
    slowMo: true,
    args: [`--window-size=${width},${height}`],
  });
  const page = await browser.newPage();
  await page.setViewport({ width, height });

  // Start waiting for the navigation before triggering it, then await both.
  const pendingNavigation = page.waitForNavigation();
  await page.goto('https://shopping.naver.com/home/p/index.nhn');
  await pendingNavigation;
  await page.waitFor(2000);

  // Type the query into the search box (matched by class name) and submit it.
  const textBoxId = 'co_srh_input';
  await page.type(`.${textBoxId}`, '양말', { delay: 100 });
  await page.keyboard.press('Enter');

  await page.waitFor(5000);
  await page.waitForSelector('div.info > a.tit');

  const stores = await page.evaluate(() => {
    const titles = [];
    for (const link of document.querySelectorAll('div.info > a.tit')) {
      titles.push(link.innerText);
    }
    return titles.slice(0, 10); // fetch only 10 products
  });
  console.log(stores);

  await browser.close();
})();
I have a question. How can I output the crawled results to an HTML document (without using the database)? Please use sample code to explain it.
I used what was seen on blog.kowalczyk.info
const puppeteer = require("puppeteer");
const fs = require("fs");
/**
 * Loads https://www.google.com/ and writes the page's HTML to index.html.
 *
 * The browser is closed in a `finally` block so that a failed navigation or
 * file write does not leave a browser process running.
 *
 * @returns {Promise<void>} Settles after the file is written and the browser is closed.
 */
async function run() {
  const browser = await puppeteer.launch();
  try {
    const page = await browser.newPage();
    await page.goto("https://www.google.com/", { waitUntil: "networkidle2" });
    // hacky defensive move but I don't know a better way:
    // wait a bit so that the browser finishes executing JavaScript
    await page.waitFor(1 * 1000);
    const html = await page.content();
    fs.writeFileSync("index.html", html);
  } finally {
    await browser.close();
  }
}

// Handle a rejection explicitly instead of leaving the promise floating.
run().catch((error) => {
  console.error(error);
  process.exitCode = 1;
});
fs.writeFile()
You can use the following write_file function that returns a Promise that resolves or rejects when fs.writeFile() succeeds or fails.
Then, you can await the Promise from within your anonymous, asynchronous function and check whether or not the data was written to the file:
'use strict';
const fs = require('fs');
const puppeteer = require('puppeteer');
/**
 * Promise wrapper around fs.writeFile using UTF-8 encoding.
 *
 * @param {string} file - Path of the file to write.
 * @param {string} data - Text to write.
 * @returns {Promise<boolean>} Resolves with `true` on success; on failure the
 *   error is logged with console.error and the promise rejects with `false`.
 */
const write_file = (file, data) => {
  return new Promise((onWritten, onFailed) => {
    const handleResult = (writeError) => {
      if (!writeError) {
        onWritten(true);
        return;
      }
      console.error(writeError);
      onFailed(false);
    };
    fs.writeFile(file, data, 'utf8', handleResult);
  });
};
// Collects up to ten product titles from the page and writes them to
// example.html. The elided parts ("// ...") are expected to create `page`.
(async () => {
  // ...
  const stores = await page.evaluate(() => {
    return Array.from(document.querySelectorAll('div.info > a.tit'), link => link.innerText).slice(0, 10); // fetch only 10 products
  });
  // write_file rejects on failure, and awaiting a rejected promise throws, so
  // the failure path must be a catch block rather than a `=== false` check.
  try {
    await write_file('example.html', stores.toString());
  } catch (error) {
    // write_file has already logged the underlying error.
    console.error('Error: Unable to write stores to example.html.');
  }
  // ...
})(); // invoke the function; the original expression was never called