This question already has answers here:
Crawling multiple URLs in a loop using Puppeteer (5 answers)
Using async/await with a forEach loop (33 answers)
Closed 26 days ago.
Here's my script. I have a .csv file that contains URL addresses. For each URL, I'm scraping the text of the <div> element with the class "sc-kIKDeO.hIofms". But when I run my script, it opens a new browser window for each URL, which is neither efficient nor quick. I want to open a tab in the same browser instance for each URL instead. I've tried different things but still can't manage it. Could someone help me, please?
const puppeteer = require("puppeteer");
const fs = require("fs");

const urlList = fs.readFileSync("sample.csv", "utf-8").split("\n");

urlList.forEach(async (url) => {
  const browser = await puppeteer.launch({
    executablePath:
      "C:\\Program Files\\Google\\Chrome\\Application\\chrome.exe",
    headless: true,
    args: ["--incognito"],
  });
  const page = await browser.newPage();
  await page.setRequestInterception(true);
  page.on("request", (request) => {
    if (
      request.resourceType() === "image" ||
      request.resourceType() === "stylesheet" ||
      request.resourceType() === "font"
      /*|| request.resourceType() === 'script' && !request.url().includes('.sc-kIKDeO.hIofms')*/
    ) {
      request.abort();
    } else {
      request.continue();
    }
  });
  await page.goto(url, { waitUntil: "networkidle0", timeout: 60000 });
  try {
    await page.waitForSelector(".sc-kIKDeO.hIofms");
    const text = await page.evaluate(
      () => document.querySelector(".sc-kIKDeO.hIofms").textContent
    );
    console.log(text);
  } catch (error) {
    console.log(`error: ${error}`);
  }
  await browser.close();
});
I'm trying to scrape data from different URLs by opening each one in a different tab of the same browser instance.
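For reference, here is a minimal sketch of the tab-per-URL approach being asked about, assuming the same sample.csv and selector as above: launch the browser once, then open (and close) one page per URL inside a for...of loop, since forEach does not await its async callbacks.

const puppeteer = require("puppeteer");
const fs = require("fs");

(async () => {
  const urlList = fs.readFileSync("sample.csv", "utf-8").split("\n").filter(Boolean);

  // Launch a single browser instance up front...
  const browser = await puppeteer.launch({ headless: true });

  // ...and reuse it: one new tab (page) per URL, closed when done.
  for (const url of urlList) {
    const page = await browser.newPage();
    try {
      await page.goto(url, { waitUntil: "networkidle0", timeout: 60000 });
      await page.waitForSelector(".sc-kIKDeO.hIofms");
      const text = await page.$eval(".sc-kIKDeO.hIofms", (el) => el.textContent);
      console.log(text);
    } catch (error) {
      console.log(`error: ${error}`);
    } finally {
      await page.close();
    }
  }

  await browser.close();
})();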
I'm trying to scrape the video URL of Instagram videos using Puppeteer, but I'm unable to do it; it returns null as a response.
Here is my code:
async function getVideo() {
  const launch = await puppeteer.launch({ headless: true });
  const page = await launch.newPage();
  await page.goto('https://www.instagram.com/p/CfW5u5UJmny/?hl=en');
  const video = await page.evaluate(() => {
    return document.querySelector('video').src;
  });
  console.log(video); // returns null
  await launch.close();
}
Example URL: https://instagram.fluh1-1.fna.fbcdn.net/v/t50.16885-16/290072800_730588251588660_5005285215058589375_n.mp4?efg=eyJ2ZW5jb2RlX3RhZyI6InZ0c192b2RfdXJsZ2VuLjcyMC5pZ3R2LmJhc2VsaW5lIiwicWVfZ3JvdXBzIjoiW1wiaWdfd2ViX2RlbGl2ZXJ5X3Z0c19vdGZcIl0ifQ&_nc_ht=instagram.fluh1-1.fna.fbcdn.net&_nc_cat=100&_nc_ohc=ROJWkaOqkQcAX_z-_Ls&edm=AP_V10EBAAAA&vs=440468611258459_2442386419&_nc_vs=HBksFQAYJEdPQW9TaEUwaURaVmQ1Z0NBTC0yRkV0aVdIWkZidlZCQUFBRhUAAsgBABUAGCRHTEdvVHhGMWFjUUpsMzhDQUZNT0c1cV8wT3c1YnZWQkFBQUYVAgLIAQAoABgAGwGIB3VzZV9vaWwBMRUAACaa%2BO%2FYnLPeQBUCKAJDMywXQCDdsi0OVgQYEmRhc2hfYmFzZWxpbmVfMV92MREAdewHAA%3D%3D&ccb=7-5&oh=00_AfCBrACQlXOqmbGSWRk_6Urv_fmHJUFDIt-8w6EO0_UcHQ&oe=638D6CBD&_nc_sid=4f375e
You are loading the Instagram page, and since it takes a little while to load, I used the setTimeout function to wait. Puppeteer also has many built-in functions you can use to obtain the src, such as the following:
async function getVideo() {
  const launch = await puppeteer.launch({ headless: false });
  const page = await launch.newPage();
  await page.goto('https://www.instagram.com/p/CfW5u5UJmny/?hl=en');
  setTimeout(async () => {
    let src = await page.$eval("video", n => n.getAttribute("src"));
    console.log(src);
    await launch.close();
  }, 1000);
}
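As a side note, a fixed setTimeout can be flaky if the page loads slower than expected; Puppeteer's page.waitForSelector waits only as long as needed. A minimal variant of the same function under that substitution:

async function getVideo() {
  const launch = await puppeteer.launch({ headless: false });
  const page = await launch.newPage();
  await page.goto('https://www.instagram.com/p/CfW5u5UJmny/?hl=en');
  // Wait until the <video> element actually exists instead of sleeping a fixed time.
  await page.waitForSelector('video');
  const src = await page.$eval('video', n => n.getAttribute('src'));
  console.log(src);
  await launch.close();
}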
I'm using Puppeteer.js to crawl some URLs with Puppeteer's default Chromium browser. Everything works well until I start doing other things in the background and the focus is no longer on Puppeteer's Chromium window: then it waits for elements far too long and aborts operations. In other words, Puppeteer pauses (or freezes).
P.S. I'm also using the puppeteer-extra and puppeteer-extra-plugin-stealth NPM packages for advanced options.
Here is how I create the browser and the page:
async initiateCrawl(isDisableAsserts) {
  // Set the browser.
  this.isPlannedClose = false;
  const browser = await puppeteerExtra.launch({
    headless: false,
    args: [
      '--no-sandbox',
      '--disable-setuid-sandbox',
      '--disable-dev-shm-usage',
      '--start-maximized',
      '--disable-background-timer-throttling',
      '--disable-backgrounding-occluded-windows',
      '--disable-renderer-backgrounding'
    ]
  });
  const pid = browser.process().pid;
  browser.on('disconnected', () => {
    systemUtils.killProcess(pid);
    if (!this.isPlannedClose) {
      systemUtils.exit(Status.BROWSER_CLOSE, Color.RED, 0);
    }
  });
  process.on('SIGINT', () => {
    this.close(browser, true);
  });
  // Set the page and close the first empty tab.
  const page = await browser.newPage();
  const pages = await browser.pages();
  if (pages.length > 1) {
    await pages[0].close();
  }
  await page.setRequestInterception(true);
  await page.setJavaScriptEnabled(false);
  await page.setDefaultNavigationTimeout(this.timeout);
  page.on('request', (request) => {
    if (isDisableAsserts && ['image', 'stylesheet', 'font', 'script'].indexOf(request.resourceType()) !== -1) {
      request.abort();
    } else {
      request.continue();
    }
  });
  return {
    browser: browser,
    page: page
  };
}
I already looked at:
https://github.com/puppeteer/puppeteer/issues/3339
https://github.com/GoogleChrome/chrome-launcher/issues/169
https://www.gitmemory.com/issue/GoogleChrome/puppeteer/3339/530620329
Not working solutions:

const session = await page.target().createCDPSession();
await session.send('Page.enable');
await session.send('Page.setWebLifecycleState', { state: 'active' });

const chromeArgs = [
  '--disable-background-timer-throttling',
  '--disable-backgrounding-occluded-windows',
  '--disable-renderer-backgrounding'
];

var ops = {
  args: [
    '--kiosks',
    '--disable-background-timer-throttling',
    '--disable-backgrounding-occluded-windows',
    '--disable-renderer-backgrounding',
    '--disable-canvas-aa',
    '--disable-2d-canvas-clip-aa',
    '--disable-gl-drawing-for-tests',
    '--disable-dev-shm-usage',
    '--no-zygote',
    '--use-gl=desktop',
    '--enable-webgl',
    '--hide-scrollbars',
    '--mute-audio',
    '--start-maximized',
    '--no-first-run',
    '--disable-infobars',
    '--disable-breakpad',
    '--user-data-dir=' + tempFolder,
    '--no-sandbox',
    '--disable-setuid-sandbox'
  ],
  headless: false,
  timeout: 0
};
puppeteer = require('puppeteer');
browser = await puppeteer.launch(ops);
page = await browser.newPage();

Has anyone faced this issue before and have any idea how to solve it? Thanks.
My issue was solved when I updated to the latest puppeteer version (9.0.0).
I'm writing a simple script just to check whether all the resources load correctly (I check the status codes of the responses). I decided to use Puppeteer for this, and I wrote:
(async () => {
  const browser = await puppeteer.launch({ headless: false });
  const [page] = await browser.pages();
  page.on("response", (res) => {
    const url = res.url(), status = res.status();
    // my functionality goes here
    if (url.includes("favicon")) console.log(status, url); // not logging in headless
  });
  await page.goto("https://stackoverflow.com/", { waitUntil: 'networkidle2' });
  await browser.close();
})();
The issue is that if I run my application in headless mode, the favicon is missing from the responses; I assume this has something to do with Puppeteer not loading a favicon in headless mode. Is there any built-in functionality or workaround?
For lack of a better solution, right now I'm evaluating the favicon's URL and visiting it manually:
async function checkFavicon(page) {
  const iconUrl = await page.$eval("link[rel*='icon']", ({ href }) => href);
  await page.goto(iconUrl);
  await page.goBack();
}
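For completeness, here is one way the workaround could be wired into the response-checking script above. This is a hypothetical composition, not from the original answer; checkFavicon is the helper defined just above:

(async () => {
  const browser = await puppeteer.launch({ headless: true });
  const [page] = await browser.pages();
  page.on('response', (res) => {
    if (res.url().includes('favicon')) console.log(res.status(), res.url());
  });
  await page.goto('https://stackoverflow.com/', { waitUntil: 'networkidle2' });
  await checkFavicon(page); // visit the favicon manually so its response gets logged
  await browser.close();
})();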
Using Puppeteer (https://github.com/GoogleChrome/puppeteer), I have a page that's an application/pdf. With headless: false, the page is loaded through the Chromium PDF viewer, but I want to use headless mode. How can I download the original .pdf file, or use it as a blob with another library such as pdf-parse (https://www.npmjs.com/package/pdf-parse)?
Since Puppeteer does not currently support navigating to a PDF document in headless mode via page.goto() (due to an upstream issue), you can enable request interception with page.setRequestInterception(), listen for the 'request' event, and detect whether the resource is a PDF before using the request client to obtain the PDF buffer.
After obtaining the PDF buffer, call request.abort() to abort the original Puppeteer request; if the request is not for a PDF, call request.continue() to continue it normally.
Here's a full working example:
'use strict';

const puppeteer = require('puppeteer');
const request_client = require('request-promise-native');

(async () => {
  const browser = await puppeteer.launch();
  const page = await browser.newPage();
  await page.setRequestInterception(true);
  page.on('request', request => {
    if (request.url().endsWith('.pdf')) {
      request_client({
        uri: request.url(),
        encoding: null,
        headers: {
          'Content-type': 'application/pdf',
        },
      }).then(response => {
        console.log(response); // PDF Buffer
        request.abort();
      });
    } else {
      request.continue();
    }
  });
  await page.goto('https://example.com/hello-world.pdf').catch(error => {});
  await browser.close();
})();
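One caveat: request-promise-native has since been deprecated. A minimal stand-in using Node's built-in https module could look like the sketch below (fetchBuffer is a name chosen here for illustration, not a library function):

const https = require('https');

// Resolve with the full response body as a Buffer (e.g. the raw PDF bytes).
function fetchBuffer(url) {
  return new Promise((resolve, reject) => {
    https.get(url, (res) => {
      const chunks = [];
      res.on('data', (chunk) => chunks.push(chunk));
      res.on('end', () => resolve(Buffer.concat(chunks)));
    }).on('error', reject);
  });
}

// Usage inside the 'request' handler above:
// fetchBuffer(request.url()).then((pdfBuffer) => { /* ... */ request.abort(); });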
Grant Miller's solution didn't work for me because I was logged in to the website; if the PDF is public, though, his solution works out well.
The solution in my case was to add the cookies:
// request_client and fs as in the previous answer:
const fs = require('fs');
const request_client = require('request-promise-native');

await page.setRequestInterception(true);
page.on('request', async request => {
  if (request.url().indexOf('exibirFat.do') > 0) { // This condition is true only on the pdf page (in my case, of course)
    // Rebuild the request using Puppeteer's public accessors.
    const options = {
      encoding: null,
      method: request.method(),
      uri: request.url(),
      body: request.postData(),
      headers: request.headers()
    };
    /* add the cookies */
    const cookies = await page.cookies();
    options.headers.Cookie = cookies.map(ck => ck.name + '=' + ck.value).join(';');
    /* resend the request */
    const response = await request_client(options);
    // console.log(response); // PDF Buffer
    const buffer = response;
    const filename = 'file.pdf';
    fs.writeFileSync(filename, buffer); // Save file
    request.abort(); // settle the intercepted request once the PDF is saved
  } else {
    request.continue();
  }
});
I have a Node server. I pass a URL into request and then extract the contents with cheerio. Now what I'm trying to do is detect whether that webpage is using Google Analytics. How would I do this?
request({ uri: URL }, function (error, response, body) {
  if (!error) {
    const $ = cheerio.load(body);
    const usesAnalytics = body.includes('googletag') || body.includes('analytics.js') || body.includes('ga.js');
    const isUsingGA = ?;
  }
});
The official Analytics site says you can find certain strings that indicate GA is active. I have tried scanning the body for these, but they always return false even when the page is running GA; I included this in the code above.
I've also looked at websites that use it, and I can't see anything in their index page that suggests they're using it; it's only when I go into their sources that I see it. How would I detect this in Node?
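For what it's worth, with request and cheerio alone you can at least scan the script tags for the usual GA loader URLs. This is a heuristic sketch mirroring the question's snippet (the URL fragments below are common GA endpoints, not an exhaustive list), and it will still miss trackers injected later by JavaScript, which is why the Puppeteer answer that follows watches network requests instead:

const request = require('request');
const cheerio = require('cheerio');

request({ uri: URL }, function (error, response, body) {
  if (!error) {
    const $ = cheerio.load(body);
    // Check both external script URLs and inline snippets for GA markers.
    const gaPatterns = ['google-analytics.com', 'googletagmanager.com', 'gtag(', "ga('create'"];
    let isUsingGA = false;
    $('script').each((i, el) => {
      const src = $(el).attr('src') || '';
      const inline = $(el).html() || '';
      if (gaPatterns.some((p) => src.includes(p) || inline.includes(p))) {
        isUsingGA = true;
      }
    });
    console.log('Uses Google Analytics:', isUsingGA);
  }
});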
I have a Node script which uses Puppeteer to monitor the requests sent from a website.
I wrote this some time ago, so some parts might be irrelevant to you, but here you go:
'use strict';

const puppeteer = require('puppeteer');

function getGaTag(lookupDomain) {
  return new Promise((resolve) => {
    (async () => {
      var result = [];
      const browser = await puppeteer.launch({ headless: true });
      const page = await browser.newPage();
      await page.setRequestInterception(true);
      page.on('request', request => {
        const url = request.url();
        const regexp = /(UA|YT|MO)-\d+-\d+/i;
        // look for tracking script
        if (url.match(/^https?:\/\/www\.google-analytics\.com\/(r\/)?collect/i)) {
          console.log(url.match(regexp));
          console.log('\n');
          result.push(url.match(regexp)[0]);
        }
        request.continue();
      });
      try {
        await page.goto(lookupDomain);
        await page.waitFor(9000);
      } catch (err) {
        console.log("Couldn't fetch page " + err);
      }
      await browser.close();
      resolve(result);
    })();
  });
}

getGaTag('https://store.google.com/').then(result => {
  console.log(result);
});
Running node ga-check.js now returns the UA ID of the Google Analytics tracker on the lookup domain (in this case https://store.google.com): [ 'UA-54090495-1' ].
Hope this helps!