Puppeteer: reload the page until a specific style changes - javascript

I want to open the web browser and keep reloading the page until the reloaded page has a different style; once it does, move on to the next functions, otherwise keep reloading.
Let's say I have a P tag: keep reloading the page while its display is block:
<p id="notYetStarted" style="display: block;">You need to reload the page if u can read me!</p>
but stop reloading the page once the P tag's display property is display: none; (and in that case, instead of reloading, continue executing the other code):
<p id="notYetStarted" style="display: none;">You need to reload the page if u can read me!</p>
I tried to use a recursive function but it's not working:
(async () => {
  try {
    // init a browser tab and wait until completely loaded, then go to next step
    const browser = await puppeteer.launch({ headless: false, args: ['--no-sandbox'] });
    const page = await browser.newPage();
    await page.setViewport({ width: 1366, height: 768 });
    await page.goto(url, { waitUntil: 'networkidle2' });
    // wait for the recursive function to resolve
    await checkPTag(page);
    // we are here because p.display is 'none'
    // continue executing other code :)
  } catch (err) {
    console.log(err);
  }
})();

const checkPTag = (page) => {
  return new Promise(async (resolve, reject) => {
    // search the DOM for the p tag and check its display property
    let result = await isPTagAvailable(page);
    if (result === 'not started') {
      // reload the page because p.display is 'block'
      await page.reload({ waitUntil: ["networkidle0", "domcontentloaded"] });
      // recursive call again
      await checkPTag(page);
    } else if (result === 'started') {
      // no need to reload the page because p.display is 'none'
      resolve('started');
    }
  });
};

const isPTagAvailable = (page) => {
  return new Promise(async (resolve, reject) => {
    await page.waitForSelector('#body');
    const pTags = await page.$$eval(
      '#body',
      nodes =>
        nodes.map(element => {
          const p = element.querySelector('p#notYetStarted');
          console.log(p);
          return JSON.parse(JSON.stringify(getComputedStyle(element, null).display));
        })
    );
    const pDisplay = pTags[0];
    if (pDisplay === 'block') {
      resolve('not started');
    } else {
      resolve('started');
    }
  });
};
The above code opens a web browser, waits until the DOM is completely loaded, gets the display value of the P tag and, since it is block, reloads the page. So far so good, but once the display value changes to none it still keeps trying to reload the page.
Sorry for the long code.

I think your code is just loading the same cached response as the first request, so you should add a random number at the end of the URL to make sure the response isn't served from the cache of the first request.
const puppeteer = require('puppeteer')
const urlPage = 'http://localhost/testing/test_display_none.html'

;(async () => {
  const browser = await puppeteer.launch({
    headless: false,
    devtools: false
  })
  const [page] = await browser.pages()
  page.setDefaultNavigationTimeout(0)

  const functionToExecute = async () => {
    // Code to run if P tag display is none (hidden)
    console.log('P tag display = none\n Executing next defined function...')
  }

  const ifTagPdisplayed = async () => {
    const openPage = await page.goto(urlPage + '?r=' + Date.now(), { waitUntil: 'networkidle2', timeout: 0 })
    const elemExist = await page.waitForSelector('#notYetStarted', { timeout: 0 })
    const getDisplay = await page.evaluate(() => document.querySelector('#notYetStarted').style.display === 'none')
    if (!getDisplay) {
      await ifTagPdisplayed()
    } else {
      await functionToExecute()
    }
  }

  await ifTagPdisplayed()
})()
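Note that element.style.display only reflects the inline style attribute shown in the question. If the display value ever comes from a stylesheet instead, that check would always be false. A variant that reads the computed style in a plain loop might look like this (just a sketch; waitUntilHidden is a made-up helper name, and it reuses the same cache-busting query parameter trick as above):

// Sketch: reload with a cache-busting query parameter until the element's
// *computed* display becomes 'none', then return so the rest of the script can run.
const waitUntilHidden = async (page, baseUrl) => {
  while (true) {
    await page.goto(baseUrl + '?r=' + Date.now(), { waitUntil: 'networkidle2' });
    await page.waitForSelector('#notYetStarted');
    const display = await page.$eval('#notYetStarted', el => getComputedStyle(el).display);
    if (display === 'none') return; // hidden now, stop reloading
  }
};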


"The execution context was destroyed, most likely due to a navigation" when scraping with Puppeteer in NextJs

I'm trying to create an application to search for my music on some sites that post illegal content so I can ask them to delete it later.
I am facing this problem in Puppeteer: when I try to press Enter on the search input I get this error: Error: Execution context was destroyed, most likely because of a navigation.
I have two files. One, called urlScrapper.js, has my script and an array with the names of my songs:
import InfringementFinder from './InfringementFinder.js';

const songs = ['Artist Name - Song Name', 'Artist Name - Song Name'];
const irscCodes = ['XXXXXXXXXX', 'XXXXXXXXXX'];

InfringementFinder(songs, irscCodes).then(() => {
  console.log('Search complete!');
}).catch((error) => {
  console.error(error);
});
and InfringementFinder.js:
import puppeteer from 'puppeteer';

const InfringementFinder = async (songs, irscCodes) => {
  const browser = await puppeteer.launch();
  const page = await browser.newPage();
  const mainPage = 'https://example.com/';
  await page.goto(mainPage);

  // This enters the search term in the input field
  await page.type('.search-field', 'Artist Name - Song Name'); // this is supposed to come from my props but somehow doesn't work

  // Trigger the search by submitting the form
  const searchSubmit = await page.waitForSelector('.search-submit');
  await searchSubmit.press('Enter');

  // Wait for the search results to load
  await page.waitForSelector('.g1-frame');

  // This finds the first entry-content element containing the information
  const entryContent = await page.$('.g1-frame');
  if (!entryContent) return;

  // This presses Enter on the element
  await entryContent.press('Enter');

  // Extract the relevant information
  try {
    const data = await page.evaluate(() => {
      const trackElements = Array.from(document.querySelectorAll('li', 'ol', 'a', 'href', 'strong', 'span', 'p', 'div', 'class'))
        .filter(el => el.innerText.includes('Artist Name - Song Name'));
      const tracks = trackElements.map(trackElement => {
        const trackName = trackElement.innerText.split(' – ')[0];
        return { trackName };
      });
      const downloadLinks = Array.from(document.querySelectorAll('.dl-btn'))
        .map(link => link.getAttribute('href'));
      return { tracks, downloadLinks };
    });
    console.log('Data:', data);
  } catch (error) {
    console.error(error);
  } finally {
    await browser.close();
  }
};

export default InfringementFinder;
It only works if I try to scrape a page where I know my music is posted, using a different version of the code, but the idea is to search the whole website using the search input.
The logic is as follows: you click on the search input, type the name of the song, navigate to another page, click on your music, navigate to another page, and scrape the names of the songs and the links to the illegal downloads.
Your error is probably due to the following code:
await entryContent.press('Enter'); // triggers a nav

// Extract the relevant information immediately
// without waiting for nav to complete
try {
  const data = await page.evaluate(() => {
I'd either wait for a nav here or wait for the selector on the next page you're about to access with evaluate.
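For example, something along these lines (a sketch, not the exact fix for your site; '.entry-content' is just an assumed selector on the page you land on):

// Press and wait for the resulting navigation in one step,
// so the execution context isn't destroyed mid-evaluate.
await Promise.all([
  page.waitForNavigation({ waitUntil: 'domcontentloaded' }),
  entryContent.press('Enter'),
]);
// ...or wait for an element you know exists on the next page:
await page.waitForSelector('.entry-content'); // assumed selector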
Also,
document.querySelectorAll('li', 'ol', 'a', 'href', 'strong', 'span', 'p', 'div', 'class')
doesn't make sense: querySelectorAll only accepts one parameter, and 'class' isn't a name of an HTML element. It's a good idea to test this in the browser first, because it's plain JS.
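If the intent was to match several element types at once, querySelectorAll takes a single comma-separated selector string (a sketch; trim the tag list to what you actually need):

// One selector string, multiple element types:
const trackElements = Array.from(
  document.querySelectorAll('li, ol, a, strong, span, p, div')
).filter(el => el.innerText.includes('Artist Name - Song Name'));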
I don't see 'Artist Name - Song Name' anywhere on the page.
This code:
await page.waitForSelector('.g1-frame');
// This finds the first entry-content element containing the information
const entryContent = await page.$('.g1-frame');
if (!entryContent) return;
could just be:
const entryContent = await page.waitForSelector('.g1-frame');
It's common to assume you need to navigate the site as the user would: go to the homepage, type in the search term, press Enter...
Better is to look at the query string of the search page and build your own, avoiding the extra nav and fuss of dealing with the DOM. Here's an example:
const puppeteer = require("puppeteer"); // ^19.6.3
const baseUrl = "<Your base URL, ending in .net>";
const searchTerm = "autechre";
let browser;
(async () => {
browser = await puppeteer.launch();
const [page] = await browser.pages();
const url = `${baseUrl}?s=${encodeURIComponent(searchTerm)}`;
await page.setJavaScriptEnabled(false);
await page.setRequestInterception(true);
page.on("request", request => {
if (request.resourceType() === "document") {
request.continue();
}
else {
request.abort();
}
});
await page.goto(url, {waitUntil: "domcontentloaded"});
await (await page.$(".g1-frame")).click();
const trackListEl = await page.waitForSelector(".entry-content > .greyf12");
const tracks = await trackListEl.$$eval("li", els => {
const fields = ["artist", "track"];
return els.map(e =>
Object.fromEntries(
e.textContent
.split(/ *– */)
.map((e, i) => [fields[i], e.trim()]),
)
);
});
const downloadLinks = await page.$$eval(".dl-btn", els =>
els.map(e => e.getAttribute("href"))
);
console.log({tracks, downloadLinks});
})()
.catch(err => console.error(err))
.finally(() => browser?.close());
Note that we don't need to execute JS and we're blocking almost all resource requests, so we can speed up the scrape significantly by switching to fetch/axios and a simple HTML parser like Cheerio:
const cheerio = require("cheerio"); // 1.0.0-rc.12
const baseUrl = "<Your base URL, ending in .net>";
const searchTerm = "autechre";
const url = `${baseUrl}?s=${encodeURIComponent(searchTerm)}`;
const get = url =>
fetch(url) // Node 18 or install node-fetch, or use another library like axios
.then(res => {
if (!res.ok) {
throw Error(res.statusText);
}
return res.text();
});
get(url)
.then(html =>
get(cheerio.load(html)(".entry-title a").attr("href"))
)
.then(html => {
const $ = cheerio.load(html);
const tracks = [...$(".entry-content > .greyf12 li")].map(
e => {
const fields = ["artist", "track"];
return Object.fromEntries(
$(e)
.text()
.split(/ *– */)
.map((e, i) => [fields[i], e.trim()])
);
}
);
const downloadLinks = [...$(".dl-btn")].map(e =>
$(e).attr("href")
);
console.log({tracks, downloadLinks});
});
The code is simpler, and on my machine, twice as fast as Puppeteer.

Puppeteer infinite scroll finishes but does not return all results

Here is the code in my data scraping file:
const puppeteer = require('puppeteer');
const db = require('../db');
const Job = require('../models/job');

(async () => {
  try {
    const browser = await puppeteer.launch({
      headless: false,
      defaultViewport: null,
      // args: ['--no-zygote', '--no-sandbox']
    });
    const url = 'https://www.linkedin.com/jobs/search?keywords=Junior%20Software%20Developer&location=Indianapolis%2C%20IN&geoId=&trk=homepage-jobseeker_jobs-search-bar_search-submit&position=1&pageNum=0';

    // Open browser instance
    const page = await browser.newPage({
      waitUntil: 'networkidle0'
    });
    console.log(`Navigating to ${url}`);
    await page.goto(url);

    // Scroll to bottom of page, click on 'See More Jobs' and repeat
    let lastHeight = await page.evaluate('document.body.scrollHeight');
    const scroll = async () => {
      while (true) {
        await page.evaluate('window.scrollTo(0, document.body.scrollHeight)');
        await page.waitForTimeout(2000);
        let newHeight = await page.evaluate('document.body.scrollHeight');
        if (newHeight === lastHeight) {
          console.log('Done scrolling!');
          break;
        }
        lastHeight = newHeight;
        seeMoreJobs();
      }
      console.log(data);
    }

    // Click on 'See More Jobs'
    const seeMoreJobs = async () => {
      await page.evaluate(() => {
        document.querySelector('button[data-tracking-control-name="infinite-scroller_show-more"]').click();
      });
    }

    // Collect data
    const data = await page.evaluate(() => {
      const allJobsArr = Array.from(document.querySelectorAll('a[data-tracking-control-name="public_jobs_jserp-result_search-card"]'));
      const namesAndUrls = allJobsArr.map(job => {
        return {
          name: job.innerText,
          url: job.href,
          path: job.pathname
        }
      });
      return namesAndUrls;
    });

    scroll();
  } catch (err) {
    console.log(err);
  }
})();
So the above code is designed to navigate to the variable url and then to scroll until the scroll function "breaks"/finishes, i.e., to the very bottom of the page. Once these actions have finished, I want to then log some data in the form of an array with three properties from each job posting: name, href, and path. When I run the IIFE as shown I am able to grab the first 24-25 job postings with my data function, which are the first to be displayed on page load (before any of the scrolling takes place).
For whatever reason, this data function is unable to evaluate the entire page or document after all the scrolling has occurred.
I have tried various things and have really analyzed what the code is doing, but alas, I am at a loss for a solution. My end goal here is to comb through every job posting that has displayed with my scrolling function and then to log everything (not just the first 24-25 results) returned with the desired data properties to the console.
Thanks, all.
OK, I have now figured out the reason why it was only pulling the first 25 results, and I believe it was a problem of scope, roughly as I had outlined in the original question. I ended up housing the 'data' function expression within the scroll() function, so that the same 'page' was being evaluated; otherwise I believe the two were looking at two different instances of the 'page'. I know this might not be the most accurate explanation, so if someone would like to better articulate it for me, that would be awesome. Here is the simple solution to the simple problem that I was having. Thanks.
const puppeteer = require('puppeteer');
const db = require('../db');
const Job = require('../models/job');

(async () => {
  try {
    const browser = await puppeteer.launch({
      headless: false,
      defaultViewport: null,
      // args: ['--no-zygote', '--no-sandbox']
    });
    const url = 'https://www.linkedin.com/jobs/search?keywords=Junior%20Software%20Developer&location=Indianapolis%2C%20IN&geoId=&trk=homepage-jobseeker_jobs-search-bar_search-submit&position=1&pageNum=0';

    // Open browser instance
    const page = await browser.newPage({
      waitUntil: 'networkidle0'
    });
    console.log(`Navigating to ${url}`);
    await page.goto(url);

    // Scroll to bottom of page, click on 'See More Jobs' and repeat
    let lastHeight = await page.evaluate('document.body.scrollHeight');
    const scroll = async () => {
      while (true) {
        await page.evaluate('window.scrollTo(0, document.body.scrollHeight)');
        await page.waitForTimeout(2000);
        let newHeight = await page.evaluate('document.body.scrollHeight');
        if (newHeight === lastHeight) {
          break;
        }
        lastHeight = newHeight;
        seeMoreJobs();
      }

      // Scrape all junior job titles
      const data = await page.evaluate(() => {
        const allJobsArr = Array.from(document.querySelectorAll('a[data-tracking-control-name="public_jobs_jserp-result_search-card"]'));
        const namesAndUrls = allJobsArr.map(job => {
          return {
            name: job.innerText,
            url: job.href,
            path: job.pathname
          }
        });
        const juniorJobs = namesAndUrls.filter(function (job) {
          return job.name.includes('Junior') || job.name.includes('Jr') || job.name.includes('Entry') && job.url && job.path;
        });
        return juniorJobs;
      });
      console.log(data);
    }

    // Click on 'See More Jobs'
    const seeMoreJobs = async () => {
      await page.evaluate(() => {
        document.querySelector('button[data-tracking-control-name="infinite-scroller_show-more"]').click();
      });
    }

    scroll();
  } catch (err) {
    console.log(err);
  }
})();
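One small caveat, not part of the original answer: recent Puppeteer releases deprecated and then removed page.waitForTimeout, and scroll()/seeMoreJobs() are called without await, which can swallow errors. A tiny sleep helper plus explicit awaits is an easy substitute (sketch only):

// Drop-in replacement for page.waitForTimeout on newer Puppeteer versions.
const sleep = ms => new Promise(resolve => setTimeout(resolve, ms));

// Inside the while loop:
//   await sleep(2000);
//   await seeMoreJobs();
// And at the end of the IIFE:
//   await scroll();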

Scrape nested page puppeteer

I would like to know how to scrape data located in nested pages. Here's an example I tried to build; however, I couldn't make it work. The idea is to go to https://dev.to/, click a post and grab its title, then go back and redo the process for the next post.
const puppeteer = require("puppeteer");
(async () => {
const browser = await puppeteer.launch();
const page = await browser.newPage();
await page.goto("https://dev.to/");
try {
const selectors = await page.$$(".crayons-story > a");
for (const post of selectors) {
await Promise.all([
page.waitForNavigation(),
post.click(),
page.goBack(),
]);
}
} catch (error) {
console.log(error);
} finally {
browser.close();
}
})();
When I run this code, I get
Error: Node is either not visible or not an HTMLElement
Edit: The code is missing the piece that grabs the title, but it is enough for the purpose of the question.
What is happening is that the website doesn't have that node yet when the page is opened, while Puppeteer fetches the web contents immediately after navigating to it. What you'll need is to wait so that the website can run its "script" tags and inject the stories.
To wait, use the following command:
await page.waitForSelector(".crayons-story > a")
This makes sure puppeteer waits for that selector to become visible, and then starts scraping the contents.
So your final code should look like this:
const puppeteer = require("puppeteer");
(async () => {
const browser = await puppeteer.launch();
const page = await browser.newPage();
await page.goto("https://dev.to/");
await page.waitForSelector(".crayons-story > a")
try {
const selectors = await page.$$(".crayons-story > a");
for (const post of selectors) {
await Promise.all([
page.waitForNavigation(),
post.click(".crayons-story > a"),
page.goBack(),
]);
}
} catch (error) {
console.log(error);
} finally {
browser.close();
}
})();
The problem I'm facing here is very similar to this one.
Puppeteer Execution context was destroyed, most likely because of a navigation
The best solution I could come up with is to avoid using page.goBack() and rather use page.goto() so the references are not lost.
Solution 1 (this one uses map, so the scrape resolves asynchronously; much quicker than the one below):
const puppeteer = require("puppeteer");
const SELECTOR_POSTS_LINK = ".article--post__title > a";
const SELECTOR_POST_TITLE = ".article-header--title";
async function scrape() {
const browser = await puppeteer.launch();
const page = await browser.newPage();
await page.goto("https://www.smashingmagazine.com/articles/");
try {
const links = await page.$$eval(SELECTOR_POSTS_LINK, (links) => links.map((link) => link.href));
const resolver = async (link) => {
await page.goto(link);
const title = await page.$eval(SELECTOR_POST_TITLE, (el) => el.textContent);
return { title };
};
const promises = await links.map((link) => resolver(link));
const articles = await Promise.all(promises);
console.log(articles);
} catch (error) {
console.log(error);
} finally {
browser.close();
}
}
scrape();
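If the article list is long, firing every navigation at once can be heavy on the browser. A simple way to cap concurrency, assuming the links and resolver from the snippet above, is to process the links in batches (a sketch, not part of the original answer):

// Process links in batches of, say, 5 concurrent tabs.
const chunkSize = 5;
const articles = [];
for (let i = 0; i < links.length; i += chunkSize) {
  const batch = links.slice(i, i + chunkSize).map(resolver);
  articles.push(...await Promise.all(batch));
}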
Solution 2 (uses for...of, so it runs sequentially and is much slower than the previous one):
const puppeteer = require("puppeteer");
const SELECTOR_POSTS_LINK = ".article--post__title > a";
const SELECTOR_POST_TITLE = ".article-header--title";
async function scrape() {
const browser = await puppeteer.launch();
const page = await browser.newPage();
await page.goto("https://www.smashingmagazine.com/articles/");
try {
const links = await page.$$eval(SELECTOR_POSTS_LINK, (links) => links.map((link) => link.href));
const articles = [];
for (const link of links) {
await page.goto(link);
const title = await page.$eval(SELECTOR_POST_TITLE, (el) => el.textContent);
articles.push({ title });
}
console.log(articles);
} catch (error) {
console.log(error);
} finally {
browser.close();
}
}
scrape();

Puppeteer to listen for map.on('load') from within Node

Using Puppeteer to listen for map.on('load') from within Node.
(async () => {
  const browser = await puppeteer.launch({ headless: false, devtools: true });
  const page = await browser.newPage();

  function nodeLog(msg) {
    console.log(msg);
  }

  page.on('load', async () => {
    await page.evaluate(() => {
      window.map.on('load', () => {
        console.log("This runs on the index.html js but I do not need that");
        nodeLog("WHY IS THIS NOT WORKING??")
      })
    })
  });

  await page.goto(`file:${__dirname + '/index.html'}`);
})();
waitForSelector should work, e.g. when using a selector from the readily rendered map... or listen for the map.bounds_changed or the map.idle event, which are triggered once the map is fully loaded. The map.load event might happen too soon.
Here's a working example, which I've just put together:
const puppeteer = require('puppeteer');
const url = 'https://developers-dot-devsite-v2-prod.appspot.com/maps/documentation/javascript/examples/full/map-simple';

run().then(() => {
  console.log('entering asynchronous execution.')
}).catch(error => {
  console.log(error)
});

async function run() {
  puppeteer
    .launch({ devtools: true, headless: false })
    .then(async browser => {
      const page = await browser.newPage();
      await page.goto(url);
      await page.evaluate(() => {
        window.map.addListener('idle', function () {
          console.log('the map is idle now');
          var div = document.createElement('div');
          div.setAttribute('id', 'puppeteer-map-idle');
          window.document.body.append(div);
        });
      });
      await page.waitForSelector('#puppeteer-map-idle', {
        timeout: 5000
      }).then((res) => {
        console.log('selector #puppeteer-map-idle has been found.');
        /* in here the map should be fully loaded. */
      });
      // await browser.close();
    });
}
Admittedly that's kind of a workaround, but the DOM manipulation can be observed.
I also figured out how to return information. I reread the docs and got some understanding; I had not understood the execution context.
const nodeLog = msg => console.log(msg);
const msg = await page.evaluate(() => { return 'this is working' });
nodeLog(msg);
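The reason the original nodeLog("WHY IS THIS NOT WORKING??") call did nothing is that functions defined in Node aren't visible inside page.evaluate; that callback runs in the browser. If you want browser-side code (such as a map event handler) to call back into Node, page.exposeFunction is the usual route. A minimal sketch, assuming a window.map object with an on('load') event as in the question:

// Expose a Node-side function so the page can call it by name.
await page.exposeFunction('nodeLog', msg => console.log('from page:', msg));

await page.evaluate(() => {
  // Runs in the browser; window.nodeLog proxies back to the Node process.
  window.map.on('load', () => window.nodeLog('map finished loading'));
});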

How to store every network request into an array given the loop behaviour of request.continue()?

I'm trying to get all the network requests when a page is accessed and store them into an array.
My code looks like this:
await page.setRequestInterceptionEnabled(true);
page.on('request', request => {
  if (request.url) {
    var networkRequests = request.url;
    var networkArray = [];
    for (var i = 0; i < networkRequests; i++) {
      networkArray.push(networkRequests[i]);
    }
    console.log(networkArray);
    console.log(typeof networkArray);
    request.continue();
  } else {
    request.abort();
  }
});

await page.goto('http://www.example.com', {waitUntil: 'networkidle'});
I find that the problem is with request.continue(): the handler runs once for each fetched request, and each run only gives me that single request as a string.
That means I end up with several separate strings.
The problem is that I couldn't manage to insert all those strings into one array so I can make use of them later. I tried several for loops but didn't succeed.
A quick fix has been found in the meantime:
const puppeteer = require('puppeteer');

function extractRequests(url) {
  return new Promise((resolve, reject) => {
    (async () => {
      const browser = await puppeteer.launch();
      const page = await browser.newPage();
      await page.setRequestInterceptionEnabled(true);
      let result = [];
      page.on('request', request => {
        if (request.url) {
          var networkRequests = request.url;
          result.push(networkRequests);
          request.continue();
        } else {
          request.abort();
        }
      });
      page.goto(url, {
        waitUntil: 'networkidle'
      })
        .then(_ => setTimeout(_ => resolve(result), 1000));
    })();
  });
}

extractRequests('http://example.com').then(requests => {
  console.log(requests.filter(x => x.includes('event-name') && x.includes('other-event-name')));
  process.exit(0);
});
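For reference, newer Puppeteer versions renamed parts of this API: page.setRequestInterception(true) replaces setRequestInterceptionEnabled, request.url() is a method rather than a property, and the goto option is 'networkidle0' or 'networkidle2' instead of 'networkidle'. The same idea against the current API might look like this (a sketch; adjust to your version):

const puppeteer = require('puppeteer');

async function extractRequests(url) {
  const browser = await puppeteer.launch();
  const page = await browser.newPage();
  await page.setRequestInterception(true);

  const result = [];
  page.on('request', request => {
    result.push(request.url()); // url() is a method in newer versions
    request.continue();
  });

  await page.goto(url, { waitUntil: 'networkidle0' });
  await browser.close();
  return result;
}

extractRequests('http://example.com')
  .then(requests => console.log(requests));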
