Puppeteer getting Error: Evaluation failed: TypeError: Cannot read properties of null (reading 'innerText') when returning multiple values - javascript

I'm using Puppeteer to get some data from a site. I need to return multiple values from the page, but for some reason I can only return one at a time. Whenever I try to return multiple values (as in the code below), I get the following error and can't figure out why: Error: Evaluation failed: TypeError: Cannot read properties of null (reading 'innerText').
Code
const puppeteer = require("puppeteer");

(async () => {
  try {
    const chromeBrowser = await puppeteer.launch({ headless: true });
    const page = await chromeBrowser.newPage();
    await page.goto("https://www.sec.gov/edgar/search/#/category=form-cat2", {timeout: 0});
    const getInfo = await page.evaluate(() => {
      const secTableEN = document.querySelector(".table td.entity-name");
      const secTableFiled = document.querySelector(".table td.entity-filed");
      const secTableLink = document.querySelector(".table td.filetype");
      return {
        secTableEN: secTableEN.innerText,
        secTableFiled: secTableFiled.innerText,
      };
    });
    console.log(getInfo);
    await page.close();
    await chromeBrowser.close();
  } catch (e) {
    console.error(e);
  }
})();

Two problems:
The page loads the data dynamically, so you should waitForSelector before querying.
.entity-filed should be .filed.
const puppeteer = require("puppeteer"); // ^19.0.0
const url = "<your URL>";
let browser;
(async () => {
browser = await puppeteer.launch();
const [page] = await browser.pages();
const $ = (...args) => page.waitForSelector(...args);
const text = async (...args) =>
(await $(...args)).evaluate(el => el.textContent.trim());
await page.goto(url, {waitUntil: "domcontentloaded"});
const info = {
secTableEN: await text(".table td.entity-name"),
secTableFiled: await text(".table td.filed"),
secTableLink: await text(".table td.filetype"),
};
console.log(info);
})()
.catch(err => console.error(err))
.finally(() => browser?.close());
As an aside, I wouldn't use {timeout: 0}. If the page doesn't load after a minute or so, something is wrong and you should probably report an error rather than hang the script forever.
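For example, a minimal sketch (the 60-second cap is an arbitrary placeholder, not a value from the question) that lets a slow page surface as an error instead of hanging forever:
// Hypothetical: cap navigation at 60 seconds rather than disabling the timeout entirely.
try {
  await page.goto(url, {waitUntil: "domcontentloaded", timeout: 60_000});
} catch (err) {
  console.error("Navigation failed or timed out:", err.message);
}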
Another approach is to avoid the DOM and simply intercept the API response with the payload you're interested in:
// ... same boilerplate as above ...
browser = await puppeteer.launch();
const [page] = await browser.pages();
const resP = page.waitForResponse(res =>
  res.url() === "https://efts.sec.gov/LATEST/search-index"
);
await page.goto(url, {waitUntil: "domcontentloaded"});
const res = await resP;
const data = JSON.parse(await res.text());
const hit = data.hits.hits[0]._source;
const info = {
  secTableEN: hit.display_names[0],
  secTableFiled: hit.file_date,
  secTableLink: hit.file_type // slightly different output than from the DOM
};
console.log(info);
// ...

Related

Cannot read properties of null (reading 'textContent') in Puppeteer evaluate

I am trying to use Puppeteer to scrape an element from an eBay search results page. However, when I run my code, I get an error that says "Cannot read properties of null (reading 'textContent')". This is my code:
const puppeteer = require("puppeteer");

async function scrape() {
  const browser = await puppeteer.launch({
    headless: false
  });
  const page = await browser.newPage();
  await page.goto('https://www.ebay.com/sch/i.html?_from=R40&_nkw=Blastoise+%282%2F102%29+%5BCelebrations%3A+Classic+Collection%5D&_sacat=0&Graded=No&_dcat=183454&rt=nc&LH_Sold=1&LH_Complete=1');
  await page.waitForSelector('.s-item');
  let cards = await page.evaluate(() => {
    let cardElement = document.body.querySelectorAll('.s-item')
    let cards = Object.values(cardElement).map(x => {
      return {
        date: x.querySelector('.s-item__title--tagblock span.POSITIVE').textContent ?? null
      }
    })
    return cards
  })
  console.log(cards)
}

scrape();
How can I solve this?
The first item in the result list is sort of a dummy/template/placeholder node that has no data in it. You can use .slice(1) or .s-item:not(:first-child) to skip that first node.
While you're at it, you might as well make your waitForSelector more precise so it matches the element you're about to select (although that line isn't strictly necessary, as I'll explain below), use domcontentloaded to speed up goto, and use ?.textContent so you see undefined rather than a crash, which is how I debugged the problem. Note that .textContent ?? null doesn't work, because the null dereference throws before ?? null gets a chance to evaluate (see the short illustration after the code below).
const puppeteer = require("puppeteer"); // ^14.1.1
let browser;
(async () => {
browser = await puppeteer.launch({headless: true});
const [page] = await browser.pages();
const url = "https://www.ebay.com/sch/i.html?_from=R40&_nkw=Blastoise+%282%2F102%29+%5BCelebrations%3A+Classic+Collection%5D&_sacat=0&Graded=No&_dcat=183454&rt=nc&LH_Sold=1&LH_Complete=1";
await page.goto(url, {waitUntil: "domcontentloaded"});
await page.waitForSelector(".s-item .s-item__title--tagblock span.POSITIVE");
const cards = await page.evaluate(() =>
[...document.querySelectorAll(".s-item:not(:first-child)")].map(x => ({
date: x
.querySelector(".s-item__title--tagblock span.POSITIVE")
?.textContent
}))
);
console.log(cards);
})()
.catch(err => console.error(err))
.finally(() => browser?.close())
;
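To make the nullish-coalescing point concrete, here's a tiny standalone illustration (the el variable is a made-up stand-in for a missing DOM node):
const el = null; // what querySelector returns when nothing matches

// Throws: the .textContent access happens before ?? can supply a fallback.
// console.log(el.textContent ?? null);

// Works: optional chaining short-circuits to undefined instead of throwing.
console.log(el?.textContent ?? null); // null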
Better yet, skip Puppeteer entirely and make an HTTP request with Axios or fetch, then use Cheerio to parse the static HTML, which already has the data you need:
const axios = require("axios");
const cheerio = require("cheerio");
const url = "https://www.ebay.com/sch/i.html?_from=R40&_nkw=Blastoise+%282%2F102%29+%5BCelebrations%3A+Classic+Collection%5D&_sacat=0&Graded=No&_dcat=183454&rt=nc&LH_Sold=1&LH_Complete=1";
axios.get(url).then(({data}) => {
const $ = cheerio.load(data);
const cards = [];
$(".s-item:not(:first-child)").each(function (i, e) {
$(this).find(".s-item__title--tagblock .POSITIVE").each(function (i, e) {
cards.push($(this).text().trim());
});
});
console.log(cards);
})
.catch(err => console.error(err))
;
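The same idea with the built-in fetch (available globally in Node 18+) instead of Axios, as a rough sketch:
const cheerio = require("cheerio");

const url = "https://www.ebay.com/sch/i.html?_from=R40&_nkw=Blastoise+%282%2F102%29+%5BCelebrations%3A+Classic+Collection%5D&_sacat=0&Graded=No&_dcat=183454&rt=nc&LH_Sold=1&LH_Complete=1";

(async () => {
  const res = await fetch(url); // global fetch, Node 18+
  const $ = cheerio.load(await res.text());
  const cards = $(".s-item:not(:first-child)")
    .toArray()
    .map(e => $(e).find(".s-item__title--tagblock .POSITIVE").text().trim())
    .filter(Boolean); // drop items without a sold date
  console.log(cards);
})().catch(err => console.error(err));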

Scrape nested page puppeteer

I would like to know how to scrape data located in nested pages. Here's an example I tried to build, but I couldn't make it work. The idea is to go to https://dev.to/, click a post and grab its title, then go back and repeat the process for the next post.
const puppeteer = require("puppeteer");
(async () => {
const browser = await puppeteer.launch();
const page = await browser.newPage();
await page.goto("https://dev.to/");
try {
const selectors = await page.$$(".crayons-story > a");
for (const post of selectors) {
await Promise.all([
page.waitForNavigation(),
post.click(),
page.goBack(),
]);
}
} catch (error) {
console.log(error);
} finally {
browser.close();
}
})();
When I run this code, I get
Error: Node is either not visible or not an HTMLElement
Edit: the code is missing the piece that grabs the title, but it's enough for the purpose of this question.
What is happening is that the website doesn't have that node yet when the page first opens; Puppeteer reads the page contents immediately after navigating to it. What you need is to wait until the site's scripts have run and injected the stories into the DOM.
To wait, use the following:
await page.waitForSelector(".crayons-story > a")
This makes Puppeteer wait for that selector to appear before it starts scraping the contents.
So your final code should look like this:
const puppeteer = require("puppeteer");
(async () => {
const browser = await puppeteer.launch();
const page = await browser.newPage();
await page.goto("https://dev.to/");
await page.waitForSelector(".crayons-story > a")
try {
const selectors = await page.$$(".crayons-story > a");
for (const post of selectors) {
await Promise.all([
page.waitForNavigation(),
post.click(".crayons-story > a"),
page.goBack(),
]);
}
} catch (error) {
console.log(error);
} finally {
browser.close();
}
})();
The problem I'm facing here is very similar to this one.
Puppeteer Execution context was destroyed, most likely because of a navigation
The best solution I could come up with is to avoid using page.goBack() and rather use page.goto() so the references are not lost.
Solution 1 (this one uses map, so the scrapes resolve concurrently, which is much quicker than the one below):
const puppeteer = require("puppeteer");
const SELECTOR_POSTS_LINK = ".article--post__title > a";
const SELECTOR_POST_TITLE = ".article-header--title";
async function scrape() {
const browser = await puppeteer.launch();
const page = await browser.newPage();
await page.goto("https://www.smashingmagazine.com/articles/");
try {
const links = await page.$$eval(SELECTOR_POSTS_LINK, (links) => links.map((link) => link.href));
const resolver = async (link) => {
await page.goto(link);
const title = await page.$eval(SELECTOR_POST_TITLE, (el) => el.textContent);
return { title };
};
const promises = await links.map((link) => resolver(link));
const articles = await Promise.all(promises);
console.log(articles);
} catch (error) {
console.log(error);
} finally {
browser.close();
}
}
scrape();
Solution 2 (uses for...of, so it runs sequentially and is therefore slower than the previous one):
const puppeteer = require("puppeteer");
const SELECTOR_POSTS_LINK = ".article--post__title > a";
const SELECTOR_POST_TITLE = ".article-header--title";
async function scrape() {
const browser = await puppeteer.launch();
const page = await browser.newPage();
await page.goto("https://www.smashingmagazine.com/articles/");
try {
const links = await page.$$eval(SELECTOR_POSTS_LINK, (links) => links.map((link) => link.href));
const articles = [];
for (const link of links) {
await page.goto(link);
const title = await page.$eval(SELECTOR_POST_TITLE, (el) => el.textContent);
articles.push({ title });
}
console.log(articles);
} catch (error) {
console.log(error);
} finally {
browser.close();
}
}
scrape();

puppeteer trouble. Or, at least, javascript trouble; you decide, please

Can someone please explain what might be going wrong here:
await page.click('some-selector-that-devtools-confirms-is-definitely-there')
let peeps = await page.evaluate(() => {
  document.querySelector('some-selector-that-devtools-confirms-is-definitely-there')
});
console.log('classes: '+peeps.classList)
I've tried page.wait...., to no avail, same error
Error
TypeError: Cannot read property 'classList' of undefined
Incidentally, is there a best practice for finding out if an element has a certain css class?
You have two problems here.
You don't return document.querySelector('some-selector-that-devtools-confirms-is-definitely-there'), so the variable peeps will be undefined.
You expect that you can return any value from page.evaluate(), but actually you can only return a serializable value, so it is not possible to pass an element or NodeList back from the page environment with this method.
Example of returning the classList via page.evaluate():
const puppeteer = require("puppeteer");
(async () => {
const browser = await puppeteer.launch();
const page = await browser.newPage();
await page.goto("https://google.com", { waitUntil: "networkidle2" });
const classList = await page.evaluate(() => {
return [...document.querySelector("div").classList];
});
console.log(classList);
await browser.close();
})();
There are two main problems with your code:
Your evaluate method is not returning anything;
You need to access the classList inside the evaluate method
Here's an example:
const puppeteer = require("puppeteer");
(async () => {
const browser = await puppeteer.launch();
const page = await browser.newPage();
await page.goto("https://stackoverflow.com/");
const classes = await page.evaluate(() => {
return document.querySelector("body").classList;
});
console.log(classes);
await browser.close();
})();
As an alternative approach, you could use getProperty("className"):
const puppeteer = require("puppeteer");
(async () => {
const browser = await puppeteer.launch();
const page = await browser.newPage();
await page.goto("https://stackoverflow.com/");
const el = await page.$("body");
const className = await el.getProperty("className");
const classes = className._remoteObject.value.split(" ");
console.log(classes);
await browser.close();
})();
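As for the side question about checking whether an element has a certain CSS class, a minimal sketch (the selector and class name are placeholders) is to ask the page with classList.contains:
// Hypothetical selector and class name, purely for illustration.
const hasClass = await page.$eval(
  "some-selector",
  (el, cls) => el.classList.contains(cls),
  "some-class"
);
console.log(hasClass); // true or false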

Puppeteer unable to use get property

Cannot read property 'getProperty' of undefined is the error that I get.
const puppeteer = require('puppeteer');

async function scrapeUdemy(url) {
  try {
    const browser = await puppeteer.launch({headless: false, slowMo: 250});
    const page = await browser.newPage();
    await page.goto(url);
    const [el] = await page.$x('//*[@id="udemy"]/div[1]/div[4]/div/div/div[2]/div/div/div[1]/a/div[1]/div[1]');
    const txt = await el.getProperty('textContent');
    const rawTxt = await txt.jsonValue();
    console.log({rawTxt});
    browser.close();
  } catch (err) {
    console.log(err.message);
  }
}

scrapeUdemy('https://www.udemy.com/user/eren-cem-salta/');
I tried other versions, but it doesn't work. The catch block doesn't help either.
The element that you want to get is loaded via AJAX after the page loads, so you have to wait until it appears in the DOM:
await page.waitForSelector('[data-purpose="course-card-container"] div.udlite-heading-sm');
And why not use the same selector to get all of the cards:
const titles = await page.evaluate(() => {
  const nodes = document.querySelectorAll(
    '[data-purpose="course-card-container"] div.udlite-heading-sm'
  );
  return [...nodes].map((node) => node.textContent);
});
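Putting the two fragments together, a rough end-to-end sketch (reusing the URL and selectors from above; Udemy's markup may have changed since, so treat the selectors as assumptions):
const puppeteer = require('puppeteer');

(async () => {
  const browser = await puppeteer.launch({headless: true});
  const page = await browser.newPage();
  await page.goto('https://www.udemy.com/user/eren-cem-salta/');
  // Wait for the AJAX-loaded course cards before querying them.
  await page.waitForSelector('[data-purpose="course-card-container"] div.udlite-heading-sm');
  const titles = await page.evaluate(() => {
    const nodes = document.querySelectorAll(
      '[data-purpose="course-card-container"] div.udlite-heading-sm'
    );
    return [...nodes].map((node) => node.textContent.trim());
  });
  console.log(titles);
  await browser.close();
})().catch(err => console.error(err));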

Create async loop using js

I'm trying to implement an asynchronous .each() loop in Node.js.
I have a variable html which contains the page content. I want to iterate through all divs that have a particular class. Inside those divs there are links that I want to navigate to and get some content from as well. The problem is that since .each() expects a synchronous callback, it doesn't wait for the asynchronous code inside it to finish.
I tried to do it like this:
const browser = await puppeteer.launch({
  headless: true
});
const page = await browser.newPage();
const page2 = await browser.newPage();
const mainUrl = "http ... ";
const html = await page.goto(mainUrl)
  .then(function() {
    return page.content();
  });

await $('.data-row', html).each(function() => {
  const url = await $(this).find(".link-details a").attr("href");
  page2.goto(url)
    .then(function() {
      const title = await page.evaluate(el => el.innerHTML, await page.$('#title'));
      // do other things
    });
  // do other things
  // create a json with data add it to a list
});
But the title comes back undefined, and it's only evaluated after the loop has finished executing... What can I do here?
I've edited your code to show how Puppeteer is supposed to be used. Your main problem here was using jQuery where it wasn't needed and awaiting things that weren't asynchronous, while also mixing in a promise chain.
const puppeteer = require("puppeteer");

(async () => {
  const browser = await puppeteer.launch({
    headless: true
  });
  const page = await browser.newPage();
  const page2 = await browser.newPage();
  const mainUrl = "http ... ";

  /*const html = await page.goto(mainUrl)
    .then(function() {
      return page.content();
    });*/

  await page.goto(mainUrl);
  await page.waitForSelector('.data-row');
  // Element handles stay linked to the live DOM, unlike the serialized values
  // page.evaluate() returns, so each row can still be queried below.
  const dataRows = await page.$$('.data-row');

  /*await $('.data-row', html).each(function() => {
    const url = await $(this).find(".link-details a").attr("href");
    await page2.goto(url)
      .then(function() {
        const title = await page.evaluate(el => el.innerHTML, await page.$('#title'));
        // do other things
      });
    // do other things
    // create a json with data add it to a list
  });*/

  for (const row of dataRows) {
    const url = await row.$eval(".link-details a", el => el.href);
    await page2.goto(url);
    const title = await page2.evaluate(() => document.title);
    console.log(title);
  }
})();
You can't await jQuery's .each(), so you can try doing the following instead.
const rows = $('.data-row', html).toArray();
for (const row of rows) {
  const url = $(row).find(".link-details a").attr("href");
  await page2.goto(url)
    .then(async function() {
      const title = await page2.evaluate(el => el.innerHTML, await page2.$('#title'));
      // do other things
    });
  // do other things
  // create a json with data add it to a list
}
