I'm just starting to play around with Puppeteer (Headless Chrome) and Nodejs. I'm scraping some test sites, and things work great when all the values are present, but if the value is missing I get an error like:
Cannot read property 'src' of null (so in the code below, the first two passes might have all values, but the third pass, there is no picture, so it just errors out).
Before I was using if(!picture) continue; but I think it's not working now because of the for loop.
Any help would be greatly appreciated, thanks!
for (let i = 1; i <= 3; i++) {
//...Getting to correct page and scraping it three times
const result = await page.evaluate(() => {
let title = document.querySelector('h1').innerText;
let article = document.querySelector('.c-entry-content').innerText;
let picture = document.querySelector('.c-picture img').src;
if (!document.querySelector('.c-picture img').src) {
let picture = 'No Link'; } //throws error
let source = "The Verge";
let categories = "Tech";
if (!picture)
continue; //throws error
return {
title,
article,
picture,
source,
categories
}
});
}
let picture = document.querySelector('.c-picture img').src;
if (!document.querySelector('.c-picture img').src) {
let picture = 'No Link'; } //throws error
If there is no picture, then document.querySelector() returns null, which does not have a src property. You need to check that your query found an element before trying to read the src property.
Moving the null-check to the top of the function has the added benefit of saving unnecessary calculations when you are just going to bail out anyway.
async function scrape3() {
// ...
for (let i = 1; i <= 3; i++) {
//...Getting to correct page and scraping it three times
const result = await page.evaluate(() => {
const pictureElement = document.querySelector('.c-picture img');
if (!pictureElement) return null;
const picture = pictureElement.src;
const title = document.querySelector('h1').innerText;
const article = document.querySelector('.c-entry-content').innerText;
const source = "The Verge";
const categories = "Tech";
return {
title,
article,
picture,
source,
categories
}
});
if (!result) continue;
// ... do stuff with result
}
Answering comment question: "Is there a way just to skip anything blank, and return the rest?"
Yes. You just need to check the existence of each element that could be missing before trying to read a property off of it. In this case we can omit the early return since you're always interested in all the results.
async function scrape3() {
// ...
for (let i = 1; i <= 3; i++) {
const result = await page.evaluate(() => {
const img = document.querySelector('.c-picture img');
const h1 = document.querySelector('h1');
const content = document.querySelector('.c-entry-content');
const picture = img ? img.src : '';
const title = h1 ? h1.innerText : '';
const article = content ? content.innerText : '';
const source = "The Verge";
const categories = "Tech";
return {
title,
article,
picture,
source,
categories
}
});
// ...
}
}
Further thoughts
Since I'm still on this question, let me take this one step further, and refactor it a bit with some higher level techniques you might be interested in. Not sure if this is exactly what you are after, but it should give you some ideas about writing more maintainable code.
// Generic reusable helper to return an object property
// if object exists and has property, else a default value
//
// This is a curried function accepting one argument at a
// time and capturing each parameter in a closure.
//
const maybeGetProp = default => key => object =>
(object && object.hasOwnProperty(key)) ? object.key : default
// Pass in empty string as the default value
//
const getPropOrEmptyString = maybeGetProp('')
// Apply the second parameter, the property name, making 2
// slightly different functions which have a default value
// and a property name pre-loaded. Both functions only need
// an object passed in to return either the property if it
// exists or an empty string.
//
const maybeText = getPropOrEmptyString('innerText')
const maybeSrc = getPropOrEmptyString('src')
async function scrape3() {
// ...
// The _ parameter name is acknowledging that we expect a
// an argument passed in but saying we plan to ignore it.
//
const evaluate = _ => page.evaluate(() => {
// Attempt to retrieve the desired elements
//
const img = document.querySelector('.c-picture img');
const h1 = document.querySelector('h1')
const content = document.querySelector('.c-entry-content')
// Return the results, with empty string in
// place of any missing properties.
//
return {
title: maybeText(h1),
article: maybeText(article),
picture: maybeSrc(img),
source: 'The Verge',
categories: 'Tech'
}
}))
// Start with an empty array of length 3
//
const evaluations = Array(3).fill()
// Then map over that array ignoring the undefined
// input and return a promise for a page evaluation
//
.map(evaluate)
// All 3 scrapes are occuring concurrently. We'll
// wait for all of them to finish.
//
const results = await Promise.all(evaluations)
// Now we have an array of results, so we can
// continue using array methods to iterate over them
// or otherwise manipulate or transform them
//
results
.filter(result => result.title && result.picture)
.forEach(result => {
//
// Do something with each result
//
})
}
Try-catch worked for me:
try {
if (await page.$eval('element')!==null) {
const name = await page.$eval('element')
}
}catch(error){
name = ''
}
Related
I have a function to render comments, in which each comment is stored as an object in an array. Comments can have reply comments, in which they have the exact same html and data to render, just their styling is different (via a CSS modifier class).
How can I make this function recursive? The renderReplies(comment.replies) calls a function that is the exact same as renderComments function, just without the mentioned function call renderReplies (as a reply to a reply is the exact same also in styling terms).
const renderComments = (comments) => {
commentsElement.innerHTML = '';
comments.forEach(comment => {
commentsElement.innerHTML += html;
// data-id attribute
const liElements = commentsElement.querySelectorAll('.comment');
const liElement = liElements.item(liElements.length - 1);
liElement.setAttribute('data-id', comment.id);
// author
liElement.querySelector('.comment__author').innerHTML = comment.user.username;
// avatar src & alt attributes
const avatar = liElement.querySelector('.comment__avatar');
avatar.setAttribute('src', comment.user.image.png);
avatar.setAttribute('alt', comment.user.username);
// time since posted
// content
const p = liElement.querySelector('.comment__text');
p.appendChild(document.createTextNode(comment.content));
// score
liElement.querySelector('.comment__score b').innerHTML = comment.score;
// replies
renderReplies(comment.replies);
});
};
Check whether there's a replies property. If it exists, call the function recursively. Since replies won't have replies of their own, you'll stop there.
const renderComments = (comments) => {
commentsElement.innerHTML = '';
comments.forEach(comment => {
commentsElement.innerHTML += html;
// data-id attribute
const liElements = commentsElement.querySelectorAll('.comment');
const liElement = liElements.item(liElements.length - 1);
liElement.setAttribute('data-id', comment.id);
// author
liElement.querySelector('.comment__author').innerHTML = comment.user.username;
// avatar src & alt attributes
const avatar = liElement.querySelector('.comment__avatar');
avatar.setAttribute('src', comment.user.image.png);
avatar.setAttribute('alt', comment.user.username);
// time since posted
// content
const p = liElement.querySelector('.comment__text');
p.appendChild(document.createTextNode(comment.content));
// score
liElement.querySelector('.comment__score b').innerHTML = comment.score;
// replies
if (comment.hasOwnProperty("replies")) {
renderComments(comment.replies);
}
});
};
I have created an async function that will extra the data from the argument, create a Postgres query based on a data, then did some processing using the retrieved query data. Yet, when I call this function inside a map function, it seemed like it has looped through all the element to extra the data from the argument first before it proceed to the second and the third part, which lead to wrong computation on the second element and onwards(the first element is always correct). I am new to async function, can someone please take at the below code? Thanks!
async function testWeightedScore(test, examData) {
var grade = [];
const testID = examData[test.name];
console.log(testID);
var res = await DefaultPostgresPool().query(
//postgres query based on the score constant
);
var result = res.rows;
for (var i = 0; i < result.length; i++) {
const score = result[i].score;
var weightScore = score * 20;
//more computation
const mid = { "testID": testID, "score": weightScore, more values...};
grade.push(mid);
}
return grade;
}
(async () => {
const examSession = [{"name": "Sally"},{"name": "Bob"},{"name": "Steph"}]
const examData = {
"Sally": 384258,
"Bob": 718239,
"Steph": 349285,
};
var test = [];
examSession.map(async sesion => {
var result = await testWeightedScore(sesion,examData);
let counts = result.reduce((prev, curr) => {
let count = prev.get(curr.testID) || 0;
prev.set(curr.testID, curr.score + count);
return prev;
}, new Map());
let reducedObjArr = [...counts].map(([testID, score]) => {
return {testID, score}
})
console.info(reducedObjArr);
}
);
})();
// The console log printed out all the tokenID first(loop through all the element in examSession ), before it printed out reducedObjArr for each element
The async/await behaviour is that the code pause at await, and do something else (async) until the result of await is provided.
So your code will launch a testWeightedScore, leave at the postgresql query (second await) and in the meantime go to the other entries in your map, log the id, then leave again at the query level.
I didn't read your function in detail however so I am unsure if your function is properly isolated or the order and completion of each call is important.
If you want each test to be fully done one after the other and not in 'parallel', you should do a for loop instead of a map.
I have the following code:
const readDataFromSql = () => {
// going to have to iterate through all known activities + load them here
let sql = "[...]"
return new Promise((resolve, reject) => {
executeSqlQuery(sql).then((dict) => {
let loadedData = [];
for (let key in dict) {
let newItemVal = new ItemVal("reading hw", 7121, progress.DONE);
loadedData.push(newItemVal);
}
resolve(loadedData);
});
});
}
ItemVal implementation:
class ItemVal {
constructor(name, time, type) {
this.name = name
this.time = time
this.type = type
}
}
Let's assume that newItemVal = "reading hwj", 5081, progress.PAUSED when readDataFromSql() first runs.
readDataFromSql() is then again called after some state changes -- where it repulls some information from a database and generates new values. What is perplexing, however, is that when it is called the second time, newItemVal still retains its old properties (attaching screenshot below).
Am I misusing the new keyword?
From what I can see in your example code, you are not mutating existing properties but creating a new object with the ItemVal constructor function and adding them to an array, that you then return as a resolved promise. Are you sure the examples you give a correct representation of what you are actually doing
Given that, I'm not sure what could be causing the issue you are having, but I would at least recommend a different structure for your code, using a simpler function for the itemVal.
Perhaps with this setup, you might get an error returned that might help you debug your issue.
const itemVal = (name, time, type) => ({ name, time, type })
const readDataFromSql = async () => {
try {
const sql = "[...]"
const dict = await executeSqlQuery(sql)
const loadedData = dict.map((key) =>
ItemVal("reading hw", 7121, progress.DONE)
)
return loadedData
} catch (error) {
return error
}
};
If the issue is not in the function, then I would assume that the way you handle the data, returned from the readDataFromSql function, is where the issue lies. You need to then share more details about your implementation.
const readDataFromSql = async () => {
let sql = "[...]"
------> await executeSqlQuery(sql).then((dict) => {
Use the await keyword instead of creating a new promise.
I did some modification and found that below code is working correctly, and updating the new values on each call.
const readDataFromSql = () => {
return new Promise((resolve, reject) => {
let loadedData = [];
let randomVal = Math.random();
let newItemVal = new ItemVal(randomVal*10, randomVal*100, randomVal*1000);
loadedData.push(newItemVal);
resolve(loadedData);
});
}
Could you recheck if you are using below line in the code, as it will instantiate object with same properties again and again.
let newItemVal = new ItemVal("reading hw", 7121, progress.DONE);
You can modify your code as below to simplify the problem.
const readDataFromSql = async () => {
// going to have to iterate through all known activities + load them here
let sql = "[...]" // define sql properly
let result = await executeSqlQuery(sql);
let loadedData = [];
for (let row in result) {
let newItemVal = new ItemVal(row.name, row.time, row.type);
loadedData.push(newItemVal);
}
return loadedData;
}
class ItemVal {
constructor(name, time, type) {
this.name = name
this.time = time
this.type = type
}
}
What you are talking about is an issue related to Object mutation in Redux, however, you didn't add any redux code. Anyway, you might be making some mistake while recreating(not mutating) the array.
General solution is the use spread operator as:
loadedData = [ ...loadedData.slice(0) , ...newloadedData]
In Dropdown.js line 188 instead of console.log-ing your variable write debugger;
This will function as a breakpoint. It will halt your code and you can inspect the value by hovering your mouse over the code BEFORE the newItemVal is changed again.
I can see in your screenshot that the newItemVal is modified again after you log it.
I am trying to scrape a one-page website. There are multiple selection combinations that would result in different search redirects. I wrote a for loop in the page.evaluate's call back function to click the different selections and did the click search in every button. However, I got error: Converting circular structure to JSON Are you passing a nested JSHandle?
Please help!
My current version of code looks like this:
const res = await page.evaluate(async (i, courseCountArr, page) => {
for (let j = 1; j < courseCountArr[i]; j++) {
await document.querySelectorAll('.btn-group > button, .bootstrap-select > button')['1'].click() // click on school drop down
await document.querySelectorAll('div.bs-container > div.dropdown-menu > ul > li > a')[`${j}`].click() // click on each school option
await document.querySelectorAll('.btn-group > button, .bootstrap-select > button')['2'].click() // click on subject drop down
const subjectLen = document.querySelectorAll('div.bs-container > div.dropdown-menu > ul > li > a').length // length of the subject drop down
for (let k = 1; k < subjectLen; k++) {
await document.querySelectorAll('div.bs-container > div.dropdown-menu > ul > li > a')[`${k}`].click() // click on each subject option
document.getElementById('buttonSearch').click() //click on search button
page.waitForSelector('.strong, .section-body')
return document.querySelectorAll('.strong, .section-body').length
}
}
}, i, courseCountArr, page);
Why the error happens
While you haven't shown enough code to reproduce the problem (is courseCountArr an array of ElementHandles? Passing page to evaluate won't work either, that's a Node object), here's a minimal reproduction that shows the likely pattern:
const puppeteer = require("puppeteer");
let browser;
(async () => {
const html = `<ul><li>a</li><li>b</li><li>c</li></ul>`;
browser = await puppeteer.launch();
const [page] = await browser.pages();
await page.setContent(html);
// ...
const nestedHandle = await page.$$("li"); // $$ selects all matches
await page.evaluate(els => {}, nestedHandle); // throws
// ...
})()
.catch(err => console.error(err))
.finally(() => browser?.close())
;
The output is
TypeError: Converting circular structure to JSON
--> starting at object with constructor 'BrowserContext'
| property '_browser' -> object with constructor 'Browser'
--- property '_defaultContext' closes the circle Are you passing a nested JSHandle?
at JSON.stringify (<anonymous>)
Why is this happening? All code inside of the callback to page.evaluate (and family: evaluateHandle, $eval, $$eval) is executed inside the browser console programmatically by Puppeteer. The browser console is a distinct environment from Node, where Puppeteer and the ElementHandles live. To bridge the inter-process gap, the callback to evaluate, parameters and return value are serialized and deserialized.
The consequence of this is that you can't access any Node state like you're attempting with page.waitForSelector('.strong, .section-body') inside the browser. page is in a totally different process from the browser. (As an aside, document.querySelectorAll is purely synchronous, so there's no point in awaiting it.)
Puppeteer ElementHandles are complex structures used to hook into the page's DOM that can't be serialized and passed to the page as you're trying to do. Puppeteer has to perform the translation under the hood. Any ElementHandles passed to evaluate (or have .evaluate() called on them) are followed to the DOM node in the browser that they represent, and that DOM node is what your evaluate's callback is invoked with. Puppeteer can't do this with nested ElementHandles, as of the time of writing.
Possible fixes
In the above code, if you change .$$ to .$, you'll retrieve only the first <li>. This singular, non-nested ElementHandle can be converted to an element:
// ...
const handle = await page.$("li");
const val = await page.evaluate(el => el.innerText, handle);
console.log(val); // => a
// ...
Or:
const handle = await page.$("li");
const val = await handle.evaluate(el => el.innerText);
console.log(val); // => a
Making this work on your example is a matter of either swapping the loop and the evaluate call so that you access courseCountArr[i] in Puppeteer land, unpacking the nested ElementHandles into separate parameters to evaluate, or moving most of your console browser calls to click on things back to Puppeteer (depending on your use case and goals with the code).
You could apply the evaluate call to each ElementHandle:
const nestedHandles = await page.$$("li");
for (const handle of nestedHandles) {
const val = await handle.evaluate(el => el.innerText);
console.log(val); // a b c
}
To get an array of results, you could do:
const nestedHandles = await page.$$("li");
const vals = await Promise.all(
nestedHandles.map(el => el.evaluate(el => el.innerText))
);
console.log(vals); // [ 'a', 'b', 'c' ]
You can also unpack the ElementHandles into arguments for evaluate and use the (...els) parameter list in the callback:
const nestedHandles = await page.$$("li");
const vals = await page.evaluate((...els) =>
els.map(e => e.innerText),
...nestedHandles
);
console.log(vals); // => [ 'a', 'b', 'c' ]
If you have other arguments in addition to the handles you can do:
const nestedHandle = await page.$$("li");
const vals = await page.evaluate((foo, bar, ...els) =>
els.map(e => e.innerText + foo + bar)
, 1, 2, ...nestedHandle);
console.log(vals); // => [ 'a12', 'b12', 'c12' ]
or:
const nestedHandle = await page.$$("li");
const vals = await page.evaluate(({foo, bar}, ...els) =>
els.map(e => e.innerText + foo + bar)
, {foo: 1, bar: 2}, ...nestedHandle);
console.log(vals); // => [ 'a12', 'b12', 'c12' ]
Another option may be to use $$eval, which selects multiple handles, then runs a callback in browser context with the array of selected elements as its parameter:
const vals = await page.$$eval("li", els =>
els.map(e => e.innerText)
);
console.log(vals); // => [ 'a', 'b', 'c' ]
This is probably cleanest if you're not doing anything else with the handles in Node.
Similarly, you can totally bypass Puppeteer and do the entire selection and manipulation in browser context:
const vals = await page.evaluate(() =>
[...document.querySelectorAll("li")].map(e => e.innerText)
);
console.log(vals); // => [ 'a', 'b', 'c' ]
(note that getting the inner text throughout is just a placeholder for whatever browser code of arbitrary complexity you might have)
I wrote a little utility to solve this problem
const jsHandleToJSON = (jsHandle) => {
if (jsHandle.length > 0) {
let json = []
for (let i = 0; i < jsHandle.length; i++) {
json.push(jsHandleToJSON(jsHandle[i]))
}
return json
} else {
let json = {}
const keys = Object.keys(jsHandle)
for (let i = 0; i < keys.length; i++) {
if (typeof jsHandle[keys[i]] !== 'object') {
json[keys[i]] = jsHandle[keys[i]]
} else if (['elements', 'element'].includes(keys[i])) {
json[keys[i]] = jsHandleToJSON(jsHandle[keys[i]])
} else {
console.log(`skipping field ${keys[i]}`)
}
}
return json
}
}
It will create a new object with all the primitive fields of the jsHandle (recursively) and parse some extra jsHandle properties ['elements', 'element'], skips the others.
You could add more properties in there if you need them (but adding all of them will result in a infinite loop).
To make the log into puppeteer working you need to add the following line before the evaluate
page.on('console', message => console.log(`${message.type()}: ${message.text()}`))
can anyone tell me where i'm going wrong?
I'm desperately trying to get my jquery carousel to re-render with the results of a filtered array - but really hitting issues today.
I grab the carouselId from its div with wrapper
i get the api array of objects & store it in moviesArray.
Then the key thing is that when someone has typed in the input, it re-renders the carousel based on that newly filtered array.
function populateCarousel() {
var moviesArray = [];
var tempResultsArr = [];
fetch('http://localhost:3000/getdata')
.then((res) => res.json())
.then((json) => {
moviesArray = json;
// ===== if NO searchTerm is entered ====
// THIS ONE WORKS
if (typeof searchTerm === 'undefined') {
moviesArray.map((each) => {
var eachTile = document.createElement("a");
var imageForEachTile = document.createElement("img");
imageForEachTile.setAttribute('src', each.poster);
eachTile.setAttribute("class", "carousel-item thumbnail-item");
eachTile.setAttribute("href", "#linkToSomeWhere");
eachTile.innerHTML = each.title;
eachTile.appendChild(imageForEachTile);
wrapper.appendChild(eachTile);
})
} else {
// ===== if a searchTerm *IS* entered ====
// THIS ONE DOESN'T WORK!
while (wrapper.firstChild) {
wrapper.removeChild(wrapper.firstChild);
}
function checkWords(searchTerm, arr) {
let st = searchTerm.toLowerCase();
return arr.filter(each => each.title.toLowerCase().indexOf(st) === 0);
}
let newArr = checkWords(searchTerm, moviesArray);
newArr.map((each) => {
var eachTile = document.createElement("a");
var imageForEachTile = document.createElement("img");
imageForEachTile.setAttribute('src', each.poster);
eachTile.setAttribute("class", "carousel-item thumbnail-item");
eachTile.setAttribute("href", "#linkToSomeWhere");
eachTile.innerHTML = each.title;
eachTile.appendChild(imageForEachTile);
wrapper.appendChild(eachTile);
})
// this logs out correctly & when i go into the html
// the correct searchresults are there, a and img tags.
// but they dont show.
console.log('wrapper: ', wrapper);
}
})
}
you notice the .map((each)... is identical on both sides of the IF/ELSE.
It does create the elements and attributes correctly, and does append them because a) i can log them out but b) i can actually inspect the HTMl in chrome and see them there.
But why aren't they actually showing?