I am testing out the Node.js modules x-ray and cheerio.
Following is my code:
const Xray = require("x-ray");
const cheerio = require('cheerio');
const xray = new Xray();
xray('https://news.ycombinator.com/', 'body@html')((err, result) => {
  const $ = cheerio.load(`<body>${result}</body>`);
  const elements = $('body')
    .find('*')
    .filter((i, e) => (
      e.type === 'tag'
    ))
    .map((i, e) => {
      e.foo = {
        id: i
      };
      return e;
    })
    .filter((i, e) => (
      e.foo.id % 2 === 0
    ));
  const elementsArray = elements.toArray();
  console.log('Length of the array is:', elementsArray.length);
  elementsArray.forEach((e) => {
    console.log('I appear to print only once, even though elementsArray has lots of elements');
  });
});
The issue here is that the console.log() inside the forEach loop appears to print only once, even though the earlier console.log(elementsArray.length) reports about 369 elements.
Runkit link to test it out
I checked the type of elementsArray and I get Array or array as the type. Why then does the loop run only once?
The message is shown many times, but the console consolidates repeated identical messages into one line with a counter next to it.
If you change the message to make it unique every time, you'll see the difference.
For instance, if you use the index in the message:
elementsArray.forEach((e, i) => {
  console.log(i); // different message on every iteration
});
See the difference with the updated script on runkit.com
Good Afternoon,
I am using the MERN stack to make a simple invoice application.
I have a function that runs two nested forEach() loops, going through the invoices in the DB and the users. If the emails match, it collects the invoices for that user.
When I log DBElement to the console it works and has the proper data, but when I log test1 to the console (from app.get()) it only has one object, not both.
// forEach() function
function matchUserAndInvoice(dbInvoices, dbUsers) {
  dbInvoices.forEach((DBElement) => {
    dbUsers.forEach((userElement) => {
      if (DBElement.customer_email === userElement.email) {
        const arrayNew = [DBElement];
        arrayNew.push(DBElement);
        app.set('test', arrayNew);
      }
    })
  })
}
// end point that triggers the function and uses the data.
app.get('/test', async (req, res) => {
  const invoices = app.get('Invoices');
  const users = await fetchUsersFromDB().catch((e) => { console.log(e) });
  matchUserAndInvoice(invoices, users, res);
  const test1 = await app.get('test');
  console.log(test1);
  res.json(test1);
})
function matchUserAndInvoice(dbInvoices, dbUsers) {
  let newArray = [];
  dbInvoices.forEach((DBElement) => {
    dbUsers.forEach(async (userElement) => {
      if (DBElement.customer_email === userElement.email) {
        newArray.push(DBElement);
        app.set('test', newArray);
      }
    })
  })
}
app.set('test', DBElement); overrides the previously stored value each time it runs, so only the last matching DBElement is shown in test1.
If you want test to contain all matching DBElements, you should set it to an array, and then append each matching DBElement to that array inside the loop:
if (DBElement.customer_email === userElement.email) {
  let newArray = await app.get('test');
  newArray.push(DBElement);
  app.set('test', newArray);
}
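For completeness, a minimal sketch of that approach applied to the whole function from the question (resetting the 'test' setting to an empty array at the start of the function is an addition here, so stale matches don't accumulate between calls):

function matchUserAndInvoice(dbInvoices, dbUsers) {
  app.set('test', []); // start fresh on every call
  dbInvoices.forEach((DBElement) => {
    dbUsers.forEach((userElement) => {
      if (DBElement.customer_email === userElement.email) {
        const matches = app.get('test'); // app.get() is synchronous, no await needed
        matches.push(DBElement);
        app.set('test', matches);
      }
    })
  })
}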
I am trying to scrape a one-page website. There are multiple selection combinations that result in different search redirects. I wrote a for loop in the callback passed to page.evaluate to click through the different selections and click the search button for each one. However, I got this error: Converting circular structure to JSON Are you passing a nested JSHandle?
Please help!
My current version of code looks like this:
const res = await page.evaluate(async (i, courseCountArr, page) => {
  for (let j = 1; j < courseCountArr[i]; j++) {
    await document.querySelectorAll('.btn-group > button, .bootstrap-select > button')['1'].click() // click on school drop down
    await document.querySelectorAll('div.bs-container > div.dropdown-menu > ul > li > a')[`${j}`].click() // click on each school option
    await document.querySelectorAll('.btn-group > button, .bootstrap-select > button')['2'].click() // click on subject drop down
    const subjectLen = document.querySelectorAll('div.bs-container > div.dropdown-menu > ul > li > a').length // length of the subject drop down
    for (let k = 1; k < subjectLen; k++) {
      await document.querySelectorAll('div.bs-container > div.dropdown-menu > ul > li > a')[`${k}`].click() // click on each subject option
      document.getElementById('buttonSearch').click() // click on search button
      page.waitForSelector('.strong, .section-body')
      return document.querySelectorAll('.strong, .section-body').length
    }
  }
}, i, courseCountArr, page);
Why the error happens
While you haven't shown enough code to reproduce the problem (is courseCountArr an array of ElementHandles? Passing page to evaluate won't work either, that's a Node object), here's a minimal reproduction that shows the likely pattern:
const puppeteer = require("puppeteer");

let browser;
(async () => {
  const html = `<ul><li>a</li><li>b</li><li>c</li></ul>`;
  browser = await puppeteer.launch();
  const [page] = await browser.pages();
  await page.setContent(html);
  // ...
  const nestedHandle = await page.$$("li"); // $$ selects all matches
  await page.evaluate(els => {}, nestedHandle); // throws
  // ...
})()
  .catch(err => console.error(err))
  .finally(() => browser?.close());
The output is
TypeError: Converting circular structure to JSON
--> starting at object with constructor 'BrowserContext'
| property '_browser' -> object with constructor 'Browser'
--- property '_defaultContext' closes the circle Are you passing a nested JSHandle?
at JSON.stringify (<anonymous>)
Why is this happening? All code inside the callback to page.evaluate (and family: evaluateHandle, $eval, $$eval) is executed inside the browser console programmatically by Puppeteer. The browser console is a distinct environment from Node, where Puppeteer and the ElementHandles live. To bridge the inter-process gap, the callback passed to evaluate, its parameters, and its return value are serialized and deserialized.
The consequence of this is that you can't access any Node state like you're attempting with page.waitForSelector('.strong, .section-body') inside the browser. page is in a totally different process from the browser. (As an aside, document.querySelectorAll is purely synchronous, so there's no point in awaiting it.)
Puppeteer ElementHandles are complex structures used to hook into the page's DOM that can't be serialized and passed to the page as you're trying to do; Puppeteer has to perform the translation under the hood. Any ElementHandle passed to evaluate (or that has .evaluate() called on it) is followed to the DOM node in the browser that it represents, and that DOM node is what your evaluate callback is invoked with. Puppeteer can't do this with nested ElementHandles, as of the time of writing.
Possible fixes
In the above code, if you change .$$ to .$, you'll retrieve only the first <li>. This singular, non-nested ElementHandle can be converted to an element:
// ...
const handle = await page.$("li");
const val = await page.evaluate(el => el.innerText, handle);
console.log(val); // => a
// ...
Or:
const handle = await page.$("li");
const val = await handle.evaluate(el => el.innerText);
console.log(val); // => a
Making this work on your example is a matter of either swapping the loop and the evaluate call so that you access courseCountArr[i] in Puppeteer land, unpacking the nested ElementHandles into separate parameters to evaluate, or moving most of the browser-side calls that click on things back to Puppeteer (depending on your use case and goals for the code).
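The last option, keeping the clicking in Puppeteer land, might look roughly like the sketch below. The selectors come from your snippet, but the loop bounds and what you do with the count are placeholders, and on the real page you'd likely need to re-open the dropdown on each iteration:

// Rough sketch only: drive the dropdowns with ElementHandles in Node,
// so page.waitForSelector works and no handles cross the evaluate boundary.
const buttons = await page.$$('.btn-group > button, .bootstrap-select > button');
await buttons[1].click(); // open the school dropdown

const schoolOptions = await page.$$('div.bs-container > div.dropdown-menu > ul > li > a');
for (let j = 1; j < schoolOptions.length; j++) {
  await schoolOptions[j].click();                        // pick a school
  await page.click('#buttonSearch');                     // run the search
  await page.waitForSelector('.strong, .section-body');  // fine here: we're in Node
  const count = await page.$$eval('.strong, .section-body', els => els.length);
  console.log(count); // do something with the result
}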
You could apply the evaluate call to each ElementHandle:
const nestedHandles = await page.$$("li");
for (const handle of nestedHandles) {
const val = await handle.evaluate(el => el.innerText);
console.log(val); // a b c
}
To get an array of results, you could do:
const nestedHandles = await page.$$("li");
const vals = await Promise.all(
nestedHandles.map(el => el.evaluate(el => el.innerText))
);
console.log(vals); // [ 'a', 'b', 'c' ]
You can also unpack the ElementHandles into arguments for evaluate and use the (...els) parameter list in the callback:
const nestedHandles = await page.$$("li");
const vals = await page.evaluate((...els) =>
els.map(e => e.innerText),
...nestedHandles
);
console.log(vals); // => [ 'a', 'b', 'c' ]
If you have other arguments in addition to the handles you can do:
const nestedHandle = await page.$$("li");
const vals = await page.evaluate((foo, bar, ...els) =>
els.map(e => e.innerText + foo + bar)
, 1, 2, ...nestedHandle);
console.log(vals); // => [ 'a12', 'b12', 'c12' ]
or:
const nestedHandle = await page.$$("li");
const vals = await page.evaluate(({foo, bar}, ...els) =>
els.map(e => e.innerText + foo + bar)
, {foo: 1, bar: 2}, ...nestedHandle);
console.log(vals); // => [ 'a12', 'b12', 'c12' ]
Another option may be to use $$eval, which selects multiple elements, then runs a callback in browser context with the array of selected elements as its parameter:
const vals = await page.$$eval("li", els =>
els.map(e => e.innerText)
);
console.log(vals); // => [ 'a', 'b', 'c' ]
This is probably cleanest if you're not doing anything else with the handles in Node.
Similarly, you can totally bypass Puppeteer and do the entire selection and manipulation in browser context:
const vals = await page.evaluate(() =>
[...document.querySelectorAll("li")].map(e => e.innerText)
);
console.log(vals); // => [ 'a', 'b', 'c' ]
(note that getting the inner text throughout is just a placeholder for whatever browser code of arbitrary complexity you might have)
I wrote a little utility to solve this problem
const jsHandleToJSON = (jsHandle) => {
  if (jsHandle.length > 0) {
    let json = []
    for (let i = 0; i < jsHandle.length; i++) {
      json.push(jsHandleToJSON(jsHandle[i]))
    }
    return json
  } else {
    let json = {}
    const keys = Object.keys(jsHandle)
    for (let i = 0; i < keys.length; i++) {
      if (typeof jsHandle[keys[i]] !== 'object') {
        json[keys[i]] = jsHandle[keys[i]]
      } else if (['elements', 'element'].includes(keys[i])) {
        json[keys[i]] = jsHandleToJSON(jsHandle[keys[i]])
      } else {
        console.log(`skipping field ${keys[i]}`)
      }
    }
    return json
  }
}
It creates a new object with all the primitive fields of the jsHandle (recursively), recurses into a couple of extra properties (['elements', 'element']), and skips the others.
You could add more properties there if you need them (but adding all of them will result in an infinite loop).
To make console.log output from inside the page show up in Puppeteer, you need to add the following line before the evaluate call:
page.on('console', message => console.log(`${message.type()}: ${message.text()}`))
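For instance, a minimal sketch (the logged message is just an illustration):

// Forward browser console messages to the Node terminal,
// so anything logged inside evaluate becomes visible.
page.on('console', message => console.log(`${message.type()}: ${message.text()}`));
await page.evaluate(() => {
  console.log('this line now shows up in the Node terminal');
});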
I'm trying to iterate over and print, in order, a JavaScript array that contains the titles of 2 events I obtained by scraping a website, but it prints out of order. I know JavaScript is asynchronous, but I'm new to this world of asynchrony. How can I implement the for loop to print the array in order and give customized info?
agent.add('...') is like console.log('...'). I'm building a chatbot with Dialogflow and Node.js 8, but that's not important at this moment. I used console.log() in the return just for debugging.
I tried the following:
async function printEvent(event) {
  agent.add(event)
}

async function runLoop(eventsTitles) {
  for (let i = 0; i < eventsTitles.length; i++) {
    aux = await printEvent(eventsTitles[i])
  }
}
But I got this ESLint error: Unexpected await inside a loop (no-await-in-loop).
async function showEvents(agent) {
  const cheerio = require('cheerio');
  const rp = require('request-promise');
  const options = {
    uri: 'https://www.utb.edu.co/eventos',
    transform: function (body) {
      return cheerio.load(body);
    }
  }
  return rp(options)
    .then($ => {
      // ** HERE STARTS THE PROBLEM **
      var eventsTitles = [] // array of event's titles
      agent.add(`This month we have these events available: \n`)
      $('.product-title').each(function (i, elem) {
        var event = $(this).text()
        eventsTitles.push(event)
      })
      agent.add(`${eventsTitles}`) // The array prints in order, but if I iterate over it, it prints out of order.
      // *** IMPLEMENT LOOP FOR ***
      agent.add(`To obtain more info click on this link https://www.utb.edu.co/eventos`)
      return console.log(`Show available events`);
    }).catch(err => {
      agent.add(`${err}`)
      return console.log(err)
    })
}
I would like to always print Event title #1 first and then Event title #2. Something like this:
eventsTitles.forEach((event, index) => {
  agent.add(`${index}. ${event}`) // remember this is like console.log(`${index}. ${event}`)
})
Thanks for any help and explanation!
There is no async issue here, but if you still face difficulty, use this loop:
for (let index = 0; index < eventsTitles.length; index++) {
  const element = eventsTitles[index];
  agent.add(`${index}. ${element}`)
}
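If you prefer forEach, an equivalent sketch (note that the callback receives the element first and the index second):

eventsTitles.forEach((element, index) => {
  agent.add(`${index}. ${element}`) // same output as the for loop above
})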
I used tf.expandDims() to add dimensions. I'm able to get into model.fit(), but I got stuck due to these errors: Cannot start training because another fit() call is ongoing. and Cannot read property 'length' of undefined.
You can find my code here
// Train the model using the data.
let tesnor_dim = [];
let tensr;
for (var j = 0; j < 2; j++) {
  console.log('resize_image', resize_image);
  tensr = tf.expandDims(ysarr[j], 0);
  tesnor_dim.push(tensr);
  console.log('tesnor_dim', tesnor_dim);
  model.fit(resize_image[j], tesnor_dim[j], {epochs: 100}).then((loss) => {
    console.log('resize_image[j]', resize_image[j]);
    console.log('tesnor_dim[j]', tesnor_dim[j]);
    console.log('loss', loss);
    const t = model.predict(resize_image[j]);
    console.log('Prediction:::' + t);
    pred = t.argMax(1).dataSync(); // get the class of highest probability
    const labelsPred = Array.from(pred).map(e => setLabel[e])
    console.log('labelsPred:::' + labelsPred);
    //const saveResults = model.save('downloads://my-model-1');
    //console.log(saveResults);
  }).catch((e) => {
    console.log(e.message);
  })
}
When multiple fit() calls are made on the same model, they have to be done sequentially: the second call has to start only after the first one has completed. Using async and await will prevent your second call from happening until the first has completed.
loss = await model.fit(resize_image[j], tesnor_dim[j], {epochs: 100})
// continue rest of processing
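A minimal sketch of the whole loop made sequential, with the variable names taken from the question (the enclosing function must be async; model.fit resolves to a History object whose history.loss holds the per-epoch losses):

for (let j = 0; j < 2; j++) {
  const tensr = tf.expandDims(ysarr[j], 0);
  // awaiting here guarantees the previous fit() has finished
  const history = await model.fit(resize_image[j], tensr, {epochs: 100});
  console.log('loss', history.history.loss);
  const t = model.predict(resize_image[j]);
  const pred = t.argMax(1).dataSync(); // class with the highest probability
  console.log('prediction', Array.from(pred));
}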
You may use this code:
await model.fit(resize_image[j], tesnor_dim[j], {epochs: 100}).then((loss) => {
  console.log('resize_image[j]', resize_image[j]);
  console.log('tesnor_dim[j]', tesnor_dim[j]);
  console.log('loss', loss);
  const t = model.predict(resize_image[j]);
  console.log('Prediction:::' + t);
  pred = t.argMax(1).dataSync(); // get the class of highest probability
  const labelsPred = Array.from(pred).map(e => setLabel[e])
  console.log('labelsPred:::' + labelsPred);
  //const saveResults = model.save('downloads://my-model-1');
  //console.log(saveResults);
}).catch((e) => {
  console.log(e.message);
})
}
I'm just starting to play around with Puppeteer (Headless Chrome) and Node.js. I'm scraping some test sites, and things work great when all the values are present, but if a value is missing I get an error like:
Cannot read property 'src' of null (so in the code below, the first two passes might have all values, but on the third pass there is no picture, so it just errors out).
Before, I was using if (!picture) continue; but I think it's not working now because of the for loop.
Any help would be greatly appreciated, thanks!
for (let i = 1; i <= 3; i++) {
  //...Getting to correct page and scraping it three times
  const result = await page.evaluate(() => {
    let title = document.querySelector('h1').innerText;
    let article = document.querySelector('.c-entry-content').innerText;
    let picture = document.querySelector('.c-picture img').src;
    if (!document.querySelector('.c-picture img').src) {
      let picture = 'No Link'; } //throws error
    let source = "The Verge";
    let categories = "Tech";
    if (!picture)
      continue; //throws error
    return {
      title,
      article,
      picture,
      source,
      categories
    }
  });
}
let picture = document.querySelector('.c-picture img').src;
if (!document.querySelector('.c-picture img').src) {
  let picture = 'No Link'; } //throws error
If there is no picture, then document.querySelector() returns null, which does not have a src property. You need to check that your query found an element before trying to read the src property.
Moving the null-check to the top of the function has the added benefit of saving unnecessary calculations when you are just going to bail out anyway.
async function scrape3() {
  // ...
  for (let i = 1; i <= 3; i++) {
    //...Getting to correct page and scraping it three times
    const result = await page.evaluate(() => {
      const pictureElement = document.querySelector('.c-picture img');
      if (!pictureElement) return null;

      const picture = pictureElement.src;
      const title = document.querySelector('h1').innerText;
      const article = document.querySelector('.c-entry-content').innerText;
      const source = "The Verge";
      const categories = "Tech";

      return {
        title,
        article,
        picture,
        source,
        categories
      }
    });
    if (!result) continue;
    // ... do stuff with result
  }
}
Answering comment question: "Is there a way just to skip anything blank, and return the rest?"
Yes. You just need to check the existence of each element that could be missing before trying to read a property off of it. In this case we can omit the early return since you're always interested in all the results.
async function scrape3() {
  // ...
  for (let i = 1; i <= 3; i++) {
    const result = await page.evaluate(() => {
      const img = document.querySelector('.c-picture img');
      const h1 = document.querySelector('h1');
      const content = document.querySelector('.c-entry-content');

      const picture = img ? img.src : '';
      const title = h1 ? h1.innerText : '';
      const article = content ? content.innerText : '';
      const source = "The Verge";
      const categories = "Tech";

      return {
        title,
        article,
        picture,
        source,
        categories
      }
    });
    // ...
  }
}
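A more compact variant of that evaluate callback uses optional chaining and nullish coalescing (newer syntax, but supported by the Chromium bundled with recent Puppeteer versions); this is an alternative sketch, not the code from the question:

const result = await page.evaluate(() => ({
  // ?. short-circuits to undefined when an element is missing,
  // and ?? substitutes the empty-string default
  title: document.querySelector('h1')?.innerText ?? '',
  article: document.querySelector('.c-entry-content')?.innerText ?? '',
  picture: document.querySelector('.c-picture img')?.src ?? '',
  source: 'The Verge',
  categories: 'Tech'
}));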
Further thoughts
Since I'm still on this question, let me take this one step further, and refactor it a bit with some higher level techniques you might be interested in. Not sure if this is exactly what you are after, but it should give you some ideas about writing more maintainable code.
async function scrape3() {
  // ...

  // The _ parameter name is acknowledging that we expect
  // an argument passed in but saying we plan to ignore it.
  //
  const evaluate = _ => page.evaluate(() => {
    // Generic reusable helper to return an object property
    // if the object exists and has the property, else a
    // default value.
    //
    // This is a curried function accepting one argument at
    // a time and capturing each parameter in a closure. It
    // is defined inside the evaluate callback so that it
    // exists in the browser context where it runs.
    //
    const maybeGetProp = defaultValue => key => object =>
      (object && key in object) ? object[key] : defaultValue

    // Pass in empty string as the default value
    //
    const getPropOrEmptyString = maybeGetProp('')

    // Apply the second parameter, the property name, making
    // 2 slightly different functions which have a default
    // value and a property name pre-loaded. Both functions
    // only need an object passed in to return either the
    // property if it exists or an empty string.
    //
    const maybeText = getPropOrEmptyString('innerText')
    const maybeSrc = getPropOrEmptyString('src')

    // Attempt to retrieve the desired elements
    //
    const img = document.querySelector('.c-picture img')
    const h1 = document.querySelector('h1')
    const content = document.querySelector('.c-entry-content')

    // Return the results, with empty string in
    // place of any missing properties.
    //
    return {
      title: maybeText(h1),
      article: maybeText(content),
      picture: maybeSrc(img),
      source: 'The Verge',
      categories: 'Tech'
    }
  })

  // Start with an empty array of length 3, then map over it
  // (ignoring the undefined input) to get a promise for a
  // page evaluation per slot.
  //
  const evaluations = Array(3).fill().map(evaluate)

  // All 3 scrapes are occurring concurrently. We'll
  // wait for all of them to finish.
  //
  const results = await Promise.all(evaluations)

  // Now we have an array of results, so we can
  // continue using array methods to iterate over them
  // or otherwise manipulate or transform them.
  //
  results
    .filter(result => result.title && result.picture)
    .forEach(result => {
      //
      // Do something with each result
      //
    })
}
Try-catch worked for me:
let name = ''
try {
  // page.$() returns null when the selector matches nothing,
  // so it works as an existence check; $eval needs a callback.
  if (await page.$('element') !== null) {
    name = await page.$eval('element', el => el.src) // 'element' and .src are placeholders
  }
} catch (error) {
  name = ''
}