.each in Phantom.js with Cheerio - any alternative? - javascript

I'm looking for a function that would let me iterate through the div elements scraped by PhantomJS (which uses jQuery-like syntax) one by one, not all at the same time like .each seems to be doing. So I guess I need it to run synchronously.
At the moment my code looks something like this:
page.open("https://www.google.com" + expandedurl, function (status) {
console.log("opened google knowledge graph ", status);
page.evaluate(function () { return document.body.innerHTML; }, function (result) {
var $ = cheerio.load(result);
$(".kltat").each(function() {
var link = $(this);
var text = link.text();
launch(text);
});
ph.exit();
// Move on to the next one
});
});
I need something that would not launch all of the .each iterations at the same time. Maybe there's some way of iterating that does not work asynchronously; that's what I need.

If launch is asynchronous and is able to take a callback, then use async for this:
var async = require('async');

var $ = cheerio.load(result);
var callbacks = [];
$(".kltat").each(function () {
    var link = $(this);
    var text = link.text();
    callbacks.push(function (cb) {
        launch(text, cb);
    });
});
async.series(callbacks, function () {
    ph.exit();
});
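Equivalently, async.eachSeries can replace the hand-built callbacks array; a minimal sketch, still assuming launch accepts a completion callback:
var texts = [];
$(".kltat").each(function () {
    texts.push($(this).text());
});
// Run launch for one text at a time; the final callback fires after the last one.
async.eachSeries(texts, function (text, cb) {
    launch(text, cb);
}, function () {
    ph.exit();
});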
Otherwise, you can either use a fixed wait (the 1000 ms delay here is arbitrary):
callbacks.push(function (cb) {
    launch(text);
    setTimeout(function () {
        cb(null);
    }, 1000);
});
or use something like waitFor to wait for an external condition triggered through launch.
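As a reference point, a minimal waitFor-style poller could look like this (modeled on the helper from the PhantomJS examples; launchState is a hypothetical flag that your launch would have to set when it finishes):
// Poll a condition until it holds or a timeout expires.
function waitFor(testFn, onReady, timeoutMs) {
    var start = Date.now();
    var timer = setInterval(function () {
        if (testFn()) {
            clearInterval(timer);
            onReady(null);
        } else if (Date.now() - start > timeoutMs) {
            clearInterval(timer);
            onReady(new Error("waitFor timed out"));
        }
    }, 100);
}

// Hypothetical usage inside one of the queued callbacks:
var launchState = { done: false };
callbacks.push(function (cb) {
    launchState.done = false;
    launch(text); // assumed to set launchState.done = true when finished
    waitFor(function () { return launchState.done; }, cb, 5000);
});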

Related

Make code wait for popup to open, then scrape the popup

I am writing a JS script that automates some browser actions.
With get_baskets_onclicks, I am collecting onclick functions from certain DOM elements and returning them in an array. Each onclick looks something like this:
onclick="PrimeFaces.ab({s:"j_id_32:GenerationTable:0:j_id_1e_2_3p",u:"#widgetVar(GenerationCodingDialog)",onco:function(xhr,status,args){PF('GenerationCodingDialog').show();}});return false;"
and opens a pop-up from which I need to collect some data with get_MAP_data.
Also, each of these functions is called from within get_MAP_data.
The problem is I cannot make the code wait for the popup to be opened, so the data returned by get_MAP_data is empty.
Besides the below document.readyState === 'complete', I have also tried window.onload = function(){}, to no avail.
Is there any way to make the browser (Chrome) wait? I guess I cannot use jQuery, because this is not my webpage.
function get_baskets_onclicks() {
    // returns array of functions that launch MAP dialogs
    var baskets = Array.from(document.getElementsByClassName("ui-commandlink ui-widget margin-right-5px"));
    var baskets_onclicks = baskets.map(basket => basket.onclick);
    return baskets_onclicks;
}

function get_MAP_data(basket_onclick) {
    basket_onclick();
    if (document.readyState === 'complete') {
        console.log("PAGE LOADED");
        // wait here for the dialog to open
        // dt = detail table
        var MAP_data = {}; // container for transaction details
        var labels_to_get = ['Description', 'Category', 'Department', 'Justification', 'My Shop Voucher', 'My Shop Coding'];
        var all_dts = document.getElementsByClassName('summary-details-grid');
        var dt = Array.from(all_dts).filter(table => table.parentElement.id == "paymentGenerationmyShopCodingForm")[0];
        var dt_body = dt.children[0];
        var dt_trs = Array.from(dt_body.children);
        dt_trs.forEach(function (tr) {
            var tds = Array.from(tr.children);
            tds.forEach(function (td) {
                var label = td.textContent;
                if (labels_to_get.includes(label)) {
                    var value_for_label = tds[1].textContent;
                    MAP_data[label] = value_for_label;
                    console.log(label, value_for_label);
                }
            });
        });
        // console.log(MAP_data);
        return MAP_data;
    }
}

var first_onclick = get_baskets_onclicks()[0];
get_MAP_data(first_onclick);
A small, hacky fix would be to make your code poll for the existence of the elements you are checking.
let interval = setInterval(() => {
    var all_dts = document.getElementsByClassName('summary-details-grid');
    if (all_dts.length !== 0) {
        // Found some elements, now let's run the code
        clearInterval(interval);
        get_MAP_data(first_onclick);
    }
}, 100);
This would check for summary-details-grid elements every 10th of a second, and when it finds them, run your code.
https://developer.mozilla.org/en-US/docs/Talk:DOM/window.setTimeout
http://mdn.beonex.com/en/DOM/window.setInterval.html
Some combination of retries, setTimeout(), setInterval(), or a loop could work; I'm not sure of the best approach for your specific modal, but one of those options should fit for polling the DOM until something is there or it has been tried too many times.
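As a sketch of that idea, a poller with a retry cap might look like this (the 100 ms interval and the cap of 50 tries are arbitrary):
function pollFor(testFn, onFound, onGiveUp, maxTries) {
    var tries = 0;
    var timer = setInterval(function () {
        if (testFn()) {
            clearInterval(timer);
            onFound();
        } else if (++tries >= maxTries) {
            clearInterval(timer);
            onGiveUp();
        }
    }, 100);
}

// Hypothetical usage for the dialog above: give up after ~5 seconds.
pollFor(
    function () { return document.getElementsByClassName('summary-details-grid').length > 0; },
    function () { get_MAP_data(first_onclick); },
    function () { console.log('dialog never appeared'); },
    50
);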

page.open breaks when called multiple times [duplicate]

My goal is to open many pages (with a short delay between them) and save my data to a file. But my code does not work.
var gamesList = [url1, url2, url3];
//gamesList is getting from a file
var urls = [];
var useragent = [];
useragent.push('Opera/9.80 (Windows NT 6.0) Presto/2.12.388 Version/12.14');
useragent.push('Opera/9.80 (X11; Linux x86_64; U; fr) Presto/2.9.168 Version/11.50');
var page = require('webpage').create();
page.settings.userAgent = useragent[Math.floor(Math.random() * useragent.length)];
console.log('Loading a web page');

function handle_page(url){
    page.open(url, function(){
        //...
        var html = page.evaluate(function(){
            // ...do stuff...
            page.injectJs('jquery.min.js');
            return $('body').html();
        });
        //save to file
        var file = fs.open('new_test.txt', "w");
        file.write(html + '\n');
        file.close();
        console.log(html);
        setTimeout(next_page, 1000);
    });
}

function next_page(urls){
    var url = urls.shift();
    if(!urls){
        phantom.exit(0);
    }
    handle_page(url);
}
next_page(urls);
phantom.exit();
Does it matter where I write phantom.exit()? If I write it at the end of the page.open() callback, then the first page opens fine.
Your idea of opening multiple pages with recursion is correct, but you have some problems.
Exit
As you correctly noted, you have a problem with phantom.exit(). Since page.open() and setTimeout() are asynchronous, you only need to exit when you are done. When you call phantom.exit() at the end of the script, you're exiting before the first page is even loaded.
Simply remove that last phantom.exit(), because you already have another exit at the correct place.
Page context
page.evaluate() provides access to the DOM context (page context). The problem is that it is sandboxed: inside of that callback you have no access to variables defined outside. You can explicitly pass variables in, but they have to be primitive values, which page is not. So you simply have no access to page inside of page.evaluate(), and you need to inject jQuery before calling page.evaluate(), not inside it.
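For illustration, passing a primitive into the page context works by appending extra arguments to page.evaluate(); the selector here is made up:
var text = page.evaluate(function (sel) {
    // Runs in the page context; sel arrives as a copied primitive.
    var el = document.querySelector(sel);
    return el ? el.textContent : null;
}, '.some-selector');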
Files
You're overwriting the file in every iteration because the file name never changes. Either change the filename each time or use append mode 'a' instead of 'w'.
Also, you don't need to open a stream when you only want to write once. Change:
var file = fs.open('new_test.txt', "w");
file.write(html + '\n');
file.close();
to
fs.write('new_test.txt', html + '\n', 'a');
Recursive step
The recursive step calls next_page() through setTimeout() without arguments, so urls is undefined inside the function. Since urls is already a global variable that you change in each iteration, you don't need to pass it in at all. Note also that the check should be on url (the shifted element) rather than urls, otherwise the script never exits.
You also don't need the setTimeout(), because everything before it inside the page.open() callback is synchronous.
Fixed Script
//...
var urls = [/*....*/];

function handle_page(url){
    page.open(url, function(){
        //...
        page.injectJs('jquery.min.js');
        var html = page.evaluate(function(){
            // ...do stuff...
            return $('body').html();
        });
        //save to file
        fs.write('new_test.txt', html + '\n', 'a');
        console.log(html);
        next_page();
    });
}

function next_page(){
    var url = urls.shift();
    if(!url){
        phantom.exit(0);
    }
    handle_page(url);
}
next_page();
The explanation above was very helpful to me, so thanks a lot. To come to the point: sometimes a JS function renders content even after the page has loaded, and in that scenario the setTimeout() method is very helpful. I faced a problem like this when scraping many sites, and I used setTimeout() in this way:
function handle_page(url){
    page.open(url, function() {
        setTimeout(function() {
            var html_text = page.evaluate(function(){
                var is = document.querySelectorAll("li.bookinDetails_c1_180616")[0].textContent;
                var isbn = is.trim();
                var w, w1, x, x1, y, y1, z, z1, u, a;
                a = document.querySelectorAll("li.selectableBook");
                if (a.length == 5) {
                    w1 = document.querySelectorAll("span.bookPrice")[0].textContent;
                    w = w1.trim();
                    x1 = document.querySelectorAll("span.bookPrice")[1].textContent;
                    x = x1.trim();
                    y1 = document.querySelectorAll("span.bookPrice")[2].textContent;
                    y = y1.trim();
                    z1 = document.querySelectorAll("span.bookPrice")[3].textContent;
                    z = z1.trim();
                    u = isbn + "=>[" + "RENT USED:-" + w + "," + "RENT NEW:-" + x + "," + "BUY USED:-" + y + "," + "BUY NEW:-" + z + "]";
                    return u;
                } else {
                    y1 = document.querySelectorAll("span.bookPrice")[0].textContent;
                    y = y1.trim();
                    z1 = document.querySelectorAll("span.bookPrice")[1].textContent;
                    z = z1.trim();
                    u = isbn + "=>[" + "BUY USED:-" + y + "," + "BUY NEW:-" + z + "]";
                    return u;
                }
            });
            fs.write('html.txt', html_text + '\r\n', 'a');
            next_page();
        }, 400);
    });
}
I ran into a similar problem recently. I found an answer here: https://stackoverflow.com/questions/34120421/scraping-multiple-urls-by-looping-in-phantomjs, but it did not work for me. I think recursion should be used instead of a loop, but I didn't know how to write the code. Fortunately, I found a solution here. The answers above were very helpful to me. I will post my code here, hoping to help others in the future.
var urls = new Array("1.html", "2.html", "3.html");
var page = new WebPage();
var fs = require('fs');
var count = 1;

function handle_page(url) {
    page.open(url, function () {
        setTimeout(function () {
            var ct = page.evaluate(function () {
                // Return the text itself: DOM nodes cannot be serialized out of the page context.
                var el = document.getElementsByClassName('content')[0];
                return el ? el.textContent : '';
            });
            var fn = '00' + count + '.html';
            console.log(fn);
            try {
                fs.write(fn, ct, 'w');
            } catch (e) {
                console.log(e);
            }
            count += 1;
            next_page();
        }, 1000);
    });
}

function next_page() {
    var url = urls.shift();
    if (!url) {
        phantom.exit(0);
    }
    handle_page(url);
}
next_page();

Preventing nested callbacks in JavaScript that uses iteration

Currently I'm using promises to try to prevent the need for nested callbacks in my code, but I've hit a setback. In this case, I'm using node's request-promise and cheerio to emulate jQuery on the server. However, at some point I need to call jQuery.each(), to create a request for each <a> element. Is there any way I can use promises to prevent this nested callback?
request("http://url.com").then(function (html) {
var $ = cheerio.load(html);
var rows = $("tr.class a");
rows.each(function (index, el) {
//Iterate over all <a> elements, and send a request for each one.
//Can this code be modified to return a promise?
//Is there another way to prevent this from being nested?
request($(el).attr("href")).then(function (html) {
var $ = cheerio.load(html);
var url = $("td>img").attr("src");
return request(url);
})
.then(function (img) {
//Save the image to the database
});
});
});
Assuming Bluebird promises (code in other libraries is similar):
Promise.resolve(request("http://url.com")).then(function (html) {
    return cheerio.load(html)("tr.class a").toArray();
}).map(function (el) { // map is `then` over an array
    return el.attribs.href;
}).map(request).map(function (html) {
    return cheerio.load(html)("td>img").attr("src");
}).map(request).map(function (img) {
    // save to database.
});
Alternatively, you can define actions for a single link and then process those. It would look similar.
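For instance, a sketch of that per-link variant, still assuming Bluebird's .map() and request-promise (processLink is an illustrative name):
// Fetch one linked page, find its image URL, then fetch the image.
function processLink(href) {
    return request(href).then(function (html) {
        return cheerio.load(html)("td>img").attr("src");
    }).then(request).then(function (img) {
        // save the image to the database
    });
}

request("http://url.com").then(function (html) {
    return cheerio.load(html)("tr.class a").toArray();
}).map(function (el) {
    return processLink(el.attribs.href);
});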
This is the best solution I got in the end. Some incidental changes I made include using url.resolve to allow relative URLs to work.
var $ = require('cheerio');
var request = require('request-promise');
var url = require('url');

var baseURL = "http://url.com";
request(baseURL).then(function (html) {
    return $("tr.class a", html).toArray();
}).map(function (el) {
    return request(url.resolve(baseURL, $(el).attr("href")));
}).map(function (html) {
    var src = $("td>img", html).attr("src");
    return request(url.resolve(baseURL, src));
}).map(function (img) {
    //Save the image to the database
});
Thanks to Benjamin Gruenbaum for alerting me to the .map() method in Bluebird.

Dojo fetch, how to wait on two simultaneous async fetches?

Hey guys, I'm not well versed in dealing with asynchronous design patterns, and I'm having a problem writing a script that does two async data fetches.
I'm using Dojo.data.api.Read.Fetch() to make two fetch() calls against separate databases. The results come back asynchronously. However, I have to cross-reference the results, so I want my script to continue once BOTH async fetches are complete. I don't know how to do this, and therein lies the problem.
I am aware of the fetch's onComplete field and how to use it, BUT the best-case solution I see there is to call the second fetch in the onComplete of the first. I would like to do these fetches at the same time. Is there a way to do this?
Here's the current structure of my program for illustration purposes:
this.dict1.fetch({query:"blahblahblah", onComplete: function(items) { something here? }});
this.dict2.fetch({query:"blahblahbleh", onComplete: function(items) { or maybe something here? }});
this.orMaybeDoSomethingAfterBothFetches()
Any help would be greatly appreciated!
You could create a dojo.Deferred for each of the fetches and then add them to a dojo.DeferredList - see here. This solution allows you to wait on any number of deferreds, and it takes advantage of dojo.Deferred's callback and errback functionality.
var fetch1 = new dojo.Deferred();
this.dict1.fetch({query: "blahblahblah", onComplete: function(items){ fetch1.callback(items); }});

var fetch2 = new dojo.Deferred();
this.dict2.fetch({query: "blahblahbleh", onComplete: function(items){ fetch2.callback(items); }});

var allFuncs = new dojo.DeferredList([fetch1, fetch2]);
var doStuffWhenAllFuncsReturn = function(results){ /*...*/ };
allFuncs.addCallback(doStuffWhenAllFuncsReturn);
// this is a variation of a function I have answered quite a few similar questions on SO with
function collected(count, fn){
    var loaded = 0;
    var collectedItems = [];
    return function(items){
        collectedItems = collectedItems.concat(items);
        if (++loaded === count){
            fn(collectedItems);
        }
    };
}
var collectedFn = collected(2, function(items){
    //do stuff
});
this.dict1.fetch({query: "blahblahblah", onComplete: collectedFn});
this.dict2.fetch({query: "blahblahbleh", onComplete: collectedFn});
An alternative solution is:
var store = {
    exec: function(){
        if (this.items1 && this.items2) {
            // do stuff with this.items1 and this.items2
        }
    }
};
this.dict1.fetch({query: "blahblahblah", onComplete: function(items) {
    store.items1 = items;
    store.exec();
}});
this.dict2.fetch({query: "blahblahbleh", onComplete: function(items) {
    store.items2 = items;
    store.exec();
}});
