I am trying to scrape a few sites. Here is my code:
for (var i = 0; i < urls.length; i++) {
    url = urls[i];
    console.log("Start scraping: " + url);
    page.open(url, function () {
        waitFor(function() {
            return page.evaluate(function() {
                return document.getElementById("progressWrapper").childNodes.length == 1;
            });
        }, function() {
            var price = page.evaluate(function() {
                // do something
                return price;
            });
            console.log(price);
            result = url + " ; " + price;
            output = output + "\r\n" + result;
        });
    });
}
fs.write('test.txt', output);
phantom.exit();
I want to scrape all sites in the array urls, extract some information and then write this information to a text file.
But there seems to be a problem with the for loop. When scraping only one site without using a loop, everything works as I want. But with the loop, first nothing happens, then the line
console.log("Start scraping: " + url);
is shown, but one time too many.
If urls = [a, b, c], then PhantomJS prints:
Start scraping: a
Start scraping: b
Start scraping: c
Start scraping:
It seems that page.open isn't called at all.
I am a newbie to JS, so I am sorry for this stupid question.
PhantomJS is asynchronous. By calling page.open() multiple times in a loop, you essentially rush the execution of the callbacks: you overwrite the current request with a new one before it has finished, and that request is in turn overwritten again. You need to execute them one after the other, for example like this:
page.open(url, function () {
    waitFor(function() {
        // something
    }, function() {
        page.open(url, function () {
            waitFor(function() {
                // something
            }, function() {
                // and so on
            });
        });
    });
});
But this is tedious. There are utilities that can help you write nicer code, like async.js. You can install it in the directory of the phantomjs script through npm.
var async = require("async"); // install async through npm

var tests = urls.map(function(url){
    return function(callback){
        page.open(url, function () {
            waitFor(function() {
                // something
            }, function() {
                callback();
            });
        });
    };
});

async.series(tests, function finish(){
    fs.write('test.txt', output);
    phantom.exit();
});
If you don't want any dependencies, it is also easy to define your own recursive function:
var urls = [/*....*/];

function handle_page(url){
    page.open(url, function(){
        waitFor(function() {
            // something
        }, function() {
            next_page();
        });
    });
}

function next_page(){
    var url = urls.shift();
    if(!url){ // shift() returns undefined once the array is exhausted
        fs.write('test.txt', output); // write the collected results before exiting
        phantom.exit(0);
        return;
    }
    handle_page(url);
}
next_page();
I modified the basic phantomjs example from http://phantomjs.org/screen-capture.html to accept command-line args.
When I pass http://google.com as an argument, the console.log outputs are correct:
0: index.js
1: http://google.com
but I don't get any thumbnail.png in my folder. Why?
var page = require('webpage').create();
var system = require('system');
var args = system.args;
var url;

if (args.length === 1) {
    url = 'http://github.com/';
} else {
    args.forEach(function(arg, i) {
        console.log(i + ': ' + arg);
        if (i > 0) {
            page.open(arg, function() {
                page.render('thumbnail' + '.png');
            });
        }
    });
}
phantom.exit();
page.open is an asynchronous function, therefore phantom.exit is being called before your callback gets to render the thumbnail.
Move phantom.exit inside your callback, as specified in the docs:
var page = require('webpage').create();
page.open('http://github.com/', function() {
    page.render('github.png');
    phantom.exit();
});
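Since you pass several URLs on the command line, you also need to chain the opens; a second page.open on the same page object cancels the one still in flight. A minimal sketch along the lines of the recursive pattern above (the thumbnail file names are my own choice):

var page = require('webpage').create();
var system = require('system');
var urls = system.args.slice(1); // everything after the script name

function renderNext(i) {
    if (i >= urls.length) {
        phantom.exit();
        return;
    }
    page.open(urls[i], function () {
        page.render('thumbnail' + i + '.png'); // one file per URL
        renderNext(i + 1);
    });
}
renderNext(0);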
I have been having some issues with opening multiple web pages in PhantomJS. I first open a website that contains a few links, which I also want to open, saving a piece of text from each URL into my jobs_list, which holds many objects. After all the URLs have been processed, I want to exit PhantomJS. But as it is right now it never exits, and I have trouble receiving data from the second function.
var webPage = require('webpage');
var page = webPage.create();
var jobs_list = [];

page.open('url', function (status) {
    page.includeJs("http://ajax.googleapis.com/ajax/libs/jquery/1.6.1/jquery.min.js", function() {
        page.onConsoleMessage = function(msg) {
            console.log(msg);
        };
        var list = page.evaluate(function() {
            var jobs = [];
            var job;
            $('.test').each(function(){
                $(this).find('span').each(function(){
                    var job_link = $(this).find('a');
                    var url = job_link.attr("href");
                    job = {title : job_link.text(), url : url, location : ""};
                    jobs.push(job);
                });
            });
            return jobs;
        });
        var i = 0;
        jobs_list = list;
        next_page(i);
    });
});

function next_page(i){
    if (i <= (jobs_list.length-1)) {
        var current_job = jobs_list[i];
        var url = current_job.url;
        page.open(url, function (status) {
            page.includeJs("http://ajax.googleapis.com/ajax/libs/jquery/1.6.1/jquery.min.js", function () {
                var location = page.evaluate(function() {
                    var job_location;
                    $('.job-location').each(function(){
                        $(this).find('li').each(function(){
                            job_location = $(this).text();
                        });
                    });
                    console.log(job_location);
                    return job_location;
                });
                jobs_list[i].location = location;
                if(i == (jobs_list.length-1)) {
                    phantom.exit(0);
                }
            });
        });
        console.log(i, current_job.title);
        next_page(++i);
    }
}
The problem is that the page.open call is asynchronous. If you look closely at your next_page function, it can be shortened to this:
function next_page(i){
    if (i <= (jobs_list.length-1)) {
        var current_job = jobs_list[i];
        var url = current_job.url;
        page.open(url, function (status) {
            ...
        });
        console.log(i, current_job.title);
        next_page(++i);
    }
}
This means that next_page(++i); is executed before page.open(url, ...) has even managed to load the first HTML content. That call leads to the next page.open(url, ...) being executed immediately, overwriting the previous request. You're never going to get any data this way.
You have to do two things:
move the next_page(++i); call to where the execution of one page has finished
reduce the number of condition checks
I propose:
function next_page(i){
    if (i <= (jobs_list.length-1)) {
        var current_job = jobs_list[i];
        var url = current_job.url;
        page.open(url, function (status) {
            page.includeJs("http://ajax.googleapis.com/ajax/libs/jquery/1.6.1/jquery.min.js", function () {
                var location = page.evaluate(function() {
                    var job_location;
                    $('.job-location').each(function(){
                        $(this).find('li').each(function(){
                            job_location = $(this).text();
                        });
                    });
                    console.log(job_location);
                    return job_location;
                });
                jobs_list[i].location = location;
                console.log(i, current_job.title);
                next_page(++i);
            });
        });
    } else {
        phantom.exit(0);
    }
}
That's quite an old version of jQuery; perhaps you want to load a newer one. Also, if the page already has jQuery included, you will likely break the page by loading another copy into it. In that case, don't load an additional jQuery version at all.
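A cheap way to check before injecting (a sketch; it only relies on page.evaluate and page.includeJs):

var hasJquery = page.evaluate(function () {
    return typeof window.jQuery === "function"; // true if the page ships its own jQuery
});
if (hasJquery) {
    // use the page's own jQuery directly inside page.evaluate
} else {
    page.includeJs("http://ajax.googleapis.com/ajax/libs/jquery/1.6.1/jquery.min.js", function () {
        // $ is now safe to use inside page.evaluate
    });
}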
I'm trying to automate the navigation of some web pages with PhantomJS.
What I'm trying to create is a pattern for testing and navigation; so far I have this.
For a moment, ignore all the potential null pointers due to empty arrays and such :)
testSuite.js
var webPage = require('webpage');

// Test suite definition
function testSuite(name){
    this.name = name;
    this.startDate = new Date();
    this.tests = [];
    this.add = function(test){
        this.tests.push(test);
    };
    this.start = function(){
        console.log("Test Suite [" + this.name + "] - Start");
        this.next();
    };
    this.next = function(){
        console.log("neeext");
        console.log(this.tests.length);
        var test = this.tests[0];
        this.tests.splice(0, 1);
        console.log("Test [" + test.name + "]");
        test.execute();
    };
}

// Test definition
function test(name, testFunction){
    this.name = name;
    this.execute = testFunction;
}

module.exports.testSuite = testSuite;
module.exports.test = test;
FirstPageModule.js
var currentPage;

function onPageLoadFinished(status) {
    var url = currentPage.url;
    var filename = 'snapshot.png';
    console.log("---------------------------------------------------------------");
    console.log("Status: " + status);
    console.log("Loaded: " + url);
    console.log("Render filename:" + filename);
    console.log("---------------------------------------------------------------");
    if(status == 'success'){
        currentPage.render(filename);
    }
    if(status == 'fail'){
        console.log("Status: " + status);
    }
}

function open(){
    currentPage.open("http://localhost:8080");
}

function login(){
    var username = "topSecretUsername";
    var password = "topSecretPassord";
    currentPage.includeJs("http://ajax.googleapis.com/ajax/libs/jquery/1.6.1/jquery.min.js");
    currentPage.evaluate(function(user, pass) {
        $("#user").val(user);
        $("#pass").val(pass);
    }, username, password);
    currentPage.render("page.png");
    currentPage.evaluate(function(){
        $('#loginButton').click();
    });
}

function FirstPage(){
    var page = webPage.create();
    currentPage = page;
    this.testSuite = new testSuite("FirstPageModule");
    this.testSuite.add(new test("Open First Page", open));
    this.testSuite.add(new test("Login", login));
    var onLoadFinished = onPageLoadFinished;
    var callNextTest = this.testSuite.next;
    currentPage.onLoadFinished = function(status){
        onLoadFinished.apply(this, arguments);
        callNextTest();
    };
    page.onConsoleMessage = function(msg) {
        console.log(msg);
    };
}

module.exports = new FirstPage();
PageTests.js
var firstPage = require('./FirstPageModule.js');
firstPage.testSuite.start();
What I want is a sequential execution of isolated functions: after each function executes, I take a screenshot and call the next function.
But, for some reason, the next method on the testSuite isn't getting called, or the method on the second test isn't getting executed.
What am I doing wrong?
Just make the logName variable available in the "global" scope:
var logName;

function onPageLoadComplete(status){
    console.log(status);
    // Call the logName function
    if(typeof(logName) == "function"){
        logName();
    }
}

function test(){
    var page = webPage.create();
    this.name = "TestName";
    var self = this; // capture the instance; `this` inside logName would not be the test
    // Update logName value as a function.
    logName = function(){
        console.log(self.name);
    };
    page.onLoadFinished = onPageLoadComplete;
}
Primarily, this doesn't seem to be related to PhantomJS but to plain JavaScript. I hope that's what you need; otherwise, please be more specific with your question.
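The same scoping theme shows up in another spot of your suite: var callNextTest = this.testSuite.next detaches next from its object, so this.tests inside next won't resolve when onLoadFinished fires. Binding the method keeps this pointing at the suite (a one-line sketch):

var callNextTest = this.testSuite.next.bind(this.testSuite); // keep `this` bound to the suite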
You can create your own page.get implementation with a callback that fires when a page is fully loaded.
For example, create a file module pageSupport.js:
// attach listeners
Object.prototype.listeners = {};

// store and fire listeners
Object.prototype.addEventListener = function(event, callback) {
    if (!this.listeners[event]) this.listeners[event] = [];
    this.listeners[event].push(callback);
    var self = this; // capture the object; `this` may be different when the handler fires
    this[event] = function(e) {
        if (self.listeners[event]) {
            self.listeners[event].forEach(function(listener) {
                listener.call(null, e);
            });
        }
    };
};

// create a new reference to the webpage.open method
Object.prototype._open = Object.open;

// receive a url and
// return a function success that will be called when the page is loaded.
Object.prototype.get = function(url) {
    return {
        success : function(callback) {
            this.open(url);
            this.addEventListener('onLoadFinished', function(status) {
                if (status == 'success') {
                    return callback(status);
                }
            });
        }.bind(this)
    };
};

// export as a phantomjs module.
exports.Object = Object;
So you can require this module in your script and use it as follows:
var page = require('webpage').create();
require('./pageSupport');

page.get('http://stackoverflow.com').success(function(status) {
    // Now this callback will be called only when the page is fully loaded \o/
    console.log(status); // logs success
});
I have some JavaScript code that updates some data to a database through an HTTP handler, but this async call is made inside an .each loop. At the end of the loop I call CancelChanges(), which refreshes the page. The problem is that the page seems to refresh before the database is updated; the .each loop seems to finish after the call to CancelChanges(). How can I make sure the page is refreshed only after all the async calls in the .each loop have completed?
function SaveChanges() {
    if (PreSaveValidation()) {
        var allChangesSucceeded = true;
        var studioId = $("#param_studio_id").val();
        var baseDate = $("#param_selected_month").val().substring(6, 10) + $("#param_selected_month").val().substring(0, 2);
        var currency = "CAD";
        var vacationPct = null;
        var gvAdmissible = null;
        $(".editable-unsaved").each( function() {
            var newSalary = $(this).text();
            var disciplineId = $(this).data("disciplineid");
            var seniorityId = $(this).data("seniorityid");
            var handlerCommand = "";
            if ($(this).data("valuetype") === "inflated") {
                handlerCommand = "AddAverageSalary";
            } else if ($(this).data("valuetype") === "actual") {
                handlerCommand = "UpdateAverageSalary";
            }
            $.get("WS/AverageSalary.ashx", { command: handlerCommand, studio_id: studioId, discipline_id: disciplineId, seniority_id: seniorityId, base_date: baseDate, currency: currency, salary: newSalary, vacation_pct: vacationPct, gv_admissible: gvAdmissible }).done(function (data) {
                if (data != "1") {
                    $(this).removeClass("editable-unsaved");
                    allChangesSucceeded = true;
                }
                else {
                    alert('fail');
                    allChangesSucceeded = false;
                }
            });
        });
        if(allChangesSucceeded) CancelChanges();
    }
}

function CancelChanges() {
    var href = window.location.href;
    href = href.split('#')[0];
    window.location.href = href;
}
You could try using promises and jQuery's $.when.
Store a list of the ajax call promises:
var defereds = [];
$(".editable-unsaved").each( function() {
    //...
    defereds.push($.get("WS/AverageSalary.ashx" /*...*/));
});

$.when.apply($, defereds).done(function() {
    CancelChanges();
});

This should, hopefully, wait for all the ajax calls to finish before calling CancelChanges().
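A caveat carried over from the original code: inside the done callback, this no longer refers to the element from the .each loop, so $(this).removeClass(...) won't target the cell. Capture the element first (a sketch):

var defereds = [];
$(".editable-unsaved").each(function () {
    var $cell = $(this); // capture now; `this` is different inside the callback
    defereds.push(
        $.get("WS/AverageSalary.ashx" /*...*/).done(function (data) {
            if (data != "1") $cell.removeClass("editable-unsaved");
        })
    );
});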
I think you need to change your structure a little bit, using a counter and calling CancelChanges when the counter equals the number of calls.
function SaveChanges() {
    if (PreSaveValidation()) {
        var studioId = $("#param_studio_id").val();
        var baseDate = $("#param_selected_month").val().substring(6, 10) + $("#param_selected_month").val().substring(0, 2);
        var currency = "CAD";
        var vacationPct = null;
        var gvAdmissible = null;
        var editableUnsaveds = $(".editable-unsaved"); // cache the selector here, because selectors are costly
        var numOfGetsReturned = 0;
        editableUnsaveds.each( function() {
            var newSalary = $(this).text();
            var disciplineId = $(this).data("disciplineid");
            var seniorityId = $(this).data("seniorityid");
            var handlerCommand = "";
            if ($(this).data("valuetype") === "inflated") {
                handlerCommand = "AddAverageSalary";
            } else if ($(this).data("valuetype") === "actual") {
                handlerCommand = "UpdateAverageSalary";
            }
            $.get("WS/AverageSalary.ashx", { command: handlerCommand, studio_id: studioId, discipline_id: disciplineId, seniority_id: seniorityId, base_date: baseDate, currency: currency, salary: newSalary, vacation_pct: vacationPct, gv_admissible: gvAdmissible }).done(function (data) {
                if (data != "1") {
                    $(this).removeClass("editable-unsaved");
                }
                else {
                    alert('fail');
                }
                if(editableUnsaveds.length === ++numOfGetsReturned){
                    CancelChanges(); // now it should be called when the final get call finishes.
                }
            });
        });
    }
}

function CancelChanges() {
    var href = window.location.href;
    href = href.split('#')[0];
    window.location.href = href;
}
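One edge case the counter doesn't cover: if no .editable-unsaved elements exist, no request is made and CancelChanges() never runs. A small guard right after numOfGetsReturned is initialised covers it (a sketch):

if (editableUnsaveds.length === 0) {
    CancelChanges(); // nothing to save; refresh right away
    return;
}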
I'd use promises. The q library is my favorite way to implement them. But since you're using jQuery, I'd recommend following a similar approach to what I outline below, using $.when instead of q.allSettled.
I often use promises when scraping tons of websites at once: I need to iterate through a long list of websites, make requests for content, and do something with the content when the requests return. The last thing I want to do is send the requests one at a time, handling each one as it returns.
In the abstract, that looks something like this:
var q = require('q');      // the q promise library (npm install q)
var _ = require('lodash'); // for _.forEach; a plain for loop works too

function scrapeFromMany() {
    var promises = [];
    _.forEach(urls, function(url) {
        // this makes the request
        var promise = scraper(url);
        // this stores the promise with the others you iterate through
        promises.push(promise);
    });
    q.allSettled(promises).then(function(res) {
        // this function is executed when all of the promises (requests) have been resolved
        console.log("Everything is done -- do something with the results.", res);
    });
}
Fwiw, promises aren't that easy to grok if you've never used them. If that's the case, plan on spending some time getting up to speed with the concepts. They'll change (for the much, much better) the way you write async JavaScript, and they really are the blessed path for these sorts of operations.
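Translated to the jQuery flavor recommended above and applied to the question's loop (a sketch; the function name is mine, and the parameter object is elided just as before):

function saveAllThenRefresh() {
    var promises = $(".editable-unsaved").map(function () {
        var $cell = $(this); // capture the element for use in the callback
        return $.get("WS/AverageSalary.ashx" /* + params for this cell */)
            .done(function (data) {
                if (data != "1") $cell.removeClass("editable-unsaved");
            });
    }).get(); // .get() turns the jQuery collection into a plain array

    $.when.apply($, promises).done(function () {
        CancelChanges(); // runs once every request has resolved
    });
}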
Asynchronously call your check function within the "done" function handler. Keep track of how many requests have completed, and only do your processing once that's equal to the total number of expected requests.
if (PreSaveValidation()) {
    var allChangesSucceeded = true;
    var length = $(".editable-unsaved").length;
    var completedCount = 0;
    // ...
    $(".editable-unsaved").each( function() {
        // ...
        $.get("WS/AverageSalary.ashx", data).done(function (data) {
            completedCount++;
            if (data != "1") {
                $(this).removeClass("editable-unsaved");
                // don't set all changes succeeded to true here
            }
            else {
                alert('fail');
                allChangesSucceeded = false;
            }
            isComplete(length, completedCount, allChangesSucceeded);
        });
    });
}

function isComplete(totalLength, currentLength, allChangesSucceeded) {
    if (currentLength == totalLength) {
        // should this be !allChangesSucceeded?
        if (allChangesSucceeded) CancelChanges();
    }
}
This happens because you are not waiting for the requests to complete before proceeding with the loop.
To achieve that, you have to set the "async" flag to false.
The call to the server should look like this:
$.ajax({
    url: "WS/AverageSalary.ashx",
    async: false,
    data: { command: handlerCommand, studio_id: studioId, discipline_id: disciplineId, seniority_id: seniorityId, base_date: baseDate, currency: currency, salary: newSalary, vacation_pct: vacationPct, gv_admissible: gvAdmissible },
    success: function (data) {
        if (data != "1") {
            $(this).removeClass("editable-unsaved");
            allChangesSucceeded = true;
        }
        else {
            alert('fail');
            allChangesSucceeded = false;
        }
    }
});
Hey all, I'm trying to get a window populated with a table view that is populated from a network function in Titanium Studio, build 2.1.1.201207271312. I have the data being fetched properly, but the problem is that the program continues to run without waiting for the table view to be populated. Here's the code:
ui.js:
bs.ui.createTransitRoutesListWindow = function() {
    var winbsRoutesList = Titanium.UI.createWindow({
    });
    var tv2 = Ti.UI.createTableView();
    tv2 = bs.ui.createbsRouteListTableView();
    winbsRoutesList.add(tv2);
};

bs.ui.createbsRouteListTableView = function() {
    var tv = Ti.UI.createTableView();
    Ti.API.info('populating data');
    var busStopList = bs.db.routeStopList();
    tv.setData(busStopList);
    return tv;
};
db.js:
bs.db.routeStopList = function() {
    var stoplist = [];
    bs.net.getRoutes(function(data) {
        Ti.API.info('data length: ' + data.length);
        for (var i = 0; i < data.length; i++) {
            stoplist.push({
                title: data[i].stopName,
                id: i
            });
        }
    });
    return stoplist;
}
network.js:
bs.net.getRoutes = function(_cb) {
    var xhr = Titanium.Network.createHTTPClient();
    xhr.onload = function() {
        _cb(JSON.parse(this.responseText));
        Ti.API.info(this.responseText);
    };
    xhr.onerror = function(event) {
    };
    xhr.open("GET", "<URL to valid JSON>", true);
    //+ Ti.App.Properties.getString('currentBus','This is a string default')
    xhr.send();
};
bs.net.getRoutes() is an AJAX operation and thus asynchronous. This means the code will NOT wait for it to complete; it proceeds while the response is pending, and the time at which it responds is unknown.
If you want to do something after the data returns, you should do everything in the callback instead, or create deferred objects like jQuery's (which are basically callback containers).
//db.js
bs.db.routeStopList = function(callback) {
    var stoplist = [];
    bs.net.getRoutes(function(data) {
        ....
        callback.call(this, stoplist);
    });
}

//ui.js
bs.ui.createbsRouteListTableView = function(callback) {
    var tv = Ti.UI.createTableView();
    Ti.API.info('populating data');
    bs.db.routeStopList(function(busStopList){
        tv.setData(busStopList);
        callback.call(this, tv);
    });
};

//calling createbsRouteListTableView()
bs.ui.createbsRouteListTableView(function(tv){
    //tv in here is the table view with the data set
    //do inside here what you want to do with tv after it's retrieved
});
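Applied to the window function from the question, that could look like this (a sketch; the open() call is my assumption, since the question's snippet doesn't show where the window gets opened):

bs.ui.createTransitRoutesListWindow = function() {
    var winbsRoutesList = Titanium.UI.createWindow({});
    // build the table view asynchronously; add it once the data has arrived
    bs.ui.createbsRouteListTableView(function(tv2){
        winbsRoutesList.add(tv2);
    });
    winbsRoutesList.open(); // assumption: open the window here
};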