jQuery bug using PhantomJS

jQuery bug using PhantomJS - javascript

I work at the moment on an automated tests scripts which needs a headless browser (PhantomJS) and all the DOM navigation and actions possible (except downloads).
So PhantomJS should fit.
However I can't use PhantomJS with javascript jQuery as the following errors :
var page = require('webpage').create();
page.open('http://mywebsite.com/', function() {
function dothat() {
console.log('0 started');
$('#id').children().eq(0).click();
console.log('0 finished');
return main(1, 0);
}
}
dothat()
Here the scripts stop a '0 started'. The page I load uses jQuery so I though I wouldn't need to download it back. Even so, let's try with loading it.
var page = require('webpage').create();
page.open('http://mywebsite.com/', function() {
page.injectJs('jquery-2.2.1.min.js');
function dothat() {
console.log('0 started');
$('#id').children().eq(0).click();
console.log('0 finished');
return main(1, 0);
}
}
dothat()
Yes I have the file locally in the same directory...
And it shows that error (very fastly) :
TypeError: undefined is not a constructor (evaluating 'jQuery.easing[jQuery.easing.def](t,e,n,i,a)')
What do you suggest doing to fix that?
Is this an internal bug in PhantomJS and so I should try a Firefox and Xvfb on Ubuntu or is it my error somewhere?
I though first about changing code to a total non-jQuery version, but I need great selectors with classes and to click on some objects...
Thank you very much for pointing me the solution!
EDIT 1: I changed this according to what I have understand from a first answer. Now the console does not print anything
var page = require('webpage').create();
page.onConsoleMessage = function(msg, lineNum, sourceId) {
console.log('CONSOLE: ' + msg + ' (from line #' + lineNum + ' in "' + sourceId + '")');
};
page.open('http://mywebsite.com/', function() {
page.injectJs('jquery-2.2.1.min.js');
var result = page.evaluate(function() {
function dothat() {
console.log('0 started');
$('#id').children().eq(0).click();
console.log('0 finished');
return main(1, 0);
}
function main(x, y) {
if (x === 0) {
return setTimeout(dothat(), y);
} else if (x === 1) {
return setTimeout(dothis(), y);
}
}
return main(0, 5000); //first launch
}
});
I'm not sure that was what you was pointed out though.
EDIT 2: This is a real script that wants to go on google.com and login as me. However the script does not work as intended, see below.
var page = require('webpage').create();
page.onConsoleMessage = function(msg) {
console.log('CONSOLE: ' + msg);
};
page.open('https://google.com/', function() {
page.injectJs('jquery-2.2.1.min.js');
page.evaluate(function() {
function dofirst() {
$('#gb_70').click();
return main(1, 0);
}
function dosecond() {
document.getElementById('Email').value = 'myemail#gmail.com';
$('#next').click();
return main(2, 0);
}
function dothird() {
document.getElementById('Passwd').value = 'mypwd';
$('#signIn').click();
}
function main(i, j) {
if (i === 0) {
console.log('launching 0');
return setTimeout(dofirst(), j); // connections
}
else if (i === 1) {
console.log('launching 1');
return setTimeout(dosecond(), 5000);
}
else if (i === 2) {
console.log('launching 2');
return setTimeout(dothird(), 5000);
}
}
return main(0, 5000);
});
});
As you can see, main is a function I'm using for delaying steps.
However the code does not work and returns me this error (or those) :
TypeError: null is not an object (evaluating 'document.getElementById('Email').value = 'myemail#gmail.com'')
undefined:7 in dosecond
:22 in main
:4 in dofirst
:18 in main
:29
:30
Thank you for keeping trying to help me!

In PhantomJS there's always two javascirpt contexts where functions and variables exist:
PhantomJS context
Opened page context
They are not aware of each other and do not intersect!
PhantomJS cannot access any of the target page objects or DOM, that is why $('#id').children().eq(0).click(); won't work - there is no jQuery in PhantomJS script, there are no DOM elements. They are in the second context, or, in other words, sandbox.
To do anything with the page you have to sort of teleport code there. It is done with the help of page.evaluate() method.
var page = require('webpage').create();
page.open('http://mywebsite.com/', function() {
page.injectJs('jquery-2.2.1.min.js');
var result = page.evaluate(function(){
// You can certainly use functions
// Be sure to declare them also
// inside of page.evaluate
function dothat() {
console.log('0 started');
$('#id').children().eq(0).click();
console.log('0 finished');
return main(1, 0);
}
// Important: don't forget to return result
// back to PhantomJS main context
return dothat();
});
});
Note that to receieve console messages from sandboxed page context you must subscribe to them via page.onConsoleMessage callback:
page.onConsoleMessage = function(msg, lineNum, sourceId) {
console.log('CONSOLE: ' + msg + ' (from line #' + lineNum + ' in "' + sourceId + '")');
};
After EDIT 1:
My bad, I've lost a bracket editing your example (fixed above) which led to a syntax error, a fact that PhantomJS v2.x won't complain about, it'll just hang there silently. That's what you mistook for no console messages. Could be useful to test script for syntax errors in v1.9.8 first or in some IDE with syntax highlighting.
Your current code should look this (the missing bracket also fixed):
var page = require('webpage').create();
page.onConsoleMessage = function(msg, lineNum, sourceId) {
console.log('CONSOLE: ' + msg + ' (from line #' + lineNum + ' in "' + sourceId + '")');
};
page.open('http://mywebsite.com/', function() {
page.injectJs('jquery-2.2.1.min.js');
var result = page.evaluate(function() {
function dothat() {
console.log('0 started');
$('#id').children().eq(0).click();
console.log('0 finished');
return main(1, 0);
}
function main(x, y) {
if (x === 0) {
return setTimeout(dothat(), y);
} else if (x === 1) {
return setTimeout(dothis(), y);
}
}
return main(0, 5000); //first launch
}); //
});
Note: will complain about inextistent dothis() function.

Don't you want to do it the other way around?
function dothat() {
console.log('0 started');
$('#id').children().eq(0).click();
console.log('0 finished');
return main(1, 0);
}
var page = require('webpage').create();
page.open('http://mywebsite.com/', dothat)

Related

JavascriptExecutor SyntaxError: Unexpected identifier. Why?

Java
JavascriptExecutor js = (JavascriptExecutor) driver;
Boolean ready = (Boolean)js.executeScript("the below JavaScript");
JavaScript
var ready = false;
window.onload = function() {
ready = true;
}
window.sleep = function() {
return new Promise(resolve => setTimeout(resolve, 2000));
}
for(var i = 0; i < 30; i++) {
if(ready) {
return true;
}
await sleep();
}
return false;
UPDATE: Sorry about the previous syntax error "funtion" in my post. That was a typo not in my actual code. All syntax errors should be gone, but I still get "SyntaxError: Unexpected identifier".
What this code is trying to do is wait for a maximum amount of time for the page to be loaded. I typically return document.readyState to check for that condition but in unique circumstances Chrome will suddenly stop loading the page and document.readyState hangs for 5+ minutes. This is killing my code so I am attempting to develop single-threaded code to kind of mimic a typically multi-threaded process.
Since JavaScript is single threaded (such a disappointing feature for a cool language like JavaScript), we have to be creative.
This code works in the browser console if you replace return true; with console.log('true'); and return false; with console.log('false'); so I don't see what the problem is.

Indeed there are some mistakes in your JavaScript code.
The first mistake is, in third line
window.sleep = funtion() { return new Promise(resolve => setTimeout(resolve, 2000)); }, the function spelling is wrong.
The second mistake is you should not use await when there is no async in your function definition. Here is the thing, async ensures that the function returns a promise, and wraps non-promises in it. The keyword await makes JavaScript wait until that promise settles and returns its result. await works only inside async functions. So you can avoid using these completely or you need to format it accordingly.
The third mistake is, you are trying to do return true; from the for loop of the if condition which is not allowed basically because it is not wrapped inside the function.
The fourth mistake is, you are not calling the window.onload function - as the result, it is always returns false even though the page is loaded.
The fifth thing is, I don't know what incomplete resolve is doing in window.sleep function.
The sixth thing is, you are returning return false; at the end without any reference which is completely meaningless.
I have modified the code and avoided the above mistakes, please look into it.
Below is the modified JavaScript code:
var status = false;
window.sleep = function() {
return setTimeout(() => {
console.log("=> Waited for 2 seconds...");
}, 2000);
}
var getStatus = function() {
for(var i = 0; i < 30; i++) {
if(window.onload = function() {
return true;
}) {
status = true;
console.log(i+"). Loaded ? "+status);
break;
} else {
console.log(i+"). Loaded ? "+status);
sleep();
}
}
return status;
}
getStatus();
Try the below Java code which prints true after the page loads :
System.setProperty("webdriver.chrome.driver", "C:\\NotBackedUp\\chromedriver.exe");
WebDriver driver = new ChromeDriver();
driver.get("http://www.google.com");
JavascriptExecutor js = (JavascriptExecutor) driver;
Boolean result = (Boolean) js.executeScript("var status = false;\r\n" +
"window.sleep = function() { \r\n" +
" return setTimeout(() => {\r\n" +
" console.log(\"=> Waited for 2 seconds...\");\r\n" +
" }, 2000);\r\n" +
"}\r\n" +
"var getStatus = function() {\r\n" +
" for(var i = 0; i < 30; i++) {\r\n" +
" if(window.onload = function() {\r\n" +
" return true;\r\n" +
" }) {\r\n" +
" status = true;\r\n" +
" console.log(i+\"). Loaded ? \"+status);\r\n" +
" break;\r\n" +
" } else {\r\n" +
" console.log(i+\"). Loaded ? \"+status);\r\n" +
" sleep();\r\n" +
" }\r\n" +
" }\r\n" +
" return status;\r\n" +
"}\r\n" +
"return getStatus();");
System.out.println(result);
I hope it helps...

PhantomJS bug in basic script

This is a script pretending to log in your google account (I've made). But obviously, that doesn't work. There is no particular objective here, but to make it work.
var page = require('webpage').create();
page.onConsoleMessage = function(msg) {
console.log('CONSOLE: ' + msg);
};
page.open('https://google.com/', function() {
page.injectJs('jquery-2.2.1.min.js');
page.evaluate(function() {
function include(arr,obj) { // those functions are not part of scraping
return (arr.indexOf(obj) != -1);
}
function add(a, b) {
return a + b;
}
Array.min = function( array ){
return Math.min.apply( Math, array );
};
function dofirst() {
$('#gb_70').click();
main(1, 0);
}
function dosecond() {
document.getElementById('Email').value = 'myemail#gmail.com';
$('#next').click();
main(2, 0);
}
function dothird() {
document.getElementById('Passwd').value = 'P4SSW0RD';
$('#signIn').click();
main(3, 0);
}
function dofourth() {
L1 = ['test', 'test2', 'google'];
for (var i = 0; i < 1; i++) {
if (L1, 'google') {
console.log('SUCCESS!');
}
}
main(4, 0);
}
function dofifth() {
$('.gb_b.gb_8a.gb_R').click()
setTimeout(function(){$('#gb_71').click()}, 500);
main(0, 5000);
}
function main(i, j) {
if (i === 0) {
console.log('launching 0');
setTimeout(dofirst(), j); // connections
}
else if (i === 1) {
console.log('launching 1');
setTimeout(dosecond(), 5000);
}
else if (i === 2) {
console.log('launching 2');
setTimeout(dothird(), 5000);
}
else if (i === 3) {
console.log('launching 3');
setTimeout(dofourth(), 5000);
} else if (i === 4) {
console.log('launching 4');
setTimeout(dofifth(), 5000);
}
}
main(0, 5000);
});
console('super end');
page.render('google.png');
});
At the end I get those errors :
CONSOLE: launching 0
CONSOLE: launching 1
TypeError: null is not an object (evaluating 'document.getElementById('Email').value = 'myemail#gmail.com'')
undefined:7 in dosecond
:22 in main
:4 in dofirst
:18 in main
:29
:30
I tried many ways and no one worked. I could make it work though with Python and selenium web driver (which was real love). But now the time has passed, and it has to be in javascript (to be completely DOM/jQuery... so Web compatible).
Can you help me please to make it work!
EDIT 1: by trying to capture a screenshot, it does save an empty PNG.
EDIT 2: I think that may be a hint, when I do phantomjs test.js, it takes a very long time to finally load and logs quickly everything...
EDIT 3: I changed document.get(...).value = 'blabla' to $('#id').val('blabla'); And now it prints
CONSOLE: launching 0
CONSOLE: launching 1
CONSOLE: launching 2
CONSOLE: launching 3
CONSOLE: SUCCESS!
CONSOLE: SUCCESS!
CONSOLE: SUCCESS!
CONSOLE: SUCCESS!
CONSOLE: SUCCESS!
However it should prints only one SUCCESS, and evidently capture still does not work.

For EDIT 1: trying to capture the screen
Check the status to make sure the page is loaded.
page.open(url, function(status) {
if (status !== 'success') {
// exit if it fails to load the page
console.log(status);
phantom.exit(1);
}
else{
// your code here
}
});

How to make casperjs continue instead of exit on exception?

When I use CasperJS to handle web, I find a strange case. The program exits on exception even if I use try-catch block!
My code is as follows. I hope the program can continue running next loop. However, when the iframe cannot be found, an exception throws like CasperError: Frame number "1" is out of bounds. and the whole program exits. The myStore function in catch block is also not run.
Is there anybody can help us?
try {
// find the iframe and then fill in the message
casper.withFrame(1, function() {
casper.evaluate(function(message) {
// some function code
});
casper.wait(1000, function() {
myStore(stores, index+1);
// when the iframe not found, function myStroe will not be run
});
});
} catch (err) {
output(false, "error=" + err.message);
myStore(stores, index+1); // myStroe will not be run on Exception
}
I try the example given by Artjom B but it does not work.
var frameExists = false;
casper.withFrame(1, function() {
frameExists = true;
casper.evaluate(function(message) {
// some function code
});
});
casper.wait(3000, function() {
if (frameExists) {
// the program run this branch and got stuck in the nonexistent selector since the frame is not found.
casper.click("input#send");
// some function code
output(true, "index=" + index + ";storeId=" + store.id + ";succeeded");
myStore(stores, index+1);
} else {
output(false, "index=" + index + ";storeId=" + store.id + ";error=" + err.message);
myStore(stores, index+1);
}
});
It is very strange. I don't know why the program run into the wrong branch with frameExists === true.

There is the handy little option called exitOnError:
At the beginning:
var casper = require('casper').create({
exitOnError: false
});
or later:
casper.options.exitOnError = false;
If you want myStore() to be run regardless of the existence of the frame, you can do something like this:
casper.then(function(){
var frameExists = false;
// find the iframe and then fill in the message
casper.withFrame(1, function() {
frameExists = true;
casper.evaluate(function(message) {
// some function code
});
casper.wait(1000, function() {
myStore(stores, index+1);
// when the iframe not found, function myStroe will not be run
});
});
casper.then(function(){
if (!frameExists) {
output(false, "error=" + err.message);
myStore(stores, index+1); // myStroe will not be run on Exception
}
});
});

Unreliable behaviour in Node.js

I have a Node.js application that, upon initialisation, reads two tables from an SQL database and reconstructs their relationship in memory. They're used for synchronously looking up data that changes (very) infrequently.
Problem: Sometimes I can't access the data, even though the application reports successfully loading it.
Code:
constants.js
module.exports = {
ready: function () { return false; }
};
var log = sysLog('core', 'constants')
, Geo = require('../models/geo.js');
var _ready = false
, _countries = []
, _carriers = [];
function reload() {
_ready = false;
var index = Object.create(null);
return Geo.Country.find().map(function (country) {
var obj = country.toPlainObject()
, id = obj.id;
delete obj.id;
index[id] = obj;
return Object.freeze(obj);
}).then(function (countries) {
log.debug('Loaded ' + countries.length + ' countries');
_countries = countries;
return Geo.Carrier.Descriptor.find().map(function (carrier) {
var obj = carrier.toPlainObject();
if (obj.country) {
obj.country = index[obj.country];
}
return Object.freeze(obj);
}).then(function (carriers) {
log.debug('Loaded ' + carriers.length + ' carriers');
_carriers = carriers;
});
}).finally(function () {
_ready = true;
});
}
reload().catch(function (err) {
log.crit({ message: 'Could not load constants', reason: err });
process.exit(-42);
}).done();
module.exports = {
reload : reload,
ready : function () { return _ready; },
countries : function () { return _countries; },
carriers : function () { return _carriers; }
};
utils.js
var log = sysLog('core', 'utils')
, constants = require('./constants');
module.exports = {
getCountryByISO: function(iso) {
if (!iso) {
return;
}
if ('string' != typeof iso) {
throw new Error('getCountryByISO requires a string');
}
if (!constants.ready()) {
throw new UnavailableError('Try again in a few seconds');
}
switch (iso.length) {
case 2:
return _.findWhere(constants.countries(), { 'iso2' : iso.toUpperCase() });
case 3:
return _.findWhere(constants.countries(), { 'iso3' : iso.toUpperCase() });
default:
throw new Error('getCountryByISO requires a 2 or 3 letter ISO code');
}
},
getCarrierByCode: function(code) {
if (!code) {
return;
}
if ('string' != typeof code) {
throw new Error('getCarrierByCode requires a string');
}
if (!constants.ready()) {
throw new UnavailableError('Try again in a few seconds');
}
return _.findWhere(constants.carriers(), { 'code' : code });
},
getCarrierByHandle: function(handle) {
if (!handle) {
return;
}
if ('string' != typeof handle) {
throw new Error('getCarrierByHandle requires a string');
}
if (!constants.ready()) {
throw new UnavailableError('Try again in a few seconds');
}
return _.findWhere(constants.carriers(), { 'handle' : handle });
}
};
Use case
if (data.handle) {
carrier = utils.getCarrierByHandle(data.handle);
if (_.isEmpty(carrier)) {
throw new InternalError('Unknown carrier', { handle: data.handle });
}
}
What's going on: All errors are logged; as soon as I see an error (i.e. "Unknown carrier") in the logs, I check the SQL database to see if it should've been recognised. That has always been the case so far, so I check the debug log to see if data was loaded. I always see "Loaded X countries" and "Loaded Y carriers" with correct values and no sign of "Could not load constants" or any other kind of trouble.
This happens around 10% of the time I start the application and the problem persists (i.e. didn't seem to go away after 12+ hours) and seems to occur regardless of input, leading me to think that the data isn't referenced correctly.
Questions:
Is there something wrong in constants.js or am I doing something very obviously wrong? I've tried setting it up for cyclical loading (even though I am not aware of that happening in this case).
Why can't I (sometimes) access my data?
What can I do to figure out what's wrong?
Is there any way I can work around this? Is there anything else I could to achieve the desired behaviour? Hard-coding the data in constants.js is excluded.
Additional information:
constants.reload() is never actually called from outside of constants.js.
constants.js is required only in utils.js.
utils.js is required in app.js (application entry); all files required before it do not require it.
SQL access is done through an in-house library built on top of knex.js and bluebird; so far it's been very stable.
Versions:
Node.js v0.10.33
underscore 1.7.0
bluebird 2.3.11
knex 0.6.22

}).finally(function () {
_ready = true;
});
Code in a finally will always get called, regardless of if an error was thrown up the promise chain. Additionally, your reload().catch(/* ... */) clause will never be reached, because finally swallows the error.
Geo.Country.find() or Geo.Carrier.Descriptor.find() could throw an error, and _ready would still be set to true, and the problem of your countries and carriers not being set would persist.
This problem would not have occurred if you had designed your system without a ready call, as I described in my previous post. Hopefully this informs you that the issue here is really beyond finally swallowing a catch. The real issue is relying on side-effects; the modification of free variables results in brittle systems, especially when asynchrony is involved. I highly recommend against it.

Try this
var log = sysLog('core', 'constants');
var Geo = require('../models/geo.js');
var index;
var _countries;
var _carriers;
function reload() {
index = Object.create(null);
_countries = Geo.Country.find().map(function (country) {
var obj = country.toPlainObject();
var id = obj.id;
delete obj.id;
index[id] = obj;
return Object.freeze(obj);
});
_carriers = _countries.then(function(countries) {
return Geo.Carrier.Descriptor.find().map(function (carrier) {
var obj = carrier.toPlainObject();
if (obj.country) {
obj.country = index[obj.country];
}
return Object.freeze(obj);
});
});
return _carriers;
}
reload().done();
module.exports = {
reload : reload,
countries : function () { return _countries; },
carriers : function () { return _carriers; }
};

constants.reload() is never actually called from outside of
constants.js.
That's your issue. constants.reload() reads from a database, which is an aysnchronous process. Node's require() is a synchronous process. At the time constants.js is required in utils.js and the module.exports value is returned, your database query is still running. And at whatever point in time that app.js reaches the point where it calls a method from the utils module, that query could still be running, resulting in the error.
You could say that requiring utils.js has the side-effect of requiring constants.js, which has the side-effect of executing a database query, which has the side-effect of concurrently modifying the free variables _countries and _carriers.
Initialize _countries and _carriers as unresolved promises. Have reload() resolve them. Make the utils.js api async.
promises.js:
// ...
var Promise = require('bluebird');
var countriesResolve
, carriersResolve;
var _ready = false
, _countries = new Promise(function (resolve) {
countriesResolve = resolve;
})
, _carriers = new Promise(function (resolve) {
carriersResolve = resolve;
});
function reload() {
_ready = false;
var index = Object.create(null);
return Geo.Country.find().map(function (country) {
// ...
}).then(function (countries) {
log.debug('Loaded ' + countries.length + ' countries');
countriesResolve(countries);
return Geo.Carrier.Descriptor.find().map(function (carrier) {
// ...
}).then(function (carriers) {
log.debug('Loaded ' + carriers.length + ' carriers');
carriersResolve(carriers);
});
}).finally(function () {
_ready = true;
});
}
reload().catch(function (err) {
log.crit({ message: 'Could not load constants', reason: err });
process.exit(-42);
}).done();
module.exports = {
reload : reload,
ready : function () { return _ready; },
countries : function () { return _countries; },
carriers : function () { return _carriers; }
};
utils.js
getCarrierByHandle: function(handle) {
// ...
return constants.carriers().then(function (carriers) {
return _.findWhere(carriers, { 'handle' : handle });
});
}
Use case:
utils.getCarrierByHandle(data.handle).then(function (carrier) {
if (_.isEmpty(carrier)) {
throw new InternalError('Unknown carrier', { handle: data.handle });
}
}).then(function () {
// ... next step in application logic
});
This design will also eliminate the need for a ready method.
Alternatively, you could call constants.reload() on initialization and hang all possibly-dependent operations until it completes. This approach would also obsolete the ready method.
What can I do to figure out what's wrong?
You could have analyzed your logs and observed that "Loaded X countries" and "Loaded Y carriers" were sometimes written after "Unknown carrier", helping you realize that the success of utils.getCarrierByHandle() was a race condition.

phantomjs not waiting for "full" page load

I'm using PhantomJS v1.4.1 to load some web pages. I don't have access to their server-side, I just getting links pointing to them. I'm using obsolete version of Phantom because I need to support Adobe Flash on that web pages.
The problem is many web-sites are loading their minor content async and that's why Phantom's onLoadFinished callback (analogue for onLoad in HTML) fired too early when not everything still has loaded. Can anyone suggest how can I wait for full load of a webpage to make, for example, a screenshot with all dynamic content like ads?

Another approach is to just ask PhantomJS to wait for a bit after the page has loaded before doing the render, as per the regular rasterize.js example, but with a longer timeout to allow the JavaScript to finish loading additional resources:
page.open(address, function (status) {
if (status !== 'success') {
console.log('Unable to load the address!');
phantom.exit();
} else {
window.setTimeout(function () {
page.render(output);
phantom.exit();
}, 1000); // Change timeout as required to allow sufficient time
}
});

I would rather periodically check for document.readyState status (https://developer.mozilla.org/en-US/docs/Web/API/document.readyState). Although this approach is a bit clunky, you can be sure that inside onPageReady function you are using fully loaded document.
var page = require("webpage").create(),
url = "http://example.com/index.html";
function onPageReady() {
var htmlContent = page.evaluate(function () {
return document.documentElement.outerHTML;
});
console.log(htmlContent);
phantom.exit();
}
page.open(url, function (status) {
function checkReadyState() {
setTimeout(function () {
var readyState = page.evaluate(function () {
return document.readyState;
});
if ("complete" === readyState) {
onPageReady();
} else {
checkReadyState();
}
});
}
checkReadyState();
});
Additional explanation:
Using nested setTimeout instead of setInterval prevents checkReadyState from "overlapping" and race conditions when its execution is prolonged for some random reasons. setTimeout has a default delay of 4ms (https://stackoverflow.com/a/3580085/1011156) so active polling will not drastically affect program performance.
document.readyState === "complete" means that document is completely loaded with all resources (https://html.spec.whatwg.org/multipage/dom.html#current-document-readiness).
EDIT 2022:
I created this response 8 years ago and I did not use PhantomJS since then. It is very probable it won't work now in some cases. Also now I think it is not possible to create a one-size-fits-all solution to be absolutely sure the page is loaded. This is because some pages may load additional resources after document is ready. For example, there might be some JS code on the website that waits for the document to be ready an then loads some additional assets (after document state changes to ready) - in this case the onPageReady will trigger and after that the page will start loading some more resources again.
I still think the above snipped is a good starting point and may work in most cases, but may also necessary to create a specific solutions to handle specific websites.

You could try a combination of the waitfor and rasterize examples:
/**
* See https://github.com/ariya/phantomjs/blob/master/examples/waitfor.js
*
* Wait until the test condition is true or a timeout occurs. Useful for waiting
* on a server response or for a ui change (fadeIn, etc.) to occur.
*
* #param testFx javascript condition that evaluates to a boolean,
* it can be passed in as a string (e.g.: "1 == 1" or "$('#bar').is(':visible')" or
* as a callback function.
* #param onReady what to do when testFx condition is fulfilled,
* it can be passed in as a string (e.g.: "1 == 1" or "$('#bar').is(':visible')" or
* as a callback function.
* #param timeOutMillis the max amount of time to wait. If not specified, 3 sec is used.
*/
function waitFor(testFx, onReady, timeOutMillis) {
var maxtimeOutMillis = timeOutMillis ? timeOutMillis : 3000, //< Default Max Timout is 3s
start = new Date().getTime(),
condition = (typeof(testFx) === "string" ? eval(testFx) : testFx()), //< defensive code
interval = setInterval(function() {
if ( (new Date().getTime() - start < maxtimeOutMillis) && !condition ) {
// If not time-out yet and condition not yet fulfilled
condition = (typeof(testFx) === "string" ? eval(testFx) : testFx()); //< defensive code
} else {
if(!condition) {
// If condition still not fulfilled (timeout but condition is 'false')
console.log("'waitFor()' timeout");
phantom.exit(1);
} else {
// Condition fulfilled (timeout and/or condition is 'true')
console.log("'waitFor()' finished in " + (new Date().getTime() - start) + "ms.");
typeof(onReady) === "string" ? eval(onReady) : onReady(); //< Do what it's supposed to do once the condition is fulfilled
clearInterval(interval); //< Stop this interval
}
}
}, 250); //< repeat check every 250ms
};
var page = require('webpage').create(), system = require('system'), address, output, size;
if (system.args.length < 3 || system.args.length > 5) {
console.log('Usage: rasterize.js URL filename [paperwidth*paperheight|paperformat] [zoom]');
console.log(' paper (pdf output) examples: "5in*7.5in", "10cm*20cm", "A4", "Letter"');
phantom.exit(1);
} else {
address = system.args[1];
output = system.args[2];
if (system.args.length > 3 && system.args[2].substr(-4) === ".pdf") {
size = system.args[3].split('*');
page.paperSize = size.length === 2 ? {
width : size[0],
height : size[1],
margin : '0px'
} : {
format : system.args[3],
orientation : 'portrait',
margin : {
left : "5mm",
top : "8mm",
right : "5mm",
bottom : "9mm"
}
};
}
if (system.args.length > 4) {
page.zoomFactor = system.args[4];
}
var resources = [];
page.onResourceRequested = function(request) {
resources[request.id] = request.stage;
};
page.onResourceReceived = function(response) {
resources[response.id] = response.stage;
};
page.open(address, function(status) {
if (status !== 'success') {
console.log('Unable to load the address!');
phantom.exit();
} else {
waitFor(function() {
// Check in the page if a specific element is now visible
for ( var i = 1; i < resources.length; ++i) {
if (resources[i] != 'end') {
return false;
}
}
return true;
}, function() {
page.render(output);
phantom.exit();
}, 10000);
}
});
}

Here is a solution that waits for all resource requests to complete. Once complete it will log the page content to the console and generate a screenshot of the rendered page.
Although this solution can serve as a good starting point, I have observed it fail so it's definitely not a complete solution!
I didn't have much luck using document.readyState.
I was influenced by the waitfor.js example found on the phantomjs examples page.
var system = require('system');
var webPage = require('webpage');
var page = webPage.create();
var url = system.args[1];
page.viewportSize = {
width: 1280,
height: 720
};
var requestsArray = [];
page.onResourceRequested = function(requestData, networkRequest) {
requestsArray.push(requestData.id);
};
page.onResourceReceived = function(response) {
var index = requestsArray.indexOf(response.id);
if (index > -1 && response.stage === 'end') {
requestsArray.splice(index, 1);
}
};
page.open(url, function(status) {
var interval = setInterval(function () {
if (requestsArray.length === 0) {
clearInterval(interval);
var content = page.content;
console.log(content);
page.render('yourLoadedPage.png');
phantom.exit();
}
}, 500);
});

Maybe you can use the onResourceRequested and onResourceReceived callbacks to detect asynchronous loading. Here's an example of using those callbacks from their documentation:
var page = require('webpage').create();
page.onResourceRequested = function (request) {
console.log('Request ' + JSON.stringify(request, undefined, 4));
};
page.onResourceReceived = function (response) {
console.log('Receive ' + JSON.stringify(response, undefined, 4));
};
page.open(url);
Also, you can look at examples/netsniff.js for a working example.

In my program, I use some logic to judge if it was onload: watching it's network request, if there was no new request on past 200ms, I treat it onload.
Use this, after onLoadFinish().
function onLoadComplete(page, callback){
var waiting = []; // request id
var interval = 200; //ms time waiting new request
var timer = setTimeout( timeout, interval);
var max_retry = 3; //
var counter_retry = 0;
function timeout(){
if(waiting.length && counter_retry < max_retry){
timer = setTimeout( timeout, interval);
counter_retry++;
return;
}else{
try{
callback(null, page);
}catch(e){}
}
}
//for debug, log time cost
var tlogger = {};
bindEvent(page, 'request', function(req){
waiting.push(req.id);
});
bindEvent(page, 'receive', function (res) {
var cT = res.contentType;
if(!cT){
console.log('[contentType] ', cT, ' [url] ', res.url);
}
if(!cT) return remove(res.id);
if(cT.indexOf('application') * cT.indexOf('text') != 0) return remove(res.id);
if (res.stage === 'start') {
console.log('!!received start: ', res.id);
//console.log( JSON.stringify(res) );
tlogger[res.id] = new Date();
}else if (res.stage === 'end') {
console.log('!!received end: ', res.id, (new Date() - tlogger[res.id]) );
//console.log( JSON.stringify(res) );
remove(res.id);
clearTimeout(timer);
timer = setTimeout(timeout, interval);
}
});
bindEvent(page, 'error', function(err){
remove(err.id);
if(waiting.length === 0){
counter_retry = 0;
}
});
function remove(id){
var i = waiting.indexOf( id );
if(i < 0){
return;
}else{
waiting.splice(i,1);
}
}
function bindEvent(page, evt, cb){
switch(evt){
case 'request':
page.onResourceRequested = cb;
break;
case 'receive':
page.onResourceReceived = cb;
break;
case 'error':
page.onResourceError = cb;
break;
case 'timeout':
page.onResourceTimeout = cb;
break;
}
}
}

I found this approach useful in some cases:
page.onConsoleMessage(function(msg) {
// do something e.g. page.render
});
Than if you own the page put some script inside:
<script>
window.onload = function(){
console.log('page loaded');
}
</script>

I found this solution useful in a NodeJS app.
I use it just in desperate cases because it launches a timeout in order to wait for the full page load.
The second argument is the callback function which is going to be called once the response is ready.
phantom = require('phantom');
var fullLoad = function(anUrl, callbackDone) {
phantom.create(function (ph) {
ph.createPage(function (page) {
page.open(anUrl, function (status) {
if (status !== 'success') {
console.error("pahtom: error opening " + anUrl, status);
ph.exit();
} else {
// timeOut
global.setTimeout(function () {
page.evaluate(function () {
return document.documentElement.innerHTML;
}, function (result) {
ph.exit(); // EXTREMLY IMPORTANT
callbackDone(result); // callback
});
}, 5000);
}
});
});
});
}
var callback = function(htmlBody) {
// do smth with the htmlBody
}
fullLoad('your/url/', callback);

This is an implementation of Supr's answer. Also it uses setTimeout instead of setInterval as Mateusz Charytoniuk suggested.
Phantomjs will exit in 1000ms when there isn't any request or response.
// load the module
var webpage = require('webpage');
// get timestamp
function getTimestamp(){
// or use Date.now()
return new Date().getTime();
}
var lastTimestamp = getTimestamp();
var page = webpage.create();
page.onResourceRequested = function(request) {
// update the timestamp when there is a request
lastTimestamp = getTimestamp();
};
page.onResourceReceived = function(response) {
// update the timestamp when there is a response
lastTimestamp = getTimestamp();
};
page.open(html, function(status) {
if (status !== 'success') {
// exit if it fails to load the page
phantom.exit(1);
}
else{
// do something here
}
});
function checkReadyState() {
setTimeout(function () {
var curentTimestamp = getTimestamp();
if(curentTimestamp-lastTimestamp>1000){
// exit if there isn't request or response in 1000ms
phantom.exit();
}
else{
checkReadyState();
}
}, 100);
}
checkReadyState();

This the code I use:
var system = require('system');
var page = require('webpage').create();
page.open('http://....', function(){
console.log(page.content);
var k = 0;
var loop = setInterval(function(){
var qrcode = page.evaluate(function(s) {
return document.querySelector(s).src;
}, '.qrcode img');
k++;
if (qrcode){
console.log('dataURI:', qrcode);
clearInterval(loop);
phantom.exit();
}
if (k === 50) phantom.exit(); // 10 sec timeout
}, 200);
});
Basically given the fact you're supposed to know that the page is full downloaded when a given element appears on the DOM. So the script is going to wait until this happens.

I use a personnal blend of the phantomjs waitfor.js example.
This is my main.js file:
'use strict';
var wasSuccessful = phantom.injectJs('./lib/waitFor.js');
var page = require('webpage').create();
page.open('http://foo.com', function(status) {
if (status === 'success') {
page.includeJs('https://cdnjs.cloudflare.com/ajax/libs/jquery/3.1.1/jquery.min.js', function() {
waitFor(function() {
return page.evaluate(function() {
if ('complete' === document.readyState) {
return true;
}
return false;
});
}, function() {
var fooText = page.evaluate(function() {
return $('#foo').text();
});
phantom.exit();
});
});
} else {
console.log('error');
phantom.exit(1);
}
});
And the lib/waitFor.js file (which is just a copy and paste of the waifFor() function from the phantomjs waitfor.js example):
function waitFor(testFx, onReady, timeOutMillis) {
var maxtimeOutMillis = timeOutMillis ? timeOutMillis : 3000, //< Default Max Timout is 3s
start = new Date().getTime(),
condition = false,
interval = setInterval(function() {
if ( (new Date().getTime() - start < maxtimeOutMillis) && !condition ) {
// If not time-out yet and condition not yet fulfilled
condition = (typeof(testFx) === "string" ? eval(testFx) : testFx()); //< defensive code
} else {
if(!condition) {
// If condition still not fulfilled (timeout but condition is 'false')
console.log("'waitFor()' timeout");
phantom.exit(1);
} else {
// Condition fulfilled (timeout and/or condition is 'true')
// console.log("'waitFor()' finished in " + (new Date().getTime() - start) + "ms.");
typeof(onReady) === "string" ? eval(onReady) : onReady(); //< Do what it's supposed to do once the condi>
clearInterval(interval); //< Stop this interval
}
}
}, 250); //< repeat check every 250ms
}
This method is not asynchronous but at least am I assured that all the resources were loaded before I try using them.

This is an old question, but since I was looking for full page load but for Spookyjs (that uses casperjs and phantomjs) and didn't find my solution, I made my own script for that, with the same approach as the user deemstone .
What this approach does is, for a given quantity of time, if the page did not receive or started any request it will end the execution.
On casper.js file (if you installed it globally, the path would be something like /usr/local/lib/node_modules/casperjs/modules/casper.js) add the following lines:
At the top of the file with all the global vars:
var waitResponseInterval = 500
var reqResInterval = null
var reqResFinished = false
var resetTimeout = function() {}
Then inside function "createPage(casper)" just after "var page = require('webpage').create();" add the following code:
resetTimeout = function() {
if(reqResInterval)
clearTimeout(reqResInterval)
reqResInterval = setTimeout(function(){
reqResFinished = true
page.onLoadFinished("success")
},waitResponseInterval)
}
resetTimeout()
Then inside "page.onResourceReceived = function onResourceReceived(resource) {" on the first line add:
resetTimeout()
Do the same for "page.onResourceRequested = function onResourceRequested(requestData, request) {"
Finally, on "page.onLoadFinished = function onLoadFinished(status) {" on the first line add:
if(!reqResFinished)
{
return
}
reqResFinished = false
And that's it, hope this one helps someone in trouble like I was. This solution is for casperjs but works directly for Spooky.
Good luck !

this is my solution its worked for me .
page.onConsoleMessage = function(msg, lineNum, sourceId) {
if(msg=='hey lets take screenshot')
{
window.setInterval(function(){
try
{
var sta= page.evaluateJavaScript("function(){ return jQuery.active;}");
if(sta == 0)
{
window.setTimeout(function(){
page.render('test.png');
clearInterval();
phantom.exit();
},1000);
}
}
catch(error)
{
console.log(error);
phantom.exit(1);
}
},1000);
}
};
page.open(address, function (status) {
if (status !== "success") {
console.log('Unable to load url');
phantom.exit();
} else {
page.setContent(page.content.replace('</body>','<script>window.onload = function(){console.log(\'hey lets take screenshot\');}</script></body>'), address);
}
});

Do Mouse move while page is loading should work.
page.sendEvent('click',200, 660);
do { phantom.page.sendEvent('mousemove'); } while (page.loading);
UPDATE
When submitting the form, nothing was returned, so the program stopped. The program did not wait for the page to load as it took a few seconds for the redirect to begin.
telling it to move the mouse until the URL changes to the home page gave the browser as much time as it needed to change. then telling it to wait for the page to finish loading allowed the page to full load before the content was grabbed.
page.evaluate(function () {
document.getElementsByClassName('btn btn-primary btn-block')[0].click();
});
do { phantom.page.sendEvent('mousemove'); } while (page.evaluate(function()
{
return document.location != "https://www.bestwaywholesale.co.uk/";
}));
do { phantom.page.sendEvent('mousemove'); } while (page.loading);

We Keep Coding

JavaScript is the programming language of the Web.

jQuery bug using PhantomJS - javascript

Don't you want to do it the other way around? function dothat() { console.log('0 started'); $('#id').children().eq(0).click(); console.log('0 finished'); return main(1, 0); } var page = require('webpage').create(); page.open('http://mywebsite.com/', dothat)

Related

JavascriptExecutor SyntaxError: Unexpected identifier. Why?

PhantomJS bug in basic script

How to make casperjs continue instead of exit on exception?

Unreliable behaviour in Node.js

phantomjs not waiting for "full" page load

Categories

Resources