PhantomJS not returning any results - javascript

I'm using PhantomJS to scrape data from a webpage. PhantomJS is not returning anything from the evaluate method. The script just runs for a few seconds and then exits.
I've already checked to see if PhantomJS is connecting to the page -- it is.
PhantomJS is also able to grab the page title. I've already double-checked the class I'm looking for, yes -- I'm spelling it correctly.
var page = require('webpage').create();
page.open('http://www.maccosmetics.com/product/13854/36182/Products/Makeup/Lips/Lipstick/Giambattista-Valli-Lipstick', function(status) {
page.includeJs("http://ajax.googleapis.com/ajax/libs/jquery/1.6.1/jquery.min.js", function() {
waitFor(function() {
return page.evaluate(function() {
$('.product__price').is(':visible');
});
}, function(){
search = page.evaluate(function() {
return $('.product__price').text();
});
console.log(search)
});
});
phantom.exit();
});
I don't know what's going wrong here.

It's not showing you anything, because you're exiting too early. All functions (except evaluate()) that take a callback are asynchronous.
You're requesting to include jQuery in the page by calling page.includeJs(), you immediately exit PhantomJS. You need to exit when you're finished:
var page = require('webpage').create();
page.open('http://www.maccosmetics.com/product/13854/36182/Products/Makeup/Lips/Lipstick/Giambattista-Valli-Lipstick', function(status) {
page.includeJs("http://ajax.googleapis.com/ajax/libs/jquery/1.6.1/jquery.min.js", function() {
waitFor(function() {
return page.evaluate(function() {
$('.product__price').is(':visible');
});
}, function(){
search = page.evaluate(function() {
return $('.product__price').text();
});
console.log(search);
phantom.exit();
});
});
});

Related

phantomjs - execute a Javascript function after page load and then output new changes

I use phantomjs 2.1.1 and something is bothering me.
Here is the piece of code that I use for scraping a url and the html of the website is written into output.html file
page = require('webpage').create();
page.open(url, function (status) {
if (status !== 'success') {
console.log('Unable to load the address!');
phantom.exit();
} else {
window.setTimeout(function () {
var content = page.content;
fs.write("output.html", content, 'w');
}, 40000); //40 seconds timeout
}
});
Now, I need to scrape its paginations too. The next pages are loaded by a javascript function page(2); or page(3); I tried to get it done using
var pageinationOutput = page.evaluate(function (s) {
page(2);
});
console.log(pageinationOutput); // I need the output made by the `page(2);` call.
page = require('webpage').create();
page.open(url, function (status) {
if (status !== 'success') {
console.log('Unable to load the address!');
phantom.exit();
} else {
window.setTimeout(function () {
var content = page.content;
fs.write("output.html", content, 'w');
}, 40000); //40 seconds timeout
}
});
But i am not getting any outputs for this.
How can I execute a JavaScript function after a page is finished loading and get the new changes that has happened to the website contents after the javascript exec, in this case website will call the next page (using ajax) after page(2); method call.
Thanks in advance!
I found out the solution myself but I am not sure whether it's the perfect way to do it.
Code:
page.open(url, function (status) {
if (status !== 'success') {
console.log('Unable to load the address!');
phantom.exit();
} else {
window.setTimeout(function () {
var content = page.content;
fs.write("output.html", content, 'w');
page.evaluate(function (cb) {
window.page(2);
});
var waiter = window.setInterval(function () {
var nextPageContent = page.evaluate(function (cb) {
return document.documentElement.outerHTML;
});
if (nextPageContent !== false) {
window.clearInterval(waiter);
fs.write("output-2.html", content, 'w');
}
}, 40000);//40 seconds timeout
}, 40000);//40 seconds timeout
}
});
I recently published a project that gives PHP access to a browser. Get it here: https://github.com/merlinthemagic/MTS. It is also PhantomJS under the hood.
If you provided the URL i could make a working example. I need to know how you determine the last page. In the example i simply set it to 10.
I also need to know if the page buttons have an id attribute, If they dont no problem, we find another way to trigger them. But for this example I assume they do and to make it simple the ids will be page_2, page_3 ....
After downloading and setup you would simply use the following code:
$myUrl = "http://www.example.com";
$windowObj = \MTS\Factories::getDevices()->getLocalHost()->getBrowser('phantomjs')->getNewWindow($myUrl);
//now you can either retrieve the DOM for each page:
$doms = array();
//get the initial page DOM
$doms[] = $windowObj->getDom();
$pageID = "page_";
$lastPage = 10;
for ($i = 2; $i <= $lastPage; $i++) {
$windowObj->mouseEventOnElement("[id=".$pageID. $i . "]", 'leftclick');
$doms[] = $windowObj->getDom();
}
//$doms now hold all the pages, so you can parse them.

How to get get URL of new page using casperJS

I am using casperJS to get links when clicking a button. The links are returned with window.open in javaScript.
The code I have written logs all the pages after clicking button, but phantom is not exiting in terminal window. Also some pages only show about:blank, especially the last ones.
var casper = require('casper').create();
var page = require('webpage').create();
var address = 'http://www.example.com';
page.open(address, function() {
page.onPageCreated = function(newPage) {
newPage.onClosing = function(closingPage) {
console.log('A child page is closing: ' + closingPage.url);
/* if i set phantom.exit() it will only log the first page url.
Problem: I need all page urls. */
}
}
page.evaluate(function() {
$(".button").click();
});
}
The method "getCurrentUrl()" will return the current url. Here is an easy (not really meaningful) example:
casper.test.begin('My Test', 1 , function suite(test) {
casper.start().viewport(1600,1000).thenOpen("https://blabla.de", function() {
var mylink = this.getElementInfo("#linkid").text;
this.clickLabel(mylink);
});
casper.waitForSelector("#page2", function() {
this.echo(this.getCurrentUrl());
});
casper.run(function() {
test.done();
});
});
You can get the URL of the current page in CasperJS using one of the following methods:
casper.getCurrentUrl():
Retrieves the current page URL. Note that the URL will be URL-decoded.
casper.page.url:
Accesses the existing PhantomJS WebPage instance (page), and gets the current URL of the web page.
casper.evaluate(function () { return location.href; }):
Evaluates the page URL in the Page DOM Environment.

Including a local version of a library that failed to load

I am using PhantomJS to take a screenshot of a page every five minutes, and it works correctly most of the time. The problem is that sometimes the page I am taking a screenshot of fails to load the AngularJS library, and then, it can't build the page after that. So I am trying to figure out how to load a local copy in its place. Here is what I have been trying...
var page = require('webpage').create(),system = require('system');
var home = 'https://smartway.tn.gov/traffic/';
page.open(home, function (status) {
if(status === "success"){
page.injectJs('angular.js');
window.setTimeout((function() {
page.evaluate(function () {
/*stuff*/
});
}), 2000);
}
});
So angular.js is the name of my local copy of what the site would normally download. The site calls the script at the end of the body with several other scripts, and I am trying to find the best way to include it. I am wondering if it needs to be included by replacing the script tag in the html so it can be loaded in sequence, but I am not sure how to do that.
Thanks
It is problematic to reload a single JavaScript file when it failed, particularly when it is the framework. There are probably many scripts which depend on it. When the core framework is not loaded, those scripts will stop executing, because the angular reference cannot be resolved.
You could inject a local version of angular, but then you would have to go over all the other scripts which reference angular and "reload" them by either downloading and evaling them in order or putting them into the page as script elements. I advise against it, because it is probably very error prone.
You should just reload the page if angular does not exist after page load (callback of page.open). Since the same problem may occurr during reload, this has to be done recursively:
function open(countDown, done){
if (countDown === 0) {
done("ERROR: not loaded");
return;
}
page.open(home, function (status) {
if(status === "success"){
var angularExists = page.evaluate(function () {
return !!angular;
});
if (angularExists){
done();
} else {
open(countDown - 1, done);
}
} else {
open(countDown - 1, done);
}
});
}
open(5, function(err){
if(err) {
console.log(err);
} else {
page.render(target);
}
});
You can also try the page.reload() function instead of a page.open().
The other possiblity is to always inject the local version when the page loading began and stop any request for the remote version of the script:
page.onLoadStarted = function() {
page.injectJs('angular.js');
};
page.onResourceRequested = function(requestData, networkRequest) {
var match = requestData.url.match(/angular\.min\.js/g);
if (match != null) {
networkRequest.abort();
}
};
page.open(home, function (status) {
if(status === "success"){
window.setTimeout((function() {
page.evaluate(function () {
/*stuff*/
});
}), 2000);
}
});
This version works entirely without reloading.

Web automation, Events and Execution Flow in CasperJS

I'm new to CasperJS and I'm trying to figure out the execution flow.
This is what I'm trying to achieve:
load a page
stores an image of the page
pass this image to a function and execute it (this process is quite long: ~15 seconds)
wait for the function to return the result
use the returned value to fill a field in the form in the loaded page
submit the form
this is a code snippet which tries to explain the solution I came up with:
var globProcessedImage;
var casper = require('casper').create({
viewportSize: {
width: 1024,
height: 768
}
});
casper.start('http://example.com/');
casper.then(function() {
this.captureSelector('./image.png', '#img-node');
});
casper.waitFor(function() {
return globProcessedImage !== undefined;
}, function then() {
this.sendKeys('#imagePassword', globProcessedImage);
});
casper.then(function() {
this.capture('./page.png');
});
casper.run();
casper.on('image.processed', function() {
setTimeout(function() {
globProcessedImage = 'my_result';
}, 15000);
});
This results in ReferenceError: Can't find variable: globProcessedImage.
It's still unclear to me how web automation and "external" functions mix together with CasperJS, as well as how parameters are passed between the page and casper/phantom environments.
Maybe something like that :
var globProcessedImage ;
var casper = require('casper').create({
viewportSize: {
width: 1024,
height: 768
}
});
casper.start('http://example.com/');
casper.options.waitTimeout = 16000;
casper.then(function() {
this.captureSelector('./image.png', '#img-node');
this.emit('image.processed');
});
/*
* If you need to wait for a fix time, it's okay, but a better way would be to wait in Casper
* 'the thing you do with your image' in a waitFor. -> asynchronous. With this solution you combine two timers, this is not the optimized solution.
*/
casper.waitFor(function() {
return globProcessedImage !== undefined;
}, function then() {
this.sendKeys('#imagePassword', globProcessedImage);
});
casper.then(function() {
this.capture('./page.png');
});
casper.run();
casper.on('image.processed', function() {
setTimeout(function() {
//when you will emit this event in your script, it will set the value of globProcessedImage to 'my_result' after 15sec
globProcessedImage = 'my_result';
}, 15000);
});

Detect if script has already loaded or not

It seems that helloworld.js gets loaded multiple times based on the number of times I click #load. I say this because when I look at Google Chromes Developer Tools Network tab, it shows helloworld.js as many times as I click #load.
$(document).ready(function() {
$("#load").click(function(){
$.getScript('helloworld.js', function() {
hello();
});
});
});
The hello() function looks like this:
function hello(){
alert("hello");
}
Is it possible to detect if helloworld.js has already loaded?
So if it hasn't loaded, load it, and if it has loaded, don't load it.
This is what Developer Tools currently shows me if I click the #load button 4 times:
Set a flag when file loaded successfully. If flag is set then skip the file loading again.
Try this code,
var isLoaded = 0; //Set the flag OFF
$(document).ready(function() {
$("#load").click(function(){
if(isLoaded){ //If flag is ON then return false
alert("File already loaded");
return false;
}
$.getScript('helloworld.js', function() {
isLoaded = 1; //Turn ON the flag
hello();
});
});
});
So why not only fire the event once like this:
$("#load").one("click", function() {
$load = $(this);
$.getScript('helloworld.js', function() {
hello();
// bind hello to the click event of load for subsequent calls
$load.on('click', hello);
});
});
That would prevent subsequent loads and avoids the use of a global
Another option is letting .getScript() run but let it take the script from browser's cache so you won't have it reloaded each and every time.
To achieve this, add such code:
$.ajaxSetup({
cache: true
});
This is taken from the documentation page.
You could create a helper function:
var getScript = (function() {
var loadedFiles = {};
return function(filename, callback) {
if(loadedFiles[filename]) {
callback();
} else {
$.getScript(filename, function() {
loadedFiles[filename] = true;
callback();
});
}
};
})();

Categories