phantomjs: is webpage singleton? - javascript

see code below.
I want to capture two web page.
After run this code using phantomjs, I only got one screenshot of webpage(www.baidu.com).
Is webpage in phantomjs singleton? Can I open two webpages in one phantomjs intance?
var webpage1 = require('webpage').create();
webpage1.onLoadFinished = function() {
webpage1.render('1.png');
}
webpage1.open('http://www.google.com');
var webpage2 = require('webpage').create();
webpage2.onLoadFinished = function() {
webpage2.render('2.png');
}
webpage2.open('http://www.baidu.com');
UPDATE:
thank you #Cybermaxs
I delay the end of the script, it works. I got two screenshot.
var webpage1 = require('webpage').create();
webpage1.onLoadFinished = function() {
webpage1.render('1.png');
}
webpage1.open('http://www.google.com.cn');
var webpage2 = require('webpage').create();
webpage2.onLoadFinished = function() {
webpage2.render('2.png');
}
webpage2.open('http://www.baidu.com');
setTimeout(function() {
console.log(webpage1.url);
console.log(webpage2.url);
phantom.exit(0);
}, 9000);

The problem is your script is that PhantomJS is async by Nature.
Your script will end before
There are many ways to solve this :
Delay the end of the script
setTimeout(function()
{
phantom.exit(0);
},5000);
Use the waitFor example to wait for all screen shots
Use a recurive approach
var webpage = require('webpage').create();
webpage.open('http://www.google.com', function(status)
{
if (status !== 'success') {
console.log('FAIL to load the address');
} else
{
webpage.render('1.png');
webpage.open('http://www.baidu.com',function(status)
{
if (status !== 'success') {
console.log('FAIL to load the address');
} else
{
webpage.render('2.png');
}
phantom.exit(0);
});
}
}
);
Fro sure, you can open two pages at the same time.

Related

Issues scraping dynamic site (PhantomJS)

I am trying to find a way to wget/download a website.
I have tried wget and curl but no luck, then I've been led to PhantomJS.
var url = 'https://www.sagedining.com/menus/admiralfarragutacademy';
var fs = require('fs');
var page = require('webpage').create();
page.open(url, function(status) {
if (status === 'success') {
var html = page.evaluate(function() {
return document.documentElement.outerHTML;
});
try {
fs.write("/root/choate/page.html", html, 'w');
} catch(e) {
console.log(e);
}
}
phantom.exit();
});
When I run this code on my Debian VPS,
sudo xvfb-run -- phantomjs menu.js
It downloads the site when it's still loading, and therefore only downloads the loading screen.
It also throws this error every time it runs:
TypeError: Attempting to change the setter of an unconfigurable property.
TypeError: Attempting to change the setter of an unconfigurable property.
Is there any way to download this website after it loads all the menus? Does the error message have anything to do with it?
Thank you in advance.
That error is coming from PhantomJS because the page code is trying to set some properties in the DOM and maybe it does not have access to them. You should wait for the loading to happen, you can do it using timeout function:
if (status === 'success') {
window.setTimeout(function () {
var html = page.evaluate(function() {
return document.documentElement.outerHTML;
});
try {
fs.write("/root/choate/page.html", html, 'w');
} catch(e) {
console.log(e);
}
}, 1000); //Increase the value if you need more time
}

http request POST with javascript using Java/Python?

I try to connect to http://leya2.eu/.
I´ve tried a lot of things... the problem is to run the Script on the Website.
I do want to do it with Python or Java... last thing i tried was PhatomJS, but i only got the BlazingFast Page like everytime.
The Problem is this script:
var XHR="onload"in new XMLHttpRequest?XMLHttpRequest:XDomainRequest,xhr=new XHR;var ww = $(window).width();xhr.open("GET","/___S___/?rid=CLYVwTkbSONnYzhmsnBo6AhooeCoHsgxayFRarvktEYBIdpcL2aQPVoW7U32QGrh&sid=" + ww +"&d=leya2.eu&tz=1500505915.508",true),xhr.onreadystatechange=function(){if(4==xhr.readyState&&(xhr.status==200)){var t=document.createElement("script");t.type="text/javascript",t.text=xhr.responseText,document.body.appendChild(t)}},xhr.send(null);function wait(){}; setTimeout(wait(),4000);
So maybe someone got an idea ?
my standard script is like:
from urllib.request import urlopen
from urllib.parse import urlencode
url = 'http://evidence-server.com/?s=login'
response = urlopen(url, urlencode(data).encode("utf-8"))
content = response.read().decode(response.headers.get_content_charset())
print(content)
This allready was near:
var page = require("webpage").create(),
url = "http://leya2.eu/";
function onPageReady() {
var htmlContent = page.evaluate(function () {
return document.documentElement.outerHTML;
});
console.log(htmlContent);
phantom.exit();
}
page.open(url, function (status) {
if (status !== 'success') {
console.log('Unable to load the address!');
phantom.exit();
} else {
window.setTimeout(function () {
console.log(htmlContent);
phantom.exit();
}, 5000);
}
});
Here's a working script to access your target site. Notice a built-in variable page.content:
var page = require("webpage").create(),
url = "http://leya2.eu/";
page.open(url, function (status) {
if (status !== 'success') {
console.log('Unable to load the address!');
phantom.exit();
} else {
window.setTimeout(function () {
page.render("leya2.jpg");
console.log(page.content);
phantom.exit();
}, 10000);
}
});
Produces this screenshot on PhantomJS version 2.5b for Windows.

PHP: file_get_contents doesn't work with certain javascript sites

Some certain websites return only some of the code/html and not the full page
e.g.: "https://www.origin.com/deu/de-de/store/mirrors-edge/mirrors-edge-catalyst/standard-edition"
You get the full page when viewing it with the browsers developer tools.
But not with:
View Page Source
file_get_contents
curl_init
Is there any way to get the "real" content?
Thanks!
Use phantomjs. For example:
File test.js
var page = require('webpage').create();
var url = 'https://www.origin.com/deu/de-de/store/mirrors-edge/mirrors-edge-catalyst/standard-edition';
page.open(url, function (status) {
console.log(page.content)
phantom.exit();
});
After install phantomjs in your server run command
phantomjs test.js
UPDATE
var ok = 'Your needed content';
var iterator = 0;
page.open(url, function(status) {
setInterval(function () {
if(page.content.indexOf(ok) > -1) {
console.log (page.content);
phantom.exit(0)
}
iterator++;
if(iterator > 50) {
cosole.log('Bad content');
phantom.exit(0);
}
}, timeInterval)
});
Ok, so just for completeness sake, here is the code I'm using now:
PHP
$PhantomTimeout = 5000; // timeout to wait for js-functions on websites like Origin.com
if (parse_url ($_GET["url"], PHP_URL_HOST) == 'www.origin.com')
{
exec ('phantomjs.exe --ignore-ssl-errors=true --load-images=false fetch_external.js "'.$_GET["url"].'" '.$PhantomTimeout, $DataArr);
$Data = implode ('\n', $DataArr);
}
JS
"use strict";
var page = require('webpage').create(), system = require('system'), url;
if (system.args.length < 3) {
console.log ('Usage: fetch_external.js URL TIMEOUT');
phantom.exit (1);
}
var url = system.args[1];
var time = system.args[2];
page.open(url, function(status) {
setTimeout(function () {
console.log (page.content);
phantom.exit(0)
}, time)
});
A callback to wait until the whole page is loaded or a specific element would be better, but I haven't found out on how to do that yet...

phantomjs - execute a Javascript function after page load and then output new changes

I use phantomjs 2.1.1 and something is bothering me.
Here is the piece of code that I use for scraping a url and the html of the website is written into output.html file
page = require('webpage').create();
page.open(url, function (status) {
if (status !== 'success') {
console.log('Unable to load the address!');
phantom.exit();
} else {
window.setTimeout(function () {
var content = page.content;
fs.write("output.html", content, 'w');
}, 40000); //40 seconds timeout
}
});
Now, I need to scrape its paginations too. The next pages are loaded by a javascript function page(2); or page(3); I tried to get it done using
var pageinationOutput = page.evaluate(function (s) {
page(2);
});
console.log(pageinationOutput); // I need the output made by the `page(2);` call.
page = require('webpage').create();
page.open(url, function (status) {
if (status !== 'success') {
console.log('Unable to load the address!');
phantom.exit();
} else {
window.setTimeout(function () {
var content = page.content;
fs.write("output.html", content, 'w');
}, 40000); //40 seconds timeout
}
});
But i am not getting any outputs for this.
How can I execute a JavaScript function after a page is finished loading and get the new changes that has happened to the website contents after the javascript exec, in this case website will call the next page (using ajax) after page(2); method call.
Thanks in advance!
I found out the solution myself but I am not sure whether it's the perfect way to do it.
Code:
page.open(url, function (status) {
if (status !== 'success') {
console.log('Unable to load the address!');
phantom.exit();
} else {
window.setTimeout(function () {
var content = page.content;
fs.write("output.html", content, 'w');
page.evaluate(function (cb) {
window.page(2);
});
var waiter = window.setInterval(function () {
var nextPageContent = page.evaluate(function (cb) {
return document.documentElement.outerHTML;
});
if (nextPageContent !== false) {
window.clearInterval(waiter);
fs.write("output-2.html", content, 'w');
}
}, 40000);//40 seconds timeout
}, 40000);//40 seconds timeout
}
});
I recently published a project that gives PHP access to a browser. Get it here: https://github.com/merlinthemagic/MTS. It is also PhantomJS under the hood.
If you provided the URL i could make a working example. I need to know how you determine the last page. In the example i simply set it to 10.
I also need to know if the page buttons have an id attribute, If they dont no problem, we find another way to trigger them. But for this example I assume they do and to make it simple the ids will be page_2, page_3 ....
After downloading and setup you would simply use the following code:
$myUrl = "http://www.example.com";
$windowObj = \MTS\Factories::getDevices()->getLocalHost()->getBrowser('phantomjs')->getNewWindow($myUrl);
//now you can either retrieve the DOM for each page:
$doms = array();
//get the initial page DOM
$doms[] = $windowObj->getDom();
$pageID = "page_";
$lastPage = 10;
for ($i = 2; $i <= $lastPage; $i++) {
$windowObj->mouseEventOnElement("[id=".$pageID. $i . "]", 'leftclick');
$doms[] = $windowObj->getDom();
}
//$doms now hold all the pages, so you can parse them.

PhantomJS not returning any results

I'm using PhantomJS to scrape data from a webpage. PhantomJS is not returning anything from the evaluate method. The script just runs for a few seconds and then exits.
I've already checked to see if PhantomJS is connecting to the page -- it is.
PhantomJS is also able to grab the page title. I've already double-checked the class I'm looking for, yes -- I'm spelling it correctly.
var page = require('webpage').create();
page.open('http://www.maccosmetics.com/product/13854/36182/Products/Makeup/Lips/Lipstick/Giambattista-Valli-Lipstick', function(status) {
page.includeJs("http://ajax.googleapis.com/ajax/libs/jquery/1.6.1/jquery.min.js", function() {
waitFor(function() {
return page.evaluate(function() {
$('.product__price').is(':visible');
});
}, function(){
search = page.evaluate(function() {
return $('.product__price').text();
});
console.log(search)
});
});
phantom.exit();
});
I don't know what's going wrong here.
It's not showing you anything, because you're exiting too early. All functions (except evaluate()) that take a callback are asynchronous.
You're requesting to include jQuery in the page by calling page.includeJs(), you immediately exit PhantomJS. You need to exit when you're finished:
var page = require('webpage').create();
page.open('http://www.maccosmetics.com/product/13854/36182/Products/Makeup/Lips/Lipstick/Giambattista-Valli-Lipstick', function(status) {
page.includeJs("http://ajax.googleapis.com/ajax/libs/jquery/1.6.1/jquery.min.js", function() {
waitFor(function() {
return page.evaluate(function() {
$('.product__price').is(':visible');
});
}, function(){
search = page.evaluate(function() {
return $('.product__price').text();
});
console.log(search);
phantom.exit();
});
});
});

Categories