Get html rendered by Javascript using PhantomJS - javascript

I am trying to utilize PhantomJS to get html generated by dynamic page. I supposed that this would be easy, but after few hours of trying, I am still not lucky.
The page itself has this source code and what gets saved in 1.html eventually:
<!doctype html>
<html lang="cs" ng-app="appId">
<head ng-controller="MainCtrl">
(ommited some lines)
<script src="/js/conf/config.js?pars"></script>
<script src="/js/all.js?pars"></script>
</head>
<body>
<!--<![endif]-->
<div site-loader></div>
<div page-layout>
<div ng-view></div>
</div>
</body>
</html>
All content of web gets loaded inside site-loader div, but I have no luck to get it, even though I am using timeout before scraping html by PhantomJS. Here goes code I am using:
var url = 'http:...';
var page = require('webpage').create();
var fs = require('fs');
page.open(url, function (status) {
if (status !== 'success') {
console.log('Fail');
phantom.exit();
} else {
window.setTimeout(function () {
fs.write('1.html', page.content, 'w');
phantom.exit();
}, 2000); // Change timeout as required to allow sufficient time
}
});
Please what am I doing wrong?
EDIT:
I have decided to try PJscrapper framework and configured it to scrappe all contents of div block. All I got was lousy:
["","\n\t\tif (window.DOT) {\n\t\t\tDOT.cfg({service: 'sreality', impress: false});\n\t\t}\n\t","","Loader.load()","",""]
Seems that I seriously do not get it and always get code before Loader.load() acts. And obviously, timeout does not solve it.

This will do the trick
page.open(url, function (status) {
if (status !== 'success') {
console.log('Unable to load the url!');
phantom.exit();
} else {
window.setTimeout(function () {
var results = page.evaluate(function() {
return document.documentElement.innerHTML;
});
console.log(results)
phantom.exit();
}, 200);
}
});

Related

Issues scraping dynamic site (PhantomJS)

I am trying to find a way to wget/download a website.
I have tried wget and curl but no luck, then I've been led to PhantomJS.
var url = 'https://www.sagedining.com/menus/admiralfarragutacademy';
var fs = require('fs');
var page = require('webpage').create();
page.open(url, function(status) {
if (status === 'success') {
var html = page.evaluate(function() {
return document.documentElement.outerHTML;
});
try {
fs.write("/root/choate/page.html", html, 'w');
} catch(e) {
console.log(e);
}
}
phantom.exit();
});
When I run this code on my Debian VPS,
sudo xvfb-run -- phantomjs menu.js
It downloads the site when it's still loading, and therefore only downloads the loading screen.
It also throws this error every time it runs:
TypeError: Attempting to change the setter of an unconfigurable property.
TypeError: Attempting to change the setter of an unconfigurable property.
Is there any way to download this website after it loads all the menus? Does the error message have anything to do with it?
Thank you in advance.
That error is coming from PhantomJS because the page code is trying to set some properties in the DOM and maybe it does not have access to them. You should wait for the loading to happen, you can do it using timeout function:
if (status === 'success') {
window.setTimeout(function () {
var html = page.evaluate(function() {
return document.documentElement.outerHTML;
});
try {
fs.write("/root/choate/page.html", html, 'w');
} catch(e) {
console.log(e);
}
}, 1000); //Increase the value if you need more time
}

How can I inject JS and collect alert message from website?

I am trying to collect alert from website by using overwrite method. I searched on google and found wappalyzer, a Chrome/Firefox extension to detect software on website. It injects a script inject.js when page load and collect information. My method is similar. I make a local website and test it. When I inject overwrite_alert.js manually then it works.
But I want to do it dynamically and apply to other website. So I use headless browser like PhantomJS. The following code which I tried in PhantomJS but it does not work.
I am trying to inject a JavaScript on phantom JS. Like this code:
<!DOCTYPE html>
<html>
<head>
<title>Test alert</title>
<!-- !!! Inject here !!! <script src="overwrite_alert.js"></script> -->
<script type="text/javascript" src="other_script.js"> </script>
<script type="text/javascript">
alert("Alert content");
</script>
</head>
<body>
<h1>website content</h1>
</body>
</html>
overwrite_alert.js file from this question:
(function() {
var _alert = window.alert; // <-- Reference
window.alert = function(str) {
// do something additional
if(console) console.log(str);
//return _alert.apply(this, arguments); // <-- The universal method
_alert(str); // Suits for this case
};
})();
I tried with onLoadStarted event and My PhantomJS code:
var webPage = require('webpage');
var page = webPage.create();
var url = "https://localhost:5000/alert.html";
page.onConsoleMessage = function(msg, lineNum, sourceId) {
console.log('CONSOLE> ' + msg);
};
page.onLoadStarted = function() {
if (page.injectJs('do.js')) {
var title = page.evaluate(function() {
// returnTitle is a function loaded from our do.js file - see below
console.log("evaluate completed");
});
console.log(title);
}
}
page.open(url, function(status) {
if (status === "success") {
if (page.injectJs('do.js')) {
var title = page.evaluate(function() {
// returnTitle is a function loaded from our do.js file - see below
console.log("evaluate completed");
});
console.log(title);
phantom.exit();
}
page.render("onOpen.png");
}
});
Result:
$ phantomjs test_inject.js
CONSOLE> from onLoadStarted completed
null
CONSOLE> from page.open
null
Since the page.open callback is called after a page is loaded, it would be simply to late to change the implementation of window.alert. You would need to use earlier events such as page.onInitialized, page.onLoadStarted, etc.
Since you're interested in alerts, you don't need to do that at all, because PhantomJS provides an event for that: page.onAlert

phantomjs - execute a Javascript function after page load and then output new changes

I use phantomjs 2.1.1 and something is bothering me.
Here is the piece of code that I use for scraping a url and the html of the website is written into output.html file
page = require('webpage').create();
page.open(url, function (status) {
if (status !== 'success') {
console.log('Unable to load the address!');
phantom.exit();
} else {
window.setTimeout(function () {
var content = page.content;
fs.write("output.html", content, 'w');
}, 40000); //40 seconds timeout
}
});
Now, I need to scrape its paginations too. The next pages are loaded by a javascript function page(2); or page(3); I tried to get it done using
var pageinationOutput = page.evaluate(function (s) {
page(2);
});
console.log(pageinationOutput); // I need the output made by the `page(2);` call.
page = require('webpage').create();
page.open(url, function (status) {
if (status !== 'success') {
console.log('Unable to load the address!');
phantom.exit();
} else {
window.setTimeout(function () {
var content = page.content;
fs.write("output.html", content, 'w');
}, 40000); //40 seconds timeout
}
});
But i am not getting any outputs for this.
How can I execute a JavaScript function after a page is finished loading and get the new changes that has happened to the website contents after the javascript exec, in this case website will call the next page (using ajax) after page(2); method call.
Thanks in advance!
I found out the solution myself but I am not sure whether it's the perfect way to do it.
Code:
page.open(url, function (status) {
if (status !== 'success') {
console.log('Unable to load the address!');
phantom.exit();
} else {
window.setTimeout(function () {
var content = page.content;
fs.write("output.html", content, 'w');
page.evaluate(function (cb) {
window.page(2);
});
var waiter = window.setInterval(function () {
var nextPageContent = page.evaluate(function (cb) {
return document.documentElement.outerHTML;
});
if (nextPageContent !== false) {
window.clearInterval(waiter);
fs.write("output-2.html", content, 'w');
}
}, 40000);//40 seconds timeout
}, 40000);//40 seconds timeout
}
});
I recently published a project that gives PHP access to a browser. Get it here: https://github.com/merlinthemagic/MTS. It is also PhantomJS under the hood.
If you provided the URL i could make a working example. I need to know how you determine the last page. In the example i simply set it to 10.
I also need to know if the page buttons have an id attribute, If they dont no problem, we find another way to trigger them. But for this example I assume they do and to make it simple the ids will be page_2, page_3 ....
After downloading and setup you would simply use the following code:
$myUrl = "http://www.example.com";
$windowObj = \MTS\Factories::getDevices()->getLocalHost()->getBrowser('phantomjs')->getNewWindow($myUrl);
//now you can either retrieve the DOM for each page:
$doms = array();
//get the initial page DOM
$doms[] = $windowObj->getDom();
$pageID = "page_";
$lastPage = 10;
for ($i = 2; $i <= $lastPage; $i++) {
$windowObj->mouseEventOnElement("[id=".$pageID. $i . "]", 'leftclick');
$doms[] = $windowObj->getDom();
}
//$doms now hold all the pages, so you can parse them.

Including a local version of a library that failed to load

I am using PhantomJS to take a screenshot of a page every five minutes, and it works correctly most of the time. The problem is that sometimes the page I am taking a screenshot of fails to load the AngularJS library, and then, it can't build the page after that. So I am trying to figure out how to load a local copy in its place. Here is what I have been trying...
var page = require('webpage').create(),system = require('system');
var home = 'https://smartway.tn.gov/traffic/';
page.open(home, function (status) {
if(status === "success"){
page.injectJs('angular.js');
window.setTimeout((function() {
page.evaluate(function () {
/*stuff*/
});
}), 2000);
}
});
So angular.js is the name of my local copy of what the site would normally download. The site calls the script at the end of the body with several other scripts, and I am trying to find the best way to include it. I am wondering if it needs to be included by replacing the script tag in the html so it can be loaded in sequence, but I am not sure how to do that.
Thanks
It is problematic to reload a single JavaScript file when it failed, particularly when it is the framework. There are probably many scripts which depend on it. When the core framework is not loaded, those scripts will stop executing, because the angular reference cannot be resolved.
You could inject a local version of angular, but then you would have to go over all the other scripts which reference angular and "reload" them by either downloading and evaling them in order or putting them into the page as script elements. I advise against it, because it is probably very error prone.
You should just reload the page if angular does not exist after page load (callback of page.open). Since the same problem may occurr during reload, this has to be done recursively:
function open(countDown, done){
if (countDown === 0) {
done("ERROR: not loaded");
return;
}
page.open(home, function (status) {
if(status === "success"){
var angularExists = page.evaluate(function () {
return !!angular;
});
if (angularExists){
done();
} else {
open(countDown - 1, done);
}
} else {
open(countDown - 1, done);
}
});
}
open(5, function(err){
if(err) {
console.log(err);
} else {
page.render(target);
}
});
You can also try the page.reload() function instead of a page.open().
The other possiblity is to always inject the local version when the page loading began and stop any request for the remote version of the script:
page.onLoadStarted = function() {
page.injectJs('angular.js');
};
page.onResourceRequested = function(requestData, networkRequest) {
var match = requestData.url.match(/angular\.min\.js/g);
if (match != null) {
networkRequest.abort();
}
};
page.open(home, function (status) {
if(status === "success"){
window.setTimeout((function() {
page.evaluate(function () {
/*stuff*/
});
}), 2000);
}
});
This version works entirely without reloading.

phantomjs: is webpage singleton?

see code below.
I want to capture two web page.
After run this code using phantomjs, I only got one screenshot of webpage(www.baidu.com).
Is webpage in phantomjs singleton? Can I open two webpages in one phantomjs intance?
var webpage1 = require('webpage').create();
webpage1.onLoadFinished = function() {
webpage1.render('1.png');
}
webpage1.open('http://www.google.com');
var webpage2 = require('webpage').create();
webpage2.onLoadFinished = function() {
webpage2.render('2.png');
}
webpage2.open('http://www.baidu.com');
UPDATE:
thank you #Cybermaxs
I delay the end of the script, it works. I got two screenshot.
var webpage1 = require('webpage').create();
webpage1.onLoadFinished = function() {
webpage1.render('1.png');
}
webpage1.open('http://www.google.com.cn');
var webpage2 = require('webpage').create();
webpage2.onLoadFinished = function() {
webpage2.render('2.png');
}
webpage2.open('http://www.baidu.com');
setTimeout(function() {
console.log(webpage1.url);
console.log(webpage2.url);
phantom.exit(0);
}, 9000);
The problem is your script is that PhantomJS is async by Nature.
Your script will end before
There are many ways to solve this :
Delay the end of the script
setTimeout(function()
{
phantom.exit(0);
},5000);
Use the waitFor example to wait for all screen shots
Use a recurive approach
var webpage = require('webpage').create();
webpage.open('http://www.google.com', function(status)
{
if (status !== 'success') {
console.log('FAIL to load the address');
} else
{
webpage.render('1.png');
webpage.open('http://www.baidu.com',function(status)
{
if (status !== 'success') {
console.log('FAIL to load the address');
} else
{
webpage.render('2.png');
}
phantom.exit(0);
});
}
}
);
Fro sure, you can open two pages at the same time.

Categories