I'm trying to pre-render a MathJax html file using PhantomJS. For example, suppose in math.html I have:
<!DOCTYPE html>
<html>
<head>
<script type="text/javascript" src="MathJax/MathJax.js"></script>
<script src="ConfigMathJax.js"></script>
</head>
<body>
<span class="math">\(e = m c^2\)</span>
</body>
</html>
My (broken) render script currently looks like:
var page = require('webpage').create();
var system = require('system');
var fs = require('fs');
page.open(system.args[1], function () {
page.evaluate(function(){
var flag = false;
MathJax.Hub.Queue(["Typeset",MathJax.Hub]);
MathJax.Hub.Queue(function(){
console.log(page.content);
phantom.exit();
});
});
});
I've attempted to write the page to standard out and exit after the MathJax render command is called from the queue. But it seems I'm in the "page"'s context rather than the top-level Phantom context. The variable page can't be found: ReferenceError: Can't find variable: page.
It I hack in a setTimeout instead of using the flag:
var page = require('webpage').create();
var system = require('system');
var fs = require('fs');
page.open(system.args[1], function () {
page.evaluate(function(){
MathJax.Hub.Queue(["Typeset",MathJax.Hub]);
});
setTimeout(function(){
console.log(page.content);
phantom.exit();
},10000);
});
then I get the desired output, but of course the wait time 10000ms will depend on the content.
How can I let PhantomJS know that MathJax is done rendering?
Is this a sandbox issue?
Try the following:
var page = require('webpage').create();
var system = require('system');
var fs = require('fs');
page.open(system.args[1], function () {
page.evaluate(function () {
MathJax.Hub.Queue(
["Typeset",MathJax.Hub],
function () {
console.log(page.content);
phantom.exit();
}
);
});
});
This will queue the console output and phantom.exit() calls to occur immediately after the typesetting occurs. I haven't tested the code, but this is the way to synchronize something with MathJax's process.
UPDATE
Try this:
var page = require('webpage').create();
var system = require('system');
var fs = require('fs');
page.open(system.args[1], function () {
page.onAlert = function (msg) {
if (msg === "MathJax Done") {
console.log(page.content);
} else if (msg === "MathJax Timeout") {
console.log("Timed out waiting for MathJax");
} else {console.log(msg)}
phantom.exit();
};
page.evaluate(function () {
MathJax.Hub.Queue(
["Typeset",MathJax.Hub],
[alert,'MathJax Done']
);
setTimeout(function () {alert("MathJax Timeout")},10000); // timeout after 10 seconds
});
});
Related
i was learning about phantomjs
and i was trying to login into facebook using phantom js
i was successfully able to change the values of the input fields
looks like this
and i could submit the form
the problem is with multiple redirects that facebook do to reach the home page
here is how my code looks like:
var webpage = require('webpage');
var page = webpage.create();
page.settings.javascriptEnabled = true;
page.settings.loadImages = false;
phantom.cookiesEnabled = true;
phantom.javascriptEnabled = true;
var loaded = false;
page.onLoadStarted = function() {
loaded = false;
console.log("load started");
};
page.onLoadFinished = function() {
loaded = true;
console.log("load finished");
};
page.open('http://www.facebook.com',function () {
page.render('1.jpg')
page.evaluate(function () {
document.getElementById('m_login_email').value = "me#email.com";
document.getElementById('m_login_password').value = "*********";
document.getElementById('login_form').submit();
});
page.render('2.jpg');
setInterval(function () {
if (loaded) {
page.render('3.jpg');
console.log("loaded : "+page.url);
phantom.exit();
}
},1000);
});
//EDIT
//the output
load started
load finished
load started
load finished
load started
load finished
loaded : https://m.facebook.com/login/async/?refsrc=https%3A%2F%2Fwww.facebook.com%2F&lwv=100
//
that totally work for me but the 3.jpg image it's black image
and the url of this page >>https://m.facebook.com/login/async/?refsrc=https%3A%2F%2Fwww.facebook.com%2F&lwv=100<<
i was lucky only once, this code worked and i could login .. but it's back to fail again :(
thanks for help BTW
I have a cross platform app built using PhoneGap/Cordova.
I am trying to implement a function that runs an external JavaScript file when a controller loads. I am following a solution from HERE. And similarly HERE. But I want the JavaScript to execute without the window.open event, i.e. I want to run executeScript as soon as the device is ready.
How do I call the executeScript() without defining the var ref first though?
var navigation = angular.module("navigation", []);
navigation.controller("Navigation", function ($scope) {
var init = function () {
document.addEventListener("deviceready", onDeviceReady, false);
};
init();
function onDeviceReady() {
// LOAD EXTERNAL SCRIPT
var ref = window.open('http://www.haruair.com/', '_blank', 'location=yes, toolbar=yes, EnableViewPortScale=yes');
ref.addEventListener("loadstop", function () {
ref.executeScript(
{ file: 'http://haruair.com/externaljavascriptfile.js' },
function () {
ref.executeScript(
{ code: 'getSomething()' },
function (values) {
var data = values[0];
alert("Name: " + data.name + "\nAge: " + data.age);
});
}
);
});
});
You could try to add the script in the index.html file and do whatever you want from JS. Also, you must add to your whitelist this endpoint.
<!-- index.html -->
<script>
function onCustomLoad() {
//do stuff
}
</script>
<script src="your-custom-script" onload="onCustomLoad"></script>
you could use
var script = document.createElement("script");
script.type = "text/javascript";
script.id = "some_id";
script.src = 'http://haruair.com/externaljavascriptfile.js';
document.head.appendChild(script);
then call the function once finished
Some certain websites return only some of the code/html and not the full page
e.g.: "https://www.origin.com/deu/de-de/store/mirrors-edge/mirrors-edge-catalyst/standard-edition"
You get the full page when viewing it with the browsers developer tools.
But not with:
View Page Source
file_get_contents
curl_init
Is there any way to get the "real" content?
Thanks!
Use phantomjs. For example:
File test.js
var page = require('webpage').create();
var url = 'https://www.origin.com/deu/de-de/store/mirrors-edge/mirrors-edge-catalyst/standard-edition';
page.open(url, function (status) {
console.log(page.content)
phantom.exit();
});
After install phantomjs in your server run command
phantomjs test.js
UPDATE
var ok = 'Your needed content';
var iterator = 0;
page.open(url, function(status) {
setInterval(function () {
if(page.content.indexOf(ok) > -1) {
console.log (page.content);
phantom.exit(0)
}
iterator++;
if(iterator > 50) {
cosole.log('Bad content');
phantom.exit(0);
}
}, timeInterval)
});
Ok, so just for completeness sake, here is the code I'm using now:
PHP
$PhantomTimeout = 5000; // timeout to wait for js-functions on websites like Origin.com
if (parse_url ($_GET["url"], PHP_URL_HOST) == 'www.origin.com')
{
exec ('phantomjs.exe --ignore-ssl-errors=true --load-images=false fetch_external.js "'.$_GET["url"].'" '.$PhantomTimeout, $DataArr);
$Data = implode ('\n', $DataArr);
}
JS
"use strict";
var page = require('webpage').create(), system = require('system'), url;
if (system.args.length < 3) {
console.log ('Usage: fetch_external.js URL TIMEOUT');
phantom.exit (1);
}
var url = system.args[1];
var time = system.args[2];
page.open(url, function(status) {
setTimeout(function () {
console.log (page.content);
phantom.exit(0)
}, time)
});
A callback to wait until the whole page is loaded or a specific element would be better, but I haven't found out on how to do that yet...
I use phantomjs 2.1.1 and something is bothering me.
Here is the piece of code that I use for scraping a url and the html of the website is written into output.html file
page = require('webpage').create();
page.open(url, function (status) {
if (status !== 'success') {
console.log('Unable to load the address!');
phantom.exit();
} else {
window.setTimeout(function () {
var content = page.content;
fs.write("output.html", content, 'w');
}, 40000); //40 seconds timeout
}
});
Now, I need to scrape its paginations too. The next pages are loaded by a javascript function page(2); or page(3); I tried to get it done using
var pageinationOutput = page.evaluate(function (s) {
page(2);
});
console.log(pageinationOutput); // I need the output made by the `page(2);` call.
page = require('webpage').create();
page.open(url, function (status) {
if (status !== 'success') {
console.log('Unable to load the address!');
phantom.exit();
} else {
window.setTimeout(function () {
var content = page.content;
fs.write("output.html", content, 'w');
}, 40000); //40 seconds timeout
}
});
But i am not getting any outputs for this.
How can I execute a JavaScript function after a page is finished loading and get the new changes that has happened to the website contents after the javascript exec, in this case website will call the next page (using ajax) after page(2); method call.
Thanks in advance!
I found out the solution myself but I am not sure whether it's the perfect way to do it.
Code:
page.open(url, function (status) {
if (status !== 'success') {
console.log('Unable to load the address!');
phantom.exit();
} else {
window.setTimeout(function () {
var content = page.content;
fs.write("output.html", content, 'w');
page.evaluate(function (cb) {
window.page(2);
});
var waiter = window.setInterval(function () {
var nextPageContent = page.evaluate(function (cb) {
return document.documentElement.outerHTML;
});
if (nextPageContent !== false) {
window.clearInterval(waiter);
fs.write("output-2.html", content, 'w');
}
}, 40000);//40 seconds timeout
}, 40000);//40 seconds timeout
}
});
I recently published a project that gives PHP access to a browser. Get it here: https://github.com/merlinthemagic/MTS. It is also PhantomJS under the hood.
If you provided the URL i could make a working example. I need to know how you determine the last page. In the example i simply set it to 10.
I also need to know if the page buttons have an id attribute, If they dont no problem, we find another way to trigger them. But for this example I assume they do and to make it simple the ids will be page_2, page_3 ....
After downloading and setup you would simply use the following code:
$myUrl = "http://www.example.com";
$windowObj = \MTS\Factories::getDevices()->getLocalHost()->getBrowser('phantomjs')->getNewWindow($myUrl);
//now you can either retrieve the DOM for each page:
$doms = array();
//get the initial page DOM
$doms[] = $windowObj->getDom();
$pageID = "page_";
$lastPage = 10;
for ($i = 2; $i <= $lastPage; $i++) {
$windowObj->mouseEventOnElement("[id=".$pageID. $i . "]", 'leftclick');
$doms[] = $windowObj->getDom();
}
//$doms now hold all the pages, so you can parse them.
see code below.
I want to capture two web page.
After run this code using phantomjs, I only got one screenshot of webpage(www.baidu.com).
Is webpage in phantomjs singleton? Can I open two webpages in one phantomjs intance?
var webpage1 = require('webpage').create();
webpage1.onLoadFinished = function() {
webpage1.render('1.png');
}
webpage1.open('http://www.google.com');
var webpage2 = require('webpage').create();
webpage2.onLoadFinished = function() {
webpage2.render('2.png');
}
webpage2.open('http://www.baidu.com');
UPDATE:
thank you #Cybermaxs
I delay the end of the script, it works. I got two screenshot.
var webpage1 = require('webpage').create();
webpage1.onLoadFinished = function() {
webpage1.render('1.png');
}
webpage1.open('http://www.google.com.cn');
var webpage2 = require('webpage').create();
webpage2.onLoadFinished = function() {
webpage2.render('2.png');
}
webpage2.open('http://www.baidu.com');
setTimeout(function() {
console.log(webpage1.url);
console.log(webpage2.url);
phantom.exit(0);
}, 9000);
The problem is your script is that PhantomJS is async by Nature.
Your script will end before
There are many ways to solve this :
Delay the end of the script
setTimeout(function()
{
phantom.exit(0);
},5000);
Use the waitFor example to wait for all screen shots
Use a recurive approach
var webpage = require('webpage').create();
webpage.open('http://www.google.com', function(status)
{
if (status !== 'success') {
console.log('FAIL to load the address');
} else
{
webpage.render('1.png');
webpage.open('http://www.baidu.com',function(status)
{
if (status !== 'success') {
console.log('FAIL to load the address');
} else
{
webpage.render('2.png');
}
phantom.exit(0);
});
}
}
);
Fro sure, you can open two pages at the same time.