Accessing the contentDocument of an iframe in phantomjs - javascript

I'm having difficulties accessing the contentDocument of an iframe. I am using phantomjs (1.9). I have looked into various threads but none seem to have the answer.
This is my phantomjs script where I have injected jquery to try and select the element.
// Create the PhantomJS page object used to load and script the target site.
var page = require('webpage').create();
// Relay console output produced inside the page to PhantomJS's own console.
page.onConsoleMessage = function(msg, lineNum, sourceId) {
console.log('CONSOLE: ' + msg);
};
// Log any error message handed to the page's onError callback.
page.onError = function(msg) {
console.log('ERROR MESSAGE: ' + msg);
};
page.open('http://localhost:8080/', function() {
// Inject jQuery 1.6.1 from the Google CDN; the callback runs once the script has been included.
page.includeJs("http://ajax.googleapis.com/ajax/libs/jquery/1.6.1/jquery.min.js", function() {
page.evaluate(function() {
// NOTE(review): $('iframe') is a jQuery collection, not a DOM element, so it has no
// contentDocument property; $('iframe')[0] would be the underlying element.
console.log( $('iframe').contentDocument.documentElement );
});
phantom.exit();
});
});
Apart from jQuery, I have also used these two lines of code to get the DOM element that I want (the DOM HTML element that's inside the iframe). PhantomJS seems unable to parse anything beyond getElementsByTagName('iframe') or $('iframe'). Could it be because the iframe hasn't finished loading yet?
// Plain-DOM attempts: take the first <iframe> on the page and read properties of its contentDocument.
document.getElementsByTagName('iframe')[0].contentDocument.activeElement;
document.getElementsByTagName('iframe')[0].contentDocument.documentElement;
I am also running the script with the --web-security=no option, i.e. with web security disabled.

I ran into this issue, but found it was because I was not wrapping the code in evaluate(). You seem to be doing that, though. Try this without using jQuery:
page.evaluate(function () {
  // Runs in the page context: fetch the iframe element by id and work on its document.
  // 'iframeName' and 'testInput' are presumably placeholder ids; substitute your own.
  // Declared with `var` so the snippet does not leak an implicit global into the page.
  var iframeDoc = document.getElementById('iframeName').contentDocument;
  iframeDoc.getElementById("testInput").value = "test";
});

Related

How to select html elements using jQuery in Phantomjs?

I am trying to scrape some content from a website that uses JavaScript to load dynamic content. Previously I used request and cheerio for scraping and they worked just fine, but I have just found out that request and cheerio cannot scrape dynamic content. After doing some research, I found PhantomJS, which can get all the content after the page has loaded. My problem now is that I cannot use jQuery selectors the way I used to in cheerio. This is my sample code, but the selector returns nothing.
// Load the target page, wait 1.5 s, then inject jQuery and try to read the first matching link.
var page = require('webpage').create();
var url = 'http://angkorauto.com/vehicle';
page.open(url, function (status) {
if (status !== 'success') {
console.log('Unable to load the address!');
phantom.exit();
} else {
// Delay so that dynamically loaded content has a chance to appear before querying.
window.setTimeout(function () {
// console.log(page.content);
page.includeJs('https://cdnjs.cloudflare.com/ajax/libs/jquery/3.1.1/jquery.min.js', function(){
page.evaluate(function(){
// This console.log runs in the page context; no page.onConsoleMessage handler is set in this script.
console.log($('.divTitle').find('a').attr('href'));
});
});
// NOTE(review): this exit sits outside the includeJs callback, so it is reached
// right after includeJs is called rather than after the callback has run.
phantom.exit();
}, 1500);
}
});
Could you help me with this problem? I really get stuck now.
Thanks for ur time to help.
The website you want to scrape has jQuery already (like many other websites) so you don't have load it again.
This works fine:
var page = require('webpage').create();
var url = 'http://angkorauto.com/vehicle';

page.open(url, function (status) {
  // Stop early (with a non-zero exit code) when the page could not be loaded.
  if (status !== 'success') {
    console.log('Unable to load the address!');
    phantom.exit(1);
    return;
  }

  // The site already ships jQuery, so query through the page's own copy
  // and return the value to the PhantomJS context.
  var href = page.evaluate(function () {
    return jQuery('.divTitle').find('a').attr('href');
  });
  console.log(href);

  // Without an explicit exit the PhantomJS process keeps running after the script is done.
  phantom.exit();
});

Why does phantom.exit() have a 2 second delay?

I noticed that for a simple script like:
// Minimal reproduction: open a page, log its title from the page context, then exit.
var url = "http://stackoverflow.com";
var page = require('webpage').create();
// Messages logged inside the page arrive here; the only message sent below is document.title.
page.onConsoleMessage = function(msg) {
console.log('Page title is ' + msg);
};
// Log the load status reported to onLoadFinished.
page.onLoadFinished = function(status) {
console.log('Status: ' + status);
};
page.open(url, function(status) {
page.evaluate(function() {
console.log(document.title);
});
// The exit call whose delay is being asked about.
phantom.exit();
});
calling phantom.exit() will not exit immediately, rather it will wait 2 seconds before doing so. I'm using version 2.1.1.
Do you know where this delay comes from and how I can make phantom exit immediately? Thank you!
This looks like an open issue:
https://github.com/ariya/phantomjs/issues/14033
I tried your code with the latest version and I get the same behaviour.
You can use phantomjs@1.9.X, as "jmullo" commented in the issue thread.
Like other browsers, PhantomJS is a full browser, just headless, so its exit() method has to do many things, which may take a couple of seconds.

ReferenceError: ko is not defined when loading from jQuery

I'm trying to load KnockoutJS using the jQuery's getScript function and, after countless tests, I'm still stuck with the
ReferenceError: ko is not defined
error.
Here's a simple code that I'm running in my script to test if everything works (I've also tried to write this code directly into FireBug but I got the same results):
// Fetch the Knockout script through jQuery, then log the three callback arguments followed by `ko`.
$.getScript('http://knockoutjs.com/downloads/knockout-3.2.0.js',
function(d,s,j){
console.log(d);
console.log(s);
console.log(j);
// Referencing `ko` here is what raises the ReferenceError when Knockout did not define it.
console.log(ko);
});
Checking the console shows those messages:
undefined
success
Object { readyState=4, status=200, statusText="success", other elements...}
ReferenceError: ko is not defined
and I've no idea on what's going on.
As a side note, I've also tried those urls with the same results
http://cdnjs.cloudflare.com/ajax/libs/knockout/3.2.0/knockout-min.js
http://knockoutjs.com/downloads/knockout-3.2.0.debug.js
EDIT:
Based on @T.J. Crowder's first answer, I also tried this code:
// Build a <script> element with jQuery, attach load/error handlers, then set its src.
// NOTE(review): the element is never inserted into the document in this snippet — confirm
// whether that is why neither handler logs anything.
$("<script>")
.on("load", function() {
// It's loaded now
console.log("success");
})
.on("error", function() {
// It failed to load
console.log("error");
})
.attr("src", "http://knockoutjs.com/downloads/knockout-3.2.0.js");
but nothing is shown on the console.
getScript uses XHR (ajax), which is controlled by the Same Origin Policy. So you can't load scripts from other origins unless they enable your origin via CORS.
Instead, simply add a script tag referencing the script you want to load.
// Load Knockout by injecting a <script> element and report the outcome on the page.
var scr = document.createElement('script');
scr.onload = function () {
  // `typeof ko` is safe even if the global was never defined.
  display("Loaded: " + typeof ko);
};
scr.onerror = function () {
  display("Error");
};
// Handlers are attached before src is set and the element is inserted.
scr.src = "http://knockoutjs.com/downloads/knockout-3.2.0.js";
// Insert next to the first existing <script> so the new script starts loading.
document.querySelector("script").parentNode.appendChild(scr);

/**
 * Append a paragraph containing the given message to the document body.
 * @param {*} msg - value to show; converted with String().
 */
function display(msg) {
  // .text() inserts the message as plain text instead of parsing it as HTML.
  $("<p>").text(String(msg)).appendTo(document.body);
}
<script src="https://ajax.googleapis.com/ajax/libs/jquery/1.11.1/jquery.min.js"></script>

Phantomjs check if javascript exists and is working

I am quite new to PhantomJS and am starting to get to know how to use it. However, none of the semi-advanced tutorials cover the thing I want to use PhantomJS for.
My question is: how would I check whether JavaScript is active on the site and whether it is working correctly (i.e. not throwing errors in the console)?
I hope someone is able to point me in the right direction or knows how to do this.
you can interact with the open page using the webpage.evaluate method:
var page = require('webpage').create();
page.open('http://m.bing.com', function (status) {
  // Do not evaluate anything if the page failed to load; exit with a non-zero code instead.
  if (status !== 'success') {
    console.log('Unable to load the address!');
    phantom.exit(1);
    return;
  }

  var title = page.evaluate(function (s) {
    // what you do here is done in the context of the page
    // this console.log will appear in the virtual page console
    console.log("test");
    // here they are just returning the page title (tag passed as argument)
    return document.querySelector(s).innerText;
    // you are not required to return anything
  }, 'title');

  console.log(title);
  phantom.exit(); // closes phantom, free memory!
});
in order to see the virtual page console, you have to add a onConsoleMessage callback:
// Forward each message from the page's console, tagged with the line number and source id it came from.
page.onConsoleMessage = function (msg, lineNum, sourceId) {
  var prefix = 'CONSOLE: ' + msg;
  var origin = ' (from line #' + lineNum + ' in "' + sourceId + '")';
  console.log(prefix + origin);
};
EDIT: by default javascript is executed.

Detect if the iframe content has loaded successfully

I have a widget that contains an iframe. The user can configure the url of this iframe, but if the url could not be loaded (it does not exists or the user does not have access to internet) then the iframe should failover to a default offline page.
The question is: how can I detect whether the iframe could be loaded or not? I tried subscribing to the 'load' event and failing over if the event has not fired after some time, but this only works in Firefox, since IE and Chrome fire the 'load' event even when the "Page Not Found" page is displayed.
I found the following link via Google: http://wordpressapi.com/2010/01/28/check-iframes-loaded-completely-browser/
Don't know if it solves the 'Page Not Found' issue.
<script type="text/javascript">
// type must be a JavaScript MIME type ("text/javascript"); with type="javascript"
// the browser treats the block as data and never runs it.
var iframe = document.createElement("iframe");

// Attach the handler before the iframe starts loading.
if (navigator.userAgent.indexOf("MSIE") > -1 && !window.opera) {
  // Old IE: watch readyState changes instead of the load event.
  iframe.onreadystatechange = function () {
    if (iframe.readyState == "complete") {
      alert("Iframe is now loaded.");
    }
  };
} else {
  iframe.onload = function () {
    alert("Iframe is now loaded.");
  };
}

iframe.src = "http://www.your_iframe.com/";
// The iframe only starts loading once it is part of the document.
// This assumes the script runs inside <body> (document.body must exist at this point).
document.body.appendChild(iframe);
</script>
I haven't tried it myself, so I don't know if it works. Good luck!
Nowadays browsers have a series of security limitations that keep you away from the content of an iframe (if it isn't on your domain).
If you really need that functionality, you have to build a server page that works as a proxy: it receives the URL as a parameter, tests whether it is a valid URL, and then either redirects to it or displays the error page.
If you control the content of the iframe, the iframe can send a message to the parent.
// Run inside the iframe: notify the parent window that the iframe is done.
// NOTE(review): the '*' target origin places no restriction on the receiving origin — confirm this is acceptable.
parent.postMessage('iframeIsDone', '*');
The parent callback listens for the message.
/**
 * Parent-side handler for messages posted by the iframe.
 * @param {MessageEvent} event - the message event; event.data carries the payload.
 */
function onIframeMessage(event) {
  // NOTE(review): event.origin is not checked here; add a check if only a known origin should be trusted.
  if (event.data === 'iframeIsDone') {
    // iframe is done callback here
  }
}

// Call the registration method on window directly rather than through a detached
// reference, so it does not depend on an implicit `this`.
if (window.addEventListener) {
  window.addEventListener("message", onIframeMessage, false);
} else {
  // Legacy IE fallback.
  window.attachEvent("onmessage", onIframeMessage);
}
How about checking if the url is available and only then setting the actual url of the iframe?
e.g. with JQuery
// Show a local placeholder first, then switch the iframe to the real URL only if a GET for it succeeds.
// The URL needs an explicit scheme; a bare "google.com" would be resolved as a relative path.
var url = "http://google.com";
var loading_url = "/empty.html";
var frame = document.getElementById("iframe");

frame.src = loading_url;

$.ajax({
  url: url,
  type: 'GET',
  // complete runs after the request finishes, whether it succeeded or failed.
  complete: function (jqXHR, textStatus) {
    if (jqXHR.status === 200) {
      frame.src = url;
    }
  }
});
Edit:
This does not seem to work cross domain, the status code is 0 in those cases.
If you have control over the contents of the iframe (e.g. you can add arbitrary code to the page), you can try to implement a special function in each of them, then in your page, you call that function and catch an error (via window.onerror handler) if the function called via eval fails because the page didn't load.
Here's example code: http://www.tek-tips.com/viewthread.cfm?qid=1114265&page=420
After the onload fires, you can scavenge the content of the iframe to see whether it contains a useful page or not. Unfortunately, you'd have to make this browser-specific, because each browser displays a different "page not found" message.
For more info, take a look here at http://roneiv.wordpress.com/2008/01/18/get-the-content-of-an-iframe-in-javascript-crossbrowser-solution-for-both-ie-and-firefox/

Categories