Recursively walk website pages with phantomjs - javascript

I have to go through all the pages from a website and check for an element on every page. This has to happen recursively, and I chose to do it with PhantomJS. So, I basically have this/such code in main.js:
var page = require('webpage').create();
var allUrls = [];
var pageCheck = function(url) {
page.open(url, function(success) {
page.evaluate(function(allUrls, nextPage) {
// crawl all links, and if they are from this site ..
// add them to the allUrls array ..
// then check the page for the element ..
// and go to next eventual page ..
setTimeout(nextPage, 250);
}, allUrls, nextPage);
});
};
var nextPage = function() {
var nextUrl = allUrls.unshift();
if(nextUrl) pageCheck(nextUrl);
};
pageCheck('http://example.com/');
and I call this with phantomjs main.js.
But I see messages that "Can't find variable ...". And when I cleared all of them - I see now Can't find variable: pageCheck
How am I supposed to do this? ... And what is all this stuff with PhantomJS scopes ?? ...

I managed to figure it out, thanks to #ArtjomB :)
Basically, my mistake was that I was trying to call global stuff from page.evaluate, while I had to use it only for page manipulation. So I changed the code to this/such one:
var page = require('webpage').create();
var allUrls = [];
var pageCheck = function(url) {
page.open(url, function(success) {
var evalulation = page.evaluate(function() {
// gather urls and check element ..
return {
urls: ...,
checkedElement: ...
};
});
// manipulate the results from page.evaluate ..
someStuff(evalulation.urls);
otherStuff(evalulation.checkedElement);
// and THEN ... go to next eventual page ..
setTimeout(nextPage, 250);
});
};
var nextPage = function() {
var nextUrl = allUrls.unshift();
if(nextUrl) pageCheck(nextUrl);
else phantom.exit();
};
pageCheck('http://example.com/');

Related

chrome dev tools crashed during initialization call

The code below crashed on this line, document.body.appendChild(script);, where the new script gets added to the body element.The page crashed and browser's debugger tool showed this message DevTools was disconnected from the page. Once page is reloaded, DevTools will automatically reconnect. I'm not sure what is the error that caused the debugger tool to crash since the same process of adding and removing script to get random quote worked with #btn-quote click event. I've also tried to have the init function call when the page is loaded by replacing appController.init(); with window.onload = init;, but the same error still occurred. I would really appreciate with anyone can point out any logical error I made in the code below as I'm not able to figure them out.
var dataController = (function() {
var data = {
quote: '',
author: ''
}
function setRandomQuote() {
var script = document.createElement('script');
script.src = 'http://api.forismatic.com/api/1.0/?method=getQuote&lang=en&format=jsonp&jsonp=parseQuote';
document.body.appendChild(script);
window.parseQuote= function(json) {
data.quote = json.quoteText;
data.author = json.quoteAuthor;
}
script.parentNode.removeChild(script);
}
return {
getQuote: function() {
setRandomQuote();
return data;
}
};
})();
var UIController = (function(){
return {
changeQuote: function(data) {
document.querySelector('q').innerHTML = data.quote;
document.querySelector('cite').innerHTML = data.author;
}
}
})();
var appController = (function(dataCtrl, UICtrl) {
var setupEventListener = function() {
document.querySelector('#btn-quote').addEventListener('click', getRandomQuote);
};
function getRandomQuote() {
var data= {};
// 1. get quote
var data = dataCtrl.getQuote();
// 2. change UI with new quote
UICtrl.changeQuote(data);
}
return {
init: function() {
setupEventListener();
getRandomQuote();
}
}
})(dataController, UIController);
appController.init();

PhantomJS Memory Exhausted error when using JavaScript

I've searched all over the web for a solution with no luck so far. I know this is a documented issue, but I am only trying to load a single web page and all the responses have been "this happens trying to do tests on 200 pages".
I have tried:
page.settings.loadImages = true;
page.settings.loadplugins = false;
I added the ``page.close()` even though like I said, it's only with one URL right now.
I've tried setting loadImages to false as well. It works fine when I remove my one line of JavaScript (for example replacing it with document.title but as soon as that line is there, the memory usage rockets to 1000MB and then it quits.
My only goal is to check if a certain element exists on the page. How can I resolve this issue? None of the solutions I've found online are working for me.
The code in question:
var system = require('system');
var page = require('webpage').create();
page.settings.userAgent = 'Chrome/52.0.2743.116';
var url = 'http://www.example.com/';
var ending = system.args[1];
ending = encodeURIComponent(ending);
url = url + ending;
page.settings.loadImages = false;
page.settings.loadplugins = false;
page.viewportSize = { width: 1680, height: 1050 };
page.open(url, function(status) {
setTimeout(function(){
var elem = page.evaluate(function() {
return $(".primary-info")[0];
});
if(!elem){
console.log('no image found');
page.render(ending+'.png');
}
page.close();
phantom.exit();
}, 5000);
});
Don't just return the whole object, there could be quite a lot of its attributes. Only take what is necessary:
var elem = page.evaluate(function() {
var data = {};
var item = $(".primary-info")[0];
data.title = item.value;
data.color = item.style.color;
return data;
});

Firefox Extension: self.port.on not passing result to outer function

main.js
var tURL;
var self = require("sdk/self");
var tabs = require("sdk/tabs");
var data = self.data;
/*
contentScriptFile: [data.url("scripts/lib/jquery.js"),
data.url("scripts/lib/jquery-ui.js"),
data.url("scripts/platform.js")];
*/
// First time install
if (require('sdk/self').loadReason == 'install') {
// Do something on the very first install
tabs.open("http://www.example.com/test2.php");
}
require("sdk/tabs").on("ready", logURL);
function logURL(tab) {
tURL = tab.url;
console.log("LOGURL: "+tURL);
var worker = tabs.activeTab.attach({
contentScriptFile: [data.url("scripts/lib/jquery.js"),
data.url("scripts/platform.js"),
data.url("scripts/script.js")]
});
worker.port.emit("vsAPI", tURL);
worker.port.on('getURL', function(callback) {
var gotURL = data.url(callback);
worker.port.emit("gotURL", gotURL);
});
}
platform.js
function Platform() {
var that = this;
that.getURL = function (filename) {
self.port.emit('getURL', filename);
self.port.on("gotURL", function(callback) {
console.log("gotURL: "+callback);
var output = callback;
});
//console.log("output: "+output);
return output;
}
}
Problem:
platform.js emits to main.js
main.js receives, processes and passes back result to platform.js
platform.js receives result successfully.
However, I want to use the result outside of the port.on function...
I.E:
self.port.on("gotURL", function(callback) {
console.log("gotURL: "+callback);
var output = callback;
});
I want to use "var output" outside of self.port.on("gotURL")
Any ideas what I need to tweak please?
Thanks!
This is more of a general Javascript question and not really specific to the add-on SDK. E.g. here is the same thing for AJAX or Node.
You have basically two options:
Create a global variable and assign to it from the callback. E.g.
// will be undefined at first, of course
var output;
self.port.on("gotURL", function(callback) {
console.log("gotURL");
output = callback;
});
setInterval(function() {
console.log("output: " + output);
}, 1000);
The output will look something like this (i.e. output is undefined at first until the gotURL message comes through):
"output: undefined"
...
"output: undefined"
"gotURL"
"output: something"
...
Global variables always have a hackish feeling, so just stick to proper callbacks.
self.port.on("gotURL", function(callback) {
console.log("gotURL");
myOtherAPI.doSomething(callback);
});

I want to load local store in between the server response (making java script code synchronysed)

I have view page where am trying to display all organizations,which is
obtained by a server call..In order to feel the application responsive
, in between the server response I want to load all local store
items.. But server call is always executing first.. The code I
mentioned bellow..
initialize: function()
{
var me = this,
st = Ext.create("Ext.data.Store", {
fields : [ {......................}]});
me.callParent(arguments);
me.setStore(st);
me.on({
show : me.onShow,
scope: me
});
},
onShow:function()
{
var me = this;
Ext.create('Ext.util.DelayedTask',
//call back function ,purpose : delayed exicution
function () {
me.DelShow(function(){
_syncMgr.getOrgGroup(-1,0,5); // servercall
});
}).delay(500);
},
DelShow: function(callback)
{
orgStore = Ext.getStore('Organizations'),
orgStore.load(function(records)
{
var i=0,len = records.length,
for(;i<len;i++)
{
organization = records[i];
regId = organization.get('rg_id');
resStr = organization.Resources();
resStr.load({callback:function(resorces)
{
var i = 0,rlen =resorces.length,
obj = {},
obj.rg_id = str.boundTo.get('rg_id');
}
orgViStr.add([obj]);
});
}
});
me.lodorg(callback);
},
lodorg:function(callback)
{
callback();
console.log("I don't know why this call back works first....");
console.log("plz help me to work last....");
}
Call You callback method after You load data from local store .
What you want to do is add the content from your local store to the list that is currently set in the view then refresh the list.
var localStore = something.getStore(),
entries = [];
localStore.each(function(entry){
entries.push(entry.copy);
});
//me is the list that is set in the view
me.add(entries);
me.deselectAll();
me.refresh();
Hope that helps :)

Strange setTimeout clearing behaviour with jQuery AJAX

So, I'm trying to set a timeout on each request that is sent and work out if one is taking "too long". I'm watching the network tab and each request is well under 300ms, however 'too long' gets logged 6 times! (the number of requests I'm sending). Is there something I'm doing wrong with variables, setTimeouts or something?
var ajaxMonitor = {};
function timingStart() {
var url = arguments[2].url;
ajaxMonitor[url] = {};
ajaxMonitor[url].timer = setTimeout(function () {
console.log('too long');
}, 300);
}
function timingEnd() {
var url = arguments[2].url;
clearTimeout(ajaxMonitor[url].timer);
}
$(document).ajaxSend(timingStart);
$(document).ajaxComplete(timingEnd);
As pointed out in the comment it might be because you are calling the same url multiple times. If that is the case, one way to fix that problem is to clear the interval before setting it:
function timingStart() {
var url = arguments[2].url;
clear(url);
ajaxMonitor[url] = {};
ajaxMonitor[url].timer = setTimeout(function () {
console.log('too long');
}, 300);
}
function timingEnd() {
var url = arguments[2].url;
clear(url);
}
function clear(url) {
if(ajaxMonitor[url])
clearTimeout(ajaxMonitor[url].timer);
}
$(document).ajaxSend(timingStart);
$(document).ajaxComplete(timingEnd);

Categories