How to follow all links in CasperJS? - javascript

I'm having trouble clicking all JavaScript based links in a DOM and saving the
output. The links have the form
<a id="html" href="javascript:void(0);" onclick="goToHtml();">HTML</a>
the following code works great:
var casper = require('casper').create();
var fs = require('fs');
var firstUrl = 'http://www.testurl.com/test.html';
var css_selector = '#jan_html';
casper.start(firstUrl);
casper.thenClick(css_selector, function(){
console.log("whoop");
});
casper.waitFor(function check() {
return this.getCurrentUrl() != firstUrl;
}, function then() {
console.log(this.getCurrentUrl());
var file_title = this.getTitle().split(' ').join('_') + '.html';
fs.write(file_title, this.getPageContent());
});
casper.run();
However, how can I get this to work with a selector of "a", clicking all
available links and saving content? I'm not sure how to get the clickWhileSelector to remove nodes from the selector as is done here: Click on all links matching a selector

I have this script that first will get all links from a page then save 'href' attributes to an array, then will iterate over this array and then open each link one by one and echo the url :
var casper = require('casper').create({
logLevel:"verbose",
debug:true
});
var links;
casper.start('http://localhost:8000');
casper.then(function getLinks(){
links = this.evaluate(function(){
var links = document.getElementsByTagName('a');
links = Array.prototype.map.call(links,function(link){
return link.getAttribute('href');
});
return links;
});
});
casper.then(function(){
this.each(links,function(self,link){
self.thenOpen(link,function(a){
this.echo(this.getCurrentUrl());
});
});
});
casper.run(function(){
this.exit();
});

rusln's answer works great if all the links have a meaningful href attribute (actual URL). If you want to click every a that also triggers a javascript function, you may need to iterate some other way over the elements.
I propose using the XPath generator from stijn de ryck for an element.
You can then sample all XPaths that are on the page.
Then you open the page for every a that you have the XPath for and click it by XPath.
Wait a little if it is a single page application
Do something
var startURL = 'http://localhost:8000',
xPaths
x = require('casper').selectXPath;
casper.start(startURL);
casper.then(function getLinks(){
xPaths = this.evaluate(function(){
// copied from https://stackoverflow.com/a/5178132/1816580
function createXPathFromElement(elm) {
var allNodes = document.getElementsByTagName('*');
for (var segs = []; elm && elm.nodeType == 1; elm = elm.parentNode) {
if (elm.hasAttribute('id')) {
var uniqueIdCount = 0;
for (var n=0;n < allNodes.length;n++) {
if (allNodes[n].hasAttribute('id') && allNodes[n].id == elm.id) uniqueIdCount++;
if (uniqueIdCount > 1) break;
};
if ( uniqueIdCount == 1) {
segs.unshift('id("' + elm.getAttribute('id') + '")');
return segs.join('/');
} else {
segs.unshift(elm.localName.toLowerCase() + '[#id="' + elm.getAttribute('id') + '"]');
}
} else if (elm.hasAttribute('class')) {
segs.unshift(elm.localName.toLowerCase() + '[#class="' + elm.getAttribute('class') + '"]');
} else {
for (i = 1, sib = elm.previousSibling; sib; sib = sib.previousSibling) {
if (sib.localName == elm.localName) i++; };
segs.unshift(elm.localName.toLowerCase() + '[' + i + ']');
};
};
return segs.length ? '/' + segs.join('/') : null;
};
var links = document.getElementsByTagName('a');
var xPaths = Array.prototype.map.call(links, createXPathFromElement);
return xPaths;
});
});
casper.then(function(){
this.each(xPaths, function(self, xpath){
self.thenOpen(startURL);
self.thenClick(x(xpath));
// waiting some time may be necessary for single page applications
self.wait(1000);
self.then(function(a){
// do something meaningful here
this.echo(this.getCurrentUrl());
});
// Uncomment the following line in case each click opens a new page instead of staying at the same page
//self.back()
});
});
casper.run(function(){
this.exit();
});

Related

Looping through array and clicking each link via CasperJS [duplicate]

I'm having trouble clicking all JavaScript based links in a DOM and saving the
output. The links have the form
<a id="html" href="javascript:void(0);" onclick="goToHtml();">HTML</a>
the following code works great:
var casper = require('casper').create();
var fs = require('fs');
var firstUrl = 'http://www.testurl.com/test.html';
var css_selector = '#jan_html';
casper.start(firstUrl);
casper.thenClick(css_selector, function(){
console.log("whoop");
});
casper.waitFor(function check() {
return this.getCurrentUrl() != firstUrl;
}, function then() {
console.log(this.getCurrentUrl());
var file_title = this.getTitle().split(' ').join('_') + '.html';
fs.write(file_title, this.getPageContent());
});
casper.run();
However, how can I get this to work with a selector of "a", clicking all
available links and saving content? I'm not sure how to get the clickWhileSelector to remove nodes from the selector as is done here: Click on all links matching a selector
I have this script that first will get all links from a page then save 'href' attributes to an array, then will iterate over this array and then open each link one by one and echo the url :
var casper = require('casper').create({
logLevel:"verbose",
debug:true
});
var links;
casper.start('http://localhost:8000');
casper.then(function getLinks(){
links = this.evaluate(function(){
var links = document.getElementsByTagName('a');
links = Array.prototype.map.call(links,function(link){
return link.getAttribute('href');
});
return links;
});
});
casper.then(function(){
this.each(links,function(self,link){
self.thenOpen(link,function(a){
this.echo(this.getCurrentUrl());
});
});
});
casper.run(function(){
this.exit();
});
rusln's answer works great if all the links have a meaningful href attribute (actual URL). If you want to click every a that also triggers a javascript function, you may need to iterate some other way over the elements.
I propose using the XPath generator from stijn de ryck for an element.
You can then sample all XPaths that are on the page.
Then you open the page for every a that you have the XPath for and click it by XPath.
Wait a little if it is a single page application
Do something
var startURL = 'http://localhost:8000',
xPaths
x = require('casper').selectXPath;
casper.start(startURL);
casper.then(function getLinks(){
xPaths = this.evaluate(function(){
// copied from https://stackoverflow.com/a/5178132/1816580
function createXPathFromElement(elm) {
var allNodes = document.getElementsByTagName('*');
for (var segs = []; elm && elm.nodeType == 1; elm = elm.parentNode) {
if (elm.hasAttribute('id')) {
var uniqueIdCount = 0;
for (var n=0;n < allNodes.length;n++) {
if (allNodes[n].hasAttribute('id') && allNodes[n].id == elm.id) uniqueIdCount++;
if (uniqueIdCount > 1) break;
};
if ( uniqueIdCount == 1) {
segs.unshift('id("' + elm.getAttribute('id') + '")');
return segs.join('/');
} else {
segs.unshift(elm.localName.toLowerCase() + '[#id="' + elm.getAttribute('id') + '"]');
}
} else if (elm.hasAttribute('class')) {
segs.unshift(elm.localName.toLowerCase() + '[#class="' + elm.getAttribute('class') + '"]');
} else {
for (i = 1, sib = elm.previousSibling; sib; sib = sib.previousSibling) {
if (sib.localName == elm.localName) i++; };
segs.unshift(elm.localName.toLowerCase() + '[' + i + ']');
};
};
return segs.length ? '/' + segs.join('/') : null;
};
var links = document.getElementsByTagName('a');
var xPaths = Array.prototype.map.call(links, createXPathFromElement);
return xPaths;
});
});
casper.then(function(){
this.each(xPaths, function(self, xpath){
self.thenOpen(startURL);
self.thenClick(x(xpath));
// waiting some time may be necessary for single page applications
self.wait(1000);
self.then(function(a){
// do something meaningful here
this.echo(this.getCurrentUrl());
});
// Uncomment the following line in case each click opens a new page instead of staying at the same page
//self.back()
});
});
casper.run(function(){
this.exit();
});

CasperJS loop or iterate through multiple web pages?

I have a CasperJS script that scrapes ratings and dates from one webpage. Now I want to scrape the same data from multiple pages under the same website. How can I loop through the different subpages given this code:
var ratings = [];
var dates = [];
var casper = require('casper').create({
pageSettings: {
loadImages: false,
loadPlugins: false
},
logLevel: "debug",
verbose: true
});
var fs = require('fs');
function getRatings() {
var ratings = document.querySelectorAll('#BVRRRatingOverall_Review_Display > div.BVRRRatingNormalImage > img');
return Array.prototype.map.call(ratings, function(e) {
return e.getAttribute('title');
});
}
function getDate() {
var dates = document.querySelectorAll('#BVSubmissionPopupContainer > div.BVRRReviewDisplayStyle5Header > div.BVRRReviewDateContainer > span.BVRRValue.BVRRReviewDate');
return Array.prototype.map.call(dates, function(e) {
return e.innerHTML;
});
}
casper.start('http://www.t-mobile.com/cell-phones/samsung-galaxy-s-5.html?bvrrp=9060/reviews/product/1/598aea53-16d0-4c12-b53a-105157092c52.htm', function(){
this.echo('hi');
});
casper.then(function() {
ratings = this.evaluate(getRatings);
dates = this.evaluate(getDate);
this.echo(ratings);
});
casper.run(function() {
this.echo(ratings.length + ' ratings found:');
for(var i=0; i<ratings.length; i++){
ratings[i] = ratings[i]+': '+dates[i];
dates[i] = '';
}
this.echo(ratings);
var content = ratings;
content = content.join("\n");
fs.write("C:/Users/Karan/Copy/tweesis/implementation/scraping/samsungratings.txt", content, 'w');
this.echo(dates.length + ' dates found:').exit();
});
Any help is appreciated :)
Since there exists a next page button, you can use it to traverse all pages recursively:
function getRatingsAndWrite(){
ratings = casper.evaluate(getRatings);
dates = casper.evaluate(getDate);
casper.echo(ratings);
casper.echo(ratings.length + ' ratings found:');
for(var i=0; i<ratings.length; i++){
ratings[i] = ratings[i]+': '+dates[i];
dates[i] = '';
}
casper.echo(ratings);
var content = ratings;
content = content.join("\n");
fs.write("C:/Users/Karan/Copy/tweesis/implementation/scraping/samsungratings.txt", content, 'a');
casper.echo(dates.length + ' dates found:');
var nextLink = ".BVRRPageLink.BVRRNextPage > a";
if (casper.visible(nextLink)) {
casper.thenClick(nextLink);
casper.then(getRatingsAndWrite);
} else {
casper.echo("END")
}
}
casper.start('http://www.t-mobile.com/cell-phones/samsung-galaxy-s-5.html?bvrrp=9060/reviews/product/1/598aea53-16d0-4c12-b53a-105157092c52.htm');
casper.then(getRatingsAndWrite);
casper.run();
A related answer is A: CasperJS parse next page after button click.
This code can help you :
you define in an array of objects the wanted urls, selectors for each page and in a loop you do what you want to do with these properties.
You can use a click method in the loop instead of url too.
var navigation = [
{
url: 'http://www.t-mobile.com/cell-phones/samsung-galaxy-s-5.html?bvrrp=9060/reviews/product/1/598aea53-16d0-4c12-b53a-105157092c52.htm',
selectorRatings:'#BVRRRatingOverall_Review_Display > div.BVRRRatingNormalImage > img', selectorDate :'#BVSubmissionPopupContainer > div.BVRRReviewDisplayStyle5Header > div.BVRRReviewDateContainer > span.BVRRValue.BVRRReviewDate'
}
,{
url: 'yourSecondUrl, etc...',
selectorRatings:'#BVRRRatingOverall_Review_Display > div.BVRRRatingNormalImage > img',
selectorDate :'#BVSubmissionPopupContainer > div.BVRRReviewDisplayStyle5Header > div.BVRRReviewDateContainer > span.BVRRValue.BVRRReviewDate'
}
],
content = "";
casper.start()
.then(function(){
//loop on the array
navigation.forEach(function(navIndex){
//open url : property url
casper.thenOpen(navIndex.url)
//wait for the page to load -> must be useless because thenOpen() do it
.waitForUrl(navIndex.url, function(){
//get the value of attribute title of adequate selector
var ratings = this.getElementAttribute(navIndex.selectorRatings, 'title'),
//get the HTML of adequate selector
var dates = this.getHTML(navIndex.selectorDates);
this.echo(ratings);
this.echo(dates);
content = content + ' ' + ratings + ' ' + dates;
});
});
})
.run(function() {
this.echo('----------- All steps done ------------\n');
this.exit();
});
Thanks Fanch and Artjom B. Both of your answers rendered the working solution. I used the recursive walk through the 'next' pages on the pagination as given by Artjom B. Next, I added a wait() function to make sure the next ratings page was loaded before scraping them. Without this wait() function, we scrape the same page multiple times between the instant that 'next' is clicked and the resp. next page is done loading. See the working code below:
var ratings = [];
var dates = [];
var casper = require('casper').create({
pageSettings: {
loadImages: false,
loadPlugins: false
},
logLevel: "debug",
verbose: true
});
var fs = require('fs');
function getRatings() {
var ratings = document.querySelectorAll('#BVRRRatingOverall_Review_Display > div.BVRRRatingNormalImage > img');
return Array.prototype.map.call(ratings, function(e) {
return e.getAttribute('title');
});
}
function getDate() {
var dates = document.querySelectorAll('#BVSubmissionPopupContainer > div.BVRRReviewDisplayStyle5Header > div.BVRRReviewDateContainer > span.BVRRValue.BVRRReviewDate');
return Array.prototype.map.call(dates, function(e) {
return e.innerHTML;
});
}
function getRatingsAndWrite(){
ratings = casper.evaluate(getRatings);
dates = casper.evaluate(getDate);
casper.echo(ratings.length + ' ratings found:');
for(var i=0; i<ratings.length; i++){
var rating = ratings[i].substr(0,1);
ratings[i] = rating +': '+dates[i];
dates[i] = '';
}
var content = ratings;
content = content.join("\n");
fs.write("<filepath to write content>", content, 'a');
casper.echo(dates.length + ' dates found:');
var nextLink = ".BVRRPageLink.BVRRNextPage > a";
if (casper.visible(nextLink)) {
casper.thenClick(nextLink);
casper.wait(3000);
casper.then(getRatingsAndWrite);
} else {
casper.echo("END")
}
}
casper.start('http://www.t-mobile.com/cell-phones/htc-one-m8.html');
casper.then(getRatingsAndWrite);
casper.run();

.load not working in IE8

I have been looking into this issue for the past few days and cannot figure it out. The code below, searches an external file for content based off current page class, then loads content into any matching ID's on the page. It works in Chrome, Firefox, IE9 but recently stopped working in IE8 and I cannot figure out why. Any thoughts would be much appreciated.
HTML looks like this
<body class="jms">
<div id="mainHomeContent" class="shared"></div>
</body>
jquery running on ready
$("div.shared").each(function(){
var Body = $(document).find("body");
var contentID = ("#" + $(this).attr("id"));
var pathname = ""
if(Body.hasClass("pigman")){
var pathname = "/dev/jmsracing/content/pigman/shared-content-include.html"
} else if(Body.hasClass("marion-arts")){
var pathname = "/dev/jmsracing/content/marion-arts/shared-content-include.html"
} else if(Body.hasClass("jms")){
var pathname = "/dev/jmsracing/content/jms/shared-content-include.html"
alert('hello');
}
$(contentID).load(pathname + " " + contentID);
});
What i think is he is iterating with same id where ie is very strict about it so this should be the solution:
$(function() {
var Body = $(document).find("body");
var contentID = ("#" + $(this).attr("id"));
var pathname = ""
if (Body.hasClass("pigman")) {
var pathname = "/dev/jmsracing/content/pigman/shared-content-include.html"
} else if (Body.hasClass("marion-arts")) {
var pathname = "/dev/jmsracing/content/marion-arts/shared-content-include.html"
} else if (Body.hasClass("jms")) {
var pathname = "/dev/jmsracing/content/jms/shared-content-include.html"
alert('hello');
}
$(contentID).load(pathname + " " + contentID);
});​
Try this:
$("div.shared").each(function () {
//combined into one var statement...not really necessary.
var $body = $("body"),
contentId = "#" + $(this).attr("id"),
pathname = "";
//you've declared pathname above no need for "var" each time below
//also added missing semi colons
if ($body.hasClass("pigman")) {
pathname = "/dev/jmsracing/content/pigman/shared-content-include.html";
} else if ($body.hasClass("marion-arts")) {
pathname = "/dev/jmsracing/content/marion-arts/shared-content-include.html";
} else if ($body.hasClass("jms")) {
pathname = "/dev/jmsracing/content/jms/shared-content-include.html";
alert('hello');
}
// $(this) and $(contentId) are the same element
// since you are getting the "id" from "this"
// us $(this) instead
$(this).load(pathname + " " + contentId);
});

Open external links in a new tab without jQuery

What's the best way to open all external links (URLs that don't match the current domain) in a new tab using JavaScript, without using jQuery?
Here's the jQuery I'm current using:
// Open external links in new tab
$('a[href^=http]').click(function () {
var a = new RegExp('/' + window.location.host + '/');
if (!a.test(this.href)) {
window.open(this.href);
return false;
}
});
Pure JS:
function externalLinks() {
for(var c = document.getElementsByTagName("a"), a = 0;a < c.length;a++) {
var b = c[a];
b.getAttribute("href") && b.hostname !== location.hostname && (b.target = "_blank")
}
}
;
externalLinks();
The links property returns a collection of all <area> and <a> elements in a document with a value for the href attribute.
var links = document.links;
for(var i = 0; i < links.length; i++) {
if (links[i].hostname != window.location.hostname) {
links[i].target = '_blank';
}
}
https://developer.mozilla.org/en-US/docs/Web/API/Document/links
Add a target="_blank" to the tag. You could do that in the pre-processor (e.g. PHP) or in a JS during the onload event.
$("a[href^=http]").each(function(){
if(this.href.indexOf(location.hostname) == -1) {
$(this).attr({
target: "_blank",
title: "Opens in a new window"
});
}
})
This script should work for you.
UPDATE : try this fiddle http://jsfiddle.net/sameerast/GuT2y/
JS version
var externalLinks = function(){
var anchors = document.getElementsByTagName('a');
var length = anchors.length;
for(var i=0; i<length;i++){
var href = anchor[i].href;
if(href.indexOf('http://sample.com/') || href.indexOf('http://www.sample.com/')){
return;
}
else{
anchor[i].target='_blank';
}
}
};

Get the hash value which was before hashchange

Suppose my html is
One
Two
and Js is
$(window).on("hashchange"){
alert(document.location.hash);
}
I want to get the hash value which was before hash change .Is it Possible?If yes ,How?
use that
$(window).on("hashchange", function(e){
console.log(e.originalEvent.oldURL)
console.log(e.originalEvent.newURL)
})​;
Demo: http://jsbin.com/ulumil/
You have to track the last hash, for example:
var currentHash = function() {
return location.hash.replace(/^#/, '')
}
var last_hash
var hash = currentHash()
$(window).bind('hashchange', function(event){
last_hash = hash
hash = currentHash()
console.log('hash changed from ' + last_hash + ' to ' + hash)
});
Actually the solution provided by Amit works but with jquery library and crossplatform as well.
Here is a more simplified solution using core javascript and crossbrowser as well. (checked with latest version of IE/FF/Chrome/Safari)
window.onhashchange = function(e){
console.log(e);
var oldURL = e.oldURL;
var newURL = e.newURL;
console.log("old url = " + oldURL);
console.log("new url = " + newURL);
var oldHash = oldURL.split("#")[1];
var newHash = newURL.split("#")[1];
console.log(oldHash);
console.log(newHash);
};
Don't use
$(window).on("hashchange", function(e){
console.log(e.originalEvent.oldURL)
console.log(e.originalEvent.newURL)
})​;
It won't work on IE and probably elsewhere too.
Use this rather.
(function(w, $){
var UrlHashMonitor = {};
UrlHashMonitor.oldHash = '';
UrlHashMonitor.newHash = '';
UrlHashMonitor.oldHref = '';
UrlHashMonitor.newHref = '';
UrlHashMonitor.onHashChange = function(f){
$(window).on('hashchange', function(e){
UrlHashMonitor.oldHash = UrlHashMonitor.newHash;
UrlHashMonitor.newHash = w.location.hash;
UrlHashMonitor.oldHref = UrlHashMonitor.newHref;
UrlHashMonitor.newHref = w.location.href;
f(e);
});
};
UrlHashMonitor.init = function(){
UrlHashMonitor.oldHash = UrlHashMonitor.newHash = w.location.hash;
UrlHashMonitor.oldHref = UrlHashMonitor.newHref = w.location.href;
};
w.UrlHashMonitor = UrlHashMonitor;
return UrlHashMonitor;
})(window, window.jQuery);
/*
* USAGE EXAMPLE
*/
UrlHashMonitor.init();
UrlHashMonitor.onHashChange(function(){
console.log('oldHash: ' + UrlHashMonitor.oldHash);
console.log('newHash: ' + UrlHashMonitor.newHash);
console.log('oldHref: ' + UrlHashMonitor.oldHref);
console.log('newHref: ' + UrlHashMonitor.newHref);
//do other stuff
});
This should work in all modern browsers.
DEMO: https://output.jsbin.com/qafupu#one

Categories