CasperJS loop or iterate through multiple web pages? - javascript

I have a CasperJS script that scrapes ratings and dates from one webpage. Now I want to scrape the same data from multiple pages under the same website. How can I loop through the different subpages given this code:
var ratings = [];
var dates = [];
var casper = require('casper').create({
pageSettings: {
loadImages: false,
loadPlugins: false
},
logLevel: "debug",
verbose: true
});
var fs = require('fs');
function getRatings() {
var ratings = document.querySelectorAll('#BVRRRatingOverall_Review_Display > div.BVRRRatingNormalImage > img');
return Array.prototype.map.call(ratings, function(e) {
return e.getAttribute('title');
});
}
function getDate() {
var dates = document.querySelectorAll('#BVSubmissionPopupContainer > div.BVRRReviewDisplayStyle5Header > div.BVRRReviewDateContainer > span.BVRRValue.BVRRReviewDate');
return Array.prototype.map.call(dates, function(e) {
return e.innerHTML;
});
}
casper.start('http://www.t-mobile.com/cell-phones/samsung-galaxy-s-5.html?bvrrp=9060/reviews/product/1/598aea53-16d0-4c12-b53a-105157092c52.htm', function(){
this.echo('hi');
});
casper.then(function() {
ratings = this.evaluate(getRatings);
dates = this.evaluate(getDate);
this.echo(ratings);
});
casper.run(function() {
this.echo(ratings.length + ' ratings found:');
for(var i=0; i<ratings.length; i++){
ratings[i] = ratings[i]+': '+dates[i];
dates[i] = '';
}
this.echo(ratings);
var content = ratings;
content = content.join("\n");
fs.write("C:/Users/Karan/Copy/tweesis/implementation/scraping/samsungratings.txt", content, 'w');
this.echo(dates.length + ' dates found:').exit();
});
Any help is appreciated :)

Since there exists a next page button, you can use it to traverse all pages recursively:
function getRatingsAndWrite(){
ratings = casper.evaluate(getRatings);
dates = casper.evaluate(getDate);
casper.echo(ratings);
casper.echo(ratings.length + ' ratings found:');
for(var i=0; i<ratings.length; i++){
ratings[i] = ratings[i]+': '+dates[i];
dates[i] = '';
}
casper.echo(ratings);
var content = ratings;
content = content.join("\n");
fs.write("C:/Users/Karan/Copy/tweesis/implementation/scraping/samsungratings.txt", content, 'a');
casper.echo(dates.length + ' dates found:');
var nextLink = ".BVRRPageLink.BVRRNextPage > a";
if (casper.visible(nextLink)) {
casper.thenClick(nextLink);
casper.then(getRatingsAndWrite);
} else {
casper.echo("END")
}
}
casper.start('http://www.t-mobile.com/cell-phones/samsung-galaxy-s-5.html?bvrrp=9060/reviews/product/1/598aea53-16d0-4c12-b53a-105157092c52.htm');
casper.then(getRatingsAndWrite);
casper.run();
A related answer is A: CasperJS parse next page after button click.

This code can help you :
you define in an array of objects the wanted urls, selectors for each page and in a loop you do what you want to do with these properties.
You can use a click method in the loop instead of url too.
var navigation = [
{
url: 'http://www.t-mobile.com/cell-phones/samsung-galaxy-s-5.html?bvrrp=9060/reviews/product/1/598aea53-16d0-4c12-b53a-105157092c52.htm',
selectorRatings:'#BVRRRatingOverall_Review_Display > div.BVRRRatingNormalImage > img', selectorDate :'#BVSubmissionPopupContainer > div.BVRRReviewDisplayStyle5Header > div.BVRRReviewDateContainer > span.BVRRValue.BVRRReviewDate'
}
,{
url: 'yourSecondUrl, etc...',
selectorRatings:'#BVRRRatingOverall_Review_Display > div.BVRRRatingNormalImage > img',
selectorDate :'#BVSubmissionPopupContainer > div.BVRRReviewDisplayStyle5Header > div.BVRRReviewDateContainer > span.BVRRValue.BVRRReviewDate'
}
],
content = "";
casper.start()
.then(function(){
//loop on the array
navigation.forEach(function(navIndex){
//open url : property url
casper.thenOpen(navIndex.url)
//wait for the page to load -> must be useless because thenOpen() do it
.waitForUrl(navIndex.url, function(){
//get the value of attribute title of adequate selector
var ratings = this.getElementAttribute(navIndex.selectorRatings, 'title'),
//get the HTML of adequate selector
var dates = this.getHTML(navIndex.selectorDates);
this.echo(ratings);
this.echo(dates);
content = content + ' ' + ratings + ' ' + dates;
});
});
})
.run(function() {
this.echo('----------- All steps done ------------\n');
this.exit();
});

Thanks Fanch and Artjom B. Both of your answers rendered the working solution. I used the recursive walk through the 'next' pages on the pagination as given by Artjom B. Next, I added a wait() function to make sure the next ratings page was loaded before scraping them. Without this wait() function, we scrape the same page multiple times between the instant that 'next' is clicked and the resp. next page is done loading. See the working code below:
var ratings = [];
var dates = [];
var casper = require('casper').create({
pageSettings: {
loadImages: false,
loadPlugins: false
},
logLevel: "debug",
verbose: true
});
var fs = require('fs');
function getRatings() {
var ratings = document.querySelectorAll('#BVRRRatingOverall_Review_Display > div.BVRRRatingNormalImage > img');
return Array.prototype.map.call(ratings, function(e) {
return e.getAttribute('title');
});
}
function getDate() {
var dates = document.querySelectorAll('#BVSubmissionPopupContainer > div.BVRRReviewDisplayStyle5Header > div.BVRRReviewDateContainer > span.BVRRValue.BVRRReviewDate');
return Array.prototype.map.call(dates, function(e) {
return e.innerHTML;
});
}
function getRatingsAndWrite(){
ratings = casper.evaluate(getRatings);
dates = casper.evaluate(getDate);
casper.echo(ratings.length + ' ratings found:');
for(var i=0; i<ratings.length; i++){
var rating = ratings[i].substr(0,1);
ratings[i] = rating +': '+dates[i];
dates[i] = '';
}
var content = ratings;
content = content.join("\n");
fs.write("<filepath to write content>", content, 'a');
casper.echo(dates.length + ' dates found:');
var nextLink = ".BVRRPageLink.BVRRNextPage > a";
if (casper.visible(nextLink)) {
casper.thenClick(nextLink);
casper.wait(3000);
casper.then(getRatingsAndWrite);
} else {
casper.echo("END")
}
}
casper.start('http://www.t-mobile.com/cell-phones/htc-one-m8.html');
casper.then(getRatingsAndWrite);
casper.run();

Related

Why searchbox become lagging when typing word to search data in CSV?

I developed the store locator using open street map and leaflet. The problem is when I want to type in searchbox it will become lagging to finish the word. That store locator read from the CSV file that has 300++ data. Below is the code for the searchbox:
var locationLat = [];
var locationLng = [];
var locMarker;
var infoDiv = document.getElementById('storeinfo');
var infoDivInner = document.getElementById('infoDivInner');
var toggleSearch = document.getElementById('searchIcon');
var hasCircle = 0;
var circle = [];
//close store infor when x is clicked
var userLocation;
$("#infoClose").click(function() {
$("#storeinfo").hide();
if (map.hasLayer(circle)) {
map.removeLayer(circle);
}
});
var listings = document.getElementById('listingDiv');
var stores = L.geoJson().addTo(map);
var storesData = omnivore.csv('assets/data/table_1.csv');
function setActive(el) {
var siblings = listings.getElementsByTagName('div');
for (var i = 0; i < siblings.length; i++) {
siblings[i].className = siblings[i].className
.replace(/active/, '').replace(/\s\s*$/, '');
}
el.className += ' active';
}
function sortGeojson(a,b,prop) {
return (a.properties.name.toUpperCase() < b.properties.name.toUpperCase()) ? -1 : ((a.properties.name.toUpperCase() > b.properties.name.toUpperCase()) ? 1 : 0);
}
storesData.on('ready', function() {
var storesSorted = storesData.toGeoJSON();
//console.log(storesSorted);
var sorted = (storesSorted.features).sort(sortGeojson)
//console.log(sorted);
storesSorted.features = sorted;
//console.log(storesSorted)
stores.addData(storesSorted);
map.fitBounds(stores.getBounds());
toggleSearch.onclick = function() {
//var s = document.getElementById('searchbox');
//if (s.style.display != 'none') {
//s.style.display = 'yes';
//toggleSearch.innerHTML = '<i class="fa fa-search"></i>';
//$("#search-input").val("");
//search.collapse();
//document.getElementById('storeinfo').style.display = 'none';
//$('.item').show();
//} else {
//toggleSearch.innerHTML = '<i class="fa fa-times"></i>';
//s.style.display = 'block';
//attempt to autofocus search input field when opened
//$('#search-input').focus();
//}
};
stores.eachLayer(function(layer) {
//New jquery search
$('#searchbox').on('change paste keyup', function() {
var txt = $('#search-input').val();
$('.item').each(function() {
if ($(this).text().toUpperCase().indexOf(txt.toUpperCase()) != -1) {
$(this).show();
} else {
$(this).hide();
}
});
});
I dont know what is the cause of the lag in the search box. It is something wrong in code or the csv file? Thank you
Every iteration of $('.item').each is causing a layout change because $(this).hide() or $(this).show() causes the item to removed/added to the DOM as the style is set to display:none back and forth. DOM manipulations and the corresponding layout changes are expensive.
You can consider accumulating the changes and doing one batch update to the DOM using a function like appendChild

Looping through array and clicking each link via CasperJS [duplicate]

I'm having trouble clicking all JavaScript based links in a DOM and saving the
output. The links have the form
<a id="html" href="javascript:void(0);" onclick="goToHtml();">HTML</a>
the following code works great:
var casper = require('casper').create();
var fs = require('fs');
var firstUrl = 'http://www.testurl.com/test.html';
var css_selector = '#jan_html';
casper.start(firstUrl);
casper.thenClick(css_selector, function(){
console.log("whoop");
});
casper.waitFor(function check() {
return this.getCurrentUrl() != firstUrl;
}, function then() {
console.log(this.getCurrentUrl());
var file_title = this.getTitle().split(' ').join('_') + '.html';
fs.write(file_title, this.getPageContent());
});
casper.run();
However, how can I get this to work with a selector of "a", clicking all
available links and saving content? I'm not sure how to get the clickWhileSelector to remove nodes from the selector as is done here: Click on all links matching a selector
I have this script that first will get all links from a page then save 'href' attributes to an array, then will iterate over this array and then open each link one by one and echo the url :
var casper = require('casper').create({
logLevel:"verbose",
debug:true
});
var links;
casper.start('http://localhost:8000');
casper.then(function getLinks(){
links = this.evaluate(function(){
var links = document.getElementsByTagName('a');
links = Array.prototype.map.call(links,function(link){
return link.getAttribute('href');
});
return links;
});
});
casper.then(function(){
this.each(links,function(self,link){
self.thenOpen(link,function(a){
this.echo(this.getCurrentUrl());
});
});
});
casper.run(function(){
this.exit();
});
rusln's answer works great if all the links have a meaningful href attribute (actual URL). If you want to click every a that also triggers a javascript function, you may need to iterate some other way over the elements.
I propose using the XPath generator from stijn de ryck for an element.
You can then sample all XPaths that are on the page.
Then you open the page for every a that you have the XPath for and click it by XPath.
Wait a little if it is a single page application
Do something
var startURL = 'http://localhost:8000',
xPaths
x = require('casper').selectXPath;
casper.start(startURL);
casper.then(function getLinks(){
xPaths = this.evaluate(function(){
// copied from https://stackoverflow.com/a/5178132/1816580
function createXPathFromElement(elm) {
var allNodes = document.getElementsByTagName('*');
for (var segs = []; elm && elm.nodeType == 1; elm = elm.parentNode) {
if (elm.hasAttribute('id')) {
var uniqueIdCount = 0;
for (var n=0;n < allNodes.length;n++) {
if (allNodes[n].hasAttribute('id') && allNodes[n].id == elm.id) uniqueIdCount++;
if (uniqueIdCount > 1) break;
};
if ( uniqueIdCount == 1) {
segs.unshift('id("' + elm.getAttribute('id') + '")');
return segs.join('/');
} else {
segs.unshift(elm.localName.toLowerCase() + '[#id="' + elm.getAttribute('id') + '"]');
}
} else if (elm.hasAttribute('class')) {
segs.unshift(elm.localName.toLowerCase() + '[#class="' + elm.getAttribute('class') + '"]');
} else {
for (i = 1, sib = elm.previousSibling; sib; sib = sib.previousSibling) {
if (sib.localName == elm.localName) i++; };
segs.unshift(elm.localName.toLowerCase() + '[' + i + ']');
};
};
return segs.length ? '/' + segs.join('/') : null;
};
var links = document.getElementsByTagName('a');
var xPaths = Array.prototype.map.call(links, createXPathFromElement);
return xPaths;
});
});
casper.then(function(){
this.each(xPaths, function(self, xpath){
self.thenOpen(startURL);
self.thenClick(x(xpath));
// waiting some time may be necessary for single page applications
self.wait(1000);
self.then(function(a){
// do something meaningful here
this.echo(this.getCurrentUrl());
});
// Uncomment the following line in case each click opens a new page instead of staying at the same page
//self.back()
});
});
casper.run(function(){
this.exit();
});

For loop variable undefined in Javascript

I'm working on implementing a system where elements can be dragged and dropped to create flowcharts. My Issue is with saving the flowchart so that it could be reloaded when needed. For now I've created a method that saves all the previous data of the element onto the final array that holds only elements that are dropped on the container. But I'm getting a Trivial Error as Undefined variable on the debugging interface. Hence I'm not getting the intended output and the alert messages that I included are not being printed when the condition is met.
Code in Context
function saveFlowchart()
{
var nodes = [];
var matches = [];
var searchEles = document.getElementById("container").children;
for(var i = 0; i < searchEles.length; i++)
{
matches.push(searchEles[i]);
var idOfEl = searchEles[i].id;
if(searchEles[i].id !=null || searchEles[i].id !="")
{
var $element = $("#" + searchEles[i].id);
var dropElem = $("#" + searchEles[i].id).attr('class');
var position = $element.position();
position.bottom = position.top + $element.height();
position.right = position.left + $element.width();
alert("class:"+dropElem+"\nTop position: " + position.top + "\nLeft position: " + position.left + "\nBottom position: " + position.bottom + "\nRight position: " + position.right);
finalArray[idOfEl-1][0]= idOfEl;
finalArray[idOfEl-1][1]= dropElem;
var elId = parseInt(idOfEl);
if (dropElem == "stream ui-draggable")
{
for(var count=0;count<100;count++)
{
alert("One loop opened with count="+count);
if(createdImportStreamArray[count][0]==elId)
{
finalArray[elId-1][2]= createdImportStreamArray[count][1]; //Selected Stream from Predefined Streams
finalArray[elId-1][3]= createdImportStreamArray[count][2]; //asName
alert("createdImportStreamArray[count][0]==elId");
}
else if(createdExportStreamArray[count][0]==elId)
{
finalArray[elId-1][2]= createdExportStreamArray[count][1]; //Selected Stream from Predefined Streams
finalArray[elId-1][3]= createdExportStreamArray[count][2]; //asName
}
else if(createdDefinedStreamArray[count][0]==elId)
{
finalArray[elId-1][2]= createdDefinedStreamArray[count][1]; //Stream Name
finalArray[elId-1][3]= createdDefinedStreamArray[count][4]; //Number of Attributes
finalArray[elId-1][4]=[];
for(var f=0;f<createdDefinedStreamArray[r][4];f++)
{
finalArray[elId-1][4][f][0]=createdDefinedStreamArray[count][2][f][0]; //Attribute Name
finalArray[elId-1][4][f][1]=createdDefinedStreamArray[count][2][f][1]; // Attribute Type
}
}
alert("One loop closed with count="+count);
}
alert("Loop ended with count="+count);
}
else if (dropElem == "wstream ui-draggable")
{
ElementType="wstream";
}
// else if conditions...
alert(finalArray);
}
}
//Code to output the connection details in a json format
//The following is not affected by the above mentioned error
$(".node").each(function (idx, elem) {
var $elem = $(elem);
var endpoints = jsPlumb.getEndpoints($elem.attr('id'));
console.log('endpoints of '+$elem.attr('id'));
console.log(endpoints);
nodes.push({
blockId: $elem.attr('id'),
nodetype: $elem.attr('data-nodetype'),
positionX: parseInt($elem.css("left"), 10),
positionY: parseInt($elem.css("top"), 10)
});
});
var connections = [];
$.each(jsPlumb.getConnections(), function (idx, connection) {
connections.push({
connectionId: connection.id,
pageSourceId: connection.sourceId,
pageTargetId: connection.targetId
});
});
var flowChart = {};
flowChart.nodes = nodes;
flowChart.connections = connections;
flowChart.numberOfElements = numberOfElements;
var flowChartJson = JSON.stringify(flowChart);
//console.log(flowChartJson);
$('#jsonOutput').val(flowChartJson);
}
Debugging Interface
According to this the count variable in the for loop is undefined. I've tried taking the first statement var count=0 outside the loop declaration part and defining it in the very beginning of the method. But that simply checks for count=0 against the conditions and doesn't increment at all.
Any help in this regard will be highly appreciated as I've been working on this minor error for almost 2 days now.

How to follow all links in CasperJS?

I'm having trouble clicking all JavaScript based links in a DOM and saving the
output. The links have the form
<a id="html" href="javascript:void(0);" onclick="goToHtml();">HTML</a>
the following code works great:
var casper = require('casper').create();
var fs = require('fs');
var firstUrl = 'http://www.testurl.com/test.html';
var css_selector = '#jan_html';
casper.start(firstUrl);
casper.thenClick(css_selector, function(){
console.log("whoop");
});
casper.waitFor(function check() {
return this.getCurrentUrl() != firstUrl;
}, function then() {
console.log(this.getCurrentUrl());
var file_title = this.getTitle().split(' ').join('_') + '.html';
fs.write(file_title, this.getPageContent());
});
casper.run();
However, how can I get this to work with a selector of "a", clicking all
available links and saving content? I'm not sure how to get the clickWhileSelector to remove nodes from the selector as is done here: Click on all links matching a selector
I have this script that first will get all links from a page then save 'href' attributes to an array, then will iterate over this array and then open each link one by one and echo the url :
var casper = require('casper').create({
logLevel:"verbose",
debug:true
});
var links;
casper.start('http://localhost:8000');
casper.then(function getLinks(){
links = this.evaluate(function(){
var links = document.getElementsByTagName('a');
links = Array.prototype.map.call(links,function(link){
return link.getAttribute('href');
});
return links;
});
});
casper.then(function(){
this.each(links,function(self,link){
self.thenOpen(link,function(a){
this.echo(this.getCurrentUrl());
});
});
});
casper.run(function(){
this.exit();
});
rusln's answer works great if all the links have a meaningful href attribute (actual URL). If you want to click every a that also triggers a javascript function, you may need to iterate some other way over the elements.
I propose using the XPath generator from stijn de ryck for an element.
You can then sample all XPaths that are on the page.
Then you open the page for every a that you have the XPath for and click it by XPath.
Wait a little if it is a single page application
Do something
var startURL = 'http://localhost:8000',
xPaths
x = require('casper').selectXPath;
casper.start(startURL);
casper.then(function getLinks(){
xPaths = this.evaluate(function(){
// copied from https://stackoverflow.com/a/5178132/1816580
function createXPathFromElement(elm) {
var allNodes = document.getElementsByTagName('*');
for (var segs = []; elm && elm.nodeType == 1; elm = elm.parentNode) {
if (elm.hasAttribute('id')) {
var uniqueIdCount = 0;
for (var n=0;n < allNodes.length;n++) {
if (allNodes[n].hasAttribute('id') && allNodes[n].id == elm.id) uniqueIdCount++;
if (uniqueIdCount > 1) break;
};
if ( uniqueIdCount == 1) {
segs.unshift('id("' + elm.getAttribute('id') + '")');
return segs.join('/');
} else {
segs.unshift(elm.localName.toLowerCase() + '[#id="' + elm.getAttribute('id') + '"]');
}
} else if (elm.hasAttribute('class')) {
segs.unshift(elm.localName.toLowerCase() + '[#class="' + elm.getAttribute('class') + '"]');
} else {
for (i = 1, sib = elm.previousSibling; sib; sib = sib.previousSibling) {
if (sib.localName == elm.localName) i++; };
segs.unshift(elm.localName.toLowerCase() + '[' + i + ']');
};
};
return segs.length ? '/' + segs.join('/') : null;
};
var links = document.getElementsByTagName('a');
var xPaths = Array.prototype.map.call(links, createXPathFromElement);
return xPaths;
});
});
casper.then(function(){
this.each(xPaths, function(self, xpath){
self.thenOpen(startURL);
self.thenClick(x(xpath));
// waiting some time may be necessary for single page applications
self.wait(1000);
self.then(function(a){
// do something meaningful here
this.echo(this.getCurrentUrl());
});
// Uncomment the following line in case each click opens a new page instead of staying at the same page
//self.back()
});
});
casper.run(function(){
this.exit();
});

Full url sharepoint item in Javascript

I'm trying to get the ContentTypeId of an item in sharepoint to get the full url of the item to get the binary of it and after send it to another plateform.
So here i put this code in element.xml to get the list ID and the document ids of the items i'm selecting, after this i send them to an ASPX page in a Sharepoint Dialog to define the destination of the items and after this in the postback, stream the binary and send it to the another platform. The problem is : To get the full url of my items i need ListId, ItemId and ContentTypeId.
Because i've found a code to stream the binary here :
How to Programatically Download files from sharepoint document library
And i need the full url of my items.
Any idea?
thanks
var iddocs ='';
var listId ='';
function geturl()
{
var context = SP.ClientContext.get_current();
this.web = context.get_web();
listId = SP.ListOperation.Selection.getSelectedList();
var list = this.web.get_lists().getById(listId);
var ok = false;
try
{
if ( SP.ListOperation.Selection.getSelectedItems(context) !== false)
{
var items = SP.ListOperation.Selection.getSelectedItems(context);
var url='listId:'+listId+ ' Number of selected items: ' + items.length ;
var i = 0;
if(items.length==0)
{
}else{
while( i != items.length )
{
url += ' Doc' + i + ': ' + items[i].id;
if(i>0){iddocs += '-'};
iddocs += items[i].id;
i++;
};
ok = true;
alert(url+' Id of clicked item:'+{ItemId});
};
};
}
catch(err)
{
};
return ok;
};
function OpenDialog(pidliste) {
var options = SP.UI.$create_DialogOptions();
options.width = 600;
options.height = 600;
options.title = 'Envoyer vers Nuxeo';
options.url ='/_Layouts/SPTest.CustomMenuItem/index.aspx?click={ItemId}';
if(pidliste){options.url += '&ids='+pidliste +'-'+ iddocs;};
options.dialogReturnValueCallback = Function.createDelegate(null, CloseCallback);
SP.UI.ModalDialog.showModalDialog(options);
}
function CloseCallback(result, target) {
if (result == SP.UI.DialogResult.OK) {
}
if (result == SP.UI.DialogResult.cancel) {
SP.UI.Notify.addNotification('Opération canceled', false, '', null);
}
}
if(geturl())
{
OpenDialog(listId);
}else{
alert('Please select an item');
};
I've found the solution. In fact, items can be reached via :
{SiteUrl}+{ItemUrl}
The download function is linked in my first Post. But it doesn't work for multiple items, with this method you can only reach the properties of the item you're selecting.
You have to note that if you want to access to a SP file, you have to set your request.credential via :
request.Credentials = System.Net.CredentialCache.DefaultCredentials;
which will take the current credential you're using.
Hope it helps.

Categories