First, here is my code as it stands so far:
var http = require("http");

// Utility function that downloads a URL and invokes
// callback with the data.
function download(url, callback) {
  http.get(url, function(res) {
    var data = "";
    res.on('data', function(chunk) {
      data += chunk;
    });
    res.on("end", function() {
      callback(data);
    });
  }).on("error", function() {
    callback(null);
  });
}
var cheerio = require("cheerio");
var url = "http://www.bloglovin.com/en/blogs/1/2/all";
var myArray = [];
var a = 0;
var getLinks = function() {
  download(url, function(data) {
    if (data) {
      // console.log(data);
      var $ = cheerio.load(data);
      $(".content").each(function(i, e) {
        var blogName = $(e).find(".blog-name").text();
        var followLink = $(e).find("a").attr("href");
        var blogSite = $(e).find(".description").text();
        myArray[a] = [a];
        myArray[a]["blogName"] = blogName;
        myArray[a]["followLink"] = "http://www.bloglovin.com" + followLink;
        myArray[a]["blogSite"] = blogSite;
        a++;
        console.log(myArray);
      });
    }
  });
};
getLinks();
As you can see, followLink is concatenated with the base URL to form the full follow URL, which I'd like to pass back into the download function, so that I'm effectively scraping each of those pages with the same CSS rules and adding the results to the multidimensional array for the corresponding blogger.
How can I go about this?
I do something similar in one of my scraping jobs, but I use the async.js library to accomplish it. Note that I'm also using the request module and cheerio.js in my scraping. I fetch and scrape rows of data from a single webpage, but I suspect you could do something similar to fetch your URLs and request/scrape them in the same manner.
I admit this is quite basic code and could certainly be optimized with a bit of refactoring, but I hope it gives you some ideas at least...
First, I use request to fetch the page and call my parse function -
var request = require('request');
var cheerio = require('cheerio');
var async = require('async');

var url = 'http://www.target-website.com';

// task function handed to async; lastCallback signals that the whole job is done
function fetchPage(lastCallback) {
  request(url, function(err, resp, body) {
    if (!err) { parsePage(err, resp, body, lastCallback); }
    else { console.log('web request error: ' + err); }
  });
}
Next, in my parsePage function, I load the website into Cheerio, fetch the HTML of each data row into an array, push my parseRow function and each HTML segment into another array, and use async.parallel to process each iteration -
var rows = [];

function parsePage(err, resp, body, callback1) {
  var $ = cheerio.load(body);
  $('div#targetTable tr').each(function(i, elem) {
    rows.push($(this).html());
  });

  var scrRows = [];
  rows.forEach(function(row) {
    scrRows.push(function(callback2) {
      parseRow(err, resp, row);
      callback2();
    });
  });

  async.parallel(scrRows, function() {
    callback1();
  });
}
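Applied to your follow links, a rough (untested) sketch might look like this, reusing your download() helper and cheerio. It assumes each entry in your array is an object with a followLink property, and ".blog-post" is just a placeholder selector since I don't know the markup of the linked pages:

var async = require("async");

// entries: array of { blogName, followLink, blogSite } objects
// done: called once every follow link has been downloaded and scraped
function scrapeFollowLinks(entries, done) {
  var tasks = entries.map(function(entry) {
    return function(callback) {
      download(entry.followLink, function(data) {
        if (data) {
          var $ = cheerio.load(data);
          // store whatever you scrape back onto the same entry
          entry.posts = $(".blog-post").map(function(i, e) {
            return $(e).text();
          }).get();
        }
        callback();
      });
    };
  });

  async.parallel(tasks, function() {
    done(entries);
  });
}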
Inside your loop, just create an object with the properties you scrape, then push that object onto your array.
var blogInfo = {
  blogName: blogName,
  followLink: "http://www.bloglovin.com" + followLink,
  blogSite: blogSite
};

myArray.push(blogInfo);
You have defined a = 0; so
myArray[a] = [a]; // => myArray[0] = [0]; myArray[0] becomes an array with 0 as its only member
The statements below don't do what you want: arrays are meant to be indexed by integers, and these named properties are ignored by JSON.stringify, by the array's length, and by normal array iteration.
myArray[a]["blogName"] = blogName;
myArray[a]["followLink"] = "http://www.bloglovin.com" + followLink;
myArray[a]["blogSite"] = blogSite;
Instead try this:
var obj = {
  index: a,
  blogName: blogName,
  followLink: "http://www.bloglovin.com" + followLink,
  blogSite: blogSite
};

myArray.push(obj);
console.log(myArray);
Related
I scraped a website in a Node.js environment, parsed the data into JSON format, and saved the JSON data to a file named output.json.
But instead of creating a different key (with a different array) for each value, the whole data set gets stored under a single key. I used the "each" function to loop over the records and store them in different arrays, but that doesn't seem to work.
What could be the problem?
Here is the code:
var cheerio = require('cheerio');
var request = require('request');
var fs = require('fs');
var allJSONdata = [];
request('http://www.bseindia.com/corporates/ann.aspx', function (error, response, html) {
  if (!error && response.statusCode == 200) {
    var $ = cheerio.load(html);
    var allRecords = $('div.content');
    allRecords.each(function(index, element) {
      var title = $(element).find('td.TTHeadergrey').text();
      var tempData = {
        Header: title
      };
      allJSONdata.push(tempData);
    });
  }
  fs.writeFile('output.json', JSON.stringify(allJSONdata), function(err) {
    console.log('successfully saved');
  });
});
And my output.json data is following:
[{"Header":" /*all heads in this single key*/ "}]
You are not using the right selectors, so... Change this:
var allRecords = $('div.content');
allRecords.each(function(index, element) {
  var title = $(element).find('td.TTHeadergrey').text();
  var tempData = {
    Header: title
  };
  allJSONdata.push(tempData);
});
to this:
var allRecords = $('.content .TTHeadergrey');
allRecords.each(function(index, element) {
  if ((index % 5) === 0 || (index % 5) === 1) {
    var title = $(element).text();
    var tempData = {
      Header: title
    };
    allJSONdata.push(tempData);
  }
});
I'm trying to parse Reddit's RSS feed to grab the titles of front-page articles, and I'm having some trouble. Source code below:
//var util = require('util');
//var cheerio = require('cheerio');
var fs = require('fs');
var request = require('request');
var parseString = require('xml2js').parseString;
url = 'http://www.reddit.com/.xml';
request(url, function(error, response, xml) {
  parseString(xml, function(err, result) {
    result = result.rss.channel[0];
    console.log(result.item[0]['title']); // works fine, gets first title
    for (var key in result) {
      console.log(result[key]['title']); // returns a bunch of 'undefined'
    }
    //console.log(util.inspect(result,false,null));
    fs.writeFile("index.html", result, function(err) {
      if (err) { return console.log(err); }
      return console.log("File saved.");
    });
  });
});
You get undefined because you should be iterating over result.item instead of just result. For example:
for(var key in result.item) {
console.log(result.item[key]['title']);
}
Additionally, you should just use a regular for-loop instead of using for..in, since it seems like result.item is just a plain array. For example:
var items = result.item;
for (var i = 0; i < items.length; ++i) {
console.log(items[i].title);
}
I know this question has been asked many times, but I can't make it work.
Here is my situation: I have a string called data, and I want to unshorten all the links inside that string.
Code:
var Bypasser = require('node-bypasser');
var URI = require('urijs');

var data = 'multiple urls : http://example.com/foo http://example.com/bar';

var result = URI.withinString(data, function(url) {
  var unshortenedUrl = null;
  var w = new Bypasser(url);
  w.decrypt(function(err, res) {
    // How can I return res?
    unshortenedUrl = res;
  });
  // I know the w.decrypt function is an asynchronous function,
  // so unshortenedUrl is still null here
  return unshortenedUrl;
});
Let me walk you through the code.
URI.withinString will match all the URLs in data, manipulate them and return the result.
You can view an example in the URI.js docs.
What I want to do with these URLs is to unshorten all of them using node-bypasser.
This is from the node-bypasser documentation:
var Bypasser = require('node-bypasser');
var w = new Bypasser('http://example.com/shortlink');
w.decrypt(function(err, result) {
console.log('Decrypted: ' + result);
});
This is the result that I want: multiple urls : http://example.com/foo_processed http://example.com/bar_processed
I created a notebook at tonicdev.com
Solution
var async = require('async');

var getUrlRegEx = new RegExp(
    "(^|[ \t\r\n])((ftp|http|https|gopher|mailto|news|nntp|telnet|wais|file|prospero|aim|webcal):(([A-Za-z0-9$_.+!*(),;/?:#&~=-])|%[A-Fa-f0-9]{2}){2,}(#([a-zA-Z0-9][a-zA-Z0-9$_.+!*(),;/?:#&~=%-]*))?([A-Za-z0-9$_+!*();/?:~-]))"
    , "g"
);

var urls = data.match(getUrlRegEx);

async.forEachLimit(urls, 5, function (url, callback) {
  let w = new Bypasser(url);
  w.decrypt(function (err, res) {
    if (err == null && res != undefined) {
      data = data.replace(url, res);
    }
    // always signal completion, even on error, so forEachLimit can finish
    callback();
  });
}, function (err) {
  res.send(data);
});
You don't seem to fully understand what a callback is. A callback allows asynchronous code to run without JavaScript waiting for it. If you add some debug output to your code:
console.log("Started parsing");
var result = URI.withinString(data, function(url) {
console.log("URL parsed (or whatever)");
var unshortenedUrl = null;
var w = new Bypasser(url);
w.decrypt(function(err, res) {
// How can I return res ?
unshortenedUrl = res;
});
// I know the w.descrypt function is a asynchronous function
// so unshortenedUrl = null
return unshortenedUrl;
});
console.log("Call to library over");
You would (most likely) see messages in this order:
Started parsing
Call to library over
URL parsed (or whatever)
The answer: the callback is not guaranteed to run before any code you execute after registering it. You can't put data into your result variable because the data might not have been fetched yet.
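In other words, URI.withinString expects a synchronous return value, so the unshortened URL has to be substituted in a separate, asynchronous step. Here is a minimal sketch of that idea (untested, and assuming the decrypt(err, res) callback shown in the node-bypasser docs above): collect the URLs synchronously first, then decrypt each one and replace it in the string once all callbacks have fired.

var Bypasser = require('node-bypasser');
var URI = require('urijs');

function unshortenAll(text, done) {
  // collect every URL first, synchronously
  var urls = [];
  URI.withinString(text, function(url) {
    urls.push(url);
    return url;
  });

  var remaining = urls.length;
  if (remaining === 0) { return done(text); }

  // decrypt each URL asynchronously, then replace it in the text
  urls.forEach(function(url) {
    new Bypasser(url).decrypt(function(err, res) {
      if (!err && res) {
        text = text.replace(url, res);
      }
      if (--remaining === 0) { done(text); }
    });
  });
}

// usage
unshortenAll('multiple urls : http://example.com/foo http://example.com/bar', function(result) {
  console.log(result);
});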
This is my first week in node, so I'm sorry if this is a no-brainer.
The code works and does what it should, but I can't figure out how to match the name (url) that starts http.get with the result it gets back from the website.
I found this, which is almost like my problem, except that is a premade function, so I can't edit the function and add a callback.
variable scope in asynchronous function
If I could run this code synchronously, or make a callback in the http.get function, it would all be good. But I don't have the skills and don't know if you can even do it.
Thanks
- Robin.
var http = require('http');

function download(name) {
  // name is an array with csgo item names
  for (var i = 0; i < name.length; i++) {
    var marketHashName = getGoodName(name[i]);
    var url = 'http://steamcommunity.com/market/priceoverview/?currency=1&appid=730&market_hash_name=' + marketHashName;
    http.get(url, function (res) {
      var data = "";
      res.on('data', function (chunk) {
        data += chunk;
      });
      res.on("end", function () {
        data = JSON.parse(data);
        var value = 0;
        // get the value from the json object
        if (data.median_price) {
          value = data.median_price;
        } else {
          value = data.lowest_price;
        }
        value = value.substr(5);
        console.log("WEAPON", value);
        // callback with name/link and value?
        // callback(name, value);
      });
    }).on("error", function () {
    });
  }
}
You can just add a callback argument and then call it with the final data. And, if you want to pass to the callback the particular marketHashName that was being processed, then you can create a closure to capture that uniquely for each time through the for loop:
var http = require('http');

function download(name, callback) {
  // name is an array with csgo item names
  for (var i = 0; i < name.length; i++) {
    var marketHashName = getGoodName(name[i]);
    // create closure to capture marketHashName uniquely for each
    // iteration of the for loop
    (function(theName) {
      var url = 'http://steamcommunity.com/market/priceoverview/?currency=1&appid=730&market_hash_name=' + theName;
      http.get(url, function (res) {
        var data = "";
        res.on('data', function (chunk) {
          data += chunk;
        });
        res.on("end", function () {
          data = JSON.parse(data);
          var value = 0;
          // get the value from the json object
          if (data.median_price) {
            value = data.median_price;
          } else {
            value = data.lowest_price;
          }
          value = value.substr(5);
          console.log("WEAPON", value);
          // now that the async function is done, call the callback
          // and pass it our results
          callback(theName, value, data);
        });
      }).on("error", function () {
      });
    })(marketHashName);
  }
}
// sample usage (the first argument is expected to be an array of item names):
download(["whatever"], function(name, value, data) {
  // put your code here to use the results
});
FYI, you may find that the request module, which provides a higher-level set of functionality on top of the http module, will save you some work.
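For example, a rough sketch of the same loop using request (untested; the response body still needs JSON.parse, and getGoodName is the same helper you already have) might look like this. Because forEach gives each iteration its own scope, no extra closure is needed to keep the name paired with its result:

var request = require('request');

function downloadWithRequest(names, callback) {
  names.forEach(function(itemName) {
    var marketHashName = getGoodName(itemName);
    var url = 'http://steamcommunity.com/market/priceoverview/?currency=1&appid=730&market_hash_name=' + marketHashName;
    request(url, function(err, resp, body) {
      if (err) { return; }
      var data = JSON.parse(body);
      var value = data.median_price || data.lowest_price;
      callback(itemName, value, data);
    });
  });
}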
I'm trying to make a simple feed reader in Node and I'm facing a problem with multiple requests in node.js.
For example, I have a table of urls, something like:
urls = [
"http://url1.com/rss.xml",
"http://url2.com",
"http://url3.com"];
Now I want to get the contents of each url. My first idea was to use for(var i in urls), but that's not a good idea. The best option would be to do it asynchronously, but I don't know how to do that.
Any ideas?
EDIT:
I got this code:
var data = [];
for (var i = 0; i < urls.length; i++) {
  http.get(urls[i], function(response) {
    console.log('Reponse: ', response.statusCode, ' from url: ', urls[i]);
    var body = '';
    response.on('data', function(chunk) {
      body += chunk;
    });
    response.on('end', function() {
      data.push(body);
    });
  }).on('error', function(e) {
    console.log('Error: ', e.message);
  });
}
The problem is that the http.get(...) line runs first for each element in the loop, and only afterwards does the response.on('data') event fire, and after that response.on('end'). It makes a mess and I don't know how to handle it.
I know this is an old question, but I think a better solution would be to use JavaScript's Promise.all():
const request = require('request-promise');
const urls = ["http://www.google.com", "http://www.example.com"];
const promises = urls.map(url => request(url));
Promise.all(promises).then((data) => {
  // data = [body1, body2] -- the resolved response bodies, in the same order as urls
});
By default, Node HTTP requests are asynchronous. You can start them sequentially in your code and call a function that runs once all requests are done. You can either do it by hand (count finished vs. started requests) or use async.js (a sketch of that follows the no-dependency example below).
This is the no-dependency way (error checking omitted):
var http = require('http');
var urls = ["http://www.google.com", "http://www.example.com"];

var responses = [];
var completed_requests = 0;

for (var i in urls) {
  http.get(urls[i], function(res) {
    responses.push(res);
    completed_requests++;
    if (completed_requests == urls.length) {
      // All downloads done, process the responses array
      console.log(responses);
    }
  });
}
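And here is a rough sketch of the async.js variant mentioned above (untested); it collects the full body of each response instead of the response object:

var http = require('http');
var async = require('async');

var urls = ["http://www.google.com", "http://www.example.com"];

// async.map runs the iterator for every url and calls the final
// callback once every iterator has reported back
async.map(urls, function(url, callback) {
  http.get(url, function(res) {
    var body = '';
    res.on('data', function(chunk) { body += chunk; });
    res.on('end', function() { callback(null, body); });
  }).on('error', callback);
}, function(err, bodies) {
  if (err) { return console.log('Error: ', err.message); }
  // bodies is in the same order as urls
  console.log(bodies);
});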
You need to check that the end event (data complete) has fired for the exact number of requests... Here's a working example:
var http = require('http');
var urls = ['http://adrianmejia.com/atom.xml', 'http://twitrss.me/twitter_user_to_rss/?user=amejiarosario'];

var completed_requests = 0;
var bodies = [];

urls.forEach(function(url) {
  http.get(url, function(res) {
    var chunks = [];
    res.on('data', function(chunk) {
      chunks.push(chunk);
    });
    res.on('end', function() {
      bodies.push(chunks.join(''));
      if (completed_requests++ == urls.length - 1) {
        // All downloads are completed
        console.log('bodies:', bodies);
      }
    });
  });
});
You can use any promise library with an ".all" implementation. I use the RSVP library; it's simple enough.
var downloadFileList = [{ url: 'http://stuff', dataname: 'filename to download' }];

var ddownload = downloadFileList.map(function(id) {
  var dataname = id.dataname;
  var url = id.url;
  return new RSVP.Promise(function(fulfill, reject) {
    var stream = fs.createWriteStream(dataname);
    stream.on('close', function() {
      console.log(dataname + ' downloaded');
      fulfill();
    });
    request(url).on('error', function(err) {
      console.log(err);
      reject();
    }).pipe(stream);
  });
});

return RSVP.allSettled(ddownload);
Promise.allSettled will not stop at the first error. It makes sure you process all responses, even if some have an error.
Promise.allSettled(promises)
.then((data) => {
// do your stuff here
})
.catch((err) => {
console.log(JSON.stringify(err, null, 4));
});
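Note that allSettled resolves with an array of { status, value | reason } objects rather than the response bodies themselves, so you still need to pick out the fulfilled values, for example:

Promise.allSettled(promises).then((results) => {
  const bodies = results
    .filter((r) => r.status === 'fulfilled')
    .map((r) => r.value);
  const errors = results
    .filter((r) => r.status === 'rejected')
    .map((r) => r.reason);
  // bodies holds the successful responses; errors holds the failures
});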
The problem can easily be solved using a closure. Make a function to handle the request and call that function in the loop. Each time the function is called it gets its own lexical scope, and through the closure it can retain the URL even after the loop ends. Even if the response comes in as a stream, the closure handles that too.
const request = require("request");

function getTheUrl(data) {
  var options = {
    url: "https://jsonplaceholder.typicode.com/posts/" + data
  }
  return options
}

function consoleTheResult(url) {
  request(url, function (err, res, body) {
    console.log(url);
  });
}

for (var i = 0; i < 10; i++) {
  consoleTheResult(getTheUrl(i))
}