async.each and http get - javascript

I want to programmatically fetch meta tags from multiple URLs and use them for further processing.
I'm using the code snippet below, but it always prints only the first URL's meta tag, and res in the async callback is undefined. Am I missing anything here with async?
var http = require('http'),
    cheerio = require('cheerio'),
    async = require('async'),
    urls = [
        "http://theatlantic.com",
        "http://nytimes.com"
    ];

function test() {
    var $, data = '';
    getMetaData = function(uri, callback) {
        http.get(uri, function(resp) {
            console.log('Fetching Url:' + uri);
            resp.on('data', function(chunk) {
                data += chunk;
            });
            resp.on('end', function() {
                $ = cheerio.load(data);
                console.log('Meta Tag:' + $('meta[property="og:description"]').attr('content') + '\n'); // use for further processing
                callback(null, $('meta[name="description"]').attr('content'));
            });
        });
    };
    async.each(urls, getMetaData, function(err, res) {
        console.log(res);
    });
}
test();
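Two things stand out here. First, data (and $) are declared once in test() and shared by every request, so each response is appended onto the same buffer and the parses see a mix of documents; each request needs its own buffer. Second, async.each's final callback only ever receives err and never collects results, which is why res is undefined; async.map is the variant that gathers each task's callback value. A minimal sketch of both fixes, using the same URLs and selector as above:

var http = require('http'),
    cheerio = require('cheerio'),
    async = require('async'),
    urls = ["http://theatlantic.com", "http://nytimes.com"];

function getMetaData(uri, callback) {
    http.get(uri, function (resp) {
        var data = ''; // per-request buffer, not shared across URLs
        resp.on('data', function (chunk) {
            data += chunk;
        });
        resp.on('end', function () {
            var $ = cheerio.load(data);
            callback(null, $('meta[name="description"]').attr('content'));
        });
    }).on('error', callback);
}

// async.map collects each task's result; results come back in input order
async.map(urls, getMetaData, function (err, results) {
    if (err) return console.error(err);
    console.log(results); // one description per URL
});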

Related

Why does my for loop mess up all the parameters?

I am trying to parse some data from several web pages using JavaScript. I wrote a small parser for this purpose. The algorithm looks like this:
1. Open the first URL from my .csv file
2. Find the data I need on the page
3. Save the URL and data to a json file
My code executes 1. and 2. perfectly but sometimes messes up with number 3. Output looks like this:
URL 1 + data from URL 1 (correct line)
URL 2 + data from URL 2 (correct line)
URL 3 + data from URL 3 (correct line)
URL 4 + data from URL 4 (correct line)
URL 6(wrong URL) + data from another URL
URL 5(wrong URL) + data from another URL
URL 7 + data from URL 7 (correct line)
URL 8 + data from URL 8 (correct line)
URL 9 + data from URL 9 (correct line)
I assume the problem is that some pages take too long to load, which messes up the whole process. But I still don't understand why it sometimes saves the wrong data.
Here's my code:
var request = require('request');
var cheerio = require('cheerio');
var cloudscraper = require('cloudscraper');
var fs = require('fs');
var path = require('path');
var csvjson = require('csvjson');

//First, we read .csv file with our URL list
function getTheList() {
    urlList = fs.readFileSync(path.join(__dirname, 'data.csv'), { encoding: 'utf8' });
    var options = {
        delimiter: ';', // optional
        quote: '"' // optional
    };
    urlList = csvjson.toObject(urlList, options);
    end = urlList.length;
    logs = [];
    //here we start the loop reading and saving data from each url
    for (let p = 0; p < end; p += 1) {
        grabTheData(urlList, p)
    }
}

//this code extracts the data from the page and saves it to a json file
function grabTheData(urlList, p) {
    setTimeout(function() {
        url = urlList[p].ItemLink;
        cloudscraper.get(url, function(err, res, body) {
            if (err) {
                console.log(other.Time() + colors.yellow('Warn: ') + '- something went wrong with item ' + url);
                callback();
            } else {
                var $ = cheerio.load(body);
                /*
                here are the lines which extract the data I need
                dataINeed = ...;
                */
                logs.push({
                    url, dataINeed
                });
                fs.writeFileSync('./logs.json', JSON.stringify(logs, null, 4));
            }
        });
        //here I set a 2 seconds delay between each URL
    }, 2000 * p);
}
getTheList()
The reason this is happening is that there is a potential race between the cloudscraper callback and the global url variable in grabTheData: by the time a slow response arrives, a later iteration may already have overwritten url.
There is a very quick fix for this: simply change the scope of the url variable like so:
function grabTheData(urlList, p) {
    setTimeout(function() {
        // Set scope of url variable to block
        let url = urlList[p].ItemLink;
        cloudscraper.get(url, function(err, res, body) {
            if (err) {
                console.log(other.Time() + colors.yellow('Warn: ') + '- something went wrong with item ' + url);
                callback();
            } else {
                var $ = cheerio.load(body);
                /*
                here are the lines which extract the data I need
                dataINeed = ...;
                */
                logs.push({
                    url, dataINeed
                });
                fs.writeFileSync('./logs.json', JSON.stringify(logs, null, 4));
            }
        });
        //here I set a 2 seconds delay between each URL
    }, 2000 * p);
}
This should keep your results in order.
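To see why the block scope matters, here is a minimal standalone illustration of the same race, with nested setTimeout calls standing in for the timer plus the slow cloudscraper request:

var shared; // plays the role of the implicit global url

for (let p = 0; p < 5; p++) {
    setTimeout(function () {
        shared = 'url-' + p;
        // simulate a slow response: by the time it arrives, a later
        // iteration may already have overwritten `shared`
        setTimeout(function () {
            console.log(shared); // sometimes logs a later iteration's value
        }, Math.random() * 300);
    }, 100 * p);
}

Declaring let shared inside the outer callback instead gives each "request" its own value, which is exactly the fix above.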
Here's another (IMHO much better) option, using promises and avoiding the use of setTimeout to separate calls. This should avoid any potential race condition, since the Promise.all call will preserve order:
async function getTheList() {
    urlList = fs.readFileSync(path.join(__dirname, 'data.csv'), { encoding: 'utf8' });
    var options = {
        delimiter: ';', // optional
        quote: '"' // optional
    };
    urlList = csvjson.toObject(urlList, options);
    let promiseList = urlList.map(urlEntry => grabTheDataUpdated(urlEntry.ItemLink));
    let logs = await Promise.all(promiseList);
    fs.writeFileSync('./new_logs.json', JSON.stringify(logs, null, 4));
}

// Promisified version of cloudscraper.get
function getCloudScraperData(url) {
    return new Promise((resolve, reject) => {
        cloudscraper.get(url, (err, res, body) => {
            if (err) {
                reject(err);
            } else {
                resolve({ url, res, body });
            }
        })
    })
}

function getDataINeed(url, body) {
    // Use cheerio to process data..
    // Return mock data for now.. replace with actual data processed by cheerio..
    return `data from ${url}`;
}

async function grabTheDataUpdated(url) {
    try {
        let result = await getCloudScraperData(url);
        let dataINeed = getDataINeed(result.url, result.body);
        return { url, dataINeed };
    } catch (error) {
        return { url, dataINeed: "Error occurred: " + error.message };
    }
}
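One usage note: getTheList is now async, so a top-level caller should handle rejection (for example, if the .csv file is missing):

getTheList()
    .then(() => console.log('done, see new_logs.json'))
    .catch(err => console.error('failed:', err));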

Node JS Fetching JSON from url is executing AFTER the rest of the code

I wanted to get a JSON from a URL in my Node.js code. The code is working fine, but the sequence of execution is messed up because of the async nature of the execution.
var http = require("https");
var number = 37302;
// these functions need to execute is sequence.
console.log('Before API Call');
var response = fetchJson(number);
console.log(response);
console.log('After API Call');
function fetchJson(number)
{
var url = 'https://example.com/api/getactionitems/' + number;
http.get(url, function(res){
var body = '';
res.on('data', function(chunk){
body += chunk;
console.log('JSON Retrieved.');
});
res.on('end', function(){
console.log('Parsing JSON');
var APIResponse = JSON.parse(body);
var Name = APIResponse.EmpName;
var outstring = APIResponse.ActionItem;
return ('Hi ' + Name + ', Your action Items are: '+ outstring);
});
})
.on('error', function(e){
return ("Got an error while fetching data.");
});
}
When this code executes, the sequence of the output strings is as follows:
Before API Call
undefined
After API Call
JSON Retrieved.
Parsing JSON
How can I correct the execution order, so that the sequence is like the following:
Before API Call
JSON Retrieved.
Parsing JSON
<Output from the JSON parsing>
After API Call
var http = require("https");
var number = 37302;
// these functions need to execute is sequence.
console.log('Before API Call');
fetchJson(number).then(function(res){
console.log(res);
console.log('After API Call');
}).catch(function(e){console.log('err',e)});
function fetchJson(number)
{
return new Promise(function(resolve,reject){
var url = 'https://example.com/api/getactionitems/' + number;
http.get(url, function(res){
var body = '';
res.on('data', function(chunk){
body += chunk;
console.log('JSON Retrieved.');
});
res.on('end', function(){
console.log('Parsing JSON');
var APIResponse = JSON.parse(body);
var Name = APIResponse.EmpName;
var outstring = APIResponse.ActionItem;
resolve('Hi ' + Name + ', Your action Items are: '+ outstring);
});
})
.on('error', function(e){
reject("Got an error while fetching data.");
});
});
}
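Since fetchJson now returns a promise, the consuming side can equally be written with async/await (Node 8+); a minimal variant:

async function main() {
    console.log('Before API Call');
    try {
        var response = await fetchJson(number);
        console.log(response);
    } catch (e) {
        console.log('err', e);
    }
    console.log('After API Call');
}
main();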

Node's http request before an async.map

I'm facing an issue receiving an array from an HTTP request before using async.map to launch queries on the entries.
My server-side controller code is below (Express 4):
'use strict';
var _ = require('lodash');
var request = require('request');
var asynce = require('async');

exports.index = function (req, res) {
    function cleanip(str) {
        return str.replace("/", "%2F");
    }
    var myUrls = [];
    var IpBlockedForSpam = [];
    var list = ["127.0.0.1/32", "192.168.0.1/32"];
    for (var i in list) {
        myUrls.push("http://localhost:9000/myapi/ip/blockedForSpam/" + cleanip(list[i]));
    }
    asynce.map(myUrls, function (url, callback) {
        request(url, function (error, response, html) {
            var r = JSON.parse(html);
            for (var i in r) {
                IpBlockedForSpam.push(r[i]);
            }
            callback(error, html);
        });
    }, function (err, results) {
        res.jsonp(IpBlockedForSpam);
    });
};
This code works when var list is static.
What I want to achieve is to be able to fill this variable using an HTTP request like this:
request("http://localhost:9000/myapi/ip", function(error, response, body) {
//console.log(body);
remotelist.push(JSON.parse(body));
});
Calling http://localhost:9000/myapi/ip return :
[
"127.0.0.1/32",
"192.168.0.1/32"
]
I tried many things without results because, most of the time, my async method is launched before the HTTP request that retrieves the list has completed.
Another thing: is it possible to not use URLs like http://localhost:9000/myapi/* and use only /myapi/*?
Thank you in advance for suggestions; maybe I am wrong with this method.
See you.
You can just put the code inside the request() callback so that the list is obtained first and only when it has been retrieved run the rest of the code:
'use strict';
var _ = require('lodash');
var request = require('request');
var asynce = require('async');

exports.index = function (req, res) {
    function cleanip(str) {
        return str.replace("/", "%2F");
    }
    var myUrls = [];
    var IpBlockedForSpam = [];
    var list = ["127.0.0.1/32", "192.168.0.1/32"];
    request("http://localhost:9000/myapi/ip", function(error, response, body) {
        // add the IP address array retrieved from this request
        list = list.concat(JSON.parse(body));
        list.forEach(function(item, i) {
            myUrls.push("http://localhost:9000/myapi/ip/blockedForSpam/" + cleanip(item));
        });
        asynce.map(myUrls, function (url, callback) {
            request(url, function (error, response, html) {
                var r = JSON.parse(html);
                for (var i in r) {
                    IpBlockedForSpam.push(r[i]);
                }
                callback(error, html);
            });
        }, function (err, results) {
            res.jsonp(IpBlockedForSpam);
        });
    });
};
P.S. It is not considered good practice to iterate arrays with for (var i in r) because that is a property iteration that will accidentally include any enumerable properties of the array, not just array items.
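A quick illustration of that pitfall:

var r = ["127.0.0.1/32", "192.168.0.1/32"];
r.fetchedAt = "just now"; // an enumerable property, not an array item

for (var i in r) {
    console.log(i); // "0", "1", "fetchedAt" – the extra property leaks in
}
r.forEach(function (item) {
    console.log(item); // only the two real array items
});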
I got it now; working code below:
'use strict';
var _ = require('lodash');
var request = require('request');
var asynce = require('async');

exports.index = function(req, res) {
    function cleanip(str) {
        return str.replace("/", "%2F");
    }
    var myUrls = [];
    var IpBlockedForSpam = [];
    //var list = ["127.0.0.1/32", "192.168.0.1/32"];
    var list = [];
    request("http://localhost:9000/myapi/ip", function(error, response, body) {
        list = list.concat(JSON.parse(body));
        list.forEach(function(item, i) {
            myUrls.push("http://localhost:9000/myapi/ip/blockedForSpam/" + cleanip(item));
        });
        asynce.map(myUrls, function(url, callback) {
            request(url, function(error, response, html) {
                var r = JSON.parse(html);
                r.forEach(function(item, i) {
                    IpBlockedForSpam.push(item);
                });
                callback(error, html);
            });
        }, function(err, results) {
            res.jsonp(IpBlockedForSpam);
        });
    });
};
Some brackets were not closed, and I had concact instead of concat (this helped me really understand the approach) :)
Last thing: is it possible to not use URLs like http://localhost:9000/myapi/* and use only /myapi/*?
Thank you #jfriend00

node.js + cheerio scrape: Passing an array of urls to download?

Firstly, here is my code as I've progressed so far:
var http = require("http");
// Utility function that downloads a URL and invokes
// callback with the data.
function download(url, callback) {
http.get(url, function(res) {
var data = "";
res.on('data', function (chunk) {
data += chunk;
});
res.on("end", function() {
callback(data);
});
}).on("error", function() {
callback(null);
});
}
var cheerio = require("cheerio");
var url = "http://www.bloglovin.com/en/blogs/1/2/all";
var myArray = [];
var a = 0;
var getLinks = function(){download(url, function(data) {
if (data) {
// console.log(data);
var $ = cheerio.load(data);
$(".content").each(function(i, e) {
var blogName = $(e).find(".blog-name").text();
var followLink = $(e).find("a").attr("href");
var blogSite = $(e).find(".description").text();
myArray[a] = [a];
myArray[a]["blogName"] = blogName;
myArray[a]["followLink"] = "http://www.bloglovin.com"+followLink;
myArray[a]["blogSite"] = blogSite;
a++;
console.log(myArray);
});
}
});
}
getLinks();
As you can see, followLink is concatenated onto the base URL, and I'd like to pass each resulting address through download(), so effectively I'll be scraping each of those pages using the same CSS rules, with the data added to the multidimensional array for the corresponding blogger.
How can I go about this?
I do something similar in one of my scraping jobs, but I use the async.js library to accomplish it. Note that I'm also using the request module and cheerio.js in my scraping. I fetch and scrape rows of data from a single webpage, but I suspect you could do something similar to fetch URLs and request/scrape them in the same manner.
I also admit this is quite basic coding; it certainly could be optimized with a bit of refactoring. Hope it gives you some ideas at least...
First, I use request to fetch the page and call my parse function -
var url = 'http://www.target-website.com';

// task-style function (takes a completion callback), excerpted from an async.js flow
function(lastCallback) {
    request(url, function(err, resp, body) {
        if (!err) { parsePage(err, resp, body, lastCallback); }
        else { console.log('web request error:' + resp.statusCode); }
    });
}
Next, in my parsePage function, I load the website into Cheerio, fetch the HTML of each data row into an array, push my parseRow function and each HTML segment into another array, and use async.parallel to process each iteration -
var rows = [];

function parsePage(err, resp, body, callback1) {
    var $ = cheerio.load(body);
    $('div#targetTable tr').each(function(i, elem) {
        rows.push($(this).html());
    });
    var scrRows = [];
    rows.forEach(function(row) {
        scrRows.push(function(callback2) {
            parseRow(err, resp, row);
            callback2();
        });
    });
    async.parallel(scrRows, function() {
        callback1();
    });
}
Inside your loop, just create an object with the properties you scrape, then push that object onto your array.
var blogInfo = {
    blogName: blogName,
    followLink: "http://www.bloglovin.com" + followLink,
    blogSite: blogSite
};
myArray.push(blogInfo);
You have defined a = 0; so
myArray[a] = [a]; // => myArray[0] = [0]; myArray[0] becomes an array with 0 as its only member
The next statements don't fail outright, since an array is still an object, but they attach named properties to that inner array rather than creating real array elements, so they are ignored by length and dropped when the array is serialized:
myArray[a]["blogName"] = blogName;
myArray[a]["followLink"] = "http://www.bloglovin.com" + followLink;
myArray[a]["blogSite"] = blogSite;
Instead try this:
var obj = {
    index: a,
    blogName: blogName,
    followLink: "http://www.bloglovin.com" + followLink,
    blogSite: blogSite
}
myArray.push(obj);
console.log(myArray);
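To then follow the scraped links, one hedged sketch: it reuses the download() helper and cheerio from the question plus async.js, and pageTitle with the $("title") selector are hypothetical stand-ins for whatever CSS rules apply on the followed pages:

var async = require("async");

function scrapeFollowLinks(blogs, done) {
    // download every blog's followLink; async.each runs them concurrently
    async.each(blogs, function (blogInfo, callback) {
        download(blogInfo.followLink, function (data) {
            if (!data) { return callback(); } // skip pages that failed to download
            var $ = cheerio.load(data);
            // hypothetical per-page scraping; replace with the real CSS rules
            blogInfo.pageTitle = $("title").text();
            callback();
        });
    }, done);
}

// usage, once getLinks() has filled myArray:
// scrapeFollowLinks(myArray, function () { console.log(myArray); });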

nodejs multiple http requests in loop

I'm trying to make a simple feed reader in Node and I'm facing a problem with multiple requests in node.js.
For example, I've got a table with URLs, something like:
urls = [
    "http://url1.com/rss.xml",
    "http://url2.com",
    "http://url3.com"];
Now I want to get the contents of each URL. My first idea was to use for(var i in urls), but that's not a good idea. The best option would be to do it asynchronously, but I don't know how to do it.
Any ideas?
EDIT:
I got this code:
var data = [];
for (var i = 0; i < urls.length; i++) {
    http.get(urls[i], function(response) {
        console.log('Response: ', response.statusCode, ' from url: ', urls[i]);
        var body = '';
        response.on('data', function(chunk) {
            body += chunk;
        });
        response.on('end', function() {
            data.push(body);
        });
    }).on('error', function(e) {
        console.log('Error: ', e.message);
    });
}
The problem is that the http.get line is first called for each element in the loop, and only after that do the response.on('data') and then response.on('end') events fire. It makes a mess and I don't know how to handle this.
I know this is an old question, but I think a better solution would be to use JavaScript's Promise.all():
const request = require('request-promise');
const urls = ["http://www.google.com", "http://www.example.com"];
const promises = urls.map(url => request(url));
Promise.all(promises).then((data) => {
    // data = [body1, body2] – the resolved bodies, in the same order as urls
});
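A dependency-free variant of the same pattern, assuming Node 18+ where fetch is built in (the request/request-promise packages have since been deprecated):

const urls = ["http://www.google.com", "http://www.example.com"];

Promise.all(urls.map(url => fetch(url).then(res => res.text())))
    .then(bodies => {
        // bodies[i] is the response body for urls[i]
        console.log(bodies.map(b => b.length));
    })
    .catch(err => console.error(err));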
By default, Node HTTP requests are asynchronous. You can start them sequentially in your code and call a function that runs when all requests are done. You can either do it by hand (count the finished vs. started requests) or use async.js.
This is the no-dependency way (error checking omitted):
var http = require('http');
var urls = ["http://www.google.com", "http://www.example.com"];
var responses = [];
var completed_requests = 0;

for (i in urls) {
    http.get(urls[i], function(res) {
        responses.push(res);
        completed_requests++;
        if (completed_requests == urls.length) {
            // All downloads done, process responses array
            console.log(responses);
        }
    });
}
You need to check that the 'end' (data complete) event has fired the exact number of times as there are requests... Here's a working example:
var http = require('http');
var urls = ['http://adrianmejia.com/atom.xml', 'http://twitrss.me/twitter_user_to_rss/?user=amejiarosario'];
var completed_requests = 0;

urls.forEach(function(url) {
    var responses = [];
    http.get(url, function(res) {
        res.on('data', function(chunk) {
            responses.push(chunk);
        });
        res.on('end', function() {
            if (completed_requests++ == urls.length - 1) {
                // All downloads are completed
                console.log('body:', responses.join());
            }
        });
    });
})
You can use any promise library with an ".all" implementation. I use the RSVP library; it's simple enough.
var RSVP = require('rsvp');
var fs = require('fs');
var request = require('request');

var downloadFileList = [{ url: 'http://stuff', dataname: 'filename to download' }];
var ddownload = downloadFileList.map(function(id) {
    var dataname = id.dataname;
    var url = id.url;
    return new RSVP.Promise(function(fulfill, reject) {
        var stream = fs.createWriteStream(dataname);
        stream.on('close', function() {
            console.log(dataname + ' downloaded');
            fulfill();
        });
        request(url).on('error', function(err) {
            console.log(err);
            reject();
        }).pipe(stream);
    });
});
return RSVP.allSettled(ddownload);
Promise.allSettled will not stop at an error. It makes sure you process all responses, even if some have an error.
Promise.allSettled(promises)
    .then((data) => {
        // do your stuff here
    })
    .catch((err) => {
        console.log(JSON.stringify(err, null, 4));
    });
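Each settled entry reports its own outcome, so fulfilled and rejected results can be told apart inside then (note that allSettled itself never rejects, so the catch above is only a safety net):

Promise.allSettled(promises).then((data) => {
    data.forEach((result) => {
        if (result.status === 'fulfilled') {
            console.log('ok, body length:', result.value.length);
        } else {
            console.log('failed:', result.reason.message);
        }
    });
});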
The problem can be easily solved using a closure. Make a function to handle the request and call that function in the loop. Every time the function is called, it has its own lexical scope and, using closure, it is able to retain the address of the URL even after the loop ends. And even if the response arrives in streams, the closure handles that too.
const request = require("request");
function getTheUrl(data) {
var options = {
url: "https://jsonplaceholder.typicode.com/posts/" + data
}
return options
}
function consoleTheResult(url) {
request(url, function (err, res, body) {
console.log(url);
});
}
for (var i = 0; i < 10; i++) {
consoleTheResult(getTheUrl(i))
}
