I would like to scrape a URL:
1 request to get a list of elements
1 request on each result to get details
Here is what I have:
var request = require('request')
, cheerio = require('cheerio')
, async = require('async')
, format = require('util').format;
var baseurl = 'http://magiccards.info';
// Two-step scrape: step 1 builds the list of sets from the sitemap,
// step 2 requests each set's page in series and logs the links found on it.
async.waterfall([
// Step 1: load the sitemap and create one entry per link whose href ends in "/en.html".
function (callback) {
// `err` is not inspected; `body` is handed straight to cheerio.
request(baseurl + '/sitemap.html', function (err, response, body) {
var sets = [];
var $ = cheerio.load(body);
$('a[href$="/en.html"]').each(function () {
// `code` is the first capture group of the regex applied to the href (the text between slashes).
sets.push({"name": $(this).text(), "code":$(this).attr('href').match(/\/([^)]+)\//)[1], "path": $(this).attr('href'), "translations":[]});
});
// Pass the collected sets on to the next waterfall step.
callback(null, sets);
});
},
// Step 2: for every set, request its page and log each direct child link of <body>
// whose href starts with "/<code>/".
function (sets, callback) {
console.log(sets);
// Note: the iterator's `callback` parameter shadows this step's `callback`.
async.eachSeries(sets, function (set, callback) {
console.log('SET ' + set.code.toUpperCase());
request(baseurl + set.path, function (err, response, body) {
var $ = cheerio.load(body);
$('body > a[href^="/' + set.code + '/"]').each(function () {
console.log(' %s (%s)', $(this).text(), $(this).attr('href'));
});
});
});
}
// Final handler: logs 'ERR' unconditionally, whatever the value of `err`.
], function (err, result) {
console.log('ERR');
// `result` is whatever the last step passes to its callback.
});
The problem is that the 2nd waterfall function runs only once. If I replace eachSeries with each, the loop does run X times (but I need to wait for the result).
What am I missing?
You need to call the eachSeries callback function. Otherwise async won't know that you are done. (1)
You also need to tell the waterfall function that you are done with that step, also by calling the callback function. (2)
function (sets, waterfallCallback) {
async.eachSeries(sets, function (set, seriesCallback) {
console.log('SET ' + set.code.toUpperCase());
request(baseurl + set.path, function (err, response, body) {
var $ = cheerio.load(body);
$('body > a[href^="/' + set.code + '/"]').each(function () {
console.log(' %s (%s)', $(this).text(), $(this).attr('href'));
});
seriesCallback(null); /* 1 */
});
}, waterfallCallback /* 2 */);
}
Related
Here is my code:
'use strict';
var unitID = 0;
/**
 * Registers a suite with two specs for one API route: "GET all" and "GET by ID".
 *
 * The first spec stores an id in the module-level `unitID` (the id of the first device
 * when the route is '/devices/', otherwise an empty string); the second spec appends
 * `unitID` to the route URL.
 *
 * @param {Object} generalOptions - provides `baseUrl` and a `request` client exposing `get(options, callback)`.
 * @param {Object} specificOptions - provides `name` (used in the spec titles) and `route` (appended to `baseUrl`).
 */
var getById = function(generalOptions, specificOptions) {
  describe('API tests for: ' + specificOptions.name, function() {
    var url = generalOptions.baseUrl + specificOptions.route;

    // GET all items
    it('= = = GET ALL test for ' + specificOptions.name + ' return status code 200', function(done) {
      generalOptions.request.get({
        url: url
      }, function(error, response, body) {
        expect(response.statusCode).toBe(200);
        // Parse once and reuse the result for both the assertion and the id lookup.
        var bodyJS = JSON.parse(body);
        expect(bodyJS).not.toBeFalsy();
        if (specificOptions.route === '/devices/') {
          unitID = bodyJS.devices[0].id;
        } else {
          unitID = '';
        }
        console.log('Result 1 - ' + unitID);
        done();
      });
    });

    // GET by ID
    it('= = = GET by ID test for ' + specificOptions.name + ' return status code 200', function(done) {
      console.log('Result 2 - ' + unitID);
      generalOptions.request.get({
        url: url + unitID
      }, function(error, response, body) {
        expect(response.statusCode).toBe(200);
        expect(JSON.parse(body)).not.toBeFalsy();
        done();
      });
    });
  });
};
module.exports = getById;
I need to wait until unitID has been updated by the first GET request and then use it in the next request.
The problem is that it works asynchronously, and unitID in the second request stays 0.
Can you show how to implement a solution with async/await or Promises?
Thanks!
For debugging purposes I use console.log. For now it prints:
Result 2 - 0
Result 1 - 59dffdgfdgfg45545g
You should not write tests in such a fashion that the output of one test feeds into another. Each "it" should be independent.
Instead, you should make the call twice (nested calls) to obtain the value of unitID, or ideally you should mock the service to return the data that the "it" expects.
I'm having trouble receiving an array from an HTTP request before using async.map to launch queries on its items.
My server-side controller code is below (Express 4):
'use strict';
var _ = require('lodash');
var request = require('request');
var asynce = require('async');
exports.index = function (req, res) {
function cleanip(str) {
return str.replace("/", "%2F");
}
var myUrls = [];
var IpBlockedForSpam = [];
var list = ["127.0.0.1/32", "192.168.0.1/32"];
for (var i in list) {
myUrls.push("http://localhost:9000/myapi/ip/blockedForSpam/" + cleanip(list[i]));
}
asynce.map(myUrls, function (url, callback) {
request(url, function (error, response, html) {
var r = JSON.parse(html);
for (var i in r) {
IpBlockedForSpam.push(r[i]);
}
callback(error, html);
});
}, function (err, results) {
res.jsonp(IpBlockedForSpam);
});
};
This code works when var list is static.
What I want to achieve is to fill this variable using an HTTP request like this:
// Request the IP list and store the parsed response body in remotelist.
request("http://localhost:9000/myapi/ip", function(error, response, body) {
//console.log(body);
// push() appends the parsed body as a single element; if the body is a JSON array,
// remotelist ends up containing that array nested inside it.
remotelist.push(JSON.parse(body));
});
Calling http://localhost:9000/myapi/ip returns:
[
"127.0.0.1/32",
"192.168.0.1/32"
]
I tried many things without success because, most of the time, my async method is launched before the HTTP request that retrieves the list has completed.
Another thing: is it possible to avoid URLs like http://localhost:9000/myapi/* and use only /myapi/*?
Thank you in advance for any suggestions; maybe I am wrong to use this method.
See you.
You can just put the code inside the request() callback so that the list is obtained first and only when it has been retrieved run the rest of the code:
'use strict';
var _ = require('lodash');
var request = require('request');
var asynce = require('async');
exports.index = function (req, res) {
function cleanip(str) {
return str.replace("/", "%2F");
}
var myUrls = [];
var IpBlockedForSpam = [];
var list = ["127.0.0.1/32", "192.168.0.1/32"];
request("http://localhost:9000/myapi/ip", function(error, response, body) {
// add the IP address array retrieved from this request
list = list.concat(JSON.parse(body));
list.forEach(function(item, i) {
myUrls.push("http://localhost:9000/myapi/ip/blockedForSpam/" + cleanip(item));
});
asynce.map(myUrls, function (url, callback) {
request(url, function (error, response, html) {
var r = JSON.parse(html);
for (var i in r) {
IpBlockedForSpam.push(r[i]);
}
callback(error, html);
});
}, function (err, results) {
res.jsonp(IpBlockedForSpam);
});
});
};
P.S. It is not considered good practice to iterate arrays with for (var i in r), because that is a property iteration that will accidentally include any enumerable properties of the array, not just the array items.
I got it now, below working code :
'use strict';
var _ = require('lodash');
var request = require('request');
var asynce = require('async');
exports.index = function(req, res) {
function cleanip(str) {
return str.replace("/", "%2F");
}
var myUrls = [];
var IpBlockedForSpam = [];
//var list = ["127.0.0.1/32", "192.168.0.1/32"];
var list = [];
request("http://localhost:9000/myapi/ip", function(error, response, body) {
list = list.concat(JSON.parse(body));
list.forEach(function(item, i) {
myUrls.push("http://localhost:9000/myapi/ip/blockedForSpam/" + cleanip(item));
});
asynce.map(myUrls, function(url, callback) {
request(url, function(error, response, html) {
var r = JSON.parse(html);
r.forEach(function(item, i) {
IpBlockedForSpam.push(r[i]);
});
callback(error, html);
});
}, function(err, results) {
res.jsonp(IpBlockedForSpam);
});
});
};
Some brackets were not closed, and I had typed concact instead of concat (it helped me to really understand this approach) :)
One last thing: is it possible to avoid URLs like http://localhost:9000/myapi/* and use only /myapi/*?
Thank you #jfriend00
I can't seem to get the pipe function working properly, as I am getting a "TypeError: Object # has no method 'join'" error. My input file is a very basic comma-delimited CSV, nothing too fancy.
I'd like to pipe the output directly to my response.
// Reads an uploaded CSV file, transforms each parsed record into a text line and pipes the result out.
// Nothing in this snippet pushes to `output`; it is only logged.
var output = [];
// NOTE(review): `columns: true` presumably makes the parser emit one object per row (keyed by
// header name) rather than an array — confirm in the csv-parse docs; a plain object has no join().
var parser = parse({auto_parse: true, columns: true});
// NOTE(review): req.body.file is concatenated into the path unchecked — confirm it cannot contain "../".
var input = fs.createReadStream('./uploads/' + req.body.file);
// Each record is handed back 500 ms after it arrives, joined with spaces and terminated by a newline.
var transformer = transform(function (record, callback) {
setTimeout(function () {
callback(null, record.join(' ') + '\n');
}, 500);
}, {parallel: 10});
// On a transformer error, reply with status 500 and log `output`.
transformer.on('error', function (err) {
res.send(500,err);
console.log(output);
});
transformer.on('finish', function () {
console.log('finish');
console.log(output);
});
// The transformed lines are piped to process.stdout, not to `res`.
input.pipe(parser).pipe(transformer).pipe(process.stdout);
I want to programmatically fetch meta tags from multiple URLs and use them for further processing.
I'm using the code snippet below. It always prints only the first URL's meta tag, and the async callback's res is undefined. Am I missing anything here with async?
var http = require('http'),
cheerio = require('cheerio'),
async = require('async'),
urls = [
"http://theatlantic.com",
"http://nytimes.com"
];
// Fetches every URL in `urls` and logs the og:description meta tag found in the response.
function test() {
// `$` and `data` are declared once per test() call and shared by every getMetaData
// invocation, so chunks from all responses are appended to the same string.
var $, data = '';
// Assigned without var/let/const, so getMetaData is not local to test().
getMetaData = function(uri, callback) {
http.get(uri, function(resp) {
console.log('Fetching Url:' + uri);
resp.on('data', function (chunk){
data += chunk;
});
resp.on('end', function () {
$ = cheerio.load(data);
// Logs the og:description content, but passes the name="description" content to the callback.
console.log('Meta Tag:' + $('meta[property="og:description"]').attr('content') + '\n'); //use for further processing
callback(null, $('meta[name="description"]').attr('content'));
});
});
}
// NOTE(review): async.each's final callback presumably receives only an error argument —
// confirm in the async docs (async.map is the variant that collects per-item results).
async.each(urls, getMetaData, function(err, res) {
console.log(res);
});
};
test();
// This function receives an array of two entries, each containing a YouTube URL. For each entry it calls the YouTube oEmbed API and receives the iframe and video properties. Everything works: I am able to receive the data from the YouTube API and collect it into an array, but I am unable to pass that array back to the caller function.
/**
 * Looks up metadata for every video in `data` and collects one summary object per video
 * in `resultArr`.
 *
 * @param {Array<{src: string}>} data - entries whose `src` is the video URL to look up.
 * @returns {undefined} This function has no return statement of its own; the collected
 *   array is only reachable inside the final async.each callback.
 */
function getWaterfall(data){
  var resultArr = [];

  // Runs the fetch -> reshape -> store pipeline for a single entry.
  function iter(datas, callback){
    async.waterfall([
      function getData(callback){
        var link = nconf.get('youtubeuri')+encodeURIComponent(datas.src);
        console.log(link);
        http.get(link, function(res) {
          console.log("Got response: " + res.statusCode);
          var raw = '';
          res.setEncoding('utf8');
          // A body may arrive in several chunks, so parse only once it is complete.
          res.on("data", function(chunk) {
            raw += chunk;
          });
          res.on("end", function() {
            var result;
            try {
              result = JSON.parse(raw);
            } catch (parseError) {
              return callback(parseError);
            }
            console.log(JSON.stringify(result));
            callback(null, result);
          });
        }).on('error', function(e) {
          console.log("Got error: " + e.message);
          // Forward the error object this handler actually received.
          callback(e);
        });
      },
      function processData(results,callback){
        var sets = {};
        sets.videotitle = results.title;
        // NOTE(review): oEmbed responses normally name this field `thumbnail_url` — confirm.
        sets.videothumb_url= results.thumburl;
        sets.videoiframe= results.html;
        sets.videoheight= results.height;
        sets.videowidth= results.width;
        sets.videoauthor= results.author_name;
        sets.videoauthurl= results.author_url;
        callback(null,sets);
      },
      function updateArray(resu, callback){
        resultArr.push(resu);
        console.log(resu);
        callback(null);
      }
    ],
    function finalcall(err){
      console.log('operation completed');
      console.log(resultArr.length);
      console.log(JSON.stringify(resultArr));
      // Hand any failure to async.each instead of dropping it.
      callback(err);
    });
  }

  async.each(data, iter, function(err){
    if (err) {
      console.log("Got error: " + err.message);
    }
    console.log('array completed');
    console.log(resultArr.length);
    console.log(JSON.stringify(resultArr));
    // This returns from the completion callback only, not from getWaterfall.
    return resultArr;
  });
}
// caller function
exports.uploadPaletteVideo = function(req,res){
Palette.findById(req.body.palid, function(err, videodoc){
if(err){
util.log('palette with id not found video '+err);
}else{
var mar = [{'src':'http://www.youtube.com/watch?v=k98k47X_Ugo'}, {'src':'https://www.youtube.com/watch?v=6OLsmfxalPw'}];
console.log('the palette' + getWaterfall(mar));
//console.log('From PaletteVideo'+ JSON.stringify(youtube));
}
})
res.send('success');
}
My code is unable to return the array to the caller function, but I am able to pass this array to another function, and that works. Please help me out.