I'm trying to make a simple feed reader in Node.js and I'm facing a problem with multiple requests.
For example, I have a table with URLs, something like:
urls = [
"http://url1.com/rss.xml",
"http://url2.com",
"http://url3.com"];
Now I want to get the contents of each URL. My first idea was to use for(var i in urls), but that's not a good idea. The best option would be to do it asynchronously, but I don't know how to do that.
Any ideas?
EDIT:
I got this code:
// NOTE(review): this loop has the classic var-in-a-loop closure bug — by the
// time each response callback fires, the loop has already finished and `i`
// equals urls.length, so `urls[i]` below is undefined for every response.
var data = [];
for(var i = 0; i<urls.length; i++){
http.get(urls[i], function(response){
// BUG: `urls[i]` is read asynchronously, after the loop has ended.
console.log('Reponse: ', response.statusCode, ' from url: ', urls[i]);
var body = '';
response.on('data', function(chunk){
body += chunk;
});
response.on('end', function() {
// Bodies arrive in completion order, not in `urls` order, and there is
// no "all done" signal — exactly the mess described below.
data.push(body);
});
}).on('error', function(e){
console.log('Error: ', e.message);
});
}
The problem is that the "http.get..." line is called first for each element in the loop, and only afterwards do the response.on('data') and response.on('end') events fire. It makes a mess and I don't know how to handle this.
I know this is an old question, but I think a better solution would be to use JavaScript's Promise.all():
const request = require('request-promise');
const urls = ["http://www.google.com", "http://www.example.com"];
// Map each URL to a promise that resolves with its response body.
const promises = urls.map(url => request(url));
Promise.all(promises).then((data) => {
// data is an array of the RESOLVED response bodies, in the same order
// as `urls` — not an array of promises.
});
By default node http requests are asynchronous. You can start them sequentially in your code and call a function that'll start when all requests are done. You can either do it by hand (count the finished vs started request) or use async.js
This is the no-dependency way (error checking omitted):
var http = require('http');
var urls = ["http://www.google.com", "http://www.example.com"];
var responses = [];
var completed_requests = 0;

// Use a plain indexed loop: the original `for (i in urls)` iterated array
// keys (an anti-pattern for arrays) and leaked `i` as an implicit global.
for (var i = 0; i < urls.length; i++) {
  http.get(urls[i], function(res) {
    responses.push(res);
    // Count completions: the requests finish in arbitrary order, so the
    // only safe point to process the batch is when the counter matches
    // the number of requests started.
    completed_requests++;
    if (completed_requests == urls.length) {
      // All download done, process responses array
      console.log(responses);
    }
  });
}
You need to check that the end event (data complete) has fired exactly as many times as there are requests... Here's a working example:
var http = require('http');
var urls = ['http://adrianmejia.com/atom.xml', 'http://twitrss.me/twitter_user_to_rss/?user=amejiarosario'];
var completed_requests = 0;
// Collect every finished body here. In the original, `responses` was
// declared inside the forEach callback, so each request had its own
// private array and only the LAST request's body was ever printed.
var bodies = [];

urls.forEach(function(url) {
  http.get(url, function(res) {
    var chunks = [];
    res.on('data', function(chunk) {
      chunks.push(chunk);
    });
    res.on('end', function() {
      bodies.push(chunks.join());
      // 'end' fires once per request; when the counter reaches the number
      // of URLs, every download has completed.
      if (++completed_requests === urls.length) {
        // All downloads are completed
        console.log('body:', bodies.join());
      }
    });
  });
})
You can use any promise library with ".all" implementation. I use RSVP library, Its simple enough.
var downloadFileList = [url:'http://stuff',dataname:'filename to download']
var ddownload = downloadFileList.map(function(id){
var dataname = id.dataname;
var url = id.url;
return new RSVP.Promise(function(fulfill, reject) {
var stream = fs.createWriteStream(dataname);
stream.on('close', function() {
console.log(dataname+' downloaded');
fulfill();
});
request(url).on('error', function(err) {
console.log(err);
reject();
}).pipe(stream);
});
});
return new RSVP.hashSettled(ddownload);
Promise.allSettled will not stop on error. It makes sure you process all responses, even if some of them fail.
// Each entry in `data` is a record of the form
// { status: 'fulfilled', value } or { status: 'rejected', reason }.
// Note: the promise returned by allSettled itself never rejects, so this
// .catch is effectively unreachable for settlement errors.
Promise.allSettled(promises)
.then((data) => {
// do your stuff here
})
.catch((err) => {
console.log(JSON.stringify(err, null, 4));
});
The problem can be easily solved using a closure. Make a function to handle the request and call that function in the loop. Every time the function is called, it has its own lexical scope, and using the closure it can retain the URL even after the loop ends. And even if the response arrives as a stream, the closure handles that too.
const request = require("request");
// Build the request options object for a single JSONPlaceholder post id.
function getTheUrl(data) {
  return {
    url: "https://jsonplaceholder.typicode.com/posts/" + data
  };
}
// Fire the request and log the options it was made with; the closure keeps
// `url` bound to the right value even though the response arrives later.
function consoleTheResult(url) {
  var onDone = function (err, res, body) {
    console.log(url);
  };
  request(url, onDone);
}

var i = 0;
while (i < 10) {
  consoleTheResult(getTheUrl(i));
  i += 1;
}
Related
Consider a node.js loop to fire off http requests, like this:
var http = require('http');
// Fires a single request and returns the ClientRequest object.
// NOTE(review): this is pseudocode from the question — the `...` options and
// the payload construction below are placeholders, not valid JavaScript.
function sendOne(opts, body)
{
var post_options = {
...
}
var sender = http.request(post_options, function(res) {
// process response
});
sender.write(body)
sender.end()
return sender;
}
// This loop starts ALL requests immediately; nothing here waits for one
// request to finish before launching the next — which is exactly what the
// question is asking how to do.
for( var i =0; i < maxUploadIterations; i++)
{
var body = // construct payload
var sent = sendOne(opts, body);
// somehow wait on sent...
}
Note that the http.request object has a callback function specified for handling the response. My question is how do I synchronously wait on the "Sent" object returned from sendOne using the built in Node primitives.
I understand there are multiple frameworks like Express and Futures that can handle this, but I want to understand the primitive behavior rather than using a "magic" framework.
Is it possible to take "Sent" and put a handler on it like this:
var sent = ...
sent.on("complete", function() { ... } );
?
If so, exactly which handler and how to format the loop with it?
Another option, use old fashion callbacks... Though promises may be cleaner.
var http = require('http');
// Sends one request and invokes `next` once the response callback fires,
// chaining the uploads strictly one after another.
function sendOne(opts, body, next) {
var post_options = {
// ...
};
var sender = http.request(post_options, function (res) {
// process response
next();
});
sender.write(body);
sender.end();
}
var maxUploadIterations = 100;
// Recursive driver: each completed request schedules the next index.
// NOTE(review): there is no error handling — if a request errors out, the
// chain simply stops.
function next(index) {
// termination condition
if (index >= maxUploadIterations) {
return;
}
// setup next callback
var nextCallback = next.bind(null, index + 1);
// get these from wherever
var opts = {};
var body = {};
sendOne(opts, body, nextCallback);
}
// Kick off the chain with the first index.
next(0);
If you have more work to do afterwards, you can change the termination condition to call something else, rather than return.
I know this question has been asked many times, but I can't make it work.
Here is my situation. I have a string called data, and I want to unshorten all the links inside that string.
Code:
var Bypasser = require('node-bypasser');
var URI = require('urijs');
var data = 'multiple urls : http://example.com/foo http://example.com/bar';
// NOTE(review): URI.withinString expects a SYNCHRONOUS return value, but
// decrypt() resolves later — so the value returned below is always null
// and no URL ever gets replaced. This is the bug the question is about.
var result = URI.withinString(data, function(url) {
var unshortenedUrl = null;
var w = new Bypasser(url);
w.decrypt(function(err, res) {
// How can I return res ?
unshortenedUrl = res;
});
// decrypt() is asynchronous, so at this point unshortenedUrl is still null
return unshortenedUrl;
});
Let's me walk you through the code.
URI.withinString will match all the URLs in data, manipulate it and return the result.
You can view an example from URI.js docs
What I want to do with these URLs is to unshorten all of them using node-bypasser.
This is from node-bypasser document:
var Bypasser = require('node-bypasser');
var w = new Bypasser('http://example.com/shortlink');
// decrypt() is asynchronous: the callback receives (err, result) once the
// short link has been resolved.
w.decrypt(function(err, result) {
console.log('Decrypted: ' + result);
});
This is the result that I want multiple urls : http://example.com/foo_processed http://example.com/bar_processed
I created a notebook at tonicdev.com
Solution
var getUrlRegEx = new RegExp(
    "(^|[ \t\r\n])((ftp|http|https|gopher|mailto|news|nntp|telnet|wais|file|prospero|aim|webcal):(([A-Za-z0-9$_.+!*(),;/?:#&~=-])|%[A-Fa-f0-9]{2}){2,}(#([a-zA-Z0-9][a-zA-Z0-9$_.+!*(),;/?:#&~=%-]*))?([A-Za-z0-9$_+!*();/?:~-]))"
    , "g"
);
var urls = data.match(getUrlRegEx);
// Unshorten at most 5 links concurrently.
async.forEachLimit(urls, 5, function (url, callback) {
    let w = new Bypasser(url);
    w.decrypt(function (err, res) {
        // Only substitute the unshortened URL on success...
        if (err == null && res != undefined) {
            data = data.replace(url, res);
        }
        // ...but ALWAYS signal completion. In the original, callback() was
        // only reached on success, so a single failed link stalled the
        // whole batch and the final handler never ran.
        callback();
    });
}, function (err) {
    res.send(data);
});
You don't really understand what a callback is. A callback allows asynchronous code to run without JavaScript waiting for it. If you add some debug output to your code:
console.log("Started parsing");
var result = URI.withinString(data, function(url) {
console.log("URL parsed (or whatever)");
var unshortenedUrl = null;
var w = new Bypasser(url);
// decrypt() only schedules work; its callback runs on a later tick,
// after withinString (and this whole script) has already returned.
w.decrypt(function(err, res) {
// How can I return res ?
unshortenedUrl = res;
});
// decrypt() is asynchronous, so unshortenedUrl is still null here
return unshortenedUrl;
});
console.log("Call to library over");
You would (most likely) see messages in this order:
Started parsing
Call to library over
URL parsed (or whatever)
The answer: Callback is not guaranteed to run before any code you execute after assigning it. You can't put data in your result variable because the data might not be fetched yet.
This is my first week in Node, so I'm sorry if this is a no-brainer.
The code works and does what it should. But I can't figure out how to match the name (URL) that starts http.get with the result it gets from the website.
I found this, which is almost like my problem, except that mine is a premade function so I can't edit the function and add a callback.
variable scope in asynchronous function
If I could run this code synchronous or make a callback in the http.get function it would all be good. But I don't have the skills and don't know if you even can do it.
Thanks
- Robin.
http = require('http');
// Fetches the Steam market price for every item name and logs the value.
// NOTE(review): there is no way to tell, inside the "end" handler, WHICH
// item a response belongs to — by the time it fires, the loop has finished
// and marketHashName holds the last item's value. (Each request still hits
// the right URL, because `url` is built synchronously per iteration.)
function download(name) {
// name is an array with csgo item names.
for (var i = 0; i < name.length; i++) {
var marketHashName = getGoodName(name[i]);
var url = 'http://steamcommunity.com/market/priceoverview/?currency=1&appid=730&market_hash_name=' + marketHashName;
http.get(url, function (res) {
var data = "";
res.on('data', function (chunk) {
data += chunk;
});
res.on("end", function () {
data = JSON.parse(data);
var value= 0;
// get the value in the json array
if(data.median_price) {
value = data.median_price;
}else{
value = data.lowest_price;
}
// strip the leading currency prefix (assumes a 5-char prefix — TODO confirm)
value = value.substr(5);
console.log("WEAPON",value);
// callback with name/link and value?
//callback(name,value);
});
}).on("error", function () {
// NOTE(review): errors are silently swallowed here
});
}
}
You can just add a callback argument and then call it with the final data. And, if you want to pass to the callback the particular marketHashName that was being processed, then you can create a closure to capture that uniquely for each time through the for loop:
http = require('http');

/**
 * Downloads the Steam market price for every item name and reports each
 * result through callback(theName, value, data).
 * @param {string[]} name - array of csgo item names
 * @param {function} callback - invoked once per successful response
 */
function download(name, callback) {
    for (var i = 0; i < name.length; i++) {
        var marketHashName = getGoodName(name[i]);
        // IIFE creates a closure so each async response still sees the
        // hash name it was requested with, even after the loop has moved on.
        (function (theName) {
            var url = 'http://steamcommunity.com/market/priceoverview/?currency=1&appid=730&market_hash_name=' + theName;
            http.get(url, function (res) {
                var data = "";
                res.on('data', function (chunk) {
                    data += chunk;
                });
                res.on("end", function () {
                    data = JSON.parse(data);
                    var value = 0;
                    // prefer the median price; fall back to the lowest
                    if (data.median_price) {
                        value = data.median_price;
                    } else {
                        value = data.lowest_price;
                    }
                    // strip the leading currency prefix
                    value = value.substr(5);
                    console.log("WEAPON", value);
                    // now that the async function is done, call the callback
                    // and pass it our results
                    callback(theName, value, data);
                });
            }).on("error", function () {
                // errors swallowed (as in the original); callback is never
                // invoked for failed requests
            });
        })(marketHashName); // BUG FIX: was misspelled `marketHasName` (ReferenceError)
    }
}

// sample usage (note: the first argument must be an ARRAY of item names):
download(["whatever"], function (name, value, data) {
    // put your code here to use the results
});
FYI, you may find that the request module which is a higher level set of functionality on top of the http module will save you some work.
Firstly, here is my code as I've progressed so far:
var http = require("http");
// Utility function that downloads a URL and invokes
// callback with the data.
// Fetch `url` over HTTP and hand the accumulated response body to
// `callback`; on any request error, the callback receives null instead.
function download(url, callback) {
  var onResponse = function (res) {
    var body = "";
    res.on("data", function (piece) {
      body += piece;
    });
    res.on("end", function () {
      callback(body);
    });
  };
  http.get(url, onResponse).on("error", function () {
    callback(null);
  });
}
var cheerio = require("cheerio");
var url = "http://www.bloglovin.com/en/blogs/1/2/all";
var myArray = [];
var a = 0;
// Scrapes the blog list page and stores name/link/site per entry.
// NOTE(review): `myArray[a] = [a]` creates an inner ARRAY and then hangs
// named properties off it — they work at runtime but are ignored by the
// array's length and by JSON serialization. An object literal pushed onto
// myArray would be the conventional structure (see the answers below).
var getLinks = function(){download(url, function(data) {
if (data) {
// console.log(data);
var $ = cheerio.load(data);
$(".content").each(function(i, e) {
var blogName = $(e).find(".blog-name").text();
var followLink = $(e).find("a").attr("href");
var blogSite = $(e).find(".description").text();
myArray[a] = [a];
myArray[a]["blogName"] = blogName;
myArray[a]["followLink"] = "http://www.bloglovin.com"+followLink;
myArray[a]["blogSite"] = blogSite;
a++;
// logged once per scraped entry, not once at the end
console.log(myArray);
});
}
});
}
getLinks();
As you can see, followLink is concatenated into a full URL, which I'd like to pass through the 'url' download function, so effectively I'll be scraping each of the pages using the same CSS rules, and the results will be added to the multidimensional array for the corresponding blogger.
How can I go about this?
I do something similar in one of my scraping jobs, but I use the async.js library to accomplish. Note that I'm also using the request module and cheerio.js in my scraping. I fetch and scrape rows of data from a single webpage, but suspect you could do something similar to fetch URLs and request / scrape them in the same manner.
I also admit this is quite basic coding, certainly could be optimized with a bit of refactoring. Hope it gives you some ideas at least...
First, I use request to fetch the page and call my parse function -
var url = 'http://www.target-website.com';
// NOTE(review): excerpt from a larger async.js task list — as quoted it is
// not valid standalone JS (an anonymous function expression cannot stand
// alone as a statement, and the request( call is missing its closing `);`).
function(lastCallback) {
request(url, function(err, resp, body) {
if(!err) { parsePage(err, resp, body, lastCallback); }
// NOTE(review): on error, `resp` may be undefined, so resp.statusCode
// here could itself throw — verify against the request module's contract.
else { console.log('web request error:' + resp.statusCode); }
}
}
Next, in my parsePage function, I load the website into Cheerio, fetch the HTML of each data row into an array, push my parseRow function and each HTML segment into another array, and use async.parallel to process each iteration -
var rows = [];
// Loads the scraped HTML into cheerio, wraps each table row in an async
// task, and runs all tasks in parallel before signalling completion.
function parsePage(err, resp, body, callback1) {
    var $ = cheerio.load(body);
    $('div#targetTable tr').each(function(i, elem) {
        rows.push($(this).html());
    });
    var scrRows = [];
    rows.forEach(function(row) {
        scrRows.push(function(callback2) {
            parseRow(err, resp, row);
            callback2();
        });
    });
    // BUG FIX: in the original, the forEach callback was never closed, so
    // async.parallel ended up (unparseably) inside the loop body. It must
    // run ONCE, after all tasks have been queued.
    async.parallel(scrRows, function() {
        callback1();
    });
}
Inside your loop, just create an object with the properties you scrape then push that object onto your array.
// Object literal properties are separated by COMMAS — the original used a
// semicolon after followLink, which is a syntax error.
var blogInfo = {
  blogName: blogName,
  followLink: "http://www.bloglovin.com" + followLink,
  blogSite: blogSite
};
myArray.push(blogInfo);
You have defined a = 0; So
myArray[a] = [a]; // => myArray[0] = [0]; myArray[0] becomes an array with 0 as only member in it
These statements do not actually throw — an array is an object, so string keys are simply added as named properties — but those properties are not counted by the array's length and are dropped by JSON serialization, so this is considered bad practice:
myArray[a]["blogName"] = blogName;
myArray[a]["followLink"] = "http://www.bloglovin.com"+followLink;
myArray[a]["blogSite"] = blogSite;
Instead try this:
// Push one record per scraped entry; an object literal keeps the fields
// addressable by name and survives JSON serialization.
myArray.push({
  index: a,
  blogName: blogName,
  followLink: "http://www.bloglovin.com" + followLink,
  blogSite: blogSite
});
console.log(myArray);
I made my Request object queue up individual HTTP requests and process them one by one using process.nextTick. However, I am getting an error that I don't know how to solve:
node.js:244
callback();
^
TypeError: undefined is not a function
at process.startup.processNextTick.process._tickCallback (node.js:244:9)
I'm not sure what I am doing wrong. Here is the relevant class.
var Request = function() {
return this;
};
Request.prototype = {
queue_: []
};
Request.prototype.send = function(url, done) {
this.queue_.push(new QueueableRequest(url, done));
this.processRequest_();
}
Request.prototype.processRequest_ = function() {
if (this.queue_.length > 0) {
var request = this.queue_.shift();
var data = '';
http.get(request.url_, function(res) {
res.setEncoding('utf8');
res.on('data', function(chunk) {
data += chunk;
}).on('end', function() {
request.callback_(null, JSON.parse(data));
process.nextTick(this.processRequest_);
}).on('error', function(err) {
request.callback_(err, null);
process.nextTick(this.processRequest_);
});
});
}
}
My other question is whether this is a good method to slowing down my HTTP requests? What I am trying to do is this... I make an HTTP request for a list of threads (about 15-20), and then for each thread, I make another request to obtain its replies. Sometimes within replies, I have to make another request for the deeply nested replies. My initial solution was simply call http.get for every request, but I find that my node.js stops responding after a few requests and I have to keep restarting the server and refreshing the page. My thought was that I am perhaps sending too many requests at once, so I tried to implement this queue.
Your this inside your event handlers is incorrect, so your this.processRequest_ is undefined.
// Processes the queue one request at a time; `self` pins the Request
// instance so the nested handlers can schedule the next item correctly.
Request.prototype.processRequest_ = function() {
// Assign the outer request object to a variable so you can access it.
var self = this;
if (this.queue_.length > 0) {
var request = this.queue_.shift();
var data = '';
http.get(request.url_, function(res) {
res.setEncoding('utf8');
res.on('data', function(chunk) {
data += chunk;
}).on('end', function() {
// NOTE(review): JSON.parse will throw here on a non-JSON body — confirm
// upstream guarantees, or wrap in try/catch.
request.callback_(null, JSON.parse(data));
process.nextTick(function(){
// Call 'processRequest_' on the correct object.
self.processRequest_()
});
}).on('error', function(err) {
request.callback_(err, null);
process.nextTick(function(){
// Call 'processRequest_' on the correct object.
self.processRequest_()
});
});
});
}
}
That said, you might consider using the request module to simplify this.
var request = require('request');
// Same sequential queue processing, delegating the HTTP work (and body
// accumulation) to the request module.
Request.prototype.processRequest_ = function() {
    var self = this;
    if (this.queue_.length > 0) {
        var requestData = this.queue_.shift();
        request(requestData.url_, function(error, response, body){
            // BUG FIX: the parameter is named `error` — the original
            // referenced `err` here, which was a ReferenceError.
            requestData.callback_(error, error ? null : JSON.parse(body));
            process.nextTick(function(){
                self.processRequest_();
            });
        });
    }
};