So right now I'm trying to use Node.js to access files in order to write them to a server and process them.
I've split it into the following steps:
Traverse directories to generate an array of all of the file paths
Put the raw text data from each of the file paths in another array
Process the raw data
The first two steps are working fine, using these functions:
var walk = function(dir, done) {
    var results = [];
    fs.readdir(dir, function(err, list) {
        if (err) return done(err);
        var pending = list.length;
        if (!pending) return done(null, results);
        list.forEach(function(file) {
            file = path.resolve(dir, file);
            fs.stat(file, function(err, stat) {
                if (stat && stat.isDirectory()) {
                    walk(file, function(err, res) {
                        results = results.concat(res);
                        if (!--pending) done(null, results);
                    });
                } else {
                    results.push(file);
                    if (!--pending) done(null, results);
                }
            });
        });
    });
};
function processfilepaths(callback) {
    // reading each file
    for (var k in filepaths) { if (arrayHasOwnIndex(filepaths, k)) {
        fs.readFile(filepaths[k], function (err, data) {
            if (err) throw err;
            rawdata[k] = data.toString().split(/ *[\t\r\n\v\f]+/g);
            for (var j in rawdata[k]) { if (arrayHasOwnIndex(rawdata[k], j)) {
                rawdata[k][j] = rawdata[k][j].split(/: *|: +/);
            }}
        });
    }}
    if (callback) callback();
}
Obviously, I want to call the function processrawdata() after all of the data has been loaded. However, using callbacks doesn't seem to work.
walk(rootdirectory, function(err, results) {
    if (err) throw err;
    filepaths = results.slice();
    processfilepaths(processrawdata);
});
This never causes an error. Everything seems to run perfectly, except that processrawdata() always finishes before processfilepaths() does. What am I doing wrong?
You are having a problem with callback invocation and asynchronous function calls. I'd recommend that you use a library such as after-all to execute a callback once all your functions have executed.
Here's an example: the function done will be called once all the functions wrapped with next() have been called.
var afterAll = require('after-all');

// Call `done` once all the functions
// wrapped with next() get called
next = afterAll(done);

// first execute this
setTimeout(next(function() {
    console.log('Step two.');
}), 500);

// then this
setTimeout(next(function() {
    console.log('Step one.');
}), 100);

function done() {
    console.log("Yay we're done!");
}
I think for your problem, you can use the async module for Node.js:
async.series([
    function(){ ... },
    function(){ ... }
]);
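For instance, here is a rough sketch of how the question's flow could map onto async.series (reusing walk, rootdirectory, filepaths, and processrawdata from the question; each task gets a next callback it must call when its work is really finished, and the file reads inside processfilepaths would still need the same treatment):

var async = require('async');

async.series([
    function (next) {
        // steps 1 and 2: walk the tree, then stash the file paths
        walk(rootdirectory, function (err, results) {
            if (err) return next(err);
            filepaths = results.slice();
            next();
        });
    },
    function (next) {
        // step 3: runs only after the first task has called next()
        processrawdata();
        next();
    }
], function (err) {
    if (err) throw err;
});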
To answer your actual question, I need to explain how Node.js works:
Say, when you call an async operation (say, a MySQL db query), Node.js sends "execute this query" to MySQL. Since this query will take some time (maybe some milliseconds), Node.js performs the query using the MySQL async library, getting back to the event loop and doing something else there (like handling that HTTP request) while waiting for MySQL to get back to us.
So, in your case, both functions are independent and execute almost in parallel.
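A minimal illustration of that ordering (the file name here is hypothetical; the point is only that the callback runs after the surrounding code):

var fs = require('fs');

fs.readFile('example.txt', function (err, data) {
    console.log('second: runs later, when the I/O completes');
});
console.log('first: runs right after the read is scheduled');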
For more information:
Async.js for use with Node.js
function processfilepaths(callback) {
    // reading each file
    for (var k in filepaths) { if (arrayHasOwnIndex(filepaths, k)) {
        fs.readFile(filepaths[k], function (err, data) {
            if (err) throw err;
            rawdata[k] = data.toString().split(/ *[\t\r\n\v\f]+/g);
            for (var j in rawdata[k]) { if (arrayHasOwnIndex(rawdata[k], j)) {
                rawdata[k][j] = rawdata[k][j].split(/: *|: +/);
            }}
        });
    }}
    if (callback) callback();
}
Realize that you have:
for
    readfile (err, callback) { ... }
if ...
Node will call each readFile asynchronously, which only sets up the event and callback; then, when it is done scheduling each readFile, it will do the if, before any callback has even had a chance to be invoked.
You need to use either promises or a control-flow module like async to coordinate it. What you would then do looks like:
async.XXXX(filepaths, processRawData,
    function (err, ...) {
        // function for when all are done
        if (callback) callback();
    }
);
Where XXXX is one of the functions from the library like series, parallel, each, etc. The only other thing you need to know is that in your process-raw-data function, async gives you a callback to call when done. Unless you really need sequential access (I don't think you do), use parallel so that you can queue up as many I/O events as possible; it should execute faster, maybe only marginally, but it'll better leverage the hardware.
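For example, here is a minimal sketch of the read step with async.forEachOf (assuming a reasonably recent version of the async library and the filepaths/rawdata arrays from the question):

var async = require('async');
var fs = require('fs');

function processfilepaths(callback) {
    async.forEachOf(filepaths, function (filepath, k, done) {
        fs.readFile(filepath, function (err, data) {
            if (err) return done(err);
            rawdata[k] = data.toString().split(/ *[\t\r\n\v\f]+/g).map(function (line) {
                return line.split(/: *|: +/);
            });
            done();
        });
    }, function (err) {
        if (err) throw err;
        if (callback) callback(); // now runs only after every file has been read
    });
}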
I'm just starting to work with JavaScript and Node, and async and callback concepts are not something I have under control yet.
I have to call a function for each element of a documents array. This function calls the DB and gets me an array of the document's annotations. I want to gather all the annotations and put them in the same array. Something similar to this:
//function in an async waterfall
function(docs, callback){
    let annotationsArray = [];
    async.each(docs, (doc, callback2) => {
        getAnnotationsFromDocument(doc.Id, callback2);
    }, function (err, annotations){
        if (err){
            callback(err);
        }
        annotationsArray = annotationsArray.concat(annotations);
        callback(null, annotationsArray);
    });
},
//Next waterfall function
About the getAnnotationsFromDocument function, this is a simplified structure of it:
function getAnnotationsFromDocument(docId, callback){
    initDB();
    var async = require('async');
    async.waterfall([
        function authorize(callback){
            //checkAuthorization
            (...)
        },
        function getRfpdocAnnotations(auth, metadata, callback){
            //call to DB
            (...)
        },
        function processRfpdocAnnotations(rfpDocAnnotations, metadata, callback){
            (...)
            callback(null, annotationsList);
        }
    ], function (err, result) {
        if(err) {
            callback(err);
        } else {
            callback(null, result);
        }
    });
}
Unfortunately, I'm unable to code it properly. I'm unable to get the results from the function before exiting the async.each. Could somebody explain to me how to structure the code for this?
Debugging, I've found that getAnnotationsFromDocument gets the data and executes the final callback(null, result); properly, but by the time I get to function (err, annotations){, annotations is undefined.
Ok, I think I got it:
The first problem was that async.each doesn't return the results in its final callback like I was expecting; unlike waterfall, it only passes along errors. I should have paid more attention reading the documentation.
Secondly, I had to pass a callback to the getAnnotationsFromDocument call to process the results.
And finally, I was not invoking async.each's per-item callback, so execution never reached async.each's final callback and didn't continue to the next async.waterfall function.
To be quite honest, I'm not sure it's a correct answer, but it does what I was trying to achieve.
// function part of an async.waterfall
function(docs, callback){
    let annotationsArray = [];
    async.each(docs, (doc, callback2) => {
        getAnnotationsFromDocument(doc._id, function(err, result){
            if (err){
                return callback2(err); // return so callback2 isn't called a second time below
            }
            annotationsArray = annotationsArray.concat(result);
            callback2();
        })
    }, (err) => {
        if( err ) {
            callback(err);
        } else {
            callback(null, annotationsArray); //to the next waterfall function
        }
    });
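As a side note, async.map would hand back the collected results directly and avoid the shared array; here is a sketch under the same assumptions as the code above:

async.map(docs, (doc, callback2) => {
    getAnnotationsFromDocument(doc._id, callback2);
}, (err, results) => {
    if (err) return callback(err);
    // results is one annotations array per document; flatten before passing on
    callback(null, [].concat.apply([], results));
});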
I'm trying to iterate through a list of items and do some actions on them by calling an API, like in this example:
for (var i = 0; i < arr.length; i++) {
    if (arr[i].id == 42) {
        api.requestAction(arr[i].id, function(error, response){ });
    }
}
The problem is that the loop obviously ends before all the requests are done, and the program exits. What should I do to manage this? I saw the Promise approach but don't really know how I can use it in this case, or maybe there's another solution.
Thank you in advance!
With node-fetch (a promisified HTTP API) together with async/await, you can halt the for loop until each request is done, but this requires Node 7+ with the --harmony-async-await flag added:
const fetch = require('node-fetch')

async function foo() {
    for (let item of arr) {
        if (item.id == 42) {
            let res = await fetch(url)
            let body = await res.text()
            console.log(body)
        }
    }
    console.log('done (after request)')
}
Now, every time you add the async keyword in front of a function, it will always return a promise that resolves/rejects when everything is done:
foo().then(done, fail)
Alternatively, you can just wrap your API function in a promise if you don't want to install node-fetch (this replaces the fetch calls inside the loop):
await new Promise((rs, rj) => {
    api.requestAction(item.id, function(error, response){
        error ? rj(error) : rs(response)
    })
})
Install bluebird
npm install bluebird --save
Code
//require npm
var Promise = require("bluebird");

//code
//"promisify" converts a traditional callback function into a Promise-based function
var _requestAction = Promise.promisify(api.requestAction);

//loop over array
Promise.map(arr, function (value) {
    if (value.id == 42) {
        //async request
        return _requestAction(value.id).then(function (_result) {
            //success
            console.log(_result);
        }).catch(function (e) {
            //error
            console.error(e);
        });
    }
});
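Since Promise.map itself returns a promise that resolves once every mapped item has settled, you can also chain on the call above to run code after all the requests, e.g.:

Promise.map(arr, function (value) {
    // ...same body as above...
}).then(function () {
    console.log("all requests finished");
});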
You could use async.js. It's an asynchronous control-flow library which provides flows for things like sequential loops, looping in parallel, and many other common flow-control mechanisms; check it out.
See the code below; it assumes that your variable 'arr' is defined somewhere in scope.
npm install async
var async = require("async");
//Loop through each item and run your
//asynchronous function for each one
//NOTE: This does not loop sequentially; if you want that with async.js, use eachSeries
async.each(arr,
    //item is the current item being iterated over;
    //callback is the callback you call to finish the current iteration, and it accepts an error parameter: callback(error)
    function (item, callback) {
        api.requestAction(item.id, function(error, response){
            //Check for any errors...
            if (error) return callback(error);
            callback(null);
        });
    },
    function (err) {
        //You've now finished the loop
        if (err) {
            //Do something, an error was passed to the callback
            //in one of the loop's iterations
        }
        //No errors, move on with your code..
    });
Use Bluebird Promises:
var Promise = require('bluebird');
// api.requestAction is callback-based, so promisify it first
// (otherwise the mapper returns undefined and nothing is awaited)
var requestActionAsync = Promise.promisify(api.requestAction);

Promise.map(arrayOfIds, function(item){
    return requestActionAsync(item);
})
.then(function(response){
    // all the requests are resolved here
})
If you want sequential execution over the ids, then use Promise.mapSeries (it is slower, as it waits for each task to finish before starting the next).
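A minimal sketch of the sequential variant, reusing the promisified function from above:

Promise.mapSeries(arrayOfIds, function (item) {
    // each request starts only after the previous one has resolved
    return requestActionAsync(item);
}).then(function (responses) {
    // responses arrive in the same order as arrayOfIds
});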
I'm trying out the Node.js framework on one of my projects.
I'm really seeing some good advantages in what they call the "event-driven, non-blocking I/O model"; however, in my project there are some moments where I don't necessarily want asynchronous calls, and I want to be able to do several operations before launching an asynchronous call.
Especially when I want to factor out code and create reusable functions.
Typically I have the following case:
I know that in several parts of my program I have to check whether a media item exists in my database for a given string or id.
So, as a guy who tries to stay organized, I want to create a function that I will call each time I need this check.
However, I have not found a way to do that with Node.js and pg (the npm PostgreSQL library, https://github.com/brianc/node-postgres/). Indeed, there is always a callback in the function, and the return value is null because of the callback. Here is an example below:
/*
 Function which is supposed to check if a media exists
*/
function is_media_existing (url_or_id){
    log.debug("is_media_existing : begin of the function", {"Parameter" : url_or_id});
    pg.connect(connectionstring, function (err, client, done) {
        if (err) {
            log.warning("is_media_existing : Problem with Database connection", {
                "Parameter": url_or_id,
                "Error": err
            });
        }
        if (isNaN(url_or_id)) {
            // Case: parameter is not a number (string)
            var query = client.query('SELECT COUNT(*) as count FROM media WHERE url = $1::string ', url_or_id);
            query.on('error', function (error) {
                log.warning("is_media_existing : Problem with Database query (connection to db passed but not query " +
                    "", {"Parameter": url_or_id, "Error": error});
            });
            return query;
        } else {
            // Case: parameter is an int
            log.debug("is_media_existing : Type of Parameter is a string");
            var query = client.query('SELECT COUNT(*) as count FROM media WHERE id = $1::id ', url_or_id);
            query.on('error', function (error) {
                log.warning("is_media_existing : Problem with Database query (connection to db passed but not query " +
                    "", {"Parameter": url_or_id, "Error": error});
            });
            return query;
        }
    });
}

// Executing the function
var test = is_media_existing("http://random_url_existing_in_db");
// test is always null as the return is in a callback and the callback is asynchronous
I have the feeling my question touches the core concepts of Node.js, and perhaps my approach is wrong; I apologize in advance.
I know it's not good to wait for a response before doing something.
But what's the alternative? How can I factor my code into functions when I need some functionality in several parts of my code?
So if there is anyone who could explain how to do that following programming best practices, it would be great.
Thanks
Anselme
As Cody says, you probably don't want a synchronous function.
The way you should handle the situation in your example is to pass in your own callback, like this:
function is_media_existing (url_or_id, callback){
and then, instead of return query;, use your callback like this:
callback(query);
or, probably better, follow the Node convention for callback functions of taking two parameters (err, result), so your callback would look like this:
callback(null, query);
Here is a rework of your sample:
function is_media_existing (url_or_id, callback){ /* callback(err, result) */
    log.debug("is_media_existing : begin of the function", {"Parameter" : url_or_id});
    pg.connect(connectionstring, function (err, client, done) {
        if (err) {
            done(err);
            log.warning("is_media_existing : Problem with Database connection", {
                "Parameter": url_or_id,
                "Error": err
            });
            return callback(err, null);
            /* note that this return is simply used to exit the connect's callback and the return value is typically
             * not used; it is the call to callback() that returns the error value */
        }
        var qrystr;
        if (isNaN(url_or_id)) {
            log.debug("is_media_existing : Type of Parameter is a string");
            qrystr = 'SELECT COUNT(*) as count FROM media WHERE url = $1::string;';
        } else {
            qrystr = 'SELECT COUNT(*) as count FROM media WHERE id = $1::id;';
        }
        client.query(qrystr, [url_or_id], function(err, result){
            done();
            if(err){
                /* .. */
            }
            callback(err, result);
        });
    });
}

// Executing the function
var test = is_media_existing("http://random_url_existing_in_db", function(err, result){
    if(err){
    }else {
    }
});
If you end up with a hard nest of callbacks, promises are really worth looking into.
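For instance, here is a minimal sketch wrapping the reworked function above in a promise (the names here are just illustrative):

function isMediaExistingAsync(url_or_id) {
    return new Promise(function (resolve, reject) {
        is_media_existing(url_or_id, function (err, result) {
            if (err) return reject(err);
            resolve(result);
        });
    });
}

isMediaExistingAsync("http://random_url_existing_in_db")
    .then(function (result) { /* use the count from the query result here */ })
    .catch(function (err) { /* handle the error */ });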
I don't think you really do want a synchronous call. The problem with synchronous calls in Node is that they stop the entire process from doing anything while the synchronous function is running, as it blocks the event loop. As an example, let's say your sync function takes 2 seconds to complete. Your server will then do nothing for 2 full seconds. Those 2 seconds include everything (accepting new connections, everything else, etc.). The reason blocking functions don't exist is because they are (very) bad. Here is an example of how your function will react in an async manner.
is_media_existing("http://random_url_existing_in_db", function(exists){
    if (exists){
        //do stuff
    } else {
        //do this other stuff
    }
});
Then within is_media_existing you will need to call that callback function when your query completes.
//pseudo
function is_media_existing(url, callback){
    query('select COUNT(*) as count FROM media WHERE id = $1::id', [url], function(err, result){
        if (err)
            callback(false)
        else
            callback(result.count > 0)
    })
}
With the new ES6-plus async stuff and Babel it's simpler. You can npm i -g babel and npm i babel-runtime, then compile and run the following with babel test.js --optional runtime --stage 2 | node. Please read the following example carefully to see how to adapt it to your use case:
let testData = [
    { id: 0, childIds: [1, 2] },
    { id: 1, childIds: [] }
];

function dbGet(ids) {
    return new Promise( r => {
        // this is an example; you could do any db
        // query here and call r with the results
        r(ids.map((id) => { return testData[id]; }));
    });
}

async function dbExists(ids) {
    let found = await dbGet(ids);
    return (found && found.length > 0);
}

async function test() {
    var exists = await dbExists([0]);
    console.log(exists);
}

test().then(f => {}).catch(e => { console.log('e', e); });
I am trying to solve an exam problem, so I cannot post my exam code as it is. Instead, I have simplified it so that it addresses the core concept I do not understand. Basically, I do not know how to slow down Node's asynchronous execution so that my Mongo code can catch up with it. Here is the code:
MongoClient.connect('mongodb://localhost:27017/somedb', function(err, db) {
    if (err) throw err;
    var orphans = [];
    for (var i = 0; i < 100000; i++) {
        var query = { 'images' : i };
        db.collection('albums').findOne(query, function(err, doc_album) {
            if(err) throw err;
            if (doc_album === null) {
                orphans.push(i);
            }
        });
    }
    console.dir(orphans.length);
    return db.close();
});
So I am trying to create an array of those images that do not match my query criteria. I end up with an orphans.length value of 0, since Node does not wait for the callbacks to finish. How can I modify the code such that the callbacks finish executing before I count the number of images in the array that did not meet my query criteria?
Thanks in advance for your time.
Bharat
I assume you want to do 100,000 parallel DB calls. To "wait" for the completion of all the calls, we increase a finished-calls counter in each call's callback and invoke the main callback when the last one finishes. Note that a very common mistake here is to use the for-loop variable as a closure inside the callback. This does not work as expected, because all 100,000 handlers are scheduled first, and by the time the first one executes, the loop variable holds the same, maximum value for all of them.
function getOrphans(cb) {
    MongoClient.connect('mongodb://localhost:27017/somedb', function(err, db) {
        if (err) cb(err);
        var orphans = [];
        var numResponses = 0;
        var maxIndex = 100000;
        for (var i = 0; i < maxIndex; i++) {
            // problem: by the time you get a reply, "i" would be 100000.
            // closure variable changed to function argument:
            (function(index) {
                var query = { 'images' : index };
                db.collection('albums').findOne(query, function(err, doc_album) {
                    numResponses++;
                    if(err) cb(err);
                    if (doc_album === null) {
                        orphans.push(index);
                    }
                    if (numResponses == maxIndex) {
                        db.close();
                        cb(null, orphans);
                    }
                });
            })(i); // this is an immediately-invoked function expression
        }
    });
}

getOrphans(function(err, o) {
    if (err)
        return console.log('error:', err);
    console.log(o.length);
});
I'm not suggesting this is the best way to handle this specific problem in Mongo, but if you need to wait for the DB to reply before continuing, then just use the callback to start the next request.
This is not obvious at first, but you can refer to the result processing function inside the function itself:
var i = 0;
var mycback = function(err, doc_album) {
    // ... process i-th result ...
    if (++i < 100000) {
        db.collection('albums').findOne({'images': i}, mycback);
    } else {
        // request is complete, "return" result
        result_cback(null, res);
    }
};
db.collection('albums').findOne({'images': 0}, mycback);
This also means that your function itself will be async (i.e. it will want a result_cback parameter to call with the result instead of using return).
Writing a sync function that calls an async one is just not possible.
You cannot "wait" for an event in JavaScript... you must set up a handler for the result and then terminate.
Waiting for an event is done in event-based processing by writing a "nested event loop", and this is, for example, how message boxes are handled in most GUI frameworks. This is a capability that JavaScript's designers didn't want to give to programmers (not really sure why, though).
Since you know it does not wait for the calls to come back, you can do the console.dir inside your callback function; this should work (although I haven't tested it):
db.collection('albums').findOne(query, function(err, doc_album) {
    if(err) throw err;
    if (doc_album === null) {
        orphans.push(i);
    }
    console.dir(orphans.length);
});
You don't need to slow anything down. If you are simply trying to load 100,000 images from the albums collection, you could consider using the async framework. This lets you queue up tasks until the job is complete.
Also, you probably don't want to request 100,000 records one by one. Instead, you probably want to page them.
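For instance, here is a rough sketch with async's eachLimit, which caps how many queries are in flight at once (the limit of 100 is just an illustrative choice):

var async = require('async');

var ids = [];
for (var i = 0; i < 100000; i++) ids.push(i);

var orphans = [];
async.eachLimit(ids, 100, function (id, done) {
    db.collection('albums').findOne({ 'images': id }, function (err, doc_album) {
        if (err) return done(err);
        if (doc_album === null) orphans.push(id);
        done();
    });
}, function (err) {
    if (err) throw err;
    console.dir(orphans.length); // runs only after every query has replied
    db.close();
});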
I have the following code in my nodejs application:
function someFileOperation(callback) {
    var files = ...;
    files.forEach(function (file) {
        doSomethingAsync(file, function (err, result) {
        });
    });
}
What is an elegant way to call the callback of someFileOperation() once all doSomethingAsync() calls have invoked their callbacks, and to call it only once if an error occurs in doSomethingAsync()?
For now I came up with something like this:
function someFileOperation(callback) {
    var files = ...;
    var noFiles = files.length;
    files.forEach(function (file) {
        doSomethingAsync(file, function (err, result) {
            if (err) {
                callback(err);
                callback = function () {}; // I don't want it to be called again
            } else if (--noFiles <= 0) {
                callback(null, true);
            }
        });
    });
}
But I think this is a lot of overhead for such a simple task, so I am looking for a much more elegant way, or maybe a little framework for these kinds of problems.
Use async.map or async.forEach; see here: https://github.com/caolan/async#map and https://github.com/caolan/async#forEach
The async.map method takes an array of items and performs the same async call for each item in your array in parallel. If no errors are encountered, it will call its final callback with a new array of results.
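Applied to the question's function, a minimal sketch could look like this (assuming doSomethingAsync follows the usual (err, result) callback convention):

var async = require('async');

function someFileOperation(callback) {
    var files = [/* ... */];
    async.map(files, doSomethingAsync, function (err, results) {
        // called exactly once: either on the first error,
        // or with all results once every call has finished
        callback(err, results);
    });
}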