Calculating when multiple writes to a file will cause inaccuracies? - javascript

In my node server I have a variable,
var clicks = 0;
Each time a user clicks in the webapp, a websocket event sends a message. On the server:
clicks++;
if (clicks % 10 == 0) {
    saveClicks();
}
function saveClicks() {
    var placementData = JSON.stringify({'clicks': clicks});
    fs.writeFile(__dirname + '/clicks.json', placementData, function(err) {
    });
}
At what rate do I have to start worrying about overwrites? How would I calculate this?
(I'm looking at creating a MongoDB JSON object for each click, but I'm curious what a native solution can offer.)

From the node.js doc for fs.writeFile():
Note that it is unsafe to use fs.writeFile() multiple times on the
same file without waiting for the callback. For this scenario,
fs.createWriteStream() is strongly recommended.
This isn't a math problem where you figure out when it might go wrong - it's just code that risks a conflict under circumstances that cannot be predicted. The node.js doc clearly states that this can cause a conflict.
To make sure you don't have a conflict, write the code in a different way so a conflict cannot happen.
If you want to make sure that all writes happen in the proper order of incoming requests, so the last request to arrive is always the one that ends up in the file, then you may need to queue your data as it arrives (so order is preserved), open the file for exclusive access so no other request can write while the prior write is still in progress, and handle contention errors appropriately.
This is something databases mostly handle for you automatically, so it may be one reason to use a database.
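Since the question mentions MongoDB: for a click counter, an atomic increment on the database side avoids the whole read-modify-write problem. A minimal sketch with the official mongodb driver (the 'counters' collection and field names here are made up for illustration):
// Sketch only: assumes `db` is an already-connected Db from the official `mongodb` driver.
function recordClick(db, callback) {
    db.collection('counters').updateOne(
        { _id: 'clicks' },
        { $inc: { clicks: 1 } },  // atomic server-side increment, safe under concurrent requests
        { upsert: true },         // create the counter document on first use
        callback
    );
}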
Assuming you aren't using clustering (and thus don't have multiple processes trying to write to this file), and that you just want to make sure the last value sent is the one that ends up in the file, you could do something like this:
var saveClicks = (function() {
    var isWriting = false;
    var lastData;
    return function() {
        // always save most recent data here
        lastData = JSON.stringify({'clicks': clicks});
        if (!isWriting) {
            writeData(lastData);
        }
        function writeData(data) {
            isWriting = true;
            lastData = null;
            fs.writeFile(__dirname + '/clicks.json', data, function(err) {
                isWriting = false;
                if (err) {
                    // decide what to do if an error occurs
                }
                // if more data arrived while we were writing this, then write it now
                if (lastData) {
                    writeData(lastData);
                }
            });
        }
    };
})();
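For completeness, a sketch of how this might be wired into the websocket handler (the 'click' event name is an assumption, not from the question):
socket.on('click', function() {
    clicks++;
    if (clicks % 10 === 0) {
        saveClicks();  // safe to call at any rate; writes are serialized inside saveClicks
    }
});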

@jfriend00 is definitely right about createWriteStream and has already made the point about a database, and everything's pretty much been said, but I would like to emphasize the database point because the file-saving approach seems odd to me.
So, use databases.
Not only would this save you from the headache of tracking such things, it would also significantly speed things up (remember that in Node all of these file writes are scheduled from a single thread, so if one of them takes ages it can drag down the overall performance).
Redis is a perfect fit for key-value data, so you can store things like clicks per user in a Redis database, which you'll probably end up running alongside your app anyway once you get enough traffic :)
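For a click counter specifically, Redis also has an atomic INCR command, so concurrent increments can't clobber each other. A minimal sketch with the callback-style redis client (the key name is just illustrative):
var redis = require('redis');
var client = redis.createClient();

function recordClick(userId, callback) {
    // INCR is atomic on the Redis server, so concurrent clicks are counted correctly
    client.incr('clicks:' + userId, callback);
}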
If you're not convinced yet, take a look at this simple benchmark:
Redis:
var async = require('async');
var redis = require("redis"),
    client = redis.createClient();
console.time("To Redis");
async.mapLimit(new Array(100000).fill(0), 1, (el, cb) => client.set("./test", 777, cb), () => {
    console.timeEnd("To Redis");
});
To Redis: 5410.383ms
fs:
var async = require('async');
var fs = require('fs');
console.time("To file");
async.mapLimit(new Array(100000).fill(0), 1, (el, cb) => fs.writeFile("./test", "777", cb), () => {
    console.timeEnd("To file");
});
To file: 20344.749ms
And, by the way, you can significantly increase the number of clicks after which the progress is saved (it's 10 now) by also hooking this "click-saver" into the socket's disconnect handler (socket.on('disconnect', ...)).
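Something like this, reusing the same saveClicks function from above:
socket.on('disconnect', function() {
    saveClicks();  // flush the latest count when the user leaves, so periodic saves can be less frequent
});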

Related

How to run an infinite blocking process in NodeJS?

I have a set of API endpoints in Express. One of them receives a request and starts a long running process that blocks other incoming Express requests.
My goal is to make this process non-blocking. To better understand the inner logic of the Node event loop and how I can do this properly, I want to replace this long-running function with a dummy long-running, blocking function that starts when I send a request to its endpoint.
I suppose that different ways of making the dummy function block could cause Node to manage the blocking differently.
So, my question is - how can I make a basic blocking process as a function that would run infinitely?
You can use node-webworker-threads.
var Worker, i$, x$, spin;
Worker = require('webworker-threads').Worker;
for (i$ = 0; i$ < 5; ++i$) {
    x$ = new Worker(fn$);
    x$.onmessage = fn1$;
    x$.postMessage(Math.ceil(Math.random() * 30));
}
(spin = function(){
    return setImmediate(spin);
})();
function fn$(){
    var fibo;
    fibo = function(n){
        if (n > 1) {
            return fibo(n - 1) + fibo(n - 2);
        } else {
            return 1;
        }
    };
    return this.onmessage = function(arg$){
        var data;
        data = arg$.data;
        return postMessage(fibo(data));
    };
}
function fn1$(arg$){
    var data;
    data = arg$.data;
    console.log("[" + this.thread.id + "] " + data);
    return this.postMessage(Math.ceil(Math.random() * 30));
}
https://github.com/audreyt/node-webworker-threads
So, my question is - how can I make a basic blocking process as a function that would run infinitely?
function block() {
    // not sure why you need that though
    while (true);
}
I suppose, that different ways of making the dummy function blocking could cause Node manage these blockings differently.
Not really. I can't think of a "special way" to block the engine differently.
My goal to make this process non-blocking.
If it is really that long-running, you should offload it to another thread.
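For example, with Node's built-in worker_threads module (available without a flag since Node 11.7+; an alternative to the webworker-threads package shown above). A minimal single-file sketch:
// The heavy loop runs in a worker thread, so the main event loop stays responsive.
const { Worker, isMainThread, parentPort, workerData } = require('worker_threads');

if (isMainThread) {
    const worker = new Worker(__filename, { workerData: 42 });
    worker.on('message', (result) => console.log('worker finished:', result));
    worker.on('error', (err) => console.error(err));
} else {
    // this branch runs inside the worker thread
    let result = 1;
    for (let i = 2; i <= workerData; i++) {
        result *= i;  // stand-in for your long-running, blocking computation
    }
    parentPort.postMessage(result);
}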
There are shortcut ways to do a quick fix if it's a one-time thing; you can do it using an npm module that does the job.
But the right way to do it is to set up a common design pattern called 'Work Queues'. You will need a queuing mechanism, like RabbitMQ, ZeroMQ, etc. How it works is: whenever you get a computation-heavy task, instead of doing it in the same thread, you send it to the queue with the relevant id values. A separate node process, commonly called a 'worker' process, listens for new jobs on the queue and processes them as they arrive. This is the worker queue pattern and you can read up on it here:
https://www.rabbitmq.com/tutorials/tutorial-one-javascript.html
I would strongly advise you to learn this pattern, as you will come across many tasks that require this kind of mechanism. Also, with this in place you can scale your node servers and your workers independently.
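As a rough sketch of the producer side using the amqplib client from that tutorial (the queue name and job shape are made up for illustration):
// Producer sketch: push a heavy job onto a RabbitMQ queue instead of computing it in the web process.
const amqp = require('amqplib');

async function enqueueJob(job) {
    const conn = await amqp.connect('amqp://localhost');    // assumes a local broker
    const ch = await conn.createChannel();
    await ch.assertQueue('heavy_jobs', { durable: true });  // 'heavy_jobs' is an illustrative name
    ch.sendToQueue('heavy_jobs', Buffer.from(JSON.stringify(job)), { persistent: true });
    await ch.close();
    await conn.close();
}

// A separate worker process would assertQueue('heavy_jobs'), ch.consume() each message,
// do the expensive work there, and ch.ack(msg) when finished.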
I am not sure what exactly your 'long processing' is, but in general you can approach this kind of problem in two different ways.
Option 1:
Use the webworker-threads module as @serkan pointed out. The usual 'thread' limitations apply in this scenario. You will need to communicate with the Worker in messages.
This method should be preferable only when the logic is too complicated to be broken down into smaller independent problems (explained in option 2). Depending on complexity you should also consider if native code would better serve the purpose.
Option 2:
Break down the problem into smaller problems. Solve a part of the problem, schedule the next part to be executed later, and yield to let NodeJS process other events.
For example, consider calculating the factorial of a number.
Sync way:
function factorial(inputNum) {
    let result = 1;
    while (inputNum) {
        result = result * inputNum;
        inputNum--;
    }
    return result;
}
Async way:
function factorial(inputNum) {
    return new Promise(resolve => {
        let result = 1;
        const calcFactOneLevel = () => {
            result = result * inputNum;
            inputNum--;
            if (inputNum) {
                // setImmediate yields back to the event loop between steps;
                // a recursive process.nextTick here would starve pending I/O instead
                return setImmediate(calcFactOneLevel);
            }
            resolve(result);
        };
        calcFactOneLevel();
    });
}
The code in the second example will not block the node process. You can send the response when the returned promise resolves.
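For example, in an Express route it could be used like this (the route path is illustrative):
app.get('/factorial/:n', function(req, res) {
    factorial(parseInt(req.params.n, 10)).then(result => {
        res.json({ result: result });  // other requests keep being serviced while this computes
    });
});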

Semaphore equivalent in Node js , variable getting modified in concurrent request?

I have been facing this issue for the past week and I am just confused by it.
Keeping it short and simple to explain the problem.
We have an in-memory model which stores values like budget etc. Now when a call is made to the API it has a spend associated with it.
We then check the in-memory model, add the spend to the existing spend, and then check it against the budget; if it exceeds the budget we do not accept any more clicks for that model. For each call we also update the db, but that is an async operation.
A short example
api.get('/clk/:spent/:id', function(req, res) {
    checkbudget(spent, id);
});
checkbudget(spent, id) {
    var obj = in memory model[id]
    obj.spent += spent;
    obj.spent > obj.budget  // if greater,
    obj.status = 11         // 11 is the stopped status
    update db and rebuild model.
}
This used to work fine, but now with concurrent requests we are getting false spends: the spend increases beyond the budget and it only stops after some time. We simulated the calls with JMeter and found this.
As far as we can tell, node is async, so by the time the status is updated to 11, many requests have already updated the spend for the campaign.
How can I have semaphore-like logic in Node.js so that the spend stays in sync with the budget in the model?
Update:
db.addSpend(campaignId, spent, function(err, data) {
    campaign.spent += spent;
    var totalSpent = (+camp.spent) + (+camp.cpb);
    if (totalSpent > camp.budget) {
        logger.info('Stopping it..');
        camp.status = 11; // in-memory stop
        var History = [];
        History.push(/* some data */);
        db.stopCamp(campId, function(err, data) {
            if (err) {
                logger.error('Error while stopping');
            }
            model.campMAP = buildCatMap(model);
            model.campKeyMap = buildKeyMap(model);
            db.campEventHistory(cpcHistory, false, function(err) {
                if (err) {
                    logger.error(err);
                }
            });
        });
    }
});
That's the gist of the code. Can anyone help, please?
Q: Is there semaphore or equivalent in NodeJs?
A: No.
Q: Then how do NodeJs users deal with race condition?
A: In theory you shouldn't need one, as JavaScript runs your code on a single thread.
Before going deeper into my proposed solution I think it is important for you to know how NodeJs works.
NodeJs is driven by an event-based architecture. This means that in the Node process there is an event queue that contains all the "to-do" events.
When an event gets popped from the queue, node executes all of the required code until it is finished. Any async calls made during that run are spawned as other events and queued up in the event queue until a response comes back and it is their turn to run.
Q: So what can I do to ensure that only 1 request can perform updates to the database at a time?
A: I believe there are many ways you can achieve this, but one of the easier ways out is to use the setTimeout API.
Example:
api.get('/clk/:spent/:id', function(req, res) {
    var data = {
        id: id,
        spending: spent
    };
    canProceed(data, /*function to exec after canProceed=*/ checkbudget);
});

var canProceed = function(data, next) {
    var model = in memory model[data.id];
    if (model.is_updating) {
        // still locked: check again later
        setTimeout(function() { canProceed(data, next); }, /*try again in=*/ 1000 /*milliseconds*/);
    }
    else {
        // lock is released. Proceed.
        next(data.spending, data.id);
    }
};

checkbudget(spent, id) {
    var obj = in memory model[id];
    obj.is_updating = true;   // Lock this model
    obj.spent += spent;
    obj.spent > obj.budget    // if greater,
    obj.status = 11           // 11 is the stopped status
    update db and rebuild model.
    obj.is_updating = false;  // Unlock the model
}
Note: What I have here is pseudo code as well, so you may have to tweak it a bit.
The idea here is to have a flag in your model to indicate whether an HTTP request can proceed into the critical code path, in this case your checkbudget function and beyond.
When a request comes in, it checks the is_updating flag to see if it can proceed. If the flag is true, it schedules an event to be fired a second later; this setTimeout callback becomes an event and gets placed into node's event queue for later processing.
When that event fires later, the request checks again. This repeats until the is_updating flag becomes false; the request then goes on to do its work, with is_updating set to true for the duration of the critical code and back to false when it is done.
Not the most efficient way, but it gets the job done, and you can always revisit the solution when performance becomes a problem.
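For reference, another way to serialize the critical section without polling a flag is to chain each campaign's updates on a promise, so every update waits for the previous one to finish. A minimal sketch (withLock and the wrapping of db.addSpend are illustrative, not from the question's code):
var locks = {};  // campaignId -> promise tail of the last scheduled update

function withLock(id, task) {
    var prev = locks[id] || Promise.resolve();
    var next = prev.then(function() { return task(); });
    locks[id] = next.catch(function() {});  // keep the chain alive even if a task fails
    return next;
}

// usage: all spend updates for one campaign run strictly one after another
withLock(campaignId, function() {
    return new Promise(function(resolve, reject) {
        db.addSpend(campaignId, spent, function(err) {
            if (err) return reject(err);
            // update the in-memory model here, then release the lock by resolving
            resolve();
        });
    });
});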

how to call setImmediate before/above .on('data') fast-csv in nodejs

I am using fast-csv ( https://www.npmjs.com/package/fast-csv ) to parse a csv file.
It might have 10k records, so it takes a lot of time to parse, and almost all other operations on that server get blocked.
So I want to use setImmediate() to delay the execution/parsing of records, so that other processes also get CPU time.
csv
    .fromPath(csvfile, {headers: true, discardUnmappedColumns: true, ignoreEmpty: true})
    .validate(function(record) {
        // some validations here
    })
    .on("data-invalid", function(record) {
        logger.error("Validation of record failed:" + record);
    })
    .on("data", function(record) {
        // I know I can add a setImmediate here, but I don't want the code to parse all the records in the csv in one go.
        // I want to call setImmediate above the .on("data") so that the contacts are validated/parsed slowly (when they get CPU time, as per setImmediate).
    });
Either the above or any other way to control/delay the .on("data").
As per my search it can't be controlled since it's a stream.
Any and all suggestions welcome!
So i want to use setImmediate() to delay the execution/parsing of records...or any other way to control/delay the .on("data")
setImmediate() will not be able to help you; even though it will yield to the event loop, your pipeline is already yielding to the event loop because of the CSV file being read.
Instead, you can use a module like concurrent-map-stream to limit the number of concurrent database queries being performed, which is likely the real problem that you're running in to.
For example, to limit the number of queries to 100:
function queryWorker(record, done) {
    performQuery(record, done);
}

var queue = require('concurrent-map-stream');
var queueStream = queue(queryWorker, 100).on('data', function(record) {
    ...
}).on('close', function() {
    // done
});
csv.fromPath(...).pipe(queueStream);
I solved this problem by pushing each record (which I get in .on("data")) into an array and then, inside .on("end"), using async.mapLimit(), which solved my problem.
Reference: https://github.com/caolan/async#mapcoll-iteratee-callback and an example: http://promise-nuggets.github.io/articles/16-map-limit.html
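A rough sketch of that approach (the concurrency of 10 and the saveRecord(record, cb) helper are illustrative, not from the original code):
var async = require('async');
var records = [];

csv
    .fromPath(csvfile, { headers: true, discardUnmappedColumns: true, ignoreEmpty: true })
    .on('data', function(record) {
        records.push(record);  // just buffer here; no heavy work per record
    })
    .on('end', function() {
        // process at most 10 records at a time, yielding to the event loop between records
        async.mapLimit(records, 10, function(record, cb) {
            setImmediate(function() {
                saveRecord(record, cb);  // saveRecord is a hypothetical per-record handler
            });
        }, function(err, results) {
            if (err) { logger.error(err); }
        });
    });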

node.js handling stream back pressure when using setTimeout

This is a follow on question for a further issue I've encountered from this earlier question:
nodejs: read from file and store to db, limit maximum concurrent db operations
Problem:
I want to conditionally reschedule some operations for a later time; however, this is breaking my method for handling back-pressure.
Detail:
I have a CSV file that I am reading in as a stream, and using transforms to convert to JSON and then asynchronously store each line to a DB.
As lines are processed by the transform, they are placed onto an async queue which is responsible for issuing the DB operations.
E.g.
parser._transform = function(data, encoding, done) {
    var tick = this._parseRow(data);
    dbQueue.push(tick, function(err, result) {
        if (typeof(err) != 'undefined') { console.log(err) }
    });
    this.push(tick);
    done();
}
Back pressure is handled by pausing and resuming the parser when the queue is saturated/empty:
dbQueue.saturated = function() {
    parser.pause();
}
dbQueue.empty = function() {
    parser.resume();
}
The change I have been trying to make is that when an item is pulled off the queue, it is conditionally rescheduled for some time (100ms) in future:
var dbQueue = async.queue(function(data, callback) {
    if (condition) {
        // re-schedule the operation by adding it back to the queue 100ms later
        setTimeout(function(data, callback) {
            dbQueue.push(data, function(err, result) {
            });
        }, 100, data, callback);
    } else {
        // execute the db store
        ... ...
    }
});
I believe my problem is that now many operations will spend most of their time in setTimeout, so the dbQueue will be empty, and the back-pressure on the transform stream is not being handled as desired.
I have tried a few attempts at using counters such as max_ops and running_ops to ensure the stream is paused/resumed, but unsuccessfully.
Is there a more idiomatic way of handling this in node.js?
Since this looks like it's an external condition and not something related to what dbQueue is doing, instead of re-inserting the data into the queue when the condition occurs, I would simply pause dbQueue. For example, let's say your condition is that the database disconnected for some reason and there's an event you can listen to for that. In that case you can just do something similar to what you're doing when dbQueue is saturated/empty:
db.on('disconnect', function() {
    dbQueue.pause();
});
db.on('connect', function() {
    dbQueue.resume();
});
This is usually a better approach than waiting for some pre-determined timeout. That being said, sometimes waiting for a timeout is the only option. In that case you could do something similar but, instead of waiting for a separate event to trigger the resume(), simply use setTimeout():
db.on('disconnect', function() {
    dbQueue.pause();
    setTimeout(function() {
        dbQueue.resume();
    }, 100); // the 100ms delay from the question; tune as needed
});
Note: If we are really talking about db disconnects here, then you might also want to pause/resume dbQueue if there's a db error, in case 100ms isn't enough time for the db to re-connect.
If you have a more specific condition you're looking for, and you're willing to share what that is, I may be able to give you a better example :)

Handling interdependent and/or layered asynchronous calls

As an example, suppose I want to fetch a list of files from somewhere, then load the contents of these files and finally display them to the user. In a synchronous model, it would be something like this (pseudocode):
var file_list = fetchFiles(source);
if (!file_list) {
    display('failed to fetch list');
} else {
    for (file in file_list) { // iteration, not enumeration
        var data = loadFile(file);
        if (!data) {
            display('failed to load: ' + file);
        } else {
            display(data);
        }
    }
}
This provides decent feedback to the user and I can move pieces of code into functions if I so deem necessary. Life is simple.
Now, to crush my dreams: fetchFiles() and loadFile() are actually asynchronous. The easy way out is to transform them into synchronous functions. But this is not good if the browser locks up waiting for calls to complete.
How can I handle multiple interdependent and/or layered asynchronous calls without delving deeper and deeper into an endless chain of callbacks, in classic reductio ad spaghettum fashion? Is there a proven paradigm to cleanly handle these while keeping code loosely coupled?
Deferreds are really the way to go here. They capture exactly what you (and a whole lot of async code) want: "go away and do this potentially expensive thing, don't bother me in the meantime, and then do this when you get back."
And you don't need jQuery to use them. An enterprising individual has ported Deferred to underscore, and claims you don't even need underscore to use it.
So your code can look like this:
function fetchFiles(source) {
    var dfd = _.Deferred();
    // do some kind of thing that takes a long time
    doExpensiveThingOne({
        source: source,
        complete: function(files) {
            // this informs the Deferred that it succeeded, and passes
            // `files` to all its success ("done") handlers
            dfd.resolve(files);
            // if you know how to capture an error condition, you can also
            // indicate that with dfd.reject(...)
        }
    });
    return dfd;
}

function loadFile(file) {
    // same thing!
    var dfd = _.Deferred();
    doExpensiveThingTwo({
        file: file,
        complete: function(data) {
            dfd.resolve(data);
        }
    });
    return dfd;
}
// and now glue it together
_.when(fetchFiles(source))
    .done(function(files) {
        files.forEach(function(file) {
            _.when(loadFile(file))
                .done(function(data) {
                    display(data);
                })
                .fail(function() {
                    display('failed to load: ' + file);
                });
        });
    })
    .fail(function() {
        display('failed to fetch list');
    });
The setup is a little wordier, but once you've written the code to handle the Deferred's state and stuffed it off in a function somewhere, you won't have to worry about it again, and you can play around with the actual flow of events very easily. For example:
var file_dfds = [];
files.forEach(function(file) {
    file_dfds.push(loadFile(file));
});
_.when(file_dfds)
    .done(function(datas) {
        // this will only run if and when ALL the files have successfully
        // loaded!
    });
Events
Maybe using events is a good idea. It keeps you from creating code-trees and de-couples your code.
I've used bean as the framework for events.
Example pseudo code:
// async request for files
function fetchFiles(source) {
    IO.get(..., function (data, status) {
        if (data) {
            bean.fire(window, 'fetched_files', data);
        } else {
            bean.fire(window, 'fetched_files_fail', data, status);
        }
    });
}

// handler for when we get data
function onFetchedFiles(event, files) {
    for (file in files) {
        var data = loadFile(file);
        if (!data) {
            display('failed to load: ' + file);
        } else {
            display(data);
        }
    }
}

// handler for failures
function onFetchedFilesFail(event, status) {
    display('Failed to fetch list. Reason: ' + status);
}

// subscribe the window to these events
bean.on(window, 'fetched_files', onFetchedFiles);
bean.on(window, 'fetched_files_fail', onFetchedFilesFail);

fetchFiles();
Custom events and this kind of event handling is implemented in virtually all popular JS frameworks.
Sounds like you need jQuery Deferred. Here is some untested code that might help point you in the right direction:
$.when(fetchFiles(source)).then(function(file_list) {
    if (!file_list) {
        display('failed to fetch list');
    } else {
        for (file in file_list) {
            $.when(loadFile(file)).then(function(data) {
                if (!data) {
                    display('failed to load: ' + file);
                } else {
                    display(data);
                }
            });
        }
    }
});
I also found another decent post which gives a few use cases for the Deferred object
If you do not want to use jQuery, what you could use instead are web workers in combination with synchronous requests. Web workers are supported across every major browser with the exception of any Internet Explorer version before 10.
Web Worker browser compatibility
Basically, if you're not entirely certain what a web worker is, think of it as a way for browsers to execute specialized JavaScript on a separate thread without impacting the main thread (Caveat: On a single-core CPU, both threads will run in an alternating fashion. Luckily, most computers nowadays come equipped with dual-core CPUs). Usually, web workers are reserved for complex computations or some intense processing task. Just keep in mind that any code within the web worker CANNOT reference the DOM nor can it reference any global data structures that have not been passed to it. Essentially, web workers run independent of the main thread. Any code that the worker executes should be kept separate from the rest of your JavaScript code base, within its own JS file. Furthermore, if the web workers need specific data in order to properly work, you need to pass that data into them upon starting them up.
Yet another important thing worth noting is that any JS libraries that you need to use to load the files will need to be copied directly into the JavaScript file that the worker will execute. That means these libraries should first be minified(if they haven't been already), then copied and pasted into the top of the file.
Anyway, I decided to write up a basic template to show you how to approach this. Check it out below. Feel free to ask questions/criticize/etc.
On the JS file that you want to keep executing on the main thread, you want something like the following code below in order to invoke the worker.
function startWorker(dataObj)
{
    var message = {},
        worker;

    try
    {
        worker = new Worker('workers/getFileData.js');
    }
    catch(error)
    {
        // Throw error
    }

    message.data = dataObj;

    // all data is communicated to the worker in JSON format
    message = JSON.stringify(message);

    // This is the function that will handle all data returned by the worker
    worker.onmessage = function(e)
    {
        display(JSON.parse(e.data));
    };

    worker.postMessage(message);
}
Then, in a separate file meant for the worker (as you can see in the code above, I named my file getFileData.js), write something like the following...
function fetchFiles(source)
{
    // Put your code here
    // Keep in mind that any requests made should be synchronous as this should not
    // impact the main thread
}

function loadFile(file)
{
    // Put your code here
    // Keep in mind that any requests made should be synchronous as this should not
    // impact the main thread
}

onmessage = function(e)
{
    var response = [],
        data = JSON.parse(e.data),
        file_list = fetchFiles(data.source),
        file, fileData;

    if (!file_list)
    {
        response.push('failed to fetch list');
    }
    else
    {
        for (file in file_list)
        { // iteration, not enumeration
            fileData = loadFile(file);
            if (!fileData)
            {
                response.push('failed to load: ' + file);
            }
            else
            {
                response.push(fileData);
            }
        }
    }

    response = JSON.stringify(response);
    postMessage(response);
    close();
}
PS: Also, I dug up another thread which would better help you understand the pros and cons of using synchronous requests in combination with web workers.
Stack Overflow - Web Workers and Synchronous Requests
async is a popular asynchronous flow control library often used with node.js. I've never personally used it in the browser, but apparently it works there as well.
This example would (theoretically) run your two functions, returning an object of all the filenames and their load status. async.map runs in parallel, while waterfall is a series, passing the results of each step on to the next.
I am assuming here that your two async functions accept callbacks. If they do not, I'd require more info as to how they're intended to be used (do they fire off events on completion? etc).
async.waterfall([
    function (done) {
        fetchFiles(source, function(list) {
            if (!list) done('failed to fetch file list');
            else done(null, list);
        });
        // alternatively you could simply fetchFiles(source, done) here, and handle
        // the null result in the next function.
    },
    function (file_list, done) {
        var loadHandler = function (file, cb) {
            loadFile(file, function(data) {
                if (!data) {
                    display('failed to load: ' + file);
                } else {
                    display(data);
                }
                // if any of the callbacks to `map` returned an error, it would halt
                // execution and pass that error to the final callback. So we don't pass
                // an error here, but rather a tuple of the file and load result.
                cb(null, [file, !!data]);
            });
        };
        async.map(file_list, loadHandler, done);
    }
], function(err, result) {
    if (err) return display(err);
    // All files loaded! (or failed to load)
    // result would be an array of tuples like [[file, bool file loaded?], ...]
});
waterfall accepts an array of functions and executes them in order, passing the result of each along as the arguments to the next, along with a callback function as the last argument, which you call with either an error, or the resulting data from the function.
You could of course add any number of different async callbacks between or around those two, without having to change the structure of the code at all. waterfall is actually only 1 of 10 different flow control structures, so you have a lot of options (although I almost invariably end up using auto, which allows you to mix parallel and series execution in the same function via a Makefile-like requirements syntax).
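For a rough idea of what auto looks like here (a sketch only; note that in async v2+ dependent tasks receive (results, callback), while older versions used (callback, results)):
async.auto({
    file_list: function(done) {
        fetchFiles(source, function(list) {
            done(list ? null : 'failed to fetch file list', list);
        });
    },
    loaded: ['file_list', function(results, done) {
        // runs only after file_list has completed; results.file_list holds its value
        async.map(results.file_list, function(file, cb) {
            loadFile(file, function(data) {
                cb(null, [file, !!data]);
            });
        }, done);
    }]
}, function(err, results) {
    if (err) return display(err);
    // results.loaded is the same array of [file, loaded?] tuples as before
});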
I had this issue with a webapp I'm working on and here's how I solved it (with no libraries).
Step 1: Wrote a very lightweight pubsub implementation. Nothing fancy. Subscribe, Unsubscribe, Publish and Log. Everything (with comments) adds up to 93 lines of JavaScript, 2.7kb before gzip.
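For illustration only (this is not the author's 93-line implementation), such a pubsub can be as small as:
var pubsub = { notification: (function() {
    var subscribers = {};  // notification name -> list of { id, fn }
    return {
        subscribe: function(notification, subscriberId, fn) {
            (subscribers[notification] = subscribers[notification] || []).push({ id: subscriberId, fn: fn });
        },
        unsubscribe: function(notification, subscriberId) {
            subscribers[notification] = (subscribers[notification] || []).filter(function(s) {
                return s.id !== subscriberId;
            });
        },
        publish: function(notification, notificationParams, publisherId) {
            (subscribers[notification] || []).forEach(function(s) {
                // fire asynchronously so publishers are never blocked by subscribers
                setTimeout(function() {
                    s.fn({ notificationParams: notificationParams, publisher: publisherId });
                }, 0);
            });
        }
    };
})() };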
Step 2: Decoupled the process you were trying to accomplish by letting the pubsub implementation do the heavy lifting. Here's an example:
// listen for when files have been fetched and set up what to do when it comes in
pubsub.notification.subscribe(
    "processFetchedResults",  // notification to subscribe to
    "fetchedFilesProcesser",  // subscriber
    /* what to do when files have been fetched */
    function(params) {
        var file_list = params.notificationParams.file_list;
        for (file in file_list) { // iteration, not enumeration
            var data = loadFile(file);
            if (!data) {
                display('failed to load: ' + file);
            } else {
                display(data);
            }
        }
    }
);

// trigger fetch files
function fetchFiles(source) {
    // ajax call to source
    // on response code 200 publish "processFetchedResults"
    // set publish parameters as ajax call response
    pubsub.notification.publish("processFetchedResults", ajaxResponse, "fetchFilesFunction");
}
Of course this is very verbose in the setup and scarce on the magic behind the scenes.
Here are some technical details:
I'm using setTimeout to handle triggering subscriptions. This way they run in a non-blocking fashion.
The call is effectively decoupled from the processing. You can write a different subscription to the notification "processFetchedResults" and do multiple things once the response comes through (for example logging and processing) while keeping them in very separate, tiny and easily-managed code blocks.
The above code sample doesn't address fallbacks or run proper checks. I'm sure it will require a bit of tooling to get to production standards. Just wanted to show you how possible it is and how library-independent your solution can be.
Cheers!
