JavaScript function blocks web socket & causes sync issue & delay - javascript

I have a web socket that receives data from a web socket server every 100 to 200 ms (I have tried this both with a shared web worker and with everything in the main.js file).
When new JSON data arrives, my main.js runs filter_json_run_all(json_data), which updates Tabulator.js tables and Dygraph.js graphs with some custom colour coding based on whether values are increasing or decreasing.
1) web socket JSON data arrives (every 100ms or less) -> 2) run function filter_json_run_all(json_data) (takes 150 to 200ms) -> 3) repeat 1 & 2 forever
Quickly the timestamp of the incoming JSON data falls behind the actual time (json_time 15:30:12 vs actual time 15:31:30), since filter_json_run_all is causing a backlog of operations.
This causes users on different PCs to have web socket sync issues, depending on when they opened or refreshed the website.
It is caused entirely by the long filter_json_run_all() function; if all I do is console.log(json_data), the clients stay perfectly in sync.
Please, I would be very grateful if anyone has any ideas how I can prevent this sort of blocking / backlog of incoming JSON web socket data caused by a slow-running JavaScript function :)
I tried using a shared web worker, which works, but it doesn't get around main.js being blocked by filter_json_run_all(). I don't think I can move filter_json_run_all() into the worker, since all the graph and table objects are defined in main.js, and I also have callbacks for when I click on a table to update a value manually (bi-directional web socket).
If you have any ideas or tips at all I will be very grateful :)
worker.js:
const connectedPorts = [];

// Create socket instance.
var socket = new WebSocket(
  'ws://'
  + 'ip:port'
  + '/ws/'
);

// Send initial package on open.
socket.addEventListener('open', () => {
  const package = JSON.stringify({
    "time": 123456,
    "channel": "futures.tickers",
    "event": "subscribe",
    "payload": ["BTC_USD", "ETH_USD"]
  });

  socket.send(package);
});

// Send data from socket to all open tabs.
socket.addEventListener('message', ({ data }) => {
  const package = JSON.parse(data);
  connectedPorts.forEach(port => port.postMessage(package));
});

/**
 * When a new thread is connected to the shared worker,
 * start listening for messages from the new thread.
 */
self.addEventListener('connect', ({ ports }) => {
  const port = ports[0];

  // Add this new port to the list of connected ports.
  connectedPorts.push(port);

  /**
   * Receive data from main thread and determine which
   * actions it should take based on the received data.
   */
  port.addEventListener('message', ({ data }) => {
    const { action, value } = data;

    // Send message to socket.
    if (action === 'send') {
      socket.send(JSON.stringify(value));

    // Remove port from connected ports list.
    } else if (action === 'unload') {
      const index = connectedPorts.indexOf(port);
      connectedPorts.splice(index, 1);
    }
  });

  // Needed when using addEventListener (rather than onmessage) on a
  // MessagePort, otherwise queued messages are never delivered.
  port.start();
});
Main.js (this is only part of filter_json_run_all, which continues on for about 6 or 7 more Tabulator & Dygraph objects; I wanted to give an idea of some of the operations called with setTimeout() etc.):
function filter_json_run_all(json_str){
  const startTime = performance.now();
  const data_in_array = json_str; //JSON.parse(json_str.data);
  // if ('DATETIME' in data_in_array){
  //   var milliseconds = (new Date()).getTime() - Date.parse(data_in_array['DATETIME']);
  //   console.log("milliseconds: " + milliseconds);
  // }

  if (summary in data_in_array){
    if ("DATETIME" in data_in_array){
      var time_str = data_in_array["DATETIME"];
      element_time.innerHTML = time_str;
    }

    // summary Data
    const summary_array = data_in_array[summary];

    var old_sum_arr_krw = [];
    var old_sum_arr_irn = [];
    var old_sum_arr_ntn = [];
    var old_sum_arr_ccn = [];
    var old_sum_arr_ihn = [];
    var old_sum_arr_ppn = [];

    var filtered_array_krw_summary = filterByProperty_summary(summary_array, "KWN");
    old_sum_arr_krw.unshift(Table_summary_krw.getData());
    Table_summary_krw.replaceData(filtered_array_krw_summary);
    // Colour table
    color_table(filtered_array_krw_summary, old_sum_arr_krw, Table_summary_krw);

    var filtered_array_irn_summary = filterByProperty_summary(summary_array, "IRN");
    old_sum_arr_irn.unshift(Table_summary_inr.getData());
    Table_summary_inr.replaceData(filtered_array_irn_summary);
    // Colour table
    color_table(filtered_array_irn_summary, old_sum_arr_irn, Table_summary_inr);

    var filtered_array_ntn_summary = filterByProperty_summary(summary_array, "NTN");
    old_sum_arr_ntn.unshift(Table_summary_twd.getData());
    Table_summary_twd.replaceData(filtered_array_ntn_summary);
    // Colour table
    color_table(filtered_array_ntn_summary, old_sum_arr_ntn, Table_summary_twd);

    // remove formatting on fwds curves
    setTimeout(() => {
      g_fwd_curve_krw.updateOptions({
        'file': dataFwdKRW,
        'labels': ['Time', 'Bid', 'Ask'],
        strokeWidth: 1,
      });
    }, 200);
    setTimeout(() => {
      g_fwd_curve_inr.updateOptions({
        'file': dataFwdINR,
        'labels': ['Time', 'Bid', 'Ask'],
        strokeWidth: 1,
      });
    }, 200);

    // remove_colors //([askTable_krw, askTable_inr, askTable_twd, askTable_cny, askTable_idr, askTable_php])
    setTimeout(() => {
      askTable_krw.getRows().forEach(function (item, index) {
        const row = item.getCells();
        row.forEach(function (value_tmp){ value_tmp.getElement().style.backgroundColor = ''; });
      });
    }, 200);
    setTimeout(() => {
      askTable_inr.getRows().forEach(function (item, index) {
        const row = item.getCells();
        row.forEach(function (value_tmp){ value_tmp.getElement().style.backgroundColor = ''; });
      });
    }, 200);

    // ... continues on for the remaining Tabulator & Dygraph objects
color_table Function
function color_table(new_arr, old_array, table_obj){
  // If length is not equal
  if (new_arr.length != old_array[0].length){
    console.log("Diff length");
  }
  else {
    // Comparing each element of array
    for (var i = 0; i < new_arr.length; i++){
      // iterate over the old dict
      for (const [key, value] of Object.entries(old_array[0][i])) {
        if (value == new_arr[i][key]){
          // unchanged, nothing to do
        }
        else {
          // console.log("Different element");
          if (key != "TENOR"){
            // console.log(table_obj)
            table_obj.getRows()[i].getCell(key).getElement().style.backgroundColor = 'yellow';
          }
          if (key != "TIME" && value < new_arr[i][key]){
            // green going up
            // text_to_speech(new_arr[i]['CCY'] + ' ' + new_arr[i]['TENOR'] + ' getting bid')
            table_obj.getRows()[i].getCell(key).getElement().style.backgroundColor = 'Chartreuse';
          }
          if (key != "TIME" && value > new_arr[i][key]){
            // red going down
            table_obj.getRows()[i].getCell(key).getElement().style.backgroundColor = 'Crimson';
          }
        }
      }
    }
  }
}
Potential fudge / solution (thanks Aaron :) ):
function limiter(fn, wait){
  let isCalled = false,
      calls = [];

  let caller = function(){
    if (calls.length && !isCalled){
      isCalled = true;
      if (calls.length > 2){
        // remove all but the most recent queued call
        calls.splice(0, calls.length - 1);
      }
      calls.shift().call();
      setTimeout(function(){
        isCalled = false;
        caller();
      }, wait);
    }
  };

  return function(){
    calls.push(fn.bind(this, ...arguments));
    // let args = Array.prototype.slice.call(arguments);
    // calls.push(fn.bind.apply(fn, [this].concat(args)));
    caller();
  };
}
This is then defined as a constant for a web worker to call:
const filter_json_run_allLimited = limiter(data => { filter_json_run_all(data); }, 300); // 300ms for example
Web worker calls the limited function when new web socket data arrives:
// Event to listen for incoming data from the worker and update the DOM.
webSocketWorker.port.addEventListener('message', ({ data }) => {
  // Limited function
  filter_json_run_allLimited(data);
});
If anyone knows how websites like TradingView or other real-time, high-performance data-streaming sites achieve low-latency visualisation updates, please comment or reply below :)

I'm reticent to take a stab at answering this for real without knowing what's going on in color_table. My hunch, based on the behavior you're describing, is that filter_json_run_all is being forced to wait on a congested DOM manipulation/render pipeline as HTML is being updated to achieve the color-coding for your updated table elements.
I see you're already taking some measures to prevent some of these DOM manipulations from blocking this function's execution (via setTimeout). If color_table isn't already employing a similar strategy, that'd be the first thing I'd focus on refactoring to unclog things here.
It might also be worth throwing these DOM updates for processed events into a simple queue, so that if slow browser behavior creates a rendering backlog, the function actually responsible for invoking pending DOM manipulations can elect to skip outdated render operations to keep the UI acceptably snappy.
Edit: a basic queueing system might involve the following components:
The queue, itself (this can be a simple array, it just needs to be accessible to both of the components below).
A queue appender, which runs during filter_json_run_all, simply adding objects to the end of the queue representing each DOM manipulation job you plan to complete using color_table or one of your setTimeout callbacks. These objects should contain the operation to be performed (i.e. the function definition, uninvoked) and the parameters for that operation (i.e. the arguments you're passing into each function).
A queue runner, which runs on its own interval and invokes pending DOM manipulation tasks from the front of the queue, removing them as it goes. Since this operation has access to all of the objects in the queue, it can also take steps to optimize/combine similar operations to minimize the amount of repainting it's asking the browser to do before subsequent code can be executed. For example, if you've got several color_table operations that color the same cell multiple times, you can simply perform the operation once with the data from the last color_table item in the queue involving that cell. Additionally, you can further optimize your interaction with the DOM by invoking the aggregated DOM manipulation operations themselves inside a requestAnimationFrame callback, which ensures that scheduled reflows/repaints happen only when the browser is ready, and is preferable from a performance perspective to DOM manipulation queueing via setTimeout/setInterval. A rough sketch of this queue/runner pair follows below.
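Here's a minimal sketch of that idea (the names renderQueue and enqueueRender are mine, not from your code; treat it as a starting point rather than a drop-in implementation):
const renderQueue = [];
let drainScheduled = false;

// Queue appender: called from filter_json_run_all / color_table instead of
// touching the DOM directly. A later job with the same key replaces the
// earlier one, so only the latest update for a given cell actually runs.
function enqueueRender(key, operation) {
  const existing = renderQueue.findIndex(job => job.key === key);
  if (existing !== -1) renderQueue.splice(existing, 1);
  renderQueue.push({ key, operation });
  scheduleDrain();
}

// Queue runner: drains all pending DOM updates in a single animation frame,
// batching reflows/repaints instead of scattering them across setTimeouts.
function scheduleDrain() {
  if (drainScheduled) return;
  drainScheduled = true;
  requestAnimationFrame(() => {
    drainScheduled = false;
    while (renderQueue.length) {
      renderQueue.shift().operation();
    }
  });
}

// Example: color one cell; a later call with the same key supersedes this one.
// enqueueRender('Table_summary_krw:3:BID', () => {
//   Table_summary_krw.getRows()[3].getCell('BID').getElement().style.backgroundColor = 'yellow';
// });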

Related

node is waiting for a timeout that was already killed

I have set a timeout for sending back an error. The problem is that if I need to clear the timeout with clearTimeout() it does indeed kill it; I can see that because the value of errTimeout's _kill shows true in the debugger. But for some reason node still keeps running the script until the timeOutPeriod is over. I guess it won't really create issues in production because the calling function will receive the returned value, but it still kinda pisses me off that it keeps waiting instead of killing the script.
return new Promise((resolve, reject) => {
  function checkResponse () {
    // creates a timeout that returns an error if data is not received after the specified time.
    let errTimeout = setTimeout(reject, config.timeOutPeriod);
    // the +1 is there because we originally reduced one, we need to use the physical number now.
    if (layerKeycodes.length !== (rows + 1) * (columns + 1)){
      // if the array is not complete, check again in 100 ms
      setTimeout(checkResponse, 100);
    } else {
      // clear the error timeout
      clearTimeout(errTimeout);
      // send the layerKeycodes to the calling function
      resolve(layerKeycodes);
    }
  }
  checkResponse();
});
It looks like this code is something you're trying to fit into getLayerKeycodes() from this other question to somehow know when all the data has been received from your keyboard hardware.
I'll illustrate how you can plug into that without using timers. Here's what you started with in that other question:
Your original function
const getLayerKeycodes = (keyboard, layer, rows, columns) => {
  //array that stores all the keycodes according to their order
  let layerKeycodes = [];
  //rows and columns start the count at 0 in low level, so we need to decrease one from the actual number.
  columns--;
  rows--;
  //loop that asks about all the keycodes in a layer
  const dataReceived = (err, data) => {
    if (err) {
      return err;
    }
    // push the current keycode to the array
    // The keycode is always returned as the fifth object.
    layerKeycodes.push(data[5]);
    console.log(layerKeycodes);
  };
  for (let r = 0, c = 0; c <= columns; r++) {
    //callback to fire once data is received back from the keyboard.
    if (r > rows) {
      c++;
      //r will turn to 0 once the continue fires and the loop executes again
      r = -1;
      continue;
    }
    //Start listening and call dataReceived when data is received
    keyboard[0].read(dataReceived);
    //Ask keyboard for information about keycode
    // [always 0 (first byte is ignored), always 0x04 (get_keycode), layer requested, row being checked, column being checked]
    keyboard[0].write([0x01, 0x04, layer, r, c]);
  }
  console.log(layerKeycodes);
}
Manually created promise to resolve upon completion of all rows/columns
And, you can incorporate the completion detection code inside of the dataReceived() function without any timers and without reworking much of the rest of your logic like this:
const getLayerKeycodes = (keyboard, layer, rows, columns) => {
  return new Promise((resolve, reject) => {
    //array that stores all the keycodes according to their order
    const layerKeycodes = [];
    const totalCells = rows * columns;
    let abort = false;
    //rows and columns start the count at 0 in low level, so we need to decrease one from the actual number.
    columns--;
    rows--;
    // function that gets called with keyboard data
    function dataReceived(err, data) {
      if (err) {
        abort = true;          // set flag to stop sending more requests
        reject(err);
        return;
      }
      // push the current keycode to the array
      // The keycode is always returned as the fifth object.
      layerKeycodes.push(data[5]);
      // now see if we're done with all of them
      if (layerKeycodes.length >= totalCells) {
        resolve(layerKeycodes);
      }
    }
    // loop that asks about all the keycodes in a layer
    for (let r = 0, c = 0; c <= columns; r++) {
      // stop sending more requests if we've already gotten an error
      if (abort) {
        break;
      }
      //callback to fire once data is received back from the keyboard.
      if (r > rows) {
        c++;
        //r will turn to 0 once the continue fires and the loop executes again
        r = -1;
        continue;
      }
      //Start listening and call dataReceived when data is received
      keyboard[0].read(dataReceived);
      //Ask keyboard for information about keycode
      // [always 0 (first byte is ignored), always 0x04 (get_keycode), layer requested, row being checked, column being checked]
      keyboard[0].write([0x01, 0x04, layer, r, c]);
    }
  });
}
A simplified version by promisifying the read function
And, here's a bit simpler version that promisifies the read function so we can use await on it and then just use an async function and a dual nested for loop for simpler loop mechanics.
const util = require('util');

async function getLayerKeycodes(keyboard, layer, rows, columns) {
  // promisify keyboard.read()
  const readKeyboard = util.promisify(keyboard[0].read).bind(keyboard[0]);
  //array that stores all the keycodes according to their order
  const layerKeycodes = [];
  // loop that asks about all the keycodes in a layer
  for (let rowCntr = 0; rowCntr < rows; rowCntr++) {
    for (let colCntr = 0; colCntr < columns; colCntr++) {
      // Start listening and collect the promise
      const readPromise = readKeyboard();
      // Ask keyboard for information about keycode
      // [always 0 (first byte is ignored), always 0x04 (get_keycode), layer requested, row being checked, column being checked]
      keyboard[0].write([0x01, 0x04, layer, rowCntr, colCntr]);
      // wait for data to come in
      const data = await readPromise;
      // push the current keycode to the array
      // The keycode is always returned as the fifth object.
      layerKeycodes.push(data[5]);
    }
  }
  return layerKeycodes;
}
This also makes sure that we send a write, then wait for the data from that write to come back before we send the next write, which seems like a potentially safer way to handle the hardware. Your original code fires all the writes at once, which might work, but it seems like the reads could come back in any order. This guarantees sequential order in the layerKeycodes array, which seems safer (I'm not sure if that matters with this data or not).
Error handling in this last version is somewhat automatically handled by the async function and the promises. If the read returns an error, then the readPromise will automatically reject which will abort our loop and in turn reject the promise that the async function returned. So, we don't have to do the abort checking that the previous function with the manually created promise had.
Now, of course, I don't have the ability to run any of these to test them so it's possible there are some typos somewhere, but hopefully you can work through any of those and see the concept for how these work.
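For completeness, a hypothetical call site for the async version might look like this (keyboard here is assumed to be the same opened HID device array used in the snippets above):
// Hypothetical usage, not from the original question.
getLayerKeycodes(keyboard, 0, 4, 12)
  .then(layerKeycodes => {
    console.log('all keycodes received:', layerKeycodes);
  })
  .catch(err => {
    console.error('failed to read keycodes:', err);
  });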

Is it possible to list / kill / ... all pending promises / async events in a headless chrome?

I've got a bunch of integration tests using headless chrome. Because restarting the browser on an entirely new profile is so expensive the harness tries to "clean up" the browser state (flush caches, clear cookies and storage, ...) on teardown.
However there's a recurring issue that during the cleanup phase some async operations resolve and try to do whatever they do in a now nonsensical state.
There are two issues here:
async stack trace support in CDT is listed as experimental and doesn't appear at all in the response (possibly because it has to be enabled via a hidden flag somehow)
I have no idea what's still running at that point, and can't really even debug what breaks due to (1)
Is there any way to improve the situation except by trawling through heisenbugs as they occur, trying to slowly make my way up the async call stacks through ever more logging until the root cause is found?
First we make a hook to be able to capture all xhr packets. You'll have to execute this before any of your other scripts load. Probably put this in your boot/prepare script before running tests.
I have implemented a start and stop button below. Start makes 300 xhr requests, just the "normal" way. If you press stop, you can cancel them all. Ideally you'd put the stop event handler code in a beforeunload event.
If you don't want to stop them, you can analyze their state, requested urls, etc. from one neat array where you keep track of everything within your code.
This example works because only so many requests can be made at the same time by the browser; the rest wait in the queue as pending until a slot frees up. I used 300 requests because I don't know a large/slow source to request from that isn't CORS protected, and this gives us humans enough time to press the stop button (I hope).
function addXMLRequestCallback(callback){
  var oldSend, i;
  if (XMLHttpRequest.callbacks) {
    // we've already overridden send() so just add the callback
    XMLHttpRequest.callbacks.push(callback);
  } else {
    // create a callback queue
    XMLHttpRequest.callbacks = [callback];
    // store the native send()
    oldSend = XMLHttpRequest.prototype.send;
    // override the native send()
    XMLHttpRequest.prototype.send = function(){
      // process the callback queue
      // the xhr instance is passed into each callback but seems pretty useless
      // you can't tell what its destination is or call abort() without an error
      // so only really good for logging that a request has happened
      // I could be wrong, I hope so...
      // EDIT: I suppose you could override the onreadystatechange handler though
      for (i = 0; i < XMLHttpRequest.callbacks.length; i++) {
        XMLHttpRequest.callbacks[i](this);
      }
      // call the native send()
      oldSend.apply(this, arguments);
    }
  }
}
/**
 * adding some debug data to the XHR objects. Note, don't depend on this,
 * this is against good practice; ideally you'll have your own wrapper
 * to deal with xhr objects and meta data.
 * The same way you can extend the XHR object to catch post data etc...
 */
var xhrProto = XMLHttpRequest.prototype,
    origOpen = xhrProto.open,
    origSend = xhrProto.send;

xhrProto.open = function (method, url) {
  this._url = url;
  return origOpen.apply(this, arguments);
};

xhrProto.send = function (data) {
  this._data = data;
  return origSend.apply(this, arguments);
};
+function() {
  var xhrs = [],
      i,
      statuscount = 0,
      status = document.getElementById('status'),
      DONE = 4;

  addXMLRequestCallback((xhr) => {
    xhrs.push(xhr);
  });

  document.getElementById('start').addEventListener('click', (e) => {
    statuscount = 0;
    var data = JSON.stringify({
      'user': 'person',
      'pwd': 'password',
      'organization': 'place',
      'requiredkey': 'key'
    });
    for (var i = 0; i < 300; i++) {
      var oReq = new XMLHttpRequest();
      oReq.addEventListener("load", (e) => {
        statuscount++;
        status.value = statuscount;
      });
      oReq.open("GET", 'https://code.jquery.com/jquery-3.4.1.js');
      oReq.send(data);
    }
  });

  document.getElementById('cancel').addEventListener('click', (event) => {
    for (i = 0; i < xhrs.length; i++) {
      if (xhrs[i].readyState !== DONE) {
        console.log(xhrs[i]._url, xhrs[i]._data, 'is not done');
      }
    }
    /** Cancel everything */
    for (i = 0; i < xhrs.length; i++) {
      if (xhrs[i]) {
        xhrs[i].abort();
      }
    }
  });
}();
<button id="start">start requests</button>
<button id="cancel">cancel requests</button>
<progress id="status" value="0" max="300"></progress>
Code of addXMLRequestCallback courtesy of meouw from this answer
Code of xhrProto keeping debug variables courtesy of Joel Richard from this answer

Any way to kill String.prototype.match after specified time passed? [duplicate]

Is it possible to cancel a regex.match operation if takes more than 10 seconds to complete?
I'm using a huge regex to match specific text; sometimes it works, and sometimes it fails...
regex: MINISTÉRIO(?:[^P]*(?:P(?!ÁG\s:\s\d+\/\d+)[^P]*)(?:[\s\S]*?))PÁG\s:\s+\d+\/(\d+)\b(?:\D*(?:(?!\1\/\1)\d\D*)*)\1\/\1(?:[^Z]*(?:Z(?!6:\s\d+)[^Z]*)(?:[\s\S]*?))Z6:\s+\d+
Working example: https://regex101.com/r/kU6rS5/1
So, I want to cancel the operation if it takes more than 10 seconds. Is that possible? I'm not finding anything related on Stack Overflow.
Thanks.
You could spawn a child process that does the regex matching and kill it off if it hasn't completed in 10 seconds. Might be a bit overkill, but it should work.
fork is probably what you should use, if you go down this road.
If you'll forgive my non-pure functions, this code would demonstrate the gist of how you could communicate back and forth between the forked child process and your main process:
index.js
const { fork } = require('child_process');

const processPath = __dirname + '/regex-process.js';
const regexProcess = fork(processPath);
let received = null;

regexProcess.on('message', function(data) {
  console.log('received message from child:', data);
  clearTimeout(timeout);
  received = data;
  regexProcess.kill(); // or however you want to end it. just as an example.
  // you have access to the regex data here.
  // send to a callback, or resolve a promise with the value,
  // so the original calling code can access it as well.
});

const timeoutInMs = 10000;
let timeout = setTimeout(() => {
  if (!received) {
    console.error('regexProcess is still running!');
    regexProcess.kill(); // or however you want to shut it down.
  }
}, timeoutInMs);

regexProcess.send('message to match against');
regex-process.js
function respond(data) {
  process.send(data);
}

function handleMessage(data) {
  console.log('handling message:', data);

  // run your regex calculations in here
  // then respond with the data when it's done.

  // the following is just to emulate
  // a synchronous computational delay
  for (let i = 0; i < 500000000; i++) {
    // spin!
  }

  respond('return regex process data in here');
}

process.on('message', handleMessage);
This might just end up masking the real problem, though. You may want to consider reworking your regex like other posters have suggested.
Another solution I found here:
https://www.josephkirwin.com/2016/03/12/nodejs_redos_mitigation/
It's based on the use of the VM module, with no process fork.
That's pretty neat.
const util = require('util');
const vm = require('vm');

var sandbox = {
  regex: /^(A+)*B/,
  string: "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAC",
  result: null
};

var context = vm.createContext(sandbox);
console.log('Sandbox initialized: ' + vm.isContext(sandbox));
var script = new vm.Script('result = regex.test(string);');

try {
  // One could argue that if a RegExp hasn't completed in a given time,
  // then it's likely it will take exponential time.
  script.runInContext(context, { timeout: 1000 }); // milliseconds
} catch (e) {
  console.log('ReDoS occurred', e); // Take some remedial action here...
}

console.log(util.inspect(sandbox)); // Check the results
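If it helps, that snippet can be wrapped into a reusable helper along the same lines (a sketch only; safeRegexTest is a name I'm introducing, not something from the linked post):
const vm = require('vm');

// Sketch: run regex.test() inside a vm context with a hard timeout.
// Returns true/false, or null if the match did not finish in time.
function safeRegexTest(regex, string, timeoutMs) {
  const sandbox = { regex, string, result: null };
  const context = vm.createContext(sandbox);
  const script = new vm.Script('result = regex.test(string);');
  try {
    script.runInContext(context, { timeout: timeoutMs });
    return sandbox.result;
  } catch (e) {
    return null; // timed out, most likely catastrophic backtracking (ReDoS)
  }
}

// safeRegexTest(/^(A+)*B/, 'AAAAAAAAAAAAAAAAAAAAAAAAAAAAAC', 1000); // -> null on ReDoS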

General solution for pre-emptive background work scheduling on javascript

Here is the scenario:
When my web app starts, I want to load data from several tables in local storage (using indexedDB). I delegate this work to a web worker. It will load each table in turn, and fire a message with the data as it loads each one. On the main thread, a listener will receive the message and store the data in a cache.
But let's say the user presses a button to view the data for a specific table. The app calls a function that checks the cache, and sees that the data for that table has not been loaded yet.
How does this function wait until the data for that table has been cached so that it can return the data? Even more important, what if the table is scheduled to be loaded last? How can this function send a message to the web worker to prioritize loading that specific table so that its data will be available as soon as possible?
What is a general pattern for a clean solution to this pre-emptive scheduling problem? I would like to avoid polling if at all possible.
The worker may use an asynchronous queue that contains all the tables to be loaded and is sorted by priority, so you can prioritize certain tables and they get moved to the front of the queue. As you haven't shown a real implementation, here is a more generalized version:
class AsyncPriorityQueue {
  constructor(task){
    this.task = task;
    this.queue = [];
    this.running = false;
  }

  push(element, priority = 0){
    // insert before the first entry with a lower priority (append if none)
    let pos = this.queue.findIndex(el => el.priority < priority);
    if (pos === -1) pos = this.queue.length;
    this.queue.splice(pos, 0, {element, priority});
    if (this.running) return;
    this.running = true;
    this._run();
  }

  prioritize(element, priority = 10){
    const pos = this.queue.findIndex(el => el.element === element);
    if (pos != -1) this.queue.splice(pos, 1);
    this.push(element, priority);
  }

  async _run(){
    while (this.queue.length)
      await this.task(this.queue.shift().element);
    // allow the next push() to restart the runner once the queue has drained
    this.running = false;
  }
}
Note: if the task is not asynchronous, you should use something like setTimeout(next, 0) to allow the message processing to interrupt it...
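For example, a sketch of that yielding trick (makeYieldingTask is illustrative, not part of the class above):
// Sketch: wrap a synchronous per-item function so that each queue item is
// deferred with setTimeout, letting pending messages (e.g. a prioritize
// request from the main thread) be handled between items.
function makeYieldingTask(syncWork) {
  return element => new Promise(resolve => {
    setTimeout(() => {
      syncWork(element);
      resolve();
    }, 0);
  });
}

// const queue = new AsyncPriorityQueue(makeYieldingTask(loadTableSync));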
A sample implementation could be an image loader:
class ImageLoader extends AsyncPriorityQueue {
  constructor(){
    super(function task(url){
      const img = new Image();
      img.src = url;
      return new Promise(res => img.onload = res);
    });
  }
}

const loader = new ImageLoader;

loader.push("a.jpg");
loader.push("b.jpg", 1); // a bit more important

// Oh, wait:
loader.prioritize("a.jpg");
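Applied to your scenario, the worker side might look roughly like this (a sketch only; loadTableFromIndexedDB and the message shapes are assumptions, and a dedicated rather than shared worker is assumed):
// Sketch: the worker's queue, where each task loads one table and posts it back.
const tableQueue = new AsyncPriorityQueue(async name => {
  const rows = await loadTableFromIndexedDB(name); // hypothetical async helper
  postMessage({ table: name, rows });
});

// Enqueue every table at startup with default priority...
['table_a', 'table_b', 'table_c'].forEach(name => tableQueue.push(name));

// ...and bump a table to the front when the main thread asks for it.
self.onmessage = ({ data }) => {
  if (data.action === 'prioritize') tableQueue.prioritize(data.table);
};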

Javascript memory consumption with map() over a large set and callbacks

I don't even know how to properly ask this question, but I have concerns about the performance (mostly memory consumption) of the following code. I am anticipating that this code will consume a lot of memory, because of map over a large set and a lot of 'hanging' functions that wait for an external service. Are my concerns justified here? What would be a better approach?
var list = fs.readFileSync('./mailinglist.txt') // say 1.000.000 records
  .split("\n")
  .map( processEntry );

var processEntry = function _processEntry(i){
  i = i.split('\t');
  getEmailBody( function(emailBody, name){
    var msg = {
      "message" : emailBody,
      "name" : i[0]
    }
    request(msg, function reqCb(err, result){
      ...
    });
  }); // getEmailBody
}

var getEmailBody = function _getEmailBody(obj, cb){
  // read email template from file;
  // v() returns the correct form for person's name with web-based service
  v(obj.name, function(v){
    cb(obj, v)
  });
}
If you're worried about submitting a million http requests in a very short time span (which you probably should be), you'll have to set up a buffer of some kind.
One simple way to do it:
var lines = fs.readFileSync('./mailinglist.txt').split("\n");
var entryIdx = 0;
var done = false;

var processNextEntry = function () {
  if (entryIdx < lines.length) {
    processEntry(lines[entryIdx++]);
  } else {
    done = true;
  }
};

var processEntry = function _processEntry(i){
  i = i.split('\t');
  getEmailBody( function(emailBody, name){
    var msg = {
      "message" : emailBody,
      "name" : name
    }
    request(msg, function reqCb(err, result){
      // ...
      !done && processNextEntry();
    });
  }); // getEmailBody
}

// getEmailBody didn't change

// you set the ball rolling by calling processNextEntry n times,
// where n is a sensible number of http requests to have pending at once.
for (var i = 0; i < 10; i++) processNextEntry();
Edit: according to this blog post, node has an internal queue system and will only allow 5 simultaneous requests. But you can still use this method to avoid filling up that internal queue with a million items if you're worried about memory consumption.
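For what it's worth, my reading is that this internal limit corresponds to the default HTTP agent's socket pool, which older Node versions capped at 5; a sketch of how you could raise it, assuming that interpretation is right:
// Assumption: the "5 simultaneous requests" limit refers to
// http.globalAgent.maxSockets, which defaulted to 5 in older Node versions
// (newer versions default to Infinity). It can be raised explicitly:
const http = require('http');
http.globalAgent.maxSockets = 20; // allow up to 20 concurrent sockets per host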
Firstly I would advise against using readFileSync, and instead favour the async equivalent. Blocking on IO operations should be avoided as reading from a disk is very expensive, and whilst that's the sole purpose of your code now, I would consider how that might change in the future - and arbitrarily wasting clock cycles is never a good idea.
For large data files I would read them in defined chunks and process them. If you can come up with some scheme, either sentinels to distinguish data blocks within the file, or padding to boundaries, then process the file piece by piece.
This is just rough, untested off the top of my head, but something like:
var fs = require("fs");
function doMyCoolWork(startByteIndex, endByteIndex){
fs.open("path to your text file", 'r', function(status, fd) {
var chunkSize = endByteIndex - startByteIndex;
var buffer = new Buffer(chunkSize);
fs.read(fd, buffer, 0, chunkSize, 0, function(err, byteCount) {
var data = buffer.toString('utf-8', 0, byteCount);
// process your data here
if(stillWorkToDo){
//recurse
doMyCoolWork(endByteIndex, endByteIndex + 100);
}
});
});
}
Or look into one of the stream library functions for similar functionality.
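For instance, a rough stream-based sketch using Node's readline module (illustrative only; wire the per-line handling up to your own processEntry):
const fs = require('fs');
const readline = require('readline');

// Sketch: stream the mailing list line by line instead of reading it all at once.
const rl = readline.createInterface({
  input: fs.createReadStream('./mailinglist.txt'),
  crlfDelay: Infinity
});

rl.on('line', line => {
  // hand each line to your existing per-entry processing here
  // processEntry(line);
});

rl.on('close', () => {
  console.log('finished reading file');
});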
H2H
P.S. JavaScript and Node work extremely well with async and eventing... using sync calls is an antipattern in my opinion, and likely to make the code a headache in future.
