NodeJS stream parse and write json line to line upon Promise result - javascript

I have a large json file that looks like that:
[
{"name": "item1"},
{"name": "item2"},
{"name": "item3"}
]
I want to stream this file (pretty easy so far), for each line run a asynchronous function (that returns a promise) upon the resolve/reject call edit this line.
The result of the input file could be:
[
{"name": "item1", "response": 200},
{"name": "item2", "response": 404},
{"name": "item3"} // not processed yet
]
I do not wish to create another file, I want to edit on the fly the SAME FILE (if possible!).
Thanks :)

I don't really answer the question, but don't think it can be answered in a satisfactory way anyway, so here are my 2 cents.
I assume that you know how to stream line by line, and run the function, and that the only problem you have is editing the file that you are reading from.
Consequences of inserting
It is not possible to natively insert data into any file (which is what you want to do by changing the JSON live). A file can only grow up at its end.
So inserting 10 bytes of data at the beginning of a 1GB file means that you need to write 1GB to the disk (to move all the data 10 bytes further).
Your filesystem does not understand JSON, and just sees that you are inserting bytes in the middle of a big file so this is going to be very slow.
So, yes it is possible to do.
Write a wrapper over the file API in NodeJS with an insert() method.
Then write some more code to be able to know where to insert bytes into a JSON file without loading the whole file and not producing invalid JSON at the end.
Now I would not recommend it :)
=> Read this question: Is it possible to prepend data to an file without rewriting?
Why do it then?
I assume that want to either
Be able to kill your process at any time, and easily resume work by reading the file again.
Retry partially treated files to fill only the missing bits.
First solution: Use a database
Abstracting the work that needs to be done to live edit files at random places is the sole purpose of existence of databases.
They all exist only to abstract the magic that is behind UPDATE mytable SET name = 'a_longer_name_that_the_name_that_was_there_before' where name = 'short_name'.
Have a look at LevelUP/Down, sqlite, etc...
They will abstract all the magic that needs to be done in your JSON file!
Second solution: Use multiple files
When you stream your file, write two new files!
One that contain current position in the input file and lines that need to be retried
The other one the expected result.
You will also be able to kill your process at any time and restart

According to this answer writing to the same file while reading is not reliable. As a commenter there says, better to write to a temporary file, and then delete the original and rename the temp file over it.
To create a stream of lines you can use byline. Then for each line, apply some operation and pipe it out to the output file.
Something like this:
var fs = require('fs');
var stream = require('stream');
var util = require('util');
var LineStream = require('byline').LineStream;
function Modify(options) {
stream.Transform.call(this, options);
}
util.inherits(Modify, stream.Transform);
Modify.prototype._transform = function(chunk, encoding, done) {
var self = this;
setTimeout(function() {
// your modifications here, note that the exact regex depends on
// your json format and is probably the most brittle part of this
var modifiedChunk = chunk.toString();
if (modifiedChunk.search('response:[^,}]+') === -1) {
modifiedChunk = modifiedChunk
.replace('}', ', response: ' + new Date().getTime() + '}') + '\n';
}
self.push(modifiedChunk);
done();
}, Math.random() * 2000 + 1000); // to simulate an async modification
};
var inPath = './data.json';
var outPath = './out.txt';
fs.createReadStream(inPath)
.pipe(new LineStream())
.pipe(new Modify())
.pipe(fs.createWriteStream(outPath))
.on('close', function() {
// replace input with output
fs.unlink(inPath, function() {
fs.rename(outPath, inPath);
});
});
Note that the above results in only one async operation happening at a time. You could also save the modifications to an array and once all of them are done write the lines from the array to a file, like this:
var fs = require('fs');
var stream = require('stream');
var LineStream = require('byline').LineStream;
var modifiedLines = [];
var modifiedCount = 0;
var inPath = './data.json';
var allModified = new Promise(function(resolve, reject) {
fs.createReadStream(inPath).pipe(new LineStream()).on('data', function(chunk) {
modifiedLines.length++;
var index = modifiedLines.length - 1;
setTimeout(function() {
// your modifications here
var modifiedChunk = chunk.toString();
if (modifiedChunk.search('response:[^,}]+') === -1) {
modifiedChunk = modifiedChunk
.replace('}', ', response: ' + new Date().getTime() + '}');
}
modifiedLines[index] = modifiedChunk;
modifiedCount++;
if (modifiedCount === modifiedLines.length) {
resolve();
}
}, Math.random() * 2000 + 1000);
});
}).then(function() {
fs.writeFile(inPath, modifiedLines.join('\n'));
}).catch(function(reason) {
console.error(reason);
});
If instead of lines you wish to stream chunks of valid json which would be a more robust approach, take a look at JSONStream.

As mentioned in the comment, the file you have is not proper JSON, although is valid in Javascript. In order to generate proper JSON, JSON.stringify() could be used. I think it would make life difficult for others to parse nonstandard JSON as well, therefore I would recommend furnishing a new output file instead of keeping the original one.
However, it is still possible to parse the original file as JSON. This is possible via eval('(' + procline + ')');, however it is not secure to take external data into node.js like this.
const fs = require('fs');
const readline = require('readline');
const fr = fs.createReadStream('file1');
const rl = readline.createInterface({
input: fr
});
rl.on('line', function (line) {
if (line.match(new RegExp("\{name"))) {
var procline = "";
if (line.trim().split('').pop() === ','){
procline = line.trim().substring(0,line.trim().length-1);
}
else{
procline = line.trim();
}
var lineObj = eval('(' + procline + ')');
lineObj.response = 200;
console.log(JSON.stringify(lineObj));
}
});
The output would be like this:
{"name":"item1","response":200}
{"name":"item2","response":200}
{"name":"item3","response":200}
Which is line-delimited JSON (LDJSON) and could be useful for streaming stuff, without the need for leading and trailing [, ], or ,. There is an ldjson-stream package for it as well.

Related

How to handle large JSON files in NodeJS and get specifc values out of them? [duplicate]

I have a file which stores many JavaScript objects in JSON form and I need to read the file, create each of the objects, and do something with them (insert them into a db in my case). The JavaScript objects can be represented a format:
Format A:
[{name: 'thing1'},
....
{name: 'thing999999999'}]
or Format B:
{name: 'thing1'} // <== My choice.
...
{name: 'thing999999999'}
Note that the ... indicates a lot of JSON objects. I am aware I could read the entire file into memory and then use JSON.parse() like this:
fs.readFile(filePath, 'utf-8', function (err, fileContents) {
if (err) throw err;
console.log(JSON.parse(fileContents));
});
However, the file could be really large, I would prefer to use a stream to accomplish this. The problem I see with a stream is that the file contents could be broken into data chunks at any point, so how can I use JSON.parse() on such objects?
Ideally, each object would be read as a separate data chunk, but I am not sure on how to do that.
var importStream = fs.createReadStream(filePath, {flags: 'r', encoding: 'utf-8'});
importStream.on('data', function(chunk) {
var pleaseBeAJSObject = JSON.parse(chunk);
// insert pleaseBeAJSObject in a database
});
importStream.on('end', function(item) {
console.log("Woot, imported objects into the database!");
});*/
Note, I wish to prevent reading the entire file into memory. Time efficiency does not matter to me. Yes, I could try to read a number of objects at once and insert them all at once, but that's a performance tweak - I need a way that is guaranteed not to cause a memory overload, not matter how many objects are contained in the file.
I can choose to use FormatA or FormatB or maybe something else, just please specify in your answer. Thanks!
To process a file line-by-line, you simply need to decouple the reading of the file and the code that acts upon that input. You can accomplish this by buffering your input until you hit a newline. Assuming we have one JSON object per line (basically, format B):
var stream = fs.createReadStream(filePath, {flags: 'r', encoding: 'utf-8'});
var buf = '';
stream.on('data', function(d) {
buf += d.toString(); // when data is read, stash it in a string buffer
pump(); // then process the buffer
});
function pump() {
var pos;
while ((pos = buf.indexOf('\n')) >= 0) { // keep going while there's a newline somewhere in the buffer
if (pos == 0) { // if there's more than one newline in a row, the buffer will now start with a newline
buf = buf.slice(1); // discard it
continue; // so that the next iteration will start with data
}
processLine(buf.slice(0,pos)); // hand off the line
buf = buf.slice(pos+1); // and slice the processed data off the buffer
}
}
function processLine(line) { // here's where we do something with a line
if (line[line.length-1] == '\r') line=line.substr(0,line.length-1); // discard CR (0x0D)
if (line.length > 0) { // ignore empty lines
var obj = JSON.parse(line); // parse the JSON
console.log(obj); // do something with the data here!
}
}
Each time the file stream receives data from the file system, it's stashed in a buffer, and then pump is called.
If there's no newline in the buffer, pump simply returns without doing anything. More data (and potentially a newline) will be added to the buffer the next time the stream gets data, and then we'll have a complete object.
If there is a newline, pump slices off the buffer from the beginning to the newline and hands it off to process. It then checks again if there's another newline in the buffer (the while loop). In this way, we can process all of the lines that were read in the current chunk.
Finally, process is called once per input line. If present, it strips off the carriage return character (to avoid issues with line endings – LF vs CRLF), and then calls JSON.parse one the line. At this point, you can do whatever you need to with your object.
Note that JSON.parse is strict about what it accepts as input; you must quote your identifiers and string values with double quotes. In other words, {name:'thing1'} will throw an error; you must use {"name":"thing1"}.
Because no more than a chunk of data will ever be in memory at a time, this will be extremely memory efficient. It will also be extremely fast. A quick test showed I processed 10,000 rows in under 15ms.
Just as I was thinking that it would be fun to write a streaming JSON parser, I also thought that maybe I should do a quick search to see if there's one already available.
Turns out there is.
JSONStream "streaming JSON.parse and stringify"
Since I just found it, I've obviously not used it, so I can't comment on its quality, but I'll be interested to hear if it works.
It does work consider the following Javascript and _.isString:
stream.pipe(JSONStream.parse('*'))
.on('data', (d) => {
console.log(typeof d);
console.log("isString: " + _.isString(d))
});
This will log objects as they come in if the stream is an array of objects. Therefore the only thing being buffered is one object at a time.
As of October 2014, you can just do something like the following (using JSONStream) - https://www.npmjs.org/package/JSONStream
var fs = require('fs'),
JSONStream = require('JSONStream'),
var getStream() = function () {
var jsonData = 'myData.json',
stream = fs.createReadStream(jsonData, { encoding: 'utf8' }),
parser = JSONStream.parse('*');
return stream.pipe(parser);
}
getStream().pipe(MyTransformToDoWhateverProcessingAsNeeded).on('error', function (err) {
// handle any errors
});
To demonstrate with a working example:
npm install JSONStream event-stream
data.json:
{
"greeting": "hello world"
}
hello.js:
var fs = require('fs'),
JSONStream = require('JSONStream'),
es = require('event-stream');
var getStream = function () {
var jsonData = 'data.json',
stream = fs.createReadStream(jsonData, { encoding: 'utf8' }),
parser = JSONStream.parse('*');
return stream.pipe(parser);
};
getStream()
.pipe(es.mapSync(function (data) {
console.log(data);
}));
$ node hello.js
// hello world
I had similar requirement, i need to read a large json file in node js and process data in chunks and call a api and save in mongodb.
inputFile.json is like:
{
"customers":[
{ /*customer data*/},
{ /*customer data*/},
{ /*customer data*/}....
]
}
Now i used JsonStream and EventStream to achieve this synchronously.
var JSONStream = require("JSONStream");
var es = require("event-stream");
fileStream = fs.createReadStream(filePath, { encoding: "utf8" });
fileStream.pipe(JSONStream.parse("customers.*")).pipe(
es.through(function(data) {
console.log("printing one customer object read from file ::");
console.log(data);
this.pause();
processOneCustomer(data, this);
return data;
}),
function end() {
console.log("stream reading ended");
this.emit("end");
}
);
function processOneCustomer(data, es) {
DataModel.save(function(err, dataModel) {
es.resume();
});
}
I realize that you want to avoid reading the whole JSON file into memory if possible, however if you have the memory available it may not be a bad idea performance-wise. Using node.js's require() on a json file loads the data into memory really fast.
I ran two tests to see what the performance looked like on printing out an attribute from each feature from a 81MB geojson file.
In the 1st test, I read the entire geojson file into memory using var data = require('./geo.json'). That took 3330 milliseconds and then printing out an attribute from each feature took 804 milliseconds for a grand total of 4134 milliseconds. However, it appeared that node.js was using 411MB of memory.
In the second test, I used #arcseldon's answer with JSONStream + event-stream. I modified the JSONPath query to select only what I needed. This time the memory never went higher than 82MB, however, the whole thing now took 70 seconds to complete!
I wrote a module that can do this, called BFJ. Specifically, the method bfj.match can be used to break up a large stream into discrete chunks of JSON:
const bfj = require('bfj');
const fs = require('fs');
const stream = fs.createReadStream(filePath);
bfj.match(stream, (key, value, depth) => depth === 0, { ndjson: true })
.on('data', object => {
// do whatever you need to do with object
})
.on('dataError', error => {
// a syntax error was found in the JSON
})
.on('error', error => {
// some kind of operational error occurred
})
.on('end', error => {
// finished processing the stream
});
Here, bfj.match returns a readable, object-mode stream that will receive the parsed data items, and is passed 3 arguments:
A readable stream containing the input JSON.
A predicate that indicates which items from the parsed JSON will be pushed to the result stream.
An options object indicating that the input is newline-delimited JSON (this is to process format B from the question, it's not required for format A).
Upon being called, bfj.match will parse JSON from the input stream depth-first, calling the predicate with each value to determine whether or not to push that item to the result stream. The predicate is passed three arguments:
The property key or array index (this will be undefined for top-level items).
The value itself.
The depth of the item in the JSON structure (zero for top-level items).
Of course a more complex predicate can also be used as necessary according to requirements. You can also pass a string or a regular expression instead of a predicate function, if you want to perform simple matches against property keys.
If you have control over the input file, and it's an array of objects, you can solve this more easily. Arrange to output the file with each record on one line, like this:
[
{"key": value},
{"key": value},
...
This is still valid JSON.
Then, use the node.js readline module to process them one line at a time.
var fs = require("fs");
var lineReader = require('readline').createInterface({
input: fs.createReadStream("input.txt")
});
lineReader.on('line', function (line) {
line = line.trim();
if (line.charAt(line.length-1) === ',') {
line = line.substr(0, line.length-1);
}
if (line.charAt(0) === '{') {
processRecord(JSON.parse(line));
}
});
function processRecord(record) {
// Process the records one at a time here!
}
I solved this problem using the split npm module. Pipe your stream into split, and it will "Break up a stream and reassemble it so that each line is a chunk".
Sample code:
var fs = require('fs')
, split = require('split')
;
var stream = fs.createReadStream(filePath, {flags: 'r', encoding: 'utf-8'});
var lineStream = stream.pipe(split());
linestream.on('data', function(chunk) {
var json = JSON.parse(chunk);
// ...
});
Using the #josh3736 answer, but for ES2021 and Node.js 16+ with async/await + AirBnb rules:
import fs from 'node:fs';
const file = 'file.json';
/**
* #callback itemProcessorCb
* #param {object} item The current item
*/
/**
* Process each data chunk in a stream.
*
* #param {import('fs').ReadStream} readable The readable stream
* #param {itemProcessorCb} itemProcessor A function to process each item
*/
async function processChunk(readable, itemProcessor) {
let data = '';
let total = 0;
// eslint-disable-next-line no-restricted-syntax
for await (const chunk of readable) {
// join with last result, remove CR and get lines
const lines = (data + chunk).replace('\r', '').split('\n');
// clear last result
data = '';
// process lines
let line = lines.shift();
const items = [];
while (line) {
// check if isn't a empty line or an array definition
if (line !== '' && !/[\[\]]+/.test(line)) {
try {
// remove the last comma and parse json
const json = JSON.parse(line.replace(/\s?(,)+\s?$/, ''));
items.push(json);
} catch (error) {
// last line gets only a partial line from chunk
// so we add this to join at next loop
data += line;
}
}
// continue
line = lines.shift();
}
total += items.length;
// Process items in parallel
await Promise.all(items.map(itemProcessor));
}
console.log(`${total} items processed.`);
}
// Process each item
async function processItem(item) {
console.log(item);
}
// Init
try {
const readable = fs.createReadStream(file, {
flags: 'r',
encoding: 'utf-8',
});
processChunk(readable, processItem);
} catch (error) {
console.error(error.message);
}
For a JSON like:
[
{ "name": "A", "active": true },
{ "name": "B", "active": false },
...
]
https.get(url1 , function(response) {
var data = "";
response.on('data', function(chunk) {
data += chunk.toString();
})
.on('end', function() {
console.log(data)
});
});
I think you need to use a database. MongoDB is a good choice in this case because it is JSON compatible.
UPDATE:
You can use mongoimport tool to import JSON data into MongoDB.
mongoimport --collection collection --file collection.json

Electron - write file before open save dialog

I'm using electron to develop an app. after some encryption operations are done, I need to show a dialog to the user to save the file. The filename I want to give to the file is a random hash but I have no success also with this. I'm trying with this code but the file will not be saved. How I can fix this?
const downloadPath = app.getPath('downloads')
ipcMain.on('encryptFiles', (event, data) => {
let output = [];
const password = data.password;
data.files.forEach( (file) => {
const buffer = fs.readFileSync(file.path);
const dataURI = dauria.getBase64DataURI(buffer, file.type);
const encrypted = CryptoJS.AES.encrypt(dataURI, password).toString();
output.push(encrypted);
})
const filename = hash.createHash('md5').toString('hex');
console.log(filename)
const response = output.join(' :: ');
dialog.showSaveDialog({title: 'Save encrypted file', defaultPath: downloadPath }, () => {
fs.writeFile(`${filename}.mfs`, response, (err) => console.log(err) )
})
})
The problem you're experiencing is resulting from the asynchronous nature of Electron's UI functions: They do not take callback functions, but return promises instead. Thus, you do not have to pass in a callback function, but rather handle the promise's resolution. Note that this only applies to Electron >= version 6. If you however run an older version of Electron, your code would be correct -- but then you should really update to a newer version (Electron v6 was released well over a year ago).
Adapting your code like below can be a starting point to solve your problem. However, since you do not state how you generate the hash (where does hash.createHash come from?; did you forget to declare/import hash?; did you forget to pass any message string?; are you using hash as an alias for NodeJS' crypto module?), it is (at this time) impossible to debug why you do not get any output from console.log (filename) (I assume you mean this by "in the code, the random filename will not be created"). Once you provide more details on this problem, I'd be happy to update this answer accordingly.
As for the default filename: As per the Electron documentation, you can pass a file path into dialog.showSaveDialog () to provide the user with a default filename.
The file type extension you're using should also actually be passed with the file extension into the save dialog. Also passing this file extension as a filter into the dialog will prevent users from selecting any other file type, which is ultimately what you're also currently doing by appending it to the filename.
Also, you could utilise CryptoJS for the filename generation: Given some arbitrary string, which could really be random bytes, you could do: filename = CryptoJS.MD5 ('some text here') + '.mfs'; However, remember to choose the input string wisely. MD5 has been broken and should thus no longer be used to store secrets -- using any known information which is crucial for the encryption of the files you're storing (such as data.password) is inherently insecure. There are some good examples on how to create random strings in JavaScript around the internet, along with this answer here on SO.
Taking all these issues into account, one might end up with the following code:
const downloadPath = app.getPath('downloads'),
path = require('path');
ipcMain.on('encryptFiles', (event, data) => {
let output = [];
const password = data.password;
data.files.forEach((file) => {
const buffer = fs.readFileSync(file.path);
const dataURI = dauria.getBase64DataURI(buffer, file.type);
const encrypted = CryptoJS.AES.encrypt(dataURI, password).toString();
output.push(encrypted);
})
// not working:
// const filename = hash.createHash('md5').toString('hex') + '.mfs';
// alternative requiring more research on your end
const filename = CryptoJS.MD5('replace me with some random bytes') + '.mfs';
console.log(filename);
const response = output.join(' :: ');
dialog.showSaveDialog(
{
title: 'Save encrypted file',
defaultPath: path.format ({ dir: downloadPath, base: filename }), // construct a proper path
filters: [{ name: 'Encrypted File (*.mfs)', extensions: ['mfs'] }] // filter the possible files
}
).then ((result) => {
if (result.canceled) return; // discard the result altogether; user has clicked "cancel"
else {
var filePath = result.filePath;
if (!filePath.endsWith('.mfs')) {
// This is an additional safety check which should not actually trigger.
// However, generally appending a file extension to a filename is not a
// good idea, as they would be (possibly) doubled without this check.
filePath += '.mfs';
}
fs.writeFile(filePath, response, (err) => console.log(err) )
}
}).catch ((err) => {
console.log (err);
});
})

How do I write a LZ compressed string to text file using JXA?

I am trying to write a JXA script in Apple Script Editor, that compresses a string using the LZ algorithm and writes it to a text (JSON) file:
var story = "Once upon a time in Silicon Valley..."
var storyC = LZString.compress(story)
var data_to_write = "{\x22test\x22\x20:\x20\x22"+storyC+"\x22}"
app.displayAlert(data_to_write)
var desktopString = app.pathTo("desktop").toString()
var file = `${desktopString}/test.json`
writeTextToFile(data_to_write, file, true)
Everything works, except that the LZ compressed string is just transformed to a set of "?" by the time it reaches the output file, test.json.
It should look like:
{"test" : "㲃냆੠Њޱᐈ攀렒삶퓲ٔ쀛䳂䨀푖㢈Ӱນꀀ"}
Instead it looks like:
{"test" : "????????????????????"}
I have a feeling the conversion is happening in the app.write command used by the writeTextToFile() function (which I pulled from an example in Apple's Mac Automation Scripting Guide):
var app = Application.currentApplication()
app.includeStandardAdditions = true
function writeTextToFile(text, file, overwriteExistingContent) {
try {
// Convert the file to a string
var fileString = file.toString()
// Open the file for writing
var openedFile = app.openForAccess(Path(fileString), { writePermission: true })
// Clear the file if content should be overwritten
if (overwriteExistingContent) {
app.setEof(openedFile, { to: 0 })
}
// Write the new content to the file
app.write(text, { to: openedFile, startingAt: app.getEof(openedFile) })
// Close the file
app.closeAccess(openedFile)
// Return a boolean indicating that writing was successful
return true
}
catch(error) {
try {
// Close the file
app.closeAccess(file)
}
catch(error) {
// Report the error is closing failed
console.log(`Couldn't close file: ${error}`)
}
// Return a boolean indicating that writing was successful
return false
}
}
Is there a substitute command for app.write that maintains the LZ compressed string / a better way to accomplish what I am trying to do?
In addition, I am using the readFile() function (also from the Scripting Guide) to load the LZ string back into the script:
function readFile(file) {
// Convert the file to a string
var fileString = file.toString()
// Read the file and return its contents
return app.read(Path(fileString))
}
But rather than returning:
{"test" : "㲃냆੠Њޱᐈ攀렒삶퓲ٔ쀛䳂䨀푖㢈Ӱນꀀ"}
It is returning:
"{\"test\" : \"㲃냆੠Њޱᐈ攀렒삶퓲ٔ쀛䳂䨀푖㢈Ӱນꀀ\"}"
Does anybody know a fix for this too?
I know that it is possible to use Cocoa in JXA scripts, so maybe the solution lies therein?
I am just getting to grips with JavaScript so I'll admit trying to grasp Objective-C or Swift is way beyond me right now.
I look forward to any solutions and/or pointers that you might be able to provide me. Thanks in advance!
After some further Googl'ing, I came across these two posts:
How can I write UTF-8 files using JavaScript for Mac Automation?
read file as class utf8
I have thus altered my script accordingly.
writeTextToFile() now looks like:
function writeTextToFile(text, file) {
// source: https://stackoverflow.com/a/44293869/11616368
var nsStr = $.NSString.alloc.initWithUTF8String(text)
var nsPath = $(file).stringByStandardizingPath
var successBool = nsStr.writeToFileAtomicallyEncodingError(nsPath, false, $.NSUTF8StringEncoding, null)
if (!successBool) {
throw new Error("function writeFile ERROR:\nWrite to File FAILED for:\n" + file)
}
return successBool
};
While readFile() looks like:
ObjC.import('Foundation')
const readFile = function (path, encoding) {
// source: https://github.com/JXA-Cookbook/JXA-Cookbook/issues/25#issuecomment-271204038
pathString = path.toString()
!encoding && (encoding = $.NSUTF8StringEncoding)
const fm = $.NSFileManager.defaultManager
const data = fm.contentsAtPath(pathString)
const str = $.NSString.alloc.initWithDataEncoding(data, encoding)
return ObjC.unwrap(str)
};
Both use Objective-C to overcome app.write and app.read's inability to handle UTF-8.

Use Fetch Streams API to consume chunked data asynchronously without using recursion

I'm using the JavaScript fetch streams API to consume chunked JSON asynchronously like in this answer.
My application may be receiving up to 25 small JSON objects per second (one for each frame in a video) over the span of an hour.
When the incoming chunks are large (1000+ JSON objects per chunk), my code functions well - fast, minimal memory use - it can easily receive 1,000,000 JSON objects reliably.
When the incoming chunks are smaller (5 JSON objects per chunk), my code functions poorly - slow, lots of memory consumption. The browser dies at about 50,000 JSON objects.
After doing a lot of debugging in the Developer tools, it appears the problem lies in the recursive nature of the code.
I tried to remove the recursion, but it seems required because the API is reliant on my code returning a promise to chain?!
How do I remove this recursion, or should I use something other than fetch?
Code with recursion (works)
String.prototype.replaceAll = function(search, replacement) {
var target = this;
return target.replace(new RegExp(search, 'g'), replacement);
};
results = []
fetch('http://localhost:9999/').then(response => {
const reader = response.body.getReader();
td = new TextDecoder("utf-8");
buffer = "";
reader.read().then(function processText({ done, value }) {
if (done) {
console.log("Stream done.");
return;
}
try {
decoded = td.decode(value);
buffer += decoded;
if (decoded.length != 65536){
toParse = "["+buffer.trim().replaceAll("\n",",")+"]";
result = JSON.parse(toParse);
results.push(...result);
console.log("Received " + results.length.toString() + " objects")
buffer = "";
}
}
catch(e){
// Doesn't need to be reported, because partial JSON result will be parsed next time around (from buffer).
//console.log("EXCEPTION:"+e);
}
return reader.read().then(processText);
})
});
Code without recursion (doesn't work)
String.prototype.replaceAll = function(search, replacement) {
var target = this;
return target.replace(new RegExp(search, 'g'), replacement);
};
results = []
finished = false
fetch('http://localhost:9999/').then(response => {
const reader = response.body.getReader();
td = new TextDecoder("utf-8");
buffer = "";
lastResultSize = -1
while (!finished)
if (lastResultSize < results.length)
{
lastResultSize = results.length;
reader.read().then(function processText({ done, value }) {
if (done) {
console.log("Stream done.");
finished = true;
return;
}
else
try {
decoded = td.decode(value);
//console.log("Received chunk " + decoded.length.toString() + " in length");
buffer += decoded;
if (decoded.length != 65536){
toParse = "["+buffer.trim().replaceAll("\n",",")+"]";
result = JSON.parse(toParse);
results.push(...result);
console.log("Received " + results.length.toString() + " objects")
buffer = "";
//console.log("Parsed chunk " + toParse.length.toString() + " in length");
}
}
catch(e) {
// Doesn't need to be reported, because partial JSON result will be parsed next time around (from buffer).
//console.log("EXCEPTION:"+e);
}
})
}
});
For completeness, here is the python code I'm using on the test server. Note the line containing sleep which changes chunking behavior:
import io
import urllib
import inspect
from http.server import HTTPServer,BaseHTTPRequestHandler
from time import sleep
class TestServer(BaseHTTPRequestHandler):
def do_GET(self):
args = urllib.parse.parse_qs(self.path[2:])
args = {i:args[i][0] for i in args}
response = ''
self.send_response(200)
self.send_header('Content-type', 'text/html')
self.send_header('Access-Control-Allow-Origin', '*')
self.send_header('Transfer-Encoding', 'chunked')
self.end_headers()
for i in range (1000000):
self.wfile.write(bytes(f'{{"x":{i}, "text":"fred!"}}\n','utf-8'))
sleep(0.001) # Comment this out for bigger chunks sent to the client!
def main(server_port:"Port to serve on."=9999,server_address:"Local server name."=''):
httpd = HTTPServer((server_address, server_port), TestServer)
print(f'Serving on http://{httpd.server_name}:{httpd.server_port} ...')
httpd.serve_forever()
if __name__ == '__main__':
main()
The part you're missing is that the function passed to .then() is always called asynchronously, i.e. with an empty stack. So there is no actual recursion here. This is also why your 'without recursion' version doesn't work.
The simple solution to this is to use async functions and the await statement. If you call read() like this:
const {value, done} = await reader.read();
...then you can call it in a loop and it will work how you would expect.
I don't know specifically where your memory leak is, but your use of global variables looks like a problem. I recommend you always put 'use strict'; at the top of your code so the compiler will catch these problems for you. Then use let or const whenever you declare a variable.
I recommend you use TextDecoderStream to avoid problems when a character is split between multiple chunks. You will also have issues when a JSON object is split between multiple chunks.
See Append child writable stream demo for how to do this safely (but note that you need TextDecoderStream where that demo has "TextDecoder").
Note also the use of a WritableStream in that demo. Firefox doesn't support it yet AFAIK, but WritableStream provides much easier syntax to consume chunks without having to explicitly loop or recurse. You can find the web streams polyfill here.

nodejs prepending to a file

For Node.js, what is the best way to prepend to a file in a way SIMILAR to
fs.appendFile(path.join(__dirname, 'app.log'), 'appendme', 'utf8')
Personally, the best way really revolves around a asynchronous solution to create a log where I can basically push onto the file from the top.
This solution isn't mine and I don't know where it's from but it works.
const data = fs.readFileSync('message.txt')
const fd = fs.openSync('message.txt', 'w+')
const insert = Buffer.from("text to prepend \n")
fs.writeSync(fd, insert, 0, insert.length, 0)
fs.writeSync(fd, data, 0, data.length, insert.length)
fs.close(fd, (err) => {
if (err) throw err;
});
It is impossible to add to a beginning of a file. See this question for the similar problem in C or this question for the similar problem in C#.
I suggest you do your logging in the conventional way (that is, log to the end of file).
Otherwise, there is no way around reading the file, adding the text to the start and writing it back to the file which can get really costly really fast.
It seems it is indeed possible with https://www.npmjs.com/package/prepend-file
Here is an example of how to prepend text to a file using gulp and a custom built function.
var through = require('through2');
gulp.src('somefile.js')
.pipe(insert('text to prepend with'))
.pipe(gulp.dest('Destination/Path/'))
function insert(text) {
function prefixStream(prefixText) {
var stream = through();
stream.write(prefixText);
return stream;
}
let prefixText = new Buffer(text + "\n\n"); // allocate ahead of time
// creating a stream through which each file will pass
var stream = through.obj(function (file, enc, cb) {
//console.log(file.contents.toString());
if (file.isBuffer()) {
file.contents = new Buffer(prefixText.toString() + file.contents.toString());
}
if (file.isStream()) {
throw new Error('stream files are not supported for insertion, they must be buffered');
}
// make sure the file goes through the next gulp plugin
this.push(file);
// tell the stream engine that we are done with this file
cb();
});
// returning the file stream
return stream;
}
Sources: [cole_gentry_github_dealingWithStreams][1]
Its possible by using the prepend-file node module. Do the following:
npm i prepend-file -S
import prepend-file module in your respective code.
Example:
let firstFile = 'first.txt';
let secondFile = 'second.txt';
prependFile(firstFile, secondFile, () => {
console.log('file prepend successfully');
})

Categories