NodeJS: Running through a file by streams - javascript

I'm trying to read through a file using a stream. The file is in a custom format that has some number of sections, each 66 bytes long. I have to check the value of the first 32 bytes against a given 32 bytes, and if they match, return the remaining 34 bytes. Currently I'm doing the following:
const fs = require('fs');

/**
 * @param {string} known
 */
function check(known) {
    let stream = fs.createReadStream('path/to/file', {highWaterMark: 66});
    let out = '';
    stream.on('data', (chunk) => {
        let temp = '';
        for (let i = 0; i < 32; i++) {
            temp += String.fromCharCode(chunk.at(i));
        }
        if (temp === known) {
            for (let i = 32; i < 66; i++) {
                out += String.fromCharCode(chunk.at(i));
            }
        }
    });
    return out;
}
However (from checking with console.log) I know that the callback passed to stream.on runs after check "finishes", so regardless of whether the given string is found, the returned value is always the same. How can I fix this? Thanks in advance.

Streams are non-blocking and asynchronous. They call their event handlers some time in the future, after your check() function has returned. That is how they work.
As such, you cannot directly return your result from the function because the function returns before the result is known. Instead, you have to return the result using an asynchronous mechanism such as an event, a callback or a promise (I would prefer a promise in this case). See How to return the response from an asynchronous call for more detail on returning your asynchronously retrieved value.
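For illustration, here is a minimal sketch of wrapping the stream in a promise so the caller can await the result. It still assumes every 'data' chunk lines up exactly with one 66-byte record, which, as explained below, is not guaranteed:
const fs = require('fs');

function check(known) {
    return new Promise((resolve, reject) => {
        const stream = fs.createReadStream('path/to/file', { highWaterMark: 66 });
        stream.on('data', (chunk) => {
            // 'latin1' maps each byte directly to a character code,
            // matching the String.fromCharCode loop in the question
            const header = chunk.toString('latin1', 0, 32);
            if (header === known) {
                stream.destroy();                            // stop reading once found
                resolve(chunk.toString('latin1', 32, 66));   // the remaining 34 bytes
            }
        });
        stream.on('end', () => resolve(''));   // not found
        stream.on('error', reject);
    });
}

// usage
// check(someKnownHeader).then(rest => console.log(rest)).catch(console.error);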
But, using a stream to read a file that is structured in 66 byte chunks is the hard way to write this code because data blocks are going to be randomly sized and are not guaranteed to line up with your 66 byte blocks. It would be simpler to use const fileHandle = await fs.promises.open(...) and fileHandle.read(...) to read a 66 byte chunk from the beginning of the file, check the 32 bytes you want to check and then communicate back the remaining 34 bytes using a promise.
Here's one such way to write it using handle.read() instead:
const fsp = require('fs').promises;

async function check(known) {
    const kReadLen = 66;
    const kHeaderLen = 32;
    let handle = await fsp.open('path/to/file');
    try {
        let buffer = Buffer.alloc(kReadLen, 0);
        let { bytesRead } = await handle.read(buffer, 0, kReadLen, 0);
        if (bytesRead !== kReadLen) {
            throw new Error("Insufficient data in the file");
        }
        const headerStr = buffer.toString('utf8', 0, kHeaderLen);
        if (headerStr === known) {
            return buffer.toString('utf8', kHeaderLen, kReadLen);
        } else {
            return "";
        }
    } finally {
        handle.close();
    }
}
Sample Usage:
check(testKnown).then(result => {
    console.log(result);
}).catch(err => {
    console.log(err);
});

Related

How to correctly parse Json Objects from text Stream delimited by new lines

I have written some code to read JSON objects from a stream returned by the fetch API. However, I am unable to parse out the JSON objects successfully. I want to return the JSON objects from the chunks in an array and feed the results to a transformer that will apply some custom transformation to the final JSON. The main problem is that I can't deal with the partial lines left over after splitting each chunk on newlines, since I don't know how the data returned in the stream is chunked. I have written some code which does not work entirely, and I am looking for some help to get it parsing out all the JSON objects. First, I wrote an async iterator that works fine:
function streamAsyncIterator(stream) {
    const reader = stream.pipeThrough(new TextDecoderStream()).getReader();
    return {
        next() {
            return reader.read();
        },
        return() {
            reader.releaseLock();
            return {};
        },
        [Symbol.asyncIterator]() {
            return this;
        }
    };
}
Next, I use the iterator to get the lines to convert to JSON objects. Here is that code, which partially works:
const fetchProductsStream = async () => {
    const body = {
        searchRange: ['fragrance', 'perfumery', 'cologne'],
        exclusionCategory: ['discounted']
    }
    let time1 = performance.now();
    const response = await fetch('/products/stream', {
        method: 'POST',
        body: JSON.stringify(body)
    });
    let chunkCount = 0;
    let remaining = "";
    let currentChunk = "";
    let totalItems = 0;
    for await (const chunk of streamAsyncIterator(response.body)) {
        let items = [];
        try {
            currentChunk = chunk;
            let lastIndex;
            if (remaining.trim().length > 1) {
                currentChunk = remaining + chunk;
            }
            currentChunk.split(/\r?\n/).forEach((line, index) => {
                try {
                    items.push(JSON.parse(line));
                    console.log(`Pushed line ${line} to items array`);
                    lastIndex = index;
                } catch (err) {
                    console.error(`Error is ${err}`);
                }
            })
            totalItems += items.length;
            remaining = "";
            let i = currentChunk.indexOf(lastIndex);
            if (i != -1) {
                remaining = currentChunk.substring(lastIndex);
            }
            items = [];
        } catch (error) {
            console.log(error);
        }
        chunkCount++;
    }
    let time2 = performance.now();
    console.log(`Total number of items is ${totalItems}`);
    console.log(`Total number of chunks :: ${chunkCount} in time
    ${(time2 - time1)/1000} seconds`)
}
Here is what a sample looks like
"{"productId":"1671","category":"fragrance","categoryId":1WW23,"productName":"Paco Rabane","productClass":"M","warehouseCode":"1242","alternateName":"ALT_ELL332"}\n{"productId":"1671","category":"fragrance","categoryId":1WW23,"productName":"Paco Rabane","productClass":"M","warehouseCode":"1242","alternateName":"ALT_ELL332"}\n{"productId":"1671","category":"fragrance","categoryId":1WW23,"productName":"Paco Rabane","productClass":"M","warehouseCode":"1242","alternateName":"ALT_ELL332","category":{"desc":"1654 Cologne Perfume","code":"221","feature":"1","salesCode":"S2237"}\n{"productId":"1671","category":"fragrance","categoryId":1WW23,"productName":"Paco Rabane","productClass":"M","warehouseCode":"1242","alternateName":"ALT_ELL332"}"
This is my attempt, which does not work perfectly, since some JSON objects are not correctly parsed. Most of them are, so I'd appreciate some help in getting it to work completely. If there is a better way to do it, I would also be grateful for that solution.
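One common pattern for this kind of newline-delimited JSON stream is to split each chunk on newlines, keep the last (possibly incomplete) piece as the carry-over for the next chunk, and parse only the complete lines. A minimal sketch of that idea, assuming the same streamAsyncIterator helper and /products/stream endpoint from the question:
const fetchProductsStream = async () => {
    const response = await fetch('/products/stream', {
        method: 'POST',
        body: JSON.stringify({
            searchRange: ['fragrance', 'perfumery', 'cologne'],
            exclusionCategory: ['discounted']
        })
    });
    const items = [];
    let remaining = '';
    for await (const chunk of streamAsyncIterator(response.body)) {
        const pieces = (remaining + chunk).split(/\r?\n/);
        // the last piece may be an incomplete line; carry it over to the next chunk
        remaining = pieces.pop();
        for (const line of pieces) {
            if (line.trim().length === 0) continue;
            try {
                items.push(JSON.parse(line));   // each complete line is one JSON object
            } catch (err) {
                console.error(`Could not parse line: ${line}`, err);
            }
        }
    }
    // whatever is left when the stream ends should be the final complete line
    if (remaining.trim().length > 0) {
        items.push(JSON.parse(remaining));
    }
    return items;
};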

Best practice for comparing two large files in Node.js

I want to compare two large files (5GB+) and find out whether they are the same or not. One solution I considered is hashing both with crypto and then comparing the hashes, but this would take a lot of time since I would have to go through both entire files instead of stopping when a difference is found.
Another solution I thought of was to compare the files as they are being streamed with fs.createReadStream() and break when a difference is found.
stream.on('data', (data) => {
    // compare the data from this stream with the other stream
});
But I am not quite sure how I can have two streams that are synchronized.
As requested in your comments, if you want to see how an implementation can be written to do this, here's one. Here's how it works:
Open each of the two files
Compare the two files sizes. If not the same, resolve false.
Allocate two 8k buffers (you can choose the size of buffer to use)
Read 8k of each file (or less if not 8k left in the file) into your buffers
Compare those two buffers. If not identical, resolve false.
When you finish comparing all the bytes, resolve true
Here's the code:
const fs = require('fs');
const fsp = fs.promises;

// resolves to true or false
async function compareFiles(fname1, fname2) {
    const kReadSize = 1024 * 8;
    let h1, h2;
    try {
        h1 = await fsp.open(fname1);
        h2 = await fsp.open(fname2);
        const [stat1, stat2] = await Promise.all([h1.stat(), h2.stat()]);
        if (stat1.size !== stat2.size) {
            return false;
        }
        const buf1 = Buffer.alloc(kReadSize);
        const buf2 = Buffer.alloc(kReadSize);
        let pos = 0;
        let remainingSize = stat1.size;
        while (remainingSize > 0) {
            let readSize = Math.min(kReadSize, remainingSize);
            let [r1, r2] = await Promise.all([h1.read(buf1, 0, readSize, pos), h2.read(buf2, 0, readSize, pos)]);
            if (r1.bytesRead !== readSize || r2.bytesRead !== readSize) {
                throw new Error("Failed to read desired number of bytes");
            }
            if (buf1.compare(buf2, 0, readSize, 0, readSize) !== 0) {
                return false;
            }
            remainingSize -= readSize;
            pos += readSize;
        }
        return true;
    } finally {
        if (h1) {
            await h1.close();
        }
        if (h2) {
            await h2.close();
        }
    }
}

// sample usage
compareFiles("temp.bin", "temp2.bin").then(result => {
    console.log(result);
}).catch(err => {
    console.log(err);
});
This could be sped up a bit by opening and closing the files in parallel, using Promise.allSettled() to track when both are open and then both closed. However, handling the case where one file opens and the other doesn't, without leaking the open file handle, takes a bit more code to do perfectly, so I kept it simpler here.
And, if you really wanted to optimize for performance, it would be worth testing larger buffers to see if it makes things faster or not.
It's also possible that buf1.equals(buf2) might be faster than buf1.compare(buf2), but you have to make sure that a partial buffer read at the end of the file still works properly when using that since .equals() always compares the entire buffer. You could build two versions and compare their performance.
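For instance, a sketch of what that check might look like with .equals(), comparing only the bytes actually read by taking subarray() views (which share memory rather than copying):
// drop-in replacement for the buf1.compare(...) test inside the read loop above
if (!buf1.subarray(0, readSize).equals(buf2.subarray(0, readSize))) {
    return false;
}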
Here's a more complicated version that opens and closes the files in parallel and might be slightly faster:
const fs = require('fs');
const fsp = fs.promises;

async function compareFiles(fname1, fname2) {
    const kReadSize = 1024 * 8;
    let h1, h2;
    try {
        let openResults = await Promise.allSettled([fsp.open(fname1), fsp.open(fname2)]);
        let err;
        if (openResults[0].status === "fulfilled") {
            h1 = openResults[0].value;
        } else {
            err = openResults[0].reason;
        }
        if (openResults[1].status === "fulfilled") {
            h2 = openResults[1].value;
        } else {
            err = openResults[1].reason;
        }
        // after h1 and h2 are set (so they can be properly closed),
        // throw any error we got
        if (err) {
            throw err;
        }
        const [stat1, stat2] = await Promise.all([h1.stat(), h2.stat()]);
        if (stat1.size !== stat2.size) {
            return false;
        }
        const buf1 = Buffer.alloc(kReadSize);
        const buf2 = Buffer.alloc(kReadSize);
        let pos = 0;
        let remainingSize = stat1.size;
        while (remainingSize > 0) {
            let readSize = Math.min(kReadSize, remainingSize);
            let [r1, r2] = await Promise.all([h1.read(buf1, 0, readSize, pos), h2.read(buf2, 0, readSize, pos)]);
            if (r1.bytesRead !== readSize || r2.bytesRead !== readSize) {
                throw new Error("Failed to read desired number of bytes");
            }
            if (buf1.compare(buf2, 0, readSize, 0, readSize) !== 0) {
                return false;
            }
            remainingSize -= readSize;
            pos += readSize;
        }
        return true;
    } finally {
        // does not return file close errors, but does hold resolving the promise
        // until the files are closed or had an error trying to close them.
        // Since we didn't write to the files, a close error would be fairly
        // unprecedented unless the disk went down.
        const closePromises = [];
        if (h1) {
            closePromises.push(h1.close());
        }
        if (h2) {
            closePromises.push(h2.close());
        }
        await Promise.allSettled(closePromises);
    }
}

compareFiles("temp.bin", "temp2.bin").then(result => {
    console.log(result);
}).catch(err => {
    console.log(err);
});
There are certainly libraries that do this, and file-sync-cmp is very popular (270k weekly downloads). It does the comparison in the simplest way, by reading the same number of bytes from the two files in different buffers, and then comparing the buffers byte by byte.
There's also a more modern library, filecompare, "using native Promises and native BufferTools (alloc and Buffer comparisons)".
Whenever practical, don't reinvent the wheel :)
Since the difference might be at the very end of the files, I guess calculating a hash of each file is the most straightforward and secure approach, even if costly.
Did you try the MD5-File npm package and get some performance indicators?
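For what it's worth, hashing each file with Node's built-in crypto module is only a few lines; a minimal sketch (using crypto directly rather than the md5-file package):
const crypto = require('crypto');
const fs = require('fs');

// stream each file through a hash so the whole file never sits in memory
function hashFile(path, algorithm = 'sha256') {
    return new Promise((resolve, reject) => {
        const hash = crypto.createHash(algorithm);
        fs.createReadStream(path)
            .on('error', reject)
            .on('data', (chunk) => hash.update(chunk))
            .on('end', () => resolve(hash.digest('hex')));
    });
}

async function sameByHash(file1, file2) {
    const [h1, h2] = await Promise.all([hashFile(file1), hashFile(file2)]);
    return h1 === h2;
}

// sameByHash("temp.bin", "temp2.bin").then(console.log);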

How to inject javascript in existing HTML response with node.js and cloudflare workers

I have a vanity URL pointing to a GitBook. GitBook doesn't support the insertion of arbitrary javascript snippets. At the moment GitBook has 4 "integrations" only.
I could route through my own VM server to accomplish this, but I have CloudFlare and I want to try out workers. (Javascript running at the CDN edge).
The CloudFlare worker environment makes header injection very easy, but there is no obvious way to do this.
It's important to process with a TransformStream so that processing is async and doesn't require memory buffering (for scalability and to minimise GC) - there's only a 5ms CPU time budget.
Overview:
To use for yourself, change the strings forHeadStart, forHeadEnd, and forBodyEnd.
This deferredInjection approach is the recommended way that minimises CPU time for the worker. It's more efficient because it only needs to parse the very start of the HTML. The other approach requires parsing of the whole head section for headInjection, and if you use bodyInjection it practically needs to parse the whole html response.
The deferredInjection approach works by injecting the content into the start of the head tag, then on the client-side at runtime your HTML content will be deployed to the desired places.
You can inject directly if needed using headInjection and/or bodyInjection: uncomment the related code, including the code in injectScripts, and set the strings for tagBytes that will be encoded.
This solution will only parse HTML content types
This solution works directly on bytes (not strings) for better efficiency. Searching for the bytes of the end-tag strings.
You could potentially target more end-tags, but usually you don't need to target more than these two
Processes data with streaming (the whole HTML string is not cached in memory). This lowers peak memory usage and speeds up time to first byte.
Handles a rare edge case where the closing tag is on a text read boundary. I believe a boundary might occur every ~1000 bytes (TCP packets 1000-1500 bytes each), and this can vary due to gzip compression.
Keeps the injection parsing code separate from the code that simply forwards the rest, for clarity.
You can disable the second body-tag injector by commenting it out if you don't need it - that will speed up processing.
I have tested this exact code for myself and it works. There might be remaining bugs (depending on the location of the closing tag, and on whether your server replies with partial HTML templates (body only)). I may have fixed one today (2019-06-28).
Code
addEventListener('fetch', event => {
event.passThroughOnException();
event.respondWith(handleRequest(event.request))
})
/**
 * Fetch the request and inject scripts into any HTML response
 * @param {Request} request
 */
async function handleRequest(request) {
const response = await fetch(request);
var ctype = response.headers.get('content-type');
if (!ctype || ctype.startsWith('text/html') === false)
return response; //Only parse html body
let { readable, writable } = new TransformStream();
let promise = injectScripts(response.body, writable);
return new Response(readable, response);
}
let encoder = new TextEncoder('utf-8');
let deferredInjection = function() {
let forHeadStart = `<script>var test = 1; //Start of head section</script>`;
let forHeadEnd = `<script>var test = 2; //End of head section</script>`;
let forBodyEnd = `<script>var test = 3; //End of body section</script><button>click</button>`;
let helper = `
${forHeadStart}
<script>
function appendHtmlTo(element, htmlContent) {
var temp = document.createElement('div');
temp.innerHTML = htmlContent;
while (temp.firstChild) {
element.appendChild(temp.firstChild);
};
}
let forHeadEnd = "${ btoa(forHeadEnd) }";
let forBodyEnd = "${ btoa(forBodyEnd) }";
if (forHeadEnd.length > 0) appendHtmlTo(document.head, atob(forHeadEnd));
if (forBodyEnd.length > 0) window.onload = function() {
appendHtmlTo(document.body, atob(forBodyEnd));
};
</script>
`;
return {
forInjection: encoder.encode(helper),
tagBytes: encoder.encode("<head>"),
insertAfterTag: true
};
}();
// let headInjection = {
// forInjection: encoder.encode("<script>var test = 1;</script>"),
// tagBytes: encoder.encode("</head>"), //case sensitive
// insertAfterTag: false
// };
// let bodyInjection = {
// forInjection: encoder.encode("<script>var test = 1;</script>"),
// tagBytes: encoder.encode("</body>"), //case sensitive
// insertAfterTag: false
// }
//console.log(bodyTagBytes);
encoder = null;
async function injectScripts(readable, writable) {
let processingState = {
readStream: readable,
writeStream: writable,
reader: readable.getReader(),
writer: writable.getWriter(),
leftOvers: null, //data left over after a closing tag is found
inputDone: false,
result: {charactersFound: 0, foundIndex: -1, afterHeadTag: -1} //Reused object for the duration of the request
};
await parseForInjection(processingState, deferredInjection);
//await parseForInjection(processingState, headInjection);
//await parseForInjection(processingState, bodyInjection);
await forwardTheRest(processingState);
}
///Return object will have foundIndex: -1, if there is no match, and no partial match at the end of the array
///If there is an exact match, return object will have charactersFound:(tagBytes.Length)
///If there is a partial match at the end of the array, return object charactersFound will be < (tagBytes.Length)
///The result object needs to be passed in to reduce Garbage Collection - we can reuse the object
function searchByteArrayChunkForClosingTag(chunk, tagBytes, result)
{
//console.log('search');
let searchStart = 0;
//console.log(tagBytes.length);
//console.log(chunk.length);
for (;;) {
result.charactersFound = 0;
result.foundIndex = -1;
result.afterHeadTag = -1;
//console.log(result);
let sweepIndex = chunk.indexOf(tagBytes[0], searchStart);
if (sweepIndex === -1)
return; //Definitely not found
result.foundIndex = sweepIndex;
sweepIndex++;
searchStart = sweepIndex; //where we start searching from next
result.charactersFound++;
result.afterHeadTag = sweepIndex;
//console.log(result);
for (let i = 1; i < tagBytes.length; i++)
{
if (sweepIndex === chunk.length) return; //Partial match
if (chunk[sweepIndex++] !== tagBytes[i]) { result.charactersFound = 0; result.afterHeadTag = -1; break; } //Failed to match (even partially to boundary)
result.charactersFound++;
result.afterHeadTag = sweepIndex; //Because we work around the actual found tag in case it's across a boundary
}
if (result.charactersFound === tagBytes.length)
return; //Found
}
}
function continueSearchByteArrayChunkForClosingTag(chunk, tagBytes, lastSplitResult, result)
{
//console.log('continue');
//Finish the search (no need to check the last buffer at all)
//console.log('finish the search');
result.charactersFound = lastSplitResult.charactersFound; //We'll be building on the progress from the lastSplitResult
result.foundIndex = (-1 * result.charactersFound); //This won't be used, but a negative value is indicative of chunk spanning
let sweepIndex = 0;
result.afterHeadTag = 0;
for (let i = lastSplitResult.charactersFound; i < tagBytes.length; i++) //Zero-based
{
if (sweepIndex === chunk.length) return result; //So we support working on a chunk that's smaller than the tagBytes search size
if (chunk[sweepIndex++] !== tagBytes[i]) { result.charactersFound = 0; result.afterHeadTag = -1; break; }
result.charactersFound++;
result.afterHeadTag = sweepIndex;
}
}
function continueOrNewSearch(chunk, tagBytes, lastSplitResult, result)
{
//console.log('continueOrNewSearch');
if (lastSplitResult == null)
searchByteArrayChunkForClosingTag(chunk, tagBytes, result);
else
{
continueSearchByteArrayChunkForClosingTag(chunk, tagBytes, lastSplitResult, result);
if (result.charactersFound === tagBytes.length)
return result;
else
return searchByteArrayChunkForClosingTag(chunk, tagBytes, result); //Keep searching onward
}
}
async function parseForInjection(processingState, injectionJob)
{
if (processingState.inputDone) return; //Very edge case: Somehow </head> is never found?
if (!injectionJob) return;
if (!injectionJob.tagBytes) return;
if (!injectionJob.forInjection) return;
let reader = processingState.reader;
let writer = processingState.writer;
let result = processingState.result;
let tagBytes = injectionJob.tagBytes;
//(reader, writer, tagBytes, forInjection)
let lastSplitResult = null;
let chunk = null;
processingState.inputDone = false;
for (;;) {
if (processingState.leftOvers)
{
chunk = processingState.leftOvers;
processingState.leftOvers = null;
}
else
{
let readerResult = await reader.read();
chunk = readerResult.value;
processingState.inputDone = readerResult.done;
}
if (processingState.inputDone) {
if (lastSplitResult !== null) {
//Very edge case: Somehow tagBytes is never found?
console.log('edge');
throw 'tag not found'; //Causing the system to fall back to the direct request
}
await writer.close();
return true;
}
//console.log(value.length);
continueOrNewSearch(chunk, tagBytes, lastSplitResult, result)
//console.log(result);
if (result.charactersFound === tagBytes.length) //Complete match
{
//Inject
//console.log('inject');
if (result.foundIndex > 0)
{
let partValue = chunk.slice(0, result.foundIndex);
//console.log(partValue);
await writer.write(partValue);
}
console.log('injected');
if (injectionJob.insertAfterTag)
{
await writer.write(injectionJob.tagBytes);
await writer.write(injectionJob.forInjection);
}
else
{
await writer.write(injectionJob.forInjection);
await writer.write(injectionJob.tagBytes);
}
let remainder = chunk.slice(result.afterHeadTag);
processingState.leftOvers = remainder;
lastSplitResult = null;
return;
}
if (lastSplitResult !== null)
{
//console.log('no match over boundary');
//The remainder wasn't found, so write the partial match from before (maybe `<` or `</`)
let failedLastBit = injectionJob.tagBytes.slice(0, lastSplitResult.charactersFound);
await writer.write(failedLastBit);
lastSplitResult = null;
}
if (result.charactersFound === 0)
{
//console.log('not found')
await writer.write(chunk);
continue;
}
if (result.charactersFound < tagBytes.length)
{
//console.log('boundary: ' + result.charactersFound);
lastSplitResult = result;
let partValue = chunk.slice(0, result.foundIndex);
//console.log(partValue);
await writer.write(partValue);
continue;
}
}
}
async function forwardTheRest(processingState)
{
try
{
if (processingState.inputDone) return; //Very edge case: Somehow </head> is never found?
if (processingState.leftOvers)
{
let chunk = processingState.leftOvers;
await processingState.writer.write(chunk);
}
processingState.reader.releaseLock();
processingState.writer.releaseLock();
await processingState.readStream.pipeTo(processingState.writeStream);
//Should there be an explicit close method called? I couldn't find one
}
catch (e)
{
console.log(e);
}
}
Further explanation of working directly with (utf-8) bytes:
Only working with byte values. This is possible at least by searching for the first distinctive UTF-8 byte of a character (< 128 or ≥ 192). But in this case, we're searching for </head>, which is made up entirely of bytes below 128, so it is very easy to work with.
Given the nature of searching for utf-8 (which is the trickiest), this should work with ['utf-8', 'utf8', 'iso-8859-1', 'us-ascii']. You will need to change the snippet encoder to match.
This isn't thoroughly tested. The boundary case didn't trigger for me. Ideally, we would have a testing rig for the core functions.
Thanks to Kenton Varda for challenging me.
Please let me know if there's a CloudFlare workers way to do pipeTo in the forwardTheRest function
You might find continueOrNewSearch and the two sub-functions to be an interesting approach to finding multi-bytes across a chunk boundary. Up until the boundary we just count how many bytes are found. There's no need to keep those bytes (we know what they are). Then on the next chunk we continue where we left off. We always cut the array buffer around the header, and make sure we write the header bytes (using the tagBytes)

How can I get the raw download size of a request using Puppeteer?

That is, the total amount of data downloaded across all resources (including video/media), similar to that returned by Chrome DevTools' Network tab.
There doesn't seem to be any way to do this as of January 2018 that works with all resource types (listening for the response event fails for videos), and that correctly counts compressed resources.
The best workaround seems to be to listen for the Network.dataReceived event, and process the event manually:
const resources = {};

page._client.on('Network.dataReceived', (event) => {
    const request = page._networkManager._requestIdToRequest.get(
        event.requestId
    );
    if (!request || request.url().startsWith('data:')) {
        return;
    }
    const url = request.url();
    // encodedDataLength is supposed to be the amount of data received
    // over the wire, but it's often 0, so just use dataLength for consistency.
    // https://chromedevtools.github.io/devtools-protocol/tot/Network/#event-dataReceived
    // const length = event.encodedDataLength > 0 ?
    //     event.encodedDataLength : event.dataLength;
    const length = event.dataLength;
    if (url in resources) {
        resources[url] += length;
    } else {
        resources[url] = length;
    }
});
// page.goto(...), etc.
// totalCompressedBytes is unavailable; see comment above
const totalUncompressedBytes = Object.values(resources).reduce((a, n) => a + n, 0);
The solution of @mjs works perfectly even in 2021. You just need to replace:
page._networkManager -> page._frameManager._networkManager
Full example that works for me:
const resources = {};

page._client.on('Network.dataReceived', (event) => {
    const request = page._frameManager._networkManager._requestIdToRequest.get(
        event.requestId
    );
    if (!request || request.url().startsWith('data:')) {
        return;
    }
    const url = request.url();
    const length = event.dataLength;
    if (url in resources) {
        resources[url] += length;
    } else {
        resources[url] = length;
    }
});
await page.goto('https://stackoverflow.com/questions/48263345/how-can-i-get-the-raw-download-size-of-a-request-using-puppeteer');
const totalUncompressedBytes = Object.values(resources).reduce((a, n) => a + n, 0);
console.log(totalUncompressedBytes);
If you are using Puppeteer, you have server-side Node... Why not pipe the request through a stream (or streams) and then calculate the content size?
Also there is https://github.com/watson/request-stats
Also, you may want to call page.waitForNavigation, as you may be wrestling with async timing issues.
const imgaes_width = await page.$$eval('img', anchors => [].map.call(anchors, img => img.width));
const imgaes_height = await page.$$eval('img', anchors => [].map.call(anchors, img => img.height));
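As a rough sketch of the "pipe the request through a stream and count" idea suggested above (plain Node, outside Puppeteer; the URL is just a placeholder), you could total the body bytes as they arrive:
const https = require('https');

// count the response body bytes as transferred
// (Node does not decompress the body automatically)
function countResponseBytes(url) {
    return new Promise((resolve, reject) => {
        https.get(url, (res) => {
            let total = 0;
            res.on('data', (chunk) => { total += chunk.length; });
            res.on('end', () => resolve(total));
            res.on('error', reject);
        }).on('error', reject);
    });
}

// countResponseBytes('https://example.com/').then(bytes => console.log(bytes));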

Read a large file N lines at a time in Node.JS

I have a file with 65,000,000 lines that is about 2GB in size.
I want to read this file N lines at a time, perform a DB insert operation, and then read the next N, with N being, say, 1000 in this case. Insert order doesn't matter, so synchronous is fine.
What's the best way of doing this? I've only found ways to either load 1 line at a time, or methods that read the whole file into memory. Below is the sample code I've been using to read the file one line at a time:
var singleFileParser = (file, insertIntoDB) => {
    var lr = new LineByLineReader(file);
    lr.on('error', function(err) {
        // 'err' contains error object
        console.error(err);
        console.error("Error reading file!");
    });
    lr.on('line', function(line) {
        insertIntoDB(line);
        // 'line' contains the current line without the trailing newline character.
    });
    lr.on('end', function() {
        // All lines are read, file is closed now.
    });
};
Lines can only be parsed one at a time. So, if you want 10 at once, you just collect them one at a time until you have collected 10, and then process those 10.
I did not think Jarek's code quite worked right so here's a different version that collects 10 lines into an array and then calls dbInsert():
var tenLines = [];
lr.on('line', function(line) {
    tenLines.push(line);
    if (tenLines.length === 10) {
        lr.pause();
        dbInsert(<yourSQL>, function(error, returnVal) {
            if (error) {
                // some sort of error handling here
            }
            tenLines = [];
            lr.resume();
        });
    }
});
// process last set of lines in the tenLines buffer (if any)
lr.on('end', function() {
    if (tenLines.length !== 0) {
        // process last set of lines
        dbInsert(...);
    }
});
Jarek's version seems to call dbInsert() on every line event rather than only on every 10th line event, and it does not process any leftover lines at the end of the file if the file isn't a perfect multiple of 10 lines long.
Something like this should do
var cnt = 0;
var tenLines = [];
lr.on('line', function(line) {
    tenLines.push(line);
    if (++cnt >= 10) {
        lr.pause();
        // prepare your SQL statements from tenLines
        dbInsert(<yourSQL>, function(error, returnVal) {
            cnt = 0;
            tenLines = [];
            lr.resume();
        });
    }
});
This is my solution inside an async function:
let multipleLines = [];
const filepath = '<file>';
const numberLines = 50;
const lineReader = require('readline').createInterface({
    input: require('fs').createReadStream(filepath)
});

// process lines by numberLines
for await (const line of lineReader) {
    multipleLines.push(line);
    if (multipleLines.length === numberLines) {
        await dbInsert();
        multipleLines = [];
    }
}
// process last set of lines (if any)
if (multipleLines.length !== 0) {
    await dbInsert();
}
This is my solution using built-in modules rather than NPM's line-by-line package:
var lineReader = require('readline').createInterface({
    input: require('fs').createReadStream(fileToRead)
});

let lines = [];
lineReader.on('line', function (line) {
    lines.push(line.split(","));
    if (lines.length == 10) {
        makeCall();
        lines = [];
    }
});
lineReader.on('close', function () {
    if (lines.length !== 0) {
        makeCall();
        lines = [];
    }
});

function makeCall() {
    // Do something with lines
}
