Read a large file N lines at a time in Node.js - javascript

I have a file with 65,000,000 lines that is about 2 GB in size.
I want to read this file N lines at a time, perform a db insert operation, and then read the next N, with N being, say, 1000 in this case. Insert order doesn't matter, so synchronous is fine.
What's the best way of doing this? I've only found ways to either load 1 line at a time, or methods that read the whole file into memory. Below is the sample code I've been using to read the file one line at a time:
var singleFileParser = (file, insertIntoDB) => {
    var lr = new LineByLineReader(file);

    lr.on('error', function(err) {
        // 'err' contains the error object
        console.error(err);
        console.error("Error reading file!");
    });

    lr.on('line', function(line) {
        // 'line' contains the current line without the trailing newline character
        insertIntoDB(line);
    });

    lr.on('end', function() {
        // all lines are read, file is closed now
    });
};

Lines can only be parsed one at a time by the line reader. So, if you want 10 at once, you just collect them one at a time until you have collected 10 and then process those 10.
I did not think Jarek's code quite worked right, so here's a different version that collects 10 lines into an array and then calls dbInsert():
var tenLines = [];

lr.on('line', function(line) {
    tenLines.push(line);
    if (tenLines.length === 10) {
        lr.pause();
        dbInsert(<yourSQL>, function(error, returnVal) {
            if (error) {
                // some sort of error handling here
            }
            tenLines = [];
            lr.resume();
        });
    }
});
// process the last set of lines in the tenLines buffer (if any)
lr.on('end', function() {
    if (tenLines.length !== 0) {
        // process the last set of lines
        dbInsert(...);
    }
});
Jarek's version seems to call dbInsert() on every line event rather than only on every 10th line event, and it does not process any leftover lines at the end of the file if the total line count isn't a perfect multiple of 10.

Something like this should do it:
var cnt = 0;
var tenLines = [];

lr.on('line', function(line) {
    tenLines.push(line);
    if (++cnt >= 10) {
        lr.pause();
        // prepare your SQL statements from tenLines
        dbInsert(<yourSQL>, function(error, returnVal) {
            cnt = 0;
            tenLines = [];
            lr.resume();
        });
    }
});

This is my solution inside an async function:
let multipleLines = [];
const filepath = '<file>';
const numberLines = 50;

const lineReader = require('readline').createInterface({
    input: require('fs').createReadStream(filepath)
});

// process lines in batches of numberLines
for await (const line of lineReader) {
    multipleLines.push(line);
    if (multipleLines.length === numberLines) {
        await dbInsert(multipleLines);
        multipleLines = [];
    }
}

// process the last set of lines (if any)
if (multipleLines.length !== 0) {
    await dbInsert(multipleLines);
}
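dbInsert() is left undefined above; as a minimal sketch of what a bulk version could look like, assuming the mysql2/promise driver and a hypothetical `lines(text)` table (all names here are illustrative, not from the answer):

const mysql = require('mysql2/promise');

// hypothetical connection pool; adjust credentials for your environment
const pool = mysql.createPool({ host: 'localhost', user: 'app', database: 'db' });

async function dbInsert(batch) {
    // one bulk INSERT per batch; the nested array expands to
    // VALUES ('line1'), ('line2'), ... in the mysql/mysql2 drivers
    await pool.query('INSERT INTO `lines` (`text`) VALUES ?', [batch.map(l => [l])]);
}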

This is my solution using built-in modules rather than NPM's line-by-line package:
var lineReader = require('readline').createInterface({
    input: require('fs').createReadStream(fileToRead)
});

let lines = [];

lineReader.on('line', function (line) {
    lines.push(line.split(","));
    if (lines.length === 10) {
        makeCall();
        lines = [];
    }
});

lineReader.on('close', function () {
    if (lines.length !== 0) {
        makeCall();
        lines = [];
    }
});

function makeCall() {
    // do something with lines
}
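One caveat with the version above: if makeCall() does asynchronous work such as a db insert, 'line' events keep firing while it runs. A minimal sketch of a paused variant, assuming makeCall is changed to accept the batch and return a promise (readline's pause() stops the underlying stream, though a few already-buffered lines may still be emitted):

lineReader.on('line', async function (line) {
    lines.push(line.split(","));
    if (lines.length === 10) {
        lineReader.pause();       // stop reading while the batch is processed
        await makeCall(lines);    // assumed to return a promise
        lines = [];
        lineReader.resume();
    }
});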

Related

How to break out of a loop in a callback

Hi, how can I break out of the for loop? I want to be able to break out of it in the callback, in the if statement.
I want this program to create a folder in the given directory, and every time it throws an error I want it to change the folder name by adding a number to it, so when it says that the folder already exists, it'll keep trying unique folder names until it doesn't throw an error.
I will check for the error code later; help me solve this first.
const fs = require('fs');
const path = require('path');

function folder(folderName) {
    for (let i = 1; i <= 10; i++) {
        let pathNumber = i;
        let fullPath = folderName + pathNumber;
        fs.mkdir(path.join("D:", fullPath), (err) => {
            if (!err) {
                return; // I want to break out of the loop here
            }
        });
    }
}
folder("folder");
You can't write the code that way because the for loop will already be done before any of the fs.mkdir() callbacks are called. They are asynchronous and happen LATER.
If you want to execute one iteration of the loop, including the fs.mkdir() before moving on to any other, then you can use async/await with fs.promises.mkdir().
Here's what a solution could look like with fs.promises.mkdir(). I've also added error handling for the case where all 10 sub-dir names you're trying already exist.
const fs = require('fs');
const path = require('path');

async function folder(folderName) {
    let lastError;
    for (let pathNumber = 1; pathNumber <= 10; pathNumber++) {
        let fullPath = path.join("D:", folderName + pathNumber);
        try {
            await fs.promises.mkdir(fullPath);
            return fullPath;
        } catch (e) {
            lastError = e;
            // ignore the error so we keep trying other numbers
        }
    }
    throw lastError;
}

folder("folder").then(fullPath => {
    console.log(`dir created: ${fullPath}`);
}).catch(err => {
    console.log(err);
});
Much simpler without await:
const fs = require('fs');
const path = require('path');

const numFolders = 10;
const folders = Array.from(Array(numFolders), (_, i) => `folder${i + 1}`);
const len = folders.length;
let cnt = 0;

const makeFolder = () => {
    if (cnt >= len) return; // stop because done
    fs.mkdir(path.join("D:", folders[cnt]), (err) => {
        cnt++;
        if (err) {
            makeFolder(); // only try the next name if this one failed
        }
    });
};
makeFolder();

Nodejs: merge join multiple files

consider this scenario:
I have 2 csv files, each one is sorted and contains the id field.
I need to join the rows using the id field. Because the files are already sorted by the id I wanted to perform merge join (https://en.wikipedia.org/wiki/Sort-merge_join).
For that I need to have a way to load some portion of both files, process it and iteratively load more again from one or both files.
(The files are big and would not fit into memory so only streaming approach will work).
The problem is the Node API: what to use? readline will not work because of https://github.com/nodejs/node/issues/33463. Any other ideas?
I had to do something quite similar recently and decided to use the node-line-reader module, which has a simpler interface than the built-in readline. I then created a little recursive function that determines which file to read from next by comparing the id of the current csv entry of each provided file. The corresponding line then gets written out to the target file, and the method is called again until all lines of all files are processed. Here's the whole class I ended up with:
const fs = require('fs');
const LineReader = require('node-line-reader').LineReader;

class OrderedCsvFileMerger {
    constructor(files, targetFile) {
        this.lineBuffer = [];
        this.initReaders(files);
        this.initWriter(targetFile);
    }

    initReaders(files) {
        this.readers = files.map(file => new LineReader(file));
    }

    initWriter(targetFile) {
        this.writer = fs.createWriteStream(targetFile);
    }

    async mergeFiles() {
        // initially read the first line from all files
        for (const reader of this.readers) {
            this.lineBuffer.push(await this.nextLine(reader));
        }
        return this.merge();
    }

    async nextLine(reader) {
        return new Promise((resolve, reject) => {
            reader.nextLine(function (err, line) {
                if (err) reject(err);
                resolve(line);
            });
        });
    }

    async merge() {
        if (this.allLinesProcessed()) {
            return;
        }
        let currentBufferIndex = -1;
        let minRowId = Number.MAX_VALUE;
        for (let i = 0; i < this.lineBuffer.length; i++) {
            // implement parsing logic if your lines do not start with an integer id
            const currentRowId = parseInt(this.lineBuffer[i]);
            if (currentRowId < minRowId) {
                minRowId = currentRowId;
                currentBufferIndex = i;
            }
        }
        const line = this.lineBuffer[currentBufferIndex];
        this.writer.write(line + "\n");
        this.lineBuffer[currentBufferIndex] = await this.nextLine(this.readers[currentBufferIndex]);
        return this.merge();
    }

    allLinesProcessed() {
        return this.lineBuffer.every(l => !l);
    }
}
(async () => {
    const input = ['./path/to/csv1.csv', './path/to/csv2.csv'];
    const target = './path/to/target.csv';
    const merger = new OrderedCsvFileMerger(input, target);
    await merger.mergeFiles();
    console.log("Files were merged successfully!");
})().catch(err => {
    console.log(err);
});

Conditional async callbacks

I'm writing an Electron program which takes a CSV file as input, and does file operations depending on the CSV content and file existence (it's to manage MAME arcade roms).
In order to have a progress bar on the UI side, I have switched the code from fully synchronous (because it was much easier) to asynchronous.
I just cannot find out how to reliably display a message to the user when all the lines in the CSV file are processed, and all the zip files are copied or removed.
Here is a (simplified) sample method:
fs.readFile(file, { 'encoding': 'utf8' }, (err, fileContents) => {
    let fileCsv = csvparse(fileContents);
    let lines = fileCsv.length;

    fileCsv.forEach((line) => {
        lines--;
        let zip = line.name + '.zip';
        let sourceRom = path.join(romset, zip);
        let destRom = path.join(selection, zip);

        this.emit('progress.add', fileCsv.length, fileCsv.length - lines, zip);

        if (fs.existsSync(sourceRom) && !fs.existsSync(destRom)) {
            fs.copy(sourceRom, destRom, (err) => {
                let sourceChd = path.join(romset, game);
                if (fs.existsSync(sourceChd)) {
                    fs.copy(sourceChd, path.join(selection, game), (err) => {
                        if (lines <= 0) { callback(); } // chd is copied
                    });
                } else {
                    if (lines <= 0) { callback(); } // no chd, rom is copied
                }
            });
        } else {
            if (lines <= 0) { callback(); } // no source found or already exists
        }
    });
});
The problem is that the CSV file is processed really fast, but the files are not copied as fast. So the code decrements the lines counter to 0 right away, and then after each file copy it finds that the counter is zero and triggers the callback.
How do I reliably trigger the callback at the end of all these nested callbacks and conditions?
Thanks
I tried to change the code without massively overwriting your style - assuming there is a reason to avoid things like bluebird, async/await & native Promises, and the async lib.
You need to decrement lines after a line is processed. I pulled the processing logic out into a function to make this clearer:
function processLine({ sourceRom, destRom, romset, game, callback }) {
    if (fs.existsSync(sourceRom) && !fs.existsSync(destRom)) {
        fs.copy(sourceRom, destRom, (err) => {
            // *really* should handle this error
            let sourceChd = path.join(romset, game);
            if (fs.existsSync(sourceChd)) {
                fs.copy(sourceChd, path.join(selection, game), (err) => {
                    // *really* should handle this error
                    callback();
                });
            } else {
                callback();
            }
        });
    } else {
        callback(); // no source found or already exists
    }
}

fs.readFile(file, { 'encoding': 'utf8' }, (err, fileContents) => {
    let fileCsv = csvparse(fileContents);
    let lines = fileCsv.length;

    fileCsv.forEach((line) => {
        let zip = line.name + '.zip';
        let sourceRom = path.join(romset, zip);
        let destRom = path.join(selection, zip);

        this.emit('progress.add', fileCsv.length, fileCsv.length - lines, zip);

        processLine({ sourceRom, destRom, game, romset, callback: () => {
            lines--;
            if (lines <= 0) {
                callback();
            }
        }});
    });
});
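For contrast, if async/await were acceptable, the same flow collapses into a single loop with an unmistakable completion point. A minimal sketch, assuming fs-extra (whose copy() and pathExists() return promises when no callback is given); romset, selection, and emitProgress are stand-ins for the variables and this.emit call in the original:

const fse = require('fs-extra');
const path = require('path');

async function processAll(fileCsv, romset, selection, emitProgress) {
    for (const [i, line] of fileCsv.entries()) {
        const zip = line.name + '.zip';
        const sourceRom = path.join(romset, zip);
        const destRom = path.join(selection, zip);
        emitProgress(fileCsv.length, i + 1, zip);
        if (await fse.pathExists(sourceRom) && !(await fse.pathExists(destRom))) {
            await fse.copy(sourceRom, destRom);
        }
    }
    // every copy has finished once the loop exits; report completion here
}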

Hashchain generating using async

I'm trying to generate a hashchain using the following code:
var async = require('async');
var _ = require('lodash');

var offset = 1e7;
var games = 1e7;
var game = games;
var serverSeed = '238asd1231hdsad123nds7a182312nbds1';

function loop(cb) {
    var parallel = Math.min(game, 1000);
    var inserts = _.range(parallel).map(function() {
        return function(cb) {
            serverSeed = genGameHash(serverSeed);
            game--;
            query('INSERT INTO `hash` SET `hash` = ' + pool.escape(serverSeed));
        };
    });

    async.parallel(inserts, function(err) {
        if (err) throw err;
        // clear the current line and move to the beginning
        var pct = 100 * (games - game) / games;
        console.log('PROGRESS: ' + pct.toFixed(2) + '%');
        if (game > 0) {
            loop(cb);
        } else {
            console.log('Done');
            cb();
        }
    });
}

loop(function() {
    console.log('Finished with SEED: ', serverSeed);
});
When I run this code it generates a hash chain of 1k hashes, while I'm trying to generate a chain of 1M hashes. It seems like async isn't working properly, but I have no idea why; there are no errors in the console, nothing that points out a flaw.
Any ideas?
Can you run it with a smaller games value (about 3000)?
Your parallel call never signals done, because the callbacks of the inserts tasks are never triggered. I think the query function takes two parameters: query(sql: string, callback?: (err, result) => void) (TypeScript style).
I suggest you change your logic and flow like the block of code below:
var inserts = _.range(parallel).map(function() {
    return function(cb) {
        serverSeed = genGameHash(serverSeed);
        query('INSERT INTO `hash` SET `hash` = ' + pool.escape(serverSeed), function(err, result) {
            if (result && !err) {
                game--;
            }
            cb(); // remember to call the callback
        });
    };
});
In your code you have used async.parallel; I think that is not a good idea here, because too many connections would be opened (1M). Recommended for this case is parallelLimit, as sketched below.
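A minimal sketch of that change, using the same inserts array from the question; the limit of 10 is an arbitrary illustration:

// run at most 10 insert tasks concurrently instead of all of them at once
async.parallelLimit(inserts, 10, function(err) {
    if (err) throw err;
    // same continuation logic as in loop()
});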

Node.js Readline, get the current line number

I have the following implementation where everything works but this line:
lineNumber: line.lineNumber
This line returns undefined. I am adding a full fragment of code below. My question is: does Readline provide a standard way to get the line number somehow? Or do I have to implement my own counter to keep track of the line number, which would be simple, but I don't want to if there's a standard way of doing it.
/**
 * Search for occurrences of the specified pattern in the received list of files.
 * @param filesToSearch - the list of files to search for the pattern
 * @returns {Promise} - resolves with the information about the encountered matches for the pattern specified.
 */
const findPattern = (filesToSearch) => {
    console.log(filesToSearch);
    return new Promise((resolve, reject) => {
        var results = [];
        // iterate over the files
        for (let theFile of filesToSearch) {
            let dataStream = fs.createReadStream(theFile);
            let lineReader = readLine.createInterface({
                input: dataStream
            });
            let count = 0; // this would do the trick but I'd rather use a standard approach if there's one
            // iterates over each line of the current file
            lineReader.on('line', (line) => {
                count++;
                if (line.indexOf(searchPattern) > 0) {
                    let currLine = line.toString();
                    currLine = currLine.replace(/{/g, ''); // clean up { from string if present
                    results.push({
                        fileName: theFile,
                        value: currLine,
                        lineNumber: line.lineNumber // HERE: this results in undefined
                        //lineNumber: count // this does the trick but I'd rather use a standard approach
                    });
                }
            });
            // resolve the promise once the file scan is finished
            lineReader.on('close', () => resolve(results));
        }
    });
};
Unfortunately there isn't a way to find the line number using the readline node module; however, using ES6 it isn't difficult to roll your own counter in one line of code.
const line_counter = ((i = 0) => () => ++i)();
When we create the callback function, we simply default the second parameter to a call to line_counter(), and we can act as though the line and the line number are both being passed whenever a line event occurs.
rl.on("line", (line, lineno = line_counter()) => {
console.log(lineno); //1...2...3...10...100...etc
});
Simply using a variable increment together with foo(data, ++i) will always pass the number of the new chunk to the function. (Note that 'data' events on a plain read stream emit buffered chunks, not lines, so pipe through a line-splitting transform first if you need true line numbers.)
let i = 0
const stream = fs.createReadStream(yourFileName)
stream.on("data", (data) => foo(data, ++i))

const foo = (data, line) => {
    console.log("Data: ", data)
    console.log("Line number:", line)
}
You need to include the lineno param if you are using the node linereader module:
lineReader.on('line', function (lineno, line) {
    if (line.indexOf(searchPattern) > 0) {
        let currLine = line.toString();
        currLine = currLine.replace(/{/g, ''); // clean up { from string if present
        results.push({
            fileName: theFile,
            value: currLine,
            lineNumber: lineno
        });
    }
});
