I have a large CSV file of postcode data (~1.1 GB). I am trying to filter out the data I need and then write an array of values to a JS file.
The issue is that I'm always using too much memory and receiving this error:
Ineffective mark-compacts near heap limit Allocation failed - JavaScript heap out of memory
I have tried increasing the memory using this command: node --max-old-space-size=4096 fileName.js, but I still hit my memory limit; it just takes longer!
Here is my code that writes to the JS file:
const csvFilePath = "./data/postcodes.csv";
const csv = require("csvtojson");
const fs = require("fs");
csv()
.fromFile(csvFilePath)
.then((jsonArray) => {
const inUsePostcodes = jsonArray.filter((x) => x["In Use?"] === "Yes").map((y) => y.Postcode);
fs.writeFileSync("postcodes.js", inUsePostcodes);
});
Here is a sample of postcodes.csv:
Postcode,In Use?,Latitude,Longitude,Easting,Northing,Grid Ref,County,District,Ward,District Code,Ward Code,Country,County Code,Constituency,Introduced,Terminated,Parish,National Park,Population,Households,Built up area,Built up sub-division,Lower layer super output area,Rural/urban,Region,Altitude,London zone,LSOA Code,Local authority,MSOA Code,Middle layer super output area,Parish Code,Census output area,Constituency Code,Index of Multiple Deprivation,Quality,User Type,Last updated,Nearest station,Distance to station,Postcode area,Postcode district,Police force,Water company,Plus Code,Average Income
AB1 0AA,No,57.101474,-2.242851,385386,801193,NJ853011,"","Aberdeen City","Lower Deeside",S12000033,S13002843,Scotland,S99999999,"Aberdeen South",1980-01-01,1996-06-01,"","",,,"","","Cults, Bieldside and Milltimber West - 02","Accessible small town",,46,,S01006514,,S02001237,"Cults, Bieldside and Milltimber West",,S00090303,S14000002,6808,1,0,2020-02-19,"Portlethen",8.31408,AB,AB1,"Scotland","Scottish Water",9C9V4Q24+HV,
AB1 0AB,No,57.102554,-2.246308,385177,801314,NJ851013,"","Aberdeen City","Lower Deeside",S12000033,S13002843,Scotland,S99999999,"Aberdeen South",1980-01-01,1996-06-01,"","",,,"","","Cults, Bieldside and Milltimber West - 02","Accessible small town",,61,,S01006514,,S02001237,"Cults, Bieldside and Milltimber West",,S00090303,S14000002,6808,1,0,2020-02-19,"Portlethen",8.55457,AB,AB1,"Scotland","Scottish Water",9C9V4Q33+2F,
AB1 0AD,No,57.100556,-2.248342,385053,801092,NJ850010,"","Aberdeen City","Lower Deeside",S12000033,S13002843,Scotland,S99999999,"Aberdeen South",1980-01-01,1996-06-01,"","",,,"","","Cults, Bieldside and Milltimber West - 02","Accessible small town",,45,,S01006514,,S02001237,"Cults, Bieldside and Milltimber West",,S00090399,S14000002,6808,1,0,2020-02-19,"Portlethen",8.54352,AB,AB1,"Scotland","Scottish Water",9C9V4Q22+6M,
How can I write to the JS file from this CSV, without hitting my memory limit?
You need a streaming CSV parser that parses the file and hands you output one line at a time, so you can stream the result to a file.
Here's one way to do it using the csv-reader module:
const fs = require('fs');
const csvReader = require('csv-reader');
const { Transform } = require('stream');
const myTransform = new Transform({
    readableObjectMode: true,
    writableObjectMode: true,
    transform(obj, encoding, callback) {
        let data = JSON.stringify(obj);
        if (this.tFirst) {
            // beginning of transformed data
            this.push("[");
            this.tFirst = false;
        } else {
            data = "," + data; // add comma separator if not first object
        }
        this.push(data);
        callback();
    },
    flush(callback) {
        // end of transformed data
        this.push("]");
        callback();
    }
});
myTransform.tFirst = true;
// All of these arguments are optional.
const options = {
skipEmptyLines: true,
asObject: true, // convert data to object
parseNumbers: true,
parseBooleans: true,
trim: true
};
const csvStream = new csvReader(options);
const readStream = fs.createReadStream('example.csv', 'utf8');
const writeStream = fs.createWriteStream('example.json', {autoClose: false});
readStream.on('error', err => {
console.log(err);
csvStream.destroy(err);
}).pipe(csvStream).pipe(myTransform).pipe(writeStream).on('error', err => {
console.error(err);
}).on('finish', () => {
console.log('done');
});
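Note that this writes every row to example.json. To reproduce the filtering from the question, you could drop non-matching rows inside the transform; here's a minimal sketch of that transform method, reusing the column names from the question's CSV:

transform(obj, encoding, callback) {
    // Keep only rows that are in use, and emit just the postcode string.
    if (obj["In Use?"] !== "Yes") {
        return callback(); // drop this row; nothing is pushed downstream
    }
    let data = JSON.stringify(obj.Postcode);
    if (this.tFirst) {
        this.push("[");
        this.tFirst = false;
    } else {
        data = "," + data;
    }
    this.push(data);
    callback();
}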
The issue is that the csvtojson module tries to hold the entire parsed jsonArray in memory!
I found a different solution using the csv-parser module, which parses one row at a time instead of the whole CSV!
Here is my solution:
const csv = require('csv-parser');
const fs = require('fs');
var stream = fs.createWriteStream("postcodes.js", {flags:'a'});
let first = false;
fs.createReadStream('./data/postcodes.csv')
.pipe(csv())
.on('data', (row) => {
if (row["In Use?"]) {
if (!first) {
first = true;
stream.write(`const postcodes = ["${row.Postcode},\n"`);
} else {
stream.write(`"${row.Postcode},\n"`);
}
}
})
.on('end', () => {
stream.write("]");
console.log('CSV file successfully processed');
});
It's not very pretty building JavaScript by writing strings like const postcodes = [, but it performs the desired function.
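If you'd rather not hand-assemble JavaScript source, the same row-at-a-time approach can emit plain JSON instead, which you can load later with JSON.parse. A sketch using the same csv-parser setup (the postcodes.json filename is just illustrative):

const jsonStream = fs.createWriteStream("postcodes.json");
let firstRow = true;
fs.createReadStream("./data/postcodes.csv")
    .pipe(csv())
    .on("data", (row) => {
        if (row["In Use?"] === "Yes") {
            // JSON.stringify handles the quoting of each postcode for us
            jsonStream.write(`${firstRow ? "[" : ","}\n${JSON.stringify(row.Postcode)}`);
            firstRow = false;
        }
    })
    .on("end", () => {
        jsonStream.write(firstRow ? "[]" : "\n]"); // handle the no-matches case too
        console.log("postcodes.json written");
    });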
Please help me search for a string across multiple files. I need to print the line number of that string along with the filename, using the buffer and streams concepts in Node.js.
For example:
There are 5 text files, and the string "hello" appears on lines 10 and 15 of the 3rd file, and again on line 50 of the 5th file. I need to print the 3rd file's name together with the line numbers where "hello" was found,
and the same for the 5th file.
Help me write this program using the buffer concept in Node.js.
const readline = require("readline");
const fs = require("fs");
// Start methods implementation
const beginSearch = (readStream, filePath, queries) => {
let lineCount = 0;
let matches = new Map();
queries.forEach(query => matches.set(query, []));
return new Promise((resolve, reject) => {
readStream.on("line", line => {
lineCount++;
for (const query of matches.keys()) {
if (searchForTerm(line, query))
matches.set(query, [...matches.get(query), lineCount]);
}
});
readStream.on("close", () => resolve({
filePath,
matches
}));
});
};
const searchForTerm = (line, query) => line.match(query);
const createLineInterfaces = filePaths =>
filePaths.map(filePath => {
const readStream = readline.createInterface({
input: fs.createReadStream(filePath),
crlfDelay: Infinity
});
return {
filePath,
readStream
};
});
// End methods implementation
// Start main function
const filesToSearch = ["sample.txt", "sample2.txt"];
const queriesToSearch = ["hello"];
let searchProms = createLineInterfaces(filesToSearch).map(
({
readStream,
filePath
}) =>
beginSearch(readStream, filePath, queriesToSearch)
);
Promise.all(searchProms).then(searchResults =>
searchResults.forEach(result => console.log(result))
);
// End main function
A short explanation
I am using the readline module to split each file into lines; keep in mind the whole implementation works with streams. I attach a listener to the line event and search each line for the given query. The search method is a simple regexp match; you could use a fuzzy search method if you want. The matched line numbers are saved in a Map whose keys are the queries and whose values are arrays of the line numbers where each query was found.
I am assuming that you are familiar with the stream concept and know your way around ES6 features.
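If you want the output in exactly the form the question describes (filename plus line numbers), you can unpack each result's Map instead of logging the raw object; a small sketch building on the result shape above:

Promise.all(searchProms).then(searchResults =>
    searchResults.forEach(({ filePath, matches }) => {
        for (const [query, lineNumbers] of matches) {
            if (lineNumbers.length > 0)
                console.log(`${filePath}: "${query}" found on line(s) ${lineNumbers.join(", ")}`);
        }
    })
);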
I need a client to upload a text file. Then I want to parse the text file so that only lines containing the word "object" are left in it. I have successfully coded the uploading part, but I need help parsing out the lines that don't contain "object". My Node.js code is below.
You can use the readline API that's part of Node core to iterate through the file line by line, and String.prototype.includes() to determine whether a line contains the phrase you're looking for.
var readline = require('readline');
var fs = require('fs');
var { Readable } = require('stream');

function filterFile(phrase, input) {
    return new Promise((resolve, reject) => {
        var lines = [];
        let rl = readline.createInterface({
            input: input
        });
        rl.on('line', (line) => {
            if (line.includes(phrase))
                lines.push(line);
        });
        rl.on('close', () => {
            // wrap the kept lines in a new readable stream
            resolve(Readable.from([lines.join('\n')]));
        });
        // errors are emitted by the input stream, not the readline interface
        input.on('error', (err) => {
            reject(err);
        });
    });
}
Edit for Filtered Output Write Stream Example
We can take the resulting stream returned by filterFile() and pipe its contents into a new file like so
var saveDest = './filteredLines.txt';

// inputStream is the uploaded file's read stream,
// e.g. fs.createReadStream(uploadedFilePath)
filterFile('object', inputStream)
    .then((filteredStream) => {
        let ws = fs.createWriteStream(saveDest);
        filteredStream.once('error', (err) => {
            console.error(err);
        });
        ws.once('finish', () => {
            console.log(`Filtered file has been created at ${saveDest}`);
        });
        filteredStream.pipe(ws);
    });
Step 1:
Split the text into lines:
var x = 'i am object\ni m object';
var arr = x.split('\n');
Step 2:
Test each line against an object regexp:
var reg = /object/; // no "g" flag: test() with a global regexp is stateful across calls
if (reg.test(eachLine)) {
    // keep the line
} else {
    // do nothing
}
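Putting both steps together, a minimal sketch (this assumes the uploaded file's contents live at an illustrative path, upload.txt):

var fs = require('fs');

var text = fs.readFileSync('upload.txt', 'utf8'); // illustrative path
var kept = text.split('\n').filter(function (line) {
    return /object/.test(line); // keep only lines mentioning "object"
});
fs.writeFileSync('upload.txt', kept.join('\n')); // overwrite with filtered lines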
Imagine you have many long text files, and you need to extract data from only the first line of each one (without reading any further content). What is the best way to do this in Node.js?
Thanks!
I ended up adopting this solution, which seems the most performant I've seen so far:
var fs = require('fs');
var Q = require('q');
function readFirstLine (path) {
    return Q.promise(function (resolve, reject) {
        var rs = fs.createReadStream(path, {encoding: 'utf8'});
        var acc = ''; // accumulated text so far
        var pos = 0;  // total length of the chunks before the one containing the newline
        var index;    // position of the newline within the current chunk
        rs
            .on('data', function (chunk) {
                index = chunk.indexOf('\n');
                acc += chunk;
                // stop reading as soon as a newline turns up
                index !== -1 ? rs.close() : pos += chunk.length;
            })
            .on('close', function () {
                // if the file contains no newline, the whole content is the first line
                resolve(index >= 0 ? acc.slice(0, pos + index) : acc);
            })
            .on('error', function (err) {
                reject(err);
            });
    });
}
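Usage then looks like this (the path is illustrative):

readFirstLine('./data.txt').then(function (line) {
    console.log(line);
});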
I created an npm module for convenience, named "firstline".
Thanks to @dandavis for the suggestion to use String.prototype.slice()!
There's a built-in module almost made for this case - readline. It avoids messing with chunks and so forth. The code would look like the following:
const fs = require('fs');
const readline = require('readline');
async function getFirstLine(pathToFile) {
const readable = fs.createReadStream(pathToFile);
const reader = readline.createInterface({ input: readable });
const line = await new Promise((resolve) => {
    reader.on('line', (line) => {
        reader.close();
        resolve(line);
    });
    // an empty file never emits 'line'; resolve with '' on close instead
    reader.on('close', () => resolve(''));
});
readable.close();
return line;
}
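Usage (the path is illustrative):

getFirstLine('./data.txt').then((line) => console.log(line));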
I know this doesn't exactly answer the question but for those who are looking for a READABLE and simple way to do so:
const fs = require('fs').promises;
async function getFirstLine(filePath) {
const fileContent = await fs.readFile(filePath, 'utf-8');
return (fileContent.match(/(^.*)/) || [])[1] || '';
}
NOTE:
naturally, this will only work with text files, which I assumed you were using from your description
this will work with empty files and will return an empty string
the regexp is cheap since it is simple (no OR conditions or complex matches) and only scans up to the first line break - though note that fs.readFile still loads the whole file into memory first
Please try this:
https://github.com/yinrong/node-line-stream-util#get-head-lines
It unpipes the upstream once it has read the head lines.
Node.js >= 16
In all current versions of Node.js, readline.createInterface can be used as an async iterable, to read a file line by line - or just for the first line. This is also safe to use with empty files.
Unfortunately, the error handling logic is broken in versions of Node.js before 16, where certain file system errors may go uncaught even if the code is wrapped in a try-catch block because of the way asynchronous errors are propagated in streams. So I recommend using this method only in Node.js >= 16.
import { createReadStream } from "fs";
import { createInterface } from "readline";
async function readFirstLine(path) {
const inputStream = createReadStream(path);
try {
for await (const line of createInterface(inputStream)) return line;
return ''; // If the file is empty.
}
finally {
inputStream.destroy(); // Destroy file stream.
}
}
const firstLine = await readFirstLine("path/to/file");
Here you go:
var lineReader = require('line-reader');
var async = require('async');

exports.readManyFiles = function(files, done) {
    async.map(files,
        function(file, callback) {
            lineReader.open(file, function(err, reader) {
                if (err) return callback(err);
                if (reader.hasNextLine()) {
                    // read only the first line, then release the file handle
                    reader.nextLine(function(err, line) {
                        reader.close(function() {
                            callback(err, line);
                        });
                    });
                } else {
                    reader.close(function() {
                        callback(null, ''); // empty file
                    });
                }
            });
        },
        function(err, allLines) {
            // do whatever you want with the first lines
            done(err, allLines);
        });
};
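Called like this (the filenames are illustrative):

exports.readManyFiles(['a.txt', 'b.txt'], function (err, firstLines) {
    if (err) throw err;
    console.log(firstLines); // one first line per file, in order
});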