Writing large CSV to JS file using Node FS - javascript

I have a large CSV file of postcode data (~1.1GB), I am trying to filter out the data I need and then write an array of values to a JS file.
The issue is, that i'm always using too much memory and receiving this error:
Ineffective mark-compacts near heap limit Allocation failed - JavaScript heap out of memory
I have tried increasing the memory using this command: node --max-old-space-size=4096 fileName.js but I still hit my memory limit, it just takes longer!
Here is my code to write to the JS
const csvFilePath = "./data/postcodes.csv";
const csv = require("csvtojson");
const fs = require("fs");
csv()
.fromFile(csvFilePath)
.then((jsonArray) => {
const inUsePostcodes = jsonArray.filter((x) => x["In Use?"] === "Yes").map((y) => y.Postcode);
fs.writeFileSync("postcodes.js", inUsePostcodes);
});
Here is a sample of postcodes.csv:
Postcode,In Use?,Latitude,Longitude,Easting,Northing,Grid Ref,County,District,Ward,District Code,Ward Code,Country,County Code,Constituency,Introduced,Terminated,Parish,National Park,Population,Households,Built up area,Built up sub-division,Lower layer super output area,Rural/urban,Region,Altitude,London zone,LSOA Code,Local authority,MSOA Code,Middle layer super output area,Parish Code,Census output area,Constituency Code,Index of Multiple Deprivation,Quality,User Type,Last updated,Nearest station,Distance to station,Postcode area,Postcode district,Police force,Water company,Plus Code,Average Income
AB1 0AA,No,57.101474,-2.242851,385386,801193,NJ853011,"","Aberdeen City","Lower Deeside",S12000033,S13002843,Scotland,S99999999,"Aberdeen South",1980-01-01,1996-06-01,"","",,,"","","Cults, Bieldside and Milltimber West - 02","Accessible small town",,46,,S01006514,,S02001237,"Cults, Bieldside and Milltimber West",,S00090303,S14000002,6808,1,0,2020-02-19,"Portlethen",8.31408,AB,AB1,"Scotland","Scottish Water",9C9V4Q24+HV,
AB1 0AB,No,57.102554,-2.246308,385177,801314,NJ851013,"","Aberdeen City","Lower Deeside",S12000033,S13002843,Scotland,S99999999,"Aberdeen South",1980-01-01,1996-06-01,"","",,,"","","Cults, Bieldside and Milltimber West - 02","Accessible small town",,61,,S01006514,,S02001237,"Cults, Bieldside and Milltimber West",,S00090303,S14000002,6808,1,0,2020-02-19,"Portlethen",8.55457,AB,AB1,"Scotland","Scottish Water",9C9V4Q33+2F,
AB1 0AD,No,57.100556,-2.248342,385053,801092,NJ850010,"","Aberdeen City","Lower Deeside",S12000033,S13002843,Scotland,S99999999,"Aberdeen South",1980-01-01,1996-06-01,"","",,,"","","Cults, Bieldside and Milltimber West - 02","Accessible small town",,45,,S01006514,,S02001237,"Cults, Bieldside and Milltimber West",,S00090399,S14000002,6808,1,0,2020-02-19,"Portlethen",8.54352,AB,AB1,"Scotland","Scottish Water",9C9V4Q22+6M,
How can I write to the JS file from this CSV, without hitting my memory limit?

You need a csv stream parser that will parse it and provide output a line at a time and let you stream it to a file.
Here's one way to do it using the cvs-reader module:
const fs = require('fs');
const csvReader = require('csv-reader');
const { Transform } = require('stream');
const myTransform = new Transform({
readableObjectMode: true,
writableObjectMode: true,
transform(obj, encoding, callback) {
let data = JSON.stringify(obj);
if (this.tFirst) {
// beginning of transformed data
this.push("[");
this.tFirst = false;
} else {
data = "," + data; // add comma separator if not first object
}
this.push(data);
callback();
}
});
myTransform.tFirst = true;
myTransform._flush = function(callback) {
// end of transformed data
this.push("]");
callback();
}
// All of these arguments are optional.
const options = {
skipEmptyLines: true,
asObject: true, // convert data to object
parseNumbers: true,
parseBooleans: true,
trim: true
};
const csvStream = new csvReader(options);
const readStream = fs.createReadStream('example.csv', 'utf8');
const writeStream = fs.createWriteStream('example.json', {autoClose: false});
readStream.on('error', err => {
console.log(err);
csvStream.destroy(err);
}).pipe(csvStream).pipe(myTransform).pipe(writeStream).on('error', err => {
console.error(err);
}).on('finish', () => {
console.log('done');
});

The issue is that the csvtojson node module is trying to store this massive jsonObj in memory!
I found a different solution which involves using the csv-parser node module and then just parsed one row at a time instead of the whole csv!
Here is my solution:
const csv = require('csv-parser');
const fs = require('fs');
var stream = fs.createWriteStream("postcodes.js", {flags:'a'});
let first = false;
fs.createReadStream('./data/postcodes.csv')
.pipe(csv())
.on('data', (row) => {
if (row["In Use?"]) {
if (!first) {
first = true;
stream.write(`const postcodes = ["${row.Postcode},\n"`);
} else {
stream.write(`"${row.Postcode},\n"`);
}
}
})
.on('end', () => {
stream.write("]");
console.log('CSV file successfully processed');
});
It's not very pretty writing strings like const postcodes = to represent JavaScript, but it performs the desired function.

Related

How could I duplicate/copy file in an automatized way with JavaScript?

I have an gif file that is stored in a directory call assets on my computer. I would like to create X amount of duplicates and they should be stored in the same directory and each of them should have a different file name.
Example:
I in the assets directory is the gif file call 0.gif I would like to duplicate this gif file 10 times and The duplicates should be called 1.gif, 2.gif, 3.R and so on.
The simplest option is to use fs and using copyFile function available
const fs = require("fs");
const path = require("path");
let copyMultiple = (src, count) => {
let initCount = 0;
while (initCount < count) {
initCount++;// you can put this at bottom too acc to your needs
const newFileName = `${initCount}_${initCount}${path.extname(src)}`;
console.log(newFileName, "is new file name");
fs.copyFile(src, newFileName, (error) => {
// if errors comes
if (error) {
console.log(error);
}
});
}
};
copyMultiple("1.gif", 3);
Another elegant way of doing this is
const util = require("util");
const fs = require("fs");
const path = require("path");
const copyFilePromise = util.promisify(fs.copyFile);
function copyFiles(srcFile, destDir, destFileNames) {
return Promise.all(
destFileNames.map((file) => {
return copyFilePromise(srcFile, path.join(destDir, file));
})
);
}
const myDestinationFileNames = ["second.gif", "third.gif"];
const sourceFileName = "1.gif";
copyFiles(sourceFileName, "", myDestinationFileNames)
.then(() => {
console.log("Copying is Done");
})
.catch((err) => {
console.log("Got and Error", error);
});
Using this will also give upperhand of knowing when it is done.
You can read docs here
const fs = require("fs")
const filename = "index.js".split(".") //filename like 0.gif to gif
const times = 10 // number of times to duplicate
for(var int = 1; int < times; int++){
const newFilename = `${(parseInt(filename[0]) + init)}.${filename[1]}` //new filename like 0.gif to 1.gif
fs.copyFileSync(filename, newfilename)
}
use the write file and read file from the fs module and a simple for loop
not sure which framework you're on but fs.copyFile() is the standard way for node.js https://nodejs.org/api/fs.html#fscopyfilesrc-dest-mode-callback

How long should it take to process big files in node.js?

I am not used to working with big files but I was wondering how long does it approximately takes to read and write in node.js when using files that are 2-3GB big or ~ 7.5 million lines.
my current implementation takes about ~ 12 minutes or 737859.276ms
I know that it's hard to approximate and depends on context but I wanted to make sure if it could be done faster.
function LogParser(filePath, outputFile, cb) {
try {
const rl = createInterface({
input: createReadStream(filePath),
crlfDelay: Infinity,
});
writer.pipe(createWriteStream(outputFile));
rl.on('line', (line) => {
// Gets line and splits it by " - " where the ip is the first value
const IPAddress = line.split(' - ')[0];
const locationData = lookup(IPAddress); // translate to data object with region and country and city
const userAgentData = parser(line); // get userAgent data
const result = {
// object with data inside
};
writer.write(result)
});
rl.on('close', () => {
writer.end()
});
} catch (e) {
throw new Error(e.message);
}

Javascript,Nodejs: search for a specific word string in files

i'm trying to make an app that searches for all files
contains a specified string under the current directory/subdirectory.
as i understand it means i need to create a read stream, loop it, load the read data to an array, if the word found give __filename, dirname and if ! not found message.
unfortunately, i could not make it work...
any clue?
var path = require('path'),
fs=require('fs');
function fromDir(startPath,filter,ext){
if (!fs.existsSync(startPath)){
console.log("no dir ",startPath);
return;
};
var files=fs.readdirSync(startPath);
let found = files.find((file) => {
let thisFilename = path.join(startPath, file);
let stat = fs.lstatSync(thisFilename);
var readStream = fs.createReadStream(fs);
var readline = require('readline');
if (stat.isDirectory()) {
fromDir(thisFilename, filename,readline, ext);
} else {
if (path.extname(createReadStream) === ext && path.basename(thisFilename, ext) === filename) {
return true;
}
}
});
console.log('-- your word has found on : ',filename,__dirname);
}
if (!found) {
console.log("Sorry, we didn't find your term");
}
}
fromDir('./', process.argv[3], process.argv[2]);
Because not everything was included in the question, I made an assumption:
We are looking for full words (if that's not the case, replace the regex with a simple indexOf()).
Now, I've split the code into two functions - to make it both more readable and easier to recursively find the files.
Synchronous version:
const path = require('path');
const fs = require('fs');
function searchFilesInDirectory(dir, filter, ext) {
if (!fs.existsSync(dir)) {
console.log(`Specified directory: ${dir} does not exist`);
return;
}
const files = getFilesInDirectory(dir, ext);
files.forEach(file => {
const fileContent = fs.readFileSync(file);
// We want full words, so we use full word boundary in regex.
const regex = new RegExp('\\b' + filter + '\\b');
if (regex.test(fileContent)) {
console.log(`Your word was found in file: ${file}`);
}
});
}
// Using recursion, we find every file with the desired extention, even if its deeply nested in subfolders.
function getFilesInDirectory(dir, ext) {
if (!fs.existsSync(dir)) {
console.log(`Specified directory: ${dir} does not exist`);
return;
}
let files = [];
fs.readdirSync(dir).forEach(file => {
const filePath = path.join(dir, file);
const stat = fs.lstatSync(filePath);
// If we hit a directory, apply our function to that dir. If we hit a file, add it to the array of files.
if (stat.isDirectory()) {
const nestedFiles = getFilesInDirectory(filePath, ext);
files = files.concat(nestedFiles);
} else {
if (path.extname(file) === ext) {
files.push(filePath);
}
}
});
return files;
}
Async version - because async is cool:
const path = require('path');
const fs = require('fs');
const util = require('util');
const fsReaddir = util.promisify(fs.readdir);
const fsReadFile = util.promisify(fs.readFile);
const fsLstat = util.promisify(fs.lstat);
async function searchFilesInDirectoryAsync(dir, filter, ext) {
const found = await getFilesInDirectoryAsync(dir, ext);
for (file of found) {
const fileContent = await fsReadFile(file);
// We want full words, so we use full word boundary in regex.
const regex = new RegExp('\\b' + filter + '\\b');
if (regex.test(fileContent)) {
console.log(`Your word was found in file: ${file}`);
}
};
}
// Using recursion, we find every file with the desired extention, even if its deeply nested in subfolders.
async function getFilesInDirectoryAsync(dir, ext) {
let files = [];
const filesFromDirectory = await fsReaddir(dir).catch(err => {
throw new Error(err.message);
});
for (let file of filesFromDirectory) {
const filePath = path.join(dir, file);
const stat = await fsLstat(filePath);
// If we hit a directory, apply our function to that dir. If we hit a file, add it to the array of files.
if (stat.isDirectory()) {
const nestedFiles = await getFilesInDirectoryAsync(filePath, ext);
files = files.concat(nestedFiles);
} else {
if (path.extname(file) === ext) {
files.push(filePath);
}
}
};
return files;
}
If you have not worked with / understand async/await yet, it is a great step to take and learn it as soon as possible. Trust me, you will love not seeing those ugly callbacks again!
UPDATE:
As you pointed in comments, you want it to execute the function after running node process on the file. You also want to pass the function parameters as node's arguments.
To do that, at the end of your file, you need to add:
searchFilesInDirectory(process.argv[2], process.argv[3], process.argv[4]);
This extracts our arguments and passes them to the function.
With that, you can call our process like so (example arguments):
node yourscriptname.js ./ james .txt
Personally, if I were to write this, I would leverage the beauty of asynchronous code, and Node.js's async / await.
As a very side note:
You can easily improve readability of your code, if you add proper formatting to it. Don't get me wrong, it's not terrible - but it can be improved:
Use spaces OR newlines after commas.
Use spaces around equality operators and arithmetic operators.
As long as you are consistent with formatting, everything looks much better.

Node JS - How to write stream big json data into json array file?

I have difficult to write a json data into json file using stream module.
I learn about this from several blog tutorial, one of them is this page
Let say i am working with big json data on a json file. I think it is not possible to store all json object inside my memory. So i decided to do it using stream module.
Here the codes i have done:
writeStream.js
var Writable = require('stream').Writable,
util = require('util');
var WriteStream = function() {
Writable.call(this, {
objectMode: true
});
};
util.inherits(WriteStream, Writable);
WriteStream.prototype._write = function(chunk, encoding, callback) {
console.log('write : ' + JSON.stringify(chunk));
callback();
};
module.exports = WriteStream;
readStream.js
var data = require('./test_data.json'),
Readable = require('stream').Readable,
util = require('util');
var ReadStream = function() {
Readable.call(this, {
objectMode: true
});
this.data = data;
this.curIndex = 0;
};
util.inherits(ReadStream, Readable);
ReadStream.prototype._read = function() {
if (this.curIndex === this.data.length) {
return this.push(null);
}
var data = this.data[this.curIndex++];
console.log('read : ' + JSON.stringify(data));
this.push(data);
};
module.exports = ReadStream;
Called with this code:
var ReadStream = require('./readStream.js'),
WriteStream = require('./writeStream.js');
var rs = new ReadStream();
var ws = new WriteStream();
rs.pipe(ws);
Problem: I want to write it into different file, how is it possible?
Can you please help me?
If you are looking for a solution to just write the data from your ReadStream into a different file, you can try fs.createWriteStream. It will return you a writeable stream which can be piped directly to your ReadStream.
You will have to make a minor change in your readStream.js. You are currently pushing an object thus making it an object stream while a write stream expects either String or Buffer unless started in the ObjectMode. So you can do one of the following:
Start the write stream in the object mode. More info here.
Push String or Buffer in your read stream as writable stream internally calls writable.write which expects either String or Buffer. More info here.
If we follow the second option as an example, then your readStream.js should look like this:
var data = require('./test_data.json'),
Readable = require('stream').Readable,
util = require('util');
var ReadStream = function() {
Readable.call(this, {
objectMode: true
});
this.data = data;
this.curIndex = 0;
};
util.inherits(ReadStream, Readable);
ReadStream.prototype._read = function() {
if (this.curIndex === this.data.length) {
return this.push(null);
}
var data = this.data[this.curIndex++];
console.log('read : ' + JSON.stringify(data));
this.push(JSON.stringify(data));
};
module.exports = ReadStream;
You can call the above by using the following code
var ReadStream = require('./readStream.js');
const fs = require('fs');
var rs = new ReadStream();
const file = fs.createWriteStream('/path/to/output/file');
rs.pipe(file);
This will write the data from test_data.json to the output file.
Also as a good practice and to reliably detect write errors, add a listener for the 'error' event. For the above code, you can add the following:
file.on('error',function(err){
console.log("err:", err);
});
Hope this helps.

Node csv-parse halting after 16 rows

I'm experiencing very weird behavior running csv-parse in the following setup:
csv - ^1.1.0
stream-transform - ^0.1.1
node - v4.6.0
And running the following code to transform CSVs into an array of objects:
var parse = require('csv').parse
var fs = require('fs')
var streamtransform = require('stream-transform')
function mapCsvRow(headers, record) {
return record.reduce((p, c, i) => {
p[headers[i]] = c //eslint-disable-line
return p
}, {})
}
function parseFile(path) {
var headers
var output = []
var parser = parse({ delimiter: ',' })
var input = fs.createReadStream(path)
var transformer = streamtransform((record) => {
if (!headers) {
headers = record
return record
}
output.push(mapCsvRow(headers, record))
return record
})
// Return a new promise to wrap the parsing stream
return new Promise((resolve, reject) => {
input
.pipe(parser)
.pipe(transformer)
.on('error', e => reject(e))
.on('finish', () => resolve(output))
})
}
module.exports = parseFile
module.exports = parseFile
What happens is that the parser halts on processing files larger than 16 records. No error, no finish, no nothing.
I have no idea how to debug this, I couldn't get any input from the parser when that happens.
Looks like you have reader stream and transformer stream, but you don't have any writer stream. Hence transformer stream gets full and pauses read stream.
Try rewrite your code to not use output array. It's pointless to use stream if you hold results in memory.

Categories