Read a CSV file with Node.js with a max number of rows - javascript

I'm trying to read a CSV file with node.js using the csv-parser library.
Since it's a big file, I need to check the header and the first 100 rows, then stop and return true if everything is OK, or false if the data doesn't meet the conditions.
How can I achieve this?
This is what I have so far:
const csv = require('csv-parser');
const fs = require('fs');
const moment = require('moment');

exports.checkFileFormat = (file) => {
  let stream = fs.createReadStream(file.tempFilePath)
    .pipe(csv())
    .on('headers', (headers) => {
      /*...some logic...*/
    })
    .on('data', (row) => {
      if (isNaN(row["USAGE"]) // USAGE must be numeric (csv-parser yields strings)
        || !moment(row["START_DATE"], 'YYYYMMDD', true).isValid()
        || !moment(row["END_DATE"], 'YYYYMMDD', true).isValid()) {
        stream.unpipe(csv());
        return false;
      }
    })
    .on('end', () => {
      console.log('CSV file successfully processed');
    });
  return true;
}
In a previous version I had also declared var num = 100 and tested it inside .on('data', (row) => {...}), but it didn't work.

Following up from my comment
Make the function checkFileFormat return a promise. Inside the promise, call resolve(false) instead of return false, and call resolve(true) in the .on('end') callback. I'm not completely sure this will work, but that's how I would approach it:
const csv = require('csv-parser');
const fs = require('fs');
const moment = require('moment');

exports.checkFileFormat = (file) => {
  return new Promise((resolve, reject) => {
    let stream = fs.createReadStream(file.tempFilePath)
      .pipe(csv())
      .on('headers', (headers) => {
        /*...some logic...*/
      })
      .on('data', (row) => {
        if (isNaN(row["USAGE"]) // USAGE must be numeric (csv-parser yields strings)
          || !moment(row["START_DATE"], 'YYYYMMDD', true).isValid()
          || !moment(row["END_DATE"], 'YYYYMMDD', true).isValid()) {
          stream.end(); // stream.unpipe(csv());
          resolve(false);
        }
      })
      .on('end', () => {
        console.log('CSV file successfully processed');
        resolve(true);
      });
  });
}
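The caller then has to consume the result through the promise; a minimal usage sketch (assuming the module above is saved as ./checkFileFormat.js, a hypothetical path):

const { checkFileFormat } = require('./checkFileFormat'); // hypothetical module path

// `file` is the uploaded file object with a tempFilePath, as in the question
checkFileFormat(file).then((ok) => {
  if (!ok) {
    // reject the upload / report a bad file format
  }
});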

If you want to read a certain number of lines and then stop, you can try the following:
const csv = require('csv-parser');
const fs = require('fs');

let count = 0;
let maxLines = 3;

let fsStream = fs.createReadStream('./data.csv');
let csvStream = csv();

fsStream.pipe(csvStream)
  .on('headers', (headers) => {
    console.log(headers)
  })
  .on('data', (data) => {
    if (count >= maxLines) {
      fsStream.unpipe(csvStream);
      csvStream.end();
      fsStream.destroy();
    } else {
      console.log(data);
      count++;
    }
  });
Basically you count each line read, and when the max is reached you unpipe the csv-stream from the fs-stream, then end the csv-stream and finally destroy the fs-stream.
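Putting the two answers together for the original goal (validate the header and the first 100 rows, then stop and report a boolean), here is a rough sketch, assuming the same csv-parser/moment checks as in the question; the maxLines parameter is only for illustration:

const csv = require('csv-parser');
const fs = require('fs');
const moment = require('moment');

exports.checkFileFormat = (file, maxLines = 100) =>
  new Promise((resolve, reject) => {
    let count = 0;
    const fsStream = fs.createReadStream(file.tempFilePath);
    const csvStream = csv();

    const stop = (result) => {
      // tear the pipeline down and settle the promise exactly once
      fsStream.unpipe(csvStream);
      csvStream.end();
      fsStream.destroy();
      resolve(result);
    };

    fsStream.on('error', reject);
    fsStream.pipe(csvStream)
      .on('headers', (headers) => {
        /* ...validate headers, call stop(false) if they are wrong... */
      })
      .on('data', (row) => {
        const rowOk = !isNaN(row['USAGE'])
          && moment(row['START_DATE'], 'YYYYMMDD', true).isValid()
          && moment(row['END_DATE'], 'YYYYMMDD', true).isValid();
        if (!rowOk) return stop(false);
        if (++count >= maxLines) return stop(true); // first 100 rows looked fine
      })
      .on('end', () => resolve(true)) // fewer than maxLines rows, all valid
      .on('error', reject);
  });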

Related

Confused about Node's event loop by using promises

I'm writing a recursive function that creates an object tree of a selected file directory. My code runs, but not in the order I expected, and I can't see its output. Here is the code:
const fs = require("fs");
const basePath = process.argv[2];
const result = {};
const isDirectory = path => {
return new Promise((resolve, reject) => {
fs.lstat(path, (err, stats) => {
if (err) reject("No such file or Directory");
resolve(stats.isDirectory());
});
});
};
const createTree = (path, target) => {
return new Promise((reject, resolve) => {
fs.readdir(path, (err, list) => {
for (const item of list) {
const currentLocation = `${path}/${item}`;
isDirectory(currentLocation).then(isDir => {
console.log(result); //I CAN SEE THE RESULT HERE
if (!isDir) {
target[item] = true;
} else {
target[item] = {};
resolve(createTree(currentLocation, target[item]));
}
});
}
});
reject("Somthing went wrong while getting the list of files");
});
};
createTree(basePath, result)
.then(() => console.log("result --->", result)) //BUT NOT HERE
.catch(err => console.log("Consume Error ==>", err));
I've also done it with async/await, but I'm curious why it doesn't work with plain promises.
Here is the fully working example with async/await:
const fs = require("fs");
const basePath = process.argv[2]; //Getting the path
const result = {};
//Function to check my path is exist and it's a directory
const isDirectory = async path => {
try {
const stats = await fs.promises.lstat(path); //Used istat to get access to the "isDirectory()" method
return stats.isDirectory();
} catch (error) {
throw new Error("No such file or Directory");
}
};
//Recursive function that should create the object tree of the file system
const createTree = async (path, target) => {
try {
const list = await fs.promises.readdir(path);
for (const item of list) {
const currentLocation = `${path}/${item}`;
const isDir = await isDirectory(currentLocation);
//If it's a file, then assign it to true
//Otherwise create object of that directory and do the same for it
if (!isDir) {
target[item] = true;
} else {
target[item] = {};
await createTree(currentLocation, target[item]);
}
}
} catch (err) {
console.log("Somthing went wrong while getting the list of files");
}
};
//Consuming the createTree function
(async () => {
try {
await createTree(basePath, result);
console.log(result);
} catch (error) {
console.log(error.message);
}
})();
I'm just curious whether it's possible to do the same using only promises.
async and await are simply syntactic sugar that makes it easier to work with Promise-based programs. Any program depending on those keywords can be rewritten not to use them -
// main.js
import { readdir } from "fs/promises"
import { join } from "path"

function createTree (init = ".")
{ const one = path => p =>
    p.isDirectory()
      ? many(join(path, p.name)).then(r => ({ [p.name]: r }))
      : { [p.name]: true }

  const many = path =>
    readdir(path, { withFileTypes: true })
      .then(r => Promise.all(r.map(one(path))))
      .then(r => Object.assign(...r))

  return many(init)
}

createTree(".")
  .then(v => console.log(JSON.stringify(v, null, 2)))
  .catch(console.error)
Now let's add some sample files so we can see our program working correctly -
$ yarn add immutable # (any example package)
$ node main.js
Output -
{
  "main.js": true,
  "node_modules": {
    ".yarn-integrity": true,
    "immutable": {
      "LICENSE": true,
      "README.md": true,
      "contrib": {
        "cursor": {
          "README.md": true,
          "__tests__": {
            "Cursor.ts.skip": true
          },
          "index.d.ts": true,
          "index.js": true
        }
      },
      "dist": {
        "immutable-nonambient.d.ts": true,
        "immutable.d.ts": true,
        "immutable.es.js": true,
        "immutable.js": true,
        "immutable.js.flow": true,
        "immutable.min.js": true
      },
      "package.json": true
    }
  },
  "package.json": true,
  "yarn.lock": true
}
If you would like the init path to be included in the tree, only a small modification is necessary -
// main.js
import { readdir } from "fs/promises"
import { join, basename } from "path" // !

function createTree (init = ".")
{ const one = path => p =>
    p.isDirectory()
      ? many(join(path, p.name)) // !
      : { [p.name]: true }

  const many = path =>
    readdir(path, { withFileTypes: true })
      .then(r => Promise.all(r.map(one(path))))
      .then(r => ({ [basename(path)]: Object.assign(...r) })) // !

  return many(init)
}
Now the tree contains our initial path -
createTree(".")
.then(v => console.log(JSON.stringify(v, null, 2)))
.catch(console.error)
{ ".": // <- starting path
{ ... }
}
To see how to write this program using async generators, please see the original Q&A.

createReadStream wait for first to finish before starting second createReadStream

From my understanding, createReadStream is asynchronous and there is no equivalent synchronous method. This leads to the problem I'm having today. As you can see in my code, I have two createReadStream calls, and the second one depends on the first: they absolutely need to run in order so that stationIds gets filled with the desired values. My problem is that stationIds is empty, so the second createReadStream doesn't do anything at all. How can I run these two createReadStream calls in the order I want?
// Read content of files
csvDataFiles.forEach(file => {
  const data = []
  fs.createReadStream(directory + file)
    .pipe(csv())
    .on('data', function (row) {
      data.push(row)
    })
    .on('end', function() {
      const object = {}
      let stationId = parseInt(file.substr(0, file.indexOf('.')))
      stationIds.push(stationId)
      object[stationId] = {}
      object[stationId]['length'] = data.length
      object[stationId]['data'] = data
      stationsData.push(object)
    })
})

// Read stations
fs.createReadStream(directory + 'asfdsfsd.csv')
  .pipe(csv({
    skipLines: 3
  }))
  .on('data', function (row) {
    if (stationIds.includes(parseInt(row['Station ID']))) {
      // console.log(row)
      stations.push(row)
    }
  })
  .on('end', function () {
    // console.log(stations)
  })
You could wrap the processing of the streams in a promise and wait until all data is read. After that you can process the final file. This still needs some error handling, but should give you something to start with:
// Read content of files
async function processStationsCsv(directory, csvDataFiles) {
  let stationsIds = [];
  let stationsData = [];
  for (const csvFile of csvDataFiles) {
    await readStationDataFromFile(directory + csvFile, stationsIds, stationsData);
  }

  let stations = [];
  await new Promise(resolve => {
    // Read stations
    fs.createReadStream(directory + 'asfdsfsd.csv')
      .pipe(csv({
        skipLines: 3
      }))
      .on('data', row => {
        if (stationsIds.includes(parseInt(row['Station ID']))) {
          stations.push(row)
        }
      })
      .on('end', () => {
        resolve();
      })
  })
  console.log(stations);
}

function readStationDataFromFile(filePath, stationIds, stationsData) {
  return new Promise(resolve => {
    const data = [] // collect the rows of this file
    const file = filePath.split('/').pop() // recover the file name, e.g. "123.csv"
    fs.createReadStream(filePath)
      .pipe(csv())
      .on('data', function (row) {
        data.push(row)
      })
      .on('end', function () {
        const object = {}
        let stationId = parseInt(file.substr(0, file.indexOf('.')))
        stationIds.push(stationId)
        object[stationId] = {}
        object[stationId]['length'] = data.length
        object[stationId]['data'] = data
        stationsData.push(object)
        resolve();
      })
  });
}

// call it with the directory path and the array of csvDataFiles-paths
processStationsCsv(directory, csvDataFiles);

Read and write to csv file with Node.js fast-csv library

I may be lacking some in-depth understanding of streams in general, but I would like to know how what I need should work efficiently.
I want to read a CSV file, make a query to the database (or an API) for each row and attach the result to it, and then write the row with the attached data to a new CSV file. I am using the fast-csv Node library for this.
Here is my implementation:
const fs = require("fs");
const csv = require("fast-csv");
const delay = t => new Promise(resolve => setTimeout(resolve, t));
const asyncFunction = async (row, csvStream) => {
// Imitate some stuff with database
await delay(1200);
row.data = "data";
csvStream.write(row);
};
const array = [];
const csvStream = csv.format({ headers: true });
const writeStream = fs.createWriteStream("output.csv");
csvStream.pipe(writeStream).on("finish", () => {
console.log("End of writing");
});
fs.createReadStream("input.csv")
.pipe(csv.parse({ headers: true }))
.transform(async function(row, next) {
array.push(asyncFunction(row, csvStream));
next();
})
.on("finish", function() {
console.log("finished reading file");
//Wait for all database requests and writings to be finished to close write stream
Promise.all(array).then(() => {
csvStream.end();
console.log("finished writing file");
});
});
In particular, I would like to know whether there are ways to optimize what I am doing here, because I feel I am missing something important about how this library can be used for these types of cases.
Regards,
Rokas
I was able to find a solution in the fast-csv issues section. A good person, doug-martin, provided this gist on how you can do this kind of operation efficiently via a Transform stream:
const path = require('path');
const fs = require('fs');
const { Transform } = require('stream');
const csv = require('fast-csv');

class PersistStream extends Transform {
  constructor(args) {
    super({ objectMode: true, ...(args || {}) });
    this.batchSize = 100;
    this.batch = [];
    if (args && args.batchSize) {
      this.batchSize = args.batchSize;
    }
  }

  _transform(record, encoding, callback) {
    this.batch.push(record);
    if (this.shouldSaveBatch) {
      // we have hit our batch size to process the records as a batch
      this.processRecords()
        // we successfully processed the records so callback
        .then(() => callback())
        // An error occurred!
        .catch(err => callback(err));
      return;
    }
    // we shouldnt persist so ignore
    callback();
  }

  _flush(callback) {
    if (this.batch.length) {
      // handle any leftover records that were not persisted because the batch was too small
      this.processRecords()
        // we successfully processed the records so callback
        .then(() => callback())
        // An error occurred!
        .catch(err => callback(err));
      return;
    }
    // no records to persist so just call callback
    callback();
  }

  pushRecords(records) {
    // emit each record for down stream processing
    records.forEach(r => this.push(r));
  }

  get shouldSaveBatch() {
    // this could be any check; for this example it is the record count
    return this.batch.length >= this.batchSize;
  }

  async processRecords() {
    // save the records
    const records = await this.saveBatch();
    // be sure to emit them
    this.pushRecords(records);
    return records;
  }

  async saveBatch() {
    const records = this.batch;
    this.batch = [];
    console.log(`Saving batch [noOfRecords=${records.length}]`);
    // This is where you should save/update/delete the records
    return new Promise(res => {
      setTimeout(() => res(records), 100);
    });
  }
}

const processCsv = ({ file, batchSize }) =>
  new Promise((res, rej) => {
    let recordCount = 0;
    fs.createReadStream(file)
      // catch file read errors
      .on('error', err => rej(err))
      .pipe(csv.parse({ headers: true }))
      // catch any parsing errors
      .on('error', err => rej(err))
      // pipe into our processing stream
      .pipe(new PersistStream({ batchSize }))
      .on('error', err => rej(err))
      .on('data', () => {
        recordCount += 1;
      })
      .on('end', () => res({ event: 'end', recordCount }));
  });

const file = path.resolve(__dirname, `batch_write.csv`);

processCsv({ file, batchSize: 5 })
  .then(({ event, recordCount }) => {
    console.log(`Done Processing [event=${event}] [recordCount=${recordCount}]`);
  })
  .catch(e => {
    console.error(e.stack);
  });
https://gist.github.com/doug-martin/b434a04f164c81da82165f4adcb144ec
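To get back to the original goal (attach data to each row and write a new CSV), the records that PersistStream pushes downstream can be piped straight into fast-csv's formatter. A rough sketch, assuming saveBatch is replaced with the real database/API call that attaches the data to each record:

const out = fs.createWriteStream('output.csv');
const csvStream = csv.format({ headers: true });
csvStream.pipe(out).on('finish', () => console.log('finished writing file'));

fs.createReadStream('input.csv')
  .pipe(csv.parse({ headers: true }))
  .pipe(new PersistStream({ batchSize: 100 })) // pushes each enriched record downstream
  .pipe(csvStream);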

Get First Column of CSV file javascript

I have tried PapaParse without success. How would one get the first column value of a CSV file?
const csv = require('csv-parser');
const fs = require('fs');

(async () => {
  try {
    fs.createReadStream('test.csv')
      .pipe(csv())
      .on('data', (row) => {
        console.log(row);
      })
      .on('end', () => {
        console.log('CSV file successfully processed');
      });
  } catch (err) {
    console.log(error(err));
    await browser.close();
    console.log(error("Browser Closed"));
  }
})();
For anyone in the future: define a function like the one below and point it at your CSV file of URLs. The index passed after split(',') selects the column ([0] is the first column, [1] the second):
const fs = require('fs');

function readURLFile(path) {
  return fs.readFileSync(path, 'utf-8')
    .split('\n')
    .map((elt) => {
      const url = elt.split(',')[1].replace('\r', '');
      return `http://${url.toLowerCase()}`;
    });
}
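If you'd rather stay with the csv-parser stream from the question, a minimal sketch (assuming the file has a header row) is to take the first value of each parsed row object:

const csv = require('csv-parser');
const fs = require('fs');

const firstColumn = [];

fs.createReadStream('test.csv')
  .pipe(csv())
  .on('data', (row) => {
    // row is keyed by header name; grab the value of the first column
    firstColumn.push(Object.values(row)[0]);
  })
  .on('end', () => {
    console.log(firstColumn);
  });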

.then() does not appear to be waiting for the previous .then()

I'm creating a process that converts multiple markdown files into a single pdf. It creates a pdf file for each .md file found in the source directory, then merges the individual pdf files into one pdf. It is this last step that is failing, saying the individual pdf files do not exist.
const markdownpdf = require('markdown-pdf')
const path = require('path')
const PDFMerge = require('pdf-merge')
const fse = require('fs-extra')

const srcDir = '../manuscript'
const outDir = 'out'

const main = () => {
  fse.pathExists(outDir)
    .then(() => {
      fse.remove(outDir).then(() => {
        fse.ensureDir(outDir)
      }).then(() => {
        return fse.readdir(srcDir)
      }).then((srcDirFiles) => {
        console.log('source directory file count = ', srcDirFiles.length)
        return srcDirFiles.filter(f => path.extname(f) === '.md')
      }).then((mdFiles) => {
        console.log('number of md files', mdFiles.length);
        return mdFiles.map(file => {
          const outFileName = `${path.basename(file, '.md')}.pdf`
          fse.createReadStream(`${srcDir}/${file}`)
            .pipe(markdownpdf())
            .pipe(fse.createWriteStream(`${outDir}/${outFileName}`))
          return `${outDir}/${outFileName}`
        })
      }).then(outFiles => {
        console.log('number of pdf files created =', outFiles.length)
        PDFMerge(outFiles, { output: `${__dirname}/3.pdf` })
      })
    })
}

main()
main()
If I wrap the PDFMerge() line in setTimeout(), it does work:
setTimeout(() => {
  PDFMerge(outFiles, { output: `${__dirname}/3.pdf` })
}, 1000)
I'm wondering why the setTimeout() is needed and what needs to be changed so it isn't.
I also wrote an async/await version that had the same problem and also worked with setTimeout().
Edit
In response to Zach Holt's suggestion, here is the async/await version:
const markdownpdf = require('markdown-pdf')
const path = require('path')
const PDFMerge = require('pdf-merge')
const fse = require('fs-extra')

const srcDir = '../manuscript'
const outDir = 'out'

const createPdf = async (file) => {
  try {
    const outFileName = `${path.basename(file, '.md')}.pdf`
    await fse.createReadStream(`${srcDir}/${file}`)
      .pipe(markdownpdf())
      .pipe(await fse.createWriteStream(`${outDir}/${outFileName}`))
  }
  catch (e) {
    console.log(e)
  }
}

const makePdfFiles = (files) => {
  files.forEach(file => {
    if (path.extname(file) === '.md') {
      createPdf(file)
    }
  })
}

const mergeFiles = async (files) => {
  try {
    await PDFMerge(files, {output: `${__dirname}/3.pdf`})
  }
  catch (e) {
    console.log(e)
  }
}

const addPathToPdfFiles = (files) => {
  return files.map(file => {
    return `${outDir}/${file}`
  })
}

const main = async () => {
  try {
    const exists = await fse.pathExists(outDir)
    if (exists) {
      await fse.remove(outDir)
    }
    await fse.ensureDir(outDir)
    const mdFiles = await fse.readdir(srcDir)
    const filesMade = await makePdfFiles(mdFiles)
    const pdfFiles = await fse.readdir(outDir)
    const pdfFilesWithPath = addPathToPdfFiles(pdfFiles)
    mergeFiles(pdfFilesWithPath)
    // setTimeout(() => {
    //   mergeFiles(pdfFilesWithPath)
    // }, 1000)
  } catch (e) {
    console.log(e)
  }
}
It has the same problem.
I also tried:
const makePdfFiles = files => {
  return new Promise((resolve, reject) => {
    try {
      files.forEach(file => {
        if (path.extname(file) === '.md') {
          createPdf(file)
        }
      })
      resolve(true)
    } catch (e) {
      reject(false)
      console.log('makePdfFiles ERROR', e)
    }
  })
}
But it made no difference.
You need to return the promise from ensureDir() to make it wait for it.
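For example, a sketch of just that one change:

fse.remove(outDir).then(() => {
  return fse.ensureDir(outDir) // returning the promise makes the next .then() wait for it
}).then(() => {
  return fse.readdir(srcDir)
})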
I think the issue might be that you're creating a read stream for each of the .md files, but not waiting for the reads to finish before trying to merge outFiles.
You could likely wait until the outFiles length is the same as the number of md files found before merging.
Also, you should stick with async/await for this. It'll keep the code much clearer.
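One way to express that waiting, as a rough sketch assuming the same modules and paths as the question: wrap each per-file conversion in a promise that resolves when its write stream emits 'finish', then Promise.all them before merging.

const createPdf = (file) =>
  new Promise((resolve, reject) => {
    const outFileName = `${path.basename(file, '.md')}.pdf`
    fse.createReadStream(`${srcDir}/${file}`)
      .pipe(markdownpdf())
      .pipe(fse.createWriteStream(`${outDir}/${outFileName}`))
      .on('finish', () => resolve(`${outDir}/${outFileName}`)) // resolve with the output path for merging
      .on('error', reject)
  })

// inside an async main():
const outFiles = await Promise.all(
  mdFiles.filter(f => path.extname(f) === '.md').map(createPdf)
)
await PDFMerge(outFiles, { output: `${__dirname}/3.pdf` })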
Let me over-simplify your code to illustrate the problem:
p1.then(() => {
  p2.then().then().then()
}).then(/* ??? */)
which is the same as:
p1.then(() => {
  p2.then().then().then()
  return undefined
}).then(/* undefined */)
What you need for chaining is to return the inner Promise:
p1.then(() => // no {code block} here, just return value
  p2.then().then().then()
).then(/* ??? */)
which is the same as:
p1.then(() => {
  p3 = p2.then()
  p4 = p3.then()
  p5 = p4.then()
  return p5
}).then(/* p5 */)
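Applied to the question's main(), that means returning the inner chain instead of burying it inside a block (a sketch of the shape only; the per-file PDF writes still need their own completion handling, as the other answers point out):

const main = () =>
  fse.pathExists(outDir)
    .then(() =>
      fse.remove(outDir)
        .then(() => fse.ensureDir(outDir))
        .then(() => fse.readdir(srcDir))
        .then(srcDirFiles => srcDirFiles.filter(f => path.extname(f) === '.md'))
        // ...create the pdf files, then merge them...
    )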
As far as I can tell, the original problem was the approach, not the obvious errors correctly pointed out by others. I found a much simpler solution to the overall goal of producing a single pdf from multiple md files.
const markdownpdf = require('markdown-pdf')
const path = require('path')
const fse = require('fs-extra')

const srcDir = '../manuscript'

const filterAndAddPath = (files) => {
  try {
    const mdFiles = files
      .filter(f => path.extname(f) === '.md')
      .map(f => `${srcDir}/${f}`)
    return mdFiles
  }
  catch (e) {
    console.log('filterAndAddPath', e)
  }
}

const main4 = async () => {
  const allFiles = await fse.readdir(srcDir)
  const mdFiles = filterAndAddPath(allFiles)
  const bookPath = 'book.pdf'
  markdownpdf()
    .concat.from(mdFiles)
    .to(bookPath, function() {
      console.log('Created', bookPath)
    })
}

main4()
main4()
