createReadStream wait for first to finish before starting second createReadStream - javascript

From my understanding, createReadStream is asynchronous and there is no equivalent synchronous method. This leads to the problem I'm having today. As you can see in my code, I have two createReadStream calls, and the second one depends on the first: they absolutely need to run in order so that stationIds gets filled with the desired values. My problem is that stationIds is empty when the second createReadStream runs, so it doesn't do anything at all. How can I run these two createReadStream calls in the order I want?
// Read content of files
csvDataFiles.forEach(file => {
  const data = []
  fs.createReadStream(directory + file)
    .pipe(csv())
    .on('data', function (row) {
      data.push(row)
    })
    .on('end', function () {
      const object = {}
      let stationId = parseInt(file.substr(0, file.indexOf('.')))
      stationIds.push(stationId)
      object[stationId] = {}
      object[stationId]['length'] = data.length
      object[stationId]['data'] = data
      stationsData.push(object)
    })
})

// Read stations
fs.createReadStream(directory + 'asfdsfsd.csv')
  .pipe(csv({
    skipLines: 3
  }))
  .on('data', function (row) {
    if (stationIds.includes(parseInt(row['Station ID']))) {
      // console.log(row)
      stations.push(row)
    }
  })
  .on('end', function () {
    // console.log(stations)
  })

You could wrap the processing of each stream in a promise and wait until all of its data is read. After that you can process the final file. This still needs some error handling, but it should give you something to start with:
const fs = require('fs')
const path = require('path')
const csv = require('csv-parser') // assuming the csv-parser package, which matches the csv()/skipLines usage in the question

// Read content of files
async function processStationsCsv(directory, csvDataFiles) {
  let stationsIds = [];
  let stationsData = [];
  for (const csvFile of csvDataFiles) {
    await readStationDataFromFile(directory + csvFile, stationsIds, stationsData);
  }

  let stations = [];
  await new Promise(resolve => {
    // Read stations
    fs.createReadStream(directory + 'asfdsfsd.csv')
      .pipe(csv({
        skipLines: 3
      }))
      .on('data', row => {
        if (stationsIds.includes(parseInt(row['Station ID']))) {
          stations.push(row)
        }
      })
      .on('end', () => {
        resolve();
      })
  })
  console.log(stations);
}

function readStationDataFromFile(filePath, stationIds, stationsData) {
  return new Promise(resolve => {
    const data = []
    // derive the file name from the path so the station id can be parsed from it
    const file = path.basename(filePath)
    fs.createReadStream(filePath)
      .pipe(csv())
      .on('data', function (row) {
        data.push(row)
      })
      .on('end', function () {
        const object = {}
        let stationId = parseInt(file.substr(0, file.indexOf('.')))
        stationIds.push(stationId)
        object[stationId] = {}
        object[stationId]['length'] = data.length
        object[stationId]['data'] = data
        stationsData.push(object)
        resolve();
      })
  });
}

// call it with the directory path and the array of csvDataFiles-paths
processStationsCsv(directory, csvDataFiles);
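For the error handling the answer says is still missing, a minimal sketch (my addition, not part of the original answer) could reject the promise on the streams' 'error' events and catch the failure where processStationsCsv is called:

function readStationDataFromFile(filePath, stationIds, stationsData) {
  return new Promise((resolve, reject) => {
    const data = []
    fs.createReadStream(filePath)
      .on('error', reject)   // the file could not be opened or read
      .pipe(csv())
      .on('error', reject)   // the CSV could not be parsed
      .on('data', row => data.push(row))
      .on('end', () => {
        // ...same station bookkeeping as above...
        resolve();
      })
  });
}

processStationsCsv(directory, csvDataFiles)
  .catch(err => console.error('Failed to process station CSVs:', err));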

Related

How to wait for all in Javascript

My code is nearly working. My problem is that only the first CSV file's data is logged and then the lambda function ends.
I guess I need some way to wait for all stream pipes to end.
myCsvList.forEach((myElem) => {
  const data = [];
  // setup params
  const csvFile = s3.getObject(params).createReadStream(); // works fine
  csvFile
    .pipe(csv())
    .on('data', function(entry) {
      data.push(entry);
    })
    .on('end', () => {
      console.log(data); // after the log of the first element on myCsvList, the code finishes. It should log all csvFiles from myCsvList
    });
});
I guess I need promises or something?
You can turn every item into a Promise, then use Promise.all():
const getItem = (myElem) => {
  const data = [];
  // setup params
  const csvFile = s3.getObject(params).createReadStream(); // works fine
  return new Promise((resolve) => {
    csvFile
      .pipe(csv())
      .on('data', function (entry) {
        data.push(entry);
      })
      .on('end', () => {
        resolve(data); // resolve with all rows parsed from this file
      });
  });
};

const start = () => {
  // wait until all promises have finished
  return Promise.all(myCsvList.map(myElem => getItem(myElem)));
}
myCsvList.map(myElem => getItem(myElem)) produces a list of promises; Promise.all() resolves once every promise in that list has resolved.
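A short usage sketch (the logging is just illustrative): wait for the combined promise and then work with the array of per-file results:

start().then((allCsvData) => {
  // allCsvData[i] holds the rows parsed from myCsvList[i]
  allCsvData.forEach((data) => console.log(data));
});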

Read CSV file with Node.js with a max number of rows

I'm trying to read a CSV file with node.js using the csv-parser library.
Since it's a big file, I need to check the header and the first 100 rows, then stop the method and return true if everything is OK, or false if the data doesn't respect the conditions.
How can I achieve this?
This is what I have so far:
const csv = require('csv-parser');
const fs = require('fs');
const moment = require('moment');

exports.checkFileFormat = (file) => {
  let stream = fs.createReadStream(file.tempFilePath)
    .pipe(csv())
    .on('headers', (headers) => {
      /*...some logic...*/
    })
    .on('data', (row) => {
      if (!typeof (row["USAGE"]) == 'number'
        || !moment(row["START_DATE"], 'YYYYMMDD', true).isValid()
        || !moment(row["END_DATE"], 'YYYYMMDD', true).isValid()) {
        stream.unpipe(csv());
        return false;
      }
    })
    .on('end', () => {
      console.log('CSV file successfully processed');
    });
  return true;
}
In a previous version I had also declared var num = 100 and tested it inside .on('data', (row) => {...}), but it didn't work.
Following up from my comment:
Make the function checkFileFormat return a promise. Inside the promise, call resolve(false) instead of return false, and resolve(true) in the .on('end') callback. I'm not completely sure this will work, but that's how I would approach it:
const csv = require('csv-parser');
const fs = require('fs');
const moment = require('moment');

exports.checkFileFormat = (file) => {
  return new Promise((resolve, reject) => {
    let stream = fs.createReadStream(file.tempFilePath)
      .pipe(csv())
      .on('headers', (headers) => {
        /*...some logic...*/
      })
      .on('data', (row) => {
        // CSV values are parsed as strings, so check that USAGE is numeric
        if (isNaN(Number(row["USAGE"]))
          || !moment(row["START_DATE"], 'YYYYMMDD', true).isValid()
          || !moment(row["END_DATE"], 'YYYYMMDD', true).isValid()) {
          stream.end(); // stream.unpipe(csv());
          resolve(false);
        }
      })
      .on('end', () => {
        console.log('CSV file successfully processed');
        resolve(true);
      });
  });
}
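Since checkFileFormat now returns a promise instead of a boolean, the caller has to await it; a hypothetical usage sketch (the module path and surrounding function are made up for illustration):

const { checkFileFormat } = require('./checkFileFormat'); // hypothetical module path

async function handleUpload(file) {
  const isValid = await checkFileFormat(file);
  if (!isValid) {
    throw new Error('CSV file does not match the expected format');
  }
  // ...continue processing the upload...
}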
If you want to read a certain number of lines and then stop, you can try the following:
const csv = require('csv-parser');
const fs = require('fs');

let count = 0;
let maxLines = 3;
let fsStream = fs.createReadStream('./data.csv');
let csvStream = csv();

fsStream.pipe(csvStream)
  .on('headers', (headers) => {
    console.log(headers)
  })
  .on('data', (data) => {
    if (count >= maxLines) {
      fsStream.unpipe(csvStream);
      csvStream.end();
      fsStream.destroy();
    } else {
      console.log(data);
      count++;
    }
  });
Basically, you just count each line read, and when the max is reached you unpipe the csv-stream from the fs-stream, end the csv-stream and finally destroy the fs-stream.
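Putting the two ideas together for the original question, here is a sketch (assuming the same USAGE/START_DATE/END_DATE checks; an illustration, not a tested drop-in implementation) that validates the header plus the first 100 rows and resolves early:

const csv = require('csv-parser');
const fs = require('fs');
const moment = require('moment');

exports.checkFileFormat = (file) => {
  return new Promise((resolve) => {
    let count = 0;
    const maxLines = 100;
    const fsStream = fs.createReadStream(file.tempFilePath);
    const csvStream = csv();

    const stop = (result) => {
      fsStream.unpipe(csvStream);
      csvStream.end();
      fsStream.destroy();
      resolve(result);
    };

    fsStream.pipe(csvStream)
      .on('headers', (headers) => {
        /*...validate the headers, call stop(false) if they are wrong...*/
      })
      .on('data', (row) => {
        const rowIsValid = !isNaN(Number(row["USAGE"]))
          && moment(row["START_DATE"], 'YYYYMMDD', true).isValid()
          && moment(row["END_DATE"], 'YYYYMMDD', true).isValid();
        if (!rowIsValid) return stop(false);
        count++;
        if (count >= maxLines) stop(true); // first 100 rows were fine, stop reading
      })
      .on('end', () => resolve(true)); // the file had fewer than 100 rows
  });
}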

Read and write to csv file with Node.js fast-csv library

I may be lacking some in-depth understanding of streams in general. However, I would like to know how to do what I need efficiently.
I want a CSV file to be read, then for each row a query to the database (or API) is made and data is attached. After that, the row with the attached data is written to a new CSV file. I am using the fast-csv node library for this.
Here is my implementation:
const fs = require("fs");
const csv = require("fast-csv");

const delay = t => new Promise(resolve => setTimeout(resolve, t));

const asyncFunction = async (row, csvStream) => {
  // Imitate some stuff with database
  await delay(1200);
  row.data = "data";
  csvStream.write(row);
};

const array = [];
const csvStream = csv.format({ headers: true });
const writeStream = fs.createWriteStream("output.csv");

csvStream.pipe(writeStream).on("finish", () => {
  console.log("End of writing");
});

fs.createReadStream("input.csv")
  .pipe(csv.parse({ headers: true }))
  .transform(async function(row, next) {
    array.push(asyncFunction(row, csvStream));
    next();
  })
  .on("finish", function() {
    console.log("finished reading file");
    // Wait for all database requests and writings to be finished to close write stream
    Promise.all(array).then(() => {
      csvStream.end();
      console.log("finished writing file");
    });
  });
In particular, I would like to know whether there are ways to optimize what I am doing here, because I feel that I am missing something important about how this library can be used for this type of case.
Regards,
Rokas
I was able to find a solution in the fast-csv issues section. A helpful person, doug-martin, provided this gist on how you can do this kind of operation efficiently via a Transform stream:
const path = require('path');
const fs = require('fs');
const { Transform } = require('stream');
const csv = require('fast-csv');

class PersistStream extends Transform {
  constructor(args) {
    super({ objectMode: true, ...(args || {}) });
    this.batchSize = 100;
    this.batch = [];
    if (args && args.batchSize) {
      this.batchSize = args.batchSize;
    }
  }

  _transform(record, encoding, callback) {
    this.batch.push(record);
    if (this.shouldSaveBatch) {
      // we have hit our batch size, so process the records as a batch
      this.processRecords()
        // we successfully processed the records so callback
        .then(() => callback())
        // An error occurred!
        .catch(err => callback(err));
      return;
    }
    // we shouldn't persist yet so just callback
    callback();
  }

  _flush(callback) {
    if (this.batch.length) {
      // handle any leftover records that were not persisted because the last batch was too small
      this.processRecords()
        // we successfully processed the records so callback
        .then(() => callback())
        // An error occurred!
        .catch(err => callback(err));
      return;
    }
    // no records to persist so just call callback
    callback();
  }

  pushRecords(records) {
    // emit each record for downstream processing
    records.forEach(r => this.push(r));
  }

  get shouldSaveBatch() {
    // this could be any check, for this example it is the record count
    return this.batch.length >= this.batchSize;
  }

  async processRecords() {
    // save the records
    const records = await this.saveBatch();
    // be sure to emit them
    this.pushRecords(records);
    return records;
  }

  async saveBatch() {
    const records = this.batch;
    this.batch = [];
    console.log(`Saving batch [noOfRecords=${records.length}]`);
    // This is where you should save/update/delete the records
    return new Promise(res => {
      setTimeout(() => res(records), 100);
    });
  }
}

const processCsv = ({ file, batchSize }) =>
  new Promise((res, rej) => {
    let recordCount = 0;
    fs.createReadStream(file)
      // catch file read errors
      .on('error', err => rej(err))
      .pipe(csv.parse({ headers: true }))
      // catch any parsing errors
      .on('error', err => rej(err))
      // pipe into our processing stream
      .pipe(new PersistStream({ batchSize }))
      .on('error', err => rej(err))
      .on('data', () => {
        recordCount += 1;
      })
      .on('end', () => res({ event: 'end', recordCount }));
  });

const file = path.resolve(__dirname, `batch_write.csv`);

// process the file in batches of 5 records
processCsv({ file, batchSize: 5 })
  .then(({ event, recordCount }) => {
    console.log(`Done Processing [event=${event}] [recordCount=${recordCount}]`);
  })
  .catch(e => {
    console.error(e.stack);
  });
https://gist.github.com/doug-martin/b434a04f164c81da82165f4adcb144ec
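To get back to the original goal of writing the enriched rows to a new CSV file, a minimal sketch (reusing PersistStream and file from the gist above; the output file name is made up) can pipe the records emitted by PersistStream into a fast-csv format stream:

fs.createReadStream(file)
  .on('error', err => console.error(err))
  .pipe(csv.parse({ headers: true }))
  .pipe(new PersistStream({ batchSize: 100 })) // saveBatch() is where the database call would go
  .pipe(csv.format({ headers: true }))
  .pipe(fs.createWriteStream('output_enriched.csv'))
  .on('finish', () => console.log('finished writing file'));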

Is there any way to count the number of blank rows in a CSV file?

I have used the fast-csv module and it only has an option to ignore the empty rows, but I need to count the empty rows.
csv
  .fromPath(importFile.path, {
    headers: true,
    ignoreEmpty: true
  })
  .on("data", function(data) {
    console.log(data)
  })
From the code I am seeing, you are already halfway there. Just initialize a counter at the top and, in the "data" event callback, check whether the row is empty and increment the counter, based on your requirement. Note that ignoreEmpty has to stay false so that empty rows actually reach the "data" handler:
function getEmptyRows(csv) {
  return new Promise((resolve, reject) => {
    let emptyRows = 0;
    csv
      .fromPath(importFile.path, {
        headers: true,
        ignoreEmpty: false // empty rows must reach the "data" handler to be counted
      })
      .on("data", (data) => {
        // a row counts as empty when every parsed value is blank
        const values = Object.values(data);
        if (values.every((value) => String(value).trim() === "")) {
          emptyRows++;
        }
      })
      .on("error", (err) => reject(err))
      .on("end", () => {
        return resolve(emptyRows);
      });
  });
}
And you can use the function above in the following ways:
getEmptyRows(csv).then((count) => {
  // do your operation...
});

// or with async/await
(async () => {
  const count = await getEmptyRows(csv);
  // do your operation...
})();

https requests in a loop, call function after the last request has been made - node

Assume I am calling https multiple times to retrieve data, and I want to call a function formatJsonToLocale at the end of the last request. Is there a way to determine when that request has ended, other than checking for the last element of the array?
let sheetsArray = []

function sheetsAsJsonById (ids) {
  for (let i = 0; i < ids.length; i++) {
    const queryUrl = `sheets.html`
    https
      .get(queryUrl, res => {
        let stream = []
        res
          .on('data', function (data) {
            stream.push(data)
          })
          .on('end', function () {
            let data = Buffer.concat(stream)
            data = JSON.parse(data)
            sheetsArray.push(data['values'])
            formatJsonToLocale(sheetsArray) // <----- call this one after last request
          })
      })
      .on('error', err => {
        console.error(`Error in response: ${err}`)
      })
  }
}
When I call formatJsonToLocale outside of the function, I will have the problem that the former function might not be finished, since https handles things asynchronously.
Any suggestions on how to handle this?
You need to keep track of the async code (https.get) that is executed within the for loop. This can be achieved using promises, as below:
const https = require('https')

function sheetsAsJsonById (ids) {
  let promises = []
  for (let i = 0; i < ids.length; i++) {
    const queryUrl = `sheets.html`
    promises.push(makeHTTPRequest(queryUrl))
  }
  Promise.all(promises).then((sheetArrays) => {
    formatJsonToLocale(sheetArrays) // the resolved values, one entry per request
  })
}

const makeHTTPRequest = (url) => {
  return new Promise((resolve, reject) => {
    https
      .get(url, res => {
        let stream = []
        res
          .on('data', function (data) {
            stream.push(data)
          })
          .on('end', function () {
            let data = Buffer.concat(stream)
            data = JSON.parse(data)
            resolve(data)
          })
      })
      .on('error', err => {
        console.error(`Error in response: ${err}`)
        reject(err) // reject so Promise.all does not hang forever
      })
  })
}
If you want to stick to callbacks, you could use the async.each function from the async node module, as sketched below.
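A minimal sketch of that callback-based alternative (reusing the placeholder URL and the formatJsonToLocale function from the question; this is an illustration, not part of the original answer):

const async = require('async')
const https = require('https')

function sheetsAsJsonById (ids) {
  const sheetsArray = []
  async.each(
    ids,
    (id, done) => {
      const queryUrl = `sheets.html` // placeholder URL from the question
      https
        .get(queryUrl, res => {
          const stream = []
          res
            .on('data', chunk => stream.push(chunk))
            .on('end', () => {
              const data = JSON.parse(Buffer.concat(stream))
              sheetsArray.push(data['values'])
              done()
            })
        })
        .on('error', done)
    },
    err => {
      // runs once, after every request has finished or on the first error
      if (err) return console.error(`Error in response: ${err}`)
      formatJsonToLocale(sheetsArray)
    }
  )
}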
Wrap https.get in a Promise that resolves on the end event and rejects on any error. Now you can await the promise inside the loop (the function has to be declared async for that) and call formatJsonToLocale once the for loop is done:
let sheetsArray = []

async function sheetsAsJsonById(ids) {
  for (let i = 0; i < ids.length; i++) {
    const queryUrl = `sheets.html`
    await new Promise((resolve, reject) => {
      https
        .get(queryUrl, res => {
          let stream = []
          res
            .on('data', function(data) {
              stream.push(data)
            })
            .on('end', function() {
              let data = Buffer.concat(stream)
              data = JSON.parse(data)
              sheetsArray.push(data['values'])
              resolve();
            })
        })
        .on('error', err => {
          reject(err);
        })
    })
  }
  formatJsonToLocale(sheetsArray)
}
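Since sheetsAsJsonById is now an async function, the caller would typically handle a possible rejection, for example:

sheetsAsJsonById(ids)
  .catch(err => console.error(`Error in response: ${err}`))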
