Read and write to csv file with Node.js fast-csv library - javascript

I may be lacking some in-depth understanding of streams in general, but I would like to know how to do what I need efficiently.
I want to read a CSV file, make a query to a database (or API) for each row and attach the returned data to it, and then write the row with the attached data to a new CSV file. I am using the fast-csv Node library for this.
Here is my implementation:
const fs = require("fs");
const csv = require("fast-csv");
const delay = t => new Promise(resolve => setTimeout(resolve, t));
const asyncFunction = async (row, csvStream) => {
// Imitate some stuff with database
await delay(1200);
row.data = "data";
csvStream.write(row);
};
const array = [];
const csvStream = csv.format({ headers: true });
const writeStream = fs.createWriteStream("output.csv");
csvStream.pipe(writeStream).on("finish", () => {
console.log("End of writing");
});
fs.createReadStream("input.csv")
.pipe(csv.parse({ headers: true }))
.transform(async function(row, next) {
array.push(asyncFunction(row, csvStream));
next();
})
.on("finish", function() {
console.log("finished reading file");
//Wait for all database requests and writings to be finished to close write stream
Promise.all(array).then(() => {
csvStream.end();
console.log("finished writing file");
});
});
In particular, I would like to know whether there are ways to optimize what I am doing here, because I feel I am missing something important about how this library can be used for this type of case.
Regards,
Rokas

I was able to find a solution in the fast-csv issues section. A helpful person, doug-martin, provided this gist on how you can do this kind of operation efficiently via a Transform stream:
const path = require('path');
const fs = require('fs');
const { Transform } = require('stream');
const csv = require('fast-csv');
class PersistStream extends Transform {
constructor(args) {
super({ objectMode: true, ...(args || {}) });
this.batchSize = 100;
this.batch = [];
if (args && args.batchSize) {
this.batchSize = args.batchSize;
}
}
_transform(record, encoding, callback) {
this.batch.push(record);
if (this.shouldSaveBatch) {
// we have hit our batch size, so process the records as a batch
this.processRecords()
// we successfully processed the records so callback
.then(() => callback())
// An error occurred!
.catch(err => callback(err));
return;
}
// we shouldn't persist yet, so just continue
callback();
}
_flush(callback) {
if (this.batch.length) {
// handle any leftover records that were not persisted because the batch was too small
this.processRecords()
// we successfully processed the records so callback
.then(() => callback())
// An error occurred!
.catch(err => callback(err));
return;
}
// no records to persist so just call callback
callback();
}
pushRecords(records) {
// emit each record for downstream processing
records.forEach(r => this.push(r));
}
get shouldSaveBatch() {
// this could be any check; for this example it is the record count
return this.batch.length >= this.batchSize;
}
async processRecords() {
// save the records
const records = await this.saveBatch();
// be sure to emit them
this.pushRecords(records);
return records;
}
async saveBatch() {
const records = this.batch;
this.batch = [];
console.log(`Saving batch [noOfRecords=${records.length}]`);
// This is where you should save/update/delete the records
return new Promise(res => {
setTimeout(() => res(records), 100);
});
}
}
const processCsv = ({ file, batchSize }) =>
new Promise((res, rej) => {
let recordCount = 0;
fs.createReadStream(file)
// catch file read errors
.on('error', err => rej(err))
.pipe(csv.parse({ headers: true }))
// catch any parsing errors
.on('error', err => rej(err))
// pipe into our processing stream
.pipe(new PersistStream({ batchSize }))
.on('error', err => rej(err))
.on('data', () => {
recordCount += 1;
})
.on('end', () => res({ event: 'end', recordCount }));
});
const file = path.resolve(__dirname, `batch_write.csv`);
// process the file in batches of 5 records
processCsv({ file, batchSize: 5 })
.then(({ event, recordCount }) => {
console.log(`Done Processing [event=${event}] [recordCount=${recordCount}]`);
})
.catch(e => {
console.error(e.stack);
});
https://gist.github.com/doug-martin/b434a04f164c81da82165f4adcb144ec
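To connect this back to the original goal (attach data to each row and write a new CSV), the records that PersistStream pushes downstream can be piped straight into csv.format. A minimal wiring sketch of my own, assuming the database/API enrichment happens inside saveBatch and that enriched.csv is just a placeholder name:
// My own wiring sketch (not part of the gist): pipe the enriched records that
// PersistStream pushes downstream into csv.format to produce the new CSV file.
const csvOut = csv.format({ headers: true });
csvOut
.pipe(fs.createWriteStream(path.resolve(__dirname, 'enriched.csv')))
.on('finish', () => console.log('End of writing'));
fs.createReadStream(path.resolve(__dirname, 'batch_write.csv'))
.pipe(csv.parse({ headers: true }))
.pipe(new PersistStream({ batchSize: 100 }))
.pipe(csvOut);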

Related

Unit test asynchronous generator function in Jest

I would like to write a unit test for a generator function but I am not able to pass a properly mocked read stream (ReadStream) object.
Testable function:
public async *readChunks(file: string, chunkSize: number): AsyncIterableIterator<Buffer> {
if (!this.cwd) throw new Error('Working directory is not set!');
const readStream: ReadStream = fs.createReadStream(path.join(this.cwd, file), {
highWaterMark: chunkSize
});
for await (const chunk of readStream) yield chunk;
}
Failed implementation (I tried different ways of mocking createReadStream, but without success):
describe('Work Dir Utils', () => {
jest.mock('fs');
let workDirUtils: WorkDirUtils;
beforeEach(() => {
(os.tmpdir as jest.Mock).mockReturnValue('/tmp');
(fs.mkdtempSync as jest.Mock).mockReturnValue('/tmp/folder/pref-rand');
(fs.createReadStream as jest.Mock).mockReturnValue({});
workDirUtils = new WorkDirUtils();
workDirUtils.createTempDir('pref-');
});
afterEach(() => {
jest.clearAllMocks();
});
it('should read chunks of a file using generator', async () => {
for await (const chunk of workDirUtils.readChunks(
path.join(__dirname, './fixtures/manifest.ts'),
1024 * 1024 * 1024
)) {
expect(chunk).toBeInstanceOf(Buffer);
}
});
});
Any suggestions?
Actually, it turned out to be quite easy. In the end, I decided not to withdraw the question; maybe it will be useful for others.
jest.mock('fs');
jest.mock('tar');
jest.mock('os');
let workDirUtils: WorkDirUtils;
describe('Work Dir Utils', () => {
beforeEach(() => {
(os.tmpdir as jest.Mock).mockReturnValue('/tmp');
(fs.mkdtempSync as jest.Mock).mockReturnValue('/tmp/folder/pref-rand');
(fs.existsSync as jest.Mock).mockReturnValue(true);
(fs.createReadStream as jest.Mock).mockReturnValue(Readable.from([path.join(__dirname, './fixtures/manifest.ts')]));
workDirUtils = new WorkDirUtils();
workDirUtils.createTempDir('pref-');
});
afterEach(() => {
jest.clearAllMocks();
});
it('should throw an error when the working directory is not set', async () => {
const workdirUtilsMock = new WorkDirUtils();
const generator = workdirUtilsMock.readChunks('file-path', 5000);
await expect(generator.next()).rejects.toThrow('Working directory is not set!');
});
it('should read chunks of a file using generator', async () => {
const generator = workDirUtils.readChunks(path.join(__dirname, './fixtures/manifest.ts'), 1024 * 1024 * 1024);
const response = await generator.next();
expect(response).toBeInstanceOf(Object);
expect(response.value).toEqual(path.join(__dirname, './fixtures/manifest.ts'));
});
});
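If you want the mocked stream to yield Buffer chunks the way a real fs.createReadStream would, the same trick works by handing Readable.from a Buffer. A small variation of my own that could be added inside the same describe block (the file name is arbitrary, since createReadStream is mocked):
it('should read Buffer chunks of a file using generator', async () => {
// override the default mock so the stream yields a single Buffer chunk
(fs.createReadStream as jest.Mock).mockReturnValue(Readable.from([Buffer.from('chunk-1')]));
for await (const chunk of workDirUtils.readChunks('any-file.bin', 1024)) {
expect(chunk).toBeInstanceOf(Buffer);
expect(chunk.toString()).toEqual('chunk-1');
}
});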

createReadStream wait for first to finish before starting second createReadStream

From my understanding, createReadStream is asynchronous and there is no equivalent synchronous method. This leads us to the problem I'm having today. As you can see in my code, I have two createReadStream calls, and the second one depends on the first: it absolutely needs to run in order so that stationIds gets filled with the desired values. My problem is that stationIds is empty, so the second createReadStream doesn't do anything at all. How can I run these two createReadStreams in the order I want?
// Read content of files
csvDataFiles.forEach(file => {
const data = []
fs.createReadStream(directory + file)
.pipe(csv())
.on('data', function (row) {
data.push(row)
})
.on('end', function() {
const object = {}
let stationId = parseInt(file.substr(0, file.indexOf('.')))
stationIds.push(stationId)
object[stationId] = {}
object[stationId]['length'] = data.length
object[stationId]['data'] = data
stationsData.push(object)
})
})
// Read stations
fs.createReadStream(directory + 'asfdsfsd.csv')
.pipe(csv({
skipLines: 3
}))
.on('data', function (row) {
if (stationIds.includes(parseInt(row['Station ID']))) {
// console.log(row)
stations.push(row)
}
})
.on('end', function () {
// console.log(stations)
})
You could wrap the processing of the streams in a promise and wait until all data is read. After that you can process the final file. This still needs some error handling, but should give you something to start with:
// Read content of files
async function processStationsCsv(directory, csvDataFiles) {
let stationsIds = [];
let stationsData = [];
for (const csvFile of csvDataFiles) {
await readStationDataFromFile(directory + csvFile, stationsIds, stationsData);
}
let stations = [];
await new Promise(resolve => {
// Read stations
fs.createReadStream(directory + 'asfdsfsd.csv')
.pipe(csv({
skipLines: 3
}))
.on('data', row => {
if (stationsIds.includes(parseInt(row['Station ID']))) {
stations.push(row)
}
})
.on('end', () => {
resolve();
})
})
console.log(stations);
}
function readStationDataFromFile(filePath, stationIds, stationsData) {
return new Promise(resolve => {
const data = []
// derive the station id from the file name, e.g. "123.csv" -> 123
const fileName = filePath.split('/').pop()
fs.createReadStream(filePath)
.pipe(csv())
.on('data', function (row) {
data.push(row)
})
.on('end', function () {
const object = {}
let stationId = parseInt(fileName.substr(0, fileName.indexOf('.')))
stationIds.push(stationId)
object[stationId] = {}
object[stationId]['length'] = data.length
object[stationId]['data'] = data
stationsData.push(object)
resolve();
})
});
}
// call it with the directory path and the array of csvDataFiles-paths
processStationsCsv(directory, csvDataFiles);
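For the error handling the answer mentions is still missing, one option (a sketch of my own, using a hypothetical readCsvRows helper) is to let each promise reject on stream errors and catch that where processStationsCsv is called:
// Reusable helper: read any CSV into an array of rows, rejecting on stream errors
function readCsvRows(filePath, csvOptions = {}) {
return new Promise((resolve, reject) => {
const rows = []
fs.createReadStream(filePath)
.on('error', reject) // file could not be opened or read
.pipe(csv(csvOptions))
.on('error', reject) // a row failed to parse
.on('data', row => rows.push(row))
.on('end', () => resolve(rows))
})
}
// With helpers like this used inside processStationsCsv, a failure propagates to the caller:
processStationsCsv(directory, csvDataFiles)
.catch(err => console.error('Processing failed:', err))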

How to return my CSV data from my service - async/await issue?

I am attempting to load some CSV data in my API so that I can manipulate it and pass it through to my front end; however, I am having a few issues returning the data.
I am using fast-csv to do the parsing here.
service.js
const fs = require('fs');
const csv = require('fast-csv');
module.exports.getFileContents = (filepath) => {
let data = [];
fs.createReadStream(filepath)
.pipe(csv.parse({ headers: true }))
.on('error', error => console.error(error))
.on('data', row => data.push(row))
.on('end', () => {
console.log(data) // This will print the full CSV file fine
return data;
});
};
routes.js
router.get('/data/:filename', (req, res) => {
const file = FS.getFileContents(testUrl + '/' + req.params.filename + '.csv');
console.log(file); // This prints 'undefined'
res.send(file);
});
I can print out the CSV contents fine from the service, but I just get 'undefined' from the actual routes. Can somebody please point out what I'm missing?
This is a common problem with JavaScript code; the issue is in the following:
.on('end', () => {
console.log(data);
return data;
});
Your on-end handler is an anonymous callback function (because of () =>), so when you return data, you are returning data out of your on-end handler callback function. You are not returning data out of your enclosing getFileContents() function.
Here's a typical way to write this kind of code:
const getFileContents = async (filepath) => {
const data = [];
return new Promise(function(resolve, reject) {
fs.createReadStream(filepath)
.pipe(csv.parse({ headers: true }))
.on('error', error => reject(error))
.on('data', row => data.push(row))
.on('end', () => {
console.log(data);
resolve(data);
});
});
}
And then, call it as follows, though this must be within an async function:
const data = await getFileContents('games.csv');
What's happened here is as follows:
your getFileContents is now async and returns a promise
the CSV data will be available when resolve(data) is executed
the caller can await the fulfillment/resolution of this promise to get the data
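Since the promise rejects when the stream emits an error, the route handler that awaits it can catch failures with an ordinary try/catch. A sketch based on the question's routes.js (the 500 response is my own assumption):
router.get('/data/:filename', async (req, res) => {
try {
const data = await getFileContents(testUrl + '/' + req.params.filename + '.csv');
res.send(data);
} catch (err) {
// the read/parse stream emitted 'error', so the promise rejected
res.status(500).send('Could not read CSV file');
}
});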
You could just create a Promise in the service and return it. Once the job is done, resolve it. The caller can then wait on the returned Promise until it resolves.
service.js
const fs = require('fs');
const csv = require('fast-csv');
module.exports.getFileContents = (filepath) => {
let data = [];
return new Promise((resolve) => {
fs.createReadStream(filepath)
.pipe(csv.parse({ headers: true }))
.on('error', error => console.error(error))
.on('data', row => data.push(row))
.on('end', () => {
resolve(data);
});
});
};
routes.js
router.get('/data/:filename', async (req, res) => {
const file = await FS.getFileContents(testUrl + '/' + req.params.filename + '.csv');
console.log(file); // This prints only after it is resolved
res.send(file);
});

Firebase Google Cloud Function: createReadStream results in empty file

I am trying to process a video file (stored in Google Firebase Storage) through a Google Cloud Function. I have working code that downloads the entire video file into the Node.js Google Cloud Function: await bucket.file(filePath).download({ destination: tempFile }).
But the goal is only to read the framerate, so the headers of the video file should suffice. However, createReadStream gives me an empty tempFile. Any advice is much appreciated!
exports.checkFramerate = functions.region('europe-west1').storage.object().onFinalize(async (object, context) => {
const bucket = admin.storage().bucket(object.bucket); // Bucket class
const filePath = object.name; // videos/xbEXdMNFb1Blbd9r2E8m/comp_test.mp4
const fileName = filePath.split('/').pop(); // comp_test.mp4
const bucketDir = path.dirname(filePath); // videos/xbEXdMNFb1Blbd9r2E8m
const tempFile = path.join(os.tmpdir(), 'temp.mp4')
fs.closeSync(fs.openSync(tempFile, 'w'))
console.log("tempFile size1", fs.statSync(tempFile).size)
// await bucket.file(filePath).download({ destination: tempFile }); // this works: tempFile size2 = 3180152
await bucket.file(filePath).createReadStream({ // this does not work: tempFile size2 = 0
start: 10000,
end: 20000
})
.on('error', function(err) {console.log(err)})
.pipe(fs.createWriteStream(tempFile));
console.log("tempFile size2", fs.statSync(tempFile).size)
mi(tempFile).then(data => {
console.log("frameRate", data[0].general.frame_rate[0])
return data[0].general.frame_rate[0];
}).catch(e => {console.error(e)});
});
I even tried implementing the example from https://googleapis.dev/nodejs/storage/latest/File.html#createReadStream, but to no avail. remoteFile.download works beautifully, but remoteFile.createReadStream gives me empty files...
const remoteFile = bucket.file(filePath);
const localFilename = tempFile;
remoteFile.createReadStream()
.on('error', function(err) {})
.on('response', function(response) {})
.on('end', function() {})
.pipe(fs.createWriteStream(localFilename));
fs.stat(localFilename, (err, stats) => {
if (err) {console.log(err)}
return console.log("stats async",stats.size)
})
As mentioned, a promise should be used. Here is an example of reading a JSON file:
let buf = '';
const loadData = async () => {
return await new Promise((resolve, reject) => {
storage.bucket('bucket-name').file('test-config.json')
.createReadStream()
.on('error', reject)
.on('data', function(d) {
buf += d;
}).on('end', function() {
resolve(buf)
});
})
}
const data = await loadData()
Your problem is that the stream API isn't promisified. So the await does nothing, your function continues before the stream is piped, and the file is still zero-length when you stat it the second time.
The download method works just fine because it returns a Promise.
This answer outlines the general approach you need to take. In summary though, you basically want the section of your code that does the piping to read like this:
const stream = bucket.file(filePath).createReadStream({
start: 10000,
end: 20000
})
.pipe(fs.createWriteStream(tempFile));
await new Promise((resolve, reject) => {
stream.on('finish', resolve);
stream.on('error', reject);
});
console.log("tempFile size2", fs.statSync(tempFile).size)
Your function will then wait until the finish event occurs when the piping is complete and the stream is closed. Obviously you probably want to do something more clever with the error handler too, but this is the general form of what you need.
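As an alternative (my suggestion, assuming your Cloud Functions runtime is on Node.js 15 or newer, where stream/promises is available), pipeline promisifies the piping and error propagation in one call:
const { pipeline } = require('stream/promises');
// Waits until the write stream has finished, and rejects if either stream errors
await pipeline(
bucket.file(filePath).createReadStream({ start: 10000, end: 20000 }),
fs.createWriteStream(tempFile)
);
console.log("tempFile size2", fs.statSync(tempFile).size);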

How to write an async function that resolves when `data` event emitter fires

I am using node-serialport to communicate with a piece of hardware. It just writes a command and receives a response.
https://serialport.io/docs/en/api-parsers-overview
The following code works:
const port = new SerialPort(path);
const parser = port.pipe(new Readline({ delimiter: '\r', encoding: 'ascii' }));
const requestArray = [];
parser.on('data', (data) => {
// get first item in array
const request = requestArray[0];
// remove first item
requestArray.shift();
// resolve promise
request.promise.resolve(data);
});
export const getFirmwareVersion = async () => {
let resolvePromise;
let rejectPromise;
const promise = new Promise((resolve, reject) => {
resolvePromise = resolve;
rejectPromise = reject;
});
const title = 'getFirmwareVersion';
const cmd = 'V\r';
requestArray.push({
title,
cmd,
promise: {
resolve: resolvePromise,
reject: rejectPromise
}
});
await v2Port.write(cmd);
return promise;
};
Then from my app (which is written in electron/react) I can call the function:
<Button onClick={async () => {
let data = await _api.getFirmwareVersion();
console.log('done waiting...');
console.log(data);
}}>
Click Me
</Button>
Is there any way I can refactor this code to make it more succinct?
Is there a way to get the Promise from the async function, rather than having to make a new Promise?
Is there a way to tap into the Transform Stream that already exists and pipe the Promise in there somehow?
I'm also new to async/await, and wanted to avoid using callbacks, especially in the React/Redux side of things.
I aim to have a lot of these endpoints for the api (i.e. getFirmwareVersion, getTemperature, etc...). So I want to make the code as concise as possible. I don't want the UI to have any underlying knowledge of how the API is getting the data. It just needs to request it like any other API and wait for a response.
Oh, I think I get it. The parser is receiving data constantly, so when a request comes in, you wait for the next data event and resolve with whatever arrives. I suggest you write an intermediate class.
Like this:
const SerialPort = require('serialport')
const Readline = require('@serialport/parser-readline')
const { EventEmitter } = require('events');
class SerialPortListener extends EventEmitter {
constructor(path) {
super();
this.serialPortPath = path;
}
init() {
this.serialPort = new SerialPort(this.serialPortPath);
const parser = this.serialPort.pipe(new Readline({ delimiter: '\r', encoding: 'ascii' }));
parser.on('data', data => this.emit('data', data));
}
}
Then you could modify the getFirmwareVersion like this:
const serialPortListener = new SerialPortListener(path);
serialPortListener.init();
export const getFirmwareVersion = () => {
return new Promise((resolve, reject) => {
// resolve with the next line the parser emits after the command is written
serialPortListener.once('data', data => resolve(data));
try {
const cmd = 'V\r';
v2Port.write(cmd);
} catch (ex) {
reject(ex);
}
});
};
Based on help from Mehmet, here is what I ended up with:
const _port = new SerialPort(path);
const _parser = _port.pipe(new Readline({ delimiter: '\r', encoding: 'ascii' }));
const waitForData = async () => {
return new Promise((resolve, reject) => {
const timeoutId = setTimeout(() => reject('Write Timeout'), 500);
_parser.once('data', (data) => {
clearTimeout(timeoutId);
resolve(data);
});
});
};
const createAPIFunction = (cmdTemplate, validationString) => {
return async (config) => {
try {
// replace {key} in template with config[key] props
const cmd = cmdTemplate.replace(/{(\w+)}/g, (_, key) => {
return config[key];
});
_port.write(cmd + '\r');
const data = await waitForData();
// validate data
if (data.startsWith(validationString)) {
// is valid
return data;
} else {
// invalid data
throw new Error('Invalid Data Returned');
}
} catch (err) {
throw err;
}
};
};
export const getFirmwareVersion = createAPIFunction('V', 'V1');
export const enableSampling = createAPIFunction('G1{scope}', 'G11');
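A usage sketch (the response strings are made up; they just need to start with the validation prefixes 'V1' and 'G11'):
// Somewhere inside an async function in the React/Redux layer
try {
const version = await getFirmwareVersion(); // writes 'V\r', waits for a line starting with 'V1'
const ack = await enableSampling({ scope: 1 }); // fills {scope} in the template, writes 'G11\r'
console.log(version, ack);
} catch (err) {
console.error(err); // 'Write Timeout' or Error('Invalid Data Returned')
}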
