Correctly use fs write inside createReadStream on data - javascript

I am attempting to combine n binary files into a single file in JavaScript using streams. I have a write stream that is passed to the following function. I notice that the total number of bytes written does not match the actual number of bytes in the file, and is also not consistent across multiple runs.
After reading the documentation, I noticed that the write call returns a promise and is not safe to call again until the previous promise is fulfilled. I am not sure how to make readStream.on('data', function (chunk) {...}) use await, as the callback is not async and I get the error await is only valid in async function.
async function concatFile (filename, fileHandle) {
  return new Promise((resolve, reject) => {
    const readStream = fs.createReadStream(filename, { highWaterMark: 1024 })
    readStream.on('data', function (chunk) {
      // read
      fileHandle.write(chunk)
    })
    readStream.on('error', e => {
      reject(e)
    })
    readStream.on('close', function (err) {
      // close
    })
    readStream.on('end', function () {
      // done
      readStream.close()
      resolve()
    })
  }) // end of Promise
}
I am using the above function in the following snippet:
const fileWriter = fs.createWriteStream('concatBins.bin', { flags: 'w' })
let writtenLen = 0
fileList = {}
fileList[0] = "foo.bin"
fileList[1] = "bar.bin"
for (const [key, value] of Object.entries(fileList)) {
  await concatFile(value, fileWriter)
  writtenLen = fileWriter.bytesWritten
  console.log('bytes written ' + writtenLen)
}

You can pause the readStream until the write is done to avoid getting further data events, and then resume it when the write has finished. You can also declare the .on('data', ...) callback as async if you want to use await, but you still have to pause the readStream yourself, because async/await won't pause it for you.
// stream write that returns a promise when OK to proceed
// with more writes
function write(stream, data) {
  return new Promise((resolve, reject) => {
    if (stream.write(data)) {
      resolve();
    } else {
      // need to wait for drain event
      stream.once('drain', resolve);
    }
  });
}
async function concatFile (filename, writeStream) {
  return new Promise((resolve, reject) => {
    const readStream = fs.createReadStream(filename, { highWaterMark: 1024 });
    let paused = false;
    let ended = false;
    readStream.on('data', async function(chunk) {
      // read
      try {
        readStream.pause();
        paused = true;
        await write(writeStream, chunk);
      } catch(e) {
        // have to decide what you're doing if you get a write error here
        reject(e);
      } finally {
        paused = false;
        readStream.resume();
        if (ended) {
          readStream.emit("finalEnd");
        }
      }
    });
    readStream.on('error', e => {
      reject(e);
    });
    readStream.on('close', function () {
      // close
    });
    readStream.on('end', function () {
      // done
      ended = true;
      if (!paused) {
        readStream.emit('finalEnd');
      }
    });
    // listen for our real end event
    readStream.on('finalEnd', () => {
      readStream.close();
      resolve();
    });
  }) // end of Promise
}
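As a side note, on Node versions where readable streams are async iterable (roughly Node 10+, stable in 12), a shorter sketch of the same idea is possible. This is only an alternative under that assumption, reusing the write() helper above to handle backpressure on the write side:
async function concatFile (filename, writeStream) {
  const readStream = fs.createReadStream(filename, { highWaterMark: 1024 });
  // for await...of pauses the read side for us between chunks
  for await (const chunk of readStream) {
    await write(writeStream, chunk);
  }
}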

Related

Node.JS: how to wait for a process to finish before continuing?

I am new to Node and stuck with this issue. Here's the file:
I am running the 'startProcess' function and I want to run 'downloadFiles', wait until it has completed and saved the files, and only then execute the code after it.
This code always ends up running 'runVideoUploadEngine' even before the download has completed.
const downloadAndSaveFiles = async ({ url, dir }) => {
  try {
    https.get(url, (res) => {
      // File will be stored at this path
      console.log('dir: ', dir);
      var filePath = fs.createWriteStream(dir);
      res.pipe(filePath);
      filePath.on('finish', () => {
        filePath.close();
        console.log('Download Completed');
      });
    });
    return true;
  } catch (e) {
    console.log(e);
    throw e;
  }
};

const downloadFiles = async ({ data }) => {
  try {
    mediaUrl = data.mediaUrl;
    thumbnailUrl = data.thumbnailUrl;
    const mediaExt = path.extname(mediaUrl);
    const thumbExt = path.extname(thumbnailUrl);
    mediaDir = `${__dirname}/temp/${'media'}${mediaExt}`;
    thumbDir = `${__dirname}/temp/${'thumb'}${thumbExt}`;
    await downloadAndSaveFiles({ url: mediaUrl, dir: mediaDir });
    await downloadAndSaveFiles({ url: thumbnailUrl, dir: thumbDir });
    return { mediaDir, thumbDir };
  } catch (e) {
    console.log(e);
    throw e;
  }
};
module.exports = {
  startProcess: async ({ message }) => {
    //check if message is proper
    data = JSON.parse(message.Body);
    //download video and thumbnail and store in temp.
    console.log('starting download..');
    const { mediaDir, thumbDir } = await downloadFiles({ data });
    console.log('dir:- ', mediaDir, thumbDir);
    pageAccessToken = 'myRandomToken';
    _pageId = 'myRandomPageID';
    console.log('running engine');
    await runVideoUploadEngine({ pageAccessToken, _pageId, mediaDir, thumbDir });
    //start videoUploadEngine
    //on success: delete video/thumbnail
  },
};
What am I doing wrong?
downloadAndSaveFiles returns a promise (because the function is async), but that promise doesn't "wait" for https.get or fs.createWriteStream to finish, and therefore none of the code that calls downloadAndSaveFiles can properly "wait" either.
If you interact with callback-based APIs you cannot really use async/await directly. You have to create the promise manually. For example:
const downloadAndSaveFiles = ({ url, dir }) => {
  return new Promise((resolve, reject) => {
    // TODO: Error handling
    https.get(url, (res) => {
      // File will be stored at this path
      console.log('dir: ', dir);
      var filePath = fs.createWriteStream(dir);
      filePath.on('finish', () => {
        filePath.close();
        console.log('Download Completed');
        resolve(); // resolve promise once everything is done
      });
      res.pipe(filePath);
    });
  });
};
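One possible way to fill in that TODO, assuming you want both request errors and file-system errors to reject the promise (this is a sketch, not the only option):
const downloadAndSaveFiles = ({ url, dir }) => {
  return new Promise((resolve, reject) => {
    https.get(url, (res) => {
      const filePath = fs.createWriteStream(dir);
      filePath.on('finish', () => {
        filePath.close();
        resolve();
      });
      filePath.on('error', reject); // e.g. the target directory does not exist
      res.pipe(filePath);
    }).on('error', reject);         // e.g. DNS failure or connection refused
  });
};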

How to make reading a file using Stream work Synchronously in Node js?

Here is my code. I want it to work in a way that readerstream1 displays all the contents of its file first, and only then moves on to readerstream2, without changing highWaterMark. In simple words, I want it to work synchronously (one file after the other).
Thanks
let readerstream1 = fs.createReadStream('shamoon.txt', { highWaterMark: 8 });
let readerstream2 = fs.createReadStream('shamoon1.txt', { highWaterMark: 8 });
let writestream = fs.createWriteStream('put.txt');
readerstream1.on('data', function(chunk) {
  console.log("ReadStream1 Chunk has been received " + chunk)
})
readerstream2.on('data', function(chunk) {
  console.log("ReadStream2 Chunk has been received " + chunk)
})
You can wrap the stream's processing in a promise and listen for the end event, where you resolve the promise. After awaiting this promise you can continue with the next one. Something like this, which still needs more error handling, should give you a start:
function getReadStreamPromise(filePath, opts) {
  return new Promise((resolve, reject) => {
    const readerstream = fs.createReadStream(filePath, opts);
    readerstream.on('data', (chunk) => {
      // handle chunk
    })
    readerstream.on('error', (err) => {
      reject(err);
    })
    readerstream.on('end', () => {
      resolve();
    })
  })
}

async function processStreams() {
  await getReadStreamPromise('shamoon.txt', { highWaterMark: 8 });
  await getReadStreamPromise('shamoon1.txt', { highWaterMark: 8 });
}
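If the goal is still to funnel both files into put.txt, a minimal sketch along the same lines could pass the shared writestream from the question into the helper and pipe into it with end: false, so the destination stays open between files (this assumes that is the intended output):
function pipeFilePromise(filePath, writestream, opts) {
  return new Promise((resolve, reject) => {
    const readerstream = fs.createReadStream(filePath, opts);
    readerstream.on('error', reject);
    readerstream.on('end', resolve);
    // end: false keeps put.txt open so the next file can be appended
    readerstream.pipe(writestream, { end: false });
  });
}

async function processStreams() {
  await pipeFilePromise('shamoon.txt', writestream, { highWaterMark: 8 });
  await pipeFilePromise('shamoon1.txt', writestream, { highWaterMark: 8 });
  writestream.end();
}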
Note that you could simply use fs.promises.readFile to read the files asynchronously but sequentially:
async function processStreams() {
  const content = await fs.promises.readFile('shamoon.txt');
  const content2 = await fs.promises.readFile('shamoon1.txt');
  await fs.promises.writeFile('put.txt', "someContent");
}
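If the "someContent" placeholder is meant to be the two files' contents, a small variant (an assumption about the intent) would concatenate the buffers before writing:
async function processStreams() {
  const content = await fs.promises.readFile('shamoon.txt');
  const content2 = await fs.promises.readFile('shamoon1.txt');
  // Buffer.concat joins the two file buffers before writing them out
  await fs.promises.writeFile('put.txt', Buffer.concat([content, content2]));
}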

Promisify function with events

This is a promisified spawn function:
async function aspawn(cmd, args){
  return new Promise((resolve, reject) => {
    const proc = spawn(cmd, args);
    proc.stderr.on('data', data => {
      console.error('err', data.toString());
    });
    proc.stdout.on('data', data => {
      console.error('stdout', data.toString());
    });
    proc.on('close', code => {
      console.error('closed with code', code);
      resolve();
    });
  });
}
I was wondering if it's possible to make it less indented.
Using async iterators and the once event-emitter helper, you could write it like this:
const { spawn } = require('child_process')
const { once } = require('events')

aspawn1('cat', ['README.md'])
  .then(() => aspawn1('cat', ['FOO.md'])) // error stream
  .then(() => aspawn2('cat', ['README.md']))

async function aspawn1 (cmd, args) {
  try {
    const proc = spawn(cmd, args)
    // in any case you can add events to `proc`
    // consume the stream
    for await (const chunk of proc.stdout) {
      console.log('>>> ' + chunk.length)
    }
    for await (const chunk of proc.stderr) {
      console.log('err >>> ' + chunk.length)
    }
    // the stream is ended and the spawn as well
  } catch (err) {
    // if you always need to return a resolved promise
    console.log('error happened', err)
  }
}

// Since Node v11.13.0 / v10.16.0 you may write that function like this
// to have a strict "fire and forget" spawn:
function aspawn2 (cmd, args) {
  return once(spawn(cmd, args), 'close')
}
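A hedged variant of aspawn2 (a new helper, not from the original answer), if you also want the promise to reject on a non-zero exit code; once() resolves with the array of event arguments, which for 'close' is [code, signal]:
async function aspawn3 (cmd, args) {
  // 'close' is emitted with (code, signal); destructure the exit code
  const [code] = await once(spawn(cmd, args, { stdio: 'inherit' }), 'close')
  if (code !== 0) throw new Error(`${cmd} exited with code ${code}`)
}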

Node JS async/await with multiple fs.writeFile using through2 (Gulp/Vinyl)

I'm using through2 to generate multiple files from a Gulp stream. I'm using Node.js 10.6.0, so I thought I'd make full use of async/await, but I'm not fully understanding the mechanics yet. Currently the through2 done() callback is being fired before all files have been written.
Here's what I have (simplified) - note that I'm not returning the stream at the end as there is no need to.
async function createDirectory(pathDir) {
  return new Promise((resolve, reject) => {
    mkdirp(pathDir, (err) => {
      if (err) reject(err);
      else resolve();
    });
  });
}

async function writeFile(outputFilePath, outputFileContent) {
  return new Promise((resolve, reject) => {
    fs.writeFile(outputFilePath, outputFileContent, (err) => {
      if (err) reject(err);
      else resolve();
    });
  });
}

async function doWriteFile(outputFolderPath, outputFilePath, outputContent) {
  await createDirectory(outputFolderPath);
  await writeFile(outputFilePath, outputContent);
}

async function doGenerateVariant(data, variantArr) {
  for (const variant of variantArr) {
    /* Do a load of stuff */
    const variantOutputFolderPath = blah;
    const variantOutputFilePath = blah;
    const variantOutputContent = blah;
    await doWriteFile(variantOutputFolderPath, variantOutputFilePath, variantOutputContent);
  }
}
const generateVariant = () => {
  return through.obj((file, enc, done) => {
    const data = JSON.parse(file.contents.toString());
    /* Do a load of stuff */
    const { variant } = data;
    const variantArr = Object.values(variant);
    doGenerateVariant(data, variantArr);
    return done();
  });
};
This doesn't work, as done() gets called before all the files have been written. I'm guessing I'm missing a return or two, but nothing I do seems to be working.
If I pass done() into doGenerateVariant and call it after doWriteFile, everything works as expected, but I know this isn't correct.
You need to wait for doGenerateVariant to do its job before calling done. Remember that an async function always returns a Promise. So you could do it this way:
const generateVariant = () => {
  return through.obj((file, enc, done) => {
    const data = JSON.parse(file.contents.toString());
    /* Do a load of stuff */
    const { variant } = data;
    const variantArr = Object.values(variant);
    doGenerateVariant(data, variantArr).then(() => done());
  });
};
Or, using async/await:
const generateVariant = () => {
  return through.obj(async (file, enc, done) => {
    const data = JSON.parse(file.contents.toString());
    /* Do a load of stuff */
    const { variant } = data;
    const variantArr = Object.values(variant);
    await doGenerateVariant(data, variantArr);
    done();
  });
};
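As a hedged refinement (not part of the original answer), you may also want failures inside doGenerateVariant to be forwarded to the stream instead of becoming unhandled rejections; through2 follows the usual transform-stream convention where done accepts an error as its first argument:
const generateVariant = () => {
  return through.obj(async (file, enc, done) => {
    try {
      const data = JSON.parse(file.contents.toString());
      /* Do a load of stuff */
      const variantArr = Object.values(data.variant);
      await doGenerateVariant(data, variantArr);
      done();
    } catch (err) {
      // report the failure through the stream so Gulp can surface it
      done(err);
    }
  });
};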

Run callback function after forEach is done

In the project, I have a loop going through a list of urls. It downloads a file from every url and does some post-processing on the downloaded file.
After all of this is done (both the download and the post-processing), I want to execute a callback function. Because the post-processing includes a streaming task, it has a close event. If the last item could be identified, I could pass the callback function to its close event. However, since the loop is async, I can't track which item finishes last.
For now, I use a 5 second timeout to make sure the callback is executed after the whole process. Obviously, this is not sustainable. What's a good way to handle this?
Loop code:
exports.processArray = (items, process, callback) => {
  var todo = items.concat();
  setTimeout(function() {
    process(todo.shift());
    if(todo.length > 0) {
      // execute download and post process each second
      // however it doesn't guarantee one starts after the previous one is done
      setTimeout(arguments.callee, 1000);
    } else {
      setTimeout(() => {callback();}, 5000);
    }
  }, 1000);
};
processArray(
  // First param, the array
  urlList,
  // Second param, download and post process
  (url) => {
    if(url.startsWith('http')) {
      getDataReg(url, uid);
    }
    else if(url.startsWith('ftp')) {
      getDataFtp(url, uid);
    }
    else {
      console.log('not a valid resource');
    }
  },
  // Third param, callback to be executed after all done
  () => {
    Request.get(`${config.demouri}bound=${request.query.boundary};uid=${uid}`, {
      method: 'GET',
      auth: auth
    })
    .on('response', (response) => {
      console.log('response event emmits');
      zipFiles(uid)
        .then((path) => {
          reply.file(path, { confine: false, filename: uid + '.zip', mode: 'inline'}).header('Content-Disposition');
        });
    });
  }
);
Download and post process:
exports.getDataFtp = (url, uid) => {
  console.log('get into ftp');
  var usefulUrl = url.split('//')[1];
  var spliter = usefulUrl.indexOf('/');
  var host = usefulUrl.substring(0, spliter);
  var dir = usefulUrl.substring(spliter+1, usefulUrl.length);
  var client = new ftp();
  var connection = {
    host: host
  };
  var fileNameStart = dir.lastIndexOf('/') + 1;
  var fileNameEnd = dir.length;
  var fileName = dir.substring(fileNameStart, fileNameEnd);
  console.log('filename: ', fileName);
  client.on('ready', () => {
    console.log('get into ftp ready');
    client.get(dir, (err, stream) => {
      if (err) {
        console.log('get file err:', err);
        return;
      } else {
        console.log('get into ftp get');
        stream.pipe(fs.createWriteStream(datadir + `download/${uid}/${fileName}`));
        stream.on('end', () => {
          console.log('get into ftp close');
          unzipData(datadir + `download/${uid}/`, fileName, uid);
          client.end();
        });
      }
    });
  });
  client.connect(connection);
};

exports.getDataReg = (url, uid) => {
  console.log('get into http');
  var fileNameStart = url.lastIndexOf('/') + 1;
  var fileNameEnd = url.length;
  var fileName = url.substring(fileNameStart, fileNameEnd);
  var file = fs.createWriteStream(datadir + `download/${uid}/${fileName}`);
  if (url.startsWith('https')) {
    https.get(url, (response) => {
      console.log('start piping file');
      response.pipe(file);
      file.on('finish', () => {
        console.log('get into http finish');
        unzipData(datadir + `download/${uid}/`, fileName, uid);
      });
    }).on('error', (err) => { // Handle errors
      fs.unlink(datadir + `download/${uid}/${fileName}`);
      console.log('download file err: ', err);
    });
  } else {
    http.get(url, (response) => {
      console.log('start piping file');
      response.pipe(file);
      file.on('finish', () => {
        unzipData(datadir + `download/${uid}/`, fileName, uid);
      });
    }).on('error', (err) => {
      fs.unlink(datadir + `download/${uid}/${fileName}`);
      console.log('download file err: ', err);
    });
  }
};
function unzipData(path, fileName, uid) {
  console.log('get into unzip');
  console.log('creating: ', path + fileName);
  fs.createReadStream(path + fileName)
    .pipe(unzip.Extract({path: path}))
    .on('close', () => {
      console.log('get into unzip close');
      var filelist = listFile(path);
      filelist.forEach((filePath) => {
        if (!filePath.endsWith('.zip')) {
          var components = filePath.split('/');
          var component = components[components.length-1];
          mv(filePath, datadir + `processing/${uid}/${component}`, (err) => {
            if(err) {
              console.log('move file err: ');
            } else {
              console.log('move file done');
            }
          });
        }
      });
      fs.unlink(path + fileName, (err) => {});
    });
}
After all of this is done (both the download and the post-processing), I want to execute a callback function.
The interesting thing about a series of asynchronous processes is that you can never know exactly when all of them will complete. So setting a timeout for the callback is a quick-and-dirty way to do it, but it's certainly not reliable.
You can instead use a counter to solve this problem.
Let's say you have 10 operations to perform. At the beginning you set your counter to ten: counter = 10. After each process completes, regardless of how (it can either succeed or fail), you decrement the counter by 1, like counter -= 1, and right after that you check whether the counter is 0. If so, all processes are completed and we have reached the end, so you can safely run your callback function: if(counter === 0) callback();
If I were you, I would do something like this:
*Notice that the called process should return a promise, so that I can know when it finishes (again, regardless of how).
*If you need help with promises, this useful article might help you: https://howtonode.org/promises
*Oh, and one more thing: you should avoid using arguments.callee, because it's deprecated. Here is why: Why was the arguments.callee.caller property deprecated in JavaScript?
exports.processArray = (items, process, callback) => {
  var todo = [].concat(items);
  var counter = todo.length;

  runProcess();

  function runProcess() {
    // Check if the counter already reached 0
    if(checkCounter() === false) {
      // Nope. Counter is still > 0, which means we got work to do.
      var processPromise = process(todo.shift());
      processPromise
        .then(function() {
          // success
        })
        .catch(function() {
          // failure
        })
        .finally(function() {
          // The previous process is done.
          // Now we can go with the next one.
          --counter;
          runProcess();
        })
    }
  }

  function checkCounter() {
    if(counter === 0) {
      callback();
      return true;
    } else {
      return false;
    }
  }
};
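A hedged usage sketch for this version, assuming the second parameter is changed so that it returns a promise (for example by wrapping getDataReg / getDataFtp in promises as shown in the next answer):
processArray(
  urlList,
  (url) => {
    if (url.startsWith('http')) return getDataReg(url, uid);
    if (url.startsWith('ftp')) return getDataFtp(url, uid);
    return Promise.reject(new Error('not a valid resource'));
  },
  () => console.log('all downloads and post-processing done')
);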
What you want to do is to make all your asynchronous processes converge into a single promise that you can use to execute the callback at the correct moment.
Let's start at the point where each process is complete, which I assume is in the callback passed to the mv() function in unzipData(). You want to wrap each of these asynchronous actions in a Promise that resolves in that callback, and you also want to use these promises later; for that, use the .map() method to collect the promises in an array (instead of .forEach()).
Here's the code:
var promises = filelist.map((filePath) => {
  if (!filePath.endsWith('.zip')) {
    var components = filePath.split('/');
    var component = components[components.length-1];
    return new Promise((resolve, reject) =>
      mv(filePath, datadir + `processing/${uid}/${component}`, (err) => {
        if(err) {
          console.log('move file err: ');
          reject(); // Or resolve() if you want to ignore the error and not cause it to prevent the callback from executing later
        } else {
          console.log('move file done');
          resolve();
        }
      }));
  }
  return Promise.resolve();
});
(if the asynchronous action is not to be executed, a Promise that resolves immediately is returned instead)
Now, we can turn this list of Promises into a single Promise that resolves when all of the promises in the list have resolved:
var allPromise = Promise.all(promises);
Next, we need to look further up in the code. We can see that the code we've just been looking at is itself part of an event handler of an asynchronous action, i.e. fs.createReadStream(). You need to wrap that in a promise that gets resolved when the inner promises resolve, and this is the promise that the unzipData() function should return:
function unzipData(path, fileName, uid) {
  console.log('get into unzip');
  console.log('creating: ', path + fileName);
  return new Promise((outerResolve) =>
    fs.createReadStream(path + fileName)
      .pipe(unzip.Extract({path: path}))
      .on('close', () => {
        console.log('get into unzip close');
        var filelist = listFile(path);
        // Code from previous examples
        allPromise.then(outerResolve);
      }));
}
Next, we look at the functions that use unzipData(): getDataReg() and getDataFtp(). They each perform only one asynchronous action, so all you need to do is make them return a promise that resolves when the promise returned by unzipData() resolves.
Simplified example:
exports.getDataReg = (url, uid) => {
  return new Promise((resolve, reject) => {
    // ...
    https.get(url, (response) => {
      response.pipe(file);
      file.on('finish', () => {
        unzipData(datadir + `download/${uid}/`, fileName, uid)
          .then(resolve);
      });
    }).on('error', (err) => { // Handle errors
      fs.unlink(datadir + `download/${uid}/${fileName}`);
      reject(); // Or resolve() if you want to ignore the error and not cause it to prevent the callback from executing later
    });
    // ...
  });
}
Finally, we get to the processArray() function, and here you need to do the same thing we did to begin with: map the processes into a list of promises. First, the process function passed in needs to return the promises returned by getDataReg() and getDataFtp():
// Second param, download and post process
(url) => {
  if(url.startsWith('http')) {
    return getDataReg(url, uid);
  }
  else if(url.startsWith('ftp')) {
    return getDataFtp(url, uid);
  }
  else {
    console.log('not a valid resource');
  }
  return Promise.reject(); // or Promise.resolve() if you want invalid resources to be ignored and not prevent the callback from executing later
}
Now, your processArray() function can look like this:
exports.processArray = (items, process, callback) =>
  Promise.all(items.map(process))
    .then(callback)
    .catch(() => console.log('Something went wrong somewhere'));
Your callback will get invoked when all asynchronous actions have completed, regardless of the order in which they do. If any one of the promises rejects, the callback will never be executed, so manage your promise rejections accordingly.
Here's a JSFiddle with the complete code: https://jsfiddle.net/upn4yqsw/
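As a side note (not from the original answer), if you would rather have the callback run even when some downloads fail, Promise.allSettled (Node 12.9+) never short-circuits on rejection; a minimal sketch:
exports.processArray = (items, process, callback) =>
  Promise.allSettled(items.map(process))
    // allSettled waits for every promise, fulfilled or rejected
    .then(() => callback());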
In general, since Node.js does not appear to have implemented the Streams Standard to be Promise-based (at least from what I can gather), but rather uses an event-based or callback mechanism, you can use the Promise constructor within a function call to return a Promise that is fulfilled when a specific event has been dispatched:
const doStuff = (...args) => new Promise((resolve, reject) => {
  /* define and do stream stuff */
  doStreamStuff.on('end' /* or "close" */, () => {
    // do stuff
    resolve(/* value */)
  })
});

doStuff(/* args */)
  .then(data => {})
  .catch(err => {})
