I need to download a large number of files (say 100k, each 0.2 to 1 MB) from AWS S3 to a Node.js server. The code I am using is
app.get('/api/download-all', function(req, res) {
res.json({status: 'download initiated'})
downloadFromS3(getDocs());
});
The function that downloads the audio files is
function downloadFromS3(docs){
docs.forEach((doc, fileIndex)=>{
var s3FilePath = doc.wav
var fileName = s3FilePath.split('/').pop();
var s3Params = {Bucket: 'zzzzz', Key: s3FilePath};
var file = fs.createWriteStream(dir + '/' + fileName);
console.log(downloadSession);
s3.getObject(s3Params)
.on('httpData', function (chunk) {
console.log("file writing happening", fileName);
file.write(chunk);
})
.send();
});
}
Here the download function fires an s3.getObject call for every file to download. It doesn't wait for any file to finish, so in my case roughly 100k s3.getObject calls are made before a single file has downloaded. Is this the right way, or should I wait for one file to finish downloading and only then invoke the next s3 call? What would be the right approach?
2) There is another issue I am facing with this code. Once I make the download API call from the UI, the server gets busy with the download and stops responding to other requests from the UI; they all stay pending. Is there any way to do the download in the background? I have looked at approaches like forking a child process or using a worker to handle this, but I am not sure which one to use. What is the best way to handle this?
I'd advise an in-between approach. Kicking off 100k downloads in parallel is really not a good idea. But similarly, waiting for each download to fully complete before starting the next won't utilise your full bandwidth. I'd suggest a solution that "pools" jobs - e.g. you create a pool of promises, each of which downloads one file at a time; as soon as one finishes, it starts the next.
I've been using a function like this:
Promise.pool = function pool(funcs, inParallel, progressCallback) {
const promises = [];
const results = [];
function getNext() {
if (funcs.length) {
return funcs.pop()()
// ignore individual failures so one bad download doesn't abort the whole pool
.catch(() => {})
.then((res) => {
results.push(res);
if (progressCallback) {
progressCallback(results);
}
return getNext();
});
}
}
for (let i = 0; i < Math.min(inParallel, funcs.length); i++) {
promises.push(getNext());
}
return Promise.all(promises)
.then(() => results);
};
Then you'd define an array of functions, each of which downloads one file and returns a promise that resolves on completion:
const funcs = docs.map((doc) => {
return () => {
return new Promise((resolve) => {
var s3FilePath = doc.wav;
var fileName = s3FilePath.split('/').pop();
var s3Params = {Bucket: 'zzzzz', Key: s3FilePath};
var file = fs.createWriteStream(dir + '/' + fileName);
s3.getObject(s3Params)
.on('httpData', function (chunk) {
console.log("file writing happening", fileName);
file.write(chunk);
})
.on('complete', function () {
// 'complete' fires once the request has finished (success or failure),
// so close the write stream and let the pool move on to the next file
file.end();
resolve();
})
.send();
});
}
});
Finally, you'd use it like this:
const inParallel = 32;
function callback(partialResults) {
//console log, whatever
}
Promise.pool(funcs, inParallel, callback)
.then(() => console.log("all done!"));
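As for your second question (the server not responding while the download runs): the pool above already keeps the event loop free enough to serve other requests, but if you want to move the whole job out of the request-handling process, a worker thread is one option. This is only a minimal sketch, assuming a hypothetical download-worker.js file that re-creates the S3 client and runs the pooled download over the docs it receives:
const { Worker } = require('worker_threads');
const path = require('path');

app.get('/api/download-all', function (req, res) {
  res.json({ status: 'download initiated' });
  // hand the heavy work to a separate thread so this process keeps serving requests
  const worker = new Worker(path.join(__dirname, 'download-worker.js'), {
    workerData: { docs: getDocs() }
  });
  worker.on('exit', () => console.log('all downloads finished'));
  worker.on('error', (err) => console.error('download worker failed', err));
});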
I am working on a small project. Discussing it step by step:
At first I am uploading zip files through multer
extracting those files (how can I call the extract function after completing the upload using multer?)
after extracting those I am trying to filter the files
after filtering those files I want to move some of them to another directory
in my main index.js I have
A simple route to upload files which is working
// MAIN API ENDPOINT
app.post("/api/zip-upload", upload, async (req, res, next) => {
console.log("FIles - ", req.files);
});
Continuous checking for whether there is any zip file that needs to be unzipped, but the problem is that after uploading it's not showing any files or directories
// UNZIP FILES
const dir = `${__dirname}/uploads`;
const files = fs.readdirSync("./uploads");
const filesUnzip = async () => {
try {
if (fs.existsSync(dir)) {
console.log("files - ", files);
for (const file of files) {
console.log("file - ", file);
try {
const extracted = await extract("./uploads/" + file, { dir: __dirname + "/uploads/" });
console.log("Extracted - ",extracted);
// const directories = await fs.statSync(dir + '/' + file).isDirectory();
} catch (bufErr) {
// console.log("------------");
console.log(bufErr.syscall);
}
};
// const directories = await files.filter(function (file) { return fs.statSync(dir + '/' + file).isDirectory(); });
// console.log(directories);
}
} catch (err) {
console.log(err);
}
return;
}
setInterval(() => {
filesUnzip();
}, 2000);
Moving files to the static directory, but here is the same problem: no directory found
const getAllDirs = async () => {
// console.log(fs.existsSync(dir));
// FIND ALL DIRECTORIES
if (fs.existsSync(dir)) {
const directories = await files.filter(function (file) { return fs.statSync(dir + '/' + file).isDirectory(); });
console.log("Directories - ",directories);
if (directories.length > 0) {
for (let d of directories) {
const subdirFiles = fs.readdirSync("./uploads/" + d);
for (let s of subdirFiles) {
if (s.toString().match(/\.xml$/gm) || s.toString().match(/\.xml$/gm) !== null) {
console.log("-- ", d + "/" + s);
const move = await fs.rename("uploads/" + d + "/" + s, __dirname + "/static/" + s, (err) => { console.log(err) });
console.log("Move - ", move);
}
}
}
}
}
}
setInterval(getAllDirs, 3000);
There are so many issues with your code, I don't know where to begin:
Why are you using fs.xxxSync() methods if all your functions are async? Using the xxxSync() methods is highly discouraged because they block the server (i.e. parallel requests can't/won't be accepted while a sync read is in progress). The fs module supports a promise API ...
Your "Continuous checking" for new files is always checking the same (probably empty) files array because it seems you are executing files = fs.readdirSync("./uploads"); only once (probably at server start, but I can't tell for sure because there isn't any context for that snippet)
You shouldn't be polling that "uploads" directory. Because writing a file (if done properly) is an asynchronous process, you may end up reading incomplete files. Instead you should trigger the unzipping from your endpoint handler: once it is hit, req.files contains the files that have been uploaded, so you can simply use this array to start any further processing instead of frequently polling a directory.
At some points you are using the callback version of the fs API (for instance fs.rename()). You cannot await a function that expects a callback. Again, use the promise API of fs.
EDIT
So I'm trying to address your issues. Maybe I can't solve all of them because of missing information, but you should get the general idea.
First of all, you should use the promise API of the fs module. And for path manipulation, you should use the built-in path module, which takes care of some OS-specific issues.
const fs = require('fs').promises;
const path = require('path');
Your API endpoint isn't currently returning anything. I suppose you stripped away some code, but still. Furthermore, you should trigger your file handling from here, so you don't have to do directory polling, which is
error prone,
wasting resources and
blocking the server if you do it synchronously (like you currently do)
app.post("/api/zip-upload", upload, async (req, res, next) => {
console.log("FIles - ", req.files);
//if you want to return the result only after the files have been
//processed use await
await handleFiles(req.files);
//if you want to return to the client immediately and process files
//skip the await
//handleFiles(req.files);
res.sendStatus(200);
});
Handling the files seems to consist of two different steps:
unzipping the uploaded zip files
copying some of the extracted files into another directory
const source = path.join(".", "uploads");
const target = path.join(__dirname, "uploads");
const statics = path.join(__dirname, "statics");
const handleFiles = async (files) => {
//a random folder, which will be unique for this upload request
const tmpfolder = path.join(target, `tmp_${new Date().getTime()}`);
//create this folder
await fs.mkdir(tmpfolder, {recursive: true});
//extract all uploaded files to the folder
//this will wait for a list of promises and resolve once all of them resolved
//(note: with multer, each entry of files is an object, so here and in the unlink call below you may need f.filename instead of f)
await Promise.all(files.map(f => extract(path.join(source, f), { dir: tmpfolder })));
await copyFiles(tmpfolder);
//you probably should delete the uploaded zipfiles and the tempfolder
//after they have been handled
await Promise.all(files.map(f => fs.unlink(path.join(source, f))));
await fs.rmdir(tmpfolder, { recursive: true});
}
const copyFiles = async (tmpfolder) => {
//get all files/directory names in the tmpfolder
const allfiles = await fs.readdir(tmpfolder);
//get their stats
const stats = await Promise.all(allfiles.map(f => fs.stat(path.join(tmpfolder, f))));
//filter directories only
const dirs = allfiles.filter((_, i) => stats[i].isDirectory());
for (let d of dirs) {
//read all filenames in the subdirectory
const files = await fs.readdir(path.join(tmpfolder, d));
//filter by extension .xml
const xml = files.filter(x => path.extname(x) === ".xml");
//move all xml files
await Promise.all(xml.map(f => fs.rename(path.join(tmpfolder, d, f), path.join(statics, f))));
}
}
That should do the trick. Of course you may notice there is no error handling with this code. You should add that.
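For example, one possible way to add it at the endpoint level (just a sketch, assuming a 500 response is what you want on failure):
app.post("/api/zip-upload", upload, async (req, res, next) => {
  try {
    await handleFiles(req.files);
    res.sendStatus(200);
  } catch (err) {
    // handleFiles rejects if mkdir, extract, unlink or rmdir fail
    console.error(err);
    res.sendStatus(500);
  }
});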
And I'm not 100% sure about your paths. You should consider the following
./uploads refers to a directory uploads in the current working directory (whereever that may be)
${__dirname}/uploads refers to a directory uploads which is in the same directory as the script file currently executing. Not sure if that is the directory you want ...
./uploads and ${__dirname}/uploads may point to the same folder or to completely different folders. No way knowing that without additional context.
Furthermore in your code you extract the ZIP files from ./uploads to ${__dirname}/uploads and then later try to copy XML files from ./uploads/xxx to ${__dirname}/statics, but there won't be any directory xxx in ./uploads because you extracted the ZIP file to a (probably) completely different folder.
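If in doubt, you can simply log what both notations resolve to on your machine:
const path = require('path');
console.log(path.resolve('./uploads'));        // relative to the current working directory
console.log(path.join(__dirname, 'uploads')); // relative to the folder of the executing script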
I have a project with functions that read files and extract their hash codes. After these hash codes are extracted, the sub-projects are built one by one. Finally, what I want to do is collect all these hash codes into an array and create a JSON file. I need to do this after the iterateFolders() function has run and finished inside the readDirectory function, but the console.log on the line below runs without waiting for that function. Please help.
My functions are as follows:
//Calculate build time
function getBuildTime(start,end) {
let time = (end - start);
let buildTime = `${new Date().toLocaleDateString()} ${new Date().toLocaleTimeString()} Build time: ${time} ms \n`
fs.writeFile('build-time.log', buildTime,function (err) { //output log file
if (err) return console.log(err);
});
}
//async metaHash calculation from folder path
async function computeMetaHash(folder, inputHash = null) {
const hash = inputHash ? inputHash : createHash('sha256');
const info = await fsp.readdir(folder, { withFileTypes: true });
//construct a string from the modification date, the filename and the filesize
for (let item of info) {
const fullPath = path.join(folder, item.name)
if (item.isFile()) {
const statInfo = await fsp.stat(fullPath); //stat returns all information about the file
// compute hash string name:size:mtime
const fileInfo = `${fullPath}:${statInfo.size}:${statInfo.mtimeMs}`;
hash.update(fileInfo);
} else if (item.isDirectory()) {
// recursively walk sub-folders
await computeMetaHash(fullPath, hash);
}
}
// if not being called recursively, get the digest and return it as the hash result
if (!inputHash) {
return hash.digest('base64');
}
}
async function iterateFolders(folderPath) {
folderPath.forEach(function (files) {
//function takes a folder path as input
computeMetaHash(files).then(result => { //call create hash function
console.log({"path":files,"hashCode":result});
}).then(()=>{ //build fragments
//folderPath is an array; each element (files) is a folder name, so we can handle that folder here
console.log("%s build...", files);
execSync(`cd ${files} && npm run build`, { encoding: 'utf-8' });
}).then(()=>{// Finish timing
end = new Date().getTime();
getBuildTime(start,end);
}).catch(err => {
console.log(err);
});
});
}
async function readDirectory() {
let files = await readdir(p)
const folderPath = files.map(function (file) {
//return file or folder path
return path.join(p, file);
}).filter(function (file) {
//use sync judge method. The file will add next files array if the file is directory, or not.
return fs.statSync(file).isDirectory();
})
//check hash.json exist or not
if (fs.existsSync(hashFile)) {
// path exists
console.log("File exists: ", hashFile);
}
else
{
//This is the first pipeline, all fragments will build then hash.json will created.
console.log(hashFile," does NOT exist, build will start and hash.json will created:");
// Start timing
start = new Date().getTime();
iterateFolders(folderPath,files);
console.log("IT WILL BE LAST ONE ")
}
}
readDirectory();
Well if you want to wait for its execution, then you have to use await :) Currently it's just iterateFolders(folderPath,files);, so you run it, but you don't wait for it.
await iterateFolders(folderPath,files);
That's your first issue. Then this method runs a loop and calls some other methods. But first, async/await needs a promise to be returned (which you do not do). And second, it doesn't work inside forEach, as stated in the comments above. Read Using async/await with a forEach loop for more details.
Fix those three issues and you'll make it.
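To see why forEach is a problem here, a generic illustration (not your code): the promises returned by an async callback are simply ignored, so nothing downstream waits for them.
async function demo() {
  [1, 2, 3].forEach(async (n) => {
    await new Promise(resolve => setTimeout(resolve, 100));
    console.log(n);      // logged after ~100ms
  });
  console.log('done');   // logged immediately, before any of the numbers
}
demo();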
In the iterateFolders function, you need to await computeMetaHash calls. To do so you can either use a for loop instead of calling forEach on folderPath or change forEach to map and use Promise.all.
Using the for loop method (sequential):
async function iterateFolders(folderPath) {
for (let files of folderPath) {
//function takes a folder path as input
await computeMetaHash(files).then(result => { //call create hash function
console.log({"path":files,"hashCode":result});
}).then(()=>{ //build fragments
//folderPath is an array; each element (files) is a folder name, so we can handle that folder here
console.log("%s build...", files);
execSync(`cd ${files} && npm run build`, { encoding: 'utf-8' });
}).then(()=>{// Finish timing
end = new Date().getTime();
getBuildTime(start,end);
}).catch(err => {
console.log(err);
});
}
}
Using the Promise.all method (concurrent):
async function iterateFolders(folderPath) {
return Promise.all(folderPath.map(function (files) {
//function takes a folder path as input
return computeMetaHash(files).then(result => { //call create hash function
console.log({"path":files,"hashCode":result});
}).then(()=>{ //build fragments
//folderPath is an array; each element (files) is a folder name, so we can handle that folder here
console.log("%s build...", files);
execSync(`cd ${files} && npm run build`, { encoding: 'utf-8' });
}).then(()=>{// Finish timing
end = new Date().getTime();
getBuildTime(start,end);
}).catch(err => {
console.log(err);
});
}));
}
If you prefer, using async/await also allows you to get rid of the then and catch in both methods which I believe makes it a little easier to read and understand.
Here's an example using the Promise.all method:
async function iterateFolders(folderPath) {
return Promise.all(folderPath.map(async (files) => {
try {
const result = await computeMetaHash(files);
console.log({ path: files, hashCode: result });
// build fragments
//folderPath is an array; each element (files) is a folder name, so we can handle that folder here
console.log('%s build...', files);
execSync(`cd ${files} && npm run build`, { encoding: 'utf-8' });
// Finish timing
const end = Date.now();
getBuildTime(start, end);
} catch(err) {
console.log(err);
}
}));
}
You might also want to check out for await... of
Note: you also need to await iterateFolders when it's called in readDirectory.
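As for the for await...of mentioned above, a generic illustration (not tied to the code above): when you iterate over an array of promises, the loop body only runs once each promise has resolved, in input order.
async function logHashes(folderPaths) {
  const pending = folderPaths.map(p => computeMetaHash(p)); // all hashes start computing at once
  for await (const hash of pending) {
    console.log(hash); // printed in input order, as each result resolves
  }
}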
I have this problem when I try to upload more than a few hundred files at the same time.
The API interface is for one file only so I have to call the service sending each file. Right now I have this:
onFilePaymentSelect(event): void {
if (event.target.files.length > 0) {
this.paymentFiles = event.target.files[0];
}
let i = 0;
let save = 0;
const numFiles = event.target.files.length;
let procesed = 0;
if (event.target.files.length > 0) {
while (event.target.files[i]) {
const formData = new FormData();
formData.append('file', event.target.files[i]);
this.payrollsService.sendFilesPaymentName(formData).subscribe(
(response) => {
let added = null;
procesed++;
if (response.status_message === 'File saved') {
added = true;
save++;
} else {
added = false;
}
this.payList.push({ filename, message, added });
});
i++;
}
}
So really I have a while loop sending each file to the API, but I get the message "429 Too Many Requests" when the number of files is high. Is there any way I can improve this?
Working with observables will make that task easier to reason about (rather than using imperative programming).
A browser usually allows you to make 6 requests in parallel and will queue the others. But we don't want the browser to manage that queue for us (and if we were running in a Node environment we wouldn't have it anyway).
What do we want: we want to upload a lot of files. They should be queued and uploaded as efficiently as possible by running 5 requests in parallel at all times (so we keep 1 free for other requests in our app).
In order to demo that, let's build some mocks first:
// imports for the snippets below (RxJS 6 style paths)
import { from, of } from "rxjs";
import { delay, map, mergeAll, scan, tap } from "rxjs/operators";

function randomInteger(min, max) {
return Math.floor(Math.random() * (max - min + 1)) + min;
}
const mockPayrollsService = {
sendFilesPaymentName: (file: File) => {
return of(file).pipe(
// simulate a 500ms to 1.5s network latency from the server
delay(randomInteger(500, 1500))
);
}
};
// array containing 50 files which are mocked
const files: File[] = Array.from({ length: 50 })
.fill(null)
.map(() => new File([], ""));
I think the code above is self explanatory. We are generating mocks so we can see how the core of the code will actually run without having access to your application for real.
Now, the main part:
const NUMBER_OF_PARALLEL_CALLS = 5;
const onFilePaymentSelect = (files: File[]) => {
const uploadQueue$ = from(files).pipe(
map(file => mockPayrollsService.sendFilesPaymentName(file)),
mergeAll(NUMBER_OF_PARALLEL_CALLS)
);
uploadQueue$
.pipe(
scan(nbUploadedFiles => nbUploadedFiles + 1, 0),
tap(nbUploadedFiles =>
console.log(`${nbUploadedFiles}/${files.length} file(s) uploaded`)
),
tap({ complete: () => console.log("All files have been uploaded") })
)
.subscribe();
};
onFilePaymentSelect(files);
We use from to send the files one by one into an observable
using map, we prepare our request for 1 file (but as we don't subscribe to it and the observable is cold, the request is just prepared, not triggered!)
we now use mergeAll to run a pool of calls (map + mergeAll is equivalent to mergeMap here). Thanks to the fact that mergeAll takes the concurrency as an argument, we can say "please run a maximum of 5 calls at the same time"
we then use scan for display purpose only (to count the number of files that have been uploaded successfully)
Here's a live demo: https://stackblitz.com/edit/rxjs-zuwy33?file=index.ts
Open up the console to see that we're not uploading all of them at once
I am working on MEAN stack application and I am using AWS SDK to upload multiple files to S3. I am using busboy and AWS SDK.
Code:
var inputObj = {};
var busboy = new Busboy({ headers: req.headers });
// The file upload has completed
busboy.on('finish', function() {
console.log('Upload finished.....');
var file = [];
const file1 = req.files.clogo;
const file2 = req.files.cbanner1;
const file3 = req.files.cbanner2;
const file4 = req.files.cbanner3;
const file5 = req.files.cbanner4;
const file6 = req.files.clongHeader;
file.push(file1);
file.push(file2);
file.push(file3);
file.push(file4);
file.push(file5);
file.push(file6);
multipleUploadToS3(req.body.cname, file, function(fileName) {
console.log("client file upload finished.....");
if(fileName.length == 6){
inputObj.clogo = fileName[0];
inputObj.cbanner1 = fileName[1];
inputObj.cbanner2 = fileName[2];
inputObj.cbanner3 = fileName[3];
inputObj.cbanner4 = fileName[4];
inputObj.clongHeader = fileName[5];
console.log(inputObj);
var clientObj = new client(inputObj);
clientObj.save(function(err, client) {
console.log("Client Saved.....");
if (err) {
return res.status(400).send({
message: errorHandler.getErrorMessage(err)
});
} else {
res.json(client);
}
});
}
});
});
req.pipe(busboy);
File Upload Method:
function multipleUploadToS3(client, file, callback) {
console.log("multipleUpload to S3");
console.log(client);
console.log(file);
let s3bucket = new AWS.S3({
accessKeyId: IAM_USER_KEY,
secretAccessKey: IAM_USER_SECRET,
Bucket: BUCKET_NAME,
});
var fileNames = [];
for(var i=0; i<file.length; i++){
s3bucket.createBucket(function () {
var params = {
Bucket: BUCKET_NAME,
Key: client+ '/' + file[i].name,
Body: file[i].data,
};
s3bucket.upload(params, function (err, data) {
if (err) {
console.log('error in callback');
console.log(err);
}
console.log('success');
//console.log(data.key);
fileNames.push(data.key);
if(i == file.length){ callback(fileNames);}
});
});
}
};
The issue: the file upload is asynchronous, so if for example the first file I am uploading is honest.jpg, then I want the multipleUploadToS3 method to return that file name once it has finished uploading to S3. I am binding the results to the inputObj keys, which will be saved to MongoDB, so inputObj.clogo should end up with the logo file name in it, not a banner image name, which is what is happening due to the asynchronous calls.
This is working for a single file but failing for multiple files.
The problem is that the for loop is synchronous while the file upload is asynchronous.
Take a look at this example below,
for(var i = 0; i<5;i++) {
setTimeout(function(){ console.log(i); }, 100);
}
The above loop will print 5 five times, i.e. 55555.
This behaviour occurs because the for loop finishes immediately, making i = 5, and when each timeout executes it prints the value of i, which is by then 5, five times. Five times because setTimeout has been pushed onto the queue five times.
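Just to illustrate the closure behaviour (note that this alone does not fix the ordering of your uploads): with let, each iteration gets its own binding of i, so the callbacks see the expected values.
for (let i = 0; i < 5; i++) {
  setTimeout(function () { console.log(i); }, 100); // prints 0 1 2 3 4
}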
There are two ways to solve the problem you are facing
You can use recursion (a minimal sketch follows after this list).
Use the neo-async (async-parallel) lib to control the async flow of JavaScript (Node.js). Click here for the neo-async lib.
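A minimal sketch of the recursion approach, reusing AWS, IAM_USER_KEY, IAM_USER_SECRET, BUCKET_NAME and the (client, file, callback) signature from your code (the createBucket call is left out, as the bucket is assumed to already exist): upload one file, and only start the next one from inside its completion callback, so the collected keys stay in the same order as the input array.
function multipleUploadToS3(client, file, callback) {
  var s3bucket = new AWS.S3({
    accessKeyId: IAM_USER_KEY,
    secretAccessKey: IAM_USER_SECRET,
  });
  var fileNames = [];

  function uploadNext(i) {
    // every file has been handled: hand the keys back in input order
    if (i === file.length) {
      return callback(fileNames);
    }
    var params = {
      Bucket: BUCKET_NAME,
      Key: client + '/' + file[i].name,
      Body: file[i].data,
    };
    s3bucket.upload(params, function (err, data) {
      if (err) {
        console.log(err);
      } else {
        // use the key we set in params, so the pushed value is always defined
        fileNames.push(params.Key);
      }
      // only now move on to the next file
      uploadNext(i + 1);
    });
  }

  uploadNext(0);
}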
Hope this clears your doubt. Please comment for more info.
I am parsing multiple large JSON files into my MongoDB database. At the moment I am using the stream-json npm package. After I load one file, I change the filename that I am loading and relaunch the script to load the next file. This is unnecessarily time consuming. So how can I iterate through all the files automatically? At the moment my code looks like this:
const StreamArray = require('stream-json/utils/StreamArray');
const path = require('path');
const fs = require('fs');
const filename = path.join(__dirname, './data/xa0.json'); //The next file is named xa1.json and so on.
const stream = StreamArray.make();
stream.output.on('data', function (object) {
// my function block
});
stream.output.on('end', function () {
console.log('File Complete');
});
fs.createReadStream(filename).pipe(stream.input);
I tried iterating through the file names by adding a loop that adds +1 to the filename (i.e. xa0 to xa1) at the point where the script logs 'File Complete', but this didn't work. Any ideas how I might be able to achieve this or something similar?
Just scan your JSON files directory using fs.readdir. It will return a list of file names that you can then iterate, something like this:
fs.readdir("./jsonfiles", async (err, files) => {
for (const file of files) {
await saveToMongo("./jsonfiles/" + file)
}
})
So you just launch your script once and wait until full completion.
Of course, in order for it to be awaited, you need to promisify the saveToMongo function, something like :
const saveToMongo = fileName => {
    return new Promise((resolve, reject) => {
        // set up a fresh stream for this particular file
        const stream = StreamArray.make();
        stream.output.on('data', function (object) {
            // ... your existing per-object logic here
        });
        stream.output.on('end', function () {
            console.log('File Complete');
            resolve() // Will trigger the next await
        });
        fs.createReadStream(fileName).on('error', reject).pipe(stream.input);
    })
}