I am reading a CSV file using the csv-parser npm module and have to perform some operation on the data I get from each line of the CSV.
const readstream = fs.createReadStream('src/working_file.csv');
const stream = readstream.pipe(parser());

stream.on('data', async data => {
  // data is a JSON object of the row in the CSV.
  // Now I am calling another async function using the data in that object.
  console.log('before calling');
  const writeToFile = await getImage(data.searchKey);
  console.log('after calling');
  // do other stuff
});
async function getImage(searchKey){
  // I'm doing web scraping here using puppeteer
  // it has some await calls too
  console.log('in getimage');
  const results = await scrapper.run().catch(err => {
    console.error(err);
    process.exit(1);
  });
}
Let's say my CSV has 2 rows; then my output comes out like below:
before calling
in getimage
before calling
in getimage
after calling
after calling
But when I do this, all the calls happen at the same time even though I used await. If I have 10 rows in the CSV, the function is called for all 10 rows at once, but I want it to happen one by one: only when the operation for the first row completes should the second row be processed.
My problem is that all the calls happen at once rather than one by one.
Try this code.
var fs = require('fs');
var parse = require('csv-parse');
var async = require('async');

var inputFile = 'src/working_file.csv';

var parser = parse({delimiter: ','}, function (err, data) {
  async.eachSeries(data, function (line, callback) {
    // do something with the line
    doSomething(line).then(function() {
      // when processing finishes invoke the callback to move to the next one
      callback();
    });
  });
});

fs.createReadStream(inputFile).pipe(parser);
You can also use fast-csv.
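If you would rather stick with csv-parser and avoid the extra dependency, another option (a minimal sketch, assuming Node 10+ and the same getImage function and file path from the question) is to iterate the stream with for await, which reads one row at a time and waits for the loop body before pulling the next row:

const fs = require('fs');
const csv = require('csv-parser');

async function processRows() {
  const stream = fs.createReadStream('src/working_file.csv').pipe(csv());
  // The async iterator pulls one row at a time, so the next row is not
  // read until the body of the loop (including the await) has finished.
  for await (const row of stream) {
    console.log('before calling');
    await getImage(row.searchKey); // getImage is assumed to be the function from the question
    console.log('after calling');
  }
}

processRows().catch(console.error);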
Related
I am trying to run parallel requests in batches to an API using a bunch of keywords in an array, following an article by Denis Fatkhudinov.
The problem I am having is that for each keyword, I need to run the request again with a different page argument for as many times as the number in the pages variable.
I keep getting Cannot read property 'then' of undefined for the return of the chainNext function.
The parallel requests in batches on their own, without the for loop, work great; I am struggling to incorporate the for loop into the process.
// Parallel requests in batches
async function runBatches() {
  // The keywords to request with
  const keywords = ['many keyword strings here...'];
  // Set max concurrent requests
  const concurrent = 5;
  // Clone keywords array
  const keywordsClone = keywords.slice()
  // Array for future resolved promises for each batch
  const promises = new Array(concurrent).fill(Promise.resolve());

  // Async for loop
  const asyncForEach = async (pages, callback) => {
    for (let page = 1; page <= pages; page++) {
      await callback(page);
    }
  };

  // Number of pages to loop for
  const pages = 2;

  // Recursively run batches
  const chainNext = (pro) => {
    // Runs itself as long as there are entries left on the array
    if (keywordsClone.length) {
      // Store the first entry and conveniently also remove it from the array
      const keyword = keywordsClone.shift();
      // Run 'the promise to be' request
      return pro.then(async () => {
        // ---> Here was my problem, I am declaring the constant before running the for loop
        const promiseOperation = await asyncForEach(pages, async (page) => {
          await request(keyword, page)
        });
        // ---> The recursive invocation should also be inside the for loop
        return chainNext(promiseOperation);
      });
    }
    return pro;
  }

  return await Promise.all(promises.map(chainNext));
}

// HTTP request
async function request(keyword, page) {
  try {
    // request API
    const res = await apiservice(keyword, page);
    // Send data to an outer async function to process the data
    await append(res.data);
  } catch (error) {
    throw new Error(error)
  }
}

runBatches()
The problem is simply that pro is undefined, because you haven't initialized it.
You basically execute this code:
Promise.all(new Array(concurrent).fill(Promise.resolve()).map(pro => {
  // pro is undefined here because the Promise.resolve had no parameter
  return pro.then(async () => {})
}));
I'm not completely sure about your idea behind that, but this is your problem in a more condensed version.
I got it working by moving the actual request (promiseOperation) inside the for loop and returning the recursive function there too.
// Recursively run batches
const chainNext = async (pro) => {
  if (keywordsClone.length) {
    const keyword = keywordsClone.shift()
    return pro.then(async () => {
      await asyncForEach(pages, (page) => {
        const promiseOperation = request(keyword, page)
        return chainNext(promiseOperation)
      })
    })
  }
  return pro
}
Credit for the parallel requests in batches goes to https://itnext.io/node-js-handling-asynchronous-operations-in-parallel-69679dfae3fc
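For what it's worth, the same "N keywords at a time" behaviour can also be written without the recursion, as a small worker pool: start N workers that keep pulling keywords from the shared array until it is empty. A minimal sketch, assuming the request function and the keywords, pages and concurrent values from the question:

async function runBatches() {
  const keywords = ['many keyword strings here...'];
  const pages = 2;
  const concurrent = 5;
  const queue = keywords.slice();

  // Each worker processes one keyword (all of its pages) at a time,
  // then pulls the next keyword until the queue is empty.
  const worker = async () => {
    while (queue.length) {
      const keyword = queue.shift();
      for (let page = 1; page <= pages; page++) {
        await request(keyword, page);
      }
    }
  };

  // Run 'concurrent' workers in parallel and wait for all of them.
  await Promise.all(new Array(concurrent).fill(null).map(worker));
}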
I have been trying to understand Promises and I'm hitting a brick wall.
==Order I want the code to run==
I need a .txt file to load each line into an array.
WAIT for this to happen.
Run a function on each entry that returns an array.
WAIT for each index of the array to be processed before doing the next.
==My Functions==
Call this function to start the program.
async function start(){
  var data = await getData();
  console.log(data);
  for (var i = 0; i < data.length; i++){
    console.log(await searchGoogle(data[i]));
  }
}
'await' for the data from getData
async function getData(){
  return new Promise(function(resolve, reject){
    fs.readFile('./thingsToGoogle.txt', function(err, data) {
      if(err) throw err;
      var array = data.toString().split("\n");
      resolve(array);
    });
  });
}
Then call searchGoogle on each index in the array.
async function searchGoogle(toSearch) {
  (async() => {
    const browser = await puppeteer.launch();
    const page = await browser.newPage();
    await page.goto('https://www.google.com/');
    await page.type('input[name=q]', toSearch);

    try {
      console.log('Setting Search' + toSearch);
      await page.evaluate(() => {
        let elements = document.getElementsByClassName('gNO89b');
        for (let element of elements)
          element.click();
      });
      await page.waitForNavigation();
    } catch (err) {
      console.log(err)
    }

    try {
      console.log("Collecting Data");
      const [response] = await Promise.all([
        page.waitForNavigation(),
        await page.click('.rINcab'),
      ]);
    } catch (err) {
      console.log("Error2: " + err)
    }

    let test = await page.$$('.LC20lb');
    // console.log(test);
    allresults = [];
    for (const t of test) {
      const label = await page.evaluate(el => el.innerText, t);
      if (label != "") {
        allresults.push(label);
      }
    }
    await browser.close();
    resolve(allresults);
  })();
}
The problem is that this does not work; it does not wait for the file to load.
Picture of Node JS output.
Hopefully the screenshot has uploaded, but you can see it stacking the searchGoogle function's console.logs:
console.log('Setting..')
console.log('Setting..')
console.log('Collecting..')
console.log('Collecting..')
When it should be
console.log('Setting..')
console.log('Collecting..')
console.log('Setting..')
console.log('Collecting..')
This is the first time I'm really dealing with promises. I have done a lot of reading up on them and written bits of code to understand them, but when I try to apply this knowledge I am struggling. I hope someone can help.
-Peachman-
Queue with Concurrency Limit (using p-queue)
You need a queue with a concurrency limit. You will read every single line and add it to the queue. We will be using the readline and p-queue modules for this.
First, create a queue with concurrency of 1.
const {default: PQueue} = require('p-queue');
const queue = new PQueue({concurrency: 1});
Then, create our reader instance.
const fs = require('fs');
const readline = require('readline');

const rl = readline.createInterface({
  input: fs.createReadStream('your-input-file.txt')
});
For every line of the file, add an entry to the queue.
rl.on('line', (line) => {
  console.log(`Line from file: ${line}`);
  queue.add(() => searchGoogle(line));
});
That's it! If you want to process 10 lines at once, just change the concurrency value. It will still read one line at a time, but the queue will limit how many searchGoogle calls run at the same time.
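If you also need to know when every line has been processed, p-queue exposes onIdle(), which resolves once the queue is empty and all running tasks have finished. A small sketch (the input file name is the placeholder from above):

rl.on('close', async () => {
  // Wait until the queue is empty and every pending searchGoogle has finished.
  await queue.onIdle();
  console.log('All lines processed');
});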
Optional Fixes: Async Await
Your code has the following structure,
async function yourFunction(){
  (async()=>{
    const browser = await puppeteer.launch();
    // ... rest of the code
  })()
}
While this might run as intended, you will have a hard time debugging because you will be creating an anonymous function every time you run yourFunction.
The following is enough.
async function yourFunction(){
  const browser = await puppeteer.launch();
  // ... rest of the code
}
Here's a way to process them that lets you process N URLs at a time, where you can adjust the value of N. My guess is that you want it set to a value between 5 and 20 in order to keep your CPU busy, but not use too many server resources.
Here's an outline of how it works:
It uses the line-by-line module to read a file line by line and, unlike the built-in readline interface, this module pauses line events when you call .pause(), which is important in this implementation.
It maintains a numInFlight counter that tells you how many lines are in the midst of processing.
You set a maxInFlight constant to the maximum number of lines you want to be processed in parallel.
It maintains a resultCntr that helps you keep results in the proper order.
It creates the readline interface and establishes a listener for the line event. This will start the stream flowing with line events.
On each line event, we increment our numInFlight counter. If we have reached the maximum number allowed in flight, we pause the readline stream so it won't produce any more line events. If we haven't reached the max in flight yet, then more line events will flow until we do reach the max.
We pass that line off to your existing searchGoogle() function.
When that line is done processing, we save the result in the appropriate spot in the array, decrement the numInFlight counter and resume the stream (in case it was previously paused).
We check if we're all done (by checking if numInFlight is 0 and if we've reached the end of our file). If we are done, resolve the master promise with the results.
If we're not all done, then there will either be more line events coming or more searchGoogle() functions in flight that will finish, both of which will check again to see if we're done.
Note that the way this is designed to work is that errors on any given URL are just put into the result array (the error object is in the array) and processing continues on the rest of the URLs with an eventual resolved promise. Errors while reading the input file will terminate processing and reject the return promise.
Here's the code:
const fs = require('fs');
const Readline = require('line-by-line');

function searchAll(file) {
  return new Promise(function(resolve, reject) {
    const rl = new Readline(file);
    // set maxInFlight to something between 5 and 20 to optimize performance by
    // running multiple requests in flight at the same time without
    // overusing memory and other system resources.
    const maxInFlight = 1;
    let numInFlight = 0;
    let resultCntr = 0;
    let results = [];
    let doneReading = false;

    function checkDone(e) {
      if (e) {
        reject(e);
      } else if (doneReading && numInFlight === 0) {
        resolve(results);
      }
    }

    rl.on('line', async (url) => {
      if (url) {
        let resultIndex = resultCntr++;
        try {
          ++numInFlight;
          if (numInFlight >= maxInFlight) {
            // stop flowing line events when we hit maxInFlight
            rl.pause();
          }
          let result = await searchGoogle(url);
          // store results in order
          results[resultIndex] = result;
        } catch(e) {
          // store error object as result
          results[resultIndex] = e;
        } finally {
          --numInFlight;
          rl.resume();
          checkDone();
        }
      }
    }).on('end', () => {
      // all done reading here, may still be some processing in flight
      doneReading = true;
      checkDone();
    }).on('error', (e) => {
      doneReading = true;
      checkDone(e);
    });
  });
}
FYI, you can set maxInFlight to a value of 1 and it will read and process the URLs one at a time, but the whole point of writing this type of function is so that you can likely get better performance by setting it to a value higher than 1 (I'm guessing 5-20).
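A minimal usage sketch (assuming the searchGoogle function from the question and the thingsToGoogle.txt file with one search term per line):

searchAll('./thingsToGoogle.txt').then(results => {
  // results is an array in the same order as the input lines;
  // failed entries contain the error object instead of a result.
  console.log(results);
}).catch(err => {
  console.error('Error reading the input file:', err);
});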
I have a file, and I want to read it line by line; for every line extracted I perform some expensive analysis and then save the results to the database. In short, I have something like this:
const fs = require('fs');
const path = require('path');
const readline = require('readline');

async function analyzeAndSave(url) {
  // Removed for brevity, but this function takes a minute or so to finish.
}

async function run() {
  try {
    const dataPath = path.join(path.dirname(require.main.filename), 'data/urls.txt');

    const rl = readline.createInterface({
      input: fs.createReadStream(dataPath),
    });

    let line_no = 0;
    rl.on('line', async (url) => {
      line_no++;
      logger.info(`Analyzing: ${url}`);
      await analyzeAndSave(url);
    });
  } catch (err) {
    // Error caught.
    logger.error(err);
  }
}

run();
The problem with this is that it doesn't wait for the analysis of one line to finish; it tries to execute multiple analysis instances at once. I can see this because initially it prints all the lines with logger.info('Analyzing: ' + url);. So it is not executed sequentially. How can I make sure that one line finishes before moving on to the next?
The readline interface emits its events asynchronously, and doing an await inside one handler doesn't stop the others from being fired. Instead you can buffer up the lines in an array like this:
rl.on('line', url => urls.push(url));

rl.on('close', async () => {
  for (const url of urls) {
    await analyzeAndSave(url);
  }
});
where urls is initialized to an empty array before the readline interface is created.
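Another option on newer Node versions is to loop over the interface with for await, which processes one line at a time without buffering the whole file first. A minimal sketch, assuming Node 11.14 or newer (where the readline interface is async-iterable) and the analyzeAndSave function and urls file from the question:

const fs = require('fs');
const readline = require('readline');

async function run() {
  const rl = readline.createInterface({
    input: fs.createReadStream('data/urls.txt'),
  });

  // Each iteration waits for analyzeAndSave to finish before
  // the next line is pulled from the stream.
  for await (const url of rl) {
    await analyzeAndSave(url);
  }
}

run().catch(console.error);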
I think this is going to be helpful to you; it is shown and discussed here:
Nodejs - read line by line from file, perform async action for each line and resume
Someone stated that for big files you can use a library called line-by-line.
@JavierFerrero suggested a solution like this:
var LineByLineReader = require('line-by-line'),
    lr = new LineByLineReader('big_file.txt');

lr.on('error', function (err) {
  // 'err' contains error object
});

lr.on('line', function (line) {
  // pause emitting of lines...
  lr.pause();

  // ...do your asynchronous line processing..
  setTimeout(function () {
    // ...and continue emitting lines.
    lr.resume();
  }, 100);
});

lr.on('end', function () {
  // All lines are read, file is closed now.
});
You can also pass it as a callback, waiting for the operation to finish.
const fs = require('fs');

function run(path, cb) {
  try {
    fs.readFile(path, 'utf8', function(err, data){
      if(err) throw err;
      cb(data);
    });
  } catch (err) {
    // Error caught.
  }
}

run('./test.txt', (response) => {
  // We are done, now continue
  console.log(response)
})
I am trying to implement a function in Express to return a list with data from a mongoose model. 'MiModelo' is the mongoose model created from a Schema.
//Get data in the DB
function getAllData()
{
  var promesa = MiModelo.find().exec();
  console.log(promesa);
  console.log("---");

  var miLista = [];

  async = require('async');
  async.parallel([function(){
    promesa.then(function(datos)
    {
      datos.forEach(function(dato){
        console.log("dato: " + dato.numero)
        miLista.push(dato.numero);
      });
    });
  }], function(){
    console.log(miLista);
  });

  return miLista;
}
In the final console.log() I am able to get all 'numero' field values from the database but the return is empty when I call this function elsewhere. I know it is because it is asynchronous.
I have read the answer in this question: How to make a function wait until a callback has been called using node.js but I do not know how to adapt my function.
Any help is appreciated.
Thank you for your time and help.
The whole function can be simplified to a few lines:
async function getAllData() {
  // https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Statements/async_function
  const datos = await MiModelo.find().exec();

  // https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/Array/map
  const miLista = datos.map(dato => dato.numero)

  return miLista;
}
which you can then call like so:
const data = await getAllData()
// or
getAllData().then(data => console.log(data))
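Since the question mentions Express, here is a hypothetical route handler showing how the async function can be used to answer a request (the app instance, route path and response shape are illustrative assumptions, not part of the original code):

app.get('/numeros', async (req, res) => {
  try {
    // Wait for the database query to finish before responding.
    const miLista = await getAllData();
    res.json(miLista);
  } catch (err) {
    console.error(err);
    res.status(500).json({ error: 'Could not load data' });
  }
});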
I'm using mongoose to query a database for objects and wish to write each object to a file. The console.log shows me that the data I want is being returned from the query, but the file created by fs.appendFile (./returned.json) is always empty. Is it not possible to do this within an async function?
async function findReturned(){
  try {
    const returned = await Data.find({});
    returned.forEach(function(file) {
      returnedfiles = file.BACSDocument;
      console.log(returnedfiles);
      fs.appendFile('./returned.json', returnedfiles, 'utf-8', (err) => {
        if (err) throw err;
      });
    });
    process.exit();
  } catch(e) {
    console.log('Oops...😯😯😯😯😯😯😯');
    console.error(e);
    process.exit();
  }
};
You should probably use mz/fs for async functions:
const fs = require('mz/fs');
...
returned.forEach(async (file) => {
  returnedfiles = file.BACSDocument;
  await fs.writeFile('./returned.json', returnedfiles, 'utf-8');
});
You don't need to throw the err, as it will be handled by the catch.
You can't mix async functions and functions with a callback API.
Also, you can't call multiple async functions inside forEach; you have to create a promise chain that does what you need (see the reduce call in the example below).
Another way is to use .map, return an array of promises, and await it using await Promise.all(arrayOfPromises), but in this case the order of the new data in the file will not match the order of the initial array (see the sketch after the main example below).
In either case you need to promisify fs.appendFile and call the promisified version of the function:
// add new import
const {promisify} = require('util');

// create new function with Promise API on existing Callback API function
const appendFile = promisify(fs.appendFile);

async function findReturned(){
  try {
    const returned = await Data.find({});

    // Create a single promise that will append each record one by one to the file in initial order.
    const promise = returned.reduce(function(acc, file){
      const returnedfiles = file.BACSDocument;
      console.log(returnedfiles);
      // chain the next append onto the accumulator so the writes run sequentially
      return acc.then(() => appendFile('./returned.json', returnedfiles, 'utf-8'));
    }, Promise.resolve());

    // Wait until the promise resolves;
    await promise;

    process.exit();
  } catch(e) {
    console.log('Oops...😯😯😯😯😯😯😯');
    console.error(e);
    process.exit();
  }
};
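As mentioned above, here is a sketch of the .map / Promise.all variant (assuming the same Data model and promisified appendFile; note the order of writes is not guaranteed):

async function findReturnedUnordered(){
  const returned = await Data.find({});
  // Start one append per record; they may complete in any order.
  const promises = returned.map(file => appendFile('./returned.json', file.BACSDocument, 'utf-8'));
  await Promise.all(promises);
}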
Instead of using the built-in fs module, you can use the fs-extra module, which already has promisified versions of the native methods and several extra methods.
What you can improve in your function: do not append to the file multiple times, but collect all the data that you need to append into a single variable and write it once.
If for some reason you have to do it multiple times, obtain a file descriptor using fs.open and pass it instead of the file name to fs.appendFile (or the promisified version). In that case do not forget to close the file descriptor (fs.close).
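A minimal sketch of the "collect everything, append once" suggestion (again assuming the Data model and the promisified appendFile from above; the newline separator is an arbitrary choice):

async function findReturnedOnce(){
  const returned = await Data.find({});
  // Build the whole payload in memory, then touch the file a single time.
  const payload = returned.map(file => file.BACSDocument).join('\n');
  await appendFile('./returned.json', payload, 'utf-8');
}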