I need to get every client in a table so that I can iterate through them, and use Puppeteer to crawl some data. I need the MySQL query because I gotta pass some params through the querystring.
I'm using Puppeteer, Puppeteer-cluster (due to the hundreds of rows), and MySQL driver.
const mysql = require('mysql');
const { Cluster } = require('puppeteer-cluster');
const sql = "select an.id_clientes, ano, c.nome, c.cpf, c.dt_nasc from clientes_anos an inner join clientes c on an.id_clientes = c.id limit 3";
db.query(sql, async (err, result) => {
//console.log(result);
const cluster = await Cluster.launch({
concurrency: Cluster.CONCURRENCY_CONTEXT,
maxConcurrency: 2,
puppeteerOptions: {
headless: false
},
//monitor: true,
timeout: 90000,
});
for (const elem of result){
const cpf = elem.cpf;
const dt = elem.dt_nasc.toLocaleDateString();
const url = `https://servicos.receita.fazenda.gov.br/Servicos/ConsRest/Atual.app/paginas/index.asp?cpf=${cpf}&dtnasc=${dt}`;
console.log(url);
(async()=>{
cluster.on('taskerror', (err, data) => {
//console.log(`Error crawling ${data}: ${err.message}`);
});
await cluster.task(async ({ page, data: url }) => {
console.log(dt, cpf);
await page.goto(url);
await page.type('#data_nascimento', dt);
await page.type('input[name=CPF]', cpf);
await page.waitForNavigation();
try{
const msg = await page.$eval(
`#rfb-main-container table tbody tr:nth-of-type(1) td:nth-of-type(1)`, divs => divs.innerText.trim()
);
if(msg.includes('Resultado encontrado: Saldo inexistente de imposto a pagar ou a restituir.')){
console.log('situação: '+ msg);
}else{
console.log('0');
}
}catch(exception){
console.log('Error');
}
});
await cluster.queue(url);
await cluster.idle();
await cluster.close();
})();
}
});
The problem I'm having is that when the page loads, await page.type('#data_nascimento', dt); await page.type('input[name=CPF]', cpf); are not returning the right value (the one that comes from the query is right), they are returning the value of the last row. I'm betting it's something with async but I can't figure it out.
Any ideas how I can solve this?
damn boy, i have things to say :)
I think the main cause of your issue is interaction between loops / callbacks / cluster
here is an exemple to clarify my point on loops
for (var i = 0; i<3; i++) {
setTimeout(function() {
console.log(i);
}, 1000);
}
// shows : 3 3 3
// because when console is finally executed "i" has been fully looped
if instead of var you had used const or let, the code would work fine and show 0 1 2.
if you dont use callbacks but only awaits this situation would be much clearer.
async function wait(ms) {
return new Promise(resolve => {
setTimeout(resolve, ms);
});
}
for (var i = 0; i<3; i++) {
await wait(1000);
console.log(i);
}
you can appply this to you mysql call to flatten your code.
function query(sql) {
return new Promise((resolve, reject) => {
db.query(sql, function(err, result) {
if (err) return reject(err);
resolve(result);
})
});
}
//then you call it like this
const sql = "select an.id_clientes, ano, c.nome, c.cpf, c.dt_nasc from clientes_anos an inner join clientes c on an.id_clientes = c.id limit 3";
const result = await query(sql)
in this spirit of flatenning your code, can you remove the async autoexecuting function ? It looks very unnecessary.
"(async()=>{"
...
"})();"
and finally the cluster part : its not written, but from reading the documentation i guess that any data you want to use in the cluster function needs to be pushed in the queue (the XXX).
cluster.queue(XXX)
=>
await cluster.task(async ({ page, data: XXX}) ...)
you should then push not only your url but a whole object if you need many different data.
cluster.queue({cpf, dt, url})
=>
await cluster.task(async ({ page, data: {cpf, dt, url}}) ...)
i never tried using this module, but it makes sens to me.
can you try it ?
PS: Also, not using cluster at all could greatly simplify things.
Related
I'm struggling a bit with JS promises.
I am using a library to pull data from Spotify that returns promises.
In my main function I can use an await to build an object from the response data and push it to an array (called nodes):
var nodes = [];
main();
async function main() {
var id = '0gusqTJKxtU1UTmNRMHZcv';
var artist = await getArtistFromSpotify(id).then(data => buildArtistObject(data));
nodes.push(artist);
When I debug here then all is good, nodes has my object.
However, when I introduce a 2nd await underneath to make another call:
nodes.forEach((node, i) => {
if (node.done == false) {
console.log(node.toString());
var related_artists = await getRelatedArtists(node.spotify_id);
I get the following error: SyntaxError: await is only valid in async function
I thought the first await statement would be resolved and the execution would continue until the next..?
Any help would be greatly appreciated.
EDIT
The other functions, if that helps, are just as follows:
function getArtistFromSpotify(id) {
let response = spotify
.request('https://api.spotify.com/v1/artists/' + id).then(function (data) {
return data;
})
.catch(function (err) {
console.error('Error occurred: ' + err);
return null;
});
return response;
}
function getRelatedArtists(id) {
let response = spotify
.request('https://api.spotify.com/v1/artists/' + id + '/related-artists').then(function (data) {
return data;
})
.catch(function (err) {
console.error('Error occurred: ' + err);
return null;
});
return response;
}
function buildArtistObject(data) {
var artist = {
node_id: nodes.length,
name: null,
genres: null,
popularity: null,
spotify_id: null,
done: false
}
artist.name = data.name;
artist.genres = data.genres;
artist.popularity = data.popularity > 0 ? data.popularity : 0;
artist.spotify_id = data.id;
return artist;
}
The code below has multiple problems.
var nodes = [];
main();
async function main() {
var id = '0gusqTJKxtU1UTmNRMHZcv';
var artist = await getArtistFromSpotify(id).then(data => buildArtistObject(data));
nodes.push(artist);
// ...
First of all, main mutates global scope nodes. Not only is this an antipattern even in synchronous code (functions should not rely on, or modify, global variable names; use parameters and return values instead), in asynchronous code, nodes will never be available for use anywhere but within main. See How do I return the response from an asynchronous call?.
Secondly, try to avoid combining then and await. It's confusing.
It's also a little odd that an array of nodes is used, yet only one artist is pushed onto it...
As for this code:
nodes.forEach((node, i) => {
if (node.done == false) {
console.log(node.toString());
var related_artists = await getRelatedArtists(node.spotify_id);
// ...
The error is self-explanatory. You must add async to the enclosing function if you want it to be asynchronous: nodes.forEach(async (node, i) => { // .... But that spawns a new promise chain per node, meaning future code that's dependent on the result won't be able to await all of the promises in the loop resolving. See Using async/await with a forEach loop. The likely solution is for..of or Promise.all.
While I'm not 100% sure what your final goal is, this is the general pattern I'd use:
async function main() {
const id = '0gusqTJKxtU1UTmNRMHZcv';
const data = await getArtistFromSpotify(id);
const artist = await buildArtistObject(data);
const nodes = [artist]; // odd but I assume you have more artists somewhere...
for (const node of nodes) {
if (!node.done) {
const relatedArtists = await getRelatedArtists(node.spotify_id);
}
}
/* or run all promises in parallel:
const allRelatedArtists = await Promise.all(
nodes.filter(e => !e.done).map(e => getRelatedArtists(e.spotify_id))
);
*/
// ...
}
main();
Since your code isn't runnable and some of the intent is unclear from the context, you'll likely need to adapt this a bit, so consider it pseudocode.
You have some misunderstandings of how to use promises -
let response = spotify
.request(url)
.then(function(data) { return data }) // this does nothing
.catch(function (err) { // don't swallow errors
console.error('Error occurred: ' + err);
return null;
})
return response
You'll be happy there's a more concise way to write your basic functions -
const getArtist = id =>
spotify
.request('https://api.spotify.com/v1/artists/' + id)
const getRelatedArtists = id =>
spotify
.request('https://api.spotify.com/v1/artists/' + id + '/related-artists')
Now in your main function, we can await as many things as needed. Let's first see how we would work with a single artist ID -
async function main(artistId) {
const artistData = await getArtist(artistId)
const relatedData = await getRelatedArtists(artistId)
return buildArtist(artistData, relatedData)
}
If you have many artist IDs -
async function main(artistIds) {
const result = []
for (const id of artistIds) {
const artistData = await getArtist(artistId)
const relatedData = await getRelatedArtists(artistId)
result.push(buildArtist(artistData, relatedData))
}
return result
}
Either way, the caller can handle errors as
main([693, 2525, 4598])
.then(console.log) // display result
.catch(console.error) // handle errors
Which is the same as -
main([693, 2525, 4598]).then(console.log, console.error)
The pattern above is typical but sub-optimal as the caller has to wait for all data to fetch before the complete result is returned. Perhaps you would like to display the information, one-by-one as they are fetched. This is possible with async generators -
async function* buildArtists(artistIds) {
for (const id of artistIds) {
const artistData = await getArtist(artistId)
const relatedData = await getRelatedArtists(artistId)
yield buildArtist(artistData, relatedData) // <- yield
}
}
async function main(artistIds) {
for await (const a of buildArtists(artistIds)) // <- for await
displayArtist(a)
}
main([693, 2525, 4598]).catch(console.error)
Below I have a Node.js function that makes a series of requests to different urls, then for each url I use the Cheerio web scraping library to loop through elements on the dom and create a sub array. At the end of each request (after the sub array is full) I'd like to push the contents of that array to a larger array, which is outside of the request scope.
The approach I'm trying doesn't seem to be working. It looks like I don't have access to 'allPlayers' from inside the .then block.
function readPlayers(teamUrls){
const allPlayers = [];
teamUrls.forEach((teamUrl, i) => {
const options = {
gzip: true,
uri: teamUrl,
Connection: 'keep-alive',
transform: function (body) {
return cheerio.load(body);
}
};
request(options)
.then(($) => {
const team = [];
$('tbody').children('tr').each(function(j, element){
const playerName = $(element).children('td').eq(1).children('span').eq(1).find('a').text().trim();
const player = { 'playerName': playerName };
team.push(player);
});
allPlayers.push(team);
}).catch(err => console.log("error: " + err)) );
});
}
So I'm wondering the best way to re-write this code to make the requests work and populate the outer array (allPlayers) with the results.
I've looked into trying to push the entire request directly into the outer array, to no avail.
In this example I'm using request-promise to make the request.
I've looked into using Promise.map, which I think is suited for this situation. Then I would return the entire request (I think), but I don't exactly understand what I'm doing in that case.. or if it will work.
Could anyone explain the scoping in this case, why I can't do it like I'm trying.
Many thanks
You have to remember when you are using asynchronous function you cannot go back to synchronous code execution.
This is one of the methods you can do it. It will fetch all the players parallely:
async function readPlayers(teamUrls) {
const playerPromises = teamUrls.map((teamUrl, i) => {
const options = {
gzip: true,
uri: teamUrl,
Connection: 'keep-alive',
transform: function(body) {
return cheerio.load(body);
}
};
return request(options)
});
const players = await Promise.all(playerPromises);
return players.reduce((allPlayers, $) =>{
const team = [];
$('tbody').children('tr').each(function(j, element) {
const playerName = $(element).children('td').eq(1).children('span').eq(1).find('a').text().trim();
const player = { playerName: playerName };
team.push(player);
});
allPlayers.push(team);
return allPlayers;
},[])
}
And you can use it using await readPlayers(array) or readPlayers(array).then(allteamplayers=>{...})
Note: In the current code it will be a 2D array, [[{p1:p1}..], [{p2:p2}..]] etc
If you use a forEach, every callback will run asynchronously and you won't be able to await them. You could swap it to a for loop, collect your promises in an array and then await the completion of all of them:
async function readPlayers(teamUrls) {
const allPlayers = [];
const allPromises = [];
for (var i = 0; i < teamUrls.length; i++) {
var teamUrl = teamUrls[i];
const options = {
gzip: true,
uri: teamUrl,
Connection: "keep-alive",
transform: function(body) {
return cheerio.load(body);
}
};
allPromises.push(
request(options)
.then($ => {
const team = [];
$("tbody")
.children("tr")
.each(function(j, element) {
const playerName = $(element)
.children("td")
.eq(1)
.children("span")
.eq(1)
.find("a")
.text()
.trim();
const player = { playerName: playerName };
team.push(player);
});
allPlayers.push(team);
})
.catch(err => console.log("error: " + err))
);
// wait untill all the promises resolve
await Promise.all(allPromises);
console.log(allPlayers);
return allPlayers;
}
}
Then you can get all the players by awaiting your function:
var allPlayers = await readPlayers(teamUrls);
I have an endpoint which loops through an array and updates the database as follows.
app.post('/create', Authenticate, async (req, res) => {
const {
products,
} = req.body;
const trxProvider = knex.transactionProvider();
const trx = await trxProvider();
try {
const formattedProduct = await Promise.all(products.map(async (product) => {
// Get Current value
const warehouseProducts = await trx('warehouse_products')
.select('available_qty as qty')
.where('product_code', product.product.code)
.first();
const finalQty = warehouseProducts.qty - product.orderQty;
// Update database
await trx('warehouse_products')
.update({ available_qty: finalQty })
.where('product_code', product.product.code);
}));
await trx('ordered_products')
.insert(formattedProduct);
trx.commit();
console.log('Transaction successful');
return send(res, 201, { success: true });
} catch (err) {
console.log(err);
trx.rollback();
const errors = {};
errors.message = 'something went wrong';
return send(res, 500, errors);
}
});
The issue arises when i try to update the same row of the warehouse_products table within the loop.
In the loop initially the qty value is taken from the warehouse_products table for a particular product then an arithmetic operation is done and the qty value is updated.
Ideally if both iterations access the same row, the second iteration's initial qty value should be what the first iteration updated. However the issue is that the second iteration too starts with the initial value of the first iteration. Its almost as if both iterations are happening parallel to each other instead of occurring sequentially.
Since you are using Promise.all it is supposed to happen in paralle. For sequential processing change this code
await Promise.all(products.map(async (product) => {
// logic here
});
to
for(const product of products) {
// logic here
}
Have a look at the definition for Promise.all()
It is typically used after having started multiple asynchronous tasks to run concurrently and having created promises for their results, so that one can wait for all the tasks being finished.
if you don't want to use an external library like Bluebird or Async
you can go with simple for a loop as following
let delay = (t) => new Promise((resolve) => {
setTimeout(() => {
return resolve(new Date().getMilliseconds())
}, t*1000);
});
let array = [1,1,1,1];
//using simple for loop
async function test() {
let out = [];
for (let i = 0; i < 4; i++) {
const res = await delay(array[i]);
out.push(res);
}
return out;
}
test().then(res => {
console.log(res)
})
I have the following code:
* Fetch stats from api
*/
fetchStats() {
this._isFetching = true;
// fetch stats after building url and replacing invalid characters
return new Promise(async (resolve, reject) => {
await API.fetchStats(this.rsn)
.then(jres => {
this.skills = jres.main.skills;
this._isFetching = false;
resolve('success');
})
.catch(err => {
console.log(err);
console.log('error retreiving stats');
this._isFetching = false;
reject('Failed to retreive stats');
})
.finally(() => {
this._isFetching = false;
});
});
}
I thought making it async with await would make it wait until it got the response before continuing. Returning the promise is something I added in testing to see if I could make it synchronous.
Then my code that consumes this method:
memberCollection.forEach(async el => {
await el.player.fetchStats()
.then(() => {
console.log(`Refreshed ${el.player.rsn}'s account`);
})
.catch(console.log(`Failed to refresh ${el.player.rsn}'s account`));
});
My thinking was that it would wait till it got a response then console.log either a successful refresh or a failed refresh. What I am instead seeing is a whole bunch of "success" messages followed by a string of failed messages indicating that it is running both the then and the catch message in the foreach. Does anyone know how I can make this work.
My issue is that Axios keeps timing out (my speculation is that it is due to the number of requests being sent off and the fact that there is a 5-10sec delay as it pulls from the db), if I navigate to the API URL manually it works as well as if I just do one member (as opposed to forEach) it works fine. So I'm trying to limit the number of requests fired off at once. I have tried setting my axios timeout to 10, 20, and 60 seconds, but it made no improvement.
Solution code:
const asyncForEach = async (arr, cb) => {
for(let i=0;i<arr.length;i++) {
let el = arr[i];
try {
let res = await cb(el);
} catch (err) { console.log(err) };
if(el.player && el.player.rsn) console.log(`Processed ${el.player.rsn}`);
}
console.log('done processing in asyncForEach');
}
not linked to axios but to async await.
consider
function slow(i){
return new Promise((ok,ko)=>{
return setTimeout(_=>ok(i), 1000)
})
}
async function asyncForEach(arr, cb){
for(var i = 0; i<arr.length; ++i){
let el = arr[i];
let res = await cb(el);
console.log('async', res, new Date)
}
}
/*
#foreach does not wait, but async and reduce are spaced by one second
foreach 4 2019-10-14T13:43:47.059Z
foreach 5 2019-10-14T13:43:47.071Z
foreach 6 2019-10-14T13:43:47.071Z
async 1 2019-10-14T13:43:47.071Z
async 2 2019-10-14T13:43:48.073Z
async 3 2019-10-14T13:43:49.074Z
reduce 7 2019-10-14T13:43:50.076Z
reduce 8 2019-10-14T13:43:51.078Z
reduce 9 2019-10-14T13:43:52.080Z
*/
async function main(){
await [4,5,6].forEach(async el=>{
let res = await slow(el);
console.log('foreach', res, new Date)
})
await asyncForEach([1,2,3], slow);
await [7,8,9].reduce((acc, el)=>acc.then(async _=>{
let res = await slow(el);
console.log('reduce', res, new Date);
return;
}), Promise.resolve())
}
main();
As you can see from timestamps, forEach does not wait for slow to finish
however, asyncForEach in its iteration does wait
What you may want to do is either
write a for loop as done with asyncForEach
use standard promises (stacking them):
[1,2,3].reduce((acc, el)=>acc.then(_=>{
return slow(el);
}), Promise.resolve())
My requirement is pretty straightforward -
I want to insert a document in MongoDB database. But before I have to check if the slug already exists in database. Then perform an operation to rename the slug if the slug is already exists.
What I have been trying is to perform an async await callback to check the slug is already exists then insert the document.
mongoClient.connect(function (err, mongoClient) {
let db = mongoClient.db("articles");
let category_information = async (db, category_info) => {
let slug_information = await db.collection('categories').find({slug: category_info.slug});
slug_information.count((err, count) => {
if (count > 0) {
let new_slug = `${category_info.slug}_${new Date().getTime()}`;
console.log(new_slug);
return new_slug;
}
else
return category_info.slug;
})
};
let category_promise = category_information(db, category_info);
category_promise.then(value => {
console.log(value);
category_info.slug = value;
});
db.collection('categories')
.insertOne(category_info, (err, data) => {
assert.equal(null, err);
res.status(200);
res.json('success');
});
mongoClient.close();
});
In console I get undefined value from Promise. Can you please figure out my code?
I am new in MongoDB. So also, do you have the solution of the problem in MongoDB way? I mean, can I perform these two queries within a single query?
Thanks!
You don't need to await find() since it's actually the command coming after, in this case count() that is executing the query.
Next I wonder where and how category_info is defined. It's missing in the code above. But I'll assume you have set it properly in your code.
You must return something from your async function (a promise preferably). Right now you only return from the count-callback.
With async/await you should be able to:
const count = await slug_information.count();
if (count > 0) {
let new_slug = `${category_info.slug}_${new Date().getTime()}`;
console.log(new_slug);
return new_slug;
} else {
return category_info.slug;
}
Basically, if you use a callback like (err, count)=>{..} then you say "I won't be using promises here!", no promise will come and you have nothing to wait for.
Next: category_promise.then(... this bit is async, so you cannot know that it'll resolve before you start your insertOne( query. Actually you can be almost sure it hasn't.
So you either chain another then:
category_promise.then(value => {
console.log(value);
return category_info.slug = value;
}).then( ()=>{
db.collection('categories')
.insertOne( ...
});
or just async the whole thing:
const MongoClient = require("mongodb").MongoClient;
const category_info = { slug: "abc" };
async function run(req, res, next) {
const mongoClient = await MongoClient.connect("mongodb://localhost:27017");
let db = mongoClient.db("categories");
// With async/await this is almost superfluous, but lets roll with it.
let category_information = async (db, category_info) => {
const count = await db.collection("articles")
.find({ slug: category_info.slug })
.count();
if (count > 0) {
let new_slug = `${category_info.slug}_${new Date().getTime()}`;
console.log(new_slug);
return new_slug;
} else {
return category_info.slug;
}
};
category_info.slug = await category_information(db, category_info);
// note that insertOne() does not return the inserted document.
let data = await db.collection("categories").insertOne(category_info);
res.status(200).json(data);
mongoClient.close();
}
run(); // or app.get("/some-route", run);
This code runs, but I haven' tested every case (count and so on), so grain of salt and all that.