I have a simple function that gets the word count from a URL. The script works if I have a small number of URLs. I limit async to 4 at a time. I watch my RAM and CPU, and they don't go near the max on my machine. After about 70 URLs the script just sits there, with no error. I have it in a try/catch block and it never catches. Any help would be appreciated.
I have tried lodash forEach instead of async and I get the same issue.
const async = require('async')
const wordcount = require('wordcount')
const afterLoad = require('after-load')
const htmlToText = require('html-to-text')
/**
 * Logs the word count of every URL, handling at most four URLs at a time.
 * A URL that fails is logged with a count of 0 and does not stop the run.
 *
 * @param {string[]} urls - Pages to load and count.
 * @param {function(?Error): void} cb - Called once all URLs have been handled.
 */
function getWordCount(urls, cb) {
  async.eachLimit(urls, 4, function(url, cbe) {
    // Declared outside the try so the catch branch can reset it without
    // creating an implicit global.
    let urlWordCount = 0
    try {
      let html = afterLoad(url) // https://www.npmjs.com/package/after-load
      let text = htmlToText.fromString(html)
      urlWordCount = wordcount(text) // https://www.npmjs.com/package/wordcount
      console.log(url, urlWordCount)
    } catch(err) {
      console.log(err)
      urlWordCount = 0
      console.log(url, urlWordCount, err)
    }
    // Signal completion outside the try so an exception thrown while
    // signalling cannot trigger a second cbe() call from the catch branch.
    cbe(null)
  }, function(err) {
    console.log("finished getting wordcount", err)
    cb(err || null)
  })
}
// Example invocation with two URLs; the callback just logs whatever error getWordCount reports.
getWordCount(["https://stackoverflow.com/", "https://caolan.github.io/async/docs.html#eachLimit"], function(err){
console.log(err)
})
I think the issue is in the synchronous implementation of that after-load module, but it's indeed hard to judge unless you get an actual error (you could put some console.logs here and there on every line and see where your code actually gets stuck - or use a debugger for the same purpose).
What I'd propose though is to use proper asynchronous code - I ran the example below with a set of 1000 URLs and it did not get stuck - and with [scramjet] it's also more readable:
const {StringStream} = require('scramjet');
const wordcount = require('wordcount');
const fetch = require('node-fetch');
const htmlToText = require('html-to-text');
const {promisify} = require('util');
// Stream the URLs through fetch -> html-to-text -> wordcount (maxParallel is
// set to 4) and print one {url, count} object per page.
StringStream.fromArray(["https://stackoverflow.com/", "https://caolan.github.io/async/docs.html#eachLimit"])
    .setOptions({maxParallel: 4})
    .parse(async url => ({
        url,
        response: await fetch(url)
    }))
    .map(async ({url, response}) => {
        const html = await response.text();
        // Pass the downloaded HTML in; fromString() with no argument has nothing to convert.
        const text = htmlToText.fromString(html);
        const count = wordcount(text);
        return {
            url,
            count
        };
    })
    .each(console.log)
;
I actually ran this from a file with the URLs by changing the first lines to:
StringStream.from(fs.createReadStream('./urls-list.txt'), 'utf-8')
.lines()
.setOptions({maxParallel: 4})
// and so on.
Related
Totally new to programming with async functions. Also new to node.js, which could be adding to my issue. I've read a lot and keep running into similar problems, and it seems like I've been randomly getting some portions of the async code to work while others don't. Here is a simplified version of what I have:
Essentially I'm searching a site for music, scraping all the results (scraper_start.js), and then each result is sent to scrape_individual.js to gather data. It is currently able to get all the data, but when it downloads the album art it comes in "too late".
The image does get logged to the console, but only after info gets returned. Also, if you have any good resources to learn async programming, please share them - I haven't been able to find a website that is nice and clean and gets into examples big enough that they become realistic (such as multiple async functions working at once and sometimes relying upon each other). Please critique my code as well - I am trying to learn!
File scraper_start.js:
const rp = require('request-promise');
const cheerio = require('cheerio');
const scrape = require('./scrape_individual.js');
const base_url = 'https://www.test.ca';
const url = 'https://www.test.ca/search?mysearchstring';
// Load the search page, collect the href of every 'h3 > a' link, scrape each
// linked page, and log the combined results.
rp(url)
.then(function(html)
{
const $ = cheerio.load(html);
var results = []
var hits = $('h3 > a').length;
console.log("TOTAL HITS: " + hits);
results = $('h3 > a').map(function(i,v){ return $(v).attr('href'); }).get()
// Promise.all settles only after every scrape() promise has settled.
return Promise.all(
results.map(function(url)
{
return scrape(base_url + url);
})
);
})
.then(function(my_data)
{
console.log(my_data);
});
File scrape_individual.js:
const rp = require('request-promise');
const cheerio = require('cheerio');
var info = {}
// Loads `url` and, unless the page is marked as a podcast, returns an object
// with the page title and (eventually) the cover image as a data URL.
const scrape = function(url)
{
return rp(url)
.then(function(html)
{
const $ = cheerio.load(html);
if (!html.includes('contentType = "Podcast"'))
{
// `info` is the module-level variable declared above, not a per-call local.
info = {
title: $('h2.bc-heading:first').text(),
img3: null};
// NOTE(review): this promise is neither returned nor awaited, so the
// `return info` below runs before img3 has been filled in.
img_data($('.bc-image-inset-border').attr('src'))
.then(function(v)
{
console.log(v);
info.img3 = v; // Log the value once it is resolved
})
// NOTE(review): empty handler - a failed image load is silently ignored.
.catch(function(v) {
});
return info;
}
})
};
/**
 * Loads the image at `src`, draws it onto a canvas of the same size and
 * resolves with the canvas contents as a data URL.
 *
 * @param {string} src - Image location handed to canvas' loadImage.
 * @returns {Promise<string>} Result of canvas.toDataURL(); rejects when the
 *   image cannot be loaded or drawn.
 */
function img_data(src)
{
    const { createCanvas, loadImage } = require('canvas');
    // Return loadImage's own promise chain: wrapping it in `new Promise`
    // without ever calling reject left the caller with a promise that never
    // settled when loading failed.
    return loadImage(src).then((image) =>
    {
        const canvas = createCanvas(image.width, image.height);
        const ctx = canvas.getContext('2d');
        ctx.drawImage(image, 0, 0);
        return canvas.toDataURL();
    });
}
module.exports = scrape;
UPDATE: New Code with ASYNC / AWAIT
scraper_start.js:
const rp = require('request-promise');
const cheerio = require('cheerio');
const scrape = require('./scrape_individual.js');
const base_url = 'https://www.test.ca';
const url = 'https://www.test.ca/search?mysearchstring';
var data = [];
/**
 * Loads the search page at `url`, scrapes every linked result page in
 * parallel and collects the results in the module-level `data` array.
 *
 * @param {string} url - Search results page to load.
 * @returns {Promise<Array>} The `data` array once every result page has been scraped.
 */
async function get_links(url)
{
    let html = await rp(url);
    const $ = cheerio.load(html);
    var hits = $('h3 > a').length;
    console.log("TOTAL HITS: " + hits);
    // Declared locally; the bare assignment used before created an implicit global.
    const hrefs = $('h3 > a').map(function(i,v){ return $(v).attr('href'); }).get()
    await Promise.all(hrefs.map(async (href) =>
    {
        // scrape_individual.js exports its scraper under the name `scrape_individual`.
        let data_single = await scrape.scrape_individual(base_url + href);
        data.push(data_single);
    }));
    //QUESTION AREA 1: This data works great with all info.
    console.log(data);
    return data
}
// get_links is an async function, so this call returns a promise that nothing here waits on.
get_links(url);
//QUESTION AREA 2: This data gets printed before getting the actual data returned.
console.log(data);
scrape_individual.js:
const rp = require('request-promise');
const cheerio = require('cheerio');
var info = {}
//scrape2(url)
module.exports.scrape_individual = scrape2;
/**
 * Scrapes one result page and returns its title together with the cover
 * image encoded as a data URL.
 *
 * @param {string} url - Page to scrape.
 * @returns {Promise<{title: string, img3: string}|undefined>} Page details, or
 *   undefined when the page is marked as a podcast.
 */
async function scrape2(url)
{
    let html = await rp(url);
    const $ = cheerio.load(html);
    if (!html.includes('contentType = "Podcast"'))
    {
        let my_image = await img_data($('.bc-image-inset-border').attr('src'));
        // Return a fresh object per call instead of reassigning the shared
        // module-level `info`, so concurrent calls cannot touch each other's result.
        return {title: $('h2.bc-heading:first').text(),
            img3: my_image};
    }
}
// Draws the image found at `src` onto a canvas of the same dimensions and
// returns the canvas contents as a data URL.
async function img_data(src)
{
    const { loadImage, createCanvas } = require('canvas');
    const picture = await loadImage(src);
    const surface = createCanvas(picture.width, picture.height);
    surface.getContext('2d').drawImage(picture, 0, 0);
    return surface.toDataURL();
}
This code works great now. It is also easier to understand. Please feel free to critique it, as I am trying to master this. My question now is more of a general coding question.
Within scraper_start.js where the end result ends up (data), I marked two comments with "QUESTION AREA 1" and "QUESTION AREA 2"
QUESTION AREA 1: works entirely fine, which I would assume because it is in the async function
QUESTION AREA 2: outside of async function, does not have the object returned yet as there is nothing to say await. Is there a way to make it wait?
My question is pretty loaded. I can't use await since it's not in an async function from my understanding. Does this mean all my code needs to be in functions if I want to maintain an important order? What is best practice? Why not call every function as async?
edit: Fixing typos
edit2: Added ASYNC / AWAIT modifications
Question Area 2 is returning before the data is captured because you are telling JS that the last function is not waiting on anyone, so it is run synchronously.
But the truth is that it is actually waiting for someone, it is waiting for get_links()
So a way to get to print the data would be:
/**
 * Waits for get_links() to finish and then prints the data it resolved with.
 */
async function printer(){
    // `url` is the search URL constant from scraper_start.js; `URL` would be
    // the built-in URL constructor.
    const returnedData = await get_links(url);
    console.log(returnedData)
}
// Nothing else depends on printer(), so it is not awaited; still report a failure.
printer().catch(console.error);
If you want to use the data returned from an async function, you need to call it with await, so JS knows that it needs to wait for the promise to be resolved or rejected before going on; otherwise you get back a pending Promise instead of the data. And every await needs to be inside an async function.
In the beginning, sounds like an endless circle but it really is not.
For instance in our example you don't need any other async to call the printer() (because no other function is depending on that one)
I hope it makes sense, but promises at the beginning need some time to be digested before understanding them.
Async/await in my opinion are a blessing to understand promises, once you get your head around it and how they function you are better prepared to understand the resolve/reject way
I have two js scripts that I would like to merge into one, but I do not know how.
Script one, uploads all files inside specified folder into virustotal, scans them, and returns the result of the scan.
Script two, lists all files inside the specified folder and all of its subfolders.
I would like to make a script that uploads all files inside specified folder and all of its subfolders into virustotal, scans them, and returns the result of the scan.
How would I go about doing that?
Script one:
/*jshint esversion: 8 */
const path = require('path');
const fsp = require('fs').promises;
const VirusTotalApi = require("virustotal-api");
const virusTotal = new VirusTotalApi('<YOUR API KEY>');
const basePath = '/home/username/Desktop/TEST/';
const wait = (time) => new Promise((resolve) => setTimeout(resolve, time));
/**
 * Sends every entry of basePath to VirusTotal one at a time, appends a
 * one-line summary of each report to Result.txt, and pauses 30 seconds after
 * each file that was processed successfully.
 *
 * @throws {Error} "Problems scanning files" once the loop is done if any file
 *   failed; `allErrors` holds the individual errors (each tagged with `fullPath`).
 */
async function scan() {
    const entries = await fsp.readdir(basePath);
    const failures = [];
    for (const name of entries) {
        const target = path.join(basePath, name);
        console.log(name);
        try {
            const contents = await fsp.readFile(target);
            const scanResponse = await virusTotal.fileScan(contents, name);
            const report = await virusTotal.fileReport(scanResponse.resource);
            const summary = JSON.stringify(report, ["verbose_msg","total","positives"]);
            await fsp.appendFile('Result.txt', `${name}: ${summary}\n`);
            console.log(`${name}: Saved!`);
        } catch (e) {
            // remember the failure, report it and go straight to the next file
            e.fullPath = target;
            failures.push(e);
            console.log(`Error processing ${target}`, e);
            continue;
        }
        // pause for 30 seconds before the next upload
        await wait(30000);
    }
    // no failures: resolve normally
    if (failures.length === 0) {
        return;
    }
    // otherwise reject with one error that carries all collected failures
    const e = new Error("Problems scanning files");
    e.allErrors = failures;
    throw e;
}
// Run the scan; log a success message, or the error scan() rejected with.
scan().then(() => {
console.log("all done scanning - no errors");
}).catch(err => {
console.log(err);
});
Script two:
const { promisify } = require('util');
const { resolve } = require('path');
const fs = require('fs');
const readdir = promisify(fs.readdir);
const stat = promisify(fs.stat);
/**
 * Recursively lists every non-directory entry below `dir`.
 *
 * @param {string} dir - Directory to walk.
 * @returns {Promise<string[]>} Absolute paths, in directory-listing order with
 *   each sub-directory expanded in place.
 */
async function getFiles(dir) {
    const entries = await readdir(dir);
    const nested = await Promise.all(
        entries.map(async (entry) => {
            const fullPath = resolve(dir, entry);
            const info = await stat(fullPath);
            if (info.isDirectory()) {
                return getFiles(fullPath);
            }
            return fullPath;
        })
    );
    // Each element is either a single path or an already-flat array of paths.
    return nested.flat();
}
// List everything under the TEST folder and print the resulting array, or the error.
getFiles('/home/username/Desktop/TEST')
.then(files => console.log(files))
.catch(e => console.error(e));
You have quite a few options to get to a result here. The quick and dirty approach is to:
Eliminate naming conflicts (make sure nothing is named the same between the two files).
Copy over the consts and the function in file B into file A.
Copy the getFiles call in right after the scan().then... call
There are other cleaner approaches. But this should get you to a proof of concept that it is possible to have both scripts function together in a single script.
I've been trying to just extract and scrape the number of COVID cases off of a website and display them on an index.html page, using puppeteer to scrape the data from the official COVID website. I have this code for my API in my index.js file:
const func = require('./scrapers');
// GET /creators: scrape the case count and send it once the scrape promise resolves.
app.get('/creators', async (req, res) => {
const numC = func.scrapeCases('https://covid19.ca.gov/state-dashboard/');
// numC is used as a promise below (numC.then), so this is the string form of
// the promise object itself; it is overwritten inside the callback.
var myFinal = numC.toString();
numC.then(result => {
const setter = result.toString();
myFinal = setter;
console.log(myFinal)
//res.send('6')
res.send(myFinal)
});
// NOTE(review): there is no rejection handler, so nothing in this handler
// sends a response if scrapeCases rejects.
})
The function "scrapeCases" is in a file called scrapers.js in the same directory as index.js. It returns the jsonValue() of the number I'm trying to extract from the COVID website (number of cases), which in reality gives me a Promise return value, which is why I also used toString() to change that.
When I do console.log(myFinal) as given, the proper number of cases is shown in the terminal as if it were a regular string. But when I do res.send(myFinal), it doesn't show up in my index.html page, but if I comment res.send(myFinal) and do res.send('6') as an arbitrary example, the 6 is displayed (after a short delay of time, not sure why). Why is it that it doesn't work with res.send?
Also, if I do this instead:
// GET /creators, await variant: wait for the scraped value, convert it to a string and send it.
// NOTE(review): no try/catch here, so a rejected scrapeCases promise is not handled in this handler.
app.get('/creators', async (req, res) => {
const numC = await func.scrapeCases('https://covid19.ca.gov/state-dashboard/');
var myFinal = numC.toString();
console.log(myFinal)
//res.send('6')
res.send(myFinal)
})
Here I add "await" to my func.ScrapeCases statement since I think this is another way I can get the value of the Promise of numC. Again, the console.log(myFinal) gives me the proper amount of cases as a simple number with commas in between (e.g. 1,366,234), and res.send('6') again displays on my index.html if I uncomment it, but res.send(myFinal) does not display anything.
Any help is appreciated thank you!
By wrapping the handler, res.send() is only executed after the async function has finished.
This is the code that created and solved the same situation as you.
const express = require('express');
const app = express();
/**
 * Adapts a (possibly async) handler to an Express-style middleware: whatever
 * the handler returns or resolves with is passed to res.send(), and any error
 * is forwarded to next().
 *
 * @param {function(object): *} fn - Handler that receives the request.
 * @returns {function(object, object, function): Promise} Middleware (req, res, next).
 */
const asyncWrapper = (fn) => {
    return (req, res, next) => {
        // Call fn inside the chain so a synchronous throw is routed to next()
        // as well, instead of escaping the middleware.
        return Promise.resolve()
            .then(() => fn(req))
            .then((result) => res.send(result))
            .catch((err) => next(err))
    }
}
// Demo handler: counts up to one million one step at a time and resolves
// with the total as a string.
const count = async () => {
    let total = 0;
    while (total < 1000000) {
        total += 1;
    }
    return String(total);
}
// Serve the counter result on /creators through the promise-aware wrapper.
app.get('/creators', asyncWrapper(count));
app.listen(3000, () => {
    // `log` is not defined anywhere in this script; use console.log.
    console.log(`API Server listening on port 3000!`);
});
Probably your code will be like this (I haven't tested it. May need to be modified)
const func = require('./scrapers');
/**
 * Adapts a (possibly async) handler to an Express-style middleware: whatever
 * the handler returns or resolves with is passed to res.send(), and any error
 * is forwarded to next().
 *
 * @param {function(object): *} fn - Handler that receives the request.
 * @returns {function(object, object, function): Promise} Middleware (req, res, next).
 */
const asyncWrapper = (fn) => {
    return (req, res, next) => {
        // Call fn inside the chain so a synchronous throw is routed to next()
        // as well, instead of escaping the middleware.
        return Promise.resolve()
            .then(() => fn(req))
            .then((result) => res.send(result))
            .catch((err) => next(err))
    }
}
// Resolves with the scraped case count converted to a string.
const count = async () => {
    const cases = await func.scrapeCases('https://covid19.ca.gov/state-dashboard/');
    return cases.toString();
}
// Expose the count on /creators through the promise-aware wrapper.
app.get('/creators', asyncWrapper(count));
My goal is to replace value at a specific position in a file in NodeJS without loading the entire content of file in the RAM (not fs.readFileSync(path, "utf8")). It should work with very large files (> 2^28 - 16 bytes (V8 max allowed String length)).
This is my code :
const fs = require('fs');
const path = "./text.txt";
const cursor = 1;
// Writes the bytes of 'ee' into the file at offset `cursor` using the
// callback-based fs API, wrapped in a promise so it can be awaited.
(async () => {
await new Promise((res, rej) => {
const buffer = Buffer.from('ee');
// NOTE(review): 'w+' truncates an existing file on open, so the previous
// contents are already gone before the write below happens.
fs.open(path, 'w+', function (err, fd) {
if (err) {
console.log('Cant open file');
rej()
} else {
// Write the whole buffer (source offset 0, buffer.length bytes) at file position `cursor`.
fs.write(fd, buffer, 0, buffer.length, cursor, function (err, writtenbytes) {
if (err) {
console.log('Cant write to file');
rej()
} else {
console.log(writtenbytes +
' characters added to file');
res()
}
// NOTE(review): fd is never closed in this snippet.
})
}
})
})
})()
This is the content of the "./text.txt" before I launch the program :
foo
This is the content of the "./text.txt" after I launch the program :
❓ee
and the charCode of ❓ equals 0.
This is the expected result :
fee
What was wrong ? What should I fix ?
The problem is you're using w+ as the mode when opening the file. It should be r+:
fs.open(path, 'r+', function (err, fd) {
// −−−−−−−−−−−−^^
From the documentation:
'w+': Open file for reading and writing. The file is created (if it does not exist) or truncated (if it exists).
(my emphasis)
So once it was truncated, writing to position 1 implicitly wrote 0 at position 0 (either that, or left indeterminate garbage there, I'm not sure it's specified which).
But with r+:
'r+': Open file for reading and writing. An exception occurs if the file does not exist.
...you open the existing file without truncating, and then can write to the specific position.
For what it's worth, if you're using an async function, you might want to use the promises version of the fs API (fs.promises):
const fsp = require("fs").promises;
const path = "./text.txt";
const cursor = 1;
// Overwrite part of the file in place: open it without truncating, write the
// buffer at position `cursor`, and always close the handle afterwards.
(async () => {
    const handle = await fsp.open(path, "r+");
    const buffer = Buffer.from('ee');
    console.log(buffer);
    try {
        const { bytesWritten } = await handle.write(buffer, 0, buffer.length, cursor);
        console.log(`${bytesWritten} characters added to file`);
    } catch (err) {
        console.log(`Cant write to file: ${err.message || String(err)}`);
    } finally {
        // close() returns a promise; await it so a failed close is reported by
        // the outer catch instead of becoming an unhandled rejection.
        await handle.close();
    }
})()
.catch(err => {
    console.log(`Error: ${err.message || String(err)}`);
});
Also note that you want to catch rejections of the promise your top-level async function may throw.
I am just trying a simple get command with Firestore, using this code from Google it doesn't work because it's not waiting for the promise?
Earlier I had put only a snippet of code, this is the entirety of index.js -- I'm using Firestore with Dialogflow to build a Google Assistant app and trying to call a function from the welcome intent that gets a field from Firestore, then writes that field to a string (named question1), and then this string should be spoken by the assistant as part of the ssml response. I've been on this for at least 30 hours already, can't seem to comprehend promises in regards to intents, firestore, etc. I've tried about 10 different solutions, this one works, only it says "undefined" in other variations I have tried it would say undefined several times but after 2-3 passes the get command would be complete and then the variable would be read out. I'm just trying to figure out how to get the get command and variable set before moving onto the SSML response. Can anyone point me in the right direction?
'use strict';
const functions = require('firebase-functions'); //don't forget this one
// Import Admin SDK
var admin = require("firebase-admin");
admin.initializeApp(functions.config().firebase);
var db = admin.firestore();
const collectionRef = db.collection('text');
const Firestore = require('#google-cloud/firestore');
var doc;
var question1;
const url = require('url');
const {
dialogflow,
Image,
Permission,
NewSurface,
} = require('actions-on-google');
const {ssml} = require('./util');
const config = functions.config();
const WELCOME_INTENT = 'Default Welcome Intent';
const app = dialogflow({debug: true});
// Looks up the document 'question-<rando>' in the 'text' collection.
async function dbaccess(rando) {
console.log("dbaseaccess started")
var currentquestion2 = 'question-' + rando.toString();
var cityRef
// NOTE(review): the `return` on the next line leaves the function with the
// result of get() right away; the remaining statements in the try never run,
// so question1 is not assigned here.
try { return cityRef = db.collection('text').doc(currentquestion2).get();
console.log("get command completed")
//do stuff
question1 = cityRef.data().n111
} catch(e) {
//error!
}
// Only reached if the try block throws before its return completes.
console.log("one line above return something");
return rando;
}
app.fallback((conv) => {
// intent contains the name of the intent
// you defined in the Intents area of Dialogflow
const intent = conv.intent;
switch (intent) {
case WELCOME_INTENT:
var rando = Math.floor(Math.random() * 3) + 1;
dbaccess(rando);
const ssml =
'<speak>' +
question1 +
'</speak>';
conv.ask(ssml);
break;
exports.dialogflowFirebaseFulfillment = functions.https.onRequest(app);
You have 2 options: you can use async/await or you can use Promise.then() depending on how you want the code to execute.
Async/await:
// Reads the 'SF' document from the 'cities' collection using async/await.
async function databasetest() {
    var doc;
    try {
        // doc('SF') only builds a reference; get() is the call that returns
        // the promise to await.
        doc = await db.collection('cities').doc('SF').get();
        // do stuff
    } catch(e) {
        // error!
    }
}
Promise.then():
// doc('SF') returns a document reference, which has no .then(); get() is the
// call that returns a promise.
db.collection('cities').doc('SF').get()
    .then(doc => { /* do stuff */ })
    .catch(err => { /* error! */ });
maybe a little of work around could help you, I'm not sure yet how you are trying to implement it.
// Returns the promise produced by get() for the 'SF' document in 'cities'.
function databasetest () {
    return db.collection('cities').doc('SF').get();
}
// so you can implement it like
databasetest()
    .then(snapshot => {
        if (snapshot.exists) {
            console.log('Document data:', snapshot.data());
            return;
        }
        console.log('No such document!');
    })
    .catch(error => {
        console.log('Error getting document', error);
    });
More context would help to understand your use case better :)