Axios.get too slow to return data - javascript
I have an axios GET request that takes too long to resolve. The site is hosted on Heroku, which enforces a 30-second request timeout. The code below takes about 50 seconds to respond (surprisingly long, since there are only 21 URLs to loop through in playerLink), so the request is never resolved on the live site.
Here is the Promise code:
const PORT = 8000
const axios = require('axios')
const cheerio = require('cheerio')
const express = require('express')
const cors = require('cors')
const app = express()
app.use(cors())
app.listen(PORT , () => console.log(`server running on PORT ${PORT}`))
const players = 'https://www.trinethunder.com/sports/sball/2021-22/teams/trine?view=roster'
const playerStats = 'https://www.trinethunder.com'
const playerLink = []
app.get('/players', (req, res) => {
  function getPlayers() {
    return new Promise((resolve, reject) => {
      axios(players)
        .then((response) => {
          const html = response.data;
          const $ = cheerio.load(html);
          $("td.text.pinned-col > a", html).each(function () {
            var link = $(this).attr("href");
            //if link not yet in array, push to array
            if (playerLink.indexOf(playerStats + link) === -1) {
              playerLink.push(playerStats + link);
            }
          });
          resolve()
        })
        .catch((err) => {
          console.log(err);
        });
    });
  }
  function getPlayerStats() {
    setTimeout(async () => {
      const statsArray = []
      for (let i = 0; i < playerLink.length; i++) {
        await new Promise((resolve, reject) => {
          axios.get(playerLink[i])
            .then((response) => {
              const html = response.data;
              const $ = cheerio.load(html);
              const statName = [];
              const statDesc = [];
              const statNum = [];
              $("h2 > span:nth-child(1)", html).each(function () {
                var name = $(this).text();
                statName.push(name);
              });
              $(".stat-title", html).each(function () {
                var stat1 = $(this).text();
                statDesc.push(stat1);
              });
              $(".stat-value", html).each(function () {
                var stat2 = $(this).text();
                statNum.push(stat2);
              });
              //Conditional is here because sometimes statsArray
              //gets filled multiple times
              if (statsArray.length < 63) {
                statsArray.push(statName, statDesc, statNum);
              }
              resolve();
            })
            .catch((err) => console.log(err));
        });
      }
      res.json(statsArray)
    }, 400);
  }
  getPlayers()
    .then(getPlayerStats)
    .catch((err) => console.log(err));
});
Simplified Fetch statement for /players:
fetch('http://localhost:8000/players')
  .then(response => response.json())
  .then(data => {
    console.log(data)
  })
  .catch(err => console.log(err))
Please let me know if you see anything that may be slowing down the execution of the request.
I cleaned up the code, removed the setTimeout(), restructured it for maximum parallelization, instrumented it, and made it runnable stand-alone. The log it produces is below: getPlayers() takes 2413ms and the synchronous cheerio processing of the individual player pages takes a total of 6087ms. From start to finish, the whole thing takes 9415ms on my system.
This is significantly faster than what you report. The biggest structural change is that all the individual player-stats requests are made in parallel, not in series, which (if the target server can handle it) shortens the total wait for the network requests. I also removed the setTimeout(), as that looked like a hack for some other problem; once the code is structured properly for asynchronous handling, it should not be necessary.
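The difference looks roughly like this (a minimal sketch only, not part of the full code below; axios is assumed to be required just as it is there):

// Serial: each request waits for the previous one, so total time is roughly the sum of all request times.
async function getSerially(urls) {
  const results = [];
  for (const url of urls) {
    results.push(await axios.get(url));
  }
  return results;
}

// Parallel: all requests start immediately, so total time is roughly the slowest single request.
async function getInParallel(urls) {
  return Promise.all(urls.map(url => axios.get(url)));
}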
Here's the detailed log if you want to see where all the time is spent. You can run the code below on your own system to see what you get there:
000000: begin all
000006: begin getPlayers()
002419: end getPlayers()
002419: begin getPlayerStats
002420: begin get https://www.trinethunder.com/sports/sball/2021-22/players/makinzeromingersy0k
002423: begin get https://www.trinethunder.com/sports/sball/2021-22/players/emersynhaneyjnrb
002424: begin get https://www.trinethunder.com/sports/sball/2021-22/players/amandapratheruluw
002424: begin get https://www.trinethunder.com/sports/sball/2021-22/players/adrienneroseybff7
002425: begin get https://www.trinethunder.com/sports/sball/2021-22/players/emmabeyeri6zz
002426: begin get https://www.trinethunder.com/sports/sball/2021-22/players/aprilsellersi95s
002427: begin get https://www.trinethunder.com/sports/sball/2021-22/players/annakoeppl38q8
002427: begin get https://www.trinethunder.com/sports/sball/2021-22/players/annagilli8rl
002428: begin get https://www.trinethunder.com/sports/sball/2021-22/players/angelenaperry2scn
002429: begin get https://www.trinethunder.com/sports/sball/2021-22/players/laurenclausenfb4j
002430: begin get https://www.trinethunder.com/sports/sball/2021-22/players/emilywheaton1jym
002430: begin get https://www.trinethunder.com/sports/sball/2021-22/players/kaylyncoahranhp6r
002431: begin get https://www.trinethunder.com/sports/sball/2021-22/players/mercededaughertyiswy
002432: begin get https://www.trinethunder.com/sports/sball/2021-22/players/taylormurdockgeho
002432: begin get https://www.trinethunder.com/sports/sball/2021-22/players/lexiclark77gr
002433: begin get https://www.trinethunder.com/sports/sball/2021-22/players/ainsleyphillipsmfe9
002434: begin get https://www.trinethunder.com/sports/sball/2021-22/players/ellietrinexhe2
002434: begin get https://www.trinethunder.com/sports/sball/2021-22/players/ashleyswartouta714
002435: begin get https://www.trinethunder.com/sports/sball/2021-22/players/gisellerileybdb8
002436: begin get https://www.trinethunder.com/sports/sball/2021-22/players/elizabethkoch5umu
002436: begin get https://www.trinethunder.com/sports/sball/2021-22/players/scarlettelliott0bvt
003251: after get https://www.trinethunder.com/sports/sball/2021-22/players/kaylyncoahranhp6r
003596: after cheerio parse https://www.trinethunder.com/sports/sball/2021-22/players/kaylyncoahranhp6r
003599: after get https://www.trinethunder.com/sports/sball/2021-22/players/makinzeromingersy0k
003902: after cheerio parse https://www.trinethunder.com/sports/sball/2021-22/players/makinzeromingersy0k
003905: after get https://www.trinethunder.com/sports/sball/2021-22/players/emersynhaneyjnrb
004200: after cheerio parse https://www.trinethunder.com/sports/sball/2021-22/players/emersynhaneyjnrb
004203: after get https://www.trinethunder.com/sports/sball/2021-22/players/amandapratheruluw
004489: after cheerio parse https://www.trinethunder.com/sports/sball/2021-22/players/amandapratheruluw
004492: after get https://www.trinethunder.com/sports/sball/2021-22/players/emmabeyeri6zz
004771: after cheerio parse https://www.trinethunder.com/sports/sball/2021-22/players/emmabeyeri6zz
004773: after get https://www.trinethunder.com/sports/sball/2021-22/players/aprilsellersi95s
005060: after cheerio parse https://www.trinethunder.com/sports/sball/2021-22/players/aprilsellersi95s
005063: after get https://www.trinethunder.com/sports/sball/2021-22/players/elizabethkoch5umu
005345: after cheerio parse https://www.trinethunder.com/sports/sball/2021-22/players/elizabethkoch5umu
005348: after get https://www.trinethunder.com/sports/sball/2021-22/players/emilywheaton1jym
005638: after cheerio parse https://www.trinethunder.com/sports/sball/2021-22/players/emilywheaton1jym
005643: after get https://www.trinethunder.com/sports/sball/2021-22/players/ashleyswartouta714
005943: after cheerio parse https://www.trinethunder.com/sports/sball/2021-22/players/ashleyswartouta714
005951: after get https://www.trinethunder.com/sports/sball/2021-22/players/ainsleyphillipsmfe9
006243: after cheerio parse https://www.trinethunder.com/sports/sball/2021-22/players/ainsleyphillipsmfe9
006245: after get https://www.trinethunder.com/sports/sball/2021-22/players/adrienneroseybff7
006541: after cheerio parse https://www.trinethunder.com/sports/sball/2021-22/players/adrienneroseybff7
006545: after get https://www.trinethunder.com/sports/sball/2021-22/players/annagilli8rl
006821: after cheerio parse https://www.trinethunder.com/sports/sball/2021-22/players/annagilli8rl
006824: after get https://www.trinethunder.com/sports/sball/2021-22/players/mercededaughertyiswy
007111: after cheerio parse https://www.trinethunder.com/sports/sball/2021-22/players/mercededaughertyiswy
007118: after get https://www.trinethunder.com/sports/sball/2021-22/players/lexiclark77gr
007402: after cheerio parse https://www.trinethunder.com/sports/sball/2021-22/players/lexiclark77gr
007411: after get https://www.trinethunder.com/sports/sball/2021-22/players/angelenaperry2scn
007681: after cheerio parse https://www.trinethunder.com/sports/sball/2021-22/players/angelenaperry2scn
007685: after get https://www.trinethunder.com/sports/sball/2021-22/players/laurenclausenfb4j
007974: after cheerio parse https://www.trinethunder.com/sports/sball/2021-22/players/laurenclausenfb4j
007976: after get https://www.trinethunder.com/sports/sball/2021-22/players/scarlettelliott0bvt
008265: after cheerio parse https://www.trinethunder.com/sports/sball/2021-22/players/scarlettelliott0bvt
008267: after get https://www.trinethunder.com/sports/sball/2021-22/players/ellietrinexhe2
008553: after cheerio parse https://www.trinethunder.com/sports/sball/2021-22/players/ellietrinexhe2
008555: after get https://www.trinethunder.com/sports/sball/2021-22/players/gisellerileybdb8
008838: after cheerio parse https://www.trinethunder.com/sports/sball/2021-22/players/gisellerileybdb8
008840: after get https://www.trinethunder.com/sports/sball/2021-22/players/annakoeppl38q8
009129: after cheerio parse https://www.trinethunder.com/sports/sball/2021-22/players/annakoeppl38q8
009131: after get https://www.trinethunder.com/sports/sball/2021-22/players/taylormurdockgeho
009415: after cheerio parse https://www.trinethunder.com/sports/sball/2021-22/players/taylormurdockgeho
009415: end all
... data here
getPlayers() took 2413ms
cheerio processing took 6087ms
And, here's the stand-alone code that anyone can run:
const axios = require('axios');
const cheerio = require('cheerio');

const players = 'https://www.trinethunder.com/sports/sball/2021-22/teams/trine?view=roster'
const playerStats = 'https://www.trinethunder.com'

const zeroes = "000000000000000000000000000000";

function zeroPad(num, padLen) {
  let str = num + "";
  let padNum = padLen - str.length;
  if (padNum > 0) {
    str = zeroes.slice(0, padNum) + str;
  }
  return str;
}

const base = Date.now();

function log(...args) {
  let delta = Date.now() - base;
  let deltaPad = zeroPad(delta, 6);
  console.log(deltaPad + ": ", ...args);
}

let getPlayersT = 0;
let cheerioT = 0;

async function run() {
  async function getPlayers() {
    log("begin getPlayers()");
    let startT = Date.now();
    const playerLink = [];
    const response = await axios(players);
    const html = response.data;
    const $ = cheerio.load(html);
    $("td.text.pinned-col > a", html).each(function() {
      const link = $(this).attr("href");
      //if link not yet in array, push to array
      if (playerLink.indexOf(playerStats + link) === -1) {
        playerLink.push(playerStats + link);
      }
    });
    log("end getPlayers()")
    getPlayersT += Date.now() - startT;
    return playerLink;
  }

  async function getPlayerStats(playerLink) {
    log("begin getPlayerStats");
    const statsArray = [];
    await Promise.all(playerLink.map(async link => {
      log(`begin get ${link}`)
      const response = await axios.get(link);
      log(`after get ${link}`)
      const html = response.data;
      const startT = Date.now();
      const $ = cheerio.load(html);
      const statName = [];
      const statDesc = [];
      const statNum = [];
      $("h2 > span:nth-child(1)", html).each(function() {
        var name = $(this).text();
        statName.push(name);
      });
      $(".stat-title", html).each(function() {
        var stat1 = $(this).text();
        statDesc.push(stat1);
      });
      $(".stat-value", html).each(function() {
        var stat2 = $(this).text();
        statNum.push(stat2);
      });
      //Conditional is here because sometimes statsArray
      //gets filled multiple times
      if (statsArray.length < 63) {
        statsArray.push(statName, statDesc, statNum);
      }
      cheerioT += Date.now() - startT;
      log(`after cheerio parse ${link}`);
    }));
    return statsArray;
  }

  try {
    log("begin all")
    const playerLink = await getPlayers();
    const statsArray = await getPlayerStats(playerLink);
    log("end all")
    return statsArray;
  } catch (e) {
    console.log(e);
  }
}

run().then(result => {
  console.log(result);
  console.log(`getPlayers() took ${getPlayersT}ms`);
  console.log(`cheerio processing took ${cheerioT}ms`);
}).catch(err => {
  console.log("error");
});
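If the target server can't comfortably handle all 21 requests in flight at once, concurrency could be capped by processing the links in small batches. A rough sketch (mapInBatches is a hypothetical helper, not something the code above uses):

// Hypothetical helper: process items in batches of batchSize instead of all at once.
async function mapInBatches(items, batchSize, fn) {
  const results = [];
  for (let i = 0; i < items.length; i += batchSize) {
    const batch = items.slice(i, i + batchSize);
    // each batch still runs in parallel, but only batchSize requests are in flight at a time
    results.push(...(await Promise.all(batch.map(fn))));
  }
  return results;
}

// e.g. inside getPlayerStats(): await mapInBatches(playerLink, 5, link => axios.get(link))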
Related
How much time does a request take in JavaScript with axios?
I have an application where I get data from an API. Everything is working fine, but I want to know how much time a request takes. I've used axios interceptors and I get the time in milliseconds, BUT the problem is that the request-duration value is only available after the request responds, which is not useful; I want to know the time before or at the moment of calling the web service. What makes it hard for me is that the method I call is located in another file:
Request.js
export const getRequest = async (url, baseURL, headers) => {
  const HTTP = axios.create({
    baseURL,
    headers,
  });
  HTTP.interceptors.request.use((config) => {
    config.headers["request-startTime"] = new Date().getTime();
    return config;
  });
  HTTP.interceptors.response.use((response) => {
    const currentTime = new Date().getTime();
    const startTime = response.config.headers["request-startTime"];
    response.headers["request-duration"] = currentTime - startTime;
    return response;
  });
  return HTTP.get(url);
};
Users.vue
async getUsers() {
  try {
    let url = `/users`;
    let baseUrl = `baseURL`;
    let headers = {};
    const responseUsers = await getRequest(url, baseUrl, headers);
    console.log(responseUsers.headers["request-duration"]); // show how many milliseconds here
    if (responseUsers.status === 200) {
      const { data } = responseUsers;
      this.users = data;
    }
  } catch (error) {
    console.error(error);
  }
}
You can get it simply by taking a timestamp before and after the call:

const before = Date.now();
const responseUsers = await getRequest(url, baseUrl, headers);
const after = Date.now();
const duration = after - before;

Another option is the User Timing API, but it is overkill for timing a single request.
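For reference, the User Timing API version would look roughly like this inside the same async function (the mark/measure names are arbitrary; in Node, performance is global in recent versions or comes from require('perf_hooks'), in the browser it is always global):

performance.mark('request-start');
const responseUsers = await getRequest(url, baseUrl, headers);
performance.mark('request-end');
performance.measure('request', 'request-start', 'request-end');
console.log(performance.getEntriesByName('request')[0].duration); // duration in ms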
UnhandledPromiseRejectionWarning: TypeError: $ is not a function
const $ = require('cheerio');
const fetch = require('node-fetch');
const url = "https://fr.wikipedia.org/wiki/The_Legend_of_Zelda";

function extractionBrut(url) {
  return fetch(url)
    .then((reponse) => reponse.text())
    .then((data) => {
      return data;
    })
}

const getFormationList = async () => {
  const data = await extractionBrut(url);
  const num = $.parseHTML(data).length;
  console.log(num);
  for (let i = 0; i < num; i++) {
    const numTable = $('<div id="" class="bandeau-container homonymie plainlinks hatnote" style="">')[i];
    console.log(numTable);
  }
}

getFormationList();

I want to show the different sections of this div, but I get this error and don't know how to solve it.
The $ is supposed to be loaded with data. You would do something like:

const cheerio = require('cheerio');

And later:

const $ = cheerio.load(someHtmlThatYouGotFromNodeFetch);

Now you can use $ as if it were jQuery; otherwise, how would cheerio know what HTML you're working with?
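Put together with the code from the question, a minimal sketch of the fetch-then-load pattern might look like this (the selector is just the class from the question's markup, kept short for illustration):

const cheerio = require('cheerio');
const fetch = require('node-fetch');

const url = "https://fr.wikipedia.org/wiki/The_Legend_of_Zelda";

async function getFormationList() {
  const response = await fetch(url);
  const html = await response.text();
  const $ = cheerio.load(html); // $ is now bound to this document
  // query it like jQuery
  const num = $('div.bandeau-container').length;
  console.log(num);
}

getFormationList().catch(err => console.log(err));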
Creating separate files for each request with createWriteStream
I'm making multiple URL requests using Axios and collecting the data with Cheerio. Everything works great, I just can't figure out how to prevent the data from being overwritten by the previous response when writing it to a file with createWriteStream. I'm trying to create a different file for each request, preferably with unique names, but haven't found any solution in the docs.

const axios = require("axios").default;
const cheerio = require('cheerio');
const fs = require('fs');

const writeStream = fs.createWriteStream('./names/names.text')

const getTitle = (res) => {
  const $ = cheerio.load(res.data);
  const names = $('.name_wrap > .name')
  names.each(function (i, el) {
    const item = $(el).text().replace(/^\s*$/g, '')
    writeStream.write(`${item}\n`)
  });
}

// URL'S Array
let URLS = []
for (let index = 1; index <= 3; index++) {
  let url = `https://www.example.com/name-1-${index}`
  URLS.push(axios.get(url))
}

Promise.all(URLS)
  .then(responses => {
    getTitle(responses[0])
    getTitle(responses[1])
  });
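One possible way to get a separate file per request is to open the stream inside the handler and derive a unique name from the response index; a rough sketch only, reusing the requires from the question (the file-naming scheme is just an illustration):

const getTitle = (res, index) => {
  const $ = cheerio.load(res.data);
  // one stream per response, with a unique file name
  const writeStream = fs.createWriteStream(`./names/names-${index}.txt`);
  $('.name_wrap > .name').each(function (i, el) {
    const item = $(el).text().replace(/^\s*$/g, '');
    writeStream.write(`${item}\n`);
  });
  writeStream.end();
};

Promise.all(URLS)
  .then(responses => {
    responses.forEach((res, index) => getTitle(res, index));
  });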
Initializing a Puppeteer Browser Outside of Scraping Function
I am very new to puppeteer (I started today). I have some code that is working the way I want it to, except for an issue that I think makes it extremely inefficient. I have a function that walks through potentially thousands of URLs with incremental IDs to pull the name, position, and stats of each player, and then inserts that data into a neDB database. Here is my code:

const puppeteer = require('puppeteer');
const Datastore = require('nedb');
const database = new Datastore('database.db');
database.loadDatabase();

async function scrapeProduct(url, id){
  const browser = await puppeteer.launch();
  const page = await browser.newPage();
  await page.goto(url);

  let attributes = [];

  const [name] = await page.$x('//*[@id="ctl00_ctl00_ctl00_Main_Main_name"]');
  const txt = await name.getProperty('innerText');
  const playerName = await txt.jsonValue();
  attributes.push(playerName);

  //Make sure that there is a legitimate player profile before trying to pull a bunch of 'undefined' information.
  if(playerName){
    const [role] = await page.$x('//*[@id="ctl00_ctl00_ctl00_Main_Main_position"]');
    const roleTxt = await role.getProperty('innerText');
    const playerRole = await roleTxt.jsonValue();
    attributes.push(playerRole);

    //Loop through the 12 attributes and pull their values.
    for(let i = 1; i < 13; i++){
      let vLink = '//*[@id="ctl00_ctl00_ctl00_Main_Main_SectionTabBox"]/div/div/div/div[1]/table/tbody/tr['+i+']/td[2]';
      const [e1] = await page.$x(vLink);
      const val = await e1.getProperty('innerText');
      const skillVal = await val.jsonValue();
      attributes.push(skillVal);
    }

    //Create a player profile to be pushed into the database. (I realize this is very wordy and ugly code)
    let player = {
      Name: attributes[0],
      Role: attributes[1],
      Athleticism: attributes[2],
      Speed: attributes[3],
      Durability: attributes[4],
      Work_Ethic: attributes[5],
      Stamina: attributes[6],
      Strength: attributes[7],
      Blocking: attributes[8],
      Tackling: attributes[9],
      Hands: attributes[10],
      Game_Instinct: attributes[11],
      Elusiveness: attributes[12],
      Technique: attributes[13],
      _id: id,
    };
    database.insert(player);
    console.log('player #' + id + " scraped.");
    await browser.close();
  } else {
    console.log("Blank profile");
    await browser.close();
  }
}

//Making sure the first URL is scraped before moving on to the next URL. (I removed the URL because it's unreasonably long and is not important for this part.)
(async () => {
  for(let i = 0; i <= 1000; i++){
    let link = 'https://url.com/Ratings.aspx?rid='+i+'&section=Ratings';
    await scrapeProduct(link, i);
  }
})();

What I think makes this so inefficient is that every time scrapeProduct() is called, I create a new browser and a new page. Instead, I believe it would be more efficient to create one browser and one page and just change the page's URL with await page.goto(url). I believe that to accomplish this I need to move:

const browser = await puppeteer.launch();
const page = await browser.newPage();

outside of my scrapeProduct() function, but I cannot seem to get it to work. Any time I try, I get an error in my function saying that page is not defined. I am very new to puppeteer (started today); I would appreciate any guidance on how to accomplish this. Thank you very much!

TL;DR: How do I create one Browser instance and one Page instance that a function can reuse, changing only the await page.goto(url) call?
About a year ago I tried to make a React Native Pokemon Go helper app. Since there wasn't an API for Pokemon nests and pokestops, I created a server that scraped thesilphroad.com, and I found the need to implement something like @Arkan said. I wanted the server to be able to take multiple requests, so I decided to initialize the browser when the server boots up. When a request is received, the server checks whether MAX_TABS has been reached. If it has, it waits; if not, a new tab is opened and the scrape is performed.

Here's the scraper.js:

const puppeteer = require('puppeteer')
const fs = require('fs')
const Page = require('./Page')
const exec = require('child_process').exec
const execSync = require('util').promisify(exec)

module.exports = class scraper {
  constructor(){
    this.browser = null
    this.getPages = null
    this.getTotalPages = null
    this.isRunning = false
    //browser permissions
    this.permissions = ['geolocation']
    this.MAX_TABS = 5
    //when puppeteer launches
    this.useFirstTab = true
  }
  async init(config={}){
    let headless = config.headless != undefined ? config.headless : true
    this.permissions = this.permissions.concat(config.permissions || [])
    //get local chromium location
    let browserPath = await getBrowserPath('firefox') || await getBrowserPath('chrome')
    this.browser = await puppeteer.launch({
      headless:headless,
      executablePath:browserPath,
      defaultViewport:null,
      args:[
        '--start-maximized',
      ]
    })
    this.getPages = this.browser.pages
    this.getTotalPages = ()=>{
      return this.getPages().then(pages=>pages.length).catch(err=>0)
    }
    this.isRunning = true
  }
  async waitForTab(){
    let time = Date.now()
    let cycles = 1
    await new Promise(resolve=>{
      let interval = setInterval(async()=>{
        let totalPages = await this.getTotalPages()
        if(totalPages < this.MAX_TABS){
          clearInterval(interval)
          resolve()
        }
        if(Date.now() - time > 100) console.log('Waiting...')
        if(Date.now() - time > 20*1000){
          console.log('... \n'.repeat(cycles)+'Still waiting...')
          cycles++
          time = Date.now()
        }
      },500)
    })
  }
  //open new tab and go to page
  async openPage(url,waitSelector,lat,long){
    await this.waitForTab()
    let pg
    //puppeteer launches with a blank tab, use this
    // if(this.useFirstTab){
    //   let pages = await this.browser.pages()
    //   pg = pages.pop()
    //   this.useFirstTab = false
    // }
    // else
    pg = await this.browser.newPage()
    if(lat && long){
      await this.setPermissions(url)
    }
    let page = await new Page()
    await page.init(pg,url,waitSelector,lat,long)
    return page
  }
  async setPermissions(url){
    const context = this.browser.defaultBrowserContext();
    await context.overridePermissions(url,this.permissions)
  }
}

// assumes that the browser is in path
async function getBrowserPath(browserName){
  return execSync('command -v chromium').then(({stdout,stderr})=>{
    if(stdout.includes('not found')) return null
    return stdout
  }).catch(err=>null)
}

The scraper imports Page.js, which is just a wrapper for a puppeteer Page object with the functions I used most made available:

const path = require('path')
const fs = require('fs')
const userAgents = require('./staticData/userAgents.json')
const cookiesPath = path.normalize('./cookies.json')

// a wrapper for a puppeteer page with pre-made functions
module.exports = class Page{
  constructor(useCookies=false){
    this.page = null
    this.useCookies = useCookies
    this.previousSession = this.useCookies && fs.existsSync(cookiesPath)
  }
  async close(){
    await this.page.close()
  }
  async init(page,url,waitSelector,lat,long){
    this.page = page
    let userAgent = userAgents[Math.floor(Math.random()*userAgents.length)]
    await this.page.setUserAgent(userAgent)
    await this.restoredSession()
    if(lat && long)
      await this.page.setGeolocation({
        latitude: lat || 59.95,
        longitude: long || 30.31667,
        accuracy: 40
      })
    await this.page.goto(url)
    await this.wait(waitSelector)
  }
  async screenshotElement(selector='body',directory='./screenshots',padding=0,offset={}) {
    const rect = await this.page.evaluate(selector => {
      const el = document.querySelector(selector)
      const {x, y, width, height} = el.getBoundingClientRect()
      return { left: x, top: y, width, height, id: el.id }
    }, selector)
    let ext = 'jpeg'
    let filename = path.normalize(directory+'/'+Date.now())
    return await this.page.screenshot({
      type:ext,
      path:filename+' - '+selector.substring(5)+'.'+ext,
      clip: {
        x: rect.left - padding+(offset.left || 0),
        y: rect.top - padding+(offset.right || 0),
        width: rect.width + padding * 2+(offset.width||0),
        height: rect.height + padding * 2+ (offset.height||0)
      },
      encoding:'base64'
    })
  }
  async restoredSession(){
    if(!this.previousSession) return false
    let cookies = require(cookiesPath)
    for(let cookie of cookies){
      await this.page.setCookie(cookie)
    }
    console.log('Loaded previous session')
    return true
  }
  async saveSession(){
    //write cookies to file
    if(!this.useCookies) return
    const cookies = await this.page.cookies()
    fs.writeFileSync(cookiesPath,JSON.stringify(cookies,null,2))
    console.log('Wrote cookies to file')
  }
  //wait for text input element and type text
  async type(selector,text,options={delay:150}){
    await this.wait(selector)
    await this.page.type(selector,text,options)
  }
  //click and waits
  async click(clickSelector,waitSelector=500){
    await this.page.click(clickSelector)
    await this.wait(waitSelector)
  }
  //hovers over element and waits
  async hover(selector,waitSelector=500){
    await this.page.hover(selector)
    await this.wait(1000)
    await this.wait(waitSelector)
  }
  //waits and suppresses timeout errors
  async wait(selector=500, waitForNav=false){
    try{
      //waitForNav is puppeteer's waitForNavigation function
      //which for me does nothing but time out after 30s
      waitForNav && await this.page.waitForNavigation()
      await this.page.waitFor(selector)
    } catch (err){
      //print everything but timeout errors
      if(err.name != 'Timeout Error'){
        console.log('error name:',err.name)
        console.log(err)
        console.log('- - - '.repeat(4))
      }
      this.close()
    }
  }
}
To achieve this, you'll just need to separate the browser from your requests, for example in a class:

class PuppeteerScraper {
  async launch(options = {}) {
    this.browser = await puppeteer.launch(options);
    // you could reuse the page instance if it was defined here
  }
  /**
   * Pass the address and the function that will scrape your data,
   * in order to maintain the page inside this object
   */
  async goto(url, callback) {
    const page = await this.browser.newPage();
    await page.goto(url);
    /** evaluate its content */
    await callback(page);
    await page.close();
  }
  async close() {
    await this.browser.close();
  }
}

and, to implement it:

/**
 * scrape function, takes the page instance as its parameter
 */
async function evaluate_page(page) {
  const titles = await page.$$eval('.col-xs-6 .star-rating ~ h3 a', (itens) => {
    const text_titles = [];
    for (const item of itens) {
      if (item && item.textContent) {
        text_titles.push(item.textContent);
      }
    }
    return text_titles;
  });
  console.log('titles', titles);
}

(async () => {
  const scraper = new PuppeteerScraper();
  await scraper.launch({ headless: false });
  for (let i = 1; i <= 6; i++) {
    let link = `https://books.toscrape.com/catalogue/page-${i}.html`;
    await scraper.goto(link, evaluate_page);
  }
  scraper.close();
})();

Although, if you want something more complex, you could take a look at how they do it in the Apify project.
async.each does not finish without error
I have a simple function where I get the word count from a URL. The script works if I have a low number of URLs; I only let async run 4 at a time. I watch my RAM and CPU and they don't go near the max on my machine. After about 70-ish URLs there is no error, the script just sits there. I have it in a try/catch block and it never catches. Any help would be appreciated. I have tried lodash forEach instead of async and I get the same issue.

const async = require('async')
const wordcount = require('wordcount')
const afterLoad = require('after-load')
const htmlToText = require('html-to-text')

function getWordCount(urls, cb) {
  async.eachLimit(urls, 4, function(url, cbe) {
    try {
      let html = afterLoad(url) // https://www.npmjs.com/package/after-load
      let text = htmlToText.fromString(html)
      let urlWordCount = wordcount(text) // https://www.npmjs.com/package/wordcount
      console.log(url, urlWordCount)
      cbe(null)
    } catch(err) {
      console.log(err)
      urlWordCount = 0
      console.log(url, urlWordCount, err)
      cbe(null)
    }
  }, function(err) {
    console.log("finished getting wordcount", err)
    if (err) {
      cb(err)
    } else {
      cb(null)
    }
  })
}

getWordCount(["https://stackoverflow.com/", "https://caolan.github.io/async/docs.html#eachLimit"], function(err) {
  console.log(err)
})
I think the issue is in the synchronous implementation of that after-load module, but it's indeed hard to judge unless you get an actual error (you could put some console.logs here and there on every line and see where your code actually gets stuck, or use a debugger for the same purpose). What I'd propose, though, is to use proper asynchronous code. I ran the example below with a set of 1000 URLs and it did not get stuck; with scramjet it's also more readable:

const {StringStream} = require('scramjet');
const wordcount = require('wordcount');
const fetch = require('node-fetch');
const htmlToText = require('html-to-text');
const {promisify} = require('util');

StringStream.fromArray(["https://stackoverflow.com/", "https://caolan.github.io/async/docs.html#eachLimit"])
  .setOptions({maxParallel: 4})
  .parse(async url => ({
    url,
    response: await fetch(url)
  }))
  .map(async ({url, response}) => {
    const html = await response.text();
    const text = htmlToText.fromString(html);
    const count = wordcount(text);
    return { url, count };
  })
  .each(console.log)
;

I actually ran this from a file with the URLs by changing the first lines to:

StringStream.from(fs.createReadStream('./urls-list.txt'), 'utf-8')
  .lines()
  .setOptions({maxParallel: 4})
  // and so on.