Script gets stuck somewhere in its execution process - javascript

I've created a script using the request and cheerio libraries to scrape the links of different provinces from this webpage, then use those URLs to parse the links of different offices from here. Finally, it uses those office links to scrape the title from here.
When I run the script, I can see that it does its job accordingly until it gets stuck somewhere in its execution. When it gets stuck, it doesn't throw any error.
Here are the steps the script follows:
Firstly, the script grabs links from here
Secondly, it grabs links from here
And finally, the script parses title from here
Here is what I've tried with:
const request = require('request');
const cheerio = require('cheerio');

const link = 'https://www.egyptcodebase.com/en/p/all';
const base_link = 'https://www.egyptcodebase.com/en/';

let getLinks = (link) => {
  const items = [];
  return new Promise((resolve, reject) => {
    request(link, function(error, response, html) {
      let $ = cheerio.load(html);
      if (error) return reject(error);
      try {
        $('.table tbody tr').each(function() {
          items.push(base_link + $(this).find("a[href]").attr("href"));
        });
        resolve(items);
      } catch (e) {
        reject(e);
      }
    });
  });
};

let getData = (links) => {
  const nitems = [];
  const promises = links
    .map(nurl => new Promise((resolve, reject) => {
      request(nurl, function(error, response, html) {
        let $ = cheerio.load(html);
        if (error) return reject(error);
        try {
          $('.table tbody tr').each(function() {
            nitems.push(base_link + $(this).find("a[href]").attr("href"));
          });
          resolve(nitems);
        } catch (e) {
          reject(e);
        }
      });
    }));
  return Promise.all(promises);
};

let FetchData = (links) => {
  const promises = links
    .map(turl => new Promise((resolve, reject) => {
      request(turl, function(error, response, html) {
        if (error) return reject(error);
        let $ = cheerio.load(html);
        try {
          const title = $(".home-title > h2").eq(0).text();
          console.log({
            title: title,
            itemLink: turl
          });
          resolve(title);
        } catch (e) {
          reject(e);
        }
      });
    }));
  return Promise.all(promises);
};

(async function main() {
  const result = await getLinks(link);
  const resultSecond = await getData(result);
  const merged = resultSecond.flat(1);
  const resultFinal = await FetchData(merged);
  for (const title of resultFinal) {
    console.log(title);
  }
})().catch(console.error);
How can I make the script finish its execution process?
PS: Although the script appears to be big, the functions used there are almost identical to each other except for the selectors.

Ok, so on testing this code, I ran across two problems right off the bat:
resultSecond, containing the data from getData(), returned an Array-like Object, not an Array, so I wasn't able to use flat() on it. So I created a function toArray that converts Objects to Arrays, added another variable after resultSecond called resultThird, and used this function on resultSecond, turning it into an array.
flat() did not exist in the Array prototype, so I had to add it manually.
After resolving those issues, I was able to run your code, and experienced the hang you were talking about.
An ECONNRESET error occurred, and the script then proceeded to make probably a couple thousand requests before hanging. An ECONNRESET usually results from not handling asynchronous network errors, or from the server you're requesting deciding to kill the connection. I'm not sure how the request module handles such an event, but it seems the module could potentially not be handling those network errors or terminated connections properly.
The issue was that you were making 15,000 requests to this site's API, so the API probably had a rate limiter, saw the number of requests, and terminated most of them while allowing a couple thousand to go through. Since you're not handling the terminated connections -- most likely due to the request module swallowing those errors -- the script "hangs" there with the node process not exiting.
So I batched the requests into groups of 300 using the async module and it worked like a charm. No terminated connections, because I didn't hit the rate limit. You could probably raise the limit higher than 300.
However, I would suggest not using the request module and using another HTTP module like axios, which most likely handles these issues. You should also consider using async when you're doing a ton of asynchronous requests; it has many helpful methods. Lmk if you need more explanation of what the async module is doing here, but I'd advise reading the documentation first: https://caolan.github.io/async/v3/docs.html#mapLimit
const request = require('request');
const cheerio = require('cheerio');
const _async = require('async');

const link = 'https://www.egyptcodebase.com/en/p/all';
const base_link = 'https://www.egyptcodebase.com/en/';

// Converts an Array-like Object into a real Array
// (on modern Node, Object.values(obj) does the same)
const toArray = (obj) => {
  const arr = [];
  for (const prop in obj) {
    arr.push(obj[prop]);
  }
  return arr;
};

// Polyfill for Array.prototype.flat (built in since Node 11)
Object.defineProperty(Array.prototype, 'flat', {
  value: function(depth = 1) {
    return this.reduce(function(flat, toFlatten) {
      return flat.concat((Array.isArray(toFlatten) && (depth > 1)) ? toFlatten.flat(depth - 1) : toFlatten);
    }, []);
  }
});

let getLinks = (link) => {
  const items = [];
  return new Promise((resolve, reject) => {
    request(link, function(error, response, html) {
      // check the error before handing html to cheerio --
      // on a failed request html is undefined and load() would throw
      if (error) return reject(error);
      let $ = cheerio.load(html);
      try {
        $('.table tbody tr').each(function() {
          items.push(base_link + $(this).find("a[href]").attr("href"));
        });
        resolve(items);
      } catch (e) {
        reject(e);
      }
    });
  });
};

let getData = (links) => {
  const nitems = [];
  const promises = links
    .map(nurl => new Promise((resolve, reject) => {
      request(nurl, function(error, response, html) {
        if (error) return reject(error);
        let $ = cheerio.load(html);
        try {
          $('.table tbody tr').each(function() {
            nitems.push(base_link + $(this).find("a[href]").attr("href"));
          });
          return resolve(nitems);
        } catch (e) {
          return reject(e);
        }
      });
    }));
  return Promise.all(promises);
};

let FetchData = (links) => {
  const limit = 300; // max concurrent requests; could probably go higher
  return new Promise((resolve, reject) => {
    const itr = (col, cb) => {
      request(col, function(error, response, html) {
        if (error) return cb(error);
        let $ = cheerio.load(html);
        try {
          const title = $(".home-title > h2").eq(0).text();
          console.log({
            title: title,
            itemLink: col
          });
          cb(null, title);
        } catch (e) {
          cb(e);
        }
      });
    };
    _async.mapLimit(links, limit, itr, function(err, results) {
      if (err) return reject(err);
      resolve(results);
    });
  });
};

(async function main() {
  const result = await getLinks(link);
  const resultSecond = await getData(result);
  const resultThird = toArray(resultSecond);
  const merged = resultThird.flat(1);
  const resultFinal = await FetchData(merged);
  for (const title of resultFinal) {
    console.log("title: ", title);
  }
})().catch(err => console.log(err));

// good to listen to these
process.on('uncaughtException', err => { console.log(err); });
process.on('unhandledRejection', err => { console.log(err); });
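As an aside, here is a rough sketch of what the suggested axios approach might look like; this is an assumption on my part and isn't tested against this site:
const axios = require('axios');
const cheerio = require('cheerio');
const _async = require('async');

// Same batching idea, but axios rejects its promise on network errors
// rather than potentially swallowing them.
const fetchTitle = async (turl) => {
  const { data: html } = await axios.get(turl);
  const $ = cheerio.load(html);
  return $(".home-title > h2").eq(0).text();
};

// async v3's mapLimit accepts an async function as the iteratee;
// 'merged' stands in for the flattened list of office links from main() above
_async.mapLimit(merged, 300, fetchTitle, (err, titles) => {
  if (err) return console.log(err);
  console.log(titles);
});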

Related

What are the best practices to pass a variable to the outer scope while using a Promise?

I was working on a scraper script using cheerio and puppeteer. Inside a cheerio each loop I wanted to resolve a redirected URL field. Using a Promise I can console.log it, but what is the best way to insert it into the metadata?
I am actually also confused about the control flow.
(async function main() {
  const browser = await puppeteer.launch({
    headless: true,
  });
  const page = await browser.newPage();
  await page.goto('https://www.example.com/?q=async+urls&s=s');
  const content = await page.content();
  const $ = cheerio.load(content);
  var parsedResults = [];
  const fetchRedirect = async (url) => {
    try {
      let response = await doRequest(url);
      return response;
    } catch (err) {
      return false;
    }
  };
  const videoBlocks = $('td[itemprop="subjectOf"]').first().each(function (i, element) {
    const url = ($(this).find('a.title').attr('href'));
    const fetchUrl = fetchRedirect(url);
    // ** i can console log the redirected url **/
    fetchUrl.then(url => console.log(url));
    const title = ($(this).find('a.title').text());
    var metadata = {
      title
    };
    parsedResults.push(metadata);
  });
  function doRequest(url) {
    return new Promise(function (resolve, reject) {
      request(url, function (error, res, body) {
        if (!error) {
          resolve(getPathFromUrl(res.request.uri.href));
        } else {
          reject(error);
        }
      });
    });
  }
  function getPathFromUrl(url) {
    return url.split(/[?#]/)[0];
  }
  console.log(parsedResults);
  await page.close();
  await browser.close();
})();
//............... await!
// (note: the enclosing each callback must be declared async for this to work)
const fetchUrl = await fetchRedirect(url);
// ** then instead of that **/
// fetchUrl.then(url => console.log(url));
// just
console.log(fetchUrl);
const title = ($(this).find('a.title').text());
var metadata = {
  title
};
parsedResults.push(metadata);
Personally I would go for a Node module. Code like this is hard to read, as inside one function (main) you have declared multiple functions. It's up to your preference whether to apply OOP or functional programming there (the latter is much more popular in the Node environment, but I prefer the former if I am going to create functionality around a specific entity). Avoiding nesting and abstract functions (functions that you declare inside other functions) is crucial to creating reusable and readable code.
Here is a working prototype (not tested). It makes a puppeteer instance reusable. The module exposes three methods: start, stop, crawlWeb.
'use strict'

const puppeteer = require('puppeteer');
const cheerio = require('cheerio');
const request = require('request');

var browser;

async function crawlWeb(options) {
  // validate options and throw errors
  if (!options.url) {
    throw new Error('url is invalid');
  }
  if (!browser) {
    throw new Error('puppeteer is not started');
  }
  const page = await browser.newPage();
  await page.goto(options.url);
  const content = await page.content();
  const $ = cheerio.load(content);
  const metas = extractMetadata($);
  for (let metadata of metas) {
    // you can verify if the site is valid
    // you can use await
    try {
      await doRequest(metadata.url);
    }
    catch (err) {
      // do something if not valid
    }
  }
  return metas;
}

async function start(options) {
  browser = await puppeteer.launch(options);
}

async function stop() {
  if (!browser) {
    throw new Error('puppeteer is not started');
  }
  // closing the browser also closes its pages
  await browser.close();
}

function extractMetadata($) {
  const metas = [];
  $('td[itemprop="subjectOf"]').first().each(function (i, element) {
    const url = ($(this).find('a.title').attr('href'));
    const title = ($(this).find('a.title').text());
    var metadata = {
      url,
      title
    };
    metas.push(metadata);
  });
  return metas;
}

function doRequest(url) {
  return new Promise(function (resolve, reject) {
    request(url, function (error, res, body) {
      if (!error) {
        resolve(getPathFromUrl(res.request.uri.href));
      } else {
        reject(error);
      }
    });
  });
}

function getPathFromUrl(url) {
  return url.split(/[?#]/)[0];
}

module.exports = {
  crawlWeb,
  start,
  stop
};
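A hypothetical usage of this module (the ./crawler filename and launch options are assumptions):
// assumes the module above is saved as crawler.js
const crawler = require('./crawler');

(async () => {
  await crawler.start({ headless: true });
  try {
    const metas = await crawler.crawlWeb({ url: 'https://www.example.com/?q=async+urls&s=s' });
    console.log(metas);
  } finally {
    await crawler.stop();
  }
})().catch(console.error);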

How to read files present in array nodejs

I would like to know how to read the files and search for the keyword "sample" in nodejs.
If the keyword is found, display the path.
const allfiles = [
  'C:\\Users\\public',
  'C:\\Users\\public\\images',
  'C:\\Users\\public\\javascripts\\index1.js',
  'C:\\Users\\public\\javascripts\\index2.js'
]
const readFile = (path, opts = 'utf8') =>
  new Promise((resolve, reject) => {
    try {
      let result = [];
      fs.readFile(path, opts, (err, data) => {
        if (err) reject(err)
        else {
          if (data.indexOf("sample") >= 0) {
            result.push(data);
            resolve(result);
          }
        }
      })
    }
    catch (e) {
      console.log("e", e);
    }
  })
const run = async () => {
  allfiles.forEach(e => {
    const s = await readFile(e);
    console.log(s);
  })
}
run();
Expected Output:
[
  'C:\\Users\\public\\javascripts\\index1.js',
  'C:\\Users\\public\\javascripts\\index2.js'
]
Some tips:
What happens when "sample" isn't found in readFile?
You're currently pushing the data into result instead of the path.
Think about what you're trying to accomplish with readFile. To me, what you want to do is check whether the file contains the word "sample", returning true if so and false if not. So I'd name the function checkIfFileHasSample and have it return a boolean. Then in your run function, in the forEach, you have the path, so that is where I'd add the path to a list of results.
Also note that the forEach callback isn't declared async, so the await inside it is a syntax error; as written, run() can't execute at all.
Solution:
You had some syntax errors and a tricky gotcha with async/await in run. As for the syntax errors, catching them comes with experience, but I'd also recommend using ESLint to help you catch them, as well as making sure your code is always properly indented.
const fs = require("fs");

const allFiles = [
  "C:\\Users\\public",
  "C:\\Users\\public\\images",
  "C:\\Users\\public\\javascripts\\index1.js",
  "C:\\Users\\public\\javascripts\\index2.js",
];

const checkIfFileHasSample = (path, opts = "utf8") =>
  new Promise((resolve, reject) => {
    fs.readFile(path, opts, (err, data) => {
      if (err) {
        reject(err);
      } else {
        if (data.includes("sample")) {
          resolve(true);
        } else {
          resolve(false);
        }
      }
    });
  });

const run = async () => {
  const results = [];
  for (let i = 0; i < allFiles.length; i++) {
    try {
      const file = allFiles[i];
      const hasSample = await checkIfFileHasSample(file);
      if (hasSample) {
        results.push(file);
      }
    } catch (e) {
      console.log(e);
    }
  }
  console.log(results);
};
run();
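If the order of reads doesn't matter, the loop can also run in parallel; a minimal sketch, reusing allFiles and checkIfFileHasSample from above:
const runParallel = async () => {
  const checks = allFiles.map(async (file) => {
    try {
      return (await checkIfFileHasSample(file)) ? file : null;
    } catch (e) {
      return null; // skip unreadable paths (e.g. directories)
    }
  });
  // wait for every check, then keep only the matching paths
  const results = (await Promise.all(checks)).filter(Boolean);
  console.log(results);
};
runParallel();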

Upgrade .then .catch to async await and try catch

I'm trying to upgrade this code for better maintainability. It uploads two images to a server. I know it's possible to get rid of those .catch blocks by applying async/await functions and try/catch blocks, but it's pretty confusing for me. Any help will be appreciated.
this._uploadService.makeFileRequest(Global.url + "/upload-image1/" + response.product._id, [], this.filesToUpload1, 'image')
  .then((result: Product) => {
    this.filesToUpload1 = null;
    this._uploadService.makeFileRequest(Global.url + "/upload-image/" + response.product._id, [], this.filesToUpload, 'image')
      .then((result: Product) => {
        this.filesToUpload = null;
        setTimeout(() => this._router.navigate(['/detail', this.saveProduct._id]), 800);
      })
      .catch(err => {
        console.log(err);
        this._router.navigate(['/detail', this.saveProduct._id]);
      });
  })
  .catch(err => {
    console.log(err);
    this._router.navigate(['/detail', this.saveProduct._id]);
  });
I suggest using pen and paper to draw a block diagram of the logic involved, i.e. which API gets called first, with what kind of data, and which API comes afterwards; also include any logical conditionals through branching.
After that, you should attempt to write something like
const aggregateFunction = async () => {
  try {
    const someResponse = await callFirstApi(); // return response
    await callSecondApi(someResponse); // use the response of the first api for the second api
    if (someConditional) {
      await callThirdApi(); // response not returned (i.e. when not required)
    }
  } catch (error) { // catches all errors from all awaits if they're not within another try-catch
    console.log(error);
  }
}
This pattern should eliminate all then and catch blocks. If you need more specific error handling for, say, one particular API call, wrap that call inside another try/catch block, but everything should still be within the outer try/catch so that all errors are caught regardless.
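For instance, a minimal sketch of that nested pattern, reusing the hypothetical API names from above:
const aggregateWithSpecificHandling = async () => {
  try {
    const someResponse = await callFirstApi();
    try {
      // specific handling for just this call
      await callSecondApi(someResponse);
    } catch (secondApiError) {
      console.log('second api failed, continuing:', secondApiError);
    }
    await callThirdApi(); // errors here still reach the outer catch
  } catch (error) {
    console.log(error); // catch-all for everything not handled above
  }
};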
this._uploadService.makeFileRequest = function() {
  return new Promise(resolve => {
    // do logic of file request
    resolve(true);
  });
};

// arrow function, so `this` refers to the surrounding component
const waitForTime = () => {
  return new Promise(resolve => {
    setTimeout(() => {
      this._router.navigate(['/detail', this.saveProduct._id]);
      resolve(true);
    }, 800);
  });
};

const f = async () => {
  try {
    await this._uploadService.makeFileRequest(Global.url + "/upload-image1/" + response.product._id, [], this.filesToUpload1, 'image');
    this.filesToUpload1 = null; // plain assignment; nothing to await here
    await this._uploadService.makeFileRequest(Global.url + "/upload-image/" + response.product._id, [], this.filesToUpload, 'image');
    this.filesToUpload = null;
    await waitForTime();
  }
  catch (e) {
    // error logic
  }
};

if (this.filesToUpload1 && this.filesToUpload) {
  f();
}
This might be another, cleaner approach with async/await and Promises.

How to write an async function that resolves when `data` event emitter fires

I am using node-serialport to communicate with a piece of hardware. It just writes a command and receives a response.
https://serialport.io/docs/en/api-parsers-overview
The following code works:
const port = new SerialPort(path);
const parser = port.pipe(new Readline({ delimiter: '\r', encoding: 'ascii' }));
const requestArray = [];

parser.on('data', (data) => {
  // get first item in array
  const request = requestArray[0];
  // remove first item
  requestArray.shift();
  // resolve promise
  request.promise.resolve(data);
});

export const getFirmwareVersion = async () => {
  let resolvePromise;
  let rejectPromise;
  const promise = new Promise((resolve, reject) => {
    resolvePromise = resolve;
    rejectPromise = reject;
  });
  const title = 'getFirmwareVersion';
  const cmd = 'V\r';
  requestArray.push({
    title,
    cmd,
    promise: {
      resolve: resolvePromise,
      reject: rejectPromise
    }
  });
  await v2Port.write(cmd);
  return promise;
};
Then from my app (which is written in electron/react) I can call the function:
<Button onClick={async () => {
  let data = await _api.getFirmwareVersion();
  console.log('done waiting...');
  console.log(data);
}}>
  Click Me
</Button>
Is there any way I can refactor this code to make it more succinct?
Is there a way to get the Promise from the async function, rather than having to make a new Promise?
Is there a way to tap into the Transform Stream that already exists and pipe the Promise in there somehow?
I'm also new to async/await, and wanted to avoid using callbacks, especially in the React/Redux side of things.
I aim to have a lot of these endpoints for the api (i.e. getFirmwareVersion, getTemperature, etc...). So I want to make the code as concise as possible. I don't want the UI to have any underlying knowledge of how the API is getting the data. It just needs to request it like any other API and wait for a response.
Oh, I think I get it. The parser is receiving data constantly. So when a request comes, you wait for the next data event and hand the response back when it arrives. I suggest you write an intermediate class.
Like this:
const SerialPort = require('serialport');
const Readline = require('@serialport/parser-readline');
const { EventEmitter } = require('events');

class SerialPortListener extends EventEmitter {
  constructor(path) {
    super();
    this.serialPortPath = path;
  }
  init() {
    this.serialPort = new SerialPort(this.serialPortPath);
    const parser = this.serialPort.pipe(new Readline({ delimiter: '\r', encoding: 'ascii' }));
    parser.on('data', data => this.emit('data', data));
  }
}
Then you could modify the getFirmwareVersion like this:
const serialPortListener = new SerialPortListener(path);
serialPortListener.init();

export const getFirmwareVersion = () => {
  return new Promise((resolve, reject) => {
    // arm the listener first so the reply can't be missed,
    // then send the command
    serialPortListener.once('data', data => resolve(data));
    const cmd = 'V\r';
    serialPortListener.serialPort.write(cmd, (ex) => {
      if (ex) reject(ex);
    });
  });
};
Based on help from Mehmet, here is what I ended up with:
const _port = new SerialPort(path);
const _parser = _port.pipe(new Readline({ delimiter: '\r', encoding: 'ascii' }));

const waitForData = async () => {
  return new Promise((resolve, reject) => {
    const timeoutId = setTimeout(() => reject('Write Timeout'), 500);
    _parser.once('data', (data) => {
      clearTimeout(timeoutId);
      resolve(data);
    });
  });
};

const createAPIFunction = (cmdTemplate, validationString) => {
  return async (config) => {
    // replace {key} in template with config[key] props
    const cmd = cmdTemplate.replace(/{(\w+)}/g, (_, key) => {
      return config[key];
    });
    _port.write(cmd + '\r');
    const data = await waitForData();
    // validate data
    if (data.startsWith(validationString)) {
      // is valid
      return data;
    } else {
      // invalid data
      throw new Error('Invalid Data Returned');
    }
  };
};
export const getFirmwareVersion = createAPIFunction('V', 'V1');
export const enableSampling = createAPIFunction('G1{scope}', 'G11');
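A hypothetical usage of these generated endpoints (the {scope} value and the logging are just illustrations):
(async () => {
  try {
    const version = await getFirmwareVersion();          // sends 'V\r', expects a reply starting with 'V1'
    const sampling = await enableSampling({ scope: 1 }); // 'G1{scope}' expands to 'G11'
    console.log({ version, sampling });
  } catch (err) {
    console.log(err); // 'Write Timeout' or 'Invalid Data Returned'
  }
})();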

How to add server-side delay to Javascript for loop?

I'm fiddling around with using Node.js to scrape data from an e-commerce site. I use Request to retrieve the DOM of the page and Cheerio to do server-side DOM selection.
const cheerio = require('cheerio');
const request = require('request');

// takes a URL, scrapes the page, and returns an object with the data
let scrapePage = (url) => {
  return new Promise((resolve, reject) => {
    request(url, (error, resp, body) => {
      if (error) {
        reject(error);
      };
      let $ = cheerio.load(body);
      let $url = url;
      let $price = $('#rt-mainbody > div > div.details > div.Data > div:nth-child(4) > div.description').text();
      let obj = {
        url: $url,
        price: $price
      }
      resolve(obj);
    });
  });
};

// Runs scrapePage in a loop
// There is a variable called arrayOfURLs defined elsewhere that contains 100s of URLs
for (let i = 0; i < arrayOfURLs.length; i++) {
  scrapePage(arrayOfURLs[i])
    .then((obj) => {
      //write to a file
    })
    .catch((error) => {
    });
};
The problem is that the server I send requests to sometimes sends back blank data, I'm assuming because I'm sending too many requests without any kind of pause. Due to the async nature of JS, I'm having a hard time figuring out how to add an effective delay between each iteration of the loop. It's not enough to just add a setTimeout in a synchronous fashion, because setTimeout itself is async, and I'm running this on the server so there's no Window object.
EDIT
The code above is a simplified version of what I'm working on. The entire code is this:
app.js
const fs = require('fs');
const path = 'urls.txt';
const path2 = 'results.txt';
const scraper = require('./scraper');

let scrapePage = (url) => {
  scraper.scrapePage(url)
    .then((obj) => {
      // console.log('obj from the scraper with Promises was received');
      // console.log(obj);
      // console.log('writing obj to a file');
      fs.appendFile(path2, JSON.stringify(obj) + ', ', (error) => {
        if (error) {
          console.log(error);
        } else {
          // console.log('Successfully wrote to ' + path2);
        }
      });
    })
    .catch((error) => {
      console.log('There was an error scraping obj: ');
      console.log(error);
    });
};

fs.readFile(path, 'utf8', (err, data) => {
  if (err) {
    throw err;
  };
  var urlArray = JSON.parse(data);
  // this returns an Unexpected Identifier error
  // (await is only valid inside an async function)
  // const results = await Promise.all(urlArray.map(scrapePage));
  // this returns an Unexpected Token Function error
  // (likely an older Node version without async function support)
  // async function scrapePages(){
  //   const results = await Promise.all(urlArray.map(scrapePage));
  // };
});
scraper.js
const request = require('request');
const cheerio = require('cheerio');

exports.scrapePage = (url) => {
  return new Promise((resolve, reject) => {
    request(url, (error, resp, body) => {
      if (error) {
        reject(error);
      };
      let $ = cheerio.load(body);
      let $url = url;
      let $price = $('#rt-mainbody > div > div.details > div.itemData > div:nth-child(4) > div.description').text();
      let obj = {
        url: $url,
        price: $price
      }
      resolve(obj);
    });
  });
};
Looks to me like you aren't waiting for your promises to resolve before you send the server response. You could completely eliminate the for loop using async/await, e.g.
const results = await Promise.all(arrayOfURLs.map(scrapePage));
If you want to have no more than x active connections, you could use throttle. Or if you want no more than x per second, you could use throttlePeriod. (A sketch of such a throttle helper follows after the code.)
Using Promise.all will never call your resolve handler if even one request fails, so you could catch any errors and return a Fail object instead.
const Fail = function(details) { this.details = details; };
const max10 = throttle(10)(scrapePage); // max 10 active connections
// const twoPerSecond = throttlePeriod(2, 1000)(scrapePage); // start no more than 2 per second

Promise.all(
  arrayOfURLs.map(
    url =>
      max10(url)
        .catch(err => new Fail([err, url]))
  )
)
.then(
  results => {
    const successes =
      results.filter(
        result => (result && result.constructor) !== Fail
      );
    const failed =
      results.filter(
        result => (result && result.constructor) === Fail
      );
  }
);
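For completeness, a minimal sketch of what a throttle helper like the one referenced above might look like; the answer doesn't show its implementation, so this is an assumption:
// returns a wrapper around fn that allows at most `max` in-flight calls
const throttle = (max) => (fn) => {
  let active = 0;
  const queue = [];
  const next = () => {
    if (active >= max || queue.length === 0) return;
    active++;
    const { args, resolve, reject } = queue.shift();
    fn(...args)
      .then(resolve, reject)
      .finally(() => { active--; next(); });
  };
  return (...args) => new Promise((resolve, reject) => {
    queue.push({ args, resolve, reject });
    next();
  });
};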
const cheerio = require('cheerio');
const request = require('request');

let scrapePage = (url) => {
  return new Promise((resolve, reject) => {
    request(url, (error, resp, body) => {
      if (error) {
        reject(error);
        return;
      };
      if (!body) {
        reject('Empty Body');
        return;
      }
      let $ = cheerio.load(body);
      let $url = url;
      let $price = $('#rt-mainbody > div > div.details > div.Data > div:nth-child(4) > div.description').text();
      let obj = {
        url: $url,
        price: $price
      }
      resolve(obj);
    });
  });
};

function processUrl(url) {
  scrapePage(url)
    .then((obj) => {
      //write to a file
      if (arrayOfURLs.length > 0)
        processUrl(arrayOfURLs.pop());
    })
    .catch((error) => {
      arrayOfURLs.unshift(url); // requeue the failed URL
      if (arrayOfURLs.length > 0) // could also go in a finally block
        processUrl(arrayOfURLs.pop());
    });
};
processUrl(arrayOfURLs.pop());
Here we can use the arrayOfURLs array as a queue; if we receive an error or a blank page, we push that URL onto the array again. That way we can process every URL in a sequential fashion.
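To add an actual pause between requests (the original question), the recursive queue above can be combined with a promisified setTimeout; a minimal sketch, with an arbitrary one-second delay:
// promisified delay
const delay = (ms) => new Promise((resolve) => setTimeout(resolve, ms));

async function processAllSequentially(urls, pauseMs = 1000) {
  const results = [];
  for (const url of urls) {
    try {
      results.push(await scrapePage(url));
    } catch (error) {
      console.log('Failed to scrape ' + url + ': ' + error);
    }
    await delay(pauseMs); // pause before the next request
  }
  return results;
}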
