How to avoid - Error 403 while web scraping using cheerio - javascript

I'm web scraping a website and I have an array of links:
'http://www.adventistdirectory.org/SearchResults.aspx?CtryCode=CA&StateProv=QC&City=Abercorn',
'http://www.adventistdirectory.org/SearchResults.aspx?CtryCode=CA&StateProv=QC&City=Longueuil',
'http://www.adventistdirectory.org/SearchResults.aspx?CtryCode=CA&StateProv=QC&City=Sainte-Anne-De-Bellevue',
'http://www.adventistdirectory.org/SearchResults.aspx?CtryCode=CA&StateProv=QC&City=Shawinigan',
'http://www.adventistdirectory.org/SearchResults.aspx?CtryCode=CA&StateProv=QC&City=Chateauguay',
'http://www.adventistdirectory.org/SearchResults.aspx?CtryCode=CA&StateProv=QC&City=Mont-Laurier',
'http://www.adventistdirectory.org/SearchResults.aspx?CtryCode=CA&StateProv=QC&City=Saint-Georges',
'http://www.adventistdirectory.org/SearchResults.aspx?CtryCode=CA&StateProv=QC&City=Sherbrooke',
'http://www.adventistdirectory.org/SearchResults.aspx?CtryCode=CA&StateProv=QC&City=Chicoutimi',
'http://www.adventistdirectory.org/SearchResults.aspx?CtryCode=CA&StateProv=QC&City=Montreal',
'http://www.adventistdirectory.org/SearchResults.aspx?CtryCode=CA&StateProv=QC&City=Saint-Henri-De-Levis',
'http://www.adventistdirectory.org/SearchResults.aspx?CtryCode=CA&StateProv=QC&City=Stukely-Sud',
'http://www.adventistdirectory.org/SearchResults.aspx?CtryCode=CA&StateProv=QC&City=Drummondville',
'http://www.adventistdirectory.org/SearchResults.aspx?CtryCode=CA&StateProv=QC&City=Montreal-Est',
'http://www.adventistdirectory.org/SearchResults.aspx?CtryCode=CA&StateProv=QC&City=Saint-Hubert',
'http://www.adventistdirectory.org/SearchResults.aspx?CtryCode=CA&StateProv=QC&City=Trois-Rivieres',
'http://www.adventistdirectory.org/SearchResults.aspx?CtryCode=CA&StateProv=QC&City=Gatineau',
'http://www.adventistdirectory.org/SearchResults.aspx?CtryCode=CA&StateProv=QC&City=Montreal-Nord',
'http://www.adventistdirectory.org/SearchResults.aspx?CtryCode=CA&StateProv=QC&City=Saint-Jerome',
"http://www.adventistdirectory.org/SearchResults.aspx?CtryCode=CA&StateProv=QC&City=Val-D'or",
'http://www.adventistdirectory.org/SearchResults.aspx?CtryCode=CA&StateProv=QC&City=Granby',
'http://www.adventistdirectory.org/SearchResults.aspx?CtryCode=CA&StateProv=QC&City=Montreal-Ouest',
'http://www.adventistdirectory.org/SearchResults.aspx?CtryCode=CA&StateProv=QC&City=Saint-Lambert',
'http://www.adventistdirectory.org/SearchResults.aspx?CtryCode=CA&StateProv=QC&City=Verdun',
'http://www.adventistdirectory.org/SearchResults.aspx?CtryCode=CA&StateProv=QC&City=Lachine',
'http://www.adventistdirectory.org/SearchResults.aspx?CtryCode=CA&StateProv=QC&City=Quebec',
'http://www.adventistdirectory.org/SearchResults.aspx?CtryCode=CA&StateProv=QC&City=Saint-Laurent',
'http://www.adventistdirectory.org/SearchResults.aspx?CtryCode=CA&StateProv=QC&City=Warwick',
'http://www.adventistdirectory.org/SearchResults.aspx?CtryCode=CA&StateProv=QC&City=Lasalle',
'http://www.adventistdirectory.org/SearchResults.aspx?CtryCode=CA&StateProv=QC&City=Rigaud',
'http://www.adventistdirectory.org/SearchResults.aspx?CtryCode=CA&StateProv=QC&City=Saint-Leonard',
'http://www.adventistdirectory.org/SearchResults.aspx?CtryCode=CA&StateProv=QC&City=Westmount',
'http://www.adventistdirectory.org/SearchResults.aspx?CtryCode=CA&StateProv=QC&City=Laval',
'http://www.adventistdirectory.org/SearchResults.aspx?CtryCode=CA&StateProv=QC&City=Roxboro'
But when I do the request some of those links return error 403 - Forbidden.
Error null Forbidden http://www.adventistdirectory.org/SearchResults.aspx?CtryCode=CA&StateProv=QC&City=Verdun
Error null Forbidden http://www.adventistdirectory.org/SearchResults.aspx?CtryCode=CA&StateProv=QC&City=Granby
Error null Forbidden http://www.adventistdirectory.org/SearchResults.aspx?CtryCode=CA&StateProv=QC&City=Saint-Lambert
Error null Forbidden http://www.adventistdirectory.org/SearchResults.aspx?CtryCode=CA&StateProv=QC&City=Val-D'or
Error null Forbidden http://www.adventistdirectory.org/SearchResults.aspx?CtryCode=CA&StateProv=QC&City=Lasalle
Error null Forbidden http://www.adventistdirectory.org/SearchResults.aspx?CtryCode=CA&StateProv=QC&City=Laval
Error null Forbidden http://www.adventistdirectory.org/SearchResults.aspx?CtryCode=CA&StateProv=QC&City=Warwick
Error null Forbidden http://www.adventistdirectory.org/SearchResults.aspx?CtryCode=CA&StateProv=QC&City=Quebec
Error null Forbidden http://www.adventistdirectory.org/SearchResults.aspx?CtryCode=CA&StateProv=QC&City=Westmount
Error null Forbidden http://www.adventistdirectory.org/SearchResults.aspx?CtryCode=CA&StateProv=QC&City=Roxboro
Error null Forbidden http://www.adventistdirectory.org/SearchResults.aspx?CtryCode=CA&StateProv=QC&City=Saint-Laurent
Error null Forbidden http://www.adventistdirectory.org/SearchResults.aspx?CtryCode=CA&StateProv=QC&City=Rigaud
Error null Forbidden http://www.adventistdirectory.org/SearchResults.aspx?CtryCode=CA&StateProv=QC&City=Saint-Leonard
Error null Forbidden http://www.adventistdirectory.org/SearchResults.aspx?CtryCode=CA&StateProv=QC&City=Lachine
When I use a list with fewer links it works perfectly.
Here is my code:
const request = require('request');
const cheerio = require('cheerio');
function readChurches(cities){
const churches = []
for (let index = 0; index < cities[0].length; index++){
const city = cities[0][index];
churches.push(new Promise((resolve, reject) => {
const church = []
let options = {
url: city,
headers: {
'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'
}
};
request(options, (error, response, html) => {
if(!error && response.statusCode == 200) {
const $ = cheerio.load(html);
const $$ = cheerio.load($('table').find('tbody').eq(1).find('tr').eq(1).find('td').eq(1).html())
$$('a').each((i, el) => {
const item = $(el).attr('href')
if(item != undefined){
if(item.includes('ViewEntity')) {
church.push(`http://www.adventistdirectory.org${item}`);
}
}
});
resolve(church);
} else {
console.log('Error',error,response.statusMessage,city)
reject(error)
}
});
}))
}
return Promise.all(churches);
}
What can be done to bypass error 403?. Because when I try to open the link on my browser it works, when I use the javascript function doesn't work though.
--- NEW UPDATES ---
I've changed to code. I added a try catch block
function readChurches(cities){
const churches = []
for (let index = 0; index < cities[0].length; index++){
const city = cities[0][index];
churches.push(new Promise((resolve, reject) => {
const church = []
let options = {
url: city,
headers: {
'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'
}
};
try {
request(options, (error, response, html) => {
if(!error && response.statusCode == 200) {
const $ = cheerio.load(html);
const $$ = cheerio.load($('table').find('tbody').eq(1).find('tr').eq(1).find('td').eq(1).html())
$$('a').each((i, el) => {
const item = $(el).attr('href')
if(item != undefined){
if(item.includes('ViewEntity')) {
church.push(`http://www.adventistdirectory.org${item}`);
}
}
});
resolve(church);
}
});
} catch (error) {
console.log('Error',error,city)
reject(error)
}
}))
}
return churches
}
and also, created this function, provided by #chrispytoes
async function doStuff(churches) {
const results = [];
for(let i in churches) {
try {
console.log(churches[i])
results.push(await churches[i]);
sleep(5000);
} catch (error) {
console.log(error)
}
}
return results
}
and I'm running it:
async function run(){
let provinces = []
provinces.push(`http://www.adventistdirectory.org/BrowseStateProv.aspx?CtryCode=CA&StateProv=QC`)
let cities = await readCities(provinces);
const churches = await readChurches(cities);
const stuff = await doStuff(churches)
console.log('Churches: ', stuff);
console.log('End')
} catch (error) {
console.log('Error', error)
}
}
And I'm getting this on my console:
Promise { <pending> }
Promise {
[
'http://www.adventistdirectory.org/ViewEntity.aspx?EntityID=19653'
]
}
Promise { <pending> }
Promise {
[
'http://www.adventistdirectory.org/ViewEntity.aspx?EntityID=19637'
]
}
Promise {
[
'http://www.adventistdirectory.org/ViewEntity.aspx?EntityID=54633'
]
}
Promise {
[
'http://www.adventistdirectory.org/ViewEntity.aspx?EntityID=31155'
]
}
Promise {
[
'http://www.adventistdirectory.org/ViewEntity.aspx?EntityID=15271'
]
}
Promise { <pending> }
Promise {
[
'http://www.adventistdirectory.org/ViewEntity.aspx?EntityID=30783'
]
}
Promise { <pending> }
Promise { <pending> }
Promise {
[
'http://www.adventistdirectory.org/ViewEntity.aspx?EntityID=15265'
]
}
Promise {
[
'http://www.adventistdirectory.org/ViewEntity.aspx?EntityID=15255'
]
}
Promise {
[
'http://www.adventistdirectory.org/ViewEntity.aspx?EntityID=15251'
]
}
Promise { <pending> }
Promise {
[
'http://www.adventistdirectory.org/ViewEntity.aspx?EntityID=15247'
]
}
Promise {
[
'http://www.adventistdirectory.org/ViewEntity.aspx?EntityID=19645'
]
}
Promise {
[
'http://www.adventistdirectory.org/ViewEntity.aspx?EntityID=32838'
]
}
Promise {
[
'http://www.adventistdirectory.org/ViewEntity.aspx?EntityID=29973'
]
}
Promise { <pending> }
it is not getting to console.log('Churches: ', stuff);

The error being sent back is the choice of the server you are making the request to, so there's no universal way to "avoid" it. You are probably making the requests too fast and they are blocking you for using too much bandwidth.
Using Promise.all is making all the requests at once. You need to make a loop of sorts to make the requests go one at a time.
So something like this may work:
const wait = async (time) =>
new Promise((res, rej) => setTimeout(() => res(), time));
async function doStuff() {
const results = [];
for(let i in churches) {
await wait(1000);
const result = await churches[i];
console.log(result);
results.push(result);
}
}

Related

How can I get value in [Object]

I'm working on coinmarketcap.com api and I need only name and price and save to Mongo. It is working but at the end undefined value returned. Price value is under the quote. What is wrong ?
const axios = require('axios');
let response = null;
new Promise(async (resolve, reject) => {
try {
response = await axios.get('https://pro-api.coinmarketcap.com/v1/cryptocurrency/listings/latest?limit=2', {
headers: {
'X-CMC_PRO_API_KEY': 'myKey',
},
});
} catch(ex) {
response = null;
console.log(ex);
reject(ex);
}
if (response) {
const coin = response.data;
console.dir(coin, { depth: null });
console.log(coin.data.map(a => { a.name, a.quote.price}));
resolve(coin);
}
}
);
You shuldn't be wrapping this in another Promise (see: What is the explicit promise construction antipattern and how do I avoid it?) , just make a method which is async
async function getCoin(){
try {
const response = await axios.get('https://pro-api.coinmarketcap.com/v1/cryptocurrency/listings/latest?limit=2', {
headers: {
'X-CMC_PRO_API_KEY': 'YOURKEY',
},
});
const coin = response.data;
console.dir(coin, { depth: null });
return coin;
} catch(ex) {
console.log(ex);
throw ex;
}
}
quote contains prices identified by their keys like this:
"quote": {
"USD": {
"price": 42177.91536878145,
"volume_24h": 18068350418.6717,
"volume_change_24h": 8.1323,
"percent_change_1h": -0.02144759,
"percent_change_24h": -0.48235688,
"percent_change_7d": -0.65364542,
"percent_change_30d": -1.87762517,
"percent_change_60d": -13.80076009,
"percent_change_90d": -30.24448455,
"market_cap": 799599134284.9932,
"market_cap_dominance": 42.7605,
"fully_diluted_market_cap": 885736222744.41,
"last_updated": "2022-02-14T09:35:00.000Z"
}
}
so in order to solve the bug, you need to specify the proper key here (USD for example):
I tested this code and it works please copy it inside the separate js file and check the result(Replace your API key):
const axios = require('axios');
let response = null;
async function getCoin(){
try {
const response = await axios.get('https://pro-api.coinmarketcap.com/v1/cryptocurrency/listings/latest?limit=2', {
headers: {
'X-CMC_PRO_API_KEY': 'KEY',
},
});
const coin = response.data;
const result = coin.data.map(a => ({ name: a.name, price: a.quote['USD'].price}))
console.log(result)
return coin;
} catch(ex) {
console.log(ex);
throw ex;
}
}
getCoin()
const axios = require("axios");
async function fetchData() {
try {
const response = await axios.get(
"https://pro-api.coinmarketcap.com/v1/cryptocurrency/listings/latest?limit=2",
{
headers: {
"X-CMC_PRO_API_KEY": "myKey",
},
}
);
return [response, null];
} catch (error) {
return [null, error];
}
}
// in your calling function code base
const [response,error] = await fetchData()
if(response){
// handle response object in calling code base
const coin = response.data
}
if(error){
// handle error explictly in calling code base
}
Getting price in quote.USD or quote['USD'] and also need to return object of name and price using map.
So Try this:
const axios = require('axios');
let response = null;
new Promise(async (resolve, reject) => {
try {
response = await axios.get('https://pro-api.coinmarketcap.com/v1/cryptocurrency/listings/latest?limit=2', {
headers: {
'X-CMC_PRO_API_KEY': '5345bcb4-0203-4374-bef6-d67a89a6d216',
},
});
if (response) {
const coin = response.data;
cointData = coin.data.map(a => ({ name: a.name, price: a.quote.USD.price }))
resolve(cointData);
}
} catch (ex) {
response = null;
console.log(ex);
reject(ex);
}
});

UnhandledPromiseRejectionWarning, how to fix it?

I don't know what I could be doing wrong to get this error:
I've tried different promises, await, async combinations and nothing.
I've tried Promise.resolve()) and also .then(function().
Nothing stopped that error, what can I change to fix it?
#Controller()
export class AppController {
constructor(private httpSoap: HttpClient,
#InjectModel('product') private readonly productModel: Model<any>,
private xmlUtils: XmlUtils) { }
#EventPattern("next")
async handleMessagePrinted(data: Record<any, any>) {
let result = data;
this.createproduct(result);
this.insertproduct(result);
}
insertproduct(data: any) {
stringify(data);
this.productModel.insertMany(data);
}
async createproduct(job: any): Promise<any> {
return new Promise((async (resolve, reject) => {
// if (this.soapService.productCreate) {
const payload = job;
const xmlPayload = this.xmlUtils.parseJSONtoXML(payload);
this.insertproduct(stringify(xmlPayload)); //gravar na mongo
console.log("xmlPayload "+xmlPayload);
const headerRequest = {
'Content-Type': ContentTypeEnum.XML,
SOAPAction: SoapActionEnum.CREATE_SERVICE_product
};
const soap: ResponseInterface = await this.request("localhost:8080", xmlPayload, headerRequest, SoapType.product);
if (soap.error) {
reject(soap.error);
}
if (soap.status) {
if (soap.status.firewall.code === '000-000' || soap.status.firewall.code === '000-001') {
resolve(`product ${soap.body.Number} created successfully`);
} else if (soap.status.firewall.code === '000-998' && soap.status.fireWall.code === '623') {
reject({ error: soap.status.fireWall.description });
} else if (soap.status.firewall.code === '000-500' && soap.status.fireWall.code === 'BWENGINE-100029') {
const payloadSearch: productSearchDocument = new productSearchDocument();
payloadSearch.IsOperational = undefined;
payloadSearch.IsHistory = undefined;
payloadSearch.Qualification = `id='${job.data.id_ID}'`;
const search = await this.searchproduct(payloadSearch);
if (search.status) {
if (search.status.firewall.code === '000-000' || search.status.firewall.code === '000-001') {
resolve(`product ${soap.body.Number} created successfully`);
}
} else {
reject({ error: search.status.firewall.description, fireWallError: soap.status.fireWall.description });
}
} else {
reject({ error: soap.status.firewall.description, fireWallError: soap.status.fireWall.description });
}
}
}));
}
public async searchproduct(data: any): Promise<any> {
return new Promise((async (resolve, reject) => {
// if (this.soapService.productSearch) {
const payload = data;
const xmlPayload = this.xmlUtils.parseJSONtoXML(payload);
const headerRequest = {
'Content-Type': ContentTypeEnum.XML,
SOAPAction: SoapActionEnum.SEARCH_SERVICE_product
};
const soap: ResponseInterface = await this.request("localhost:8080", xmlPayload, headerRequest, SoapType.product);
if (soap.error) {
reject(soap.error);
}
if (soap.status) {
if (soap.status.firewall.code === '000-000' || soap.status.firewall.code === '000-001') {
resolve(soap);
} else {
reject({ error: soap.status.fireWall.description });
}
} else {
reject({ error: soap });
}
}));
}
public request(uri: string, data: any, headers: IHeaders, type: SoapType): Promise<any> {
return new Promise(((resolve) => {
this.httpSoap.request(uri, data, (async (err, res) => {
if (err) {
resolve({ error: err });
} else {
try {
console.log("fireWall response: "+data);
const bodyJson = await this.xmlUtils.formatXmlToJson(res.body);
const status: StatusInterface = await this.xmlUtils.formatStatusXML(bodyJson);
let body;
if (type === SoapType.product) {
body = await this.xmlUtils.formatproductServiceBodyXml(bodyJson);
this.insertproduct(stringify(bodyJson)); //gravar na mongo
} else if (type === SoapType.UNAVAILABILITY) {
body = await this.xmlUtils.formatImpactServiceBodyToXML(bodyJson);
} else if (type === SoapType.TASK) {
body = await this.xmlUtils.formatTaskServiceBodyXML(bodyJson);
} else {
body = '';
}
const response: ResponseInterface = {
status,
body,
};
resolve(response);
} catch (e) {
resolve(e);
}
}
}), headers);
}));
}
public simpleRequest(connection, payload): Promise<any> {
return new Promise<any>((resolve, reject) => {
const headers = {
};
this.httpSoap.request("localhost:8080", payload, (async (err, res) => {
if (err) {
resolve({ error: err });
} else {
try {
if (res.statusCode === 500) {
// const bodyJson = await this.xmlUtils.formatXmlToJson(res.body);
resolve(res.body);
} else {
const bodyJson = await this.xmlUtils.formatXmlToJson(res.body);
resolve(bodyJson);
}
} catch (e) {
reject(e);
}
}
}), headers);
});
}
}
My goal is to be able to save to mongo and also be able to make the http call to the SOAP api
This warning is shown when you don't add a rejection handler to a Promise, and it's rejected. It can be rejected when an error is occurred inside a promise, or reject() is called.
reject called:
const aa = new Promise((resolve, reject) => {
reject(new Error('whoops'));
});
aa.then(v => {
console.log(v);
});
// Running this script gives unhandled rejection warning
an error is occurred:
const aa = new Promise((resolve, reject) => {
const a = {};
// "cannot read property 'unexistingProperty' of undefined" error is thrown here
const b = a.b.unexistingProperty;
// alternatively, when an is thrown with throw
// throw new Error('oops')
});
aa.then(v => {
console.log(v);
});
// Running this script also gives unhandled rejection warning
You can add a rejection handler via then (the second argument to then() is rejection handler) or catch. For async/await you can add a try/catch block to catch error.
In node.js you can also add rejection handler to all unhandled rejected promises with process.on('unhandledRejection') like this:
process.on('unhandledRejection', error => {
console.log(error);
});
You can also see where the error is thrown with unhandledRejection event handler shown above, or you can run node.js with --trace-warnings like this.
node --trace-warnings index.js
References:
https://thecodebarbarian.com/unhandled-promise-rejections-in-node.js.html
https://nodejs.org/dist/latest-v14.x/docs/api/process.html

AWS Lambda retries despite returning value

I have found out that my and my colleagues lambda doesn't return data we anticipate.
After what I have already found, we used deprecated invokeAsync which returnes only statusCode.
I have upgraded aws-sdk to the newest version and change the invocation code to use invoke with
InvocationType: 'RequestResponse'
as that should give us the returned value.
Lambdas code itself looks good, its an async function without arguments.
Im not using the callback here (3rd argument of handler), as we are doing some async stuff and it would require a small refactor to not use async/await, also i have read that just returning value is ok as well.
Earlier lambda was returning array, but after some investigation and googling I have changed it to
return {
statusCode: 200,
body: JSON.stringify(result)
}
where result is the mentioned array, as this was recurring pattern in search results.
Overall result is in our service where we do the invocation, we get timeout, lambda logs that it is returning value, but then retries itself again and again.
Dumping code
invocation in our service:
const lambda = new AWS.Lambda({ httpOptions: { timeout: 600000 } })
const results = await new Promise((resolve, reject) => {
lambda.invoke(
{
FunctionName: `lambda-name`,
InvocationType: 'RequestResponse',
Payload: '""'
},
(err, data) => {
if (err) {
reject(err)
} else {
resolve(data)
}
}
)
})
lambda handler
import { parallelLimit } from 'async'
const PARALLEL_FETCHERS_NUMBER = 2
export async function runTasksInParallel (
tasks,
limit,
) {
return new Promise(
(
resolve,
reject,
) => {
parallelLimit(tasks, limit, (error, results) => {
if (error === null) {
resolve(results)
} else {
reject(error)
}
})
},
)
}
async function connectToDB(logger) {
const MONGO_URL = process.env.MONGO_URL || 'mongodb://localhost:27017/'
let db
logger.debug('Connecting to mongo')
try {
db = await MongoClient.connect(
MONGO_URL,
{
sslValidate: false,
}
)
} catch (error) {
logger.error('Could not connect to Mongo', { error })
throw error
}
logger.debug('Connected')
return db
}
async function fetchAndStore(
list,
sources,
logger
) {
const results = []
const tasks = sources.map((source) => async (callback) => {
const { fetchAdapter, storageAdapter } = source
try {
const { entries, timestamp } = await fetchAdapter.fetchLatest(list)
await storageAdapter.storeMany(timestamp, entries)
results.push(['fetch.success', 1, [ `fetcher:${fetchAdapter.name}` ] ])
} catch (error) {
const errorMessage = `Failed to fetch and store from adapter ${fetchAdapter.name}`
const { message, name, stack } = error
logger.error(errorMessage, { error: { message, name, stack } })
results.push(['fetch.error', 1, [ `fetcher:${fetchAdapter.name}` ] ])
}
callback && callback(null)
})
await runTasksInParallel(tasks, PARALLEL_FETCHERS_NUMBER)
return results
}
export async function handle() {
const logger = createLambdaLogger() // just a wrapper for console.log to match interface in service
logger.debug('Starting fetching data')
const db: Db = await connectToDB(logger)
const primaryCollection = db.collection(PRIMARY_COLLECTION)
const itemsColletion = db.collection(ITEMS_COLLECTION)
const secondaryCollection = db.collection(SECONDARY_PRICING_COLLECTION)
const primaryStorageAdapter = new StorageAdapter(
Promise.resolve(primaryCollection),
logger
)
const itemsStorageAdapter = new ItemsStorageAdapter(
Promise.resolve(itemsColletion),
logger
)
const secondaryStorageAdapter = new StorageAdapter(
Promise.resolve(secondaryCollection),
logger
)
const primaryFetchAdapter = new PrimaryFetchAdapter(getFetcherCredentials('PRIMARY'), logger)
const secondaryFetchAdapter = new SecondaryFetchAdapter(getFetcherCredentials('SECONDARY'), logger)
const sources = [
{ fetchAdapter: primaryFetchAdapter, storageAdapter: primaryStorageAdapter },
{ fetchAdapter: secondaryFetchAdapter, storageAdapter: secondaryStorageAdapter },
]
try {
const list = await itemsStorageAdapter.getItems()
logger.debug(`Total items to fetch ${list.length}`)
const result = await fetchAndStore(list, sources, logger)
logger.debug('Returning: ', { result })
return {
statusCode: 200,
body: JSON.stringify(result)
}
} catch (error) {
const errorMessage = 'failed to do task'
const { message, name, stack } = error
logger.error(errorMessage, { error: { message, name, stack } })
return {
statusCode: 500,
body: JSON.stringify(new Error(errorMessage))
}
} finally {
await db.close()
}
}
Edit:
I have changed the way i'm invoking the lambda to this below. Basically I tried to listen for every possible event that can say me something. Result is that i get a error event, with response message being Converting circular structure to JSON. Thing is I don't really know about which structure this is, as the value I'm returning is a two elements array of 3 elements (strings and number) arrays.
const lambda = new AWS.Lambda({
httpOptions: { timeout: 660000 },
maxRetries: 0
})
const request = lambda.invoke(
{
FunctionName: `${config.get('env')}-credit-rubric-pricing-fetching-lambda`,
InvocationType: 'RequestResponse',
Payload: '""'
},
(err, data) => {
if (err) {
reject(err)
} else {
resolve(data)
}
}
)
const results = await request
.on('send', () => {
logger.debug('Request sent to lambda')
})
.on('retry', response => {
logger.debug('Retrying.', { response })
})
.on('extractError', response => {
logger.debug('ExtractError', { response })
})
.on('extractData', response => {
logger.debug('ExtractData', { response })
})
.on('error', response => {
logger.debug('error', { response })
})
.on('succes', response => {
logger.debug('success', { response })
})
.on('complete', response => {
logger.debug('complete', { response })
})
.on('httpHeaders', (statusCode, headers, response, statusMessage) => {
logger.debug('httpHeaders', { statusCode, headers, response, statusMessage })
})
.on('httpData', (chunk, response) => {
logger.debug('httpData', { response, chunk })
})
.on('httpError', (error, response) => {
logger.debug('httpError', { response, error })
})
.on('httpDone', response => {
logger.debug('httpDone', { response })
})
.promise()

Nock throws a no match for request when using Promise.race

I'm writing a test for some code that will use Promise.race to bring back a result from a graphql service that is on (could be on) multiple servers. I've used Nock to mock the request, which works fine when I'm hitting a single service. When I mock up multiple services, Nock throws an error saying
AssertionError: expected [Function] to not throw an error but 'Error: Error: Nock: No match for request {\n "method": "POST",\n "url": "http://94.82.155.133:35204",\n "headers": {\n "content-type": "application/json",\n "accept": "application/json"\n },\n "body": "{...}"\n}' was thrown
my test looks like this:
it('should make two POST requests to the service for data from graphQL', async () => {
const spy = sinon.spy(releases, '_queryGraphQL');
const releaseID = 403615894;
nock.cleanAll();
const services = serviceDetails(NUMBER_OF_SERVICES); // NUMBER_OF_SERVICES = 3
nock(serviceDiscoveryHost)
.get('/v1/catalog/service/state51')
.reply(HTTP_CODES.OK, services);
for (const service of services) {
const currentNodeHealth = nodeHealth(service.Node);
nock(serviceDiscoveryHost)
.get('/v1/health/node/'+service.Node)
.reply(HTTP_CODES.OK, currentNodeHealth);
const delayTime = Math.floor(Math.random()*1000);
nock('http://'+service.Address+':'+service.ServicePort, serviceHeaders)
.post('/')
.delay(delayTime)
.replyWithError({code: 'ETIMEDOUT', connect: false})
.post('/')
.delay(delayTime)
.reply(HTTP_CODES.OK, getReply(releaseID))
}
const actual = await releases.getRelease(releaseID)
.catch((err) => {
console.log(releases._retries);
(() => { throw err; }).should.not.throw();
});
expect(releases._retries[releaseID]).to.be.equal(1);
expect(spy.callCount).to.be.equal(2);
expect(actual).to.be.an('object')
expect(actual.data.ReleaseFormatById.id).to.be.equal(releaseID);
});
and the offending bit of code looks like
async _queryGraphQL(releaseID, services) {
if (! this._retries[releaseID]) {
this._retries[releaseID] = 0;
}
const postData = this._getReleaseQuery(releaseID);
return Promise.race(services.map( (service) => {
const options = this._getHTTPRequestOptions(service);
return new Promise((resolve, reject) => {
let post = this.http.request(options, (res) => {
let data = '';
if (res.statusCode < 200 || res.statusCode > 299) {
const msg = this.SERVICE_NAME + ' returned a status code outside of acceptable range: ' + res.statusCode;
reject(new QueryError(msg, postData));
} else {
res.setEncoding('utf8');
res.on('data', (chunk) => {
data += chunk;
});
res.on('error', (err) => {
reject(new QueryError(err.message, postData, err));
});
res.on('end', () => {
resolve(JSON.parse(data));
});
}
});
post.on('error', async (err) => {
if (err.code === 'ETIMEDOUT') {
if (this._retries[releaseID] &&
this._retries[releaseID] === 3) {
reject(err);
} else {
this._retries[releaseID] += 1;
resolve(this._queryGraphQL(releaseID, services));
}
} else {
reject(new QueryError(err.message, postData, err));
}
});
post.write(JSON.stringify(postData));
post.end();
});
}));
}
this.http is just require('http');. and the options will be {hostname: service.hostname} \\ example.com etc.
What I'm expecting, is that if the first service to respond, responds with an error relating to: 'ETIMEDOUT', it'll recall the function (upto 2 more times) and try all the services again until the first service to respond is something that isn't a 'ETIMEDOUT'.

Recursive loop with promises crawler NodeJS

I'm trying to use Recursive Loop and Promises to Scrape a website.
But it fails.. It make the request only for the first page and at the second the program stops giving to me unhandled promise rejection warning
I have this three JS files:
scrapeAll.js (is the recursive loop that calls scrapePage.js)
scrapePage.js
scrapeComponents.js
scrapeAll.js:
var indexPage = 0;
scrapePage(indexPage).then((json)=>{
console.log(JSON.stringify(json, null, 4));
if(indexPage === Number.MAX_SAFE_INTEGER){
console.log("MAX SAFE INTEGER");
return;
}
save(json);
indexpage++;
scrapePage(indexPage);
}).catch((data)=>{
console.log(data);
if(indexPage === Number.MAX_SAFE_INTEGER){
console.log("MAX SAFE INTEGER");
return;
}
indexPage++;
scrapePage(indexPage);
});
ScrapePage.JS
let makeRequestCounter = 0;
function scrapePage(number) {
return new Promise((resolve, reject) => {
let url = URL + number;
let options = {
url: url,
headers: {
Host: SITE,
Connection: "keep-alive",
Accept: "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
"Accept-Language": "it-IT,it;q=0.9,en-US;q=0.8,en;q=0.7",
"Cache-Control": "max-age=0",
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.92 Safari/537.36",
"Cookie": restoreCookieToString()
}
};
makeRequest(options).then((jsonData) => {
resolve(jsonData);
}).catch((error) => {
//REQUEST_LIMIT_EXCEEDED
if (error === CONSTANTS.REQUEST_LIMIT_EXCEEDED) {
reject(CONSTANTS.REQUEST_LIMIT_EXCEEDED);
}
//ALREADY_EXIST
else if (error === CONSTANTS.ALREADY_EXIST) {
reject(CONSTANTS.ALREADY_EXIST);
}
else if (error === 404) {
reject("no data found at this page");
}
//error can beeconnrefused or econnreset
else if (error.code !== undefined) {
//econnrefused
if (error.code === CONSTANTS.ECONNREFUSED) {
reject("WRONG_URL", url);
}
//econnreset
else if (error.code === CONSTANTS.ECONNRESET) {
console.log("\neconnreset error\n");
makeRequest(options);
}
}
}
);
});
}
function makeRequest(options) {
return new Promise((resolve, reject) => {
let json = {
category: [],
imgs: [],
title: "",
description: "",
url: ""
};
if (makeRequestCounter === CONSTANTS.REQUEST_LIMIT) {
reject(CONSTANTS.REQUEST_LIMIT_EXCEEDED);
}
makeRequestCounter++;
console.log("request to: ", options.url);
request(options, function (error, response, html) {
if (error) {
//error: possible econnreset econnrefused
reject(error);
} else {
if (response.statusCode === 200) {
cookieSave(response.headers);
//---------- check if in db the url is already saved -------------//
check(response.request.uri.href, (err) => {
if (!err) {
reject(CONSTANTS.ALREADY_EXIST);
}
});
//----------finish checking, is new -------------------//
//GETTING TITLE
title(html, json_recipe).then((json) => {
//GETTING category
category(html, json).then((json) => {
//GETTING images
imgs(html, json).then((json) => {
description(html, json).then((json) => {
json.url = response.request.uri.href;
resolve(json);
//description error
}).catch((error) => {
console.log(error);
});
//images error
}).catch((error) => {
console.log(error);
});
//category error
}).catch((error) => {
console.log(error);
});
//title error
}
).catch((error) => {
console.log(error);
});
}
//no data in this page
if (response.statusCode === 404) {
reject(response.statusCode);
}
}
});
});
}
scrapeComponents.js
...
function description(html, json) {
return new Promise((resolve, reject) => {
const $ = cheerio.load(html);
let description = $('.submitter__description').text().trim();
json.description = JSON.parse(description);
resolve(json);
});
}
...
error:
UnhandledPromiseRejectionWarning: Unhandled promise rejection (rejection id: 1): no data found at this page
The program make the first request and return correctly at the scrapeAll.js that correctly the scrapePage(indexPage = 1).
The second time my program do exactly same as first time but when is time to return to the scrapeAll.js ( reject("no data found at this page"); in ScrapePage.js ) the program ends with the error.
Both the pages are without data but program fails also with good pages saving only the first.
I think that I made a big mistake with promises.
Thank you very much guys.
Your call for scrapPage function is running only once and you are not calling it iteratively. You might have to call it in iteration using a function. Update your scrapeAll.js:
function callScrapPage() {
var indexPage = 0;
while (indexPage < Number.MAX_SAFE_INTEGER) {
scrapePage(indexPage).then((json) => {
console.log(JSON.stringify(json, null, 4));
save(json);
indexpage++;
}
}
}
The problem is that one or more of your calls to scrapePage(indexPage) in scrapeAll.js are failing. You cannot recursively call a promise the way you might with other code, so you need a .then and .catch on the additional calls as well. Adding a .catch to the other calls will enable you to see the true source of failure.
scrapePage(indexPage)
.then((json)=>{
console.log(JSON.stringify(json, null, 4));
if(indexPage === Number.MAX_SAFE_INTEGER){
console.log("MAX SAFE INTEGER");
return;
}
save(json);
indexpage++;
scrapePage(indexPage).catch(e => console.log(e));
})
.catch((data)=>{
console.log(data);
if(indexPage === Number.MAX_SAFE_INTEGER){
console.log("MAX SAFE INTEGER");
return;
}
indexPage++;
scrapePage(indexPage).catch(e => console.log(e));
});

Categories