Cache result of function - javascript

How can I store the result of an async function in a variable in Node? For example, I want to parse a website and get some info, but I don't want to request the data over and over; I want to fetch the whole page only once. Here is the code:
var cheerio = require('cheerio');
var request = require('request');

function SiteParser() {
    const SITE = 'http://www.anywebsite.com';

    // variable for caching html
    var $ = getBody();

    function getBody(SITE) {
        // request html
        request(SITE, function(error, response, html) {
            if (!error && response.statusCode == 200) {
                return cheerio.load(html);
            }
            else {
                throw new Error('Could not parse web site\nError text: ' + error);
            }
        });
    }

    // declaration of class methods using cached html
    this.getCategories = function() {};
    this.getNews = function() {};
}
How can I be sure that the response has been received by the time I call the methods of the class?
Or is this bad practice?

Note that var $ = getBody(); does not work as you want, since getBody does not return anything. It is the inner callback that returns something, but that return value is forever lost. Moreover, that callback is called asynchronously, so you could not get its value via var $ = getBody(); anyway: getBody finishes before the callback is executed.
So, here is a quick way to resolve that issue. The constructor takes an optional callback function as an argument:
function SiteParser(onready) {
    const SITE = 'http://www.anywebsite.com';

    // variable for caching html
    var $;
    var that = this; // maintain reference to this instance of SiteParser

    function getBody(SITE) {
        // request html
        request(SITE, function(error, response, html) {
            if (!error && response.statusCode == 200) {
                $ = cheerio.load(html); // store the result
                that.ready(); // notify client that html has been retrieved
            }
            else {
                throw new Error('Could not parse web site\nError text: ' + error);
            }
        });
    }

    // declaration of class methods using cached html
    this.getCategories = function() {
        if ($ === undefined) throw new Error('Not ready yet');
        // extract categories from $ and return them
    };
    this.getNews = function() {
        if ($ === undefined) throw new Error('Not ready yet');
        // extract news from $ and return it
    };

    this.ready = onready || function() {}; // callback for being notified

    getBody(SITE); // kick off the request
}
Now you can write:

var parser = new SiteParser();
parser.ready = function () {
    var cats = parser.getCategories();
    // ...etc
};
or also:

var parser = new SiteParser(function () {
    var cats = parser.getCategories();
    // ...etc
});
Promise
The above code exposes the getCategories and getNews methods even when the HTML has not yet been retrieved. This is not so nice.
Using the promise concept you can improve on this by only providing the SiteParser instance when all is ready. Here is some (untested) code you could use with the promise module for Node.js:
var Promise = require('promise');

// SiteParser now needs to be given the HTML in the constructor:
function SiteParser(html) {
    var $ = cheerio.load(html);

    // declaration of class methods using cached html
    this.getCategories = function() {
        // extract categories from $ and return them
    };
    this.getNews = function() {
        // extract news from $ and return it
    };
}
// a separate Promise object takes care of the retrieval:
function promiseSiteParser() {
    const SITE = 'http://www.anywebsite.com';
    return new Promise(function (resolve, reject) {
        // request html
        request(SITE, function(error, response, html) {
            if (!error && response.statusCode == 200) {
                resolve(new SiteParser(html));
            }
            else {
                reject(error);
            }
        });
    });
}

promiseSiteParser().then(function (parser) {
    var cats = parser.getCategories();
    // ...etc
}, function (error) {
    throw new Error('Could not parse web site\nError text: ' + error);
});
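Since the whole point of the question is to avoid re-requesting the page, it's worth noting that the promise itself can serve as the cache: the first call starts the request, and every later call reuses the same pending or settled promise. A minimal sketch building on promiseSiteParser above:

var cachedParser = null;

function getSiteParser() {
    // the first call triggers the request; later calls share the same promise
    if (!cachedParser) {
        cachedParser = promiseSiteParser();
    }
    return cachedParser;
}

// every caller gets the same, single-fetch result
getSiteParser().then(function (parser) {
    var cats = parser.getCategories();
    // ...etc
});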


Why is this an Unhandled Promise Rejection?

I'm trying to rewrite my code to use promises correctly.
The full program is supposed to scrape data from a t-shirt site. This first block of code is supposed to enter the front page of the site, grab the product pages that are immediately available, and store the URLs in an array. The remaining URLs will be stored in 'remainder' for a second scrape to be performed later on.
Currently I'm manually unit testing each section:
//TASK: Create a command line application that goes to an ecommerce site to get the latest prices.
//Save the scraped data in a spreadsheet (CSV format).

//Modules being used:
var cheerio = require('cheerio');
var request = require('request');

//hardcoded url
var url = 'http://shirts4mike.com/';

//url for tshirt pages
var urlSet = new Set();
var remainder;

const requestPromise = function(url) {
    return new Promise(function(resolve, reject) {
        request(url, function(error, response, html) {
            if(error) return reject(error);
            if(!error && response.statusCode == 200){
                return resolve(html);
            }
        });
    });
};
function firstScrape (url) {
    return requestPromise(url)
        .then(function(html) {
            var $ = cheerio.load(html);
            var links = [];
            //get all the links
            $('a[href*=shirt]').each(function(){
                var a = $(this).attr('href');
                //add into link array
                links.push(url + a);
            });
            return links;
            // return this array you've made
        });
}

function nextStep (arrayOfLinks) {
    var promiseArray = [];
    for(var link in arrayOfLinks){
        promiseArray.push(requestPromise(link));
        return Promise.all(promiseArray);
    }
}
function lastStep (arrayOfHTMLresults){
    for(var html in arrayOfHTMLresults){
        var $ = cheerio.load(html);
        //if page has a submit it must be a product page
        if($('[type=submit]').length !== 0){
            //add page to set
            urlSet.add(scrapeLink);
        } else if(remainder == undefined) {
            //if not a product page, add it to remainder so another scrape can be performed.
            remainder = scrapeLink;
        }
    }
    console.log(urlSet);
    console.log(remainder);
}

firstScrape(url)
    .then(nextStep)
    .then(lastStep);
I'm currently getting the following error:
(node:71094) UnhandledPromiseRejectionWarning: Unhandled promise rejection (rejection id: 3): Error: Invalid URI "0"
This is the code I'm trying to promisify:
// Load front page of shirts4mike
function firstScrape(){
    request(url, function(error, response, html) {
        if(!error && response.statusCode == 200){
            var $ = cheerio.load(html);
            //iterate over links with 'shirt'
            $('a[href*=shirt]').each(function(){
                var a = $(this).attr('href');
                //create new link
                var scrapeLink = url + a;
                //for each new link, go in and find out if there is a submit button.
                //If there, add it to the set
                request(scrapeLink, function(error, response, html){
                    if(!error && response.statusCode == 200) {
                        var $ = cheerio.load(html);
                        //if page has a submit it must be a product page
                        if($('[type=submit]').length !== 0){
                            //add page to set
                            urlSet.add(scrapeLink);
                        } else if(remainder == undefined) {
                            //if not a product page, add it to remainder so another scrape can be performed.
                            remainder = scrapeLink;
                        }
                    }
                });
            });
        }
    });
}
What I can't work out is how I can use urlSet.add(scrapeLink); in lastStep() when it doesn't know what scrapeLink is.
Any idea why? Thank you.
.add() is not an Array.prototype method. You also return promiseArray inside the for loop, instead of pushing a Promise onto promiseArray on each iteration and then calling Promise.all() on the whole array:
function nextStep (arrayOfLinks) {
    var promiseArray = [];
    for(var i = 0; i < arrayOfLinks.length; i++) {
        var link = requestPromise(arrayOfLinks[i]);
        promiseArray.push(link);
    }
    return Promise.all(promiseArray);
}
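As an aside, this also explains the Invalid URI "0" rejection: for...in iterates an array's keys (the indices, as strings), not its values, so the original loop handed the string "0" to requestPromise. A quick illustration:

var links = ['http://shirts4mike.com/shirts.php'];
for (var link in links) {
    console.log(link); // logs "0" (the key, not the URL)
}
for (var i = 0; i < links.length; i++) {
    console.log(links[i]); // logs the actual URL
}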
UPDATE due to the question changing:
So from firstScrape() you could return a results Object instead of just an array of links:
return { scrapeLink: link, result: links }
You would then get that in nextStep() as the result of the promise, where you could return something with the same shape again:
return { scrapeLink: firstStepResult.scrapeLink, result: Promise.all(promiseArray) }
Then in lastStep(), instead of arrayOfHTMLresults getting passed in, you would have an Object which would look like:
{ scrapeLink: "http://someurl.com", result: arrayOfHTMLresults }
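To make that shape concrete, here is a rough sketch (not the poster's exact code) where nextStep resolves each request to an object that keeps the link paired with its HTML, so lastStep never loses track of scrapeLink:

function nextStep(arrayOfLinks) {
    var promiseArray = arrayOfLinks.map(function(link) {
        return requestPromise(link).then(function(html) {
            return { scrapeLink: link, html: html }; // keep url and html together
        });
    });
    return Promise.all(promiseArray);
}

function lastStep(results) {
    results.forEach(function(result) {
        var $ = cheerio.load(result.html);
        //if page has a submit it must be a product page
        if ($('[type=submit]').length !== 0) {
            urlSet.add(result.scrapeLink); // the link travelled along with its html
        } else if (remainder == undefined) {
            remainder = result.scrapeLink;
        }
    });
    console.log(urlSet);
    console.log(remainder);
}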
PREVIOUS answer:
You will need to initialize your variable in the for...in loop, e.g. with const, var or let depending on your use case and JS version.

for(var link in arrayOfLinks){
    promiseArray.add(requestPromise(link));
    return promiseArray;
}

wait for request to finish and return data

I'm building a RESTful API with Node.js; this is my code.
There are 3 main parts: I created a GET method that has to return some data; the GET calls getImportIoData, which makes a request to an external API; and that method calls another method, getEmailFromWebSite, which makes yet another request.
How can I wait for every request to finish and return the data?
I know this has been answered in other questions, but I tried some of those answers and they didn't work. I tried the async and q libraries, and the callback in this part doesn't work either:

if (item.website !== undefined){
    getEmailFromWebSite(item.website, function(email){console.log(email); item.email = email;});
}

I don't know if I'm doing something wrong. Can someone help me with this, using my code?
Thanks in advance.
router.get('/', function(req, res) {
    var Type = req.query.Type;
    var Locations = req.query.Locations;
    var Page = req.query.Page;
    res.send(getImportIoData(res, Type, Locations, Page));
});

function getImportIoData(res, Type, Locations, Page) {
    var criteria = {
        'search_terms': Type,
        'geo_location_terms': Locations,
        'page': Page
    }
    var url = consts.import_io + consts.import_key + consts.query +
        encodeURIComponent(consts.api_url) + encodeQueryData(criteria) +
        /*"&_user=" + consts.user + */"&_apikey=" + consts.api_key;
    request(url, function(error, response, json) {
        var data = JSON.parse(json);
        if (!error && response.statusCode === 200) {
            var doctors = _.forEach(data.results, function(item) {
                if (item.website !== undefined){
                    getEmailFromWebSite(item.website, function(email){console.log(email); item.email = email;});
                }
            });
            return data.results;
        }
        //else
    });
}
function getEmailFromWebSite(website, callback) {
    var EmptyReturn = '';
    searchTerm = extractDomain(website).replace('www.', '');
    if (searchTerm != ''){
        var EH_APIURL = 'https://api.emailhunter.co/v1/search?domain=' + searchTerm + '&api_key=' + consts.EmailHunterAPIKey;
        request(EH_APIURL, function(error, response, json) {
            var data = JSON.parse(json);
            if (!error && response.statusCode === 200) {
                if((data.emails).length != 0){
                    var Emailobject = data.emails;
                    var EmailString = returnEmails(Emailobject);
                    callback(EmailString);
                } else
                    callback(EmptyReturn);
            } else if (response.statusCode === 429)
                console.log('The request limit has been reached for this account');
            else if (response.statusCode > 500)
                console.log('Error with Email Hunter servers');
            else
                console.log('An error occurred');
        });
    }
}
There's no code related to q in there, but q.all receives an array of promises and resolves when all of those promises are resolved, so you can do something like this:

var promises = [ promiseOne(), promiseTwo(), promiseThree() ];
q.all(promises).then(function(results) {
    // results is an array with the resolution values of all your promises,
    // in the order you specified in the promises array
});
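Applied to the code in the question, a rough sketch (using native Promises, which behave like q promises here; Promise.all is the analogue of q.all) would be to promisify getEmailFromWebSite, collect one promise per result item, and only send the response once all of them have settled:

function getEmailPromise(website) {
    return new Promise(function(resolve) {
        // wraps the poster's callback-style helper; note that for this to
        // resolve, getEmailFromWebSite must invoke its callback on every path
        getEmailFromWebSite(website, resolve);
    });
}

function getImportIoData(res, Type, Locations, Page) {
    // ... build url exactly as above ...
    request(url, function(error, response, json) {
        // assuming Express 4's res.sendStatus for the error path
        if (error || response.statusCode !== 200) return res.sendStatus(500);
        var data = JSON.parse(json);
        var promises = data.results.map(function(item) {
            if (item.website === undefined) return Promise.resolve();
            return getEmailPromise(item.website).then(function(email) {
                item.email = email;
            });
        });
        Promise.all(promises).then(function() {
            res.send(data.results); // every email lookup has finished by now
        });
    });
}

The route handler would then just call getImportIoData(res, Type, Locations, Page) instead of wrapping it in res.send(), since the response is now sent from inside the promise chain.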

Return result from a javascript callback (node.js)

I know this question has been asked many times, but I can't make it work.
Here is my situation: I have a string called data, and I want to unshorten all the links inside that string.
Code:
var Bypasser = require('node-bypasser');
var URI = require('urijs');
var data = 'multiple urls : http://example.com/foo http://example.com/bar';
var result = URI.withinString(data, function(url) {
var unshortenedUrl = null;
var w = new Bypasser(url);
w.decrypt(function(err, res) {
// How can I return res ?
unshortenedUrl = res;
});
// I know the w.descrypt function is a asynchronous function
// so unshortenedUrl = null
return unshortenedUrl;
});
Let me walk you through the code.
URI.withinString will match all the URLs in data, manipulate them, and return the result.
You can view an example in the URI.js docs.
What I want to do with these URLs is to unshorten all of them using node-bypasser.
This is from the node-bypasser documentation:

var Bypasser = require('node-bypasser');
var w = new Bypasser('http://example.com/shortlink');
w.decrypt(function(err, result) {
    console.log('Decrypted: ' + result);
});

This is the result that I want: multiple urls : http://example.com/foo_processed http://example.com/bar_processed
I created a notebook at tonicdev.com.
Solution
var async = require('async');
var Bypasser = require('node-bypasser');

var getUrlRegEx = new RegExp(
    "(^|[ \t\r\n])((ftp|http|https|gopher|mailto|news|nntp|telnet|wais|file|prospero|aim|webcal):(([A-Za-z0-9$_.+!*(),;/?:#&~=-])|%[A-Fa-f0-9]{2}){2,}(#([a-zA-Z0-9][a-zA-Z0-9$_.+!*(),;/?:#&~=%-]*))?([A-Za-z0-9$_+!*();/?:~-]))",
    "g"
);
var urls = data.match(getUrlRegEx);

async.forEachLimit(urls, 5, function (url, callback) {
    let w = new Bypasser(url);
    w.decrypt(function (err, res) {
        if (err == null && res != undefined) {
            data = data.replace(url, res);
        }
        callback(); // always signal completion, even on error, so the loop can finish
    });
}, function (err) {
    res.send(data);
});
You don't really understand what a callback is. A callback allows asynchronous code to run without JavaScript waiting for it. If you add some debug output to your code:
console.log("Started parsing");
var result = URI.withinString(data, function(url) {
console.log("URL parsed (or whatever)");
var unshortenedUrl = null;
var w = new Bypasser(url);
w.decrypt(function(err, res) {
// How can I return res ?
unshortenedUrl = res;
});
// I know the w.descrypt function is a asynchronous function
// so unshortenedUrl = null
return unshortenedUrl;
});
console.log("Call to library over");
You would (most likely) see messages in this order:
Started parsing
Call to library over
URL parsed (or whatever)
The answer: the callback is not guaranteed to run before any code you execute after assigning it. You can't put data in your result variable because the data might not have been fetched yet.
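One way around this, sketched under the assumption of the same node-bypasser API shown above, is to wrap each decrypt call in a Promise, wait for all of them, and only then rebuild the string:

var urls = data.match(getUrlRegEx) || [];

var lookups = urls.map(function (url) {
    return new Promise(function (resolve) {
        new Bypasser(url).decrypt(function (err, res) {
            // fall back to the original url if unshortening fails
            resolve({ url: url, unshortened: (!err && res) ? res : url });
        });
    });
});

Promise.all(lookups).then(function (pairs) {
    pairs.forEach(function (pair) {
        data = data.replace(pair.url, pair.unshortened);
    });
    console.log(data); // every link has been unshortened (or left as-is)
});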

"undefined is not a function" using callbacks

I'm creating a Twitter bot that makes several requests to the same API, Wordnik, where each request depends on the previous request's results. So I decided to try writing the code with callbacks to make sure all of the information has been returned from the API before the next function runs. I'm having trouble setting it up; I've looked at many examples and I just can't get the hang of it. (Sorry for the messy code.)
The error I am getting right now is "undefined is not a function" in my function getWord(), on the line thenRunThisFunction(getRhyme). I'm wondering if I have a small error with the callbacks, or if my whole approach to this problem is incorrect?
function runBot() {
    var request = require('request');
    var Twit = require('twit');
    var async = require('async');

    var T = new Twit({
        consumer_key: ''          // Your Consumer Key
        , consumer_secret: ''     // Your Consumer Secret
        , access_token: ''        // Your Access Token
        , access_token_secret: '' // Your Access Token Secret
    });
    var WORDNIKAPIKEY = '';

    // GLOBAL VARS
    var randomWord;  // get random word
    var rhymingWord; // get rhyming word
    var bogusDef;    // get def of rhyming word
    var tweet;       // combined random and bogusdef

    function getWord(thenRunThisFunction){
        request('http://api.wordnik.com:80/v4/words.json/randomWord?hasDictionaryDef=false&minCorpusCount=0&maxCorpusCount=-1&minDictionaryCount=1&maxDictionaryCount=-1&minLength=5&maxLength=-1&api_key=' + WORDNIKAPIKEY, function (error, response, body1) {
            if (!error && response.statusCode == 200) {
                //console.log(body1)
                var pparsedData = JSON.parse(body1);
                console.log("Word: " + pparsedData.word);
                // set random word
                randomWord = pparsedData.word;
                thenRunThisFunction(getRhyme);
            }
        });
    }

    // Get the rhyming word
    function getRhyme(thenRunThisFunction){
        request('http://api.wordnik.com:80/v4/word.json/' + randomWord + '/relatedWords?useCanonical=false&relationshipTypes=rhyme&limitPerRelationshipType=10&api_key=' + WORDNIKAPIKEY, function (error, response, body2) {
            if (!error && response.statusCode == 200) {
                //console.log(body2)
                var o = JSON.parse(body2);
                console.log("Rhyme: " + o[0].words[0]);
                // set rhyming word
                rhymingWord = o[0].words[0];
                thenRunThisFunction(getDef);
            }
        });
    }

    // GET THE SEXY DEFINITION BABY, BEACH BOD
    function getDef(thenRunThisFunction){
        request('http://api.wordnik.com:80/v4/word.json/' + rhymingWord + '/definitions?limit=200&includeRelated=true&sourceDictionaries=all&useCanonical=false&includeTags=false&api_key=' + WORDNIKAPIKEY, function (error, response, body3) {
            if (!error && response.statusCode == 200) {
                //console.log(body3)
                var newnew = JSON.parse(body3);
                console.log("Definition: " + newnew[0].text);
                // set definition
                bogusDef = newnew[0].text;
                randomWord = randomWord.charAt(0).toUpperCase();
                tweet = randomWord + ": " + bogusDef;
                thenRunThisFunction(postStatus);
            }
        });
    }

    function postStatus(){
        T.post('statuses/update', { status: tweet }, function(err, data, response) {
            if(err) {
                console.log("There was a problem tweeting the message.", err);
            }
        });
        console.log("status posted");
    }

    getWord();
}

runBot();
You are not passing a function reference into getWord().
I have no clue really what you're trying to accomplish by going

thenRunThisFunction();
thenRunThisFunction();
thenRunThisFunction();
thenRunThisFunction();
thenRunThisFunction();

Just invoke the functions by their names and remove the argument from them:

getRhyme();
getDef();

What you're doing will never work: you're trying to call thenRunThisFunction as if it actually exists, but it's an argument of your function that never gets supplied.
Your method would work if it was like this:

function runThisFunction(fnc) {
    fnc();
}

function blah(thenRunThisFunction) {
    thenRunThisFunction(thing);
}

function thing() {
    console.log('Blah');
}

blah(runThisFunction);

But that's horrible and bad.
You aren't passing anything to getWord at the end, so thenRunThisFunction is literally undefined. Try passing a function to getWord, like this: getWord(function(){}). But in your case you want to pass whatever you want to run after getWord.
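Putting the two answers together, here is one minimal way to supply the missing argument without restructuring the poster's step functions: a small runner that invokes each step and passes itself along, so every thenRunThisFunction call resolves. This is a sketch of the pattern rather than a recommended design (promises would be tidier):

// receives the next step and invokes it, handing itself
// along so the chain can keep going
function runNext(step) {
    step(runNext);
}

getWord(runNext); // getWord -> getRhyme -> getDef -> postStatus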

node.js + cheerio scrape: Passing an array of urls to download?

Firstly, here is my code as I've progressed so far:
var http = require("http");

// Utility function that downloads a URL and invokes
// callback with the data.
function download(url, callback) {
    http.get(url, function(res) {
        var data = "";
        res.on('data', function (chunk) {
            data += chunk;
        });
        res.on("end", function() {
            callback(data);
        });
    }).on("error", function() {
        callback(null);
    });
}

var cheerio = require("cheerio");
var url = "http://www.bloglovin.com/en/blogs/1/2/all";
var myArray = [];
var a = 0;

var getLinks = function(){
    download(url, function(data) {
        if (data) {
            // console.log(data);
            var $ = cheerio.load(data);
            $(".content").each(function(i, e) {
                var blogName = $(e).find(".blog-name").text();
                var followLink = $(e).find("a").attr("href");
                var blogSite = $(e).find(".description").text();
                myArray[a] = [a];
                myArray[a]["blogName"] = blogName;
                myArray[a]["followLink"] = "http://www.bloglovin.com" + followLink;
                myArray[a]["blogSite"] = blogSite;
                a++;
                console.log(myArray);
            });
        }
    });
}

getLinks();
As you can see, followLink is concatenated onto the base URL. I'd like to pass each of those resulting URLs through download(), so that I'm effectively scraping each of the pages using the same CSS rules and adding the results to the multidimensional array for the corresponding blogger.
How can I go about this?
I do something similar in one of my scraping jobs, but I use the async.js library to accomplish it. Note that I'm also using the request module and cheerio.js in my scraping. I fetch and scrape rows of data from a single webpage, but I suspect you could do something similar to fetch URLs and request/scrape them in the same manner.
I also admit this is quite basic coding that could certainly be optimized with a bit of refactoring. Hope it gives you some ideas at least...
First, I use request to fetch the page and call my parse function:
var url = 'http://www.target-website.com';

// named fetchPage here for clarity; in my job this sits
// anonymously inside an async task list
function fetchPage(lastCallback) {
    request(url, function(err, resp, body) {
        if(!err) { parsePage(err, resp, body, lastCallback); }
        else { console.log('web request error:' + resp.statusCode); }
    });
}
Next, in my parsePage function, I load the website into Cheerio, fetch the HTML of each data row into an array, push my parseRow function and each HTML segment into another array, and use async.parallel to process each iteration:

var rows = [];

function parsePage(err, resp, body, callback1) {
    var $ = cheerio.load(body);
    $('div#targetTable tr').each(function(i, elem) {
        rows.push($(this).html());
    });
    var scrRows = [];
    rows.forEach(function(row) {
        scrRows.push(function(callback2) {
            parseRow(err, resp, row);
            callback2();
        });
    });
    async.parallel(scrRows, function() {
        callback1();
    });
}
Inside your loop, just create an object with the properties you scrape, then push that object onto your array:

var blogInfo = {
    blogName: blogName,
    followLink: "http://www.bloglovin.com" + followLink,
    blogSite: blogSite
};
myArray.push(blogInfo);
You have defined a = 0, so:

myArray[a] = [a]; // => myArray[0] = [0]; myArray[0] becomes an array with 0 as its only member

The statements below then hang named properties off that inner array rather than storing real array elements; JSON serialization and most array methods will ignore them, since arrays are meant to be keyed by integers:

myArray[a]["blogName"] = blogName;
myArray[a]["followLink"] = "http://www.bloglovin.com"+followLink;
myArray[a]["blogSite"] = blogSite;
Instead try this:

var obj = {
    index: a,
    blogName: blogName,
    followLink: "http://www.bloglovin.com" + followLink,
    blogSite: blogSite
}
myArray.push(obj);
console.log(myArray);
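To then answer the original question, a rough sketch (untested against the live site) that builds on the answers above: loop over the collected objects and feed each followLink back through the same download() helper, applying the same CSS rules to every blogger's page. The .description selector is reused from above purely as a stand-in:

function scrapeFollowLinks() {
    myArray.forEach(function (blog) {
        download(blog.followLink, function (data) {
            if (data) {
                var $ = cheerio.load(data);
                // extract whatever you need with your selectors and
                // attach it to the same blog object
                blog.extraInfo = $(".description").text();
            }
        });
    });
}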
