I'm trying to rewrite my code to utilize promises correctly.
The full program is supposed to scrape data from a tshirt site. This first block of code is supposed to enter the front page of the site, grab the product pages that are immediately available, and then store the URLs in an array. The remaining URLs will be stored in 'remainder' so that a secondScrape can be performed on them later on.
Currently manually unit testing each section:
//TASK: Create a command line application that goes to an ecommerce site to get the latest prices.
//Save the scraped data in a spreadsheet (CSV format).
//Modules being used:
var cheerio = require('cheerio');
var request = require('request');
//hardcoded url
var url = 'http://shirts4mike.com/';
//url for tshirt pages
var urlSet = new Set();
var remainder;
const requestPromise = function(url) {
    return new Promise(function(resolve, reject) {
        request(url, function(error, response, html) {
            if(error) return reject(error);
            if(!error && response.statusCode == 200){
                return resolve(html);
            }
        });
    });
}
function firstScrape (url) {
    return requestPromise(url)
        .then(function(html) {
            var $ = cheerio.load(html);
            var links = [];
            //get all the links
            $('a[href*=shirt]').each(function(){
                var a = $(this).attr('href');
                //add into link array
                links.push(url + a);
            });
            return links;
            // return this array you've made
        });
}
function nextStep (arrayOfLinks) {
    var promiseArray = [];
    for(var link in arrayOfLinks){
        promiseArray.push(requestPromise(link));
        return Promise.all(promiseArray);
    }
}
function lastStep (arrayOfHTMLresults){
    for(var html in arrayOfHTMLresults){
        var $ = cheerio.load(html);
        //if page has a submit it must be a product page
        if($('[type=submit]').length !== 0){
            //add page to set
            urlSet.add(scrapeLink);
        } else if(remainder == undefined) {
            //if not a product page, add it to remainder so another scrape can be performed
            remainder = scrapeLink;
        }
    }
    console.log(urlSet);
    console.log(remainder);
}
firstScrape(url)
.then(nextStep)
.then(lastStep);
I'm currently getting the following error:
(node:71094) UnhandledPromiseRejectionWarning: Unhandled promise rejection (rejection id: 3): Error: Invalid URI "0"
This is the code I'm trying to promisify:
// Load front page of shirts4mike
function firstScrape(){
    request(url, function(error, response, html) {
        if(!error && response.statusCode == 200){
            var $ = cheerio.load(html);
            //iterate over links with 'shirt'
            $('a[href*=shirt]').each(function(){
                var a = $(this).attr('href');
                //create new link
                var scrapeLink = url + a;
                //for each new link, go in and find out if there is a submit button.
                //If it's there, add the page to the set
                request(scrapeLink, function(error, response, html){
                    if(!error && response.statusCode == 200) {
                        var $ = cheerio.load(html);
                        //if page has a submit it must be a product page
                        if($('[type=submit]').length !== 0){
                            //add page to set
                            urlSet.add(scrapeLink);
                        } else if(remainder == undefined) {
                            //if not a product page, add it to remainder so another scrape can be performed
                            remainder = scrapeLink;
                        }
                    }
                });
            });
        }
    });
}
What I can't work out is how I can use urlSet.add(scrapeLink); in lastStep() when it doesn't know what scrapeLink is?
Any idea why? Thank you
.add() is not an Array.prototype method; you also return promiseArray from within the for loop instead of pushing a Promise onto promiseArray for each link and then using Promise.all():
function nextStep (arrayOfLinks) {
    var promiseArray = [];
    for(var i = 0; i < arrayOfLinks.length; i++) {
        var link = requestPromise(arrayOfLinks[i]);
        promiseArray.push(link);
    }
    return Promise.all(promiseArray);
}
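This is also where the Invalid URI "0" error comes from: in the original nextStep, for...in iterates the array's keys rather than its values, so link is the index string "0" and that is what gets passed to requestPromise. A quick illustration:

var arrayOfLinks = ['http://shirts4mike.com/'];

for (var link in arrayOfLinks) {
    console.log(link);               // "0" (the index, as a string)
    console.log(arrayOfLinks[link]); // the actual URL
}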
UPDATE due to question changing:
So from firstScrape() you could return a results Object instead of just an array of links:
return { scrapeLink: link, result: links }
You would then get that in nextStep() as the result of the promise where you could return something with the same shape again:
return { scrapeLink: firstStepResult.scrapeLink, result: Promise.all(promiseArray) }
Then in lastStep() instead of arrayOfHTMLresults getting passed in you would then have an Object which would look like:
{ scrapeLink: "http://someurl.com", result: arrayOfHTMLresults }
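To make that concrete, here is one sketch of how the pieces could fit together, reusing the requestPromise helper from the question. The exact shape of the object is up to you; the point is simply that each HTML result travels together with the scrapeLink it came from, so lastStep() knows which URL produced it:

function nextStep(arrayOfLinks) {
    // Pair every link with the HTML it resolves to, so lastStep
    // still knows which URL each result came from.
    var promiseArray = arrayOfLinks.map(function(scrapeLink) {
        return requestPromise(scrapeLink).then(function(html) {
            return { scrapeLink: scrapeLink, result: html };
        });
    });
    return Promise.all(promiseArray);
}

function lastStep(arrayOfResults) {
    arrayOfResults.forEach(function(entry) {
        var $ = cheerio.load(entry.result);
        if ($('[type=submit]').length !== 0) {
            urlSet.add(entry.scrapeLink);       // product page
        } else if (remainder === undefined) {
            remainder = entry.scrapeLink;       // keep it for secondScrape
        }
    });
    console.log(urlSet);
    console.log(remainder);
}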
PREVIOUS answer:
You will need to initialize your variable in the for...in loop. e.g. with const, var or let depending on your use case and JS version.
for(var link in arrayOfLinks){
    promiseArray.add(requestPromise(link));
    return promiseArray;
}
Related
I am trying to parse some data from several web pages using javascript. I wrote a small parser for this purpose. The algorithm looks like this:
Open first URL from my .csv file
Find the data I need on the page
Save URL and data to a json file
My code executes 1. and 2. perfectly but sometimes messes up with number 3. Output looks like this:
URL 1 + data from URL 1 (correct line)
URL 2 + data from URL 2 (correct line)
URL 3 + data from URL 3 (correct line)
URL 4 + data from URL 4 (correct line)
URL 6(wrong URL) + data from another URL
URL 5(wrong URL) + data from another URL
URL 7 + data from URL 7 (correct line)
URL 8 + data from URL 8 (correct line)
URL 9 + data from URL 9 (correct line)
I assume the problem is that some pages take too long to load, which messes up the whole process. But I still don't understand why it sometimes saves the wrong data.
Here's my code:
var request = require('request');
var cheerio = require('cheerio');
var cloudscraper = require('cloudscraper');
var fs = require('fs');
var path = require('path');
var csvjson = require('csvjson');
//First, we read .csv file with our URL list
function getTheList() {
    urlList = fs.readFileSync(path.join(__dirname, 'data.csv'), { encoding : 'utf8'});
    var options = {
        delimiter : ';', // optional
        quote : '"' // optional
    };
    urlList = csvjson.toObject(urlList, options);
    end = urlList.length;
    logs = [];
    //here we start the loop reading and saving data from each url
    for (let p = 0; p < end; p += 1){
        grabTheData(urlList, p);
    }
}
//this code extracts the data from the page and saves it to a json file
function grabTheData(urlList, p){
    setTimeout(function() {
        url = urlList[p].ItemLink;
        cloudscraper.get(url, function(err, res, body){
            if (err) {
                console.log(other.Time() + colors.yellow('Warn: ') + '- something went wrong with item ' + url);
                callback();
            } else {
                var $ = cheerio.load(body);
                /*
                here are the lines which extract the data I need
                dataINeed = ...;
                */
                logs.push({
                    url, dataINeed
                });
                fs.writeFileSync('./logs.json', JSON.stringify(logs, null, 4));
            }
        });
    //here I set a 2 seconds delay between each URL
    }, 2000 * p);
}
getTheList()
The reason this is happening is that url is a shared (implicitly global) variable in grabTheData, so by the time a cloudscraper callback runs, url may already have been reassigned by a later iteration, and a URL ends up paired with another page's data.
Now there is a very quick fix for this: simply change the scope of the url variable, like so:
function grabTheData(urlList, p){
    setTimeout(function() {
        // Set scope of url variable to block
        let url = urlList[p].ItemLink;
        cloudscraper.get(url, function(err, res, body){
            if (err) {
                console.log(other.Time() + colors.yellow('Warn: ') + '- something went wrong with item ' + url);
                callback();
            } else {
                var $ = cheerio.load(body);
                /*
                here are the lines which extract the data I need
                dataINeed = ...;
                */
                logs.push({
                    url, dataINeed
                });
                fs.writeFileSync('./logs.json', JSON.stringify(logs, null, 4));
            }
        });
    //here I set a 2 seconds delay between each URL
    }, 2000 * p);
}
This should keep your results in order.
Here's another (IMHO much better) option, using promises and avoiding the use of setTimeout to separate calls. This should avoid any potential race condition, since the Promise.all call will preserve order:
async function getTheList() {
    urlList = fs.readFileSync(path.join(__dirname, 'data.csv'), { encoding : 'utf8'});
    var options = {
        delimiter : ';', // optional
        quote : '"' // optional
    };
    urlList = csvjson.toObject(urlList, options);
    let promiseList = urlList.map(urlEntry => grabTheDataUpdated(urlEntry.ItemLink));
    let logs = await Promise.all(promiseList);
    fs.writeFileSync('./new_logs.json', JSON.stringify(logs, null, 4));
}
// Promisified version of cloudscraper.get
function getCloudScraperData(url) {
    return new Promise((resolve, reject) => {
        cloudscraper.get(url, (err, res, body) => {
            if (err) {
                reject(err);
            } else {
                resolve({ url, res, body });
            }
        });
    });
}
function getDataINeed(url, body) {
// Use cheerio to process data..
// Return mock data for now.. replace with actual data processed by cheerio..
return `data from ${url}`;
}
async function grabTheDataUpdated(url) {
    try {
        let result = await getCloudScraperData(url);
        let dataINeed = getDataINeed(result.url, result.body);
        return { url, dataINeed };
    } catch (error) {
        return { url, dataINeed: "Error occurred: " + error.message };
    }
}
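For completeness, the updated flow would presumably be kicked off with something like this small usage sketch, so that any unexpected failure (for example reading data.csv or writing new_logs.json) is reported instead of silently swallowed:

getTheList()
    .then(() => console.log('Done, results written to new_logs.json'))
    .catch(err => console.error('Run failed:', err));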
Is there a way to display your AJAX data in the order in which you made the requests, without using promises, synchronous code or jQuery, just pure JavaScript?
For example:
//file 1 takes 3 seconds & file2 takes 1 second
input: ['example1.com', 'example2.com']
output: [example1_response, example2_response]
I started by setting up a small toy problem in my HTML page. I append two placeholder <div>'s with the text "please wait" to the page, and as each request completes the corresponding <div>'s placeholder text is replaced. But this still doesn't achieve the end goal of displaying the content in the order in which I made the requests.
JSFiddle: https://jsfiddle.net/nf4p1bgf/5/
var body = document.getElementsByTagName('body')[0];
var urls = [ "website1.com", "website2.com"];

//Helper function to simulate AJAX request
function fakeAjax(url,cb) {
    var fake_responses = {
        "website1.com": "data from website1.com",
        "website2.com": "data from website2.com"
    };
    var randomDelay = (Math.round(Math.random() * 1E4) % 8000) + 1000;
    console.log(`Requesting: ${url}. Response time: ${randomDelay}`);
    setTimeout(function(){
        cb(fake_responses[url]);
    },randomDelay);
}

urls.forEach(function(url) {
    //creating placeholder <div>'s before AJAX data returns
    var div = document.createElement("div");
    div.innerHTML = "this is a place holder - please wait";
    body.appendChild(div);
    fakeAjax(url, function(data) {
        div.innerHTML = data;
    });
});
EDIT & SOLUTION JSFiddle here: https://jsfiddle.net/fa707qjc/11/
//*********** HELPERS (SEE CODE BELOW HELPERS SECTION) ***********/
var body = document.getElementsByTagName('body')[0];
var urls = ["website1.com","website2.com"];

function fakeAjax(url,cb) {
    var fake_responses = {
        "website1.com": "data from website1.com",
        "website2.com": "data from website2.com"
    };
    var randomDelay = (Math.round(Math.random() * 1E4) % 8000) + 1000;
    console.log(`Requesting: ${url}. Response time: ${randomDelay}`);
    setTimeout(function(){
        cb(fake_responses[url]);
    },randomDelay);
}

function createElement(typeOfElement, text){
    var element = document.createElement(typeOfElement);
    element.innerHTML = text;
    return element;
}
function handleResponse(url, contents){
    //if I haven't received a response from this url yet, store it
    if( !(url in responses)){
        responses[url] = contents;
    }
    //defining order for response outputs
    var myWebsites = ['website1.com','website2.com'];
    // loop through responses in order for rendering
    for(var url of myWebsites){
        if(url in responses){
            if(typeof responses[url] === "string"){
                console.log(responses[url]);
                //mark already rendered
                var originalText = responses[url];
                responses[url] = true;
                var p = createElement('p', originalText);
                body.appendChild(p);
            }
        }
        //can't render it / not complete
        else{
            return;
        }
    }
}
//*********** CODE START ***********
function getWebsiteData(url) {
    fakeAjax(url, function(text){
        console.log(`Returning data from ${url} w/response: ${text}`);
        handleResponse(url, text);
    });
}

//As we get our responses from server store them
var responses = {};

// request all files at once in "parallel"
urls.forEach(function(url){
    getWebsiteData(url);
});
Use Promise. Promise.all resolves with the results in the same order as the input array, no matter which request finishes first:
Promise.all(urls.map(function(url) {
    return new Promise(function(resolve, reject) {
        request(url, function(data, error) {
            if (error) {
                return reject(error);
            }
            resolve(data);
        });
    });
})).then(function(results) {
    console.log(results);
});
How can I store the result of an async function in a variable in Node? For example, I want to parse a website and get some info, but I don't want to request the data over and over; I want to fetch the whole page only once. Here is the code:
var cheerio = require('cheerio');
var request = require('request');

function SiteParser() {
    const SITE = 'http://www.anywebsite.com';
    // variable for caching html
    var $ = getBody();

    function getBody(SITE){
        // request html
        request(SITE, function(error, response, html) {
            if (!error && response.statusCode == 200) {
                return cheerio.load(html);
            }
            else {
                throw new Error('Could not parse web site\nError text: ' + error);
            }
        });
    }

    //declaration of class methods using cached html
    this.getCategories = function() {};
    this.getNews = function() {};
}
How can I be sure, that response will be received, when I will call methods of the class?
Or is this a bad practice?
Note that var $ = getBody(); does not work as you want, since getBody does not return anything. It is the inner callback that returns something, but that return value is forever lost. Moreover, that callback is called asynchronously, so you could not get its result via var $ = getBody(); anyway: getBody finishes before the callback is executed.
So, here is a quick way to resolve that issue. The constructor takes an optional callback function as argument:
function SiteParser(onready) {
    const SITE = 'http://www.anywebsite.com';
    // variable for caching html
    var $;
    var that = this; // maintain reference to this instance of SiteParser

    function getBody(SITE){
        // request html
        request(SITE, function(error, response, html) {
            if (!error && response.statusCode == 200) {
                $ = cheerio.load(html); // store the result
                that.ready(); // notify client that html has been retrieved
            }
            else {
                throw new Error('Could not parse web site\nError text: ' + error);
            }
        });
    }

    //declaration of class methods using cached html
    this.getCategories = function() {
        if ($ === undefined) throw new Error('Not ready yet');
        // extract categories from $ and return them
    };
    this.getNews = function() {
        if ($ === undefined) throw new Error('Not ready yet');
        // extract news from $ and return it
    };

    this.ready = onready || function() {}; // callback for being notified

    getBody(SITE); // kick off the request
}
Now you can write:
parser = new SiteParser;
parser.ready = function () {
var cats = parser.getCategories();
// ...etc
};
or also:
parser = new SiteParser(function () {
var cats = parser.getCategories();
// ...etc
});
Promise
The above code exposes the getCategories and getNews methods even when the HTML has not yet been retrieved. This is not so nice.
Using the promise concept you can improve on this by only providing the SiteParser instance when all is ready. Here is some (untested) code you could use with promise for nodejs:
var Promise = require('promise');

// SiteParser now needs to be given the HTML in the constructor:
function SiteParser(html) {
    var $ = cheerio.load(html);

    //declaration of class methods using cached html
    this.getCategories = function() {
        // extract categories from $ and return them
    };
    this.getNews = function() {
        // extract news from $ and return it
    };
}
// a separate Promise object takes care of the retrieval:
function promiseSiteParser() {
    const SITE = 'http://www.anywebsite.com';
    return new Promise(function (resolve, reject) {
        // request html
        request(SITE, function(error, response, html) {
            if (!error && response.statusCode == 200) {
                resolve(new SiteParser(html));
            }
            else {
                reject(error);
            }
        });
    });
}
promiseSiteParser().then(function (parser) {
    var cats = parser.getCategories();
    // ...etc
}, function (error) {
    throw new Error('Could not parse web site\nError text: ' + error);
});
I'm building a RESTful API with Node.js; this is my code.
There are 3 main parts: I created a GET route that has to return some data; the handler calls getImportIoData, which makes a request to an external API; and that method calls another method, getEmailFromWebSite, which makes a further request for every result.
How can I wait for every request to finish and then return the data?
I know this has been answered in other questions, but I tried some of them and they didn't work.
I tried the async and q libraries; the callback in this part doesn't work either:
if (item.website !== undefined){
getEmailFromWebSite(item.website, function(email){console.log(email); item.email = email;});
}
I don't know if I'm doing something wrong.
Can someone help me with this? using my code
Thanks in advance
router.get('/', function(req, res) {
    var Type = req.query.Type;
    var Locations = req.query.Locations;
    var Page = req.query.Page;
    res.send(getImportIoData(res, Type, Locations, Page));
});
function getImportIoData(res, Type, Locations, Page) {
    var criteria = {
        'search_terms': Type,
        'geo_location_terms': Locations,
        'page': Page
    }
    var url = consts.import_io + consts.import_key + consts.query +
        encodeURIComponent(consts.api_url) + encodeQueryData(criteria) +
        /*"&_user=" + consts.user + */"&_apikey=" + consts.api_key;
    request(url, function(error, response, json) {
        var data = JSON.parse(json);
        if (!error && response.statusCode === 200) {
            var doctors = _.forEach(data.results, function(item) {
                if (item.website !== undefined){
                    getEmailFromWebSite(item.website, function(email){console.log(email); item.email = email;});
                }
            });
            return data.results;
        }
        //else
    });
}
function getEmailFromWebSite(website, callback) {
    var EmptyReturn='';
    searchTerm = extractDomain(website).replace('www.', '');
    if (searchTerm != ''){
        var EH_APIURL = 'https://api.emailhunter.co/v1/search?domain='+searchTerm+'&api_key='+ consts.EmailHunterAPIKey;
        request(EH_APIURL, function(error, response, json) {
            var data = JSON.parse(json);
            if (!error && response.statusCode === 200) {
                if((data.emails).length != 0){
                    var Emailobject = data.emails;
                    var EmailString = returnEmails(Emailobject);
                    callback(EmailString);
                }else
                    callback(EmptyReturn);
            }else if (response.statusCode === 429)
                console.log('The number of Request has been Reached for this account');
            else if (response.statusCode > 500)
                console.log('Error with Email Hunter Servers');
            else
                console.log('An Error Ocurred');
        });
    }
}
There's no code related to q in there, but q.all receives an array of promises and resolves when all of those promises are resolved, so you can do something like this:
var promises = [ promiseOne(), promiseTwo(), promiseThree() ];
q.all(promises).then(function(results) {
//results is an array with the resolution values from all your promises in the order you specified in the promises array
});
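For what it's worth, here is a sketch of how that idea could map onto the code in the question using native Promises (q.all works the same way). It assumes getEmailFromWebSite is adjusted so that every branch calls its callback, otherwise the promises would never settle, and it moves the res.send into getImportIoData instead of the route handler:

function getImportIoData(res, Type, Locations, Page) {
    // ... build criteria and url exactly as in the question ...
    request(url, function(error, response, json) {
        if (error || response.statusCode !== 200) return res.status(500).send('Import.io request failed');
        var data = JSON.parse(json);

        // One promise per result that has a website; each resolves once
        // getEmailFromWebSite has called back with the email (or '').
        var emailLookups = data.results
            .filter(function(item) { return item.website !== undefined; })
            .map(function(item) {
                return new Promise(function(resolve) {
                    getEmailFromWebSite(item.website, function(email) {
                        item.email = email;
                        resolve(item);
                    });
                });
            });

        Promise.all(emailLookups).then(function() {
            res.send(data.results); // respond only after every email lookup has finished
        });
    });
}

// The route handler then just delegates:
router.get('/', function(req, res) {
    getImportIoData(res, req.query.Type, req.query.Locations, req.query.Page);
});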
Firstly, here is my code as I've progressed so far:
var http = require("http");

// Utility function that downloads a URL and invokes
// callback with the data.
function download(url, callback) {
    http.get(url, function(res) {
        var data = "";
        res.on('data', function (chunk) {
            data += chunk;
        });
        res.on("end", function() {
            callback(data);
        });
    }).on("error", function() {
        callback(null);
    });
}
var cheerio = require("cheerio");
var url = "http://www.bloglovin.com/en/blogs/1/2/all";
var myArray = [];
var a = 0;
var getLinks = function(){
    download(url, function(data) {
        if (data) {
            // console.log(data);
            var $ = cheerio.load(data);
            $(".content").each(function(i, e) {
                var blogName = $(e).find(".blog-name").text();
                var followLink = $(e).find("a").attr("href");
                var blogSite = $(e).find(".description").text();
                myArray[a] = [a];
                myArray[a]["blogName"] = blogName;
                myArray[a]["followLink"] = "http://www.bloglovin.com"+followLink;
                myArray[a]["blogSite"] = blogSite;
                a++;
                console.log(myArray);
            });
        }
    });
}
getLinks();
As you can see, each followLink is concatenated onto the base URL. I'd like to pass each of those new URLs back through download(), so that I'm effectively scraping each of those pages with the same CSS rules, and the results get added to the multidimensional array entry for the corresponding blogger.
How can I go about this?
I do something similar in one of my scraping jobs, but I use the async.js library to accomplish it. Note that I'm also using the request module and cheerio.js in my scraping. I fetch and scrape rows of data from a single webpage, but I suspect you could do something similar to fetch your URLs and request/scrape them in the same manner.
I also admit this is quite basic coding, certainly could be optimized with a bit of refactoring. Hope it gives you some ideas at least...
First, I use request to fetch the page and call my parse function -
var url = 'http://www.target-website.com';
function(lastCallback) {
    request(url, function(err, resp, body) {
        if(!err) { parsePage(err, resp, body, lastCallback); }
        else { console.log('web request error:' + resp.statusCode); }
    });
}
Next, in my parsePage function, I load the website into Cheerio, fetch the HTML of each data row into an array, push my parseRow function and each HTML segment into another array, and use async.parallel to process each iteration -
var rows = [];

function parsePage(err, resp, body, callback1) {
    var $ = cheerio.load(body);
    $('div#targetTable tr').each(function(i, elem) {
        rows.push($(this).html());
    });
    var scrRows = [];
    rows.forEach(function(row) {
        scrRows.push(function(callback2) {
            parseRow(err, resp, row);
            callback2();
        });
    });
    async.parallel(scrRows, function() {
        callback1();
    });
}
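Applied to the question above, the same pattern might look roughly like this. It is only a sketch: it assumes var async = require('async'), that it runs after getLinks() has finished filling myArray, and it uses a hypothetical .post-title selector standing in for whatever you actually want to pull from each followLink page:

// One task per blog entry: download its followLink with the same
// download() helper and store whatever you scrape back on the entry.
var tasks = myArray.map(function(entry) {
    return function(done) {
        download(entry.followLink, function(data) {
            if (data) {
                var $ = cheerio.load(data);
                entry.postCount = $(".post-title").length; // hypothetical per-page rule
            }
            done();
        });
    };
});

async.parallel(tasks, function() {
    console.log(myArray); // every followLink page has been scraped at this point
});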
Inside your loop, just create an object with the properties you scrape then push that object onto your array.
var blogInfo = {
    blogName: blogName,
    followLink: "http://www.bloglovin.com" + followLink,
    blogSite: blogSite
};

myArray.push(blogInfo);
You have defined a = 0, so
myArray[a] = [a]; // => myArray[0] = [0]; myArray[0] becomes an array with 0 as its only member
The following statements then don't do what you expect, because an array is meant to be indexed with integers; string keys like these become properties on the array object rather than array elements:
myArray[a]["blogName"] = blogName;
myArray[a]["followLink"] = "http://www.bloglovin.com"+followLink;
myArray[a]["blogSite"] = blogSite;
Instead try this:
var obj = {
    index: a,
    blogName: blogName,
    followLink: "http://www.bloglovin.com" + followLink,
    blogSite: blogSite
}
myArray.push(obj);
console.log(myArray);