iterate node.js request function

iterate node.js request function - javascript

This question is about a crawler in node.js.
A start_url is given where he crawls for URLs, and "pushes" them to a .json-file (output.json).
At the moment, he runs the request function only with the start_url, and saves the collected URLs in output.json. I want that he uses the saved URLs by replacing the start_url with the first collected URL and collect links again ... and so on ...
var fs = require('fs');
var request = require('request');
var cheerio = require('cheerio');
var start_url = ["http://stackoverflow.com/"]
var req = function(url){
request(url, function(error, response, html){
var $ = cheerio.load(html);
var data = [];
$("a").each(function() {
var link = $(this);
var exurls = {exurl: new Array(link.attr("href"))}
data.push(exurls);
// Queue "exurls" for "start_url" and call the same function with the new URL (endless loop)
// save to "output.json" from time to time, so you can stop it anytime
});
fs.writeFile("output.json", JSON.stringify(data, null, 4), function(err){
if(err){
console.log(err);
} else {
console.log("File successfully written!");
}
});
});
}
for (var i = 0; i < start_url.length; i++){
req(start_url[i]);
}

So what you can do is make the function call recursively. The below example should work:
var fs = require('fs');
var request = require('request');
var cheerio = require('cheerio');
var start_url = ["http://stackoverflow.com/"]
var req = function(url){
var count = 0;
request(url, function(error, response, html){
var $ = cheerio.load(html);
$("a").each(function() {
var link = $(this);
var exurls = {exurl: new Array(link.attr("href"))}
start_url.push(exurls);
// Queue "exurls" for "start_url" and call the same function with the new URL (endless loop)
// save to "output.json" from time to time, so you can stop it anytime
});
try {
fs.writeFileSync("output.json");
console.log("File successfully written!");
}catch(err){
console.log(err);
}
++count;
if(start_url.length > count) {
req(start_url[count]);
}
});
}
return req(start_url[0]);
The problem with this is that you are completely rewriting the file each time. If this goes on for awhile you are going to run out of memory. Another option is to create a write stream
var fs = require('fs');
var request = require('request');
var cheerio = require('cheerio');
var start_url = ["http://stackoverflow.com/"]
var wstream = fs.createWriteStream("output.json");
var req = function(url){
request(url, function(error, response, html){
var $ = cheerio.load(html);
$("a").each(function() {
var link = $(this);
var exurls = {exurl: new Array(link.attr("href"))}
start_url.push(exurls);
// Queue "exurls" for "start_url" and call the same function with the new URL (endless loop)
// save to "output.json" from time to time, so you can stop it anytime
wstream.write('"'+ exurls + '",');
});
start_url.shift();
if(start_url.length > 0) {
return req(start_url[0]);
}
wstream.end();
});
}
req(start_url[0]);
Edit: switched to a basic queue so combat memory problems

Related

How to send the form data to the JS program

user will give the url in the input type field in the Html page that url need to get in the JS program and then the JS program need to execute to fetch the data from webpage.
this is what have done so far.
var request = require('request');
var cheerio = require('cheerio');
var fs = require("fs");
var url = ""
request(url, function(err, response, html){
if(!err) {
var $ =cheerio.load(html);
var allItems = $('.clearfix').parent().children();
var items = [];
allItems.each(function(index) {
var result = $('.clearfix').eq(index).parent().children().eq(1).find("a").text();
if(result !== ""){
items.push(result);
}
});
fs.writeFile("output1.xls",JSON.stringify(items, null, 1),)
console.log(items);
}
});

Is this the solution to your problem?
var url = document.getElementById('myURL').value

JavaScript each() is not working

var request = require('request'),
cheerio = require('cheerio');
var url = "https://namu.wiki/w/크롤링";
request(url, function (err, res, html) {
if (!err) {
var $ = cheerio.load(html);
$('.wiki-heading-content').each(function(){
var post = {"content": "" };
var data=$(this);
post['content']=data.text();
console.log(post);
});
}
});
The line of code below is not working. Why?
//$('.wiki-heading-content').each(function()

You are using jQuery in your code and you overwrite(in the function scope) your global jQuery object on this line.
var $ = cheerio.load(html);

How can I rewrite this with promises?

I am building a content scraper for a tshirt website.
The goal is to enter a website through only one hardcoded url: http://shirts4mike.com
I will then find all the product pages for each tshirt, and then create a object with it's details. Then add it to an array.
When the array is full of the tshirts, I'll work through the array and log it into a CSV file.
Right now, I am having some trouble with the timing of the requests/responses and the function calls.
How can I make sure that I call the NEXT function on the right time? I understand that it's not working because of it's async nature.
How can I call secondScrape, lastScraper and convertJson2Csv at the right time so that the variables they're working with are not undefined?
I tried to use something such as response.end() but this is not working.
I'm assuming I NEED to use promises to make this work properly? and to be legible?
Any ideas? My code is below:
//Modules being used:
var cheerio = require('cheerio');
var request = require('request');
var moment = require('moment');
//hardcoded url
var url = 'http://shirts4mike.com/';
//url for tshirt pages
var urlSet = new Set();
var remainder;
var tshirtArray;
// Load front page of shirts4mike
request(url, function(error, response, html) {
if(!error && response.statusCode == 200){
var $ = cheerio.load(html);
//iterate over links with 'shirt'
$("a[href*=shirt]").each(function(){
var a = $(this).attr('href');
//create new link
var scrapeLink = url + a;
//for each new link, go in and find out if there is a submit button.
//If there, add it to the set
request(scrapeLink, function(error,response, html){
if(!error && response.statusCode == 200) {
var $ = cheerio.load(html);
//if page has a submit it must be a product page
if($('[type=submit]').length !== 0){
//add page to set
urlSet.add(scrapeLink);
} else if(remainder === undefined) {
//if not a product page, add it to remainder so it another scrape can be performed.
remainder = scrapeLink;
}
}
});
});
}
//call second scrape for remainder
secondScrape();
});
function secondScrape() {
request(remainder, function(error, response, html) {
if(!error && response.statusCode == 200){
var $ = cheerio.load(html);
$("a[href*=shirt]").each(function(){
var a = $(this).attr('href');
//create new link
var scrapeLink = url + a;
request(scrapeLink, function(error,response, html){
if(!error && response.statusCode == 200){
var $ = cheerio.load(html);
//collect remaining product pages and add to set
if($('[type=submit]').length !== 0){
urlSet.add(scrapeLink);
}
}
});
});
}
});
console.log(urlSet);
//call lastScraper so we can grab data from the set (product pages)
lastScraper();
};
function lastScraper(){
//scrape set, product pages
for(var i = 0; i < urlSet.length; i++){
var url = urlSet[i];
request(url, function(error, response, html){
if(!error && response.statusCode == 200){
var $ = cheerio.load(html);
//grab data and store as variables
var price = $('.price').text();
var img = $('.shirt-picture').find("img").attr("src");
var title = $('body').find(".shirt-details > h1").text().slice(4);
var tshirtObject = {};
//add values into tshirt object
tshirtObject.price = price;
tshirtObject.img = img;
tshirtObject.title = title;
tshirtObject.url = url;
tshirtObject.date = moment().format('MMMM Do YYYY, h:mm:ss a');
//add the object into the array of tshirts
tshirtArray.push(tshirtObject);
}
});
}
//call function to iterate through tshirt objects in array in order to convert to JSON, then into CSV to be logged
convertJson2Csv();
};

There is a npm module called request-promise.
simply:
var rp = require("request-promise");
and anywhere you are making a request you can switch with request-promise.
for instance:
rp(url)
.then(function(value){
//do whatever
})
.catch(function(err){
console.log(err)
})

You can use this example to convert the rest of your code sample.
promise = new Promise((resolve, reject) => (
request("http://shirts4mike.com/",
(err, response, html) => (response.statusCode == 200 ? resolve(html): reject(err))
)));
promise.then(html => {
var $ = cheerio.load(html);
// continue
});

You can use waterfall method of async module which can give you a smooth way to resolve this issue.
I just try to do your code with this module
Hope this will work for you
Format of waterfall
async.waterfall([
function(callback) {
callback(null, previousvalue);
},
function(previousvalue, callback) {}
], function(err, result) { //Final callback
});
var async = require('async');
var cheerio = require('cheerio');
var request = require('request');
var moment = require('moment');
//hardcoded url
var url = 'http://shirts4mike.com/';
//url for tshirt pages
var urlSet = new Set();
var remainder;
var tshirtArray = [];
async.waterfall([
function(callback) {
// Load front page of shirts4mike
request(url, function(error, response, html) {
if (!error && response.statusCode == 200) {
var $ = cheerio.load(html);
//iterate over links with 'shirt'
$("a[href*=shirt]").each(function() {
var a = $(this).attr('href');
//create new link
var scrapeLink = url + a;
//for each new link, go in and find out if there is a submit button.
//If there, add it to the set
request(scrapeLink, function(error, response, html) {
if (!error && response.statusCode == 200) {
var $ = cheerio.load(html);
//if page has a submit it must be a product page
if ($('[type=submit]').length !== 0) {
//add page to set
urlSet.add(scrapeLink);
callback(null, true);
} else if (remainder === undefined) {
//if not a product page, add it to remainder so it another scrape can be performed.
remainder = scrapeLink;
callback(nul, true);
}
}
});
});
}
//call second scrape for remainder
// secondScrape();
});
},
function(previousvalue, callback) {
request(remainder, function(error, response, html) {
if (!error && response.statusCode == 200) {
var $ = cheerio.load(html);
$("a[href*=shirt]").each(function() {
var a = $(this).attr('href');
//create new link
var scrapeLink = url + a;
request(scrapeLink, function(error, response, html) {
if (!error && response.statusCode == 200) {
var $ = cheerio.load(html);
//collect remaining product pages and add to set
if ($('[type=submit]').length !== 0) {
urlSet.add(scrapeLink);
}
callback(null, true);
}
});
});
}
});
console.log(urlSet);
//call lastScraper so we can grab data from the set (product pages)
},
function(previousvalue, callback) {
//scrape set, product pages
for (var i = 0; i < urlSet.length; i++) {
var url = urlSet[i];
request(url, function(error, response, html) {
if (!error && response.statusCode == 200) {
var $ = cheerio.load(html);
//grab data and store as variables
var price = $('.price').text();
var img = $('.shirt-picture').find("img").attr("src");
var title = $('body').find(".shirt-details > h1").text().slice(4);
var tshirtObject = {};
//add values into tshirt object
tshirtObject.price = price;
tshirtObject.img = img;
tshirtObject.title = title;
tshirtObject.url = url;
tshirtObject.date = moment().format('MMMM Do YYYY, h:mm:ss a');
//add the object into the array of tshirts
tshirtArray.push(tshirtObject);
}
});
}
}
], function(err, result) {
//call function to iterate through tshirt objects in array in order to convert to JSON, then into CSV to be logged
convertJson2Csv();
});

You correctly identify promises as a way ahead to solving your timing issues.
In order to have promises available, you need to promisify request (or adopt a HTTP lib, whose methods return promises).
You could just fix the timing issues with promises, but you could also take the opportunity to improve the overall paradigm. Instead of discrete functions for virtually identical first/second/third stages, you can write a single function that calls itself recursively. Written correctly, this will ensure that each page in the target site is visited a maximum of once; revisits should be avoided on grounds of overall performance, and loading of the target server.
//Modules being used:
var Promise = require('path/to/bluebird');
var cheerio = require('cheerio');
var moment = require('moment');
// Promisify `request` to make `request.getAsync()` available.
// Ref: http://stackoverflow.com/questions/28308131/how-do-you-properly-promisify-request
var request = Promise.promisify(require('request'));
Promise.promisifyAll(request);
//hardcoded url
var url = 'http://shirts4mike.com/';
var urlSet = new Set();
var tshirtArray = [];
var maxLevels = 3; // limit the recursion to this number of levels.
function scrapePage(url_, levelCounter) {
// Bale out if :
// a) the target url_ has been visited already,
// b) maxLevels has been reached.
if(urlSet.has(url_) || levelCounter >= maxLevels) {
return Promise.resolve();
}
urlSet.add(url_);
return request.getAsync(url_).then(function(response, html) {
var $;
if(response.statusCode !== 200) {
throw new Error('statusCode was not 200'); // will be caught below
}
$ = cheerio.load(html);
if($('[type=submit]').length > 0) {
// yay, it's a product page.
tshirtArray.push({
price: $('.price').text(),
img: $('.shirt-picture').find("img").attr("src"),
title: $('body').find(".shirt-details > h1").text().slice(4),
url: url_,
date: moment().format('MMMM Do YYYY, h:mm:ss a')
});
}
// find any shirt links on page represented by $, visit each link in turn, and scrape.
return Promise.all($("a[href*=shirt]").map(function(link) {
return scrapePage(link.href, levelCounter + 1);
}).get());
}).catch(function(e) {
// ensure "success" even if scraping threw an error.
console.log(e);
return null;
});
}
scrapePage(url, 0).then(convertJson2Csv);
As you can see, a recursive solution :
avoids repetition of code,
will drill down as many levels as you wish - determined by the variable maxLevels.
Note: This is still not a good solution. There's an implicit assumption here, as in the original code, that all shirt pages are reachable from the site's home page, via "shirt" links alone. If shirts were reachable via eg "clothing" > "shirts", then the code above won't find any shirts.

How to write data in a node.js stream without duplicates?

This question is about a URL-crawler in node.js.
On the start_url URL he looks for links and "pushes" them to a .json-file (output.json).
How can I make sure that he does not "push" or "write" domains twice to output.json (so that I do not get duplicates)? I've been using the hash function but this has caused problems.
var fs = require('fs');
var request = require('request');
var cheerio = require('cheerio');
var start_url = ["http://blog.codinghorror.com/"]
var wstream = fs.createWriteStream("output.json");
// Extract root domain name from string
function extractDomain(url) {
var domain;
if (url.indexOf("://") > -1) { //find & remove protocol (http(s), ftp, etc.) and get domain
domain = url.split('/')[2];
} else {
domain = url.split('/')[0];
}
domain = domain.split(':')[0]; //find & remove port number
return domain;
}
var req = function(url){
request(url, function(error, response, html){
if(!error){
var $ = cheerio.load(html);
$("a").each(function() {
var link = $(this).attr("href");
var makelinkplain = extractDomain(link);
start_url.push("http://" + makelinkplain);
wstream.write('"http://'+ makelinkplain + '",');
});
}
start_url.shift();
if(start_url.length > 0) {
return req(start_url[0]);
}
wstream.end();
});
}
req(start_url[0]);

You can just keep track of the previously seen domains in a Set object like this:
var fs = require('fs');
var request = require('request');
var cheerio = require('cheerio');
var domainList = new Set();
var start_url = ["http://blog.codinghorror.com/"]
var wstream = fs.createWriteStream("output.json");
// Extract root domain name from string
function extractDomain(url) {
var domain;
if (url.indexOf("://") > -1) { //find & remove protocol (http(s), ftp, etc.) and get domain
domain = url.split('/')[2];
} else {
domain = url.split('/')[0];
}
domain = domain.split(':')[0]; //find & remove port number
// since domains are not case sensitive, canonicalize it by going to lowercase
return domain.toLowerCase();
}
var req = function(url){
request(url, function(error, response, html){
if(!error){
var $ = cheerio.load(html);
$("a").each(function() {
var link = $(this).attr("href");
if (link) {
var makelinkplain = extractDomain(link);
// see if we've already done this domain
if (!domainList.has(makelinkplain)) {
domainList.add(makelinkplain);
start_url.push("http://" + makelinkplain);
wstream.write('"http://'+ makelinkplain + '",');
}
}
});
}
start_url.shift();
if(start_url.length > 0) {
return req(start_url[0]);
}
wstream.end();
});
}
req(start_url[0]);
Note: I also added a .toLowerCase() to the extractDomain() function since domains are not case sensitive, but a Set object is. This will make sure that even domains that differ only in case are recognized as the same domain.

Handle on success event with npm package request

I have a node.js app that scrapes informations from a website. I'm using npm packages request and cheerio and the scraping works fine but I want to do something else when the request function is done. Here's some code:
app.js
var express = require('express');
var extractor = require("./extractor");
console.log(extractor('http://www.example.com'));
var app = express();
app.get('/', function (req, res) {
res.send('Hello world\n');
});
app.listen(3000);
extractor.js (all the fun)
var request = require('request');
var cheerio = require('cheerio');
var Extractor = function(url) {
var games = [];
request(url, function (error, response, html) {
if (!error && response.statusCode == 200) {
var $ = cheerio.load(html);
$('tr.game').each(function(i, v){
var game = { /* many attributes */ };
games.push(game);
});
}
});
this.extractedGames = games;
};
module.exports = function(url) {
return new Extractor(url);
};
Eventually when I run this it shows { extractedGames: [] } that is because the output was printed before the request treatment was over. So I want to add an on success event to extracedGames attribute when the request job is over.
Thanks

Solved it myself ! I hope this could help people in the future (though I felt like a complete noob)
The trick was to emit an event and handle it later.
var express = require('express');
var extractor = require("./extractor");
extractor('http://www.example.com');
var app = express();
app.get('/', function (req, res) {
res.send('Hello world\n');
});
app.listen(3000);
I removed console.log(extractor('http://www.example.com')) because this would run before the request job is done. So I moved it to the event handling function.
var request = require('request');
var cheerio = require('cheerio');
var Emitter = require('events').EventEmitter;
var extractEmitter = new Emitter();
extractEmitter.on('extracted', function(extractedGames){
console.log(extractedGames);
});
var Extractor = function(url) {
var games = [];
request(url, function (error, response, html) {
if (!error && response.statusCode == 200) {
var $ = cheerio.load(html);
$('tr.game').each(function(i, v){
var game = { /* many attributes */ };
games.push(game);
});
extractEmitter.emit('extracted', games);
}
});
this.extractedGames = games;
};
module.exports = function(url) {
return new Extractor(url);
};

We Keep Coding

JavaScript is the programming language of the Web.

iterate node.js request function - javascript

Related

How to send the form data to the JS program

JavaScript each() is not working

How can I rewrite this with promises?

How to write data in a node.js stream without duplicates?

Handle on success event with npm package request

Categories

Resources