user will give the url in the input type field in the Html page that url need to get in the JS program and then the JS program need to execute to fetch the data from webpage.
this is what have done so far.
var request = require('request');
var cheerio = require('cheerio');
var fs = require("fs");
var url = ""
request(url, function(err, response, html){
if(!err) {
var $ =cheerio.load(html);
var allItems = $('.clearfix').parent().children();
var items = [];
allItems.each(function(index) {
var result = $('.clearfix').eq(index).parent().children().eq(1).find("a").text();
if(result !== ""){
items.push(result);
}
});
fs.writeFile("output1.xls",JSON.stringify(items, null, 1),)
console.log(items);
}
});
Is this the solution to your problem?
var url = document.getElementById('myURL').value
var request = require('request'),
cheerio = require('cheerio');
var url = "https://namu.wiki/w/크롤링";
request(url, function (err, res, html) {
if (!err) {
var $ = cheerio.load(html);
$('.wiki-heading-content').each(function(){
var post = {"content": "" };
var data=$(this);
post['content']=data.text();
console.log(post);
});
}
});
The line of code below is not working. Why?
//$('.wiki-heading-content').each(function()
You are using jQuery in your code and you overwrite(in the function scope) your global jQuery object on this line.
var $ = cheerio.load(html);
I am building a content scraper for a tshirt website.
The goal is to enter a website through only one hardcoded url: http://shirts4mike.com
I will then find all the product pages for each tshirt, and then create a object with it's details. Then add it to an array.
When the array is full of the tshirts, I'll work through the array and log it into a CSV file.
Right now, I am having some trouble with the timing of the requests/responses and the function calls.
How can I make sure that I call the NEXT function on the right time? I understand that it's not working because of it's async nature.
How can I call secondScrape, lastScraper and convertJson2Csv at the right time so that the variables they're working with are not undefined?
I tried to use something such as response.end() but this is not working.
I'm assuming I NEED to use promises to make this work properly? and to be legible?
Any ideas? My code is below:
//Modules being used:
var cheerio = require('cheerio');
var request = require('request');
var moment = require('moment');
//hardcoded url
var url = 'http://shirts4mike.com/';
//url for tshirt pages
var urlSet = new Set();
var remainder;
var tshirtArray;
// Load front page of shirts4mike
request(url, function(error, response, html) {
if(!error && response.statusCode == 200){
var $ = cheerio.load(html);
//iterate over links with 'shirt'
$("a[href*=shirt]").each(function(){
var a = $(this).attr('href');
//create new link
var scrapeLink = url + a;
//for each new link, go in and find out if there is a submit button.
//If there, add it to the set
request(scrapeLink, function(error,response, html){
if(!error && response.statusCode == 200) {
var $ = cheerio.load(html);
//if page has a submit it must be a product page
if($('[type=submit]').length !== 0){
//add page to set
urlSet.add(scrapeLink);
} else if(remainder === undefined) {
//if not a product page, add it to remainder so it another scrape can be performed.
remainder = scrapeLink;
}
}
});
});
}
//call second scrape for remainder
secondScrape();
});
function secondScrape() {
request(remainder, function(error, response, html) {
if(!error && response.statusCode == 200){
var $ = cheerio.load(html);
$("a[href*=shirt]").each(function(){
var a = $(this).attr('href');
//create new link
var scrapeLink = url + a;
request(scrapeLink, function(error,response, html){
if(!error && response.statusCode == 200){
var $ = cheerio.load(html);
//collect remaining product pages and add to set
if($('[type=submit]').length !== 0){
urlSet.add(scrapeLink);
}
}
});
});
}
});
console.log(urlSet);
//call lastScraper so we can grab data from the set (product pages)
lastScraper();
};
function lastScraper(){
//scrape set, product pages
for(var i = 0; i < urlSet.length; i++){
var url = urlSet[i];
request(url, function(error, response, html){
if(!error && response.statusCode == 200){
var $ = cheerio.load(html);
//grab data and store as variables
var price = $('.price').text();
var img = $('.shirt-picture').find("img").attr("src");
var title = $('body').find(".shirt-details > h1").text().slice(4);
var tshirtObject = {};
//add values into tshirt object
tshirtObject.price = price;
tshirtObject.img = img;
tshirtObject.title = title;
tshirtObject.url = url;
tshirtObject.date = moment().format('MMMM Do YYYY, h:mm:ss a');
//add the object into the array of tshirts
tshirtArray.push(tshirtObject);
}
});
}
//call function to iterate through tshirt objects in array in order to convert to JSON, then into CSV to be logged
convertJson2Csv();
};
There is a npm module called request-promise.
simply:
var rp = require("request-promise");
and anywhere you are making a request you can switch with request-promise.
for instance:
rp(url)
.then(function(value){
//do whatever
})
.catch(function(err){
console.log(err)
})
You can use this example to convert the rest of your code sample.
promise = new Promise((resolve, reject) => (
request("http://shirts4mike.com/",
(err, response, html) => (response.statusCode == 200 ? resolve(html): reject(err))
)));
promise.then(html => {
var $ = cheerio.load(html);
// continue
});
You can use waterfall method of async module which can give you a smooth way to resolve this issue.
I just try to do your code with this module
Hope this will work for you
Format of waterfall
async.waterfall([
function(callback) {
callback(null, previousvalue);
},
function(previousvalue, callback) {}
], function(err, result) { //Final callback
});
var async = require('async');
var cheerio = require('cheerio');
var request = require('request');
var moment = require('moment');
//hardcoded url
var url = 'http://shirts4mike.com/';
//url for tshirt pages
var urlSet = new Set();
var remainder;
var tshirtArray = [];
async.waterfall([
function(callback) {
// Load front page of shirts4mike
request(url, function(error, response, html) {
if (!error && response.statusCode == 200) {
var $ = cheerio.load(html);
//iterate over links with 'shirt'
$("a[href*=shirt]").each(function() {
var a = $(this).attr('href');
//create new link
var scrapeLink = url + a;
//for each new link, go in and find out if there is a submit button.
//If there, add it to the set
request(scrapeLink, function(error, response, html) {
if (!error && response.statusCode == 200) {
var $ = cheerio.load(html);
//if page has a submit it must be a product page
if ($('[type=submit]').length !== 0) {
//add page to set
urlSet.add(scrapeLink);
callback(null, true);
} else if (remainder === undefined) {
//if not a product page, add it to remainder so it another scrape can be performed.
remainder = scrapeLink;
callback(nul, true);
}
}
});
});
}
//call second scrape for remainder
// secondScrape();
});
},
function(previousvalue, callback) {
request(remainder, function(error, response, html) {
if (!error && response.statusCode == 200) {
var $ = cheerio.load(html);
$("a[href*=shirt]").each(function() {
var a = $(this).attr('href');
//create new link
var scrapeLink = url + a;
request(scrapeLink, function(error, response, html) {
if (!error && response.statusCode == 200) {
var $ = cheerio.load(html);
//collect remaining product pages and add to set
if ($('[type=submit]').length !== 0) {
urlSet.add(scrapeLink);
}
callback(null, true);
}
});
});
}
});
console.log(urlSet);
//call lastScraper so we can grab data from the set (product pages)
},
function(previousvalue, callback) {
//scrape set, product pages
for (var i = 0; i < urlSet.length; i++) {
var url = urlSet[i];
request(url, function(error, response, html) {
if (!error && response.statusCode == 200) {
var $ = cheerio.load(html);
//grab data and store as variables
var price = $('.price').text();
var img = $('.shirt-picture').find("img").attr("src");
var title = $('body').find(".shirt-details > h1").text().slice(4);
var tshirtObject = {};
//add values into tshirt object
tshirtObject.price = price;
tshirtObject.img = img;
tshirtObject.title = title;
tshirtObject.url = url;
tshirtObject.date = moment().format('MMMM Do YYYY, h:mm:ss a');
//add the object into the array of tshirts
tshirtArray.push(tshirtObject);
}
});
}
}
], function(err, result) {
//call function to iterate through tshirt objects in array in order to convert to JSON, then into CSV to be logged
convertJson2Csv();
});
You correctly identify promises as a way ahead to solving your timing issues.
In order to have promises available, you need to promisify request (or adopt a HTTP lib, whose methods return promises).
You could just fix the timing issues with promises, but you could also take the opportunity to improve the overall paradigm. Instead of discrete functions for virtually identical first/second/third stages, you can write a single function that calls itself recursively. Written correctly, this will ensure that each page in the target site is visited a maximum of once; revisits should be avoided on grounds of overall performance, and loading of the target server.
//Modules being used:
var Promise = require('path/to/bluebird');
var cheerio = require('cheerio');
var moment = require('moment');
// Promisify `request` to make `request.getAsync()` available.
// Ref: http://stackoverflow.com/questions/28308131/how-do-you-properly-promisify-request
var request = Promise.promisify(require('request'));
Promise.promisifyAll(request);
//hardcoded url
var url = 'http://shirts4mike.com/';
var urlSet = new Set();
var tshirtArray = [];
var maxLevels = 3; // limit the recursion to this number of levels.
function scrapePage(url_, levelCounter) {
// Bale out if :
// a) the target url_ has been visited already,
// b) maxLevels has been reached.
if(urlSet.has(url_) || levelCounter >= maxLevels) {
return Promise.resolve();
}
urlSet.add(url_);
return request.getAsync(url_).then(function(response, html) {
var $;
if(response.statusCode !== 200) {
throw new Error('statusCode was not 200'); // will be caught below
}
$ = cheerio.load(html);
if($('[type=submit]').length > 0) {
// yay, it's a product page.
tshirtArray.push({
price: $('.price').text(),
img: $('.shirt-picture').find("img").attr("src"),
title: $('body').find(".shirt-details > h1").text().slice(4),
url: url_,
date: moment().format('MMMM Do YYYY, h:mm:ss a')
});
}
// find any shirt links on page represented by $, visit each link in turn, and scrape.
return Promise.all($("a[href*=shirt]").map(function(link) {
return scrapePage(link.href, levelCounter + 1);
}).get());
}).catch(function(e) {
// ensure "success" even if scraping threw an error.
console.log(e);
return null;
});
}
scrapePage(url, 0).then(convertJson2Csv);
As you can see, a recursive solution :
avoids repetition of code,
will drill down as many levels as you wish - determined by the variable maxLevels.
Note: This is still not a good solution. There's an implicit assumption here, as in the original code, that all shirt pages are reachable from the site's home page, via "shirt" links alone. If shirts were reachable via eg "clothing" > "shirts", then the code above won't find any shirts.
This question is about a crawler in node.js.
A start_url is given where he crawls for URLs, and "pushes" them to a .json-file (output.json).
At the moment, he runs the request function only with the start_url, and saves the collected URLs in output.json. I want that he uses the saved URLs by replacing the start_url with the first collected URL and collect links again ... and so on ...
var fs = require('fs');
var request = require('request');
var cheerio = require('cheerio');
var start_url = ["http://stackoverflow.com/"]
var req = function(url){
request(url, function(error, response, html){
var $ = cheerio.load(html);
var data = [];
$("a").each(function() {
var link = $(this);
var exurls = {exurl: new Array(link.attr("href"))}
data.push(exurls);
// Queue "exurls" for "start_url" and call the same function with the new URL (endless loop)
// save to "output.json" from time to time, so you can stop it anytime
});
fs.writeFile("output.json", JSON.stringify(data, null, 4), function(err){
if(err){
console.log(err);
} else {
console.log("File successfully written!");
}
});
});
}
for (var i = 0; i < start_url.length; i++){
req(start_url[i]);
}
So what you can do is make the function call recursively. The below example should work:
var fs = require('fs');
var request = require('request');
var cheerio = require('cheerio');
var start_url = ["http://stackoverflow.com/"]
var req = function(url){
var count = 0;
request(url, function(error, response, html){
var $ = cheerio.load(html);
$("a").each(function() {
var link = $(this);
var exurls = {exurl: new Array(link.attr("href"))}
start_url.push(exurls);
// Queue "exurls" for "start_url" and call the same function with the new URL (endless loop)
// save to "output.json" from time to time, so you can stop it anytime
});
try {
fs.writeFileSync("output.json");
console.log("File successfully written!");
}catch(err){
console.log(err);
}
++count;
if(start_url.length > count) {
req(start_url[count]);
}
});
}
return req(start_url[0]);
The problem with this is that you are completely rewriting the file each time. If this goes on for awhile you are going to run out of memory. Another option is to create a write stream
var fs = require('fs');
var request = require('request');
var cheerio = require('cheerio');
var start_url = ["http://stackoverflow.com/"]
var wstream = fs.createWriteStream("output.json");
var req = function(url){
request(url, function(error, response, html){
var $ = cheerio.load(html);
$("a").each(function() {
var link = $(this);
var exurls = {exurl: new Array(link.attr("href"))}
start_url.push(exurls);
// Queue "exurls" for "start_url" and call the same function with the new URL (endless loop)
// save to "output.json" from time to time, so you can stop it anytime
wstream.write('"'+ exurls + '",');
});
start_url.shift();
if(start_url.length > 0) {
return req(start_url[0]);
}
wstream.end();
});
}
req(start_url[0]);
Edit: switched to a basic queue so combat memory problems
Trying to scrape the front page of a website (www.ozbargain.com) to return any content in the a tag that holds a reference to xbox but nothing is being returned to console. I believe the issue is with the if statement with :contains.
var fs = require('fs'),
request = require('request'),
cheerio = require('cheerio');
url = 'http://www.ozbargain.com.au';
request(url, function(error, response, html) {
if (!error && response.statusCode == 200) {
var $ = cheerio.load(html);
if($("a:contains('Xbox')").length) {
//console.log(this);
var el = $(this);
var log = el.text();
console.log(log);
} else {
console.log('hey');
}
}
});
The html block I'm after. In particulare, I want the a tag;
<h2 class="title" id="title214252">Free on Xbox One, Xbox 360, PS3, PS4: Tales from the Borderlands (Episode 1)</h2>
The Cheerio syntax for contains is slightly different than jQuery. Ommit the single quotes around the string you're searching for an it should work:
$("a:contains(Xbox)")
Assigned the selector to a variable then called the text method.
request(url, function(error, response, html) {
if (!error && response.statusCode == 200) {
var $ = cheerio.load(html);
var $el = $("a:contains('Xbox')");
if ($el.length) {
console.log($el.text());
} else {
console.log('hey');
}
}
});