I'm trying to get the title tag of a url with cheerio. But, I'm getting empty string values. This is my code:
// Route handler for GET /scrape: downloads a fixed page, tries to extract
// title/release/rating from it with cheerio, writes the result to output.json
// and answers the browser with a short text message.
app.get('/scrape', function(req, res){
// NOTE(review): no var/let/const here, so `url` becomes an implicit global.
url = 'http://nrabinowitz.github.io/pjscrape/';
request(url, function(error, response, html){
if(!error){
var $ = cheerio.load(html);
var title, release, rating;
// `var` is function-scoped, so `json` is still visible to the writeFile call
// after this `if`, but it is left undefined whenever the request fails.
var json = { title : "", release : "", rating : ""};
// NOTE(review): the callback reads text from *child elements* of <title>.
// A <title> normally contains only text, so children() is presumably empty
// and both values end up as '' - likely the empty strings being asked about.
$('title').filter(function(){
//var data = $(this);
var data = $(this);
title = data.children().first().text();
release = data.children().last().children().text();
json.title = title;
json.release = release;
})
// Copies the text of every element with class "star-box-giga-star" into json.rating.
$('.star-box-giga-star').filter(function(){
var data = $(this);
rating = data.text();
json.rating = rating;
})
}
// Runs whether or not the request succeeded. `err` is never inspected, so the
// success message below is logged even if the write failed.
fs.writeFile('output.json', JSON.stringify(json, null, 4), function(err){
console.log('File successfully written! - Check your project directory for the output.json file');
})
// Finally, we'll just send out a message to the browser reminding you that this app does not have a UI.
res.send('Check your console!')
})
});
// Fetch the page at `url` and print the text of its <title> element.
request(url, function (error, response, body)
{
    // Report transport failures instead of silently doing nothing.
    if (error)
    {
        console.error('Request failed:', error);
        return;
    }
    // Anything other than 200 is not expected to contain the page we want.
    if (response.statusCode !== 200)
    {
        console.error('Unexpected status code:', response.statusCode);
        return;
    }
    var $ = cheerio.load(body);
    // Read the text straight from the <title> selection; it has no child
    // elements to descend into.
    var title = $("title").text();
    console.log(title);
});
Using Javascript we extract the text contained within the "title" tags.
If Robert Ryan's solution still doesn't work, I'd be suspicious of the formatting of the original page, which may be malformed somehow.
In my case I was accepting gzip and other compression but never decoding, so Cheerio was trying to parse compressed binary bits. When console logging the original body, I was able to spot the binary text instead of plain text HTML.
Related
I'm trying to create an application for the Google Assistant, and the data for my application is stored in an online XML file. However, I am not sure how I am supposed to extract the specific data that I require from the XML.
I have tried to fix this by indexing into the result of the XML parser; however, I receive either "undefined" or "cannot read property" errors.
var eyes = require('eyes');
var https = require('https');
var fs = require('fs');
var xml2js = require('xml2js');
// Downloads the EuroMillions draw-history XML over HTTPS, parses it with
// xml2js and logs one value picked out of the parsed result.
// attrkey is an xml2js parser option; NOTE(review): it presumably renames the
// property under which element attributes are stored to "ball" - confirm
// against the xml2js documentation.
var parser = new xml2js.Parser({ attrkey: "ball"});
parser.on('error', function(err) { console.log('Parser error', err); });
// Accumulates the response body as text across 'data' events.
var data = '';
https.get('https://www.national-lottery.co.uk/results/euromillions/draw-history-full/xml', function(res) {
// Only statuses in [200, 400) are read; any other status is ignored without a message.
if (res.statusCode >= 200 && res.statusCode < 400) {
res.on('data', function(data_) { data += data_.toString(); });
res.on('end', function() {
console.log('data', data);
parser.parseString(data, function(err, result) {
// NOTE(review): `err` is not checked before `result` is indexed, and `toUse`
// is assigned without a declaration (implicit global). The path reported as
// working in this thread indexes an array at each level
// (result['draw-results'].game[0].balls[0].ball[0]['_']), so indexing 'game'
// and 'balls' as plain objects here presumably yields undefined.
toUse = result['draw-results']['game']['balls']['ball'][1];
console.log(toUse);
console.log('FINISHED', err, result);
});
});
}
});
I expect to receive an output of the first ball number called, however, I cannot get the data out other than printing the entire XML.
I get the output 4 for the path result['draw-results'].game[0].balls[0].ball[0]['_'].
The user will enter a URL in an input field on the HTML page. The JS program needs to read that URL and then fetch the data from that web page.
This is what I have done so far.
var request = require('request');
var cheerio = require('cheerio');
var fs = require("fs");
// Requests `url`, collects the non-empty link texts found next to ".clearfix"
// elements, writes them as JSON to output1.xls and logs them.
// Placeholder: the URL is left empty here; filling it from user input is what
// the question asks about.
var url = ""
request(url, function(err, response, html){
// A failed request is skipped without any message.
if(!err) {
var $ =cheerio.load(html);
// All children of the parent of the ".clearfix" selection; only used to
// drive the index passed to the callback below.
var allItems = $('.clearfix').parent().children();
var items = [];
allItems.each(function(index) {
// For the index-th ".clearfix" element: take its parent's second child and
// read the combined text of the links inside it.
var result = $('.clearfix').eq(index).parent().children().eq(1).find("a").text();
if(result !== ""){
items.push(result);
}
});
// NOTE(review): no completion callback is passed (only a trailing comma);
// recent Node versions presumably reject this call - confirm. The content
// written is JSON text even though the file name ends in .xls.
fs.writeFile("output1.xls",JSON.stringify(items, null, 1),)
console.log(items);
}
});
Is this the solution to your problem?
// Read the URL the user typed into the input element whose id is "myURL".
var { value: url } = document.getElementById('myURL');
I used the get method of the request module to fetch the content of an external site. If the external site is encoded in UTF-8 everything is fine, but pages in other encodings, such as Shift-JIS, are displayed incorrectly.
/**
 * Fetches `url` through mod_request and relays the fetched body to the client.
 *
 * @param {object} request  Incoming request; not used by this function.
 * @param {object} response Outgoing response; receives the body via write()
 *                          and is always finished with end().
 * @param {string} url      Address of the external page to fetch.
 */
function getExternalUrl(request, response, url){
    var relayResult = function (err, res, body) {
        if (!err) {
            // Forward the fetched body unchanged.
            response.write(res.body);
        } else {
            console.log("\terr=" + err);
        }
        // The response is closed on success and on failure alike.
        response.end();
    };
    // Alternative that was tried: mod_request.get({uri: url, encoding: 'binary'}, ...)
    mod_request.get(url, relayResult);
}
How can I get content of external site with correct encoding?
I found a way to do it:
Get with binary encoding
var mod_request = require('request');
mod_request.get({ uri: url, encoding: 'binary', headers: headers }, function(err, res, body) {});
Create a Buffer with binary format
// Buffer.from() replaces the deprecated `new Buffer(string, encoding)` constructor.
// The 'binary' encoding maps each character of the body back to a single byte.
var contentBuffer = Buffer.from(res.body, 'binary');
Get real encoding of page by detect-character-encoding npm
var mod_detect_character_encoding = require('detect-character-encoding');
var charsetMatch = mod_detect_character_encoding(contentBuffer);
Convert page to utf-8 by iconv npm
var mod_iconv = require('iconv').Iconv;
var iconv = new mod_iconv(charsetMatch.encoding, 'utf-8');
var result = iconv.convert(contentBuffer).toString();
P.S.: This approach only applies to text files (HTML, CSS, JS). Do not apply it to images or other non-text files.
I am building a content scraper for a tshirt website.
The goal is to enter a website through only one hardcoded url: http://shirts4mike.com
I will then find all the product pages for each tshirt, and then create an object with its details. Then add it to an array.
When the array is full of the tshirts, I'll work through the array and log it into a CSV file.
Right now, I am having some trouble with the timing of the requests/responses and the function calls.
How can I make sure that I call the NEXT function at the right time? I understand that it's not working because of its async nature.
How can I call secondScrape, lastScraper and convertJson2Csv at the right time so that the variables they're working with are not undefined?
I tried to use something such as response.end() but this is not working.
I'm assuming I NEED to use promises to make this work properly? and to be legible?
Any ideas? My code is below:
//Modules being used:
var cheerio = require('cheerio');
var request = require('request');
var moment = require('moment');
//hardcoded url
var url = 'http://shirts4mike.com/';
//url for tshirt pages
var urlSet = new Set();
var remainder;
var tshirtArray;
// Load front page of shirts4mike
// Stage 1: fetch the front page, follow every link whose href contains
// "shirt", and sort each linked page into urlSet (product page) or
// remainder (the first non-product page seen).
request(url, function(error, response, html) {
if(!error && response.statusCode == 200){
var $ = cheerio.load(html);
//iterate over links with 'shirt'
$("a[href*=shirt]").each(function(){
var a = $(this).attr('href');
//create new link by appending the href to the base url
var scrapeLink = url + a;
//for each new link, go in and find out if there is a submit button.
//If there, add it to the set
request(scrapeLink, function(error,response, html){
if(!error && response.statusCode == 200) {
var $ = cheerio.load(html);
//if page has a submit it must be a product page
if($('[type=submit]').length !== 0){
//add page to set
urlSet.add(scrapeLink);
} else if(remainder === undefined) {
//if not a product page, keep it as remainder so that another scrape can be performed.
remainder = scrapeLink;
}
}
});
});
}
//call second scrape for remainder
// NOTE(review): this call is made as soon as the nested requests above have
// been started, not when they have completed, so `remainder` has presumably
// not been assigned yet when secondScrape() reads it.
secondScrape();
});
// Stage 2: fetch the page stored in `remainder`, follow its "shirt" links and
// add every linked page that has a submit button to urlSet. Takes no
// arguments and returns nothing; it works on the shared variables only.
function secondScrape() {
request(remainder, function(error, response, html) {
if(!error && response.statusCode == 200){
var $ = cheerio.load(html);
$("a[href*=shirt]").each(function(){
var a = $(this).attr('href');
//create new link by appending the href to the base url
var scrapeLink = url + a;
request(scrapeLink, function(error,response, html){
if(!error && response.statusCode == 200){
var $ = cheerio.load(html);
//collect remaining product pages and add to set
if($('[type=submit]').length !== 0){
urlSet.add(scrapeLink);
}
}
});
});
}
});
// NOTE(review): the two statements below sit outside the request callback,
// so they execute right after the request has been issued; urlSet has
// presumably not received this stage's pages at that point.
console.log(urlSet);
//call lastScraper so we can grab data from the set (product pages)
lastScraper();
};
// Stage 3: request every collected product page, build one object per shirt
// (price, image, title, url, timestamp), push it onto tshirtArray and then
// hand over to convertJson2Csv(). Takes no arguments and returns nothing.
function lastScraper(){
//scrape set, product pages
// NOTE(review): urlSet is created with `new Set()`. A Set exposes `size`
// rather than `length` and cannot be read by numeric index, so this loop
// presumably never executes its body.
for(var i = 0; i < urlSet.length; i++){
// `var` is function-scoped: this shadows the outer `url`, and all callbacks
// created in the loop share the same variable.
var url = urlSet[i];
request(url, function(error, response, html){
if(!error && response.statusCode == 200){
var $ = cheerio.load(html);
//grab data and store as variables
var price = $('.price').text();
var img = $('.shirt-picture').find("img").attr("src");
// slice(4) drops the first four characters of the heading text.
var title = $('body').find(".shirt-details > h1").text().slice(4);
var tshirtObject = {};
//add values into tshirt object
tshirtObject.price = price;
tshirtObject.img = img;
tshirtObject.title = title;
tshirtObject.url = url;
tshirtObject.date = moment().format('MMMM Do YYYY, h:mm:ss a');
//add the object into the array of tshirts
// NOTE(review): tshirtArray is declared earlier in this snippet without an
// initial value, so push() would throw unless it is assigned elsewhere.
tshirtArray.push(tshirtObject);
}
});
}
//call function to iterate through tshirt objects in array in order to convert to JSON, then into CSV to be logged
// NOTE(review): called right after the loop, i.e. without waiting for any of
// the request callbacks above.
convertJson2Csv();
};
There is a npm module called request-promise.
simply:
var rp = require("request-promise");
and anywhere you are making a request you can switch with request-promise.
for instance:
// Request the page; a fulfilled value reaches the first handler, a failure
// is logged by the second.
rp(url)
    .then((value) => {
        // do whatever
    })
    .catch((err) => {
        console.log(err);
    });
You can use this example to convert the rest of your code sample.
// Wrap the callback-style request in a Promise that fulfils with the page HTML.
const promise = new Promise((resolve, reject) => {
    request("http://shirts4mike.com/", (err, response, html) => {
        // On a transport failure `response` is undefined, so `err` must be
        // checked before the status code is read.
        if (err) {
            reject(err);
            return;
        }
        if (response.statusCode !== 200) {
            // `err` is null in this branch; reject with a real Error so the
            // reason is not lost.
            reject(new Error("Unexpected status code: " + response.statusCode));
            return;
        }
        resolve(html);
    });
});
promise.then(html => {
    var $ = cheerio.load(html);
    // continue
}).catch(err => {
    // Keep failures visible instead of leaving the rejection unhandled.
    console.log(err);
});
You can use waterfall method of async module which can give you a smooth way to resolve this issue.
I have tried rewriting your code with this module.
I hope this works for you.
Format of waterfall
// Shape of an async.waterfall call: each task passes its result to the next
// task through `callback`; the last function receives the final outcome.
async.waterfall(
    [
        (callback) => {
            callback(null, previousvalue);
        },
        (previousvalue, callback) => {},
    ],
    (err, result) => {
        // Final callback
    }
);
var async = require('async');
var cheerio = require('cheerio');
var request = require('request');
var moment = require('moment');
//hardcoded url
var url = 'http://shirts4mike.com/';
//url for tshirt pages
var urlSet = new Set();
var remainder;
var tshirtArray = [];
/**
 * Fetches `pageUrl` and collects the links on it whose href contains "shirt".
 *
 * @param {string} pageUrl Page to download.
 * @param {function(?Error, string[]=)} done Called once, with an error or with
 *        the link targets (base `url` + href) found on the page.
 */
function fetchShirtLinks(pageUrl, done) {
    request(pageUrl, function(error, response, html) {
        if (error) {
            return done(error);
        }
        if (response.statusCode !== 200) {
            return done(new Error('Unexpected status ' + response.statusCode + ' for ' + pageUrl));
        }
        var $ = cheerio.load(html);
        var links = $("a[href*=shirt]").map(function() {
            //create new link
            return url + $(this).attr('href');
        }).get();
        done(null, links);
    });
}

/**
 * Requests every link and sorts the pages that loaded: a page with a submit
 * button is a product page and goes into urlSet, any other page is reported
 * through `onOtherPage`. A link that fails to load is skipped (best effort).
 *
 * @param {string[]} links Pages to inspect.
 * @param {function(string)} onOtherPage Receives each non-product page URL.
 * @param {function(?Error=)} done Called exactly once, after all requests
 *        have finished.
 */
function sortLinks(links, onOtherPage, done) {
    async.each(links, function(link, next) {
        request(link, function(error, response, html) {
            if (!error && response.statusCode === 200) {
                var $ = cheerio.load(html);
                //if page has a submit it must be a product page
                if ($('[type=submit]').length !== 0) {
                    urlSet.add(link);
                } else {
                    onOtherPage(link);
                }
            }
            next();
        });
    }, done);
}

// Each task calls its callback exactly once, and only after all of the
// requests it started have completed, so the next stage sees complete data.
async.waterfall([
    function(callback) {
        // Load front page of shirts4mike
        fetchShirtLinks(url, function(error, links) {
            if (error) {
                return callback(error);
            }
            sortLinks(links, function(link) {
                //if not a product page, keep the first one as remainder so that another scrape can be performed.
                if (remainder === undefined) {
                    remainder = link;
                }
            }, callback);
        });
    },
    function(callback) {
        // Nothing left to drill into when every link was a product page.
        if (remainder === undefined) {
            return callback(null);
        }
        fetchShirtLinks(remainder, function(error, links) {
            if (error) {
                return callback(error);
            }
            //collect remaining product pages and add to set
            sortLinks(links, function() {}, callback);
        });
    },
    function(callback) {
        console.log(urlSet);
        //scrape set, product pages
        // A Set has no length or numeric index, so iterate over a copy as an array.
        async.each(Array.from(urlSet), function(productUrl, next) {
            request(productUrl, function(error, response, html) {
                if (!error && response.statusCode === 200) {
                    var $ = cheerio.load(html);
                    //grab data and add the object into the array of tshirts
                    tshirtArray.push({
                        price: $('.price').text(),
                        img: $('.shirt-picture').find("img").attr("src"),
                        // slice(4) drops the first four characters of the heading text.
                        title: $('body').find(".shirt-details > h1").text().slice(4),
                        url: productUrl,
                        date: moment().format('MMMM Do YYYY, h:mm:ss a')
                    });
                }
                next();
            });
        }, callback);
    }
], function(err) {
    if (err) {
        console.log(err);
        return;
    }
    //call function to iterate through tshirt objects in array in order to convert to JSON, then into CSV to be logged
    convertJson2Csv();
});
You correctly identify promises as a way ahead to solving your timing issues.
In order to have promises available, you need to promisify request (or adopt a HTTP lib, whose methods return promises).
You could just fix the timing issues with promises, but you could also take the opportunity to improve the overall paradigm. Instead of discrete functions for virtually identical first/second/third stages, you can write a single function that calls itself recursively. Written correctly, this will ensure that each page in the target site is visited a maximum of once; revisits should be avoided on grounds of overall performance, and loading of the target server.
//Modules being used:
var Promise = require('path/to/bluebird');
var cheerio = require('cheerio');
var moment = require('moment');
// Promisify `request` to make `request.getAsync()` available.
// Ref: http://stackoverflow.com/questions/28308131/how-do-you-properly-promisify-request
var request = Promise.promisify(require('request'));
Promise.promisifyAll(request);
//hardcoded url
var url = 'http://shirts4mike.com/';
var urlSet = new Set();
var tshirtArray = [];
var maxLevels = 3; // limit the recursion to this number of levels.
/**
 * Scrapes `url_` and, recursively, every "shirt" link reachable from it.
 * Product pages (pages with a submit button) are appended to tshirtArray.
 *
 * @param {string} url_ Absolute URL of the page to scrape.
 * @param {number} levelCounter Depth of this page; recursion stops at maxLevels.
 * @returns {Promise} Settles when this page and all pages below it have been
 *          handled. Failures are logged and swallowed, so it never rejects.
 */
function scrapePage(url_, levelCounter) {
    // Bale out if :
    //   a) the target url_ has been visited already,
    //   b) maxLevels has been reached.
    if(urlSet.has(url_) || levelCounter >= maxLevels) {
        return Promise.resolve();
    }
    urlSet.add(url_);
    return request.getAsync(url_).then(function(result) {
        // A promise fulfils with one value only. Depending on the Bluebird
        // version that value is the response itself or [response, body], so
        // the HTML is read from the response object in both cases.
        var response = Array.isArray(result) ? result[0] : result;
        var $;
        if(response.statusCode !== 200) {
            throw new Error('statusCode was not 200'); // will be caught below
        }
        $ = cheerio.load(response.body);
        if($('[type=submit]').length > 0) {
            // yay, it's a product page.
            tshirtArray.push({
                price: $('.price').text(),
                img: $('.shirt-picture').find("img").attr("src"),
                title: $('body').find(".shirt-details > h1").text().slice(4),
                url: url_,
                date: moment().format('MMMM Do YYYY, h:mm:ss a')
            });
        }
        // find any shirt links on page represented by $, visit each link in turn, and scrape.
        // cheerio's map() passes (index, element). The href attribute is
        // usually relative, so resolve it against the current page.
        return Promise.all($("a[href*=shirt]").map(function(index, anchor) {
            var target = new URL($(anchor).attr('href'), url_).href;
            return scrapePage(target, levelCounter + 1);
        }).get());
    }).catch(function(e) {
        // ensure "success" even if scraping threw an error.
        console.log(e);
        return null;
    });
}
scrapePage(url, 0).then(convertJson2Csv);
As you can see, a recursive solution :
avoids repetition of code,
will drill down as many levels as you wish - determined by the variable maxLevels.
Note: This is still not a good solution. There's an implicit assumption here, as in the original code, that all shirt pages are reachable from the site's home page, via "shirt" links alone. If shirts were reachable via eg "clothing" > "shirts", then the code above won't find any shirts.
Using Node v0.2.0 I am trying to fetch an image from a server, convert it into a base64 string and then embed it on the page in an image tag. I have the following code:
var express = require('express'),
request = require('request'),
sys = require('sys');
// Creates an Express server with request logging and body decoding, and
// serves GET /: when a "url" parameter is present, the target is downloaded
// and returned as an <img> tag whose src is a base64 data URI.
var app = express.createServer(
express.logger(),
express.bodyDecoder()
);
app.get('/', function(req, res){
// Nothing is sent back when the "url" parameter is missing.
if(req.param("url")) {
// NOTE(review): the target address comes straight from the client.
var url = unescape(req.param("url"));
request({uri:url}, function (error, response, body) {
// Nothing is sent back either when the request fails or is not a 200.
if (!error && response.statusCode == 200) {
var data_uri_prefix = "data:" + response.headers["content-type"] + ";base64,";
// NOTE(review): `body` is passed to Buffer without an encoding argument.
// Per the discussion in this thread the body has already been decoded into
// a string at this point, which presumably corrupts the binary image data
// before it is base64-encoded.
var buf = new Buffer(body);
var image = buf.toString('base64');
image = data_uri_prefix + image;
res.send('<img src="'+image+'"/>');
}
});
}
});
app.listen(3000);
Note: This code requires "express" and "request". And of course, node. If you have npm installed, it should be as simple as "npm install express" or "npm install request".
Unfortunately, this doesn't work as expected. If I do the conversion with the Google logo, then I get the following at the beginning of the string:
77+9UE5HDQoaCgAAAA1JSERSAAABEwAAAF8IAwAAAO+/ve+/ve+/vSkAAAMAUExURQBzCw5xGiNmK0t+U++/vQUf77+9BiHvv70WKO+/vQkk77+9D
However if I use an online Base64 encoder with the same image, then it works perfectly. The string starts like this:
iVBORw0KGgoAAAANSUhEUgAAARMAAABfCAMAAAD8mtMpAAADAFBMVEUAcwsOcRojZitLflOWBR+aBiGQFiipCSS8DCm1Cya1FiyNKzexKTjDDSrLDS
Where am I going wrong that this isn't working correctly? I have tried so many different js base64 implementations and they all don't work in the same way. The only thing I can think of is that I am trying to convert the wrong thing into base64, but what should I convert if that is the case?
The problem is encoding and storing binary data in javascript strings. There's a pretty good section on this under Buffers at http://nodejs.org/api.html.
Unfortunately, the easiest way to fix this involved changing the request npm. I had to add response.setEncoding('binary'); on line 66 just below var buffer; in /path/to/lib/node/.npm/request/active/package/lib/main.js. This will work fine for this request but not others. You might want to hack it so that this is only set based on some other passed option.
I then changed var buf = new Buffer(body) to var buf = new Buffer(body, 'binary');. After this, everything worked fine.
Another way to do this, if you really didn't want to touch the request npm, would be to pass in an object that implements Writable Stream in the responseBodyStream argument to request. This object would then store the streamed data from the response in its own buffer. Maybe there is a library that does this already... I'm not sure.
I'm going to leave it here for now, but feel free to comment if you want me to clarify anything.
EDIT
Check out comments. New solution at http://gist.github.com/583836
The following code (available at https://gist.github.com/804225)
var http = require('http'),
    sURL = 'http://nodejs.org/logo.png';

// Download sURL and print it as a base64 data URI.
// http.get() sends the GET request and ends it for us; it replaces the
// removed http.createClient()/client.request() API.
http.get(sURL, function (response)
{
    var type = response.headers["content-type"],
        prefix = "data:" + type + ";base64,",
        chunks = [];
    if (response.statusCode !== 200) {
        console.error('Unexpected status code: ' + response.statusCode);
        // Discard the body so the connection is released.
        response.resume();
        return;
    }
    // No setEncoding() call: chunks arrive as Buffers, so the binary data is
    // never converted to a string on the way.
    response.on('data', function (chunk) {
        chunks.push(chunk);
    });
    response.on('end', function () {
        var base64 = Buffer.concat(chunks).toString('base64'),
            data = prefix + base64;
        console.log(data);
    });
}).on('error', function (err) {
    console.error('Request failed: ' + err.message);
});
should also produce a data URI without requiring any external modules.
This works for me using request:
// Download the image as a raw Buffer (encoding: null) and build a data URI
// from its content type and base64-encoded bytes.
const url = 'http://host/image.png';
request.get({ url, encoding: null }, (err, res, body) => {
    if (err) {
        return;
    }
    const { 'content-type': type } = res.headers;
    const dataUri = `data:${type};base64,${body.toString('base64')}`;
});
No need for any intermediate buffers. The key is to set encoding to null.