var request = require('request');
var cheerio = require('cheerio');

// Fetch the proxy-list page and log the class attribute of the loading row.
var listUrl = 'http://www.gatherproxy.com/proxylist/anonymity/?t=Elite';

request(listUrl, function (error, response, html) {
  if (error || response.statusCode != 200) {
    return; // nothing useful to parse
  }
  var $ = cheerio.load(html);
  var loadingRow = $('#tblproxy tbody tr.loading-row');
  console.log(loadingRow.attr('class'));
});
The webpage is at http://www.gatherproxy.com/zh/proxylist/anonymity/?t=Elite
I want to get this element and its selector is #tblproxy > tbody > tr.loading-row
I tried the same thing in the google console,
var s = $('#tblproxy > tbody > tr.loading-row')
undefined
s.attr('class')
"loading-row"
But it doesn't work in the context of cheerio: the program's output is undefined. Any idea why?
I noticed that the tbody element you're trying to query is loaded asynchronously, which is beyond the scope of what the request module can do. You can use phantomjs to simulate a web page in a headless browser and retrieve the HTML from a web page module. If you want to create more customized web page modules, refer to the phantomjs documentation.
Fork this GitHub repo demo.
First, create a webpage module to get the html of a specific page.
phantom/request.js
'use strict';

// PhantomJS web-page module: load the URL given as the first CLI argument
// and print the fully rendered document HTML to stdout.
var system = require('system');
var page = require('webpage').create();

page.open(system.args[1], function (status) {
  var html = page.evaluate(function () {
    return document.documentElement.innerHTML;
  });
  console.log(html);
  phantom.exit();
});
Second, create a phantomjs cli wrapper for all web page modules inside the phantom directory.
lib/phantom.js
'use strict';

var path = require('path');
var spawn = require('child_process').spawn;
var phantomjs = require('phantomjs');
var fs = require('fs');
var binPath = phantomjs.path;
var slice = Array.prototype.slice;

var phantomPath = path.join(
  __dirname,
  '..',
  'phantom'
);

/**
 * Run a phantomjs web-page module: phantom(scriptPath, arg1, ..., callback).
 * The callback receives (err, stdoutString) exactly once.
 */
exports = module.exports = function() {
  var args = slice.call(arguments);
  var callback = args.pop();
  var command = spawn(binPath, args);
  var stdout = '';
  var stderr = '';

  // stdout/stderr arrive in arbitrary-sized chunks. The original code
  // invoked the callback on every 'data' event, which could call it
  // several times with partial HTML; buffer instead and call back once
  // when the process has finished.
  command.stdout.on('data', function(data) {
    stdout += data.toString();
  });
  command.stderr.on('data', function(data) {
    stderr += data.toString();
  });
  command.on('close', function() {
    if (stderr.length > 0) {
      callback({ message: stderr }, null);
    } else {
      callback(null, stdout);
    }
  });
};

// Create one method per web page module in the ./phantom directory,
// e.g. phantom/request.js becomes exports.request(...).
// (The original used reduce() but never returned the accumulator, so the
// second file in the directory crashed with `context` undefined.)
fs.readdirSync(phantomPath).forEach(function(filename) {
  var index = path.basename(filename, '.js');
  exports[index] = function() {
    exports.apply(null, [path.join(phantomPath, filename)].concat(slice.call(arguments)));
  };
});
Lastly, use the lib/phantom.js script's request method to get the html page.
index.js
'use strict';

var phantom = require('./lib/phantom');
var cheerio = require('cheerio');

// The proxy table rows are injected by client-side JavaScript, so the page
// is rendered with PhantomJS first and only then parsed with cheerio.
var address = 'http://www.gatherproxy.com/proxylist/anonymity/?t=Elite';

phantom.request(address, function(err, html) {
  if (err) {
    console.log('error');
    return;
  }
  var $ = cheerio.load(html);
  var loadingRow = $('#tblproxy tbody tr.loading-row');
  console.log(loadingRow.attr('class'));
});
From the code source of the page, there is no tbody in #tblproxy, so remove it from the selector:
var temp = $('#tblproxy tr.loading-row');
Update
Following bublik42's comment, if a tbody appears randomly, you can use find():
var temp = $('#tblproxy').find('tr.loading-row');
Related
I need to extract links from the URL in a loop, so basically I need to execute the function again, but I don't know how to do this with Node.js.
var request = require('request');
var cheerio = require('cheerio');

var searchTerm = 'baloncesto';
var url = 'http://mismarcadores.com/' + searchTerm;

// Fetch the page and print an absolute URL for every anchor found on it.
request(url, function (err, resp, body) {
  $ = cheerio.load(body);
  links = $('a');
  $(links).each(function (i, link) {
    console.log(url + $(link).attr('href'));
  });
});
My question is how to extract the links from this array: the code works correctly (it prints the links to the console), but I also need to scrape each of those links.
The goal is to then scrape the URLs inside each one.
var request = require('request');
var cheerio = require('cheerio');

var searchTerm = 'baloncesto';
var url = 'http://mismarcadores.com/' + searchTerm;

// Fetch the page, collect an absolute URL for every anchor, then hand the
// whole array to useLinks().
request(url, function (err, resp, body) {
  var $ = cheerio.load(body);
  var allLinks = [];
  var links = $('a');
  links.each(function (i, link) {
    var currentLink = url + $(link).attr('href');
    console.log(currentLink);
    allLinks.push(currentLink);
  });
  // cheerio's .each() is synchronous, so the collection is complete here.
  // The original `if (i == links.length - 1)` trigger was fragile: it never
  // fired on a page with zero anchors.
  useLinks(allLinks);
});

// Process the collected links (e.g. scrape each one in turn).
function useLinks(allLinks) {
  console.log(allLinks);
}
If you're asking how to extract the url from the links received from cheerio you're already doing it. If you'd like to use them elsewhere after the request is finished (e.g. for scraping again), then store them in an array and call a function to use the array after you iterate through the last link.
It should look something like this:
// Collect every anchor's href attribute into a plain array.
let links = $('a').get().map(function (anchor) {
  return $(anchor).attr('href');
});
I'm sharing my solution; it's similar to the question but with a few changes.
I don't extract all the links, only the ones matching the search term I pass in the URL.
var express = require('express');
var fs = require('fs');
var request = require('request');
var cheerio = require('cheerio');
var app = express();

var searchTerm = 'baloncesto';
var url = 'http://mismarcadores.com/' + searchTerm;

// GET /webscrape - scrape the page, keep the unique links that mention the
// search term, write them to raaga_output.json, then respond.
app.get('/webscrape', function(req, res) {
  request(url, function(err, resp, body) {
    // Bail out explicitly instead of crashing on an undefined `body`.
    if (err) {
      res.status(500).send('Could not fetch ' + url);
      return;
    }
    var $ = cheerio.load(body);
    var matching = [];
    $('a').each(function(i, link) {
      var href = $(link).attr('href');
      // Guard: anchors without an href made the original `.includes` throw.
      if (href && href.includes("baloncesto")) {
        matching.push(href);
      }
    });
    // A Set removes duplicates; spread it back into an array for JSON.
    var uniqueLinks = [...new Set(matching)];
    fs.writeFile('raaga_output.json', JSON.stringify(uniqueLinks, null, 4), function(err) {
      // Respond only after the file has actually been written (the original
      // sent the response before the write finished and ignored errors).
      if (err) {
        res.status(500).send('Failed to write raaga_output.json');
        return;
      }
      console.log('File successfully written! - Check your project directory for the raaga_output.json file');
      res.send('File successfully written! - Check your project directory for the raaga_output.json file');
    });
  });
});

app.listen('3000');
console.log('Web Scrape happens on port 3000');

exports = module.exports = app;
Anyone can use this without any problem.
I am trying to use the image-size module to get the width of an image via URL. I followed their directions explicitly but can't find the "HTTP" module anywhere (which this seems to require).
I'm fairly new to node.js. Here is my code:
// NOTE(review): the root problem in this snippet is that core modules
// ('url', 'http') ship with Node itself and must be required by name,
// not via absolute paths into /usr/lib/node_modules (see answer below).
var url = require('/usr/lib/node_modules/url');
// var http = require('/usr/lib/node_modules/http'); *** Can't find this to download
var imgSize = require('/usr/lib/node_modules/image-size/');
var imgUrl = 'http://my.nintendo.com/static/images/common/ogp/my-nintendo.png';
var options = url.parse(imgUrl);
// `http` is never defined (its require is commented out above), so this
// call throws a ReferenceError before any request is made.
http.get(options, function (response) {
var chunks = [];
// Accumulate the raw response body chunks...
response.on('data', function (chunk) {
chunks.push(chunk);
}).on('end', function() {
// ...then concatenate them and let image-size read the dimensions.
var buffer = Buffer.concat(chunks);
console.log(imgSize(buffer));
});
});
Here is a link to the package on NPM: https://www.npmjs.com/package/image-size
You must require like this at the top of your file:
var url = require('url');
var http = require('http');
var sizeOf = require('image-size');
Do you have the module installed in your project's node_modules folder?
Ok, i am just starting to learn node.js and i am having a little difficulty getting a good grasp on the async nature of it and when/how to use callbacks to get data passed along like i need it.
The concept of what i am trying to build is this. I have a small node.js app that uses the FS package - defined as var fs = require("fs"). The app is responding to localhost:4000 right now. When i hit that url, the app will use fs.readdir() to get all of the virtual host files in the directory that i pass to readdir().
Next, the app loops through those files and parses each one line by line and word by word (quick and dirty for now). I am using fs.readFile() to read the file and then literally doing lines = data.toString().split("\n") and then var words = lines[l].split(" ") to get to the data in the file i need. Each virtual host file looks something like this:
<VirtualHost *:80>
ServerName some-site.local
DocumentRoot "/path/to/the/docroot"
<Directory "/path/to/the/docroot">
Options Indexes FollowSymLinks Includes ExecCGI
AllowOverride All
Require all granted
</Directory>
ErrorLog "logs/some-site.local-error_log"
</VirtualHost>
Here is my main.js file (routes file):
var express = require("express");
var router = express.Router();
var async = require("async");
var directoryReader = require(__dirname + "/../lib/directoryReader");
// Bug fix: the previous statement was terminated with ';', so this line
// had lost its `var` and `fileReader` leaked as an implicit global.
var fileReader = require(__dirname + "/../lib/fileReader");

/**
 * GET / - list every vhost file, parse each one concurrently, and respond
 * with the collected site records as JSON once all parses have finished.
 */
router.get("/", function(req, res) {
  var response = [];
  directoryReader.read(function(files) {
    async.each(files, function(file, callback) {
      fileReader.read(file, function(data) {
        if (data.length > 0) {
          response.push(data);
        }
        callback(); // tell async.each this file is done
      });
    }, function(err) {
      if (err) throw err;
      res.json(response);
    });
  });
});

module.exports = router;
My directoryReader.js file:
var fs = require("fs"),
fileReader = require(__dirname + "/../lib/fileReader");
var directoryReader = {
read: function(callback) {
var self = this;
fs.readdir("/etc/apache2/sites-available", function (err, files) {
if (err) throw err;
var output = [];
for(f in files) {
output.push(files[f]);
}
callback(output);
});
}
};
module.exports = directoryReader;
And my fileReader.js file:
var fs = require("fs");
// Parses Apache vhost files into {servername, documentroot, errorlog,
// gitbranch} records. NOTE(review): contains the question's placeholder
// `var branch = ...;` -- this snippet does not run as-is.
var fileReader = {
read: function(file, callback) {
fs.readFile("/etc/apache2/sites-available/" + file, { encoding: "utf8" }, function (err, data) {
if (err) throw err;
// Directive names captured below (matched case-insensitively).
var vHostStats = ["servername", "documentroot", "errorlog"],
lines = data.toString().split("\n"),
output = [];
// Quick-and-dirty scan: split each line on spaces and walk the words.
for(l in lines) {
var words = lines[l].split(" ");
// NOTE(review): `w` is a for..in string key; `w++` coerces it to a
// number, which happens to work here but is fragile.
for(w in words) {
// Opening <VirtualHost tag: start a fresh site record, skip the tag.
if (words[w].toLowerCase() == "<virtualhost") {
var site = {
"servername" : "",
"documentroot" : "",
"errorlog" : "",
"gitbranch" : ""
}
w++;
}
// Directive keyword: the following word is its value.
if (vHostStats.indexOf(words[w].toLowerCase()) !== -1) {
var key = words[w].toLowerCase();
w++;
site[key] = words[w];
}
// Closing tag: finish the record and attach the git branch.
if (words[w].toLowerCase() == "</virtualhost>" && site.documentroot != "") {
w++;
output.push(site);
var cmd = "cd " + site["documentroot"] + " && git rev-parse --abbrev-ref HEAD";
var branch = ...; // get Git branch based on the above command
site["gitbranch"] = branch;
}
}
}
callback(output);
});
}
};
module.exports = fileReader;
All of this code will spit out json. This all works fine, expect for one part. The line in the fileReader.js file:
var branch = ...; // get Git branch based on the above command
I am trying to get this code to run a shell command and get the Git branch based on the document root directory. I then want to take the branch returned and add the value to the gitbranch proptery of the current site object during the loop. Hope this makes sense. I know there are probably questions on SO that cover something similar to this and i have looked at many of them. I fear i am just not educated enough in node.js yet to apply the answers to those SO questions to my particular use case.
Please let me know if there's anything i can add that can help anyoe answer this question. I should note that this app is for personal uses only, so the solution really just has to work, not be super elegant.
UPDATE: (5/1/2015)
Probably not the best solution but i got what i wanted by using the new execSync added to v0.12.x
// Fragment of fileReader.read: on the closing </VirtualHost> tag, resolve
// the site's git branch synchronously. `sh` is execSync (added in Node
// v0.12), per the surrounding text.
if (words[w].toLowerCase() == "</virtualhost>" && site.documentroot != "") {
var cmd = "cd " + site["documentroot"] + " && git rev-parse --abbrev-ref HEAD";
// Run the shell command; trim() strips the trailing newline git prints.
var branch = sh(cmd, { encoding: "utf8" });
site["gitbranch"] = branch.toString().trim();
w++;
output.push(site);
}
I'm scraping the ghost blogging platform and the package I'm using to do so is request, but I'm not sure how to return a value of a nested request. I commented out the area that is causing the issue. Thanks!
var express = require('express');
var fs = require('fs');
var request = require('request');
var cheerio = require('cheerio');
var app = express();
// Scrapes the blog index, then each article page, and dumps the result to
// posts.json. NOTE(review): the nested request below is asynchronous, so
// the `return post` / `console.log(post)` lines run before (or without)
// the response -- that timing problem is the bug this question is about.
app.get('/', function(req, res){
var url = 'http://<subdomain>.ghost.io';
var articles = [];
request(url, function(error, response, html){
if(!error){
var $ = cheerio.load(html);
var post;
$('article').each(function(index) {
var self = $(this);
// Metadata available directly on the index page.
var article = {
header : self.find('h2.post-title').text(),
route: url + self.find('h2.post-title a').attr('href'),
content : '',
author: self.find('footer a').text(),
timestamp : self.find('time.post-date').text()
};
// Asynchronous: this callback fires long after the each() loop and
// the writeFile below have already run.
request(article.route, function(error, response, html) {
$ = cheerio.load(html);
post = $('section.post-content').text();
return post; //*** this is what I can't return ***//
//*** I'd like it to be the value of article.content ***//
});
console.log(post); //*** undefined ***//
articles.push(article);
});
// Runs before any article.route request has completed, so `content`
// is still empty in every record written to disk.
fs.writeFile('posts.json', JSON.stringify(articles, null, 4), function(err){
console.log('Posts created.');
});
}
});
})
app.listen('8000');
console.log('Watching for changes.');
exports = module.exports = app;
So your problem boils down to having a list of URLs, and wanting to (asynchronously, because node.js) request all of them, and then know when they've all finished and then do something with the collected results.
The async module (npm install async) will let you do that like this:
var request = require('request');
var async = require('async');

// Request every URL in parallel; the callback receives the responses in
// the same order as `urls` once all requests have completed.
var urls = ["http://google.com", "http://yahoo.com"];

async.map(urls, request, function (err, responses) {
  // First 100 characters of http://google.com
  console.log(responses[0].body.substr(0, 100));
  // First 100 characters of http://yahoo.com
  console.log(responses[1].body.substr(0, 100));
});
So you can apply this to your problem by doing the following:
Synchronously create the entire articles list.
Use async.map on the list.
In the callback to async.map, you have a list of all the responses; you can process them synchronously.
Essentially what I need to do is to take a local grader.js file and then use it at the command line to input HTML, which will then output JSON data to the console to validate the existence of several HTML elements. The usage looks something like this:
./grader.js --checks checks.json --file index.html
./grader.js --checks checks.json --url http://google.com
The Node modules being used are Commander (for working at the command line), Cheerio (for HTML), and Restler (for getting HTML from URL).
The checks.json file is straightforward in that it's simply asking to check for the existence of a few simple HTML elements to find out whether or not they exist on the page:
["h1",
".navigation",
".logo",
".blank",
".about",
".heading",
".subheading",
".pitch",
".video",
".thermometer",
".order",
".social",
".section1",
".section2",
".faq",
".footer"]
The grader.js file is where things get a little more complicated. The following code actually works insofar as it takes the command line arguments and does indicate a true or false value as to whether the HTML elements exist. But it doesn't work properly after adding the URL check at the bottom. There is something wrong with my checkURL function and the way that I implement it using the Commander code at the bottom. Even though the true and false values are correct dependent upon the HTML file/URL I use, I end up spitting out both checks to the console even if I only want to check either the file or the URL, not both. I'm fairly new to this so I'm surprised that it works at all. It may have something to do with the default values, but when I try to make those changes the checkURL function seems to break down. Thanks in advance for your help I really do appreciate it.
#!/usr/bin/env node
var fs = require('fs');
var program = require('commander');
var cheerio = require('cheerio');
var rest = require('restler');
var HTMLFILE_DEFAULT = "index.html";
var CHECKSFILE_DEFAULT = "checks.json";
var URL_DEFAULT = "http://cryptic-spire-7925.herokuapp.com/index.html";
// Commander option callback: pass the path through when it exists on disk,
// otherwise report the problem and terminate the process.
var assertFileExists = function(infile) {
    var instr = infile.toString();
    if (fs.existsSync(instr)) {
        return instr;
    }
    console.log("%s does not exist. Exiting.", instr);
    process.exit(1); // http://nodejs.org/api/process.html#process_process_exit_code
};
// Load an HTML file from disk and return a cheerio handle onto it.
var cheerioHtmlFile = function(htmlfile) {
    var markup = fs.readFileSync(htmlfile);
    return cheerio.load(markup);
};
// Read the checks file (a JSON array of CSS selectors) and parse it.
var loadChecks = function(checksfile) {
    var raw = fs.readFileSync(checksfile);
    return JSON.parse(raw);
};
// Run every selector from checksfile against the HTML file and map each
// selector to whether it matched at least one element.
var checkHtmlFile = function(htmlfile, checksfile) {
    $ = cheerioHtmlFile(htmlfile);
    var out = {};
    loadChecks(checksfile).sort().forEach(function(selector) {
        out[selector] = $(selector).length > 0;
    });
    return out;
};
// Fetch the URL and run the same selector checks as checkHtmlFile, logging
// the result object once the response arrives (nothing is returned).
var checkUrl = function(url, checksfile) {
    rest.get(url).on('complete', function(data) {
        $ = cheerio.load(data);
        var out = {};
        loadChecks(checksfile).sort().forEach(function(selector) {
            out[selector] = $(selector).length > 0;
        });
        console.log(out);
    });
}
// Workaround for commander.js issue.
// http://stackoverflow.com/a/6772648
// Hand commander a bound copy so the original function object is never
// mutated by the option parser.
var clone = function(fn) {
    var copy = fn.bind({});
    return copy;
};
if(require.main == module) {
    program
        .option('-f, --file <html_file>', 'Path to index.html', clone(assertFileExists), HTMLFILE_DEFAULT)
        // No default for --url: a default made program.url always truthy,
        // so the URL branch would always win over --file.
        .option('-u, --url <url>', 'URL to index.html')
        .option('-c, --checks <check_file>', 'Path to checks.json', clone(assertFileExists), CHECKSFILE_DEFAULT)
        .parse(process.argv);
    // checkUrl is asynchronous and logs its own result; checkHtmlFile is
    // synchronous and returns the result object. Run exactly one of them.
    // (Previously both always ran, and the URL result printed "undefined"
    // because checkUrl returns nothing.)
    if (program.url) {
        checkUrl(program.url, program.checks);
    } else {
        var checkJson = checkHtmlFile(program.file, program.checks);
        var outJson = JSON.stringify(checkJson, null, 4);
        console.log(outJson);
    }
}
else {
    exports.checkHtmlFile = checkHtmlFile;
}
Depending on the arguments call either one of checkHtmlFile() or checkUrl()
Something like:
if (program.url)
checkUrl(program.url, program.checks);
else checkHtmlFile(program.file, program.checks);
Read this for more references: commander.js option parsing
Also, checkJson2 is undefined as checkUrl() isn't returning anything.
Those commander .option lines look wrong to me.
Delete the clone function and revise your option lines as follows:
.option('-f, --file <html_file>', 'Path to index.html', HTMLFILE_DEFAULT)
.option('-u, --url <url>', 'URL to index.html', URL_DEFAULT)
.option('-c, --checks <check_file>', 'Path to checks.json', CHECKSFILE_DEFAULT)
This should solve your commander problem.
Here is the updated checkUrl function after the helpful hints from #David and #ankitsabharwal.
// Asynchronously fetch `url`, evaluate every selector from checksfile
// against the response, and pretty-print the result map as JSON.
var checkUrl = function(url, checksfile) {
    rest.get(url).on('complete', function(data) {
        $ = cheerio.load(data);
        var out = {};
        loadChecks(checksfile).sort().forEach(function(selector) {
            out[selector] = $(selector).length > 0;
        });
        var outJson = JSON.stringify(out, null, 4);
        console.log(outJson);
    });
}
And here is the updated Commander code below:
if(require.main == module) {
    program
        .option('-f, --file <html_file>', 'Path to index.html')
        .option('-u, --url <url>', 'URL to index.html')
        .option('-c, --checks <check_file>', 'Path to checks.json')
        .parse(process.argv);
    if (program.url) {
        // Asynchronous path: checkUrl logs its own output.
        checkUrl(program.url, program.checks);
    } else {
        // Bug fix: checkHtmlFile was previously called twice here (the
        // first call's result was discarded), parsing the HTML and the
        // checks file twice for nothing.
        var checkJson = checkHtmlFile(program.file, program.checks);
        var outJson = JSON.stringify(checkJson, null, 4);
        console.log(outJson);
    }
}
}