Having trouble looping through DOM elements with node web scraper - javascript

I was able to get the scraper to do what I want it to do, but I'm having a lot of issues actually getting it to loop through the pages I want. I think my issue may be with the placement of my for loop and how it's executed.
var express = require('express');
var fs = require('fs');
var request = require('request');
var cheerio = require('cheerio');
var app = express();
//set object to be populated by scraped DOM elements
var author, quote;
var json = { author : [], quote : []};
//Initialize the scraper on the /scrape route
app.get('/scrape', function(req, res){
//set the scraper url
This is the problem area here: how do I set this up so it doesn't just set and loop the last page, but all 100 pages?
for(var i = 1; i < 101; i++){
url = 'http://www.goodreads.com/quotes?page=' + i;
}
//
request(url, function(error, response, html){
if(!error){
//use cheerio to use jquery to select DOM elements
var $ = cheerio.load(html);
//select DOM elements using jquery selectors
$('.quoteText > a').filter(function(){
var data = $(this);
author = data.text();
json.author.push(author);
// all.push(data.text());
})
//select DOM elements using jquery selectors
$('.quoteText').filter(function(){
var data = $(this);
quote = data.text();
json.quote.push(quote);
})
}
//loop through json object to clean up strings
for(var i = 0; i < json.quote.length; i++) {
//find the index of where the quote ends
var endQuote = json.quote[i].indexOf("―");
//select only the part of the string that contains a quote
json.quote[i] = json.quote[i].substring(0, endQuote - 1);
//remove line breaks from the string
json.quote[i] = json.quote[i].replace(/(\r\n|\n|\r)/gm,"");
}
//write the json file to folder
fs.writeFile('output.json', JSON.stringify(json, null, 4), function(err){
console.log('File successfully written! - Check your project directory for the output.json file');
})
res.send('Check your console!')
})
})
app.listen('8081')
console.log('Magic happens on port 8081');
exports = module.exports = app;
****edit****
Changed the code around to run res.send('Check your console!') at the end of the function call, since the app will throw an error if res is called more than once. Also included changes based on the accepted answer.
var express = require('express');
var fs = require('fs');
var request = require('request');
var cheerio = require('cheerio');
var app = express();
//set object to be populated by scraped DOM elements
var author, quote;
var json = { author : [], quote : []};
var url = []
//Initialize the scraper on the /scrape route
app.get('/scrape', function(req, res){
//set the scraper url
for(var i = 1; i < 101; i++){
url.push('http://www.goodreads.com/quotes?page=' + i);
}
for(i in url){
request(url[i], function(error, response, html){
if(!error){
//use cheerio to use jquery to select DOM elements
var $ = cheerio.load(html);
//select DOM elements using jquery selectors
$('.quoteText > a').filter(function(){
var data = $(this);
author = data.text();
json.author.push(author);
// all.push(data.text());
})
//select DOM elements using jquery selectors
$('.quoteText').filter(function(){
var data = $(this);
quote = data.text();
json.quote.push(quote);
})
}
})
}
res.send('Check your console!')
})
function cleanUp(){
//loop through json object to clean up strings
for(var i = 0; i < json.quote.length; i++) {
//find the index of where the quote ends
var endQuote = json.quote[i].indexOf("―");
//select only the part of the string that contains a quote
json.quote[i] = json.quote[i].substring(0, endQuote - 1);
//remove line breaks from the string
json.quote[i] = json.quote[i].replace(/(\r\n|\n|\r)/gm,"");
}
//write the json file to folder
fs.writeFile('output.json', JSON.stringify(json, null, 4), function(err){
console.log('File successfully written! - Check your project directory for the output.json file');
})
}
app.listen('8081')
console.log('Magic happens on port 8081');
exports = module.exports = app;

In the example code you provided:
for(var i = 1; i < 101; i++){
url = 'http://www.goodreads.com/quotes?page=' + i;
}
The for loop is overwriting the url variable each time through the loop.
You can make it work with a few small changes to your code; the easiest way would be to make url an array and push into it each time through the loop so the list of urls accumulates, like the code below:
var url = [];
for(var i = 1; i < 101; i++){
url.push('http://www.goodreads.com/quotes?page=' + i);
}
You would then need to call your request function for each item in the array, since url now contains 100 items. You would also need to change your fs.writeFile call to fs.appendFile so the results of each request call get added to the output.json file instead of overwriting it.
Finally, you should also consider throttling the requests so you aren't hammering the server of the site you are scraping.
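For illustration, here is a minimal sketch of how the throttling and fs.appendFile changes could fit together; the interval value is an assumption, and in practice you'd reuse the same author/quote selection logic as above:
var interval = 500; //assumed delay in ms between requests, tune to be polite
url.forEach(function(pageUrl, i){
    setTimeout(function(){
        request(pageUrl, function(error, response, html){
            if(error) return;
            var $ = cheerio.load(html);
            //...same author/quote selection as above...
            //append each page's results instead of overwriting the file
            fs.appendFile('output.json', JSON.stringify(json, null, 4), function(err){
                if(err) console.log(err);
            });
        });
    }, i * interval); //stagger requests so one page is fetched every interval ms
});
Note that appending multiple JSON.stringify results produces a file containing several JSON documents rather than one; for a single valid JSON file you'd collect everything first and write once at the end.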

Related

Scraping with NodeJS

I need to extract links from a URL in a loop; basically I need to execute the function again for each extracted link, but I don't know how to do this with Node.js.
var request = require('request');
var cheerio = require('cheerio');
var searchTerm = 'baloncesto';
var url = 'http://mismarcadores.com/' + searchTerm;
request(url , function(err,resp,body){
$ = cheerio.load(body);
links = $('a');
$(links).each(function(i,link){
console.log(url+$(link).attr('href'));
}
)
})
My question is how to extract the links from this array. The code above works correctly (it prints the links to the console), but I need to scrape each of those links in turn.
var request = require('request');
var cheerio = require('cheerio');
var searchTerm = 'baloncesto';
var url = 'http://mismarcadores.com/' + searchTerm;
request(url , function(err,resp,body){
$ = cheerio.load(body)
var allLinks = []
links = $('a');
$(links).each(function(i,link){
console.log(url+$(link).attr('href'))
var currentLink = url+$(link).attr('href')
allLinks.push(currentLink)
if (i == links.length-1){
useLinks(allLinks)
}
}
)
})
function useLinks(allLinks){
console.log(allLinks)
}
If you're asking how to extract the URL from the links received from cheerio, you're already doing it. If you'd like to use them elsewhere after the request is finished (e.g. for scraping again), then store them in an array and call a function to use the array after you iterate through the last link.
It should look something like this:
let links = $('a').get().map(a => $(a).attr('href'))
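To then actually scrape the pages behind those links, you could request each item in the array; a minimal sketch (what you do with each page's body is up to you, the title logging is just a placeholder):
function useLinks(allLinks){
    allLinks.forEach(function(link){
        request(link, function(err, resp, body){
            if (err) return console.error(err);
            var $ = cheerio.load(body);
            //scrape whatever you need from each linked page here
            console.log(link, $('title').text());
        });
    });
}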
I'm sharing my solution. It's similar to the question but with a few changes: I don't extract all the links, only the ones matching the search term I pass in the URL.
var express = require('express');
var fs = require('fs');
var request = require('request');
var cheerio = require('cheerio');
var app = express();
var searchTerm = 'baloncesto';
var url = 'http://mismarcadores.com/' + searchTerm;
app.get('/webscrape', function(req, res,body){
request(url , function(err,resp,body){
var array2 = [];
var array3 = [];
$ = cheerio.load(body);
links = $('a'); //jquery get all hyperlinks
$(links).each(function(i, link){
if($(link).attr('href').includes("baloncesto")){
array2.push($(link).attr('href'));
}
});
const uniqueLinks = new Set([...array2]);
uniqueLinks.forEach((d) => {
const row = []; // a new array for each row of data
row.push(d);
array3.push(row.join()); // by default, join() uses a ','
});
fs.writeFile('raaga_output.json', JSON.stringify(array3, null, 4), function(err){
console.log('File successfully written! - Check your project directory for the raaga_output.json file');
})
res.send('File successfully written! - Check your project directory for the raaga_output.json file');
})
})
app.listen('3000')
console.log('Web Scrape happens on port 3000');
exports = module.exports = app;
Anyone should be able to use this without any problem.

Documents "disappearing" from mongodb collection after node loading routine is finished

Being new to mongodb and node, this particular issue is just driving me crazy:
I have written a small module, which reads a .csv file, converts it into a JSON array, and loads it into a mongodb collection one record at a time in a loop.
As I run this in the debugger with a breakpoint at the "var v = i;" line, I can query the mongo collection and see fully populated records appear one after another.
However, as soon as the loop is finished, ALL of the records' data is gone!
The actual records are still all there, but they are all empty. The data I just saw in each record is no longer there.
It may be some weird scoping issue, but, being new, I just cannot tell what it is.
Here is my code:
exports.excelFileParser = function(fileName, tabName, metadataFields){
var assert = require('assert');
console.log(metadataFields);
if(typeof require !== 'undefined') XLSX = require('xlsx');
var mongodb = require('mongodb');
var _ = require('underscore');
var fs = require('fs');
var Converter=require("csvtojson").core.Converter;
var distinctDataFields;
var MongoClient = mongodb.MongoClient;
var url = 'mongodb://localhost:27017/datamanager-03-test';
var fileName = 'clean_file.csv';
//fs.writeFileSync(fileName, newCsvLines);
var csvFileName=fileName;
var fileStream=fs.createReadStream(csvFileName);
//new converter instance
var csvConverter=new Converter({constructResult:true});
//end_parsed will be emitted once parsing finished
csvConverter.on("end_parsed",function(jsonObj){
//console.log(jsonObj); //here is your result json object
makeRecords(jsonObj);
});
function makeRecords(result){
console.log(result.length);
MongoClient.connect(url, function (err, db) {
if (err) {
console.log('Unable to connect to the mongoDB server. Error:', err);
} else {
console.log('Connection established to', url);
var collectionName = 'DC_Facilities';
db.open(function(err, client){
client.createCollection(collectionName, function(err, col) {
});
var collection = db.collection(collectionName);
for(var i =0;i < result.length; i++){
var dataRecord = result[i];
collection.insert(dataRecord);
var v = i;
}
console.log("finished");
db.close();
});
}
});
}
fileStream.pipe(csvConverter);
};
collection.insert is an asynchronous function, so you're calling db.close() before any of the inserts have had a chance to complete. You also don't need to call createCollection, as the collection will be created for you if it doesn't already exist.
So your code should look something like this instead, so that db.close() isn't called until all the insert operations have completed:
db.open(function(err, client){
var collection = db.collection(collectionName);
var inserted = 0;
for(var i = 0; i < result.length; i++){
var dataRecord = result[i];
collection.insert(dataRecord, function(err) {
if (++inserted == result.length) {
console.log("finished");
db.close();
}
});
}
});
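As a side note, if your driver version is 2.0 or newer, insertMany takes the whole array and fires a single callback once every document is in, which avoids the manual counting; a sketch under that assumption:
db.open(function(err, client){
    var collection = db.collection(collectionName);
    //one round trip, one completion callback for all documents
    collection.insertMany(result, function(err, r){
        if (err) console.log('Insert failed:', err);
        else console.log('finished, inserted ' + r.insertedCount + ' documents');
        db.close();
    });
});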

How to use promises to return the final result of an array?

I am currently trying to return a request of all the file names (in each existing folder) on a particular website. My web application uses NodeJS, Express, Cheerio, and Request to web scrape. My code first gets a list of all the folder names. After retrieving the folder names, it goes inside each folder to get a list of file names and stores them in the 'files' array. Finally, the 'files' array is what will be sent to the client-side.
Right now I am having a big issue with asynchronous behavior: my request always returns an empty list of 'files'. I have the Q node module installed and have tried using promises, but have had no luck getting the results I want. I am still new to NodeJS and would love it if someone can help me out.. :)
exports.getAllImages = function(req, res) {
var folders = [];
var files = [];
//Step 1: Get folder names and store all of them in the 'folders' array
var foldersUrl = 'http://students.washington.edu/jmzhwng/Images/';
request(foldersUrl, function(error, response, html){
if(!error){
var $ = cheerio.load(html);
$("a:contains('-')").filter(function(){
var data = $(this)[0].attribs.href;
folders.push(data);
})
//Step 2: Using the 'folders' array, get file names in each folder and store all of them in the 'files' array
for (var i=0; i < folders.length; i++) {
var imagesUrl = 'http://students.washington.edu/jmzhwng/Images/' + folders[i];
request(imagesUrl, function(error, response, html){
if(!error){
var $ = cheerio.load(html);
$("a:contains('.')").filter(function(){
var data = $(this)[0].attribs.href;
files.push(data);
})
}
})
}
//Step 3: Return all file names to client-side
res.json({
images: files
}, 200);
console.log('GET ALL IMAGES - ' + JSON.stringify(files));
}
})
For better readability or support, you can view the JSFiddle I created here: http://jsfiddle.net/fKGrm/
You don’t necessarily need promises for this—you’re 95% of the way there already without them. The main issue, as I think you’re aware, is that your response is being sent before the image requests come back. You just need to wait for those to finish before you send the response.
The most basic way is to count the number of callbacks you receive in your Step 2. When it equals the folders.length, then send your response.
Here’s a simplified version of that:
var request = require('request'),
cheerio = require('cheerio');
var baseUrl = 'http://students.washington.edu/jmzhwng/Images/';
var files = [];
request(baseUrl, function (error, res, body) {
var folders = folderLinks(cheerio.load(body));
var count = 0;
folders.forEach(function (folder) {
request(baseUrl + folder, function (error, res, body) {
files.push.apply(files, fileLinks(cheerio.load(body)));
if (++count == folders.length) {
console.log(files);
}
});
});
});
function folderLinks ($) {
return $('a:contains(-)').get().map(function (a) {
return a.attribs.href;
});
}
function fileLinks ($) {
return $('a:contains(.)').get().map(function (a) {
return a.attribs.href;
});
}
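Since the question specifically asks about promises: you can also wrap request in a promise and let Promise.all do the counting for you. A sketch, assuming a Node version with native promises and reusing the folderLinks and fileLinks helpers from above:
function fetchPage(url) {
    return new Promise(function (resolve, reject) {
        request(url, function (error, res, body) {
            if (error) reject(error);
            else resolve(cheerio.load(body));
        });
    });
}

fetchPage(baseUrl).then(function ($) {
    //fire all folder requests in parallel and wait for every one
    return Promise.all(folderLinks($).map(function (folder) {
        return fetchPage(baseUrl + folder).then(fileLinks);
    }));
}).then(function (fileArrays) {
    //flatten the per-folder arrays into one list of files
    var files = [].concat.apply([], fileArrays);
    console.log(files);
}).catch(console.error);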

how to read a file, store data and then write it

I have a text file with a ton of values that I want to convert to meaningful JSON using node.js fs module.
I want to store the first value of every line in an array unless the value is already present.
7000111,-1.31349,36.699959,1004,
7000111,-1.311739,36.698589,1005,
8002311,-1.262245,36.765884,2020,
8002311,-1.261135,36.767544,2021,
So for this case, I'd like to write to a file:
[7000111, 8002311]
Here's what I have so far. It writes [] to the file.
var fs = require('fs');
var through = require('through');
var split = require('split');
var shape_ids = [];
var source = fs.createReadStream('data/shapes.txt');
var target = fs.createWriteStream('./output3.txt');
var tr = through(write, end);
source
.pipe(split())
.pipe(tr)
// Function definitions
function write(line){
var line = line.toString();
var splitted = line.split(',');
// if it's not in array
if (shape_ids.indexOf(splitted[0]) > -1){
shape_ids.push(splitted[0]);
}
}
function end(){
shape_ids = JSON.stringify(shape_ids);
target.write(shape_ids);
console.log('data written');
}
The code is using the split and through modules
How do I store values in the array and write the populated array to the file?
Update:
This is what I want to do, but it's in Ruby:
shape_ids = []
File.open("data/shapes.txt").readlines.each do |line|
data = line.split(',')
shape_id = data.first
if !shape_ids.include? shape_id
shape_ids.push(shape_id)
end
end
puts shape_ids # array of unique shape_ids
Can I do this in javascript?
Incidentally, the immediate reason your version writes [] is that the indexOf check is inverted: it only pushes when the value is already in the array, so nothing is ever pushed. The condition should be shape_ids.indexOf(splitted[0]) === -1. That aside, unless you are super comfortable with the new Stream API in node, use the event-stream module to accomplish this:
var fs = require('fs');
var es = require('event-stream');
function getIds(src, target, callback) {
var uniqueIDs = [];
es.pipeline(
fs.createReadStream(src),
es.split(),
es.map(function (line, done) {
var id = line.split(',').shift();
if (uniqueIDs.indexOf(id) > -1) return done();
uniqueIDs.push(id);
done(null);
}),
es.wait(function (err, text) {
// Here we create our JSON; we wrap the array in an object
// so the output file is self-describing
var data = JSON.stringify({ ids: uniqueIDs});
fs.writeFile(target, data, function (err) {
if ('function' == typeof callback) callback(err);
});
})
);
}
getIds('./values.txt', './output.json');
Unfortunately there is no "easy" way to keep this as a pure stream flow, so you have to "wait" until the data is done filtering before turning it into a JSON string. Hope that helps!
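For comparison, the same result is achievable with only core modules using readline; a sketch, assuming the same comma-separated input format:
var fs = require('fs');
var readline = require('readline');

function getIds(src, target) {
    var uniqueIDs = [];
    var rl = readline.createInterface({ input: fs.createReadStream(src) });
    rl.on('line', function (line) {
        var id = line.split(',')[0];
        //keep the first value of each line only if we haven't seen it yet
        if (id && uniqueIDs.indexOf(id) === -1) uniqueIDs.push(id);
    });
    rl.on('close', function () {
        fs.writeFile(target, JSON.stringify({ ids: uniqueIDs }), function (err) {
            if (err) console.error(err);
            else console.log('data written');
        });
    });
}

getIds('./values.txt', './output.json');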

How can I use these Node modules to accept HTML through a file or URL and then output JSON as validation of existing HTML elements?

Essentially what I need to do is to take a local grader.js file and then use it at the command line to input HTML, which will then output JSON data to the console to validate the existence of several HTML elements. The usage looks something like this:
./grader.js --checks checks.json --file index.html
./grader.js --checks checks.json --url http://google.com
The Node modules being used are Commander (for working at the command line), Cheerio (for HTML), and Restler (for getting HTML from URL).
The checks.json file is straightforward: it simply lists a few HTML elements whose existence on the page should be checked:
["h1",
".navigation",
".logo",
".blank",
".about",
".heading",
".subheading",
".pitch",
".video",
".thermometer",
".order",
".social",
".section1",
".section2",
".faq",
".footer"]
The grader.js file is where things get a little more complicated. The following code actually works insofar as it takes the command line arguments and indicates a true or false value for whether the HTML elements exist. But it doesn't work properly after adding the URL check at the bottom.

There is something wrong with my checkUrl function and the way I implement it in the Commander code at the bottom. Even though the true and false values are correct for the HTML file/URL I use, I end up spitting out both checks to the console even if I only want to check either the file or the URL, not both.

I'm fairly new to this, so I'm surprised that it works at all. It may have something to do with the default values, but when I try to change those the checkUrl function seems to break down. Thanks in advance for your help, I really do appreciate it.
#!/usr/bin/env node
var fs = require('fs');
var program = require('commander');
var cheerio = require('cheerio');
var rest = require('restler');
var HTMLFILE_DEFAULT = "index.html";
var CHECKSFILE_DEFAULT = "checks.json";
var URL_DEFAULT = "http://cryptic-spire-7925.herokuapp.com/index.html";
var assertFileExists = function(infile) {
var instr = infile.toString();
if(!fs.existsSync(instr)) {
console.log("%s does not exist. Exiting.", instr);
process.exit(1); // http://nodejs.org/api/process.html#process_process_exit_code
}
return instr;
};
var cheerioHtmlFile = function(htmlfile) {
return cheerio.load(fs.readFileSync(htmlfile));
};
var loadChecks = function(checksfile) {
return JSON.parse(fs.readFileSync(checksfile));
};
var checkHtmlFile = function(htmlfile, checksfile) {
$ = cheerioHtmlFile(htmlfile);
var checks = loadChecks(checksfile).sort();
var out = {};
for(var ii in checks) {
var present = $(checks[ii]).length > 0;
out[checks[ii]] = present;
}
return out;
};
var checkUrl = function(url, checksfile) {
rest.get(url).on('complete', function(data) {
$ = cheerio.load(data);
var checks = loadChecks(checksfile).sort();
var out = {};
for(var ii in checks) {
var present = $(checks[ii]).length > 0;
out[checks[ii]] = present;
}
console.log(out);
});
}
var clone = function(fn) {
// Workaround for commander.js issue.
// http://stackoverflow.com/a/6772648
return fn.bind({});
};
if(require.main == module) {
program
.option('-f, --file <html_file>', 'Path to index.html', clone(assertFileExists), HTMLFILE_DEFAULT)
.option('-u, --url <url>', 'URL to index.html', URL_DEFAULT)
.option('-c, --checks <check_file>', 'Path to checks.json', clone(assertFileExists), CHECKSFILE_DEFAULT)
.parse(process.argv);
var checkJson = checkHtmlFile(program.file, program.checks);
var outJson = JSON.stringify(checkJson, null, 4);
console.log(outJson);
var checkJson2 = checkUrl(program.url, program.checks);
var outJson2 = JSON.stringify(checkJson2, null, 4);
console.log(outJson2);
}
else {
exports.checkHtmlFile = checkHtmlFile;
}
Depending on the arguments, call either checkHtmlFile() or checkUrl().
Something like:
if (program.url)
checkUrl(program.url, program.checks);
else checkHtmlFile(program.file, program.checks);
Read this for more references: commander.js option parsing
Also, checkJson2 is undefined as checkUrl() isn't returning anything.
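One way to address that last point is to give checkUrl a callback, so the caller decides what to do with the result instead of checkUrl printing it; a sketch reusing the question's helpers:
var checkUrl = function(url, checksfile, callback) {
    rest.get(url).on('complete', function(data) {
        $ = cheerio.load(data);
        var checks = loadChecks(checksfile).sort();
        var out = {};
        for(var ii in checks) {
            out[checks[ii]] = $(checks[ii]).length > 0;
        }
        //hand the result back instead of printing it here
        callback(out);
    });
};

checkUrl(program.url, program.checks, function(checkJson2) {
    console.log(JSON.stringify(checkJson2, null, 4));
});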
Those commander .option lines look wrong to me.
Delete the clone function and revise your option lines as follows:
.option('-f, --file <html_file>', 'Path to index.html', HTMLFILE_DEFAULT)
.option('-u, --url <url>', 'URL to index.html', URL_DEFAULT)
.option('-c, --checks <check_file>', 'Path to checks.json', CHECKSFILE_DEFAULT)
This should solve your commander problem.
Here is the updated checkUrl function after the helpful hints from @David and @ankitsabharwal.
var checkUrl = function(url, checksfile) {
rest.get(url).on('complete', function(data) {
$ = cheerio.load(data);
var checks = loadChecks(checksfile).sort();
var out = {};
for(var ii in checks) {
var present = $(checks[ii]).length > 0;
out[checks[ii]] = present;
}
var outJson = JSON.stringify(out, null, 4);
console.log(outJson);
});
}
And here is the updated Commander code below:
if(require.main == module) {
program
.option('-f, --file <html_file>', 'Path to index.html')
.option('-u, --url <url>', 'URL to index.html')
.option('-c, --checks <check_file>', 'Path to checks.json')
.parse(process.argv);
if (program.url) {
checkUrl(program.url, program.checks);
} else {
var checkJson = checkHtmlFile(program.file, program.checks);
var outJson = JSON.stringify(checkJson, null, 4);
console.log(outJson);
}
}
