Scraping with NodeJS - javascript

I need to extract the links from a URL in a loop; basically, I need to run the function again for each link found, but I don't know how to do this with Node.js.
var request = require('request');
var cheerio = require('cheerio');

var searchTerm = 'baloncesto';
var url = 'http://mismarcadores.com/' + searchTerm;

request(url, function(err, resp, body){
    var $ = cheerio.load(body);
    var links = $('a');
    $(links).each(function(i, link){
        console.log(url + $(link).attr('href'));
    });
});
My question is how to scrape the links collected this way. The code above works correctly (it prints the links to the console), but I then need to request each of those URLs and scrape them in turn.

var request = require('request');
var cheerio = require('cheerio');

var searchTerm = 'baloncesto';
var url = 'http://mismarcadores.com/' + searchTerm;

request(url, function(err, resp, body){
    var $ = cheerio.load(body);
    var allLinks = [];
    var links = $('a');
    $(links).each(function(i, link){
        var currentLink = url + $(link).attr('href');
        console.log(currentLink);
        allLinks.push(currentLink);
        // once the last link has been collected, hand the array off
        if (i == links.length - 1){
            useLinks(allLinks);
        }
    });
});

function useLinks(allLinks){
    console.log(allLinks);
}
If you're asking how to extract the URL from the links received from cheerio, you're already doing it. If you'd like to use them elsewhere after the request is finished (e.g. for scraping again), store them in an array and call a function that uses the array after you iterate through the last link.

It should look something like this:
let links = $('a').get().map(a => $(a).attr('href'))
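
From there, useLinks (from the snippet above) could request each collected URL and load it into cheerio, just as the first page was handled. A minimal sketch, assuming you just want each page's title (swap in whatever selectors you actually need):

function useLinks(allLinks){
    allLinks.forEach(function(link){
        request(link, function(err, resp, body){
            if (err) return console.error(err);
            var $ = cheerio.load(body);
            // placeholder: scrape whatever you need from each page here
            console.log(link, $('title').text());
        });
    });
}

Note that this fires all the requests in parallel; if the site objects, run them one at a time or add a delay between requests.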

I'm sharing my solution. It's similar to the code in the question, but with a few changes: I don't extract every link, only the ones matching the search term passed in the URL.
var express = require('express');
var fs = require('fs');
var request = require('request');
var cheerio = require('cheerio');
var app = express();
var searchTerm = 'baloncesto';
var url = 'http://mismarcadores.com/' + searchTerm;
app.get('/webscrape', function(req, res){
    request(url, function(err, resp, body){
        var array2 = [];
        var array3 = [];
        var $ = cheerio.load(body);
        var links = $('a'); // get all hyperlinks
        $(links).each(function(i, link){
            var href = $(link).attr('href');
            // guard against anchors without an href attribute
            if (href && href.includes('baloncesto')){
                array2.push(href);
            }
        });
        // a Set drops duplicate links
        var uniqueLinks = new Set(array2);
        uniqueLinks.forEach((d) => {
            array3.push(d);
        });
        fs.writeFile('raaga_output.json', JSON.stringify(array3, null, 4), function(err){
            console.log('File successfully written! - Check your project directory for the raaga_output.json file');
        });
        res.send('File successfully written! - Check your project directory for the raaga_output.json file');
    });
});
app.listen('3000')
console.log('Web Scrape happens on port 3000');
exports = module.exports = app;
Anyone should be able to use this without any problem.

Related

Switching a forum using basic file save to Mongodb using node.js

I'm running a basic forum built with Node.js and Express, shown below, and I want to switch it to MongoDB, but I'm very new to it. I've already tried a few steps but it isn't going anywhere. I just want to implement my database following the same shape as this code.
var express = require('express');
var fs = require('fs');
var router = express.Router();

router.get('/forum', (req, res) => {
    res.render('forum', {db: req.app.locals.frm, 'user': req.session.user});
})
.post('/forum', (req, res) => {
    var page = req.body;
    req.app.locals.frm.push({'category': page.cat, 'subcategories': []});
    fs.writeFile('./forum-db.json', JSON.stringify(req.app.locals.frm), (e) => { if (e) console.log(e); });
    res.redirect('/forum');
})
.get('/del-elt/:pos', (req, res) => {
    var page = req.params;
    req.app.locals.frm.splice(page.pos, 1);
    fs.writeFile('./forum-db.json', JSON.stringify(req.app.locals.frm), (e) => { if (e) console.log(e); });
    res.redirect('/forum');
})
.get('/category/:id', (req, res) => {
    var cat = req.app.locals.frm[req.params.id];
    res.render('category', {cat: cat, id: req.params.id, 'user': req.session.user});
})
.post('/category', (req, res) => {
    var page = req.body;
    req.app.locals.frm[page.id].subcategories.push({'title': page.cat, 'discussions': []});
    fs.writeFile('./forum-db.json', JSON.stringify(req.app.locals.frm), (e) => { if (e) console.log(e); });
    res.redirect('/category/' + page.id);
})
.get('/del-cat/:id/:pos', (req, res) => {
    var page = req.params;
    req.app.locals.frm[page.id].subcategories.splice(page.pos, 1);
    fs.writeFile('./forum-db.json', JSON.stringify(req.app.locals.frm), (e) => { if (e) console.log(e); });
    res.redirect('/category/' + page.id);
})
.get('/discussion/:cat/:id', (req, res) => {
    var dis = req.app.locals.frm[req.params.cat].subcategories[req.params.id];
    res.render('discussion', {dis: dis, cat: req.params.cat, id: req.params.id, 'user': req.session.user});
})
.post('/discussion', (req, res) => {
    var page = req.body;
    var user = req.session.user;
    var d = new Date();
    var date = d.getDate() + "/" + d.getMonth() + "/" + d.getFullYear() + " (" + d.getHours() + ":" + d.getMinutes() + ")";
    req.app.locals.frm[page.cat].subcategories[page.id].discussions.push([page.message, user.mail, date]);
    fs.writeFile('./forum-db.json', JSON.stringify(req.app.locals.frm), (e) => { if (e) console.log(e); });
    res.redirect('/discussion/' + page.cat + '/' + page.id);
})
.get('/del-dis/:cat/:id/:pos', (req, res) => {
    var page = req.params;
    req.app.locals.frm[page.cat].subcategories[page.id].discussions.splice(page.pos, 1);
    fs.writeFile('./forum-db.json', JSON.stringify(req.app.locals.frm), (e) => { if (e) console.log(e); });
    res.redirect('/discussion/' + page.cat + '/' + page.id);
});
If anyone has any clue on how to do it, thank you.
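
For reference, a minimal sketch of what one of these routes could look like backed by MongoDB instead of fs.writeFile, assuming the official mongodb driver (3.x callback style) and a local server; the forumdb database and categories collection names are placeholders:

var MongoClient = require('mongodb').MongoClient;

MongoClient.connect('mongodb://localhost:27017', function(err, client){
    if (err) throw err;
    var categories = client.db('forumdb').collection('categories');

    // POST /forum: insert a document instead of pushing to app.locals.frm
    router.post('/forum', (req, res) => {
        categories.insertOne({category: req.body.cat, subcategories: []}, (e) => {
            if (e) console.log(e);
            res.redirect('/forum');
        });
    });
});

The reads would similarly become find()/findOne() calls, and the splice-based deletes become updateOne() or deleteOne() with a query.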

How can I return a value from a Javascript file to a Node.js so I can write to a file?

I have a JavaScript file which reads the inputs of a form and puts them into a JSON string. I want to create a file and write the JSON string to it. I know how to write to files using Node, but I want to know how I can pass the string to my server.js file.
Can anyone help?
JavaScript file (script.js):
(function() {
    function toJSONString( form ) {
        var obj = {};
        var elements = form.querySelectorAll( "input, select, textarea" );
        var store = elements[0];
        var configValue = store.value;
        for( var i = 1; i < elements.length; ++i ) {
            var element = elements[i];
            var name = element.name;
            var value = element.value;
            if( name ) {
                obj[ name ] = value;
            }
        }
        console.log(configValue);
        return JSON.stringify({ [configValue] : [ obj ] } );
    }

    document.addEventListener( "DOMContentLoaded", function() {
        var form = document.getElementById( "test" );
        var output = document.getElementById( "output" );
        var fs = require('fs'); // note: require('fs') won't work in the browser; file writing must happen server-side
        form.addEventListener( "submit", function( e ) {
            e.preventDefault();
            var json = toJSONString( this );
            output.innerHTML = json;
        }, false);
    });
})();
Node.js file (server.js):
const socketIO = require('socket.io');
const path = require('path');

const INDEX = path.join(__dirname, '/public/index.html');
const PORT = process.env.PORT || 3000;

var express = require('express');
var fs = require('fs');
var script = require('./public/script.js');

const app = express()
    .get('/', function (req, res) { res.sendFile(INDEX); })
    .use(express.static(__dirname + '/public'))
    .listen(PORT, () => console.log(`Listening on ${ PORT }`));

const io = socketIO(app);
So if I understand correctly, you are handling the stringification of your form on the client side and want to pass it to your Node.js server. To do that, you have to send an HTTP POST request to your server.
That means:
Create a server-side POST route that parses the JSON and writes it to a file.
On the client side, after stringifying the form, send a POST request to the route you just created. To do that you can use the built-in JavaScript fetch: https://developer.mozilla.org/en-US/docs/Web/API/Fetch_API/Using_Fetch
Hope that answers your question!
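
A minimal sketch of both sides, assuming Express 4.16+ (for the built-in express.json() body parser); the /save-form route and form-data.json file names are placeholders. Note that in your server.js the routes must be registered on the express() instance before .listen() is called, since .listen() returns an http.Server rather than the app:

// server.js
app.use(express.json());
app.post('/save-form', function(req, res){
    fs.writeFile('form-data.json', JSON.stringify(req.body, null, 4), function(err){
        if (err) return res.status(500).send('Write failed');
        res.send('Saved');
    });
});

// script.js, inside the submit handler, after building the JSON string
fetch('/save-form', {
    method: 'POST',
    headers: { 'Content-Type': 'application/json' },
    body: json
}).then(function(res){ return res.text(); })
    .then(function(msg){ output.innerHTML = msg; });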

Cheerio problems on parsing html by selectors

var request = require('request');
var cheerio = require('cheerio');

request('http://www.gatherproxy.com/proxylist/anonymity/?t=Elite', function (error, response, html) {
    if (!error && response.statusCode == 200) {
        var $ = cheerio.load(html);
        var temp = $('#tblproxy tbody tr.loading-row');
        console.log(temp.attr('class'));
    }
});
The webpage is at http://www.gatherproxy.com/zh/proxylist/anonymity/?t=Elite
I want to get this element; its selector is #tblproxy > tbody > tr.loading-row.
I tried the same thing in the Chrome console:
var s = $('#tblproxy > tbody > tr.loading-row')
undefined
s.attr('class')
"loading-row"
But it doesn't work in the context of cheerio; the program outputs undefined. Any idea?
I noticed that the element you're trying to query, tbody, is loaded asynchronously, which is beyond the scope of what the request module can handle. You can use phantomjs to simulate the page in a headless browser and get its HTML from a web page module. If you want to create more customized web page modules, refer to the phantomjs documentation.
Fork this GitHub repo demo.
First, create a webpage module to get the html of a specific page.
phantom/request.js
'use strict';
var page = require('webpage').create();
var system = require('system');

page.open(system.args[1], function(status) {
    // dump the fully rendered markup to stdout
    console.log(page.evaluate(function() {
        return document.documentElement.innerHTML;
    }));
    phantom.exit();
});
Second, create a phantomjs cli wrapper for all web page modules inside the phantom directory.
lib/phantom.js
'use strict';
var path = require('path');
var spawn = require('child_process').spawn;
var phantomjs = require('phantomjs');
var fs = require('fs');

var binPath = phantomjs.path;
var slice = Array.prototype.slice;
var phantomPath = path.join(__dirname, '..', 'phantom');

exports = module.exports = function() {
    var args = slice.call(arguments);
    var callback = args.pop();
    var command = spawn(binPath, args);
    command.stdout.on('data', function(data) {
        callback(null, data.toString());
    });
    command.stderr.on('data', function(data) {
        callback({ message: data.toString() }, null);
    });
};

// create methods based on the ./phantom directory web page modules
fs.readdirSync(phantomPath).reduce(function(context, filename) {
    var index = path.basename(filename, '.js');
    context[index] = function() {
        exports.apply(null, [path.join(phantomPath, filename)].concat(slice.call(arguments)));
    };
    return context; // reduce must return the accumulator for the next iteration
}, exports);
Lastly, use the lib/phantom.js script's request method to get the html page.
index.js
'use strict';
var phantom = require('./lib/phantom');
var cheerio = require('cheerio');
var address = 'http://www.gatherproxy.com/proxylist/anonymity/?t=Elite';
phantom.request(address, function(err, html) {
if(err) {
console.log('error');
return;
}
var $ = cheerio.load(html);
var temp = $('#tblproxy tbody tr.loading-row');
console.log(temp.attr('class'));
});
In the page's source code there is no tbody inside #tblproxy (browsers insert tbody automatically when rendering, but cheerio only sees the raw markup), so remove it from the selector:
var temp = $('#tblproxy tr.loading-row');
Update
Following bublik42's comment, if a tbody appears randomly, you can use find():
var temp = $('#tblproxy').find('tr.loading-row');

Having trouble looping through DOM elements with node web scraper

I was able to get the scraper to do what I want it to do, but I'm having a lot of issues actually getting it to loop through the pages I want it to loop through. I think my issue is the placement of my for loop and how it executes.
var express = require('express');
var fs = require('fs');
var request = require('request');
var cheerio = require('cheerio');
var app = express();
//set object to be populated by scraped DOM elements
var author, quote;
var json = { author : [], quote : [] };
//initialize the scrape route
app.get('/scrape', function(req, res){
    //set the scraper url
This is the problem area: how do I set this up so it doesn't just fetch the last page, but loops through all 100 pages?
    for(var i = 1; i < 101; i++){
        url = 'http://www.goodreads.com/quotes?page=' + i;
    }
    request(url, function(error, response, html){
        if(!error){
            //use cheerio to select DOM elements with jquery-style selectors
            var $ = cheerio.load(html);
            $('.quoteText > a').filter(function(){
                var data = $(this);
                author = data.text();
                json.author.push(author);
            });
            $('.quoteText').filter(function(){
                var data = $(this);
                quote = data.text();
                json.quote.push(quote);
            });
        }
        //loop through json object to clean up strings
        for(var i = 0; i < json.quote.length; i++) {
            //find the index of where the quote ends
            endQuote = json.quote[i].indexOf("―");
            //select only the part of the string that contains a quote
            json.quote[i] = json.quote[i].substring(0, endQuote - 1);
            //remove line breaks from the string
            json.quote[i] = json.quote[i].replace(/(\r\n|\n|\r)/gm, "");
        }
        //write the json file to folder
        fs.writeFile('output.json', JSON.stringify(json, null, 4), function(err){
            console.log('File successfully written! - Check your project directory for the output.json file');
        });
        res.send('Check your console!');
    });
});

app.listen('8081');
console.log('Magic happens on port 8081');
exports = module.exports = app;
****edit****
Changed the code around so res.send('Check your console!') runs at the end of the handler; Express will throw an error if res is called more than once. Also included changes based on the accepted answer.
var express = require('express');
var fs = require('fs');
var request = require('request');
var cheerio = require('cheerio');
var app = express();

//set object to be populated by scraped DOM elements
var author, quote;
var json = { author : [], quote : [] };
var url = [];

//initialize the scrape route
app.get('/scrape', function(req, res){
    //build the list of scraper urls
    for(var i = 1; i < 101; i++){
        url.push('http://www.goodreads.com/quotes?page=' + i);
    }
    for(var i in url){
        request(url[i], function(error, response, html){
            if(!error){
                //use cheerio to select DOM elements with jquery-style selectors
                var $ = cheerio.load(html);
                $('.quoteText > a').filter(function(){
                    var data = $(this);
                    author = data.text();
                    json.author.push(author);
                });
                $('.quoteText').filter(function(){
                    var data = $(this);
                    quote = data.text();
                    json.quote.push(quote);
                });
            }
        });
    }
    res.send('Check your console!');
});

function cleanUp(){
    //loop through json object to clean up strings
    for(var i = 0; i < json.quote.length; i++) {
        //find the index of where the quote ends
        endQuote = json.quote[i].indexOf("―");
        //select only the part of the string that contains a quote
        json.quote[i] = json.quote[i].substring(0, endQuote - 1);
        //remove line breaks from the string
        json.quote[i] = json.quote[i].replace(/(\r\n|\n|\r)/gm, "");
    }
    //write the json file to folder
    fs.writeFile('output.json', JSON.stringify(json, null, 4), function(err){
        console.log('File successfully written! - Check your project directory for the output.json file');
    });
}

app.listen('8081');
console.log('Magic happens on port 8081');
exports = module.exports = app;
In the example code you provided:
for(var i = 1; i < 101; i++){
    url = 'http://www.goodreads.com/quotes?page=' + i;
}
The for loop is overwriting the url variable each time through the loop.
You can make it work with a few small changes to your code; the easiest way is to make url an array and push into it each time through the loop, so the list of URLs accumulates, like the code below:
var url = [];
for(var i = 1; i < 101; i++){
    url.push('http://www.goodreads.com/quotes?page=' + i);
}
You would then need to call your request function for each item in the array, since url now holds 100 entries. You should also change your fs.writeFile call to fs.appendFile so the results of each request are added to the output.json file instead of overwriting it.
Finally, you should also consider throttling the requests so you aren't hammering the server of the site you are scraping.
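
Building on that, here is a minimal sketch of the sequential, throttled approach, reusing the url, json, and cleanUp pieces from the edited code above (the 500 ms delay is an arbitrary choice, and the .filter calls are simplified to .each):

function scrapePage(i){
    if (i >= url.length){
        cleanUp(); // all pages done: clean the strings and write output.json once
        return;
    }
    request(url[i], function(error, response, html){
        if (!error){
            var $ = cheerio.load(html);
            $('.quoteText > a').each(function(){ json.author.push($(this).text()); });
            $('.quoteText').each(function(){ json.quote.push($(this).text()); });
        }
        // throttle: wait before requesting the next page
        setTimeout(function(){ scrapePage(i + 1); }, 500);
    });
}
// call this inside the /scrape handler once the url array is filled
scrapePage(0);

Because each request starts only after the previous one finishes, the site sees one request at a time, and cleanUp runs exactly once, after the last page.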

Unable to export variables within function expressjs/requestjs

My export code
//export.js
var express = require('express'),
    fs = require('fs'),
    request = require('request'),
    cheerio = require('cheerio'),
    app = express();

var episode = [],
    title = [],
    synopsis = [],
    reviews = [],
    date = [];

exports.showGrab = function(url, response){
    request(url, response, function(error, response, html){
        var $ = cheerio.load(html),
            shows = {bitten: ['http://www.tv.com/shows/bitten-2013/episodes/']};
        $('.no_toggle._clearfix').eq(1).filter(function(){
            var data = $(this);
            episode = data.children().first().children().last().text();
            exports.episode = episode;
        });
    });
};

console.log(episode); // => still [], not the value assigned inside the function
//import.js
var express = require('express'),
    fs = require('fs'),
    request = require('request'),
    cheerio = require('cheerio'),
    app = express(),
    server = require('./export');

console.log(server.episode);
server.showGrab('http://www.tv.com/shows/bitten-2013/episodes/');
Within the import script, calling server.showGrab works fine, but I need access to the variables set inside showGrab from the import script. I believe this boils down to a scope issue: if I export variables outside a function they work fine, and I was under the impression that declaring variables the way I have would make them global. How can I run this function from the import script, still passing it a URL, and get episode back to work with?
@Pointy, you were right: the import script was reading the value before it was defined.
//import.js
if (server.episode === undefined){
    var test = setInterval(function(){
        if (server.episode != undefined){
            clearInterval(test);
        }
        console.log(server.episode);
    }, 1000);
}
This does the trick; for some reason, using an else instead of wrapping an if inside an if does not work.
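
Polling works, but a more idiomatic fix is to have showGrab accept a callback and hand the value back once the request finishes, instead of reading an exported variable on a timer. A minimal sketch along those lines:

//export.js
exports.showGrab = function(url, done){
    request(url, function(error, response, html){
        if (error) return done(error);
        var $ = cheerio.load(html);
        var episode = $('.no_toggle._clearfix').eq(1)
            .children().first().children().last().text();
        done(null, episode); // the value only exists once the request completes
    });
};

//import.js
server.showGrab('http://www.tv.com/shows/bitten-2013/episodes/', function(err, episode){
    if (err) return console.error(err);
    console.log(episode); // defined here, no polling needed
});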
