Building a simple Node.js scraper function - javascript

I'm trying to build a very simple scraper function for Node.js - just a function that I can pass a URL to, and it returns the scraped data as var data.
I'm completely new to Node.js and can't work out why the following isn't working:
var request = require('request');
var cheerio = require('cheerio');
/**
 * Logs the target URL and asks `request` to fetch it.
 *
 * The parsed document is returned from the inner callback only, so it goes
 * back to `request`; scrape() itself has no return value.
 *
 * @param {string} url - Address of the page to fetch.
 */
function scrape(url) {
    console.log(`Scraping: ${url}`);
    request(url, (fetchError, _response, pageBody) => {
        if (fetchError) {
            throw fetchError;
        }
        // Returned to whoever invoked this callback, not to scrape()'s caller.
        return cheerio.load(pageBody);
    });
}
// scrape() contains no return statement of its own (the only `return` is
// inside the request callback), so `data` is undefined at this point.
var data = scrape('http://www.stackoverflow.com');
// `$` is assigned here without var/let/const.
$ = data;
// Calling `$` as a function is the line reported in the TypeError.
var logo = $('#hlogo a').text();
console.log(logo);
The above code should return "Stack Overflow" but obviously does not. When I run this in the console I get an error:
var logo = $('#hlogo a').text();
^
TypeError: Property '$' of object #<Object> is not a function
Any ideas why this isn't working for me?

Your data will be undefined because the scrape function does not return a value; additionally, the request inside it is asynchronous.
You need to change the logic to something like this:
/**
 * Fetches `url` and passes the cheerio-loaded document to `oncomplete`.
 *
 * @param {string} url - Address of the page to fetch.
 * @param {function} oncomplete - Called with the loaded document on success.
 *   On a request error the error is thrown from inside the request callback
 *   and `oncomplete` is not called.
 */
function scrape(url, oncomplete) {
    console.log(`Scraping: ${url}`);
    request(url, (fetchError, _response, pageBody) => {
        if (fetchError) {
            throw fetchError;
        }
        oncomplete(cheerio.load(pageBody));
    });
}
// Usage: the loaded page is only available inside the completion callback.
scrape('http://www.stackoverflow.com', (data) => {
    /* do work here*/
});

Related

Issues with parsing tags inside an online xml document with node.js

so I'm trying to create an application for the Google Assistant and the data for my application is stored in an online XML, however, I am not sure how I am supposed to extract the specific data that I require from the XML.
I have tried to fix this by indexing the results of the XML parser however I receive either undefined errors or cannot read property errors.
var eyes = require('eyes');
var https = require('https');
var fs = require('fs');
var xml2js = require('xml2js');
// xml2js parser; attributes are collected under the key "ball".
var parser = new xml2js.Parser({ attrkey: "ball"});
parser.on('error', (err) => {
    console.log('Parser error', err);
});

// Accumulates the response body as it streams in.
var data = '';

https.get('https://www.national-lottery.co.uk/results/euromillions/draw-history-full/xml', (res) => {
    // Only 2xx/3xx responses are read; anything else is ignored.
    const isAcceptable = res.statusCode >= 200 && res.statusCode < 400;
    if (!isAcceptable) {
        return;
    }

    res.on('data', (chunk) => {
        data += chunk.toString();
    });

    res.on('end', () => {
        console.log('data', data);
        parser.parseString(data, (err, result) => {
            // `toUse` is assigned without a declaration, as in the original.
            toUse = result['draw-results']['game']['balls']['ball'][1];
            console.log(toUse);
            console.log('FINISHED', err, result);
        });
    });
});
I expect to receive an output of the first ball number called, however, I cannot get the data out other than printing the entire XML.
I get the output 4 for the path result['draw-results'].game[0].balls[0].ball[0]['_'].

Node.js ignoring code to read a file that does not yet exist

I am trying to write some basic code for use in a larger chatbot later. The code takes a user input, checks to see if a file already exists and if it does not, pull data from an api and save the file.
I can get each part working independently (for example, noting out the readfile saves the file and noting out the write successfully reads the file) however when I try to run them together I get an ENOENT error. The entire code can be seen below.
const path = require('path');
const readline = require('readline-sync');
const fs = require('fs');
const request = require('request');
// Reads one command from the console. For "!artist <name>" it looks for a
// cached "<command><name>.txt" file, fetches the artist from the API when the
// file is missing, then reads the file and prints the artist's top songs.
//put api url into variable
const api = "http://vocadb.net/api/";
var command = readline.question("enter a command: ");
//only run with commands that start with !
if (command.startsWith('!')) {
if (command.startsWith('!artist')) {
//split input, strip command char and use as file name
// NOTE: userCommand is assigned without var/let/const.
userCommand = command.replace('!', '');
userCommand = userCommand.split(' ');
var filename = userCommand[0] + userCommand[1] + ".txt";
var file = path.basename(filename);
//if file does not exist, fetch JSON from api and save to file
if (!fs.existsSync(file)) {
console.log("true");
// request() only starts the HTTP call here; the callback below, and the
// writeFile inside it, run when the response arrives.
request(api + "artists?query=" + userCommand[1] + "&artistType=producer&lang=English", function(error, response, body) {
// NOTE: the callback parameter is named `error`, but this line tests `err`,
// which is not declared anywhere in this callback's scope.
if(err) throw err;
console.log(body);
var artist = JSON.parse(body);
console.log(artist);
//if json has no items do not create file, print incorrect and stop execution
// Loose comparison: an empty array coerces to '' and so matches.
if (artist['items'] == '') {
console.log("Artist could not be found");
throw err;
} else {
fs.writeFile(file, JSON.stringify(artist), (err) => {
if (err) throw err;
console.log('file saved');
});
}
});
}
// This readFile is issued directly after the `if` block above, i.e. right
// after request() was started, not from inside the request/writeFile callbacks.
fs.readFile(file, 'utf8', (err, data) => {
if (err) throw err;
// NOTE: `artist` is assigned here without var/let/const.
artist = JSON.parse(data);
// Look up two songs for the first matching artist, sorted by rating score.
request(api + "songs?artistId=" + artist['items'][0]['id'] + "&maxResults=2&getTotalCount=false&sort=RatingScore&lang=English", function(error, response, body) {
var songs = JSON.parse(body);
console.log(artist['items'][0]['name'] + "\n" + artist['items'][0]['defaultName'] + "\n");
console.log("Top songs:\n" + songs['items'][0]['name'] + "\n" + songs['items'][0]['defaultName'] + "\n" + songs['items'][0]['artistString'] + "\n");
console.log(songs['items'][1]['name'] + "\n" + songs['items'][1]['defaultName'] + "\n" + songs['items'][1]['artistString']);
});
});
}
}
It seems like node skips all the code just before the request since the console.log("true") does return but nothing afterwards
So far I have tried separating them into functions (which might be a better etiquette), changing the readFile to readFileSync, using request.get(...).on(...)
Any help will be greatly appreciated
Your code just needs a few minor corrections. Since I don't have access to your API, I'm using a public one (last.fm/api), but it works with JSON almost the same way you want.
var fs = require('fs');
var request = require('request');
// Caches last.fm artist info in a local file.
//  - File missing: fetch the artist info and save it (nothing else is printed
//    on that run).
//  - File present: read it, then fetch and print the artist's top track names.
// NOTE(review): the API key is embedded in the request URLs below; consider
// loading it from configuration instead.
var api = 'http://ws.audioscrobbler.com/2.0/?callback=&method=';
var file = "test.txt";
var userCommand = 'amy';

if (!fs.existsSync(file)) {
    console.log("file not found");
    request(api + "artist.getinfo&artist=" + userCommand + '&api_key=57ee3318536b23ee81d6b27e36997cde&format=json&_=1520772613660', function (err, response, body) {
        if (err) throw err;

        var data = JSON.parse(body);
        if (!data.artist) {
            console.error("Artist could not be found");
            // `err` is falsy on this path, so `throw err` would throw a
            // non-Error value; raise a real Error with a message instead.
            throw new Error("Artist could not be found");
        }

        // Only write the cache file once we know the artist exists.
        fs.writeFile(file, JSON.stringify(data), function (writeErr) {
            if (writeErr) throw writeErr;
            console.log('file saved');
        });
    });
} else {
    console.log('found file "' + file + '"');
    fs.readFile(file, 'utf8', function (err, data) {
        if (err) throw err;

        // Parsing validates the cached file (throws on corrupt JSON). The
        // parsed value is not needed below, so it is no longer stored in an
        // implicit global.
        JSON.parse(data);

        request(api + "artist.getTopTracks&artist=" + userCommand + "&api_key=57ee3318536b23ee81d6b27e36997cde&format=json&_=1520772613660", function (error, response, body) {
            // Surface network failures instead of failing later in JSON.parse
            // with an undefined body.
            if (error) throw error;

            var songs = JSON.parse(body).toptracks.track;
            songs.forEach(function (song) {
                console.log(song.name);
            });
        });
    });
}

JavaScript each() is not working

var request = require('request'),
cheerio = require('cheerio');
var url = "https://namu.wiki/w/크롤링";

// Fetch the page and log the text of every `.wiki-heading-content` element.
// A request error is ignored silently, exactly as before.
request(url, function (err, res, html) {
    if (err) {
        return;
    }
    var $ = cheerio.load(html);
    $('.wiki-heading-content').each(function(){
        // `this` is the current element inside the each() callback.
        console.log({ content: $(this).text() });
    });
});
The line of code below is not working. Why?
//$('.wiki-heading-content').each(function()
You are using jQuery in your code, and on this line you overwrite (within the function scope) your global jQuery object:
var $ = cheerio.load(html);

node.js + request => node.js + bluebird + request

I'm trying to understand how to write code with promises.
Please check my code. Is this right?
Node.js + request:
// Fetch `url` and hand the decoded payload to `callback(err, json)`.
// The body may be plain JSON or a JSONP wrapper such as `fn({...})`.
request(url, function (error, response, body) {
    if (error) {
        return callback(error);
    }
    if (response.statusCode !== 200) {
        // Without a transport error this path used to call callback(null),
        // which a caller cannot tell apart from success.
        return callback(new Error('Unexpected status code: ' + response.statusCode));
    }

    var json;
    try {
        json = JSON.parse(body);
    } catch (e) {
        // Not plain JSON: treat it as JSONP and parse what lies between the
        // first "({" and the last "})". Using the last "})" keeps the payload
        // intact when a string value itself contains "})".
        var startPos = body.indexOf('({');
        var endPos = body.lastIndexOf('})');
        try {
            json = JSON.parse(body.substring(startPos + 1, endPos + 1));
        } catch (jsonpError) {
            // Report unparseable bodies through the callback rather than
            // throwing from inside the asynchronous handler.
            return callback(jsonpError);
        }
    }
    callback(null, json);
});
Node.js + bluebird + request:
// Promise version: keep only the body, parse it, and log the result;
// any rejection is written to stderr.
request.getAsync(url)
    .spread((response, body) => body)
    .then(JSON.parse)
    .then((json) => {
        console.log(json);
    })
    .catch((e) => {
        console.error(e);
    });
How to check response status? I should use if from first example or something more interesting?
You can simply check if the response.statusCode is not 200 in the spread handler and throw an Error from that, so that the catch handler will take care of it. You can implement it like this
var request = require('bluebird').promisifyAll(require('request'), {multiArgs: true});
// Reject on any non-200 status so the trailing .catch() reports it;
// otherwise resolve with the parsed JSON body.
request.getAsync(url)
    .spread((response, body) => {
        if (response.statusCode !== 200) {
            throw new Error('Unsuccessful attempt. Code: ' + response.statusCode);
        }
        return JSON.parse(body);
    })
    .then(console.log)
    .catch(console.error);
And if you notice, we return the parsed JSON from the spread handler, because JSON.parse is not an async function, so we don't have to do it in a separate then handler.
One way to check the status code:
.spread(function(response, body) {
if (response.statusCode !== 200) {
throw new Error('Unexpected status code');
}
return body;
})

call two file from another with node js

I have a file called node.js:
var net = require('net');
var crypto = require('crypto');
//sjcl
var sjcl = require('./sjcl');
//retrive fb profile
var loadFb = require('./loadFb.js');
var loadFeed = require('./loadFeed.js');
//read json user file
var fs = require('fs');
var text = fs.readFileSync(__dirname + '/users','utf8');
var HOST = 'localhost';
var PORT = 7000;
// TCP server: for each chunk received on a connection it runs loadFb, then
// loadFeed, and finally sends the contents of /tmp/profile back on the socket.
// NOTE(review): `extendetPath` and `extendetPath2` are not defined in the
// visible code.
net.createServer((sock) => {
    // We have a connection - a socket object
    console.log(`CONNECTED: ${sock.remoteAddress}:${sock.remotePort}`);

    // Handle incoming data on this socket
    sock.on('data', (data) => {
        console.log(`User request profile of: ${data}`);
        loadFb(extendetPath, (pageData) => {
            loadFeed(extendetPath2, (pageData2) => {
                // Reply with whatever is in the profile file at this moment.
                const profileText = fs.readFileSync('/tmp/profile', 'utf8');
                console.log(profileText);
                sock.write(profileText);
            });
        });
    });

    // Log when this socket closes
    sock.on('close', (data) => {
        console.log(`CLOSED: ${sock.remoteAddress} ${sock.remotePort}`);
    });
}).listen(PORT);

console.log(`Server listening on ${HOST}:${PORT}`);
/**
 * Looks up a user's public key in the JSON held in the module-level `text`.
 *
 * The first "\n" is removed from both the stored username and `id` before
 * they are compared, and from the returned key.
 *
 * @param {string} id - Username to search for.
 * @returns {string|null} The first matching friend's public key, or null
 *   when no friend matches.
 */
function returnKeyFromUser(id)
{
    const friends = JSON.parse(text).friendlist.friend;
    for (let index = 0; index < friends.length; index += 1) {
        const entry = friends[index];
        const storedName = entry.username.replace("\n", "");
        const wantedName = id.replace("\n", "");
        if (storedName === wantedName) {
            return entry.publicKey.toString().replace("\n", "");
        }
    }
    return null;
}
There is a small http server that receives a facebook username and what he have to do is retrieve 2 page:
a graphapi with the profile information, and a graphapi with the feed informations of a facebook profile
I copy the other two files:
var https = require('https');
module.exports = function(path, callback) {
var options = {
host: 'graph.facebook.com',
port: 443,
path: (path.toString()).replace("\n",""),
method: 'GET'
};
var req = https.get(options, function(res) {
var pageData = "";
if((path.toString()).indexOf("/")==0 && (path.toString()).indexOf("/GET /`HTTP/")!=0)
//for load only (I hope facebook profile)
{
console.log(options);
res.setEncoding('utf8');
res.on('data', function (chunk) {
pageData += chunk;
});
res.on('end', function()
{
var fs = require('fs');
fs.writeFile("/tmp/profile", pageData, function(err) {
if(err) {
console.log(err);
} else {
console.log("The file was saved!");
}
});
//callback(pageData);
return;
});
}
});
};
3° file
var https = require('https');
module.exports = function(path, callback) {
var options = {
host: 'graph.facebook.com',
port: 443,
path: (path.toString()).replace("\n",""),
method: 'GET'
};
var req = https.get(options, function(res) {
var pageData = "";
if((path.toString()).indexOf("/")==0 && (path.toString()).indexOf("/GET / HTTP/")!=0) //for load only (I hope facebook profile)
{
console.log(options);
res.setEncoding('utf8');
res.on('data', function (chunk) {
pageData += chunk;
});
res.on('end', function()
{
var fs = require('fs');
fs.appendFile('/tmp/profile', "***"+pageData, function (err) {
if (err) throw err;
console.log('It\'s saved!');
});
callback(pageData);
});
}
});
};
I don't know if there is a way to call the two files from the first file, node.js, but what I have done is this: I call the first file from node.js, and from the second file I call the third.
in node.js file I call the first file loadFb.js with this command:
loadFb(extendetPath, function(pageData)
{
This call saves a file in my /tmp directory (/tmp/profile), and inside it I call the other file, loadFeed, which appends some text.
After that I have to send the complete information to the client, but something goes wrong.
Node.js correctly calls loadFb, which writes /tmp/profile, and then it calls loadFeed,
but before the information has been appended, node responds to the client with only half of the information that I need.
I'm not a good nodejs programmer, this is a work for my thesis.
Can someone help me?
Let's look at the following code:
// Handler for the response's 'end' event.
res.on('end', function()
{
var fs = require('fs');
// appendFile is started here; its completion is reported to the function
// passed as the last argument.
fs.appendFile('/tmp/profile', "***"+pageData, function (err) {
if (err) throw err;
console.log('It\'s saved!');
});
// This call sits outside the appendFile callback, directly after the
// appendFile call itself.
callback(pageData);
});
What it does is run the asynchronous method appendFile and, immediately after that, call callback. So when the code in the callback is executed, the file has not been updated yet. You need to move the callback(pageData); into appendFile's callback. You also need to review your code with this in mind, because I see that the same fix should be made in the other file, so there may be similar places elsewhere as well.

Categories