I'm building a simple web scraper using cheerio; this is my code:
var express = require('express');
var fs = require('fs');
var request = require('request');
var cheerio = require('cheerio');

var app = express();

// Scrapes one matchday of fantasy-football grades from gazzetta.it and
// dumps them to output.json as { teamsList: { team: { player: grade } } }.
app.get('/', function(req, res){
    // 'var' keeps url out of the global scope (it was an implicit global).
    var url = 'http://www.gazzetta.it/calcio/fantanews/voti/serie-a-2016-17/giornata-32';
    request(url, function(error, response, html){
        if(!error){
            var $ = cheerio.load(html);
            var json = {
                teamsList : {}
            };
            $('.listView .magicTeamList').each(function(){
                var teamName = $(this).find('.teamNameIn').text();
                // Reuse the team object if it already exists instead of
                // resetting it (the original wiped previously added players).
                var team = json.teamsList[teamName] || (json.teamsList[teamName] = {});
                // .text() on a multi-element selection concatenates every
                // match into one string; walk the player rows one by one and
                // pair each player with the grade at the same index.
                var players = $(this).find('.playerNameIn');
                var grades = $(this).find('.fvParameter').not('.head');
                players.each(function(i){
                    team[$(this).text()] = grades.eq(i).text();
                });
            });
            // Write only on success: on the error path json was undefined,
            // so JSON.stringify(json) produced undefined and the write broke.
            fs.writeFile('output.json', JSON.stringify(json, null, 4), function(err){
                console.log('File successfully written! - Check your project directory for the output.json file');
            });
        } else {
            console.log('error happened :' + error);
        }
        // Finally, we'll just send out a message to the browser reminding you that this app does not have a UI.
        res.send('Check your console!');
    });
});

app.listen(8081);
console.log('Magic happens on port 8081');
exports = module.exports = app;
I want to push data inside the json object, but I'm not getting the desired effect. The output.json I get is this (I'll just paste an excerpt of the result):
{
"teamsList": {
"atalanta": {
"Gollini P.Masiello A.Conti A.Hateboer H.Caldara M.Toloi R.Kurtic J.Cristante B.Freuler R.Kessie F.Petagna A.Dalessandro M.Cabezas B.Paloschi A.": "4.56.57.55.5779667666-"
},
"bologna": {
"Mirante A.Torosidis V.Maietta D.Masina A.Gastaldello D.Pulgar E.Taider S.Dzemaili B.Verdi S.Di Francesco F.Petkovic B.Krafth E.Krejci L.Sadiq U.": "5.5665.5636.565.556--5.5"
}
}
}
But what i want is an output like this:
{
"teamsList": {
"atalanta": {
"Gollini P." : 4.5,
"Masiello A." : 6.5,
...
}
}
}
I've searched for answers but couldn't find any for my specific problem — or maybe I'm just missing something obvious. Any help would be appreciated. Thanks!
On each loop you have this
json.teamsList[teamName] = {};
That is going to remove any existing players from the team object. You need an `if` statement (or an `||` fallback) to check whether the team object already exists before creating it.
Related
I'm attempting to make a live graph with data from my sensors, and currently that's working, but I'm not sure how to get it to update by itself.
I'm using node/express/pug, and I have a backend which listens for MQTT messages and appends them to a json file, keeping the latest 30 values or so.
The frontend's routing index.js parses the JSON and structures it in arrays, which are passed to the pug template, which the javascript inside it can then access.
My question is, in order for this to dynamically reload, can I still do it like this, through index.js, or do I need to do it another way? Ideally some sort of notification when the file is updated from the backend to front end, but a timer will be perfectly adequate. The updates will be at about 1000ms.
I am quite a beginner in javascript/web development.
My index.js file, the last part is the relevant part:
var router = express.Router();
var db;
var fs = require('fs');
var moment = require('moment');

/* GET home page. */
router.get('/', function(req, res, next) {
  res.render('index', { title: 'Sematek StrainMonitor' });
  // Do not call next() after rendering: the response is already being sent,
  // and handing control to the next matching handler triggers
  // "Cannot set headers after they are sent" errors.
});
// When /rawdata is requested, query MongoDB (optionally filtered by the
// from/to query params) plus the CloudMQTT JSON file, and render the chart
// page with both datasets.
router.get('/rawdata', function(req, res) {
  var dateFrom = req.query.from;
  var dateTo = req.query.to;
  var dateFromEpoch = moment(dateFrom).unix();
  var dateToEpoch = moment(dateTo).unix();

  // Build the filter as a real query document. The original built a string
  // and then passed {searchQuery} — shorthand for { searchQuery: "..." } —
  // which is not a valid Mongo filter. An empty object means "no filter".
  let searchQuery = {};
  if ((dateFromEpoch) && (dateToEpoch)) {
    searchQuery = { epoch: { $gt: dateFromEpoch, $lt: dateToEpoch } };
  }

  req.conn.then(client => client.db('test').collection('sensor0').find(searchQuery).toArray(function(err, docs) {
    if (err) { console.error(err); }
    if (!docs) {
      console.log("oops.. didn't retrieve any docs from the mongodb serv");
      return; // bail out: the code below would crash on docs.map
    }

    // DB readings -> parallel label/value arrays for Chart.js.
    const dataPairsDB = docs.map(e => ({x: e.epoch, y: e.data}) );
    let datasetDB = [];
    let labelsetDB = [];
    dataPairsDB.forEach((pair) => {
      labelsetDB.push(moment.unix(pair.x).format('HH:mm:ss'));
      datasetDB.push(pair.y);
    });
    console.log(req.query);

    //gets CloudMQTT values from JSON file and converts it into two arrays for Chart.js display
    let data = JSON.parse(fs.readFileSync("./json/latest-value.json"));
    const dataPairs = data.readings.map(e => ({x: e.epoch, y: e.data}) );
    let dataset = [];
    let labelset = [];
    dataPairs.forEach((pair) => {
      labelset.push(moment.unix(pair.x).format('HH:mm:ss'));
      dataset.push(pair.y);
    });

    if (req.cookies.isLoggedIn === 'true') {
      res.render('rawdata', {
        docs : docs,
        datasetDB : JSON.stringify(datasetDB),
        labelsetDB : JSON.stringify(labelsetDB),
        dataset : JSON.stringify(dataset),
        labelset : JSON.stringify(labelset)
      });
    } else {
      res.redirect(401,'/');
    }
  }));
});
From my rawdata.pug file:
//- Chart.js line chart. dataset/labelset are injected server-side by
//- res.render as pre-stringified JSON; '!{...}' interpolates them unescaped.
//- NOTE(review): JSON.parse('!{dataset}') breaks if the JSON ever contains
//- a single quote — presumably safe for numeric arrays, but confirm.
canvas(id="line-chart" )
script(src='https://cdnjs.cloudflare.com/ajax/libs/Chart.js/2.5.0/Chart.min.js')
script(type='text/javascript').
var dataset = JSON.parse('!{dataset}');
var labelset = JSON.parse('!{labelset}');
script(src='/scripts/charter.js')
Just have the webpage subscribe to the same MQTT topic (using MQTT over Websockets see the Paho Javascript client) and update live without having to go back to the back end and pull all the historical data every time a new value is added.
I am trying to create a configuration file in order to use some fields in my files.
So, in the config file ( myconfig.json ) :
// Serialize a default configuration object and persist it next to this file.
var fs = require('fs');
var path = require('path');

var defaults = { FIELD: 1 };
var serialized = JSON.stringify(defaults);
var target = __dirname + '/myconfig.json';

fs.writeFile(target, serialized, function (err) {
    if (err) {
        // Report the failure and stop; nothing was saved.
        console.log('There has been an error.');
        console.log(err.message);
        return;
    }
    console.log('Configuration saved successfully.')
});
In another js file :
var path = require('path');
var fs = require('fs');
// readFileSync with no encoding returns a raw Buffer, not a parsed object,
// so Data.FIELD below is undefined — this is the bug being asked about.
// JSON.parse(Data) is required before property access.
var Data = fs.readFileSync( __dirname + '/myconfig.json');
console.log("res = ", Data.FIELD);
but it prints undefined.
JSON.parse(Data) should fix it (it's a string at the moment).
Ok, the error was that the first code is a javascript file and not a json .
So, I need to have the myconfig.json :
{
"FIELD" : 1
}
and use :
var Data = JSON.parse(fs.readFileSync( './myconfig.json'));
You can just use require to get the JSON back from the file:
var Data = require(__dirname + '/myconfig.json');
console.log("res = ", Data.FIELD);
I am very new to NodeJS and I am building my first API using restify.
I want to find out what is best practice for caching the response data - each API call must have its own cache time.
I have looked at res.cache() but that seems to be only per user request and not a global application cache.
I then looked at restify-cache but the documentation did not clearly tell me how to use it.
My application works like this:
server.js code:
var restify = require('restify');
var mysqlDB = require('./config/connection');

// REST server declaration and configuration
var server = restify.createServer({
    name: 'test-api',
    version: '0.0.1'
});

server.pre(restify.pre.sanitizePath());
// queryParser was registered twice in the original; once is enough.
server.use(restify.queryParser());
server.use(restify.acceptParser(server.acceptable));
server.use(restify.bodyParser());

server.listen(9007, function() {
    // '%s' was '%' — an incomplete format placeholder that printed literally.
    console.log('%s listening at %s', server.name, server.url);
    mysqlDB.handleDisconnect();
    console.log(new Date() +': Started Cricket API on port 9007');
});

var routes = require('./routes')(server);
routes.js code:
module.exports = function(app) {
app.get('/', function(req, res, next) {
return res.send("You have reached the test API");
});
var fixtures = require('./controllers/fixtures');
app.get('/getfixtures', fixtures.getFixtures); // Get All Fixtures
};
fixtures.js code snippet:
// GET /getfixtures — returns all fixtures from today onward as JSON,
// with fixture_date reformatted for display.
this.getFixtures = function (req, res, next) {
    res.header("Access-Control-Allow-Origin", "*");
    res.header("Access-Control-Allow-Headers", "X-Requested-With");
    console.log("Get All Fixtures");
    var mysql = mysqlDB.getConnection();
    var query = "SELECT * FROM fixtures WHERE fixture_date >= CURDATE() ORDER BY fixture_date, fixture_time";
    mysql.query(query,function(err,rows){
        if(err) {
            var status = mysqlDB.getErrorStatus(err.code);
            return res.status(status.code).send("Error : "+ status.Message);
        }
        // The original 'for (i in rows)' iterated string keys and leaked a
        // global 'i'; map over the array instead.
        var data = rows.map(function(item){
            item.fixture_date = util.formatDate(item.fixture_date);
            return item;
        });
        return res.send(data);
    });
};
Can someone please send me in the right direction? I don't know where to add the caching part?
From the library file:
server.use(cache.before); is a middleware that is triggered before the request is handled; it goes to Redis, checks whether the header_{url} and payload_{url} keys exist, and if so returns the stored value.
You could put it as mentioned in this gist:
https://gist.github.com/jeffstieler/3d84fa5468c7eadb7685
var server = restify.createServer({
    name: 'test-api',
    version: '0.0.1'
});

server.pre(restify.pre.sanitizePath());
// Cache lookup runs right after path sanitization, before the parsers,
// so a Redis hit can short-circuit the request early.
server.use(cache.before);
// queryParser was registered twice in the original snippet; once suffices.
server.use(restify.queryParser());
server.use(restify.acceptParser(server.acceptable));
server.use(restify.bodyParser());
// After each handled request, persist the response into the cache.
server.on('after', cache.after);
In your code I would add cache.before after you sanitize the path, since that sanitized path is what gets saved in Redis. Also, a next() call should be included in every cached route.
I ended up using node-cache
It was easy to use since I come from a Java/Play Framework background - hopefully it helps someone else in future.
Example usage:
var nodeCache = require( "node-cache" );
var myCache = new nodeCache();
var cachedValue = myCache.get("alltests", true);
if (cachedValue != undefined) {
return res.send(cachedValue);
} else {
// Do work here and then:
success = myCache.set("alltests", valueHere, cacheTime);
}
So I have found an API for Node: https://github.com/schme16/node-mangafox
But I have no idea how to use it.
Lets say that i want to use this
// Quoted from node-mangafox: fetches the manga index page and builds a map
// of normalized-title -> { id, title }. The result is delivered through the
// callback argument — the function itself returns nothing, so callers must
// pass a function rather than use a return value.
mangaFox.getManga = function(callback){
$.get('http://mangafox.me/manga/',function(data){
var list = {};
data.find('.manga_list li a').each(function(index, d){
var b = $(d);
// Key by the library's "fixed" title; keep the raw title and its rel id.
list[mangaFox.fixTitle(b.text())] = {id:b.attr('rel'), title:b.text()};
});
// Invoke the callback (or a no-op if none was supplied) with the map.
(callback||function(){})(list);
}, true);
}
What should I do to show the list in the '/' route?
This is what I have so far:
var express = require('express'),
path = require('path'),
mangaFox = require('node-mangafox');
var app = express();
// TODO: call mangaFox.getManga(callback) here and send the list from
// inside the callback — the '/' handler is still empty.
app.get('/', function(req, res) {
});
app.listen(1337);
console.log('oke');
If someone could help me understand how this works, that would be great.
app.get('/', function(req, res) {
    // getManga delivers its result asynchronously through this callback;
    // respond from inside it rather than trying to use a return value.
    // (The original returned Object.keys(data) first, which made res.send
    // unreachable, and res.send referenced the outer 'list', which is
    // undefined because getManga returns nothing.)
    function renderList(data) {
        res.send(JSON.stringify(Object.keys(data)));
    }
    mangaFox.getManga(renderList);
});
This is the simplest thing I can come up with. You just get the object returned by the module, list its keys, and send back that stringified as your response. Try it out. You'll probably want to replace the renderList with some HTML templating.
I'm scraping the ghost blogging platform and the package I'm using to do so is request, but I'm not sure how to return a value of a nested request. I commented out the area that is causing the issue. Thanks!
var express = require('express');
var fs = require('fs');
var request = require('request');
var cheerio = require('cheerio');

var app = express();

// Scrapes a Ghost blog's index for article metadata, then fetches each
// article's page for its body text, and writes everything to posts.json.
app.get('/', function(req, res){
    var url = 'http://<subdomain>.ghost.io';
    var articles = [];

    request(url, function(error, response, html){
        if(!error){
            var $ = cheerio.load(html);
            var pending = 0; // nested article requests still in flight

            // Write the file only once every nested request has completed;
            // the original wrote immediately, before any content arrived.
            function finish() {
                fs.writeFile('posts.json', JSON.stringify(articles, null, 4), function(err){
                    console.log('Posts created.');
                });
            }

            $('article').each(function(index) {
                var self = $(this);
                var article = {
                    header : self.find('h2.post-title').text(),
                    route: url + self.find('h2.post-title a').attr('href'),
                    content : '',
                    author: self.find('footer a').text(),
                    timestamp : self.find('time.post-date').text()
                };
                articles.push(article);
                pending++;
                // A value cannot be 'return'ed out of this asynchronous
                // callback into the loop; assign it to the article object
                // inside the callback and track completion instead.
                request(article.route, function(error, response, html) {
                    if (!error) {
                        // Use a separate cheerio instance: reassigning the
                        // shared $ would race with the outer loop.
                        var $post = cheerio.load(html);
                        article.content = $post('section.post-content').text();
                    }
                    pending--;
                    if (pending === 0) finish();
                });
            });
            if (pending === 0) finish(); // page had no articles
        }
    });
})

app.listen('8000');
console.log('Watching for changes.');
exports = module.exports = app;
So your problem boils down to having a list of URLs, and wanting to (asynchronously, because node.js) request all of them, and then know when they've all finished and then do something with the collected results.
The async module (npm install async) will let you do that like this:
// Fetch every URL in parallel; async.map invokes the final callback once
// all requests are done, with responses in the same order as the input.
var request = require('request');
var async = require('async');

var urls = ["http://google.com", "http://yahoo.com"];

async.map(urls, request, function(err, results) {
    // Results line up with urls, so this prints the first 100 characters
    // of http://google.com, then of http://yahoo.com.
    results.forEach(function(response) {
        console.log(response.body.substr(0, 100));
    });
});
So you can apply this to your problem by doing the following:
Synchronously create the entire articles list.
Use async.map on the list.
In the callback to async.map, you have a list of all the responses; you can process them synchronously.