I have a Node.js app that scrapes information from a website. I'm using the npm packages request and cheerio, and the scraping works fine, but I want to do something else once the request function is done. Here's some code:
app.js
var express = require('express');
var extractor = require("./extractor");
console.log(extractor('http://www.example.com'));
var app = express();
app.get('/', function (req, res) {
  res.send('Hello world\n');
});
app.listen(3000);
extractor.js (all the fun)
var request = require('request');
var cheerio = require('cheerio');
var Extractor = function(url) {
  var games = [];
  request(url, function (error, response, html) {
    if (!error && response.statusCode == 200) {
      var $ = cheerio.load(html);
      $('tr.game').each(function(i, v) {
        var game = { /* many attributes */ };
        games.push(game);
      });
    }
  });
  this.extractedGames = games; // still empty here: the request hasn't finished yet
};

module.exports = function(url) {
  return new Extractor(url);
};
Eventually, when I run this, it shows { extractedGames: [] }. That is because the output is printed before the request has finished. So I want to add an "on success" event for the extractedGames attribute when the request job is over.
Thanks
Solved it myself! I hope this helps people in the future (though I felt like a complete noob).
The trick was to emit an event and handle it later.
var express = require('express');
var extractor = require("./extractor");
extractor('http://www.example.com');
var app = express();
app.get('/', function (req, res) {
  res.send('Hello world\n');
});
app.listen(3000);
I removed console.log(extractor('http://www.example.com')) because it would run before the request job is done; I moved the logging into the event handler instead.
var request = require('request');
var cheerio = require('cheerio');
var Emitter = require('events').EventEmitter;

var extractEmitter = new Emitter();
extractEmitter.on('extracted', function(extractedGames) {
  console.log(extractedGames);
});

var Extractor = function(url) {
  var games = [];
  request(url, function (error, response, html) {
    if (!error && response.statusCode == 200) {
      var $ = cheerio.load(html);
      $('tr.game').each(function(i, v) {
        var game = { /* many attributes */ };
        games.push(game);
      });
      // Notify listeners once all rows have been collected.
      extractEmitter.emit('extracted', games);
    }
  });
  this.extractedGames = games;
};

module.exports = function(url) {
  return new Extractor(url);
};
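For what it's worth, the same result can be had without a shared emitter by letting the extractor accept a callback. Here's a minimal sketch of that variant (the done parameter name is my choice, not from the original code):

var request = require('request');
var cheerio = require('cheerio');

// Callback-style variant: the caller decides what to do with the result.
module.exports = function(url, done) {
  request(url, function (error, response, html) {
    if (error || response.statusCode != 200) {
      return done(error || new Error('Unexpected status: ' + response.statusCode));
    }
    var $ = cheerio.load(html);
    var games = [];
    $('tr.game').each(function(i, v) {
      games.push({ /* many attributes */ });
    });
    done(null, games); // runs only after every row has been parsed
  });
};

Used from app.js it would look like:

extractor('http://www.example.com', function(err, games) {
  console.log(err || games);
});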
Related
The user will give the URL in an input field on the HTML page; that URL needs to reach the JS program, and then the JS program needs to run and fetch the data from the webpage.
This is what I have done so far.
var request = require('request');
var cheerio = require('cheerio');
var fs = require("fs");

var url = ""

request(url, function(err, response, html) {
  if (!err) {
    var $ = cheerio.load(html);
    var allItems = $('.clearfix').parent().children();
    var items = [];
    allItems.each(function(index) {
      var result = $('.clearfix').eq(index).parent().children().eq(1).find("a").text();
      if (result !== "") {
        items.push(result);
      }
    });
    // writeFile needs a completion callback; the trailing comma was a syntax error.
    fs.writeFile("output1.xls", JSON.stringify(items, null, 1), function(err) {
      if (err) console.log(err);
    });
    console.log(items);
  }
});
Is this the solution to your problem?
// In the HTML page, read the URL the user typed into the input field:
var url = document.getElementById('myURL').value;

var request = require('request'),
    cheerio = require('cheerio');

// Hard-coded example URL for testing:
url = "https://namu.wiki/w/크롤링";

request(url, function (err, res, html) {
  if (!err) {
    var $ = cheerio.load(html);
    $('.wiki-heading-content').each(function() {
      var post = { "content": "" };
      var data = $(this);
      post['content'] = data.text();
      console.log(post);
    });
  }
});
The line of code below is not working. Why?
//$('.wiki-heading-content').each(function()
You are using jQuery in your code, and on the line below you overwrite (in the function scope) your global jQuery object.
var $ = cheerio.load(html);
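A minimal fix, assuming you still need the page's jQuery $ inside that callback, is to give cheerio's instance a different name (the $page name here is my choice):

var $page = cheerio.load(html); // cheerio's instance under a different name
$page('.wiki-heading-content').each(function() {
  var post = { "content": $page(this).text() };
  console.log(post);
});
// The surrounding page's jQuery $ is no longer shadowed in this scope.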
This question is about a crawler in Node.js.
A start_url is given, where it crawls for URLs and "pushes" them to a .json file (output.json).
At the moment it runs the request function only with the start_url and saves the collected URLs in output.json. I want it to use the saved URLs instead, replacing the start_url with the first collected URL, collecting links again, and so on.
var fs = require('fs');
var request = require('request');
var cheerio = require('cheerio');

var start_url = ["http://stackoverflow.com/"];

var req = function(url) {
  request(url, function(error, response, html) {
    var $ = cheerio.load(html);
    var data = [];
    $("a").each(function() {
      var link = $(this);
      var exurls = { exurl: new Array(link.attr("href")) };
      data.push(exurls);
      // Queue "exurls" for "start_url" and call the same function with the new URL (endless loop)
      // save to "output.json" from time to time, so you can stop it anytime
    });
    fs.writeFile("output.json", JSON.stringify(data, null, 4), function(err) {
      if (err) {
        console.log(err);
      } else {
        console.log("File successfully written!");
      }
    });
  });
};

for (var i = 0; i < start_url.length; i++) {
  req(start_url[i]);
}
So what you can do is make the function call itself recursively. The example below should work:
var fs = require('fs');
var request = require('request');
var cheerio = require('cheerio');

var start_url = ["http://stackoverflow.com/"];
var count = 0; // must live outside req, or every call would restart at 0

var req = function(url) {
  request(url, function(error, response, html) {
    var $ = cheerio.load(html);
    $("a").each(function() {
      var link = $(this);
      // Queue each found href on "start_url" so it gets crawled in turn.
      // Push the URL string itself; request() expects a string, not an object.
      start_url.push(link.attr("href"));
    });
    try {
      // writeFileSync needs the data to write as its second argument;
      // save from time to time, so you can stop the crawler anytime.
      fs.writeFileSync("output.json", JSON.stringify(start_url, null, 4));
      console.log("File successfully written!");
    } catch (err) {
      console.log(err);
    }
    ++count;
    if (start_url.length > count) {
      req(start_url[count]);
    }
  });
};

req(start_url[0]);
The problem with this is that you are completely rewriting the file each time. If this goes on for a while, you are going to run out of memory. Another option is to create a write stream:
var fs = require('fs');
var request = require('request');
var cheerio = require('cheerio');

var start_url = ["http://stackoverflow.com/"];
var wstream = fs.createWriteStream("output.json");

var req = function(url) {
  request(url, function(error, response, html) {
    var $ = cheerio.load(html);
    $("a").each(function() {
      var link = $(this);
      var exurl = link.attr("href");
      // Queue the URL on "start_url" and stream it straight to disk,
      // so you can stop the crawler at any time without losing data.
      start_url.push(exurl);
      wstream.write('"' + exurl + '",');
    });
    // Basic queue: drop the URL we just crawled, take the next one.
    start_url.shift();
    if (start_url.length > 0) {
      return req(start_url[0]);
    }
    wstream.end();
  });
};

req(start_url[0]);
Edit: switched to a basic queue to combat memory problems.
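One caveat neither version handles (my addition, not part of the original answer): pages link back to each other, so the queue can grow forever. Tracking visited URLs keeps the crawl from revisiting pages; a minimal sketch:

var visited = {}; // hypothetical helper state, not in the original answer

function enqueue(href) {
  // Skip empty hrefs and anything already seen or crawled.
  if (href && !visited[href]) {
    visited[href] = true;
    start_url.push(href);
  }
}

Calling enqueue(link.attr("href")) inside the each() loop, instead of pushing directly, keeps the queue finite on sites that link back to themselves.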
I have a file called node.js:
var net = require('net');
var crypto = require('crypto');
// sjcl
var sjcl = require('./sjcl');
// retrieve fb profile
var loadFb = require('./loadFb.js');
var loadFeed = require('./loadFeed.js');
// read json user file
var fs = require('fs');
var text = fs.readFileSync(__dirname + '/users', 'utf8');

var HOST = 'localhost';
var PORT = 7000;

net.createServer(function(sock) {
  // We have a connection - a socket object
  console.log('CONNECTED: ' + sock.remoteAddress + ':' + sock.remotePort);
  // Add a 'data' event handler to this instance of socket
  sock.on('data', function(data) {
    console.log('User request profile of: ' + data);
    //var date = (data.toString()).split("***");
    //var from = date[1];
    loadFb(extendetPath, function(pageData) {
      loadFeed(extendetPath2, function(pageData2) {
        var fs = require('fs');
        var profileText = fs.readFileSync('/tmp/profile', 'utf8');
        console.log(profileText);
        sock.write(profileText);
      });
    });
  });
  // Add a 'close' event handler to this instance of socket
  sock.on('close', function(data) {
    console.log('CLOSED: ' + sock.remoteAddress + ' ' + sock.remotePort);
  });
}).listen(PORT);

console.log('Server listening on ' + HOST + ':' + PORT);

function returnKeyFromUser(id) {
  //text
  var trovata = false;
  var dati = JSON.parse(text);
  for (var i = 0; i < dati.friendlist.friend.length && trovata == false; i++) {
    var user = (dati.friendlist.friend[i].username).replace("\n", "");
    var userID = (id).replace("\n", "");
    if (user == userID) {
      trovata = true;
      return ((dati.friendlist.friend[i].publicKey).toString()).replace("\n", "");
    }
  }
  if (trovata == false)
    return null;
}
There is a small server that receives a Facebook username, and what it has to do is retrieve two pages:
a Graph API page with the profile information, and a Graph API page with the feed information of a Facebook profile.
Here are the other two files:
var https = require('https');

module.exports = function(path, callback) {
  var options = {
    host: 'graph.facebook.com',
    port: 443,
    path: (path.toString()).replace("\n", ""),
    method: 'GET'
  };
  var req = https.get(options, function(res) {
    var pageData = "";
    if ((path.toString()).indexOf("/") == 0 && (path.toString()).indexOf("/GET / HTTP/") != 0)
    // for load only (I hope facebook profile)
    {
      console.log(options);
      res.setEncoding('utf8');
      res.on('data', function(chunk) {
        pageData += chunk;
      });
      res.on('end', function() {
        var fs = require('fs');
        fs.writeFile("/tmp/profile", pageData, function(err) {
          if (err) {
            console.log(err);
          } else {
            console.log("The file was saved!");
          }
        });
        //callback(pageData);
        return;
      });
    }
  });
};
Third file:
var https = require('https');

module.exports = function(path, callback) {
  var options = {
    host: 'graph.facebook.com',
    port: 443,
    path: (path.toString()).replace("\n", ""),
    method: 'GET'
  };
  var req = https.get(options, function(res) {
    var pageData = "";
    if ((path.toString()).indexOf("/") == 0 && (path.toString()).indexOf("/GET / HTTP/") != 0) // for load only (I hope facebook profile)
    {
      console.log(options);
      res.setEncoding('utf8');
      res.on('data', function(chunk) {
        pageData += chunk;
      });
      res.on('end', function() {
        var fs = require('fs');
        fs.appendFile('/tmp/profile', "***" + pageData, function(err) {
          if (err) throw err;
          console.log('It\'s saved!');
        });
        callback(pageData);
      });
    }
  });
};
I don't know if there is a way to call the two files from the first file, node.js, but what I did is this (from node.js I call the first file, and from the second file I call the third).
In the node.js file I call the first file, loadFb.js, with this command:
loadFb(extendetPath, function(pageData) {
This call saves a file in my /tmp directory, and inside its callback I call the other file, loadFeed, which appends some text.
After that I have to send the entire information to the client, but I have a mistake.
node.js correctly calls loadFb, which writes /tmp/profile, and then calls loadFeed,
but before the information is appended, node sends back to the client only half of the information I need.
I'm not a good Node.js programmer; this is work for my thesis.
Can someone help me?
Let's look at the following code:
res.on('end', function() {
  var fs = require('fs');
  fs.appendFile('/tmp/profile', "***" + pageData, function(err) {
    if (err) throw err;
    console.log('It\'s saved!');
  });
  callback(pageData);
});
What it does is run the asynchronous method appendFile and call callback immediately afterwards. So when the code in the callback is executed, the file has not been updated yet. You need to move callback(pageData); into appendFile's callback. And you should review your code with this in mind, because the same fix is needed in the other file, so there may be similar places as well.
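Applied to the snippet above, the fix looks like this:

res.on('end', function() {
  var fs = require('fs');
  fs.appendFile('/tmp/profile', "***" + pageData, function(err) {
    if (err) throw err;
    console.log('It\'s saved!');
    // The file now contains the appended data, so it is safe
    // for the caller to read /tmp/profile and answer the client.
    callback(pageData);
  });
});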
I am very new to node-webkit. I am using the following code to download a file. How would I go about running the file automatically once the download has finished?
var https = require('https');
var fs = require('fs');
var file = fs.createWriteStream("update_setup.exe");
var request = https.get(url + "/appdata/update_setup.exe", function (response) {
  response.pipe(file);
});
Just use the writable stream's close event and spawn a child process. The event will fire once the response has completed piping to the stream.
var https = require('https');
var fs = require('fs');
var exec = require('child_process').exec;

var file = fs.createWriteStream('update_setup.exe');
// "path" here is the download URL (url + "/appdata/update_setup.exe" in your code).
var request = https.get(path, function(res) {
  res.pipe(file);
});

file.on('close', function() {
  exec('update_setup.exe', function(err, stdout, stderr) {
    // output from starting
  });
});
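Note that exec runs the command through the shell, which resolves names against the current working directory and PATH. If the spawned command can't be found, a defensive variant (my addition, not part of the original answer) resolves the file explicitly and uses execFile:

var execFile = require('child_process').execFile;
var resolve = require('path').resolve;

file.on('close', function() {
  // Resolve relative to this script so the process's cwd doesn't matter.
  execFile(resolve(__dirname, 'update_setup.exe'), function(err, stdout, stderr) {
    if (err) console.log(err);
  });
});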