Memory usage with Node.js, Jsdom, HttpAgent - javascript

I have made a scraping script that navigates through a blog in order to collect all the titles. The problem is that Node keeps using more and more memory as the script runs (it processes thousands of URLs), until it reaches 8 GB (the maximum) and the script crashes.
My script uses loops; is there a simple way to free memory?
Here is a code example:
var request = require('request'),
    httpAgent = require('http-agent'),
    jsdom = require('jsdom').jsdom,
    myWindow = jsdom().createWindow(),
    $ = require('jquery'),
    jq = require('jquery').create(),
    jQuery = require('jquery').create(myWindow),
    profiler = require('v8-profiler');

profiler.startProfiling();

request({ uri: 'http://www.guylabbe.ca' }, function (error, response, body) {
    if (error && response.statusCode !== 200) {
        console.log('Error when contacting URL');
    }
    var last_page_lk = $(body).find('.pane-content .pager li:last-child a').attr('href');
    var nb_pages = last_page_lk.substring(last_page_lk.indexOf('=') + 1);
    var page_lk_base = last_page_lk.substring(0, last_page_lk.indexOf('='));
    var pages = [];
    pages.push(page_lk_base);
    for (var i = 1; i <= nb_pages; i++) {
        pages.push(page_lk_base + '=' + i);
    }
    // parse the listing pages
    var fiches = [];
    var agent2 = httpAgent.create('www.guylabbe.ca', pages);
    agent2.addListener('next', function (err, agent2) {
        var snapshot = profiler.takeSnapshot();
        $(body).find('.view span.field-content span.views-field-title').each(function () {
            fiches.push($(body).find(this).parents('a').attr('href'));
            //console.log($(body).find(this).html());
        });
        agent2.next();
    });
    agent2.start();
    agent2.addListener('stop', function (agent) {
        console.log('-------------------------------- (done collecting the item URLs) --------------------------------');
        // parse the item pages
        var agent_fiches = httpAgent.create('www.guylabbe.ca', fiches);
        agent_fiches.addListener('next', function (err, agent_fiches) {
            console.log('log info');
            agent_fiches.next();
        });
        agent_fiches.start();
        agent_fiches.addListener('stop', function (agent) {
            console.log('-------------------------------- Done! --------------------------------');
        });
        agent_fiches.addListener('start', function (agent) {
            console.log('-------------------------------- Starting... --------------------------------');
        });
    });
});

Explicitly null your variables when you don't need them anymore. If you create variables outside a closure and use them inside the closure, you should set them to null once you are done with them. See this thread and read the accepted answer: How to prevent memory leaks in node.js?
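For illustration, here is a minimal sketch of that idea (not the original script; processPage is a hypothetical helper): drop references to large values such as the response body once you are done with them, so surrounding closures do not keep them alive.

var request = require('request');

function processPage(body) {
    // hypothetical helper: extract the titles you need from `body` here
}

request({ uri: 'http://www.guylabbe.ca' }, function (error, response, body) {
    if (!error && response.statusCode === 200) {
        processPage(body);
    }
    // Explicitly drop references to the large response body so it is not
    // retained by any closures that outlive this callback.
    body = null;
    response = null;
});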

I had a similar issue with jsdom leaking memory.
In my case, closing the jsdom window once I was done with it solved the problem.
Maybe you should add myWindow.close() after you're done scraping with it.
See related answer https://stackoverflow.com/a/6891729/1824928
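For example, a sketch of how that could look if you create a fresh jsdom window per fetched page (illustration only, using the same old jsdom/jquery APIs as the question's code):

var jsdom = require('jsdom').jsdom;
var jqueryFactory = require('jquery');

function scrapeTitles(html) {
    var window = jsdom(html).createWindow();
    var $ = jqueryFactory.create(window);
    var hrefs = $('.views-field-title a').map(function () {
        return $(this).attr('href');
    }).get();
    window.close(); // releases the document so its DOM tree can be garbage collected
    return hrefs;
}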

Related

local PDF file scraping in node.js

I have uploaded a pdf via a MEAN stack web application using fs. I want to extract certain fields from the pdf and display them on the web app. I have looked at a couple of npm packages like pdf.js and pdf2json, but I can't figure out the documentation and the javascript callbacks used in the examples available. Please help!
I hope I can help answer your question. pdf2json can be used to parse a pdf and extract the text. There are a couple of steps that need to be taken to get it working; I have adapted the example from https://github.com/modesty/pdf2json.
The setup is to install pdf2json in the node app, along with underscore. The example page didn't explain the need to define your own callback functions, and it used self instead of this to register them. So, with the appropriate changes, the code to extract all the text from the pdf will be something like this:
// Get the dependencies that have already been installed
// to ./node_modules with `npm install <dep>` in the root directory
// of your app
var _ = require('underscore'),
    PDFParser = require('pdf2json');

var pdfParser = new PDFParser();

// Create a function to handle the pdf once it has been parsed.
// In this case we cycle through all the pages, extract
// all the text blocks and print them to the console.
// If you do `console.log(JSON.stringify(pdf))` you will
// see how the parsed pdf is composed. Drill down into it
// to find the data you are looking for.
var _onPDFBinDataReady = function (pdf) {
    console.log('Loaded pdf:\n');
    for (var i in pdf.data.Pages) {
        var page = pdf.data.Pages[i];
        for (var j in page.Texts) {
            var text = page.Texts[j];
            console.log(text.R[0].T);
        }
    }
};

// Create an error handling function
var _onPDFBinDataError = function (error) {
    console.log(error);
};

// Use underscore to bind the data ready function to the pdfParser
// so that when the data ready event is emitted your function will
// be called. As opposed to the example, I have used `this` instead
// of `self` since self had no meaning in this context
pdfParser.on('pdfParser_dataReady', _.bind(_onPDFBinDataReady, this));

// Register the error handling function
pdfParser.on('pdfParser_dataError', _.bind(_onPDFBinDataError, this));

// Construct the file path of the pdf
var pdfFilePath = 'test3.pdf';

// Load the pdf. When it is loaded your data ready function will be called.
pdfParser.loadPDF(pdfFilePath);
I am running the code out of my server side controller.
// Dependencies assumed to be required at the top of the controller module
var fs = require('fs'),
    _ = require('underscore'),
    PDFParser = require('pdf2json');

module.exports = (function () {
    return {
        add: function (req, res) {
            var tmp_path = req.files.pdf.path;
            var target_path = './uploads/' + req.files.pdf.name;
            fs.rename(tmp_path, target_path, function (err) {
                if (err) throw err;
                // delete the temporary file, so that the explicitly set
                // temporary upload dir does not get filled with unwanted files
                fs.unlink(tmp_path, function () {
                    if (err) throw err;
                    // edit here: pdf parser
                    res.redirect('#/');
                });
            });
        },
        show: function (req, res) {
            var pdfParser = new PDFParser();
            var _onPDFBinDataReady = function (pdf) {
                console.log('Loaded pdf:\n');
                for (var i in pdf.data.Pages) {
                    var page = pdf.data.Pages[i];
                    // console.log(page.Texts);
                    for (var j in page.Texts) {
                        var text = page.Texts[j];
                        // console.log(text.R[0].T);
                    }
                }
                console.log(JSON.stringify(pdf));
            };
            // Create an error handling function
            var _onPDFBinDataError = function (error) {
                console.log(error);
            };
            pdfParser.on('pdfParser_dataReady', _.bind(_onPDFBinDataReady, this));
            // Register the error handling function
            pdfParser.on('pdfParser_dataError', _.bind(_onPDFBinDataError, this));
            // Construct the file path of the pdf
            var pdfFilePath = './uploads/Invoice_template.pdf';
            // Load the pdf. When it is loaded your data ready function will be called.
            pdfParser.loadPDF(pdfFilePath);
        }
        // end controller
    };
})();

Node.js Recursive Loop fail

Below is the Node.js script. It downloads the images contained in a div. The loop works fine for 9.86% of the run, that is, up to id = 36. When id > 36 it exits the loop. I am using node version 0.12. The loop needs to run 365 times before completion. I am using the method of recursive callbacks.
Code:
//Required modules
var fs = require('fs'),
    cheerio = require('cheerio'),
    request = require('request');

//Default Variables
var baseURI = 'http://www.website/';
var year = 2013;
var id = 1;
var savePath = process.argv[2];

//Download Function
var download = function (uri, filename, callback) {
    request({ uri: uri }, function (err, res, body) {
        var $ = cheerio.load(body);
        var imgDiv = $('#img-wallpaper').children()['0'];
        if (err)
            console.err(err);
        if (typeof imgDiv !== 'undefined') {
            request(imgDiv.attribs.src).pipe(fs.createWriteStream(filename)).on('close', callback);
        }
    });
};

//Main Function
console.log("Downloading . . .");

// Loop function to create a recursive effect
(function loop() {
    download(baseURI + year + '/' + id + '/wallpaper/', savePath + id + '.jpg',
        function () {
            console.log(((id / 365) * 100).toFixed(2) + '% completed');
            if (id == 330)
                year = "2014";
            if (((id / 365) * 100) != 100) {
                id = id + 1;
                loop();
            }
        });
})(1)
Do I understand correctly that if you set the starting value of id to more than 35 (36?), the script does not download any images?
Test the script with a fixed uri and a fixed image, changing only the variables; the script should then work. If that is the case, one of the following is happening:
the callback for the page request is not being called;
the condition typeof imgDiv !== 'undefined' is false;
the callback for the image request is not being called.
When an error occurs at one of these points the script stops working, so you need to make the checks at these points stricter.
As #stdob said, the error was caused by
the condition typeof imgDiv !== 'undefined' being false.
The fix below is not the right way to overcome the error; it is more of a hack. It ignores the error and continues the script:
if (typeof imgDiv !== 'undefined') {
    request(imgDiv.attribs.src).pipe(fs.createWriteStream(filename)).on('close', callback);
} else {
    id++;
    request(uri).pipe(fs.createWriteStream(filename)).on('close', callback);
}
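A slightly cleaner variant (my sketch, not part of the original answer) skips the missing image by calling the callback directly, instead of piping the page HTML into a .jpg file:

if (typeof imgDiv !== 'undefined') {
    request(imgDiv.attribs.src).pipe(fs.createWriteStream(filename)).on('close', callback);
} else {
    // No wallpaper div on this page: skip it without writing anything to disk.
    id++;
    callback();
}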

node.js never exits after insert to couchbase, opposite of most node questions

My problem seems to be the opposite of every node.js question :-) I have a simple forEach loop to read a list of files and insert them into a Couchbase database. This works great, but it never exits after reading all the lines. So I added a counter to shutdown the couchbase connection after all inserts are complete. This works.
This process is intended to load hundreds of thousands of files, so I brought the async module into the mix to batch the inserts into groups of 100. async.eachLimit is used to iterate over the array and insert documents in batches. Now the original problem is back: whatever magic async.eachLimit uses to recognize that the process is complete is not happening.
I've been reading up on javascript scoping, callbacks, async, etc. Google searches hit the keywords but not this issue. I've reduced the code down to the following test case. To test, create three files and add their names to testlist.txt.
The async.eachLimit call in place works up until it hits the limit, then hangs. Comment it out and uncomment the array.forEach line and it works. Thanks in advance!
var fs = require('fs');
var couchbase = require('couchbase');
var async = require('async');

var filelist = 'testlist.txt';
var key_count = 0;
var cb_config = { host: 'localhost:8091', bucket: 'default' };

var db = new couchbase.Connection(cb_config, function (err) {
    if (err) {
        console.log('ERRR connect to couchbase at config[' + cb_config + ']');
        throw err;
    }
});

var insertFile = function (line) {
    console.log('LOAD [' + line + ']');
    fs.readFile(line, function (file_err, f_doc) {
        if (file_err) throw file_err;
        db.set(line, f_doc, function (db_err, db_res) {
            if (db_err) {
                console.log('FAIL [' + line + '] err[' + db_err + ']');
            } else {
                console.log('PASS [' + line + ']');
            }
            key_count--;
            if (key_count == 0) {
                console.log('DONE Shutting down client, no more keys');
                db.shutdown();
            }
        });
    });
}

// read list of files into data array from file filelist
fs.readFile(filelist, function (filelist_err, lines) {
    if (filelist_err) throw filelist_err;
    // HACK split adds empty line to array, use replace to fix
    var array = lines.toString().replace(/\n$/, '').split('\n');
    key_count = array.length;
    console.log('INIT lines[' + key_count + ']');
    async.eachLimit(array, 2, insertFile, function (err) { console.log('FAIL async err[' + err + ']'); });
    //array.forEach(function(data){insertFile(data);return;});
});
Testcase output using array.forEach:
INIT lines[3]
LOAD [files.big.txt]
LOAD [files.little.txt]
LOAD [files.txt]
PASS [files.little.txt]
PASS [files.big.txt]
PASS [files.txt]
DONE Shutting down client, no more keys
Testcase output using async.eachLimit:
INIT lines[3]
LOAD [files.big.txt]
LOAD [files.little.txt]
PASS [files.little.txt]
PASS [files.big.txt]
... hang, never gets to 3...
After review with a coworker, they spotted my mistake. I missed the async callback in my insertFile function. Adding that in works and allows me to remove the key counter! Solution code below:
var fs = require('fs');
var couchbase = require('couchbase');
var async = require('async');

var filelist = 'testlist.txt';
var key_count = 0;
var cb_config = { host: 'localhost:8091', bucket: 'default' };

var db = new couchbase.Connection(cb_config, function (err) {
    if (err) {
        console.log('ERRR connect to couchbase at config[' + cb_config + ']');
        throw err;
    }
});

var insertFile = function (line, callback) {
    console.log('LOAD [' + line + ']');
    fs.readFile(line, function (file_err, f_doc) {
        if (file_err) throw file_err;
        db.set(line, f_doc, function (db_err, db_res) {
            if (db_err) {
                console.log('FAIL [' + line + '] err[' + db_err + ']');
                callback(db_err);
            } else {
                console.log('PASS [' + line + ']');
                callback();
            }
        });
    });
}

// read list of files into data array from file filelist
fs.readFile(filelist, function (filelist_err, data) {
    if (filelist_err) throw filelist_err;
    // HACK split adds empty line to array, use replace to fix
    var array = data.toString().replace(/\n$/, '').split('\n');
    key_count = array.length;
    console.log('READ files[' + key_count + ']');
    async.eachLimit(array, 2, insertFile, function (err) {
        if (err) console.log('LAST with async err[' + err + ']');
        console.log('DONE Shutting down client, no more keys');
        db.shutdown();
    });
});
And successful output:
$ node testcase.js
READ files[3]
LOAD [files.big.txt]
LOAD [files.little.txt]
PASS [files.little.txt]
LOAD [files.txt]
PASS [files.big.txt]
PASS [files.txt]
DONE Shutting down client, no more keys

Requesting RSS feeds from two web sites in Node.JS

I have got a Node.JS server that requests data from two web servers: bbc.co.uk and sky.com. The RSS feeds are then parsed, and the user sees two lists: one from the BBC and one from Sky.
Here is the code.
var feed = require('feed-read');
var http = require('http');
var async = require('async');
var request = require('request');

var LIMIT = 10;
var UNABLE_TO_CONNECT = "Unable to connect.";
var BBC_URL = 'http://feeds.bbci.co.uk/news/rss.xml';
var SKY_URL = 'http://news.sky.com/feeds/rss/home.xml';

var server = http.createServer(onRequest);
server.listen(9000);

function onRequest(req, res) {
    res.writeHead(200, {
        'Content-Type' : 'text/html; charset=utf-8'
    });
    async.parallel([ function (callback) {
        feed(BBC_URL, onRssFetched);
        // TODO: where to call callback()?
    }, function (callback) {
        feed(SKY_URL, onRssFetched);
        // TODO: where to call callback()?
    } ], function done(err, results) {
        console.log("Done");
        if (err) {
            throw err;
        }
    });
}

function onRssFetched(err, articles) {
    console.log("RSS fetched");
    var html = [];
    if (err) {
        html.push("<p>", UNABLE_TO_CONNECT, "</p>");
    } else {
        html.push("<ol>");
        var i = 0;
        articles.forEach(function (entry) {
            if (i == LIMIT) {
                return;
            }
            html.push("<li><a href='" + entry.link + "'>" + entry.title + "</a></li>");
            i++;
        });
    }
    console.log(html.join(""));
}
Now I don't know how to add the result to the web page. If I call callback() right after calling the feed method, callback() will be executed without waiting until feed has completed its job. On the other hand, I can't pass callback to feed. Maybe the approach is wrong, and I need some other module for RSS parsing.
@Maksim I know your original question included the async module, but let me propose an alternative:
why not stream each article to the client as it comes in, rather than waiting for all the RSS feeds to return before sending a response?
By using async.parallel you are telling node:
"wait until we have a response from all these news services,
and only then combine the articles into a single response to the client ..."
This uses up memory for each connected client while you wait for all the responses (from the RSS news services), which is wasteful.
So I've written my answer without resorting to async.
And, instead of waiting for ages (while async combines all the feeds into one),
the client sees news as soon as the first rss feed returns!
var feed = require('feed-read'), // require the feed-read module
    http = require("http"),
    urls = [
        "http://feeds.bbci.co.uk/news/rss.xml",
        "http://news.sky.com/feeds/rss/home.xml",
        "http://www.techmeme.com/feed.xml"
    ]; // Example RSS Feeds

http.createServer(function (req, res) {
    // send basic http headers to client
    res.writeHead(200, {
        "Content-Type": "text/html",
        "Transfer-Encoding": "chunked"
    });
    // setup simple html page:
    res.write("<html>\n<head>\n<title>RSS Feeds</title>\n</head>\n<body>");
    // loop through our list of RSS feed urls
    for (var j = 0; j < urls.length; j++) {
        // fetch rss feed for the url:
        feed(urls[j], function (err, articles) {
            // loop through the list of articles returned
            for (var i = 0; i < articles.length; i++) {
                // stream article title (and what ever else you want) to client
                res.write("<h3>" + articles[i].title + "</h3>");
                // check we have reached the end of our list of articles & urls
                if (i === articles.length - 1 && j === urls.length - 1) {
                    res.end("</body>\n</html>"); // end http response
                } // else still have rss urls to check
            } // end inner for loop
        }); // end call to feed (feed-read) method
    } // end urls for loop
}).listen(9000);
Key Advantages:
The people connecting to your app will see news/results much faster (almost instantly!)
Your app uses much less memory
You don't have to edit/update any code when you add new RSS news feeds!
For even more detail/notes on this solution
see: https://github.com/nelsonic/node-parse-rss
No, you don't need another library. But what you need to do is hand over callback to your feed function instead of onRssFetched. This way the individual RSS feeds are handed over to the final callback in your async.parallel call, via the result variable.
In this variable you then have access to both RSS feeds at the same time, and you can do whatever you want to do with them.
So, basically your logic needs to be:
async.parallel({
    bbc: function (callback) {
        feed(BBC_URL, callback);
    },
    sky: function (callback) {
        feed(SKY_URL, callback);
    }
}, function (err, result) {
    if (err) {
        // Somewhere, something went wrong…
    }
    var rssBbc = result.bbc,
        rssSky = result.sky;
    // Merge the two feeds or deliver them to the client or do
    // whatever you want to do with them.
});
And that's it :-).
To amplify #nelsonic's answer (enough that I feel this warrants its own answer), feed-read already processes asynchronously. At its heart, it's still running on http.request. If you look at the code, you see that you can even pass in an array of URLs directly and it will loop through them, but it uses more of an "async.eachSeries" approach, where the next call only occurs after the previous one completes, which appears not to be what you're looking for.
If you truly want to wait for calls to complete first before handling them, you're better off asynchronously buffering the data, then using underscore's _.after() to run after all URLs have finished.
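A minimal sketch of that buffering approach (my illustration, not code from the question or the other answers), assuming the same feed-read API:

var _ = require('underscore'),
    feed = require('feed-read');

var urls = [
    'http://feeds.bbci.co.uk/news/rss.xml',
    'http://news.sky.com/feeds/rss/home.xml'
];

var results = [];

// _.after(n, fn) returns a wrapper that only invokes fn on the n-th call,
// i.e. once every feed has reported back (successfully or not).
var allDone = _.after(urls.length, function () {
    console.log('All feeds fetched, got ' + results.length + ' results');
    // merge/render `results` here
});

urls.forEach(function (url) {
    feed(url, function (err, articles) {
        results.push({ url: url, err: err, articles: articles || [] });
        allDone();
    });
});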
But odds are, what you really want to do (unless you're just looking for an example to try out async) is #nelsonic's answer.
I would ideally stream the rss data instead of aggregating it in memory; #nelsonic has explained the correct approach to solve this problem.
Still, if we wanted to make your code run as written, consider the following:
var util = require('util');
var http = require('http');
var async = require('async');
var feed = require('feed-read');
var request = require('request');

var LIMIT = 10;
var UNABLE_TO_CONNECT = 'Unable to connect.';
var BBC_URL = 'http://feeds.bbci.co.uk/news/rss.xml';
var SKY_URL = 'http://news.sky.com/feeds/rss/home.xml';

var server = http.createServer(onRequest);
server.listen(9000);

function onRequest(req, res) {
    util.log('Request received!');
    res.writeHead(200, {
        'Content-Type': 'text/html; charset=utf-8'
    });
    async.parallel({
        bbc: function (callback) {
            feed(BBC_URL, function (err, articles) {
                var html = onRssFetched(err, articles);
                callback(err, html);
            });
        },
        sky: function (callback) {
            feed(SKY_URL, function (err, articles) {
                var html = onRssFetched(err, articles);
                callback(err, html);
            });
        }
    }, done);

    function done(err, results) {
        util.log('Received results: ' + Object.keys(results).join(','));
        if (!err && results) {
            var entry, html;
            for (entry in results) {
                html = results[entry];
                res.write(html.join(''));
            }
            util.log('Send complete!');
            res.end();
        } else {
            console.log(err || 'no data in results');
            res.end('Unable to process your request');
        }
    }
}

function onRssFetched(err, articles) {
    // limit the number of articles
    articles = articles.slice(0, LIMIT);
    var html = [];
    if (err) {
        html.push('<p>', UNABLE_TO_CONNECT, '</p>');
    } else {
        html.push('<ol>');
        articles.forEach(function (entry) {
            html.push('<li>' + entry.title + '</li>');
        });
        html.push('</ol>');
    }
    return html;
}

// -- Test Code ---------------------------------------------------------
if (require.main === module) {
    (function () {
        var req, res = {
            writeHead: console.log,
            write: console.log,
            end: console.log
        };
        // onRequest(req, res);
    })();
}
Let me know if you face any issues.

Node.js - Organising code and closures - SFTP/Inotify

I was hoping I could get some advice on why my nodejs program is behaving in the way it is.
I am using two modules, node-sftp and node-inotify. I have setup node-inotify to watch a directory and call a function when something is written there, the function being an sftp upload.
Now the problem I have is that processing one file at a time is fine, but when I drop 4 files there in one go, the function is called four times yet only one sftp upload goes through.
Do I need to order my code in a particular way to ensure that the sftp upload occurs x times? Is this something to do with closures perhaps?
This is a basic version of my code...
"event_handler" is called when something happens on a "watched" directory
"check_event" figures out if this type of event is one we want, in this case it's a "write"
"ftp_to_server" prepare connection details
"do_ftp" basically uses the node-sftp module to perform the sftp upload
event_handler = function (event) {
    var supplier;
    check_event(event, supplier, type, ftp_to_server);
};

=================

function check_event(event, handler)
{
    if (event.type === 'xxxxxx') {
        var file_to_process_name = 'abc';
        var file_to_process_dir = 'abc';
        var remote_dir = 'abc';
        handler(file_to_process_name, file_to_process_dir, remote_dir);
    }
}

function ftp_to_server(file_to_process_name, file_to_process_dir, remote_dir) {
    var connection_details = conf.ftp.connections;
    do_ftp(connection_details, file_to_process_name, file_to_process_dir, remote_dir);
}

function do_ftp(connection_details, file_to_process_name, file_to_process_dir, remote_dir) {
    var credentials = {
        // FTP settings here
    };
    var local_file = file_to_process_dir + file_to_process_name;
    var remote_file = remote_dir + file_to_process_name;
    connection = new sftp(credentials, function (err) {
        if (err) {
            throw err;
        }
        connection.writeFile(remote_file, fs.readFileSync(local_file, "utf8"), null, function (err) {
            if (err) {
                throw err;
            }
            console.info('FTP PUT DONE');
        });
    });
};
Your "connection = new sftp(credentials, function(err) {"
should be
var connection = new sftp(credentials, function(err) {
The way you currently have it coded, "connection" is a global and you are writing over it.
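For clarity, here is do_ftp from the question with that one-word fix applied (a sketch based on the question's code): each call now gets its own local connection instead of sharing an implicit global, so four concurrent uploads no longer overwrite each other's connection object.

function do_ftp(connection_details, file_to_process_name, file_to_process_dir, remote_dir) {
    var credentials = {
        // FTP settings here
    };
    var local_file = file_to_process_dir + file_to_process_name;
    var remote_file = remote_dir + file_to_process_name;
    // `var` keeps the connection scoped to this call
    var connection = new sftp(credentials, function (err) {
        if (err) {
            throw err;
        }
        connection.writeFile(remote_file, fs.readFileSync(local_file, "utf8"), null, function (err) {
            if (err) {
                throw err;
            }
            console.info('FTP PUT DONE');
        });
    });
}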
