Download a file from web using Node js and loop - javascript

I want to download multiple files from the web using this code:
var fs = require('fs');
var http = require('http');
var request = require('request');
var file;
// BUG: `var i` is function-scoped, so every async callback below closes over
// the SAME `i`, which is already 6 by the time any response arrives.
// Likewise `file` is a single variable shared by all five downloads.
// That is why only "filename5.jar" (the last value of i) is ever written --
// the answers below fix this with a wrapper function / `let`.
for(var i = 1; i <= 5; i++) {
//CHECK IF REMOTE FILE EXISTS
request('http://webaddress.com/filename' + i + '.jar', function (err, resp) {
//IF EXISTS DO
if (resp.statusCode == 200) {
//DOWNLOAD DATA AND CREATE A NEW .JAR FILE
file = fs.createWriteStream('D:\\filename' + i + '.jar');
http.get('http://webaddress.com/filename' + i + '.jar', function(response) {
response.pipe(file);
file.on('finish', function() {
file.close();
});
});
}
//FILE DOES NOT EXIST
});
}
The result I want is: multiple files downloaded with filenames filename1-5.jar. The result I am getting is just 1 file with filename filename5.jar (or the last value of the i var in the loop). What am I doing wrong?

As #Ionut said, your requests are asynchronous, so you need to wait for them:
const fs = require('fs');
const request = require('request');

// Resolve once the file at `uri` has been fully written to `filename`;
// reject with the HTTP status code when the resource is not available.
function download(uri, filename) {
return new Promise(function (resolve, reject) {
request.head(uri, function (err, res) {
if (res.statusCode !== 200) {
reject(res.statusCode);
return;
}
request(uri)
.pipe(fs.createWriteStream(filename))
.on('close', resolve);
});
});
}

// Kick off all five transfers in parallel and exit once every one is done.
const jobs = [];
for (let n = 1; n <= 5; n++) {
jobs.push(download('http://webaddress.com/filename' + n + '.jar', 'D:\\filename' + n + '.jar'));
}
Promise.all(jobs).then(() => {
process.exit(0);
});

Your request is asynchronous and it will execute only after your loop finishes, hence the 5 in the filename. A solution for this is to treat your code separately by creating a new function and calling it inside the loop:
var fs = require('fs');
var http = require('http');
var request = require('request');

/**
 * Check whether filename<i>.jar exists on the server and, if so, stream it
 * to D:\filename<i>.jar. Wrapping the work in a function gives every request
 * its own copy of `i` (and its own write stream), so concurrent downloads
 * no longer clobber each other.
 */
function customRequest(i){
//CHECK IF REMOTE FILE EXISTS
return request('http://webaddress.com/filename' + i + '.jar', function(err, resp) {
//IF EXISTS DO
if (resp.statusCode == 200) {
//DOWNLOAD DATA AND CREATE A NEW .JAR FILE
// FIX: `file` was previously a single module-level variable shared by all
// five in-flight downloads, so later requests overwrote the stream of
// earlier ones. Declaring it here scopes one stream per request.
var file = fs.createWriteStream('D:\\filename' + i + '.jar');
http.get('http://webaddress.com/filename' + i + '.jar', function(response) {
response.pipe(file);
file.on('finish', function() {
file.close();
});
});
}
//FILE DOES NOT EXIST
});
}
for (var i = 1; i <= 5; i++) {
customRequest(i)
}

Related

Server crashing when downloading big files from url with NodeJS

I'm trying to download a file (+200mb) from an url (that requires to be logged in) using the request module from nodejs but when it finishes the download the server starts to slow down until it crashes or gets really slow.
here's my current code (it downloads the whole file but my server crashes eventually):
//Required modules
// Required modules.
// NOTE: `request` appears twice in this var chain on purpose -- the second
// declarator re-binds it to a cookie-jar-aware instance so the login session
// persists across later requests.
var http = require('http'),
url = require("url"),
fs = require('fs'),
request = require('request'),
path = require("path"),
events = require("events"),
j = request.jar(),
request = request.defaults({ jar : j });
// Make the login request; session cookies are kept in the jar above.
console.log("downloading file :)");
request({
url:"http://example.com/",
method:"POST",
form:{u: "username",p: "password"}
},
function(error,response,body){
// NOTE(review): a fixed 1-second setTimeout before downloading is fragile --
// if the server is slow the session may not be usable yet; confirm whether
// waiting for the login response alone is sufficient.
setTimeout(function(){
request
.get('http://example.com/test.ashx?file=15')
.on('error', function(err) {
console.log(err);
})
// Stream the response straight to disk so the 200+ MB body is never
// buffered in memory.
.pipe(fs.createWriteStream("/var/www/filesDir/CustomName.zip"));
console.log(body);
},1000)
}
);
I've tried applying another solution from this answer, but for some reason the file is not being downloaded properly; it only shows "Download progress: 0 bytes" all the time. Maybe it's something related to the login access.
here I put the other code I'm trying to implement from the last sentence:
var http = require('http');
var fs = require('fs');
var url = require("url");
var request = require('request');
var path = require("path");
var events = require("events");
var j = request.jar();
var request = request.defaults({ jar : j });

// Log in first so the session cookie ends up in the jar, then fetch the file
// with a plain http.request and stream it to disk while logging progress.
request({
url:"http://example.com/",
method:"POST",
form:{u:"username",p:"password"}
}, function(error,response,body){
var downloadfile = "http://example.com/test.ashx?file=15";
var parsed = url.parse(downloadfile);
var filename = "1977.zip";
// FIX: the original options omitted `path`, so the GET fetched the host
// root ("/") instead of the target resource -- one reason the progress
// counter never moved. NOTE(review): plain http.request() does not send
// the cookies collected by `request`'s jar; if the download needs the
// login session, a Cookie header must be added explicitly -- confirm.
var req = http.request({port: 80, host: parsed.hostname, path: parsed.path, method: 'GET'});
console.log("Downloading file: " + filename);
console.log("Before download request");
req.end();
// FIX: `dlprogress` was an implicit global; declare it locally.
var dlprogress = 0;
// FIX: keep the timer id so it can be cleared -- the original interval ran
// forever, which kept logging (and the process) alive after the download.
var progressTimer = setInterval(function () {
console.log("Download progress: " + dlprogress + " bytes");
}, 1000);
req.addListener('response', function (response) {
var downloadfile = fs.createWriteStream(filename, {'flags': 'a'});
console.log("File size " + filename + ": " + response.headers['content-length'] + " bytes.");
response.addListener('data', function (chunk) {
dlprogress += chunk.length;
// FIX: `encoding='binary'` leaked another implicit global, and the
// encoding argument is ignored for Buffer chunks anyway.
downloadfile.write(chunk);
});
response.addListener("end", function() {
downloadfile.end();
clearInterval(progressTimer);
console.log("Finished downloading " + filename);
});
});
}
);
It doesn't matter which way you decide to help me with.
I ended up doing it like this, I've tested the code multiple times and the server didn't crash anymore:
var request = require('request');
var filed = require('filed');

// Keep the session cookie from the login POST in a jar that every later
// request reuses automatically.
var cookieJar = request.jar();
var request = request.defaults({ jar : cookieJar });

// make the request and login; the download itself starts a moment later.
request({
url: "http://example.com/login",
method: "POST",
// 'u' and 'p' are the field names on the form
form: { u: "username", p: "password" }
}, function (error, response, body) {
setTimeout(startDownload, 3000);
});

// Stream the remote zip straight to disk through filed().
function startDownload() {
var downloadURL = 'http://example.com/download/file.zip';
var downloadPath = "/path/to/download/localNameForFile.zip";
var sink = filed(downloadPath);
var piped = request(downloadURL).pipe(sink);
piped.on('data', function (data) {
console.log('binary data received');
});
sink.on('end', function () {
console.log(downloadPath, 'file downloaded to path');
});
sink.on('error', function (err) {
console.log(err, 'error downloading file');
});
}

node.js webtorrent collect all files by magnet link

I have not used node.js before.
Have a .txt file with list of magnet links.
Want to write a json file with list of all files contained in these links.
var WebTorrent = require('webtorrent');
var fs = require('fs');
var client = new WebTorrent();
// One magnet link per line.
var array = fs.readFileSync('yop.txt').toString().split("\n");
// BUG: `i` (and `n` / `jsonString` below) are implicit globals.
i = 0;
// BUG: client.add() is asynchronous, so `i` is never incremented before the
// loop condition is re-checked -- this loop keeps re-adding array[0]
// forever instead of walking the list.
while (i < array.length) {
//console.log(array[i]);
var magnetURI = array[i];
n = 0;
client.add(magnetURI, function (torrent) {
torrent.files.forEach(function (file) {
//console.log( file.name)
jsonString = JSON.stringify({'book': file.name});
// BUG: appendFile is also async, so `n` has usually not caught up with
// torrent.files.length when the check below runs, and concurrent appends
// may interleave. The output is also not a valid JSON document (objects
// are concatenated with no separators).
fs.appendFile("data.json", jsonString, function (err) {
if (err) {console.log(err);} else { n++ }
});
if (n == torrent.files.length) {i++ }
})
})
}
When run, it gives the following error.
Sorry for such terrible code.
var WebTorrent = require('webtorrent')
var fs = require('fs')
var stream = fs.createWriteStream("2.txt");
var client = new WebTorrent()
var array = fs.readFileSync('yop.txt').toString().split("\n");
// FIX: `i` was an implicit global; declare it.
var i = 0;

/**
 * Process the magnet link at index `i` of `uri`: append "<link>\n<file>\n"
 * to 2.txt for each file in the torrent, then recurse for the next link.
 * Links are handled strictly one at a time.
 */
function parseMagnet (uri){
// FIX: the original never checked the index, so after the last link it
// called client.add(undefined) and kept recursing forever. Stop cleanly
// once every line has been handled.
if (i >= uri.length) {
stream.end();
client.destroy();
return;
}
var magnetURI = uri[i]
console.log(magnetURI)
client.add(magnetURI, function (torrent) {
torrent.files.forEach(function (file) {
// FIX: `writeStr` was an implicit global; declare it.
var writeStr = (uri[i]+ '\n'+ file.name+ '\n');
stream.write(writeStr);
console.log(file.name)
});
console.log('Done !')
console.log(i)
// Drop the finished torrent before moving on so only one is ever active.
client.remove(magnetURI);
i += 1
parseMagnet(uri);
})
}
parseMagnet(array)

Getting hostname of URL in loop - Nodejs

So I'm currently trying to make an HTTP status code checker. But for some reason the response hostname is returning undefined, yet I can retrieve the response statusCode. Am I grabbing it properly?
var fs = require('fs');
var http = require('http');
var stdin = process.openStdin();
// The user types the path of a file containing one host name per line.
stdin.addListener('data', function (userInput) {
var urlListPath = userInput.toString().trim();
fs.readFile(urlListPath, function (err, data) {
if (err) {
return console.log(err);
}
var urlArray = data.toString().split("\n");
// Drop empty lines (e.g. the trailing newline).
urlArray = urlArray.filter(Boolean);
for (var i = 0; i < urlArray.length; i++){
http.get({hostname: urlArray[i]}, function(res){
// BUG: http.IncomingMessage has no `hostname` property, which is why
// this prints "undefined". The host is only known on the request side;
// pass it into a helper (as the accepted fix below does) so each
// callback keeps its own copy -- note all these callbacks also share
// the loop's single `var i`.
console.log(res.hostname + ' | ' + res.statusCode + ' | ' + res.statusMessage);
});
}
});
});
Solved by wrapping in a function as #bluesman suggested:
var fs = require('fs');
var http = require('http');
var stdin = process.openStdin();

// Read a path to a url list from stdin, then report the HTTP status of
// every non-empty line in that file.
stdin.addListener('data', function (userInput) {
var listFile = userInput.toString().trim();
fs.readFile(listFile, function (err, data) {
if (err) {
return console.log(err);
}
var urls = data.toString().split("\n").filter(Boolean);
for (var idx = 0; idx < urls.length; idx++) {
getHttpInfo(urls[idx]);
}
});
});

// Request the host and log "url : status | message". Passing the url in as
// a parameter keeps it paired with the matching response.
function getHttpInfo(url){
http.get({ hostname: url }, function (res) {
console.log(url + ' : ' + res.statusCode + ' | ' + res.statusMessage);
});
}

Conceptual: Create a counter in an external function

I've written some javascript to successfully download hundreds of files from an external site, using wget at the core.
After downloading all of the files, I would like to do some stuff with them. The issue is, the files aren't of equal size. So, the last wget formed isn't necessarily the last file downloaded, meaning I can't really tell when the last file has completed.
I do, however, know how many files there are in total, and the number associated with each wget.
I have 3 js files, [parseproducts.js] ==> [createurl.js] ==> [downloadurl.js]
Using this information, how can I tell when all of the files have been downloaded?
I tried creating a "ticker" function in another file but the function resets itself on each instance, so it doesn't work at all!
Edit: Code added Didn't do this initially because I didn't think people would want to trawl through it! I'm new to programming/javascript/node. Please let me know if there's something that I could do better (I'm sure most of it could be more efficient!)
parseproducts.js
var fs = require('fs');
var iset = require('./ticker.js');
var createurl = require('./createurl.js');

// FIX: `filename` was an implicit global; declare it. (The unused `array`
// variable has been removed.)
var filename = 'productlist.txt';

// Read the semicolon-separated product list and hand each part number to
// createurl(), ticking the shared counter once per entry.
fs.readFile(filename, 'utf8', function(err, data) {
if (err) throw err;
// FIX: the file content was stashed in an implicit global (`content`);
// pass it to parseFile() as an argument instead.
parseFile(data);
});

function parseFile(content) {
var stringarray = String(content).split(";");
for (var index = 0; index < stringarray.length; ++index) {
createurl(stringarray[index], index, stringarray.length);
console.log(index + '/' + stringarray.length + ' sent.');
// The ticker is initialised on the first entry and incremented afterwards.
iset(index === 0, stringarray.length);
}
}
createurl.js
/**
 * Build the (shell-quoted) icecat product-XML URL for `partnumber` and pass
 * it straight on to downloadurl.js together with the progress bookkeeping.
 */
function create(partnumber, iteration, total) {
var JSdownloadURL = require('./downloadurl.js');
// The surrounding double quotes are part of the string on purpose: the URL
// is later interpolated into a wget shell command line.
var quotedURL = '"https://data.icecat.biz/xml_s3/xml_server3.cgi?prod_id=' + partnumber + ';vendor=hp;lang=en;output=productxml"';
JSdownloadURL(quotedURL, partnumber, iteration, total);
}
module.exports = create;
downloadurl.js
// downloadurl.js -- shells out to wget to fetch one URL into ./downloads/.
function downloadurl(URL,partnumber,iteration,total) {
// Dependencies
// NOTE(review): requiring inside the function works (require() is cached)
// but these are conventionally hoisted to module scope.
var fs = require('fs');
var url = require('url');
var http = require('http');
var exec = require('child_process').exec;
var spawn = require('child_process').spawn;
var checkfiles = require('./checkfiles.js');
// App variables
var file_url = URL;
var DOWNLOAD_DIR = './downloads/';
// We will be downloading the files to a directory, so make sure it's there
var mkdir = 'mkdir -p ' + DOWNLOAD_DIR;
var child = exec(mkdir, function(err, stdout, stderr) {
if (err) throw err;
else download_file_wget(file_url);
});
// Function to download file using wget
var download_file_wget = function(file_url) {
// compose the wget command
var wget = 'wget --http-user="MyAccount" --http-password="MyPassword" -P ' + DOWNLOAD_DIR + ' ' + file_url;
// execute wget using child_process' exec function
var child = exec(wget, function(err, stdout, stderr) {
if (err) throw err;
// NOTE(review): there is no completion callback -- the caller cannot
// tell when (or whether) the download finished; see the callback-based
// rewrite further down this page.
else console.log(iteration+'/'+total+' downloaded. '+partnumber + ' downloaded to ' + DOWNLOAD_DIR);
});
};
};
module.exports = downloadurl;
Failed attempt ticker.js
// Running count of completed items. FIX: the original declared `n` inside
// counter() (`var n = n + i`), so it restarted as undefined -> NaN on every
// call and the total was never reached. The count must live at module scope.
var n = 0;

/**
 * Tick the download counter.
 * @param {boolean} bol  true on the first call (resets the count to 0),
 *                       false afterwards (adds one).
 * @param {number} total number of items expected overall.
 * @returns {number} the count after this tick (useful for logging/testing).
 */
function iset(bol, total) {
if (bol === true) {
n = 0;
} else {
n += 1;
}
return counter(total);
}

// When the last item has been counted, hand off to checkfiles; otherwise
// just report that nothing happened yet.
function counter(total) {
if (n === (total - 1)) {
var checkfiles = require('./checkfiles.js');
checkfiles(total);
} else {
console.log('nothing done');
}
return n;
}

// Guarded so the file can also be loaded in an ES-module context (where
// `module` is undefined) without throwing.
if (typeof module !== 'undefined') {
module.exports = iset;
}
Update In response to answer
This is what my code looks like now. However, I get the error
child_process.js:945
throw errnoException(process._errno, 'spawn');
^
Error: spawn EMFILE
// Dependencies
var fs = require('fs');
var url = require('url');
var http = require('http');
var exec = require('child_process').exec;
var spawn = require('child_process').spawn;
var checkfiles = require('./checkfiles.js');

/**
 * Download `URL` with wget into ./downloads/ and report the outcome through
 * the `clb(err)` callback.
 * FIX: the original ran `exec(mkdir, ...)` TWICE -- the first copy called
 * download_file_wget(), which no longer exists in this version (a leftover
 * from the pre-callback code), so every successful mkdir threw a
 * ReferenceError, and each call spawned twice as many child processes as
 * needed (a direct contributor to the `spawn EMFILE` crash).
 */
function downloadurl(URL,partnumber,iteration,total,clb) {
// App variables
var file_url = URL;
var DOWNLOAD_DIR = './downloads/';
// We will be downloading the files to a directory, so make sure it's there
var mkdir = 'mkdir -p ' + DOWNLOAD_DIR;
var child = exec(mkdir, function(err, stdout, stderr) {
if (err) {
clb(err);
} else {
var wget = 'wget --http-user="amadman114" --http-password="Chip10" -P ' + DOWNLOAD_DIR + ' ' + file_url;
// execute wget using child_process' exec function
var child = exec(wget, function(err, stdout, stderr) {
if (err) {
clb(err);
} else {
console.log(iteration+'/'+total+' downloaded. '+partnumber + ' downloaded to ' + DOWNLOAD_DIR);
clb(null); // <-- you can pass more args here if you want, like result
// as a general convention callbacks take a form of
// callback(err, res1, res2, ...)
}
});
}
});
};

// NOTE(review): this driver still references URL/partnumber/total from an
// outer scope not shown here, and `iternation` looks like a typo for
// `iteration` -- confirm against the caller. EMFILE also means too many
// concurrent children: consider launching downloads in smaller batches
// rather than all LIMIT at once.
function clb() {
var LIMIT = 100,
errs = [];
for (var i = 0; i < LIMIT; i++) {
downloadurl(URL,partnumber,iternation,total, function(err) {
if (err) {
errs.push(err);
}
LIMIT--;
if (!LIMIT) {
finalize(errs);
}
});
}
}

// Called once every download has settled; receives the collected errors.
function finalize(errs) {
// you can now check for err
//or do whatever stuff to finalize the code
}
module.exports = downloadurl;
OK, so you have this function downloadurl. What you need to do is to pass one more argument to it: the callback. And please, move requirements outside the function and don't define a function in a function unless necessary:
var fs = require('fs');
// other dependencies and constants
// Callback-style downloadurl: `clb(err)` fires exactly once per call --
// with the exec error on failure, or null after wget has finished.
// NOTE(review): `mkdir`, `DOWNLOAD_DIR` and `file_url` come from the
// elided "// some code" section below.
function downloadurl(URL,partnumber,iteration,total, clb) { // <-- new arg
// some code
var child = exec(mkdir, function(err, stdout, stderr) {
if (err) {
clb(err);
} else {
var wget = 'wget --http-user="MyAccount" --http-password="MyPassword" -P ' + DOWNLOAD_DIR + ' ' + file_url;
// execute wget using child_process' exec function
var child = exec(wget, function(err, stdout, stderr) {
if (err) {
clb(err);
} else {
console.log(iteration+'/'+total+' downloaded. '+partnumber + ' downloaded to ' + DOWNLOAD_DIR);
clb(null); // <-- you can pass more args here if you want, like result
// as a general convention callbacks take a form of
// callback(err, res1, res2, ...)
}
});
}
});
};
This look nicer, doesn't it? Now when you call that function multiple times you do:
// Fire off LIMIT downloads in parallel and collect any errors; finalize()
// runs once, after the last callback has come back.
var LIMIT = 100,
errs = [];
for (var i = 0; i < LIMIT; i++) {
// `...` is a placeholder for the real downloadurl arguments.
// NOTE(review): LIMIT doubles as loop bound and countdown; this only works
// because the callbacks are asynchronous and cannot run until the loop has
// finished -- a separate `remaining` counter would be safer.
downloadurl(..., function(err) {
if (err) {
errs.push(err);
}
LIMIT--;
if (!LIMIT) {
finalize(errs);
}
});
}
function finalize(errs) {
// you can now check for err
//or do whatever stuff to finalize the code
}
That's a general idea. You have to tweak it to your needs (in particular you have to modify the intermediate function to accept a callback as well). Of course there are libraries which will take care of most this for you like kriskowal's Q (Q.all) or caolan's async (async.parallel).
Not sure if I have understood the problem correctly, as I don't see the code. I have worked on creating a download engine. I used to make background AJAX calls to download files, and after every successful download or 'onComplete' event I would increment one variable to keep track of the downloaded files — provided the user doesn't refresh the page until all the downloads are complete. Otherwise, the download counter can also be saved in LocalStorage.

How can I get node.js to return data once all operations are complete

I am just learning server-side JavaScript so please bear with any glaring mistakes I've made.
I am trying to write a file parser that operates on HTML files in a directory and returns a JSON string once all files have been parsed. I started it with a single file and it works fine. it loads the resource from Apache running on the same machine, injects jquery, does the parsing and returns my JSON.
var request = require('request'),
jsdom = require('jsdom'),
sys = require('sys'),
http = require('http');
// For every incoming request: fetch the page, load it into jsdom + jQuery,
// scrape the employee table, and answer with a JSON map of
// "Last,_First" -> image src.
http.createServer(function (req, res) {
request({uri:'http://localhost/tfrohe/Car3E.html'}, function (error, response, body) {
if (!error && response.statusCode == 200) {
var window = jsdom.jsdom(body).createWindow();
jsdom.jQueryify(window, 'http://ajax.googleapis.com/ajax/libs/jquery/1.4.4/jquery.min.js', function (window, jquery) {
// jQuery is now loaded on the jsdom window created from 'body'
var emps = {};
// Walk every table row that contains an image; `step` counts image
// cells so the corresponding name cell (reached via
// .parent().next().next(), i.e. two rows further down) can be found.
jquery("tr td img").parent().parent().each(function(){
var step = 0;
jquery(this).children().each(function(index){
if (jquery(this).children('img').attr('src') !== undefined) {
step++;
var name = jquery(this).parent().next().next().children('td:nth-child('+step+')').children().children().text();
// The name text is split on the comma, then on non-breaking
// spaces, to recover last and first name.
var name_parts = name.split(",");
var last = name_parts[0];
var name_parts = name_parts[1].split(/\u00a0/g);
var first = name_parts[2];
emps[last + ",_" + first] = jquery(this).children('img').attr('src');
}
});
});
emps = JSON.stringify(emps);
//console.log(emps);
res.writeHead(200, {'Content-Type': 'text/plain'});
res.end(emps);
});
} else {
// Upstream fetch failed -- still answer 200 with a sentinel body.
res.writeHead(200, {"Content-Type": "text/plain"});
res.end("empty");
//console.log(response.statusCode);
}
});
}).listen(8124);
Now I am trying to extend this to using the regular file system (fs) and get all HTML files in the directory and parse them the same way and return a single combined JSON object once all files have been parsed. Here is what I have so far but it does not work.
var sys = require("sys"),
fs = require("fs"),
jsdom = require("jsdom"),
emps = {};
//path = '/home/inet/www/media/employees/';
// Scan `path` for *.html files and scrape each one into the shared `emps`
// map. BUG: readDirectory takes no callback parameter, so the function
// passed at the call site below is never invoked and the caller is never
// told when parsing has finished -- the core problem of this question.
readDirectory = function(path) {
fs.readdir(path, function(err, files) {
var htmlfiles = [];
files.forEach(function(name) {
if(name.substr(-4) === "html") {
htmlfiles.push(name);
}
});
// NOTE(review): `count` is computed but never decremented or checked, so
// there is no way to detect that the last file has finished.
var count = htmlfiles.length;
htmlfiles.forEach(function(filename) {
fs.readFile(path + filename, "binary", function(err, data) {
if(err) throw err;
// NOTE: `window` is an implicit global; concurrently parsed files
// race on it.
window = jsdom.jsdom(data).createWindow();
jsdom.jQueryify(window, 'http://ajax.googleapis.com/ajax/libs/jquery/1.4.4/jquery.min.js', function (window, jquery) {
jquery("tr td img").parent().parent().each(function(){
var step = 0;
jquery(this).children().each(function(index){
if (jquery(this).children('img').attr('src') !== undefined) {
step++;
// Split the name cell text into last / first and key the image
// src by "Last,_First".
var empname = jquery(this).parent().next().next().children('td:nth-child('+step+')').children().children().text();
var name_parts = empname.split(",");
var last = name_parts[0];
var name_parts = name_parts[1].split(/\u00a0/g);
var first = name_parts[2]
emps[last + ",_" + first] = jquery(this).children('img').attr('src');
}
});
});
});
});
});
});
}
readDirectory('/home/inet/www/media/employees/', function() {
console.log(emps);
});
In this particular case, there are 2 html files in the directory. If i console.log(emps) during the htmlfiles.forEach() it shows me the results from the first file then the results for both files together the way I expect. how do I get emps to be returned to readDirectory so i can output it as desired?
Completed Script
After the answers below, here is the completed script with a httpServer to serve up the detail.
var sys = require('sys'),
fs = require("fs"),
http = require('http'),
jsdom = require('jsdom'),
emps = {};
var timed = setInterval(function() {
emps = {};
readDirectory('/home/inet/www/media/employees/', function(emps) {
});
}, 3600000);
readDirectory = function(path, callback) {
fs.readdir(path, function(err, files) {
var htmlfiles = [];
files.forEach(function(name) {
if(name.substr(-4) === "html") {
htmlfiles.push(name);
}
});
var count = htmlfiles.length;
htmlfiles.forEach(function(filename) {
fs.readFile(path + filename, "binary", function(err, data) {
if(err) throw err;
window = jsdom.jsdom(data).createWindow();
jsdom.jQueryify(window, 'http://ajax.googleapis.com/ajax/libs/jquery/1.4.4/jquery.min.js', function (window, jquery) {
var imagecount = jquery("tr td img").length;
jquery("tr td img").parent().parent().each(function(){
var step = 0;
jquery(this).children().each(function(index){
if (jquery(this).children('img').attr('src') !== undefined) {
step += 1;
var empname = jquery(this).parent().next().next().children('td:nth-child('+step+')').children().children().text();
var name_parts = empname.split(",");
var last = name_parts[0];
var name_parts = name_parts[1].split(/\u00a0/g);
var first = name_parts[2]
emps[last + ",_" + first] = jquery(this).children('img').attr('src');
}
});
});
count -= 1;
if (count <= 0) {
callback(JSON.stringify(emps));
}
});
});
});
});
}
var init = readDirectory('/home/inet/www/media/employees/', function(emps) {
});
http.createServer(function (req, res) {
res.writeHead(200, {'Content-Type': 'text/plain'});
res.end(JSON.stringify(emps));
}).listen(8124);
That sure is a lot of code, and a couple of mistakes.
You're never calling the callback function you supply to readDirectory
You need to keep track of the files you have parsed, when you parsed all of them, call the callback and supply the emps
This should work:
var sys = require("sys"),
fs = require("fs"),
jsdom = require("jsdom"),
//path = '/home/inet/www/media/employees/';
// This is a nicer way
function readDirectory(path, callback) {
fs.readdir(path, function(err, files) {
// make this local
var emps = {};
var htmlfiles = [];
files.forEach(function(name) {
if(name.substr(-4) === "html") {
htmlfiles.push(name);
}
});
// Keep track of the number of files we have parsed
var count = htmlfiles.length;
var done = 0;
htmlfiles.forEach(function(filename) {
fs.readFile(path + filename, "binary", function(err, data) {
if(err) throw err;
window = jsdom.jsdom(data).createWindow();
jsdom.jQueryify(window, 'http://ajax.googleapis.com/ajax/libs/jquery/1.4.4/jquery.min.js', function (window, jquery) {
jquery("tr td img").parent().parent().each(function(){
var step = 0;
jquery(this).children().each(function(index){
if (jquery(this).children('img').attr('src') !== undefined) {
step++;
var empname = jquery(this).parent().next().next().children('td:nth-child('+step+')').children().children().text();
var name_parts = empname.split(",");
var last = name_parts[0];
var name_parts = name_parts[1].split(/\u00a0/g);
var first = name_parts[2]
emps[last + ",_" + first] = jquery(this).children('img').attr('src');
}
});
});
// As soon as all have finished call the callback and supply emps
done++;
if (done === count) {
callback(emps);
}
});
});
});
});
}
readDirectory('/home/inet/www/media/employees/', function(emps) {
console.log(emps);
});
You seem to be doing this a tad wrong
readDirectory('/home/inet/www/media/employees/', function() {
console.log(emps);
});
But you've defined your function as:
readDirectory = function(path) {
Where is the callback argument? Try this:
readDirectory = function(path, callback) {
then under emps[last + ",_" + first] = jquery(this).children('img').attr('src'); put
callback.call(null, emps);
Your callback function will be called however many times your loop goes on for. If you want it to return all of them at once, you'll need to get a count of how many times the loop is going to run for, count up until that number then call your callback when the emps array is full of the data you need.

Categories