So I was messing around in Node.js and ran this code:
var http = require("http");
function get() {
var headers = {
'Accept-Encoding': 'gzip'
};
var startedAt = new Date().getTime();
for (var i = 0; i < 1; i++)
http.get({
host: "www.example.net",
path: "/catalog/",
headers: headers
}, function (response) {
var body;
response.on('data', function (d) {});
response.on('end', function (e) {
console.log(new Date().getTime() - startedAt);
});
});
}
get()
I discovered it is almost 3x slower than a GET request made from a Google Chrome extension. I copied the headers exactly, yet there is still an almost 100 ms difference in speed.
Any ideas how to speed this up?
I'm finding times around 50 ms/request with your same logic, so I'm going to assume you are running this loop many times and taking an average. If that is the case, then you are probably running a version of Node < 0.12, where http.globalAgent.maxSockets has a default of 5 (which only allows 5 concurrent connections at a time in your case).
Try setting http.globalAgent.maxSockets = Infinity; that is the default in current versions of Node.
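A minimal sketch of where that setting would go in the code above (only the get() function from the question is reused; nothing else changes):
var http = require("http");
// On Node < 0.12 the agent only keeps 5 concurrent sockets per host;
// lift the limit before issuing any requests (Node >= 0.12 already defaults to Infinity).
http.globalAgent.maxSockets = Infinity;
get(); // then run the same get() function as before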
Related
I am having an issue where I am running 200 concurrent GET requests in JS using 60 HTTP proxies. Each request downloads a 1.6 MB (megabyte) file. The proxies have latencies of 300-400 ms. The network download speed is typically 1000 Mbps to 1500 Mbps (megabits per second). I am running this script on a Windows Server.
See the screenshot here: Ookla Speed test
const axios = require('axios')
async function sendRequest(kw, anchor, proxy, cb) {
const [ip, port, user, pass] = proxy.split(':')
var url = /*some api endpoint*/
var options = {
headers: {
'accept': "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
'accept-language': "en-US,en;q=0.9,fr;q=0.8",
'cache-control': "max-age=0",
'if-none-match': "W/\"701c0-GT5kESYD+JEYlkRTfHGT3PH+PTI\"",
'sec-ch-ua': "\"Google Chrome\";v=\"89\", \"Chromium\";v=\"89\", \";Not A Brand\";v=\"99\"",
'sec-ch-ua-mobile': "?0",
'sec-fetch-dest': "document",
'sec-fetch-mode': "navigate",
'sec-fetch-site': "none",
'sec-fetch-user': "?1",
'upgrade-insecure-requests': "1"
},
proxy: {
host: ip,
port: port,
auth: {
username: user,
password: pass
}
}
}
axios.get(url, options).then(res => cb(res))
}
let proxies = [/*proxies go here*/]
let kws = [/*keywords go here*/]
let responses = []
let starttime
let finishtime
function sendRequests(limit) {
if (limit > kws.length) return console.log('limit exceeds request kws')
for(let i = 0; i < limit; i++) {
console.log('sending request......')
sendRequest(kws[i], '50', proxies[i], function (res) {
console.log('status: '+res.status)
responses.push(res)
if(responses.length == limit) {
finishtime = returnTimeMilliseconds()
console.log('Finished '+limit+' requests in ' + (finishtime - starttime)/1000 + ' seconds')
}
})
}
}
function returnTimeMilliseconds() {
var d = new Date();
return d.getTime()
}
starttime = returnTimeMilliseconds()
sendRequests(200)
A single request takes an average of 1.5-2 seconds. Up to 5 concurrent requests, the requests return in 1.5-2 seconds from sending them all to the last request coming in, which makes sense given the definition of concurrency. However, when running a larger number like 10, 20, or 200, the first request almost always takes far too long to resolve (about 5 to 10 seconds), and overall the requests resolve at a rate of only about 5 per second.
While running this program, my network usage peaks at 80 Mbps at most (measured with Resource Monitor on Windows).
I have run this script on 2 different Wi-Fi networks; the first was a 30 Mbps network. I experienced longer wait times there, but the entire 30 Mbps was being used, so speeds were typically 3.45 MBps (megabytes per second). Here, with 1000-1500 Mbps available, the speed should be 125 to 187.5 MBps (Mbps divided by 8), but the result is drastically less.
What could be the issue here?
I want to download 70 images. Their combined size is around 100 MB.
This is a simplified part of my code:
var request = require('request');
var fs = require('fs');
function downloadImage(src){
var dst = '...';
request(src).pipe(fs.createWriteStream(dst));
return dst;
}
arrayOf70.forEach(function(e){
var thing = new Thing({
// ...
image: downloadImage(url)
});
thing.save();
});
The problem is that there are too many concurrent downloads. Okay, first step: pass a huge timeout to request.
request({url: src, timeout: 120000000}).pipe(fs.createWriteStream(dst));
Well, that didn't work, since it exceeds the OS TCP timeout. At least I think that's the problem. Anyway, I'm getting timed-out connections:
stream.js:94
throw er; // Unhandled stream error in pipe.
^
Error: connect ETIMEDOUT
at exports._errnoException (util.js:746:11)
at TCPConnectWrap.afterConnect [as oncomplete] (net.js:1000:19)
So, what is the right way to limit the number of concurrent downloads?
Timeouts are not an ideal solution. What you really need is the ability to wait for a download to finish and then immediately start a new one, with only a specific number running in parallel.
You could do that by using a callback.
var http = require('http');
var fs = require('fs');
function downloadImage(src, callback){
var dst = '...';
http.get(src, function(res) {
res.pipe(fs.createWriteStream(dst))
.on("finish", function() {
callback(dst);
});
});
}
function downloadAllImages(array) {
var idx = 0;
function downloadLoop() {
if(idx >= array.length) return;
downloadImage(array[idx++], function(dst) {
var thing = new Thing({
// ...
image: dst
});
thing.save();
downloadLoop();
});
}
for(var i = 0; i < 5; i++) downloadLoop(); //start 5 concurrent download "loops"
}
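Usage is then just a single call (a minimal sketch, assuming arrayOf70 holds the image URLs as in the question):
downloadAllImages(arrayOf70); // runs at most 5 downloads at any one time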
Here's an example with setInterval:
var array_length = arrayOf70.length;
var i = 0;
var request_interval = setInterval(makeRequest, 100);
function makeRequest(){
if(i<array_length){
var thing = new Thing({
// ...
image: downloadImage(url)
});
thing.save();
i++;
}else{
clearInterval(request_interval);
}
}
I have a simple case where I'm requesting an upstream proxy server from my Node.js server. As load increases, the requests take a lot of time to execute (even though the time my upstream proxy server takes to respond is constant across requests). To demonstrate the issue I've written a sample program below. When I execute it, the first request takes 118 ms and the last one takes 10970 ms, depending on the website you hit (I've changed the URL to google; try it out with your favourite website). Note that I'm using async to parallelize my requests.
The question is: why does Node.js take this much time to execute a request when requests are run in parallel? For more context on the infra settings (CentOS 6.5): I have opened up my port range from 1024 to 65535, changed fin_timeout to 15 seconds, and enabled tw_reuse=1 for sockets in sysctl.conf.
var http = require('http');
var uuid = require('node-uuid');
var async = require('async');
function callExternalUrl(){
var uniqueId = uuid.v4();
console.time(uniqueId);
var options = {
host: 'google.com',
port: '80',
path: '/',
method: 'GET'
};
var req = http.request(options, function(res) {
var msg = '';
res.setEncoding('utf8');
res.on('data', function(chunk) {
msg += chunk;
console.timeEnd(uniqueId);
});
res.on('end', function() {
});
});
req.end();
}
function iterateAsync(callback){
var iter = [];
for(var i=0; i<1000; i++){
iter[i] = i;
}
async.each(iter,
function(item, callback) {
callExternalUrl();
},
function(err) {
callback(err);
}
);
}
iterateAsync(function(){console.log('done');});
To give more context, below is code in Ruby that does the same thing. I understand I can't compare these two languages apples to apples, but the idea is to show the time it takes to execute the same requests in sequence using Ruby. I don't see any increase in response times for requests going out in sequence, so I suspect that the parallel requests in Node are what take longer to send (the issue is not the server's response time, but sending the requests from the machine itself).
require 'rest_client'
1000.times do |number|
beginning = Time.now
response = RestClient.get 'http://google.com'
puts "Time elapsed #{Time.now - beginning} seconds"
end
For one, you're not calling the async iterator callback function:
function callExternalUrl(asyncCallback) {
...
res.on('end', function() {
asyncCallback();
});
...
}
function iterateAsync(callback) {
var iter = [];
for(var i=0; i<1000; i++){
iter[i] = i;
}
async.each(iter,
function(item, asyncCallback) { // <-- HERE
callExternalUrl(asyncCallback);
},
function(err) {
callback(err);
}
);
}
Also, depending on the Node version you're using, the http module may limit the number of parallel requests being made to a particular hostname:
$ node -pe 'require("http").globalAgent.maxSockets'
On Node 0.10, the default is 5; on Node 0.12, the default is Infinity ("unlimited"). So if you're not on Node 0.12, you should increase that value in your code:
var http = require('http');
http.globalAgent.maxSockets = Infinity;
...
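Putting the two fixes together, a minimal sketch of the corrected script might look like this (the request options come from the question; only the callback wiring and the maxSockets setting change, and the uuid timing is omitted for brevity):
var http = require('http');
var async = require('async');
// On Node 0.10 the agent caps parallel requests per host at 5.
http.globalAgent.maxSockets = Infinity;
function callExternalUrl(asyncCallback) {
  var req = http.request({ host: 'google.com', port: '80', path: '/', method: 'GET' }, function(res) {
    res.setEncoding('utf8');
    res.on('data', function() {});   // consume the body so 'end' fires
    res.on('end', asyncCallback);    // tell async this item is done
  });
  req.end();
}
var iter = [];
for (var i = 0; i < 1000; i++) iter[i] = i;
async.each(iter, function(item, asyncCallback) {
  callExternalUrl(asyncCallback);
}, function(err) {
  console.log('done');
});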
I've tried to run your scenario using JXcore (a fork of Node.js, now an open source project on GitHub), which offers multitasking (among many other new features).
var task = function (item) {
var http = require('http');
var uuid = require('node-uuid');
var uniqueId = uuid.v4() + "-" + process.threadId;
console.time(uniqueId);
var options = {
host: 'google.com',
port: '80',
path: '/',
method: 'GET'
};
var req = http.request(options, function (res) {
var msg = '';
res.setEncoding('utf8');
res.on('data', function (chunk) {
msg += chunk;
console.timeEnd(uniqueId);
});
res.on('end', function () {
process.release();
});
});
req.end();
process.keepAlive();
};
jxcore.tasks.setThreadCount(4);
console.time("total");
process.on('exit', function () {
console.timeEnd("total");
});
for (var i = 0; i < 1000; i++)
jxcore.tasks.addTask(task, i);
The sample is not really optimized, but the total of 1000 requests still runs a little faster for me with JXcore (I was able to measure up to a 20% gain on my platform). That may vary depending on the machine, since multitasking uses different threads/instances within one single process (no need for clustering any more). My machine has just 4 threads, which is why I used jxcore.tasks.setThreadCount(4);. You can try with your 32 :)
The way each single request is handled is not significantly different, so I'm not saying each request takes less time; the key might be hidden in the different queuing mechanism (as opposed to the "async" module), and of course in the multitasking.
I have pretty high traffic peaks, so I'd like to override the DynamoDB retry limit and retry policy.
Somehow I'm not able to find the right config property to override the retry limit and retry function.
my code so far
var aws = require('aws-sdk');
var table = new aws.DynamoDB({params: {TableName: 'MyTable'}});
aws.config.update({accessKeyId: process.env.AWS_ACCESS_KEY_ID, secretAccessKey: process.env.AWS_SECRET_KEY});
aws.config.region = 'eu-central-1';
I found the following Amazon variables and code snippets; however, I'm not sure how to wire them up with the config.
retryLimit: 15,
retryDelays: function retryDelays() {
var retryCount = this.numRetries();
var delays = [];
for (var i = 0; i < retryCount; ++i) {
if (i === 0) {
delays.push(0);
} else {
delays.push(60*1000 *i); // Retry every minute instead
// Amazon Defaultdelays.push(50 * Math.pow(2, i - 1));
}
}
return delays;
}
The config is pretty limited, and the only retry parameter you can set on it is maxRetries.
maxRetries (Integer) — the maximum amount of retries to attempt with a request. See AWS.DynamoDB.maxRetries for more information.
You should set the maxRetries to a value that is appropriate to your use case.
aws.config.maxRetries = 20;
The retryDelays private API internally uses the maxRetries config setting, so setting that parameter globally as in the code above should work. retryLimit is of no use here; forget about it.
The number of retries can be set through configuration, but seems that there is not an elegant way to set the retry delay/backoff strategy etc.
The only way to manipulate those is to listen for the retry event and manipulate the retry delay (and related behavior) in an event handler callback:
aws.events.on('retry', function(resp) {
// Enable or disable retries completely.
// disabling is equivalent to setting maxRetries to 0.
if (resp.error) resp.error.retryable = true;
// retry all requests with a 2sec delay (if they are retryable)
if (resp.error) resp.error.retryDelay = 2000;
});
Be aware that there is an exponential backoff strategy that runs internally, so the retryDelay is not literally 2s for subsequent retries. If you look at the internal service.js file you will see how the function looks:
retryDelays: function retryDelays() {
var retryCount = this.numRetries();
var delays = [];
for (var i = 0; i < retryCount; ++i) {
delays[i] = Math.pow(2, i) * 30;
}
return delays;
}
I don't think it's a good idea to modify internal APIs, but you could do it by modifying the prototype of the Service class:
aws.Service.prototype.retryDelays = function(){ // Do some }
However, this will affect all services, and after looking at this in depth, it's clear their API wasn't built to cover your use case elegantly through configuration.
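For example, a minimal (and, again, not recommended) sketch that mirrors the internal function shown above, but with a fixed 2-second step instead of the 30 ms exponential base; it relies on the same this.numRetries() helper and affects every service created afterwards:
aws.Service.prototype.retryDelays = function() {
  var retryCount = this.numRetries();
  var delays = [];
  for (var i = 0; i < retryCount; ++i) {
    delays[i] = 2000 * i; // linear 2s steps instead of Math.pow(2, i) * 30
  }
  return delays;
};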
The JavaScript AWS SDK does not allow the DynamoDB service to override retryDelayOptions and thus does not allow a customBackoff to be defined. These configurations are allowed for the rest of the services, but for some reason do not work for DynamoDB.
This page notes that:
Note: This works with all services except DynamoDB.
Therefore, if you want to define a customBackoff function, i.e. determine the retryDelay yourself, it is not possible through configuration. The only way I have found is to override the private method retryDelays of the DynamoDB object (aws-sdk-js/lib/services/dynamodb.js).
Here is an example of this being done, where exponential backoff with jitter is implemented (the base and cap constants below are illustrative placeholders):
// base and cap are assumed constants (in milliseconds); pick values for your workload.
const base = 50;
const cap = 10000;
AWS.DynamoDB.prototype.retryDelays = (retryCount) => {
    let temp = Math.min(cap, base * Math.pow(2, retryCount));
    let sleep = Math.random() * temp + 1;
    return sleep;
};
Max retries (the retry limit) can be set through the maxRetries property of the DynamoDB configuration object, as such:
let dynamodb = new AWS.DynamoDB({
region: 'us-east-1',
maxRetries: 30
});
See also:
https://github.com/aws/aws-sdk-js/issues/402
https://github.com/aws/aws-sdk-js/issues/1171
https://github.com/aws/aws-sdk-js/issues/1100
http://docs.aws.amazon.com/amazondynamodb/latest/developerguide/Programming.Errors.html#Programming.Errors.RetryAndBackoff
https://www.awsarchitectureblog.com/2015/03/backoff.html
There are two examples between pages 16 and 18.
Example 1.3 is a server app.
Example 1.4 is a client app doing GET requests to the server.
When I run the two examples (at the same time) I notice some quite weird behavior in the client. All requests are executed (i.e. the for loop in the client completes), but the callbacks of only 5 of them get called. The client doesn't exit and also doesn't error out; just no more callbacks are called.
Any ideas what might be happening or how I can troubleshoot this further?
Note: I am running Node.js v0.10.20 on Windows 7.
Server:
var http = require('http');
var fs = require('fs');
// write out numbers
function writeNumbers(res) {
var counter = 0;
// increment, write to client
for (var i = 0; i<100; i++) {
counter++;
res.write(counter.toString() + '\n');
}
}
// create http server
http.createServer(function (req, res) {
var query = require('url').parse(req.url).query;
var app = require('querystring').parse(query).file;
// content header
res.writeHead(200, {'Content-Type': 'text/plain'});
if (!app){
res.end();
console.log('No file argument found in query string.');
return;
}else{
app = app + ".txt";
}
// write out numbers
writeNumbers(res);
// timer/timeout to open file and read contents
setTimeout(function() {
console.log('Opening file: ' + app + '.');
// open and read in file contents
fs.readFile(app, 'utf8', function(err, data) {
res.write('\r\n');
if (err)
res.write('Could not find or open file ' + app + ' for reading.\r\n');
else {
res.write(data);
}
// response is done
res.end();
});
},2000);
}).listen(8124);
console.log('Server running at 8124');
Client:
var http = require('http');
var N = 200;
// The URL we want, plus the path and options we need
var options = {
host: 'localhost',
port: 8124,
path: '/?file=automatic',
method: 'GET'
};
var callback_function = function(response) {
// finished? ok, write the data to a file
console.log('got response back');
};
for (var i = 1; i <= N; i++) {
// make the request, and then end it, to close the connection
http.request(options, callback_function).end();
console.log('done with call # ' + i);
}
--- Experiment Done ---
If I lower N to 10, make i a global ("var i = 1"), and then do this:
function schedule(){
http.request(options, callback_function).end();
console.log('done with call ' + i);
i++;
if (i<=N){
setTimeout(function(){
schedule();
}, 1000);
}
}
schedule();
instead of the loop in the client, I get similar behavior.
I guess that's what Milimetric meant by "sleep", i.e. just making sure I don't hit the server too quickly with too many simultaneous requests.
But the behavior is not fully identical: it takes several minutes for 'got response back' to be printed for the second set of 5 requests, and then maybe another 5-6 minutes for the client to exit. Still, all of that looks weird to me.
C:\PERSONAL\NODE_TEST>node test004.js
done with call 1
got response back
done with call 2
got response back
done with call 3
got response back
done with call 4
got response back
done with call 5
got response back
done with call 6
done with call 7
done with call 8
done with call 9
done with call 10
got response back
got response back
got response back
got response back
got response back
C:\PERSONAL\NODE_TEST>
The problem is that the client doesn't consume the response body sent by the server, so the connection remains (half) open and the http agent only allows 5 concurrent requests per client by default, causing it to hang after 5 requests. The connection will eventually time out, allowing the next 5 requests to be processed.
node.js http.get hangs after 5 requests to remote site
Change your callback function to consume any data sent down the response.
var callback_function = function(response) {
// finished? ok, write the data to a file
console.log('got response back');
response.on('data', function () {});
};
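Alternatively, if you never need the body at all, draining the response with resume() has the same effect (just a sketch of the same fix):
var callback_function = function(response) {
    console.log('got response back');
    response.resume(); // discard the body so the socket is freed for the next request
};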