Speeding up IndexedDB search with Multiple Workers - javascript

PROBLEM: I am trying to speed up my IndexedDB searches by using multiple web workers and therefore executing multiple read transactions simultaneously, but it's not really working, and my CPU only gets to around 30-35% utilization. I have a 4-core processor and was hoping that spawning 4 web workers would dramatically reduce the search time.
I am using Firefox 53 with a WebExtension; other browsers are not an option.
DATABASE: I have a data store with about 250,000 records, each with about 30 keys, some of them containing paragraphs of text.
TASK: Perform a string search on a given key to find matching values. Currently, this takes about 90 seconds to do on a single thread. Adding an additional worker reduces that time to about 75 seconds. More workers than that have no noticeable effect. An acceptable time to me would be under 10 seconds (somewhat comparable to an SQL database).
CURRENT STRATEGY: Spawn a worker for each processor, and create a Promise that resolves when the worker sends a message. In each worker, open the database, divide the records up evenly, and search for the string. Each worker starts at a different offset: the first worker starts on the first record, the second on the second, and so on, then advances by the number of workers. So the first worker checks records 1, 5, 9, etc., and the second worker checks 2, 6, 10, etc. Of course, I could also have the first worker check records 1-50, the second worker 51-100, etc. (but obviously thousands each).
Using getAll() on a single thread took almost double the time and 4GB of memory. Splitting that into 4 ranges significantly reduces the time down to a total of about 40 seconds after merging the results (the 40 seconds varies wildly every time I run the script).
Any ideas on how I can make this work, or other suggestions for significantly speeding up the search?
background.js:
var key = whatever, val = something;
var proc = navigator.hardwareConcurrency; // Number of processors
var wPromise = []; // Array of promises (one for each worker)
var workers = [];

/* Create a worker for each processor */
for (var pos = 0; pos < proc; pos++) {
    workers[pos] = new Worker("js/dbQuery.js");
    wPromise.push(
        new Promise(resolve => workers[pos].onmessage = resolve)
    );
    workers[pos].postMessage({key: key, val: val, pos: pos, proc: proc});
}

return Promise.all(wPromise); // Do something once all the workers have finished
dbQuery.js:
onmessage = e => {
    var data = e.data;
    var req = indexedDB.open("Blah", 1);
    req.onsuccess = e => {
        var keyArr = [];
        var db = e.currentTarget.result;
        db.transaction("Blah").objectStore("Blah").index(data.key).openKeyCursor().onsuccess = e => {
            var cursor = e.target.result;
            if (cursor) {
                if (data.pos) {
                    cursor.advance(data.pos); // Start searching at a position based on which web worker
                    data.pos = false;
                }
                else {
                    if (cursor.key.includes(data.val)) {
                        keyArr.push(cursor.primaryKey); // Store key if value is a match
                    }
                    cursor.advance(data.proc); // Advance position based on number of processors
                }
            }
            else {
                db.close();
                postMessage(keyArr);
                close();
            }
        }
    }
}

Any ideas on how I can make this work, or other suggestions for significantly speeding up the search?
You can substitute Promise.race() for Promise.all() so that the combined Promise resolves as soon as one worker reports a match, instead of waiting for all of the Promises passed to Promise.all() to resolve.
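For example, a minimal sketch of that substitution, assuming each worker is changed to postMessage() as soon as it finds its first match rather than only when its cursor is exhausted:

// Sketch only: resolves with the first worker that reports a match.
var wPromise = [];
var workers = [];
for (var pos = 0; pos < proc; pos++) {
    workers[pos] = new Worker("js/dbQuery.js");
    wPromise.push(
        new Promise(resolve => workers[pos].onmessage = e => resolve(e.data))
    );
    workers[pos].postMessage({key: key, val: val, pos: pos, proc: proc});
}
return Promise.race(wPromise).then(firstMatch => {
    workers.forEach(w => w.terminate()); // the other workers are no longer needed
    return firstMatch;
});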

Related

Make Javascript Threads Fast

Recently I have been trying to use the Web workers interface to experiment with threads in JavaScript.
I'm trying to implement a contains check with web workers, following these steps:
Split the initial array to pieces of equal size
Create a web worker for each piece that runs .contains on that piece
When and if the value is found in any of the pieces, it returns true without waiting for all workers to finish.
Here is what I tried:
var MAX_VALUE = 100000000;
var integerArray = Array.from({length: 40000000}, () => Math.floor(Math.random() * MAX_VALUE));

var t0 = performance.now();
console.log(integerArray.includes(1));
var t1 = performance.now();
console.log("Call to doSomething took " + (t1 - t0) + " milliseconds.");

var promises = [];
var chunks = [];
while (integerArray.length) {
    chunks.push(integerArray.splice(0, 10000000));
}

t0 = performance.now();
chunks.forEach(function(element) {
    promises.push(createWorker(element));
});

function createWorker(arrayChunk) {
    return new Promise(function(resolve) {
        var v = new Worker(getScriptPath(function(){
            self.addEventListener('message', function(e) {
                var value = e.data.includes(1);
                self.postMessage(value);
            }, false);
        }));
        v.postMessage(arrayChunk);
        v.onmessage = function(event){
            resolve(event.data);
        };
    });
}

firstTrue(promises).then(function(data) {
    // `data` has the results, compute the final solution
    var t1 = performance.now();
    console.log("Call to doSomething took " + (t1 - t0) + " milliseconds.");
});

function firstTrue(promises) {
    const newPromises = promises.map(p => new Promise(
        (resolve, reject) => p.then(v => v && resolve(true), reject)
    ));
    newPromises.push(Promise.all(promises).then(() => false));
    return Promise.race(newPromises);
}

//As a worker normally take another JavaScript file to execute we convert the function in an URL: http://stackoverflow.com/a/16799132/2576706
function getScriptPath(foo){ return window.URL.createObjectURL(new Blob([foo.toString().match(/^\s*function\s*\(\s*\)\s*\{(([\s\S](?!\}$))*[\s\S])/)[1]],{type:'text/javascript'})); }
On every browser and CPU I tried, it is extremely slow compared to just doing a simple contains on the initial array.
Why is this so slow?
What is wrong with the code above?
References
Waiting for several workers to finish
Wait for the first true returned by promises
Edit: The issue is not specific to .contains(); it could be any other array function, e.g. .indexOf(), .map(), .forEach(), etc. Why does splitting the work between web workers take so much longer?
This is a bit of a contrived example, so it's hard to help optimize for what you're trying to do specifically, but one easily-overlooked and fixable slow path is copying data to the web worker. If possible, you can use ArrayBuffers and SharedArrayBuffers to transfer data to and from web workers quickly.
You can use the second argument of the postMessage function to transfer ownership of an ArrayBuffer to the web worker. It's important to note that that buffer will no longer be usable by the main thread until it is transferred back by the web worker. SharedArrayBuffers do not have this limitation and can be read by many workers at once, but they aren't necessarily supported in all browsers due to a security concern (see MDN for more details).
For example
const arr = new Float64Array(new ArrayBuffer(40000000 * 8));
console.time('posting');
ww.postMessage(arr, [ arr.buffer ]);
console.timeEnd('posting');
takes ~0.1ms to run while
const arr = new Array(40000000).fill(0);
console.time('posting');
ww.postMessage(arr); // a plain Array is not transferable, so it gets structured-cloned (copied)
console.timeEnd('posting');
takes ~10000ms to run. This is JUST to transfer the data in the message, not to run the worker logic itself.
You can read more on the postMessage transferList argument here and transferable types here. It's important to note that the way your example is doing a timing comparison includes the web worker creation time, as well, but hopefully this gives a better idea for where a lot of that time is going and how it can be better worked around.
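For reference, a rough sketch of the SharedArrayBuffer variant (worker.js is a hypothetical file; note that browsers may require cross-origin isolation, or disable SharedArrayBuffer entirely, because of the security concern mentioned above):

// Sketch: one buffer shared with a worker, with no copying and no transfer of ownership.
const sab = new SharedArrayBuffer(40000000 * 8);
const shared = new Float64Array(sab);
shared.fill(0);

const worker = new Worker('worker.js'); // hypothetical worker file
worker.onmessage = e => console.log('found:', e.data);
worker.postMessage(sab); // both sides now see the same memory

// worker.js (sketch):
// self.onmessage = e => {
//     const view = new Float64Array(e.data);
//     self.postMessage(view.includes(1));
// };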
You're doing a lot more work between t0 and t1 compared to a simple contains. These extra steps include:
converting function -> string -> regex -> blob -> object URL
calling new worker -> parses object URL -> JS engine interprets code
sending web worker data -> serialized on main thread -> deserialized in worker (likely an in-memory struct that's actually copied, so not super slow)
You're better off creating the thread first, then continuously handing it data. It may not be faster but it won't lock up your UI.
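For example, a sketch of a long-lived worker that is created once and then fed chunks repeatedly (worker.js is a hypothetical file name):

// main thread: create the worker once up front, then reuse it
const worker = new Worker('worker.js'); // hypothetical file
worker.onmessage = e => console.log('chunk result:', e.data);

function searchChunk(chunk, needle) {
    worker.postMessage({ chunk: chunk, needle: needle }); // same worker every time
}

// worker.js (sketch):
// self.onmessage = e => {
//     self.postMessage(e.data.chunk.includes(e.data.needle));
// };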
Also, if you're repeatedly searching through the array may I suggest converting it into a map where the key is the array value and the value is the index.
e.g.
array ['apple', 'coconut', 'kiwi'] would be converted to { apple: 0, coconut: 1, kiwi: 2 }
Searching the map happens in amortized constant time (fast), whereas searching the array is a linear scan (slow as hell for large sets).
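A sketch of that conversion using a Map (array indices are 0-based here):

const fruits = ['apple', 'coconut', 'kiwi'];

// build the lookup once: value -> index
const lookup = new Map(fruits.map((value, i) => [value, i]));

lookup.has('coconut');    // true, amortized O(1)
lookup.get('kiwi');       // 2
fruits.includes('kiwi');  // true, but a linear O(n) scan every time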

Interrupt `request` In a `forEach` Loop to Improve Efficiency

I'm building a simple web crawler to automate a newsletter, which means I only need to scrape a set amount of pages. In this example, it is not a big deal because the script will only crawl 3 extra pages. But for a different case, this would be hugely inefficient.
So my question is, would there be a way to stop executing request() in this forEach loop?
Or would I need to change my approach to crawl pages one-by-one, as outlined in this guide.
Script
'use strict';

var request = require('request');
var cheerio = require('cheerio');

var BASEURL = 'https://jobsite.procore.com';

scrape(BASEURL, getMeta);

function scrape(url, callback) {
    var pages = [];
    request(url, function(error, response, body) {
        if(!error && response.statusCode == 200) {
            var $ = cheerio.load(body);
            $('.left-sidebar .article-title').each(function(index) {
                var link = $(this).find('a').attr('href');
                pages[index] = BASEURL + link;
            });
            callback(pages, log);
        }
    });
}
function getMeta(pages, callback) {
    var meta = [];
    // using forEach's index does not work, it will loop through the array before the first request can execute
    var i = 0;
    // using a for loop does not work here
    pages.forEach(function(url) {
        request(url, function(error, response, body) {
            if(error) {
                console.log('Error: ' + error);
            }
            var $ = cheerio.load(body);
            var desc = $('meta[name="description"]').attr('content');
            meta[i] = desc.trim();
            i++;
            // Limit
            if (i == 6) callback(meta);
            console.log(i);
        });
    });
}

function log(arr) {
    console.log(arr);
}
Output
$ node crawl.js
1
2
3
4
5
6
[ 'Find out why fall protection (or lack thereof) lands on the Occupational Safety and Health Administration (OSHA) list of top violations year after year.',
'noneChances are you won’t be seeing any scented candles on the jobsite anytime soon, but what if it came in a different form? The allure of smell has conjured up some interesting scent technology in recent years. Take for example the Cyrano, a brushed-aluminum cylinder that fits in a cup holder. It’s Bluetooth-enabled and emits up to 12 scents or smelltracks that can be controlled using a smartphone app. Among the smelltracks: “Thai Beach Vacation.”',
'The premise behind the hazard communication standard is that employees have a right to know the toxic substances and chemical hazards they could encounter while working. They also need to know the protective things they can do to prevent adverse effects of working with those substances. Here are the steps to comply with the standard.',
'The Weitz Company has been using Procore on its projects for just under two years. Within that time frame, the national general contractor partnered with Procore to implement one of the largest technological advancements in its 163-year history. Click here to learn more about their story and their journey with Procore.',
'MGM Resorts International is now targeting Aug. 24 as the new opening date for the $960 million hotel and casino complex it has been building in downtown Springfield, Massachusetts.',
'So, what trends are taking center stage this year? Below are six of the most prominent. Some of them are new, and some of them are continuations of current trends, but they are all having a substantial impact on construction and the structures people live and work in.' ]
7
8
9
Aside from using slice to limit your selection, you can also refactor the code to reuse some functionality.
Sorry, I couldn't help myself after thinking about this for a second.
We can begin with the refactor:
const rp = require('request-promise-native');
const {load} = require('cheerio');

function scrape(uri, transform) {
    const options = {
        uri,
        transform: load
    };
    return rp(options).then(transform);
}

scrape(
    'https://jobsite.procore.com',
    ($) => $('.left-sidebar .article-title a').toArray().slice(0, 6).map((linkEl) => linkEl.attribs.href)
).then((links) => Promise.all(
    links.map(
        (link) => scrape(
            `https://jobsite.procore.com/${link}`,
            ($) => $('meta[name="description"]').attr('content').trim()
        )
    )
)).then(console.log).catch(console.error);
While this does make the code a bit more DRY and concise, it points out one part that might need to be improved upon: the requesting of the links.
Currently it will fire off a request for all (or up to) 6 links found on the original page nearly at once. This may or may not be what you want depending on how many links this will be requesting at some other point that you alluded to.
Another potential concern is error management. As the refactor stands, if any one of the requests fail then all of the requests will be discarded.
Just a couple of points to consider if you like this approach. Both can be resolved in a variety of ways.
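For example, one sketch that addresses both points: request the links one at a time and keep per-link failures as nulls instead of rejecting the whole batch (this reuses the scrape() helper defined above):

// Sequential, failure-tolerant variant (sketch)
async function getDescriptions(links) {
    const results = [];
    for (const link of links) {
        try {
            results.push(await scrape(
                `https://jobsite.procore.com/${link}`,
                ($) => $('meta[name="description"]').attr('content').trim()
            ));
        } catch (err) {
            console.error('skipping', link, err.message); // a single failure no longer discards everything
            results.push(null);
        }
    }
    return results;
}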
There's no way of stopping a forEach. You can simulate a stop by checking a flag inside the forEach, but that will still loop through all the elements. By the way, using a loop for an I/O operation is not optimal.
As you have stated, the best way to process a growing set of data is to do it one-by-one, but I'll add a twist: threaded one-by-one.
NOTE: By "thread" I don't mean actual threads; take it more as a shorthand for "multiple lines of work". Since I/O operations don't block the main thread, while one or more requests are waiting for data, another "line of work" can run the JavaScript that processes the data already received, because JavaScript is single threaded (not talking about WebWorkers here).
It's as easy as having an array of pages, which receives pages to be crawled on the fly, and one function that reads one page from that array, processes the result, and then returns to the starting point (loading the next page from the array and processing the result).
Now you just call that function as many times as the number of "threads" you want to run, and done. Pseudo-code:
var pages = [];

function loadNextPage() {
    if (pages.length == 0) {
        console.log("Thread ended");
        return;
    }
    var page = pages.shift(); // get the first element
    loadAndProcessPage(page, loadNextPage);
}

function loadAndProcessPage(page, callback) {
    requestOrWhatever(page, (error, data) => {
        if (error) {
            // retry or whatever
        } else {
            processData(data);
            callback();
        }
    });
}

function processData(data) {
    // Process the data and push new links to the pages array
    pages.push(data.link1);
    pages.push(data.link2);
    pages.push(data.link3);
}

console.log("Start new thread");
loadNextPage();
console.log("And another one");
loadNextPage();
console.log("And another one");
loadNextPage();
console.log("And another thread");
loadNextPage();
This code will stop when there are no more pages in the array, and if at some point there happen to be fewer pages than "threads", those threads will close. It needs some tweaks here and there, but you get the point.
I'm assuming you're trying to stop executing after some number of pages (it looks like six in your example). As some other replies have stated, you can't prevent the callback from executing for each element of an Array.prototype.forEach(); however, on each execution you can skip running the request call.
function getMeta(pages, callback) {
    var meta = [];
    var i = 0;
    pages.forEach(url => {
        // maxPages is the limit you were looking for
        if (i <= maxPages) {
            request(url, (err, res, body) => {
                // ... Request logic
            });
        }
    });
}
You could also wrap the iteration in a while loop; once i hits the value you want, the loop exits and the additional pages are never requested.
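A sketch of that variant, assuming maxPages holds the same limit as above:

function getMeta(pages, callback) {
    var meta = [];
    var i = 0;
    // the loop itself stops once the limit is reached, so no extra requests are fired
    while (i < maxPages && i < pages.length) {
        request(pages[i], function(error, response, body) {
            // ... request logic, collect into meta, invoke callback when done
        });
        i++;
    }
}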

DocumentDB - Quantify bounded execution in stored procedure

I have a DocumentDB stored procedure that does insert or update (not replace but rather reads and update existing document). The stored procedure does at most two operations:
query by Id and
either insert or update
The document is also not particularly large. However, every now and then I would get either time out (caused by bounded execution) or 449 (conflict updating resources, which is a transient error).
IMO this isn't a particularly taxing stored procedure, but it seems that I'm running into limitations already. I could do more work client side, but I love the ACID guarantee in the stored procedure.
Is there any quantitative measure on bounded execution? I'm wondering if I'm simply doing things wrong or if I have indeed hit a limit of DocumentDB.
My stored procedure is a modified https://github.com/Azure/azure-documentdb-js-server/blob/master/samples/stored-procedures/update.js that takes in a document instead of an id. I'm using "$addToSet" in particular, and the code looks like:
function unique(arr) {
    var uniqueArr = [], map = {};
    for (var i = 0; i < arr.length; i++) {
        var exists = map[arr[i]];
        if (!exists) {
            uniqueArr.push(arr[i]);
            map[arr[i]] = true;
        }
    }
    return uniqueArr;
}

// The $addToSet operator adds elements to an array only if they do not already exist in the set.
function addToSet(document, update) {
    var fields, i;
    if (update.$addToSet) {
        console.log(">addToSet");
        fields = Object.keys(update.$addToSet);
        for (i = 0; i < fields.length; i++) {
            if (!Array.isArray(document[fields[i]])) {
                // Validate the document field; throw an exception if it is not an array.
                throw new Error("Bad $addToSet parameter - field in document must be an array.");
            }
            // convert to array if input is not an array
            var newIds = Array.isArray(update.$addToSet[fields[i]])
                ? update.$addToSet[fields[i]]
                : [update.$addToSet[fields[i]]];
            var finalIds = unique(document[fields[i]].concat(newIds));
            document[fields[i]] = finalIds;
        }
    }
}
DocumentDB stored procedures must complete within 5 seconds. They are also limited by the provisioned throughput of the collection. If you have 5000 RU/s provisioned, then the stored procedure cannot consume more than 5000 * 5 RUs in total.
When a stored procedure reaches its execution time or its throughput limit, any request to perform a database operation (read, write, query) receives a pre-emption signal, i.e. the request is not accepted; this is the signal for the stored procedure to wrap up execution and return to the caller. If you check the return code from each call, your stored procedure will never time out. Here's a snippet showing how to do this (full samples are available at https://github.com/Azure/azure-documentdb-js-server/blob/master/samples/stored-procedures/):
var isAccepted = collection.replaceDocument(..., function(err, doc) {
    // additional logic in callback
});
if (!isAccepted) {
    // wrap up execution and return
}
Regarding 449, this is a concurrency error that can be returned if your stored procedure attempts to perform a conflicting write. This is side-effect free and safe to retry on from the client. You can implement a retry until succeeded pattern whenever you run into this error.
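A minimal sketch of such a retry wrapper on the client (executeSproc is a placeholder for however you invoke the stored procedure with your SDK; the back-off delay is arbitrary):

// Retry-until-succeeded on 449 (sketch)
function executeWithRetry(executeSproc, retriesLeft) {
    return executeSproc().catch(function (err) {
        if (err.code === 449 && retriesLeft > 0) {
            // transient write conflict: back off briefly, then try again
            return new Promise(function (resolve) { setTimeout(resolve, 100); })
                .then(function () { return executeWithRetry(executeSproc, retriesLeft - 1); });
        }
        throw err; // anything else, or retries exhausted, propagates to the caller
    });
}

// usage (sketch): executeWithRetry(function () { return callYourSproc(doc); }, 5);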

Recursion - Node out of memory

When I run the following code 9999999+ times, Node returns with:
FATAL ERROR: CALL_AND_RETRY_2 Allocation failed - process out of memory
Aborted (core dumped)
What's the best solution to get around this issue, other than increasing the max alloc size or any command line arguments?
I'd like to improve the code quality rather than hack a solution.
The following is the main bulk of recursion within the application.
The application is a load testing tool.
a.prototype.createClients = function(){
    for(var i = 0; i < 999999999; i++){
        this.recursiveRequest();
    }
}

a.prototype.recursiveRequest = function(){
    var self = this;
    self.hrtime = process.hrtime();
    if(!this.halt){
        self.reqMade++;
        this.http.get(this.options, function(resp){
            resp.on('data', function(){})
                .on("connection", function(){
                })
                .on("end", function(){
                    self.onSuccess();
                });
        })
        .on("error", function(e){
            self.onError();
        });
    }
}

a.prototype.onSuccess = function(){
    var elapsed = process.hrtime(this.hrtime),
        ms = elapsed[0] * 1000000 + elapsed[1] / 1000;
    this.times.push(ms);
    this.successful++;
    this.recursiveRequest();
}
Looks like you should really be using a queue instead of recursive calls. async.queue offers a fantastic mechanism for processing asynchronous queues. You should also consider using the request module to make your http client connections simpler.
var async = require('async');
var request = require('request');

var load_test_url = 'http://www.testdomain.com/';
var parallel_requests = 1000;

function requestOne(task, callback) {
    request.get(task.url, function(err, connection, body) {
        if(err) return callback(err);
        q.push({url: load_test_url});
        callback();
    });
}

var q = async.queue(requestOne, parallel_requests);

for(var i = 0; i < parallel_requests; i++){
    q.push({url: load_test_url});
}
You can set the parallel_requests variable according to how many simultaneous requests you want to hit the test server with.
You are launching 1 billion "clients" in parallel, and having each of them perform an http get request recursively in an endless recursion.
Few remarks:
while your question mentions 10 million clients, your code creates 1 billion clients.
You should replace the for loop with a recursive function, to get rid of the out-of-memory error.
Something along these lines:
a.prototype.createClients = function(i){
    if (i < 999999999) {
        this.recursiveRequest();
        this.createClients(i+1);
    }
}
Then, you probably want to include some delay between the client creations, or between the calls to recursiveRequest. Use setTimeout.
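For instance, a sketch of how the snippet above could be spread out with setTimeout (the 1 ms delay is arbitrary):

a.prototype.createClients = function(i){
    var self = this;
    if (i < 999999999) {
        this.recursiveRequest();
        // schedule the next client instead of recursing synchronously
        setTimeout(function(){ self.createClients(i + 1); }, 1);
    }
};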
You should have a way to stop the recursion (onSuccess and recursiveRequest keep calling each other).
A flow-control library like the async Node.js module may help.
10 million is very large... Assuming that the stack supports any number of calls, it should work, but you are likely asking the JavaScript interpreter to load 10 million x quite a bit of memory... and the result is Out of Memory.
Also, I personally do not see why you'd want to have so many requests at the same time (testing a heavy load on a server?). One way to optimize is to NOT create "floating functions", which you are doing a lot. "Floating functions" use their own set of memory on each instantiation.
this.http.get(this.options, function(resp){ ... });
                            ^^^^
                            ++++--- allocates memory x 10 million
Here the function(resp)... declaration allocates more memory on each call. What you want to do is:
// either global scope:
function r(resp) {...}
this.http.get(this.options, r ...);

// or as a static member:
a.r = function(resp) {...};
this.http.get(this.options, a.r ...);
At least you'll save on all that function memory. That goes for all the functions you declare within the r function, of course. Especially if they are quite large.
If you want to use the this pointer (make r a prototype function) then you can do that:
a.prototype.r = function(resp) {...};
// note that we have to have a small function to use 'that'... probably not a good idea
var that = this;
this.http.get(this.options, function(){that.r();});
To avoid the that reference, you may use an instance saved in a global. That defeats the use of an object as such though:
a.instance = new a;
// r() is static, but can access the object as follows:
a.r = function(resp) { a.instance.<func>(); }
Using the instance you can access the object's functions from the static r function. That could be the actual implementation which could make full use of the this reference:
a.r = function(resp) { a.instance.r_impl(); }
According to a comment by Daniel, your problem is that you misuse a for() to count the total number of requests you want to send. This means you can apply a very simple fix to your code as follow:
a.prototype.createClients = function(){
this.recursiveRequest();
};
a.prototype.recursiveRequest = function(){
var self = this;
self.hrtime = process.hrtime();
if(!this.halt && this.successful < 10000000){
...
Your recursion alone is enough to run the test any number of times.
What you do is never quit, though. You have a halt variable, but it does not look like you ever set that to true. However, to test 10 million times, you want to check the number of requests you already sent.
My "fix" supposes that onError() fails (is no recursive). You could also change the code to make use of the halt flag as in:
a.prototype.onSuccess = function(){
    var elapsed = process.hrtime(this.hrtime),
        ms = elapsed[0] * 1000000 + elapsed[1] / 1000;
    this.times.push(ms);
    this.successful++;
    if(this.successful >= 10000000)
    {
        this.halt = true;
    }
    this.recursiveRequest();
}
Note here that you will be pushing ms in the times buffer 10 million times. That's a big table! You may want to have a total instead and compute an average at the end:
this.time += ms;
// at the end:
this.average_time = this.time / this.successful;

How to use unordered bulk inserting with Mongoskin?

I'm having trouble using Mongoskin to perform bulk inserting (MongoDB 2.6+) on Node.
var dbURI = urigoeshere;
var db = mongo.db(dbURI, {safe:true});
var bulk = db.collection('collection').initializeUnorderedBulkOp();

for (var i = 0; i < 200000; i++) {
    bulk.insert({number: i}, function() {
        console.log('bulk inserting: ', i);
    });
}

bulk.execute(function(err, result) {
    res.json('send response statement');
});
The above code gives the following warnings/errors:
(node) warning: possible EventEmitter memory leak detected. 51 listeners added. Use emitter.setMaxListeners() to increase limit.
TypeError: Object #<SkinClass> has no method 'execute'
(node) warning: possible EventEmitter memory leak detected. 51 listeners added. Use emitter.setMaxListeners() to increase limit.
TypeError: Object #<SkinClass> has no method 'execute'
Is it possible to use Mongoskin to perform unordered bulk operations? If so, what am I doing wrong?
You can do it, but you need to change your calling convention, as only the "callback" form will actually return a collection object from which the .initializeUnorderedBulkOp() method can be called. There are also some usage differences from how you might think this works:
var dbURI = urigoeshere;
var db = mongo.db(dbURI, {safe:true});

db.collection('collection', function(err, collection) {
    var bulk = collection.initializeUnorderedBulkOp();
    var count = 0;

    for (var i = 0; i < 200000; i++) {
        bulk.insert({number: i});
        count++;

        if ( count % 1000 == 0 ) {
            bulk.execute(function(err, result) {
                // maybe do something with results
            });
            bulk = collection.initializeUnorderedBulkOp(); // reset after execute
        }
    }

    // If your total was not an exact multiple of 1000
    if ( count % 1000 != 0 ) {
        bulk.execute(function(err, result) {
            // maybe do something here
        });
    }
});
So the actual "Bulk" methods themselves don't require callbacks and work exactly as shown in the documentation. The exeception is .execute() which actually sends the statements to the server.
While the driver will sort this out for you somewhat, it probably is not a great idea to queue up too many operations before calling execute. This basically builds up in memory, and though the driver will only send in batches of 1000 at a time ( this is a server limit as well as the complete batch being under 16MB ), you probably want a little more control here, at least to limit memory usage.
That is the point of the modulo tests as shown, but if memory for building the operations and a possibly really large response object are not a problem for you then you can just keep queuing up operations and call .execute() once.
The "response" is in the same format as given in the documentation for BulkWriteResult.
