I want to update large numbers (> 100,000) of documents most efficiently.
My first naive approach was doing it on the JS level, writing scripts that
fetch _ids first, then loop through _ids and invoke updates by _id (full
docs or $set patches).
I ran into memory issues; splitting the data into chunks of at most 500 documents (opening and closing the connection each time) doesn't seem to work well either.
So how can I solve this at the MongoDB level?
Best practice?
I have 3 common use cases, typically maintenance workflows:
1. Change type of value of property, without changing the value.
// before
{
timestamp : '1446987395'
}
// after
{
timestamp : 1446987395
}
2. Add new property based on value of existing property.
// before
{
firstname : 'John',
lastname : 'Doe'
}
// after
{
firstname : 'John',
lastname : 'Doe',
name : 'John Doe'
}
3. Simply adding/removing properties from documents.
// before
{
street : 'Whatever Ave',
street_no : '1025'
}
// after
{
street : 'Whatever Ave',
no : '1025'
}
Thanks for helping out.
If your MongoDB server is 2.6 or newer, it would be better to take advantage of the write commands Bulk API, which allows for the execution of bulk update operations. These are simply abstractions on top of the server that make it easy to build bulk operations. They come mainly in two flavours:
Ordered bulk operations. These execute all the operations in order and error out on the first write error.
Unordered bulk operations. These execute all the operations in parallel and aggregate all the errors. Unordered bulk operations do not guarantee order of execution.
Note that for servers older than 2.6 the API will down-convert the operations. However, it's not possible to down-convert 100%, so there might be some edge cases where it cannot correctly report the right numbers.
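For reference, both flavours are created from a collection object in the same way; a minimal sketch (assuming col is a collection handle as in the examples below):
var orderedBulk = col.initializeOrderedBulkOp();     // stops the batch on the first write error
var unorderedBulk = col.initializeUnorderedBulkOp(); // runs everything and reports all errors at the end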
For your three common use cases, you could implement the Bulk API like this:
Case 1. Change type of value of property, without changing the value:
var MongoClient = require('mongodb').MongoClient;
MongoClient.connect("mongodb://localhost:27017/test", function(err, db) {
// Handle error
if(err) throw err;
// Get the collection and bulk api artefacts
var col = db.collection('users'),
bulk = col.initializeOrderedBulkOp(), // Initialize the Ordered Batch
counter = 0;
// Case 1. Change type of value of property, without changing the value.
col.find({"timestamp": {"$exists": true, "$type": 2} }).each(function (err, doc) {
var newTimestamp = parseInt(doc.timestamp);
bulk.find({ "_id": doc._id }).updateOne({
"$set": { "timestamp": newTimestamp }
});
counter++;
if (counter % 1000 == 0 ) {
bulk.execute(function(err, result) {
// re-initialise batch operation
bulk = col.initializeOrderedBulkOp();
});
}
});
if (counter % 1000 != 0 ){
bulk.execute(function(err, result) {
// do something with result
db.close();
});
}
});
Case 2. Add new property based on value of existing property:
MongoClient.connect("mongodb://localhost:27017/test", function(err, db) {
// Handle error
if(err) throw err;
// Get the collection and bulk api artefacts
var col = db.collection('users'),
bulk = col.initializeOrderedBulkOp(), // Initialize the Ordered Batch
counter = 0;
// Case 2. Add new property based on value of existing property.
col.find({"name": {"$exists": false } }).each(function (err, doc) {
var fullName = doc.firstname + " " doc.lastname;
bulk.find({ "_id": doc._id }).updateOne({
"$set": { "name": fullName }
});
counter++;
if (counter % 1000 == 0 ) {
bulk.execute(function(err, result) {
// re-initialise batch operation
bulk = col.initializeOrderedBulkOp();
});
}
});
if (counter % 1000 != 0 ){
bulk.execute(function(err, result) {
// do something with result
db.close();
});
}
});
Case 3. Simply adding/removing properties from documents.
MongoClient.connect("mongodb://localhost:27017/test", function(err, db) {
// Handle error
if(err) throw err;
// Get the collection and bulk api artefacts
var col = db.collection('users'),
bulk = col.initializeOrderedBulkOp(), // Initialize the Ordered Batch
counter = 0;
// Case 3. Simply adding/removing properties from documents.
col.find({ "street_no": { "$exists": true } }).each(function (err, doc) {
    if (doc == null) {
        // Cursor exhausted: flush any remaining queued operations, then close
        if (counter % 1000 != 0) {
            bulk.execute(function(err, result) {
                // do something with result
                db.close();
            });
        } else {
            db.close();
        }
        return;
    }
    bulk.find({ "_id": doc._id }).updateOne({
        "$set": { "no": doc.street_no },
        "$unset": { "street_no": "" }
    });
    counter++;
    if (counter % 1000 == 0) {
        bulk.execute(function(err, result) {
            // re-initialise batch operation
            bulk = col.initializeOrderedBulkOp();
        });
    }
});
});
QUESTION:
After reading this:
https://www.npmjs.com/package/riot-lol-api#caching
I am still confused. This is my first time trying to cache api responses.
For example, I do not know what values are available for YOUR_CACHE_STRATEGY and it is not explained in the docs.
Essentially, I am looking for an example, e.g. how can I cache and serve for 5 minutes the response from /lol/summoner/v3/summoners/by-name/?
CODE:
riotRequest.request(region.toLowerCase(), 'summoner', '/lol/summoner/v3/summoners/by-name/'+encodeURI(player), function(err, data) {
if (!err) {
var summonerID = data.id;
} else {
console.error("ERROR1: "+err);
res.render("page", {errorMessage: "Player not found !"});
}
});
The documentation is not very detailed indeed. What you need to do is basically implement the cache object as specified in the code sample from the doc (the commented area).
Below is an example of caching to an array (in memory). You could also save this array to a file or into a Redis database as suggested in the doc (a rough Redis sketch is included at the end of this answer).
// cacheData holds objects of type { key: 'region + endpoint', value: requestData }
var cacheData = [];
function deleteFromCache(key) {
    for (var i = 0; i < cacheData.length; i++) {
        if (cacheData[i].key == key) {
            cacheData.splice(i, 1);
            return;
        }
    }
}
var cache = {
    get: function(region, endpoint, cb) {
        var key = region + endpoint;
        for (var entry of cacheData) {
            if (entry.key == key) {
                // we have a cache hit
                return cb(null, entry.value);
            }
        }
        // cache miss
        return cb(null, null);
    },
    set: function(region, endpoint, cacheStrategy, data) {
        var key = region + endpoint;
        cacheData.push({ key: key, value: data });
        // cacheStrategy is a number representing the number of seconds to keep the data in cache
        setTimeout(() => {
            deleteFromCache(key);
        }, cacheStrategy * 1000);
    }
};
YOUR_CACHE_STRATEGY is the value that is passed to your set function in the cacheStrategy parameter. The docs suggest it can be a number representing the lifespan of the cache entry, so I implemented a timer to delete the cache entry after a number of seconds equal to cacheStrategy.
You would call the request using this number:
riotRequest.request(region.toLowerCase(), 'summoner', '/lol/summoner/v3/summoners/by-name/'+encodeURI(player), 30, function(err, data) {//.....
To enable caching you need to pass the cache object to the constructor of RiotRequest:
var riotRequest = new RiotRequest('my_api_key', cache);
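As mentioned above, the same get/set interface could be backed by Redis instead of an in-memory array. Here is a rough sketch assuming the callback-based node "redis" package; the key layout (region + endpoint) is just an illustration:
var redis = require('redis');
var redisClient = redis.createClient();
var redisCache = {
    get: function(region, endpoint, cb) {
        redisClient.get(region + endpoint, function(err, reply) {
            // treat errors and missing keys as cache misses
            if (err || reply === null) return cb(null, null);
            cb(null, JSON.parse(reply));
        });
    },
    set: function(region, endpoint, cacheStrategy, data) {
        // SETEX stores the value with a TTL in seconds, so expiry is handled by Redis itself
        redisClient.setex(region + endpoint, cacheStrategy, JSON.stringify(data));
    }
};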
I posted a question before and realized my problem was actually with async functions. I managed to work out most of it, but I have one little problem left. Using async, I used waterfall to create an order for some queries...
exports.getMenu = function(id_restaurant, callback){
async.waterfall([
async.apply(firstQuery, id_restaurant),
secondQuery,
thirdQuery,
fourthQuery,
formMenu
], function(err, result){
if(err){
console.log(err);
}
callback(result);
});
};
Everything works until fourthQuery, where I have to loop to get all dishes of a menu.
function fourthQuery(array_totalP, array_nombresSecc, array_secciones, callback){
var size = array_nombresSecc.length;
var array_secciones = array_secciones;
var array_nombresSecc = array_nombresSecc;
var dishes = [];
pool.getConnection(function(err, connection) {
if(err) {
console.log(err);
callback(true);
return;
}
for (var i = 0; i < size; i++) {
connection.query("SELECT name, price FROM menu_product WHERE id_seccion = ? AND active = 1", [array_secciones[i]],
function(err, results2) {
if(err) {
console.log(err);
callback(true);
return;
}
console.log("Result query 4 " + JSON.stringify(results2));
dishes[i] = results2;
console.log("VALOR PLATILLOS EN i : " + JSON.stringify(dishes[i]));
// this prints the result but only if it has a value over 2
});
};
}); // pool
console.log("I'm sending " + dishes); // this logs an empty array
callback(null, dishes, array_nombresSecc);
};
What I can see from printing the value of 'i' on each loop is that it always has the value of 2, because that's the value of 'size'. Also, even though it's saving results at index '2', I believe the callback is being invoked even before the for loop is done, because my fifth function is receiving an empty array.
How can I make my code wait to call back until my for loop is done?
NOTE: Sorry, part of my code is in spanish, tried to translate the important parts of it.
There are a few ways to handle this; one is to look into promise architecture. Promise.all will let you supply one callback to handle the values from all of the child promises.
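For example, a rough Promise.all sketch of that idea (reusing your pool, query, and argument names; fourthQueryPromises is just an illustrative name) could look like this:
function fourthQueryPromises(array_totalP, array_nombresSecc, array_secciones, callback) {
    pool.getConnection(function(err, connection) {
        if (err) {
            console.log(err);
            return callback(true);
        }
        var queries = array_secciones.map(function(seccion) {
            return new Promise(function(resolve, reject) {
                connection.query("SELECT name, price FROM menu_product WHERE id_seccion = ? AND active = 1", [seccion],
                    function(err, results) {
                        if (err) return reject(err);
                        resolve(results);
                    });
            });
        });
        // Resolves only after every query has finished, preserving the order of array_secciones
        Promise.all(queries)
            .then(function(dishes) { callback(null, dishes, array_nombresSecc); })
            .catch(function(err) { console.log(err); callback(true); });
    });
}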
To use what you've already got, however, I'd push the values into your dishes array rather than assigning them to specific i indexes, then check the size of that array at the end of each query callback. When the array length matches the size, fire the callback (as seen below).
If you need a way to tie each result to that specific i value, I'd recommend pushing them as an object
dishes.push({'index': i, 'dish': results2})
Afterward, if you need the array of just dishes, you can sort the array by that index value and run a map function.
dishes.sort(function(a,b){ return a.index - b.index; })
dishes = dishes.map(function(a){ return a.dish })
Here's the code adjusted:
function fourthQuery(array_totalP, array_nombresSecc, array_secciones, callback) {
var size = array_nombresSecc.length;
var array_secciones = array_secciones;
var array_nombresSecc = array_nombresSecc;
var dishes = [];
pool.getConnection(function(err, connection) {
if (err) {
console.log(err);
callback(true);
return;
}
for (var i = 0; i < size; i++) {
connection.query("SELECT name, price FROM menu_product WHERE id_seccion = ? AND active = 1", [array_secciones[i]],
function(err, results2) {
if (err) {
console.log(err);
callback(true);
return;
}
console.log("Result query 4 " + JSON.stringify(results2));
dishes.push(results2);
console.log("VALOR PLATILLOS: " + JSON.stringify(dishes[dishes.length - 1]));
if (dishes.length == size) {
// All queries have returned, so it is now safe to fire the callback
console.log("I'm sending " + dishes);
callback(null, dishes, array_nombresSecc);
}
});
};
}); // pool
};
Since you're already using the async, I would suggest replacing the for() loop in fourthQuery with async.each().
The updated fourthQuery would look like this:
function fourthQuery(array_totalP, array_nombresSecc, array_secciones, callback){
var size = array_nombresSecc.length;
var array_secciones = array_secciones;
var array_nombresSecc = array_nombresSecc;
var dishes = [];
pool.getConnection(function(err, connection) {
if(err) {
console.log(err);
callback(true);
return;
}
async.each(array_secciones,
function(item, itemCallback) {
// Function run for each item in array_secciones
connection.query("SELECT name, price FROM menu_product WHERE id_seccion = ? AND active = 1", [item],
function(err, results2) {
if(err) {
console.log(err);
return itemCallback(true);
}
console.log("Result query 4 " + JSON.stringify(results2));
dishes.push(results2);
console.log("VALOR PLATILLOS EN i : " + JSON.stringify(dishes[dishes.length-1]));
// this prints the result but only if it has a value over 2
return itemCallback();
});
},
function(err) {
// Function run after all items in array are processed or an error occurs
console.log("I'm sending " + dishes); // this logs an empty array
callback(null, dishes, array_nombresSecc);
});
}); // pool
};
Alternatively, you can use async.map(), which gathers the results for you in the final callback, so it doesn't rely on the dishes variable.
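For illustration, a minimal async.map() sketch (to be placed inside the pool.getConnection callback, in place of the async.each call above):
async.map(array_secciones, function(item, itemCallback) {
    connection.query("SELECT name, price FROM menu_product WHERE id_seccion = ? AND active = 1", [item],
        function(err, results2) {
            // async.map collects each result in order, so no shared dishes array is needed
            itemCallback(err, results2);
        });
}, function(err, dishes) {
    if (err) {
        console.log(err);
        return callback(true);
    }
    callback(null, dishes, array_nombresSecc);
});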
I need to remove all documents from my MongoDB collection which don't exist in a new array of objects.
So I have an array of objects like:
var items = [
{product_id:15, pr_name: 'a', description : 'desc'},
{product_id:44, pr_name: 'b', description : 'desc2'},
{product_id:32, pr_name: 'c', description : 'desc3'}];
and I have an array of DB values which I get by calling Model.find({}).
So right now I do it in a 'straightforward' way:
async.each(products, function (dbProduct, callback) { //cycle for products removing
var equals = false;
async.each(items, function(product, callback){
if (dbProduct.product_id === product.product_id){
product.description = dbProduct.description;// I need to save desc from db product to new product
equals = true;
}
callback();
});
if (!equals) {
log.warn("REMOVE PRODUCT " + dbProduct.product_id);
Product.remove({ _id: dbProduct._id }, function (err) {
if (err) return updateDBCallback(err);
callback();
});
}
});
But it blocks the whole app and is very slow, because I have around 5000 values in my items array and in the database too, so the number of iterations is huge.
Maybe there is a faster way?
UPDATE1
Using the code below, from TbWill4321's answer:
var removeIds = [];
// cycle for products removing
async.each(products, function (dbProduct, callback) {
for ( var i = 0; i < items.length; i++ ) {
var product = items[i];
if (dbProduct.product_id === product.product_id) {
// I need to save desc from db product to new product
product.description = dbProduct.description;
// Return early for performance
return callback();
}
}
// Mark product to remove.
removeIds.push( dbProduct._id );
log.warn("REMOVE PRODUCT " + dbProduct.product_id);
return callback();
}, function() {
Product.remove({ _id: { $in: removeIds } }, function (err) {
if (err) return updateDBCallback(err);
// Continue Here.
// TODO
});
});
It takes around 11 seconds (blocking the whole web app) and performs 12,362,878 iterations for me.
So maybe somebody can advise me on something?
The Async library does not execute synchronous code in an asynchronous fashion.
5000 items is not a huge number for JavaScript; I've worked on Big Data sets with 5 million+ points and it doesn't take long. You can get better performance by structuring it like this:
var removeIds = [];
// cycle for products removing
async.each(products, function (dbProduct, callback) {
for ( var i = 0; i < items.length; i++ ) {
var product = items[i];
if (dbProduct.product_id === product.product_id) {
// I need to save desc from db product to new product
product.description = dbProduct.description;
// Return early for performance
return callback();
}
}
// Mark product to remove.
removeIds.push( dbProduct._id );
log.warn("REMOVE PRODUCT " + dbProduct.product_id);
return callback();
}, function() {
Product.remove({ _id: { $in: removeIds } }, function (err) {
if (err) return updateDBCallback(err);
// Continue Here.
// TODO
});
});
Among the many problems you may have, off the top of my head you may want to start off by changing this bit:
Product.remove({ _id: dbProduct._id }, function (err) {
if (err) return updateDBCallback(err);
callback();
});
Being within a .each() call, you'll make one call to the database for each element you want to delete. It's better to store all the ids in one array and then make a single query to delete all elements whose _id is in that array, like this:
Product.remove({ _id: {$in: myArrayWithIds} }, function (err) {
if (err) return updateDBCallback(err);
callback();
});
On another note, since async will execute your iterator synchronously here, Node.js does offer setImmediate() (see the docs), which defers a function to a later turn of the event loop. So basically you can "pause" the processing of further elements and serve any incoming requests to simulate "non-blocking" processing, as in the sketch below.
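A rough sketch of that setImmediate() idea, chunking the synchronous comparison work so the event loop gets a chance to serve other requests between batches (the function name, chunk size, and done callback are just illustrative):
function collectRemoveIdsInChunks(products, items, done) {
    var removeIds = [];
    var index = 0;
    var CHUNK_SIZE = 500;
    function processChunk() {
        var end = Math.min(index + CHUNK_SIZE, products.length);
        for (; index < end; index++) {
            var dbProduct = products[index];
            var found = items.some(function(item) {
                return item.product_id === dbProduct.product_id;
            });
            if (!found) removeIds.push(dbProduct._id);
        }
        if (index < products.length) {
            // Yield to the event loop before processing the next batch
            setImmediate(processChunk);
        } else {
            done(removeIds);
        }
    }
    processChunk();
}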
Is there any option to perform bulk upserts with Mongoose? So basically having an array and inserting each element if it does not exist, or updating it if it exists? (I am using custom _ids.)
When I use .insert, MongoDB returns error E11000 for duplicate keys (which should be updated instead). Inserting multiple new documents works fine though:
var Users = self.db.collection('Users');
Users.insert(data, function(err){
if (err) {
callback(err);
}
else {
callback(null);
}
});
Using .save returns an error that the parameter must be a single document:
Users.save(data, function(err){
...
}
This answer suggests there is no such option; however, it is specific to C# and also already 3 years old. So I was wondering if there is any option to do that using Mongoose?
Thank you!
Not in "mongoose" specifically, or at least not yet as of writing. The MongoDB shell as of the 2.6 release actually uses the "Bulk operations API" "under the hood" as it were for all of the general helper methods. In it's implementation, it tries to do this first, and if an older version server is detected then there is a "fallback" to the legacy implementation.
All of the mongoose methods "currently" use the "legacy" implementation or the write concern response and the basic legacy methods. But there is a .collection accessor from any given mongoose model that essentially accesses the "collection object" from the underlying "node native driver" on which mongoose is implemented itself:
var mongoose = require('mongoose'),
Schema = mongoose.Schema;
mongoose.connect('mongodb://localhost/test');
var sampleSchema = new Schema({},{ "strict": false });
var Sample = mongoose.model( "Sample", sampleSchema, "sample" );
mongoose.connection.on("open", function(err,conn) {
var bulk = Sample.collection.initializeOrderedBulkOp();
var counter = 0;
// representing a long loop
for ( var x = 0; x < 100000; x++ ) {
bulk.find(/* some search */).upsert().updateOne(
/* update conditions */
);
counter++;
if ( counter % 1000 == 0 )
bulk.execute(function(err,result) {
bulk = Sample.collection.initializeOrderedBulkOp();
});
}
if ( counter % 1000 != 0 )
bulk.execute(function(err,result) {
// maybe do something with result
});
});
The main catch there is that "mongoose methods" are actually aware that a connection may not have been made yet and "queue" operations until it is complete. The native driver you are "digging into" does not make this distinction.
So you really have to be aware that the connection is established in some way or form. But you can use the native driver methods as long as you are careful with what you are doing.
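A minimal sketch of one way to guard that, using the same Sample model as above (readyState === 1 means the mongoose connection is open):
if (mongoose.connection.readyState === 1) {
    // The underlying driver connection exists, so Sample.collection is safe to use
    var bulk = Sample.collection.initializeOrderedBulkOp();
    // ... queue operations and execute as shown above
} else {
    mongoose.connection.once("open", function() {
        // Only touch Sample.collection once the connection is actually established
    });
}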
You don't need to manage the limit (1000) as @neil-lunn suggested. Mongoose does this already. I used his great answer as a basis for this complete Promise-based implementation & example:
var Promise = require('bluebird');
var mongoose = require('mongoose');
var Show = mongoose.model('Show', {
"id": Number,
"title": String,
"provider": {'type':String, 'default':'eztv'}
});
/**
* Atomic connect Promise - not sure if I need this, might be in mongoose already..
* @return {Promise}
*/
function connect(uri, options){
return new Promise(function(resolve, reject){
mongoose.connect(uri, options, function(err){
if (err) return reject(err);
resolve(mongoose.connection);
});
});
}
/**
* Bulk-upsert an array of records
* @param {Array} records List of records to update
* @param {Model} Model Mongoose model to update
* @param {Object} match Database field to match
* @return {Promise} always resolves a BulkWriteResult
*/
function save(records, Model, match){
match = match || 'id';
return new Promise(function(resolve, reject){
var bulk = Model.collection.initializeUnorderedBulkOp();
records.forEach(function(record){
var query = {};
query[match] = record[match];
bulk.find(query).upsert().updateOne( record );
});
bulk.execute(function(err, bulkres){
if (err) return reject(err);
resolve(bulkres);
});
});
}
/**
* Map function for EZTV-to-Show
* #param {Object} show EZTV show
* #return {Object} Mongoose Show object
*/
function mapEZ(show){
return {
title: show.title,
id: Number(show.id),
provider: 'eztv'
};
}
// if you are not using EZTV, put shows in here
var shows = []; // giant array of {id: X, title: "X"}
// var eztv = require('eztv');
// eztv.getShows({}, function(err, shows){
// if(err) return console.log('EZ Error:', err);
// var shows = shows.map(mapEZ);
console.log('found', shows.length, 'shows.');
connect('mongodb://localhost/tv', {}).then(function(db){
save(shows, Show).then(function(bulkRes){
console.log('Bulk complete.', bulkRes);
db.close();
}, function(err){
console.log('Bulk Error:', err);
db.close();
});
}, function(err){
console.log('DB Error:', err);
});
// });
This has the bonus of closing the connection when it's done, displaying any errors if you care, but ignoring them if not (error callbacks in Promises are optional.) It's also very fast. Just leaving this here to share my findings. You can uncomment the eztv stuff if you want to save all eztv shows to a database, as an example.
await Model.bulkWrite(docs.map(doc => ({
updateOne: {
filter: {id: doc.id},
update: doc,
upsert: true
}
})))
Or more verbose:
const bulkOps = docs.map(doc => ({
updateOne: {
filter: {id: doc.id},
update: doc,
upsert: true
}
}))
Model.bulkWrite(bulkOps)
.then(bulkWriteOpResult => console.log('BULK update OK:', bulkWriteOpResult))
.catch(err => console.error('BULK update error:', err))
https://stackoverflow.com/a/60330161/5318303
I have released a plugin for Mongoose that exposes a static upsertMany method to perform bulk upsert operations with a promise interface.
An added benefit of using this plugin over initializing your own bulk op on the underlying collection, is that this plugin converts your data to Mongoose model's first, and then back to plain objects before the upsert. This ensures Mongoose schema validation is applied, and data is depopulated and fit for raw insertion.
https://github.com/meanie/mongoose-upsert-many
https://www.npmjs.com/package/@meanie/mongoose-upsert-many
Hope it helps!
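For what it's worth, a hypothetical usage sketch (the plugin registration and the exact upsertMany signature may differ from this, so check the README linked above):
var mongoose = require('mongoose');
var upsertMany = require('@meanie/mongoose-upsert-many'); // assumed package name, taken from the links above
var showSchema = new mongoose.Schema({ id: Number, title: String });
showSchema.plugin(upsertMany); // assumed: registers the static upsertMany() on models using this schema
var Show = mongoose.model('Show', showSchema);
// assumed signature: (items, matchFields) - upsert each item, matching existing docs on "id"
Show.upsertMany(items, ['id'])
    .then(function(result) { console.log('Bulk upsert done:', result); })
    .catch(function(err) { console.error(err); });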
If you're not seeing the bulk methods on your db.collection, i.e. you're getting an error to the effect of
xxx variable has no method: initializeOrderedBulkOp()
Try updating your mongoose version. Apparently older mongoose versions don't pass through all of the underlying mongo db.collection methods.
npm install mongoose
took care of it for me.
I had to achieve this recently while storing products in my ecommerce app. My database used to time out as I had to upsert 10,000 items every 4 hours. One option for me was to set socketTimeoutMS and connectTimeoutMS in mongoose while connecting to the database, but it sorta felt hacky and I did not want to manipulate the connection timeout defaults of the database. I also see that the solution by @neil-lunn takes a simple sync approach of taking a modulus inside the for loop. Here is an async version of mine that I believe does the job much better.
let BATCH_SIZE = 500
Array.prototype.chunk = function (groupsize) {
var sets = [];
var chunks = this.length / groupsize;
for (var i = 0, j = 0; i < chunks; i++ , j += groupsize) {
sets[i] = this.slice(j, j + groupsize);
}
return sets;
}
function upsertDiscountedProducts(products) {
//Take the input array of products and divide it into chunks of BATCH_SIZE
let chunks = products.chunk(BATCH_SIZE), current = 0
console.log('Number of chunks ', chunks.length)
let bulk = models.Product.collection.initializeUnorderedBulkOp();
//Get the current time as timestamp
let timestamp = new Date(),
//Keep track of the number of items being looped
pendingCount = 0,
inserted = 0,
upserted = 0,
matched = 0,
modified = 0,
removed = 0,
//If atleast one upsert was performed
upsertHappened = false;
//Call the load function to get started
load()
function load() {
//If we have a chunk to process
if (current < chunks.length) {
console.log('Current value ', current)
for (let i = 0; i < chunks[current].length; i++) {
//For each item set the updated timestamp to the current time
let item = chunks[current][i]
//Set the updated timestamp on each item
item.updatedAt = timestamp;
bulk.find({ _id: item._id })
.upsert()
.updateOne({
"$set": item,
//If the item is being newly inserted, set a created timestamp on it
"$setOnInsert": {
"createdAt": timestamp
}
})
}
//Execute the bulk operation for the current chunk
bulk.execute((error, result) => {
if (error) {
console.error('Error while inserting products' + JSON.stringify(error))
next()
}
else {
//Atleast one upsert has happened
upsertHappened = true;
inserted += result.nInserted
upserted += result.nUpserted
matched += result.nMatched
modified += result.nModified
removed += result.nRemoved
//Move to the next chunk
next()
}
})
}
else {
console.log("Calling finish")
finish()
}
}
function next() {
current++;
//Reassign bulk to a new object and call load once again on the new object after incrementing chunk
bulk = models.Product.collection.initializeUnorderedBulkOp();
setTimeout(load, 0)
}
function finish() {
console.log('Inserted ', inserted + ' Upserted ', upserted, ' Matched ', matched, ' Modified ', modified, ' Removed ', removed)
//If atleast one chunk was inserted, remove all items with a 0% discount or not updated in the latest upsert
if (upsertHappened) {
console.log("Calling remove")
remove()
}
}
/**
* Remove all the items that were not updated in the recent upsert or those items with a discount of 0
*/
function remove() {
models.Product.remove(
{
"$or":
[{
"updatedAt": { "$lt": timestamp }
},
{
"discount": { "$eq": 0 }
}]
}, (error, obj) => {
if (error) {
console.log('Error while removing', JSON.stringify(error))
}
else {
if (obj.result.n === 0) {
console.log('Nothing was removed')
} else {
console.log('Removed ' + obj.result.n + ' documents')
}
}
}
)
}
}
You can use mongoose's Model.bulkWrite()
const res = await Character.bulkWrite([
{
updateOne: {
filter: { name: 'Will Riker' },
update: { age: 29 },
upsert: true
}
},
{
updateOne: {
filter: { name: 'Geordi La Forge' },
update: { age: 29 },
upsert: true
}
}
]);
Reference: https://masteringjs.io/tutorials/mongoose/upsert
Working with Node.js and MongoDB through the Node MongoDB native driver. I need to retrieve some documents, make modifications, then save them right back. This is an example:
db.open(function (err, db) {
db.collection('foo', function (err, collection) {
var cursor = collection.find({});
cursor.each(function (err, doc) {
if (doc != null) {
doc.newkey = 'foo'; // Make some changes
db.save(doc); // Update the document
} else {
db.close(); // Closing the connection
}
});
});
});
Because of the asynchronous nature, if updating a document takes longer, then by the time the cursor reaches the end of the documents the database connection is already closed, and not all updates are saved to the database.
If db.close() is omitted, all the documents are correctly updated, but the application hangs and never exits.
I saw a post suggesting using a counter to track the number of updates and, when it falls back to zero, close the db. But am I doing anything wrong here? What is the best way to handle this kind of situation? Does db.close() have to be used to free up resources? Or does a new db connection need to be opened?
Here's a potential solution based on the counting approach (I haven't tested it and there's no error trapping, but it should convey the idea).
The basic strategy is: acquire the count of how many records need to be updated, save each record asynchronously with a callback on success which will decrement the count, and close the DB when the count reaches 0 (when the last update finishes). By using {safe:true} we can ensure that each update is successful.
The mongo server will use one thread per connection, so it's good to either a) close unused connections, or b) pool/reuse them.
db.open(function (err, db) {
db.collection('foo', function (err, collection) {
var cursor = collection.find({});
cursor.count(function (err, count) {
var savesPending = count;
if(count == 0){
db.close();
return;
}
var saveFinished = function(){
savesPending--;
if(savesPending == 0){
db.close();
}
}
cursor.each(function (err, doc) {
if (doc != null) {
doc.newkey = 'foo'; // Make some changes
db.save(doc, {safe:true}, saveFinished);
}
});
});
});
});
It's best to use a pooled connection and then call db.close() in a cleanup function at the end of your application's life:
process.on('SIGINT', cleanup);
process.on('SIGTERM', cleanup);
See http://mongodb.github.io/node-mongodb-native/driver-articles/mongoclient.html
A bit old thread, but anyway.
Here is an extended example of the answer given by pkopac, since I had to figure out the rest of the details:
const { MongoClient } = require('mongodb');
const uri = 'mongodb://localhost:27017'; // adjust to your connection string
const client = new MongoClient(uri);
(async () => await client.connect())();
// use client to work with db
const find = async (dbName, collectionName) => {
try {
const collection = client.db(dbName).collection(collectionName);
const result = await collection.find().toArray()
return result;
} catch (err) {
console.error(err);
}
}
const cleanup = (event) => { // SIGINT is sent for example when you Ctrl+C a running process from the command line.
client.close(); // Close MongodDB Connection when Process ends
process.exit(); // Exit with default success-code '0'.
}
process.on('SIGINT', cleanup);
process.on('SIGTERM', cleanup);
Here is a link to the difference between SIGINT and SIGTERM.
I had to add the process.exit(), otherwise my node web-server didn't exit cleanly when doing Ctrl + C on the running process in command line.
I found that using a counter may work for simple scenarios, but may be hard in complicated situations. Here is a solution that I came up with: closing the database connection when the database connection is idle.
var MongoClient = require('mongodb').MongoClient;
var dbQueryCounter = 0;
var maxDbIdleTime = 5000; //maximum db idle time
var closeIdleDb = function(connection){
var previousCounter = 0;
var checker = setInterval(function(){
if (previousCounter == dbQueryCounter && dbQueryCounter != 0) {
connection.close();
clearInterval(checker);
} else {
previousCounter = dbQueryCounter;
}
}, maxDbIdleTime);
};
MongoClient.connect("mongodb://127.0.0.1:27017/testdb", function(err, connection)(
if (err) throw err;
connection.collection("mycollection").find({'a':{'$gt':1}}).toArray(function(err, docs) {
dbQueryCounter ++;
});
//do any db query, and increase the dbQueryCounter
closeIdleDb(connection);
});
This can be a general solution for any database connection. maxDbIdleTime can be set to the same value as the db query timeout or longer.
This is not very elegant, but I can't think of a better way to do this. I use Node.js to run a script that queries MongoDB and MySQL, and the script hangs there forever if the database connections are not closed properly.
Here's a solution I came up with. It avoids using toArray and it's pretty short and sweet:
var MongoClient = require('mongodb').MongoClient;
MongoClient.connect("mongodb://localhost:27017/mydb", function(err, db) {
let myCollection = db.collection('myCollection');
let query = {}; // fill in your query here
let i = 0;
myCollection.count(query, (err, count) => {
myCollection.find(query).forEach((doc) => {
// do stuff here
if (++i == count) db.close();
});
});
});
I came up with a solution that involves a counter like this. It does not depend on a count() call nor does it wait for a time out. It will close the db after all the documents in each() are exhausted.
var mydb = {}; // initialize the helper object.
mydb.cnt = {}; // init counter to permit multiple db objects.
mydb.open = function(db) // call open to inc the counter.
{
if( !mydb.cnt[db.tag] ) mydb.cnt[db.tag] = 1;
else mydb.cnt[db.tag]++;
};
mydb.close = function(db) // close the db when the cnt reaches 0.
{
mydb.cnt[db.tag]--;
if ( mydb.cnt[db.tag] <= 0 ) {
delete mydb.cnt[db.tag];
return db.close();
}
return null;
};
So that each time you are going to make a call like db.each() or db.save() you would use these methods to ensure the db is ready while working and closed when done.
Example from OP:
foo = db.collection('foo');
mydb.open(db); // *** Add here to init the counter.**
foo.find({},function(err,cursor)
{
if( err ) throw err;
cursor.each(function (err, doc)
{
if( err ) throw err;
if (doc != null) {
doc.newkey = 'foo';
mydb.open(db); // *** Add here to prevent from closing prematurely **
foo.save(doc, function(err,count) {
if( err ) throw err;
mydb.close(db); // *** Add here to close when done. **
});
} else {
mydb.close(db); // *** Close like this instead. **
}
});
});
Now, this assumes that the second to last callback from each makes it through the mydb.open() before the last callback from each goes to mydb.close().... so, of course, let me know if this is an issue.
So: put a mydb.open(db) before a db call and put a mydb.close(db) at the return point of the callback or after the db call (depending on the call type).
Seems to me that this kind of counter should be maintained within the db object, but this is my current workaround. Maybe we could create a new object that takes a db in the constructor and wraps the mongodb functions to handle the close better, as sketched below.
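A rough sketch of that wrapper idea (names are made up; it just folds the counter into an object that owns the db):
function TrackedDb(db) {
    this.db = db;
    this.pending = 0;
}
// Call before starting any db work
TrackedDb.prototype.open = function() {
    this.pending++;
};
// Call when that piece of work finishes; closes the db once nothing is pending
TrackedDb.prototype.close = function() {
    this.pending--;
    if (this.pending <= 0) this.db.close();
};
var mydb = new TrackedDb(db);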
Based on the suggestion from @mpobrien above, I've found the async module to be incredibly helpful in this regard. Here's an example pattern that I've come to adopt:
const assert = require('assert');
const async = require('async');
const MongoClient = require('mongodb').MongoClient;
var mongodb;
async.series(
[
// Establish Covalent Analytics MongoDB connection
(callback) => {
MongoClient.connect('mongodb://localhost:27017/test', (err, db) => {
assert.equal(err, null);
mongodb = db;
callback(null);
});
},
// Insert some documents
(callback) => {
mongodb.collection('sandbox').insertMany(
[{a : 1}, {a : 2}, {a : 3}],
(err) => {
assert.equal(err, null);
callback(null);
}
)
},
// Find some documents
(callback) => {
mongodb.collection('sandbox').find({}).toArray(function(err, docs) {
assert.equal(err, null);
console.dir(docs);
callback(null);
});
}
],
() => {
mongodb.close();
}
);
A modern way of doing this without counters, libraries, or any custom code:
let MongoClient = require('mongodb').MongoClient;
let url = 'mongodb://yourMongoDBUrl';
let database = 'dbName';
let collection = 'collectionName';
MongoClient.connect(url, { useNewUrlParser: true }, (mongoError, mongoClient) => {
if (mongoError) throw mongoError;
// query as an async stream
let stream = mongoClient.db(database).collection(collection)
.find({}) // your query goes here
.stream({
transform: (readElement) => {
// here you can transform each element before processing it
return readElement;
}
});
// process each element of stream (async)
stream.on('data', (streamElement) => {
// here you process the data
console.log('single element processed', streamElement);
});
// called only when stream has no pending elements to process
stream.once('end', () => {
mongoClient.close().then(r => console.log('db successfully closed'));
});
});
Tested on version 3.2.7 of the mongodb driver, but according to the link it might be valid since version 2.0.