Bulk insert in MongoDB using mongoose - javascript

I currently have a collection in Mongodb say "Collection1".
I have the following array of objects that need to be into inserted into MongoDB. I am using Mongoose API. For now, I am iterating through the array and inserting each of them into mongo.
This is ok for now, but will be a problem when the data is too big.
I need a way of inserting the data in bulk into MongoDB without repetition.
I am not sure how to do this. I could not find a bulk option in Mongoose.
My code below
myData = [Obj1,Obj2,Obj3.......]
myData.forEach(function(ele){
//console.log(ele)
saveToMongo(ele);
});
function saveToMongo(obj){
(new Collection1(obj)).save(function (err, response) {
if (err) {
// console.log('Error while inserting: ' + obj.name + " " +err);
} else {
// console.log('Data successfully inserted');
}
});
return Collection1(obj);
}

You might want to use the insertMany() method here if you're using the latest Mongoose version 4.4.X and greater, which essentially uses Model.collection.insertMany() under the hood and the driver might handle parallelizing >= 1000 docs for you.
myData = [Obj1, Obj2, Obj3.......];
Collection1.insertMany(myData, function(error, docs) {});
or using Promises for better error handling
Collection1.insertMany(myData)
.then(function(docs) {
// do something with docs
})
.catch(function(err) {
// error handling here
});
It works by creating a bunch of documents, calls .validate() on them in parallel, and then calls the underlying driver's insertMany() on the result of toObject({ virtuals: false }); of each doc.
Although insertMany() doesn't trigger pre-save hooks, it has better performance because it only makes 1 round-trip to the server rather than 1 for each document.
For Mongoose versions ~3.8.8, ~3.8.22, 4.x which support MongoDB Server >=2.6.x, you could use the Bulk API as follows
var bulk = Collection1.collection.initializeOrderedBulkOp(),
counter = 0;
myData.forEach(function(doc) {
bulk.insert(doc);
counter++;
if (counter % 500 == 0) {
bulk.execute(function(err, r) {
// do something with the result
bulk = Collection1.collection.initializeOrderedBulkOp();
counter = 0;
});
}
});
// Catch any docs in the queue under or over the 500's
if (counter > 0) {
bulk.execute(function(err,result) {
// do something with the result here
});
}

you can pass an array of objects to mongoose model create function
var Collection1 = mongoose.model('Collection1');
Collection1.create(myData,function(err){
if(err) ...
});

Related

Using findOne in a loop takes too long in Node.js

I'm using Node.js with MongoDB, I'm also using Monk for db access. I have the below code :
console.time("start");
collection.findOne({name: "jason"},
function(err, document) {
for(var i = 0; i < document.friends.length; i++) // "friends is an array contains ids of the user's friends"
{
collection.findOne({id: document.friends[i]}, function(err, doc)
{
console.log(doc.name);
});
}
});
console.log("The file was saved!");
console.timeEnd("start");
I have two questions regarding this code :
I see the execution time and "The file was saved!" string first, then I see the names of the friends coming in the console. Why is that? Shouldn't I see the names first then the execution time? Is it because the async nature of Node.js?
Names are printing very slowly in the console, the speed is like one name in two seconds. Why is it so slow? Is there a way to make the process faster?
EDIT:
Is it a good idea to break friends list to smaller pieces and call friends asynchronously? Would it make the process faster?
EDIT 2:
I changed my code to this :
collection.find({ id: { "$in": document.friends}}).then(function(err, doc)
{
console.log(doc.name);
if(err) {
return console.log(err);
}
}
This doesn't give an error, but this doesn't print anything either.
Thanks in advance.
Answer for question 1:
Yes, you are right.
Is it because the async nature of Node.js.
And to prevent that Node.js provides some mechanism for that you can use it otherwise you can do it on your own manually by setting one flag.
Answer for question 2:
you can use $in instead of findOne, it will be ease and fast.
e.g. .find({ "fieldx": { "$in": arr } })
arr :- In this you need to provide whole array.
yes, it's because javascript's async nature.
As you have called db from for loop javascript will not wait for it's response and continue the execution so it will print the file was saved first.
about your ans 2
It's making a dbCall for every friend then it's obvious that it will take some time that's why it's taking 1 or 2 secs for every friend.
console.time("start");
collection.findOne({name: "jason"},
function(err, document) {
for(var i = 0; i < document.friends.length; i++) // "friends is an array contains ids of the user's friends"
{
console.log("InsideforLoop Calling " + i + " friend");
collection.findOne({id: document.friends[i]}, function(err, doc)
{
console.log(doc.name);
});
console.log("Terminating " + i + "-----");
}
});
console.log("The file was saved!");
console.timeEnd("start");
This will make your async and db doubts more clear.
As you will see it will print all console in line.
InsideforLoop Calling 0 friend
Terminating 0 -----
and so on....Like this
console.log(doc.name);
but this will be printed asynchronusly
Added
collection.findOne({name: "jason"},
function(err, document) {
//you can do this
collection.find({id: $in:{document.friends}, function(err, doc)
{
console.log(doc);
});
});
Find All Details in one call
collection.aggregate([
{
$match:{
id :{ "$in" : document.friends},
}
}
]).exec(function ( e, d ) {
console.log( d )
if(!e){
// your code when got data successfully
}else{
// your code when you got the error
}
});
collection.findOne({name: "jason"},
function(err, document) {
if(document != undefined){
collection.find({ id: { "$in": document.friends}}).then(function(err, doc)
{
console.log(doc.name);
if(err) {
return console.log(err);
}
}
}
});
Answer to 1: Yes, it is because node is async. The part where it logs names is executed only when the first findOne returns, whereas the file was saved is executed straight away.

Sails.js: Nested MongoDB queries

I am using Sails v0.11 and am developing an standalone importer script in order to import data to mongoDB and - that is now the not-working part - build the associations between the models.
For this process I introduced temporary helper properties in the models in order to find the associated records and replace them by in real MongoDB _ids.
The script starts Sails in order to be able use its features (waterline, etc.):
var app = Sails();
app.load({
hooks: { grunt: false },
log: { level: 'warn' }
}, function sailsReady(err){
processUsers() finds all users and their _ids and iterates over them to invoke a second function addOrgsToOneUser()
var processUsers = function() {
// Iterate through all users in order to retrieve their _ids and
app.models['user'].native(function(err, collection) {
collection.find({}, projectionOrgInUser).toArray(function (err, users) {
Async.eachSeries(users, function (user, next){
// prepare userInOrgs
whereUserInOrg = { orgId: { $in: userInOrgs } };
//This is invoking
addOrgsToOneUser(user, whereUserInOrg);
next();
}, function afterwards (err) {
if (err) {
console.error('Import failed, error details:\n',err);
return process.exit(1);
}
console.log("done");
return process.exit(0); // This returns too early, not executing the addOrgsToOneUser
});
});
});
};
addOrgsToOneUser() finds all orgs belonging to THIS user and updates then the orgs array property of THIS user
var addOrgsToOneUser = function(user, whereUserInOrg) {
var projectionUserInOrg = "...";
// Find all orgs that this user is associated to and store it in inOrgs
app.models['org'].native(function(err, collection) {
collection.find(whereUserInOrg, projectionUserInOrg).toArray(function (err, orgs) {
// prepare inOrgs which is needed for updating
//update user to have an updated orgs array based on inOrgs.
app.models['user'].update({'id' : user._id.toString()}, {'orgs': inOrgs}).exec(function afterwards(err, updated){
console.log('Updated user ' + user._id.toString() + ' to be in their orgs');
});
});
});
}
Problem:
Process.exit(0) is called before the query/update of saddOrgsToOneUser() has completed. It behaves as expected if saddOrgsToOneUser() contains just a console.log for instance, but queries are triggered ansynchronously of course.
In case I comment out Process.exit(0), the script never stops, but the queries are executed as intented.
As the script will have further nested queries, I need a better approach to this as manually kill this script ...
How is nesting queries and iterating over their results done properly?
Thank you very much,
Manuel
addOrgsToOneUser is asynchronous. next() needs to be called after everything is done inside addOrgsToOneUser. The way I would do it is to pass in a callback (next) and call it when everything is done. So the call is
addOrgsToOneUser(user, whereUserInOrg, next);
and the addOrgsToOneUser will have an extra argument:
var addOrgsToOneUser = function(user, whereUserInOrg, callback) {
var projectionUserInOrg = "...";
// Find all orgs that this user is associated to and store it in inOrgs
app.models['org'].native(function(err, collection) {
collection.find(whereUserInOrg, projectionUserInOrg).toArray(function (err, orgs) {
// prepare inOrgs which is needed for updating
//update user to have an updated orgs array based on inOrgs.
app.models['user'].update({'id' : user._id.toString()}, {'orgs': inOrgs}).exec(function afterwards(err, updated){
console.log('Updated user ' + user._id.toString() + ' to be in their orgs');
callback(); // your original next() is called here
});
});
});
}

Will Mongo handle my service?

I've been using MongoDB with node.js and mongoose library. I decided to start using MongoDB because I found everywhere that it is the best solution for node.js applications.
Although the response times of my API are good, I'm unsure that MongoDB will handle it when scaling it.
I've noticed that most of my queries aren't enough to get all the data I need, so I rely on creating several queries and using some javascript map/reduce functions (that is what I'm afraid of).
Look at this example:
User
.find({
idol : true
})
.sort({
'metas.followers' : -1
})
.select('-password -__v -posts -email')
.skip(offset)
.limit(30)
.exec(function(err, retData)
{
promisedIdols = retData.map(function(idol)
{
return idol.withStatistics(Post, Follow, req.user);
});
idols = [];
if(promisedIdols.length == 0)
{
callback();
}
for(var i=0; i<promisedIdols.length; i++)
{
promisedIdols[i].delegate(function(result)
{
idols.push(result);
if(idols.length == promisedIdols.length)
{
callback();
}
});
}
});
I've used a map to gather an array of promises that will be resolved after running the following code:
var obj = this.toObject();
var deferred = new Promise();
Post
.find({ idol : obj._id })
.lean()
.exec(function(err, posts)
{
var postViews = 0;
var postLikes = 0;
var postShares = 0;
posts.reduce(function(prev, next)
{
postViews += next.views.length;
postLikes += next.likes.length;
postShares += next.shares.length;
}, 0);
obj.metas.postViews = postViews;
obj.metas.postLikes = postLikes;
obj.metas.postShares = postShares;
obj.metas.postCount = posts.length;
Subscription
.count({ idol : obj._id }, function(err, count)
{
obj.metas.subscribers = count;
deferred.deliver(obj);
});
});
that uses a reduce function.
I can't see this code working well on big scale. Maybe should I restructure my database? Maybe should I change my database system? Maybe I'm using MongoDB wrongly?
Experts?
Thanks.
Mongo can handle a lot, if you setup a good data model. There are a few things to keep in mind when you want to scale.
Try to avoid normalizing the data much and split it into different collections.
Data duplication is (sometimes, when used wisely) your friend, it will help you make simpler queries, populate right away. Yeah, that may mean that when you're updating data, you'll have to update in two places, but Mongo is ok with a lot of writes if you do it asynchronously (promises or not).
To your specific query, I don't see the full data model, but maybe you can use aggregation framework. That pipeline is native (C++, as opossed to mapReduce JavaScript) and will work really really fast.
Something like:
db.post.aggregate(
// First $match to reduce the dataset
{
$match: {idol : obj._id}
},
// then group and aggregate your data
{
$group: {
_id: '$idol', // group by that idol thing
postViews: {$sum: '$postViews'},
postLikes: {$sum: '$postLikes'}
},
},
// Then use project to arrange the result the way you like it
{
$project: {
_id: false, //or true if you need it
metas: {
postViews: '$postViews'
},
likeCountOfPosts: '$postLikes', // that's how you'd rename
whatIsIt: {$literal: 'a great post'}
}
}
);
You can also do a lot of conditional, groupings, sortings, winding and unwinding, mixing and shuffling the pipeline.
It's much much faster then Mongo mapReduce.

Insert document loop - RangeError: Maximum call stack size exceeded

I am literally giving my first steps with node and mongodb and I have recently hit this RangeError wall.
Here's what I am trying to do, I have a file that contains a list of countries that I would like to add to my mongo db. This would be part of my "seed" mechanism to get the app running.
I load the json and then I iterate through the collection of objects and add them one by one to the 'Countries' collection.
However, everytime I run the code, I get a "RangeError: Maximum call stack size exceeded".
I have googled around but none of the suggested solutions seem to apply for me.
My guess is there is something wrong with my insertCountry function...
Anyways, here's my code:
var mongoose = require('mongoose');
var countries = require('./seed/countries.json');
// mongodb
var Country = mongoose.Schema({
name: String,
code: String,
extra: [Extra]
});
var Extra = mongoose.Schema({
exampleField: Boolean,
anotherField: Boolean
});
var mCountry = mongoose.model('Countries', Country);
var mExtra = mongoose.model('Extras', Extra);
// do connection
mongoose.connect('...');
var db = mongoose.connection;
db.on('error', console.error.bind(console, 'connection error'));
db.once('open', function callback() {
});
// async function
var insertCountry = function(document, callback) {
db.model('Countries').count({code: document.code}, function (err, count) {
if (count < 1) {
db.collection('Countries').insert(document, function (err, result) {
if (!err) {
console.log('country ' + document.name + ' added');
}
else {
console.log('- [' + document.name + '] ' + err);
}
});
}
callback(null,document);
});
};
// doing countries
var Country = mongoose.model('Countries');
var Extras = mongoose.model('Extras');
for(i = 0; i < countries.length; i++)
{
nCountry = new Country();
nCountry.name = countries[i].name;
nCountry.code = countries[i].code;
nCountry.benefits = new Extras();
nCountry.benefits.exampleField = false;
nCountry.benefits.anotherField = false;
insertCountry(nCountry, function (err, value) {
console.log(value.name + ' added to collection (callback)');
});
}
I have been using some guides I have found to build this so this might not be optimal code. Any best pratices, standards, guides or tutorials you can share are most welcome!
Your callback is in the wrong place. It is not waiting for the insert operation to complete before you return from it's own callback. Altering your code:
var insertCountry = function(document, callback) {
db.model('Countries').count({code: document.code}, function (err, count) {
if (count < 1) {
db.collection('Countries').insert(document, function (err, result) {
if (!err) {
console.log('country ' + document.name + ' added');
}
else {
console.log('- [' + document.name + '] ' + err);
}
callback(null,document);
});
}
});
};
That is part of your problem, but it does not completely solve it. The other part is the loop which also does not wait for the wrapping function to complete before moving on. You want something like asyc.eachSeries in order to wait for inserts to complete before performing the next iteration. This is mostly why you are exceeding the call stack:
async.eachSeries(
countries,
function(current,callback) {
// make your nCountry object
insertCountry(nCountry,function(err,value) {
// do something, then
callback(err);
})
},
function(err) {
// called where done, err contains err where set
console.log( "done" );
}
);
There is really still and issue with the array, which must be reasonably large if you are exceeding the call stack limit. You probably should look at using event streams to process that rather that load everything in memory to the array.
Personally, if you were just trying not to insert duplicates for a field and had MongoDB 2.6 available I would just use the Bulk Operations API with "unordered operations" and allow non fatal failures on the duplicate keys. Coupled with the fact that bulk operations are sent in "batches" and not one at a time, this is much more efficient than checking for the presence on every request:
var Country = mongoose.Schema({
name: String,
code: { type: String, unique: true }, // define a unique index
extra: [Extra]
});
var insertCountries = function(countries,callback) {
var bulk = Country.collection.initializeUnorderedBulkOp();
var counter = 0;
async.eachSeries(
countries,
function(current,callback) {
// same object construction
bulk.insert(nCountry);
counter++;
// only send once every 1000
if ( counter % 1000 == 0 ) {
bulk.execute(function(err,result) {
// err should generally not be set
// but result would contain any duplicate errors
// along with other insert responses
// clear to result and callback
bulk = Country.collection.initializeUnorderedBulkOp();
callback();
});
} else {
callback();
}
},
function(err) {
// send anything still queued
if ( counter % 1000 != 0 )
bulk.execute(function(err,result) {
// same as before but no need to reset
callback(err);
});
}
);
};
mongoose.on("open",function(err,conn) {
insertCountries(countries,function(err) {
console.log("done");
});
});
Keeping in mind that unlike the methods implemented directly on the mongoose models, the native driver methods require that a connection is actually established before they can be called. Mongoose "queues" these up for you, but otherwise you need something to be sure the connection is actually open. The example of the "open" event is used here.
Take a look at event streams as well. If you are constructing an array large enough to cause a problem by missing callback execution then you probably should not be loading it all in memory from whatever your source is. Stream processing that source combined with an approach as shown above should provide efficient loading.

How to use "q" module for refactoring mongoose code?

I'm using mongoose to insert some data into mongodb. The code looks like:
var mongoose = require('mongoose');
mongoose.connect('mongo://localhost/test');
var conn = mongoose.connection;
// insert users
conn.collection('users').insert([{/*user1*/},{/*user2*/}], function(err, docs) {
var user1 = docs[0], user2 = docs[1];
// insert channels
conn.collection('channels').insert([{userId:user1._id},{userId:user2._id}], function(err, docs) {
var channel1 = docs[0], channel2 = docs[1];
// insert articles
conn.collection('articles').insert([{userId:user1._id,channelId:channel1._id},{}], function(err, docs) {
var article1 = docs[0], article2 = docs[1];
}
});
};
You can see there are a lot of nested callbacks there, so I'm trying to use q to refactor it.
I hope the code will look like:
Q.fcall(step1)
.then(step2)
.then(step3)
.then(step4)
.then(function (value4) {
// Do something with value4
}, function (error) {
// Handle any error from step1 through step4
})
.end();
But I don't know how to do it.
You'll want to use Q.nfcall, documented in the README and the Wiki. All Mongoose methods are Node-style. I'll also use .spread instead of manually destructuring .then.
var mongoose = require('mongoose');
mongoose.connect('mongo://localhost/test');
var conn = mongoose.connection;
var users = conn.collection('users');
var channels = conn.collection('channels');
var articles = conn.collection('articles');
function getInsertedArticles() {
return Q.nfcall(users.insert.bind(users), [{/*user1*/},{/*user2*/}]).spread(function (user1, user2) {
return Q.nfcall(channels.insert.bind(channels), [{userId:user1._id},{userId:user2._id}]).spread(function (channel1, channel2) {
return Q.nfcall(articles.insert.bind(articles), [{userId:user1._id,channelId:channel1._id},{}]);
});
})
}
getInsertedArticles()
.spread(function (article1, article2) {
// you only get here if all three of the above steps succeeded
})
.fail(function (error) {
// you get here if any of the above three steps failed
}
);
In practice, you will rarely want to use .spread, since you usually are inserting an array that you don't know the size of. In that case the code can look more like this (here I also illustrate Q.nbind).
To compare with the original one is not quite fair, because your original has no error handling. A corrected Node-style version of the original would be like so:
var mongoose = require('mongoose');
mongoose.connect('mongo://localhost/test');
var conn = mongoose.connection;
function getInsertedArticles(cb) {
// insert users
conn.collection('users').insert([{/*user1*/},{/*user2*/}], function(err, docs) {
if (err) {
cb(err);
return;
}
var user1 = docs[0], user2 = docs[1];
// insert channels
conn.collection('channels').insert([{userId:user1._id},{userId:user2._id}], function(err, docs) {
if (err) {
cb(err);
return;
}
var channel1 = docs[0], channel2 = docs[1];
// insert articles
conn.collection('articles').insert([{userId:user1._id,channelId:channel1._id},{}], function(err, docs) {
if (err) {
cb(err);
return;
}
var article1 = docs[0], article2 = docs[1];
cb(null, [article1, article2]);
}
});
};
}
getInsertedArticles(function (err, articles) {
if (err) {
// you get here if any of the three steps failed.
// `articles` is `undefined`.
} else {
// you get here if all three succeeded.
// `err` is null.
}
});
With alternative deferred promise implementation, you may do it as following:
var mongoose = require('mongoose');
mongoose.connect('mongo://localhost/test');
var conn = mongoose.connection;
// Setup 'pinsert', promise version of 'insert' method
var promisify = require('deferred').promisify
mongoose.Collection.prototype.pinsert = promisify(mongoose.Collection.prototype.insert);
var user1, user2;
// insert users
conn.collection('users').pinsert([{/*user1*/},{/*user2*/}])
// insert channels
.then(function (users) {
user1 = users[0]; user2 = users[1];
return conn.collection('channels').pinsert([{userId:user1._id},{userId:user2._id}]);
})
// insert articles
.match(function (channel1, channel2) {
return conn.collection('articles').pinsert([{userId:user1._id,channelId:channel1._id},{}]);
})
.done(function (articles) {
// Do something with articles
}, function (err) {
// Handle any error that might have occurred on the way
});
Considering Model.save instead of Collection.insert (quite the same in our case).
You don't need to use Q, you can wrap yourself the save method and return directly a Mongoose Promise.
First create an utility method to wrap the save function, that's not very clean but something like:
//Utility function (put it in a better place)
var saveInPromise = function (model) {
var promise = new mongoose.Promise();
model.save(function (err, result) {
promise.resolve(err, result);
});
return promise;
}
Then you can use it instead of save to chain your promises
var User = mongoose.model('User');
var Channel = mongoose.model('Channel');
var Article = mongoose.model('Article');
//Step 1
var user = new User({data: 'value'});
saveInPromise(user).then(function () {
//Step 2
var channel = new Channel({user: user.id})
return saveInPromise(channel);
}).then(function (channel) {
//Step 3
var article = new Article({channel: channel.id})
return saveInPromise(article);
}, function (err) {
//A single place to handle your errors
});
I guess that's the kind of simplicity we are looking for.. right? Of course the utility function can be implemented with better integration with Mongoose.
Let me know what you think about that.
By the way there is an issue about that exact problem in the Mongoose Github:
Add 'promise' return value to model save operation
I hope it's gonna be solved soon. I think it takes some times because they are thinking of switching from mpromise to Q: See here and then here.
Two years later, this question just popped up in my RSS client ...
Things have moved on somewhat since May 2012 and we might choose to solve this one in a different way now. More specifically, the Javascript community has become "reduce-aware" since the decision to include Array.prototype.reduce (and other Array methods) in ECMAScript5. Array.prototype.reduce was always (and still is) available as a polyfill but was little appreciated by many of us at that time. Those who were running ahead of the curve may demur on this point, of course.
The problem posed in the question appears to be formulaic, with rules as follows :
The objects in the array passed as the first param to conn.collection(table).insert() build as follows (where N corresponds to the object's index in an array):
[ {}, ... ]
[ {userId:userN._id}, ... ]
[ {userId:userN._id, channelId:channelN._id}, ... ]
table names (in order) are : users, channels, articles.
the corresopnding object properties are : user, channel, article (ie the table names without the pluralizing 's').
A general pattern from this article by Taoofcode) for making asynchronous call in series is :
function workMyCollection(arr) {
return arr.reduce(function(promise, item) {
return promise.then(function(result) {
return doSomethingAsyncWithResult(item, result);
});
}, q());
}
With quite light adaptation, this pattern can be made to orchestrate the required sequencing :
function cascadeInsert(tables, n) {
/*
/* tables: array of unpluralisd table names
/* n: number of users to insert.
/* returns promise of completion|error
*/
var ids = []; // this outer array is available to the inner functions (to be read and written to).
for(var i=0; i<n; i++) { ids.push({}); } //initialize the ids array with n plain objects.
return tables.reduce(function (promise, t) {
return promise.then(function (docs) {
for(var i=0; i<ids.length; i++) {
if(!docs[i]) throw (new Error(t + ": returned documents list does not match the request"));//or simply `continue;` to be error tolerant (if acceptable server-side).
ids[i][t+'Id'] = docs[i]._id; //progressively add properties to the `ids` objects
}
return insert(ids, t + 's');
});
}, Q());
}
Lastly, here's the promise-returning worker function, insert() :
function insert(ids, t) {
/*
/* ids: array of plain objects with properties as defined by the rules
/* t: table name.
/* returns promise of docs
*/
var dfrd = Q.defer();
conn.collection(t).insert(ids, function(err, docs) {
(err) ? dfrd.reject(err) : dfrd.resolve(docs);
});
return dfrd.promise;
}
Thus, you can specify as parameters passed to cascadeInsert, the actual table/property names and the number of users to insert.
cascadeInsert( ['user', 'channel', 'article'], 2 ).then(function () {
// you get here if everything was successful
}).catch(function (err) {
// you get here if anything failed
});
This works nicely because the tables in the question all have regular plurals (user => users, channel => channels). If any of them was irregular (eg stimulus => stimuli, child => children), then we would need to rethink - (and probably implement a lookup hash). In any case, the adaptation would be fairly trivial.
Today we have mongoose-q as well. A plugin to mongoose that gives you stuff like execQ and saveQ which return Q promises.

Categories