Mongodb: use $sample after $group - javascript

I have the following data set:
{company:"One", employee:"John"},
{company:"One", employee:"Mike"},
{company:"One", employee:"Donald"},
{company:"One", employee:"Mickey"},
{company:"Two", employee:"Johnny"},
{company:"Two", employee:"David"},
Ideally, I want a query that returns all distinct companies, number of employees for each company, random employee for each company
{Company: "One" , employee_count=4, randomemployee="Donald"},
{Company: "Two" , employee_count=2, randomemployee="David"},
I do find a way to get company and employee_count using aggregate/group
However I don't find a way to add the randomemployee with the same query.
My aggregation:
function aggr (collection,cb){
collection.aggregate(([{$group:{_id:'$company',total:{$sum:1}}},{$sort:{total:-1}}]),function(err, l1){
cb(null, l1)
})
}
I began an other Sample function:
function onesample (collection,arg,cb){
collection.aggregate(([{ $match: { "company": arg }},{ $sample: { size: 1 }}]),function(err, item){
cb(null, item[0].employee)
})
}
But i'm loosing myself with callbacks and loop.
Any elegant way to do this within one query?
Thanks a lot.
following your answer, I tried the following code.
I have an issue with the callback of async.foreachof, seems it doesn't finish before leaving to next step: any clue?
var async = require("async");
var MongoClient = require('mongodb').MongoClient;
var assert = require('assert');
var url = 'mongodb://localhost:27017/eyc0';
async.waterfall ([
function(cb) {
MongoClient.connect(url, function(err, db) {
cb(null,db)
})
},
function (db, cb) {
db.collection('kodes', function(err, coll) {
cb(null,db,coll)
})
},
function (db,coll, cb) {
var pipeline = [
{"$group": {"_id": "$ouat","total": { "$sum": 1}}},
{"$sort":{"total":-1} },
{"$project":{"_id": 0,"total":1,"company": "$_id"}}];
coll.aggregate(pipeline).toArray(function(err, dlist){
cb(null,db,coll,dlist)
})
},
function (db,coll,dlist, cb) {
// console.log(dlist)
cb(null,db,coll,dlist)
},
function (db,coll,dlist, cb) {
var dlist2 = []
async.forEachOf(
dlist,
function(item, key, cb){
var pipeline = [{ "$match": { "ouat": item.company } },{ "$sample": { size: 1 } }];
coll.aggregate(pipeline, function (err, data) {
item["randref"] = data[0].code;
console.log(item.company)
dlist2.push(item)
cb()
});
}
);
cb(null,db,coll,dlist,dlist2);
},
function (db,coll,dlist,dlist2, cb) {
console.log(dlist2)
console.log(dlist)
},
])

There's one approach that involves one query, it could be close but not as performant (as it uses $unwind) and won't give you the desired result (only the filtered company):
var pipeline = [
{
"$group": {
"_id": "$company",
"total": { "$sum": 1 },
"employees": { "$push": "$employee" }
}
},
{
"$project": {
"_id": 0,
"company": "$_id",
"employee_count": "$total"
"randomemployee": "$employees"
}
},
{ "$unwind": "$randomemployee" },
{ "$match": { "company": arg } },
{ "$sample": { size: 1 } }
];
collection.aggregate(pipeline, function(err, result){
console.log(result);
});
However, for a solution that uses callbacks from multiple queries, this can be handled easily with use of async module.
To get all distinct companies, number of employees for each company, random employee for each company consider using the async.waterfall() function where the first task returns the aggregation results with all distinct companies and number of employees for each company.
The second task uses the results from taks 1 above to iterate over using async.forEachOf(). This allows you to perform an asynchronous task for each item, and when they're all done do something else. With each document from the array, run the aggregation operation that uses the $sample operator to get a random document with the specified company. With each result, create an extra field with the random employee and push that to an array with the final results that you can access at the end of each task.
Below shows this approach:
var async = require("async");
async.waterfall([
// Load full aggregation results (won't be called before task 1's "task callback" has been called)
function(callback) {
var pipeline = [
{
"$group": {
"_id": "$company",
"total": { "$sum": 1 }
}
},
{
"$project": {
"_id": 0,
"company": "$_id",
"employee_count": "total"
}
}
];
collection.aggregate(pipeline, function(err, results){
if (err) return callback(err);
callback(results);
});
},
// Load random employee for each of the aggregated results in task 1
function(results, callback) {
var docs = []
async.forEachOf(
results,
function(value, key, callback) {
var pipeline = [
{ "$match": { "company": value.company } },
{ "$sample": { size: 1 } }
];
collection.aggregate(pipeline, function (err, data) {
if (err) return callback(err);
value["randomemployee"] = data[0].employee;
docs.push(value);
callback();
});
},
function(err)
callback(null, docs);
}
);
},
], function(err, result) {
if (err) return next(err);
console.log(JSON.stringify(result, null, 4));
}
);
With the the async.series() function, this is useful if you need to execute a set of async functions in a certain order.
Consider the following approach if you wish to get the all the distinct companies and their employee count as one result and the other random employee as another:
var async = require("async"),
locals = {},
company = "One";
async.series([
// Load random company
function(callback) {
var pipeline = [
{ "$match": { "company": company } },
{ "$sample": { size: 1 } }
];
collection.aggregate(pipeline, function(err, result){
if (err) return callback(err);
locals.randomcompany = result[0];
callback();
});
},
// Load full aggregation results (won't be called before task 1's "task callback" has been called)
function(callback) {
var pipeline = [
{
"$group": {
"_id": "$company",
"total": { "$sum": 1 }
}
},
{
"$project": {
"_id": 0,
"company": "$_id",
"employee_count": "total"
}
}
];
collection.aggregate(pipeline, function(err, result){
if (err) return callback(err);
locals.aggregation = result;
callback();
});
}
], function(err) { //This function gets called after the two tasks have called their "task callbacks"
if (err) return next(err);
//Here locals will be populated with 'randomcompany' and 'aggregation'
console.log(JSON.stringify(locals, null, 4));
}
);

db.comp.aggregate([
{$group:{_id:'$company',emp:{$addToSet:'$employee'}}},
{$project:{emp:1,employee_count:{'$size':'$emp'},
randomvalue:{'$literal':Math.random()}}},
{$project:{emp:1,employee_count:1,
randomposition:{'$floor':
{'$multiply':['$randomvalue', '$employee_count']}}}},
{$project:{'Company':'$_id', _id:0, employee_count:1,
randomemployee:{'$arrayElemAt':['$emp','$randomposition']}}},
{$sort:{Company:1}} ])
Seems to work!
A couple of results:
{ "employee_count" : 4, "Company" : "One", "randomemployee" : "Mike" }
{ "employee_count" : 2, "Company" : "Two", "randomemployee" : "Johnny" }
{ "employee_count" : 4, "Company" : "One", "randomemployee" : "Mickey" }
{ "employee_count" : 2, "Company" : "Two", "randomemployee" : "David" }

Related

How to get array from getPhotos() function to the callback function or how to get deep directory structure using fs

I'm trying to gather the gallery folders and images that are listed on my server, but am having a difficult time getting the photos array to return with the results. Can someone help me understand what I'm doing wrong that it won't send the data back?
My code:
module.exports.getGalleryImages = function(req, res) {
console.log('Fetch gallery image names');
var gallerypath = './public/assets/gallery';
function done(err, results) {
if (err) {
res.status(500).json(err);
} else {
res.status(200).json(results);
}
}
getGalleryFolders(gallerypath, done);
};
var getGalleryFolders = function(gallerypath, callback) {
const galleryFolders = [];
fs.readdir(gallerypath, async (err, files) => {
if (err) {
console.log('error: ', err);
callback(err);
}
files.forEach(file => {
let folder = { year: null, photos: [] };
folder = { year: file, photos: [file.files] };
galleryFolders.push(folder);
});
console.log(galleryFolders);
callback(null, galleryFolders);
// getPhotos(galleryFolders, gallerypath, callback);
});
};
var getPhotos = function(galleryFolders, gallerypath, callback) {
const gallery = [...galleryFolders];
gallery.forEach(folder => {
let subfolder = `${gallerypath}/${folder.year}`;
fs.readdir(subfolder, (err, photos) => {
if (err) {
console.log('error: ', err);
callback(err);
}
folder.photos = photos;
console.log(
`Returning folder ${folder.year} photos`,
folder.photos.length
);
return folder;
// console.log(folder);
});
gallery.push(folder);
console.log('returning gallery', gallery);
return gallery;
});
console.log(gallery);
callback(null, gallery);
};
It gets the correct folders in the getGalleryFolders() function and passes them to the next function getPhotos(). The photos array is right, it just doesn't seem to want to leave that function no matter what or where I've moved return.
I'm getting a result of:
[
{
"year": "2004",
"photos": [
null
]
},
{
"year": "2011",
"photos": [
null
]
},
{
"year": "2012",
"photos": [
null
]
},
{
"year": "2013",
"photos": [
null
]
},
{
"year": "2014",
"photos": [
null
]
} ]
I want to return something like this:
[
{
"year": "2004",
"photos": [
"1.jpg","3.jpg","somatic.jpg"
]
},
{
"year": "2011",
"photos": [
"blue.jpg","green.jpg",yellow.jpg"
]
},
{
"year": "2012",
"photos": [
"a.jpg","b.jpg","c.jpg"
]
},
{
"year": "2013",
"photos": [
"2013.jpg"
]
},
{
"year": "2014",
"photos": [
"2014.jpg"
]
} ]
fs.readdir will get you the names of all the files in a directory. It does not get you the files.
You can see this when you create the objects in your array:
folder = { year: file, photos: [file.files] }
Where the property 'year' gets set to a string (2004, 2011, etc), which must be the names of the files in that directory. This is also why your photos array is always null. The string stored in the 'file' variable doesn't have any properties, because it's a string and not an object. When you try to access file.files, that 'files' property doesn't exist.
To get the contents of the files, you'll need to call something like fs.readFile with all the filenames you've gotten with readdir.
https://nodejs.org/api/fs.html#fs_fs_readfile_path_options_callback

How to add new objects inside nested array for mongodb using node.js?

I have the following database structure stored in the mongoDB:
"location" : "Halifax",
"students" : [
{
"name": "Mike",
"ID": "B00123456",
"images": [
{
"image_url":"",
"image_id":""
}
]
},
{
"name": "Rinan",
"ID": "B00999999",
"images": [
{
"image_url":"",
"image_id":""
}
]
}
]
My question is: how do I push a new object to images array inside a student named Mike who has an ID of "B00123456", I know I should use mongoDB's update and set method. But I just couldn't find a way to achieve that. The result I want is:
"location" : "Halifax",
"students" : [
{
"name": "Mike",
"ID": "B00123456",
"images": [
{
"image_url":"",
"image_id":""
},
{
"image_url":"www.example.com",
"image_id":"uqxhqbxqx_1219"
}
]
},
{
"name": "Rinan",
"ID": "B00999999",
"images": [
{
"image_url":"",
"image_id":""
}
]
}
]
Below is what I am trying using MongoDB's update and set:
// Connect and create a new doc
MongoClient.connect('mongodb://username:password#iad1- mongos0.objectrocket.com:someNode/db_name', functionb(err, db) {
if (err) {
console.dir(err);
console.log("error connected to mongodb");
} else {
var collection = db.collection('student_info_collection');
var student_name = req.body.name;
var student_id = req.body.ID;
collection.update(
{ location:"Halifax" },
{ ID:student_id}
{ name: student_name},
{$push: {
{
"images": [
{
"image_url":"www.example.com",
"image_id":"uqxhqbxqx_1219"
}
]
}
}
}, function(err,result){
if (err)
console.log("Something's wrong");
else
res.sendStatus(200);
}
);
}
});
Any help?
The update() function is
update(selector, document[, options][, callback])
The first parameter is selector, please try this one
var student_name = req.body.name;
var student_id = req.body.ID;
collection.update(
{ location:"Halifax",
'students.ID': student_id,
'students.name': student_name},
{$push: { "students.$.images":
{
"image_url":"www.example.com",
"image_id":"uqxhqbxqx_1219"
}
}
}, function(err,result){

Output is 'undefined' on retrieval of array field from Mongodb using node.js

I want to fetch an array which consists of id referenced from a different collection.
Region
var list1 = {
region_id: 'us-east-1',
region_name:'US East(N.Virginia)',
os_type: 'linux',
inst_name: {$ref : "instance", $id: [1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38]},
prov_id: 'aws'
};
Instance
{
inst_id: 1,
inst_type: 't2.micro',
vcpu: 1,
memory_gib: 1,
prov_id: 'aws'
};
{
inst_id: 2,
inst_type: 't2.small',
vcpu: 1,
memory_gib: 2,
prov_id: 'aws'
};
My node.js code
region.findOne(({
$and: [{
"region_id": "ap-east-1"
}, {
"os_type": "sles"
}]
}),
function (err, result) {
if (err) {
throw (err);
} else {
var arr = result.inst_name;
var arrli = arr.$id;
console.log(arrli);
}
});
The output for this is printed as 'undefined' in console. How do I fetch the array from $id field as it is?

Get sums by grouping a collection in Meteor

I have a collection with fields: number, a, b, c.
I want to divide the collection in three based on the number and get separate sums of a, b, and c for each group division.
I have done this with
function sumList(amountList) {
return _.reduce(amountList, function(sum, amount) {
return sum + amount;
}, -1);
}
// cursors
var group1 = Groups.find({ number: { $lte: 32 } }).fetch();
var group2 = Groups.find({ number: { $gte: 33, $lte: 70 } }).fetch();
var group3 = Groups.find({ number: { $gte: 71 } }).fetch();
// sums for group1
var group1SumA = sumList(_.pluck(group1, "a"));
var group1SumB = sumList(_.pluck(group1, "b"));
var group1SumC = sumList(_.pluck(group1, "c"));
// sums for group2
var group2SumA = sumList(_.pluck(group2, "a"));
var group2SumB = sumList(_.pluck(group2, "b"));
var group2SumC = sumList(_.pluck(group2, "c"));
// sums for group3
var group3SumA = sumList(_.pluck(group3 "a"));
var group3SumB = sumList(_.pluck(group3, "b"));
var group3SumC = sumList(_.pluck(group3, "c"));
It works but I think the code is very ugly.
I wonder if this can be done with some smart mapping. Besides, I guess it might have bad performance.
How can these sums be optimized?
Use the aggregation framework which will have the $match pipeline operator to filter the collection on the number field. The $group pipeline step then groups all the filtered input documents and applies the accumulator expression $sum to each field to get the sums.
Your pipeline would look like this:
var pipeline = [
{
"$match": { "number": { "$lte": 32 } } /* group1 filter */
},
{
"$group": {
"_id": 0,
"sumA": { "$sum": "$a" },
"sumB": { "$sum": "$b" },
"sumC": { "$sum": "$c" }
}
}
];
You can add the meteorhacks:aggregate package to implement the aggregation in Meteor:
Add to your app with
meteor add meteorhacks:aggregate
Since this package exposes .aggregate method on Mongo.Collection instances, you can then call the method to get the resulting array with the document that has the sums. For example
if (Meteor.isServer) {
var Coll = new Mongo.Collection('collectionName');
Meteor.methods({
sumList: function (filter) {
var pipeline = [
{
"$match": filter
},
{
"$group": {
"_id": 0,
"sumA": { "$sum": "$a" },
"sumB": { "$sum": "$b" },
"sumC": { "$sum": "$c" }
}
}
];
var result = Coll.aggregate(pipeline);
return result[0];
}
});
}
if (Meteor.isClient) {
// filters
var group1 = { "number": { "$lte": 32 } };
var group2 = { "number": { "$gte": 33, "$lte": 70 } };
var group3 = { "number": { "$gte": 71 } };
Meteor.call('sumList', group1, callback);
//Meteor.call('sumList', group2, callback);
//Meteor.call('sumList', group3, callback);
function callback(err, result) {
console.log(result)
}
}

skipped count 0 in aggregate function

I'm stuck on this for couple of days. I'm trying to get the count: 0 where there is no documents in the given time period. This is the aggregate function I'm using at the moment:
var getCount = function(timeBlock, start, end, cb) {
Document.aggregate(
{
$match: {
time: {
$gte: new Date(start),
$lt: new Date(end)
}
}
},
{
$project: {
time: 1,
delta: { $subtract: [
new Date(end),
'$time'
]}
}
},
{
$project: {
time: 1,
delta: { $subtract: [
"$delta",
{ $mod: [
"$delta",
timeBlock
]}
]}
}
},
{
$group: {
_id: { $subtract: [
end,
"$delta"
]},
count: { $sum: 1 }
}
},
{
$project: {
time: "$_id",
count: 1,
_id: 0
}
},
{
$sort: {
time: 1
}
}, function(err, results) {
if (err) {
cb(err)
} else {
cb(null, results)
}
})
}
I tried using $cond, but with no luck
The group stage is producing documents based on grouping on your given _id and counting the number of documents from the previous stage that end up in the group. Hence, a count of zero would be the result of a document being created from 0 input documents belonging to the group. Thinking about it this way, it's clear that there's no way the aggregation pipeline can do this for you. It doesn't know what all of the "missing" time periods are and it can't invent the appropriate documents out of thin air. Reapplying your extra knowledge about the missing time periods to complete the picture at the end seems like a reasonable solution (not "hacky") if you need to have an explicit count of 0 for empty time periods.
Though it has already been said the best thing to do here is "merge" your results post process rather than expect "keys" that do not exist to appear or to issue multiple queries with explicit keys that are possibly not going to aggregate results and combine them.
What has not already been said is how you actually do this, so I'll give you a MongoDB "thinking" kind of way to collect your results.
As a quick disclaimer, you could possibly employ much the same approach by "seeding" empty keys for each interval using mapReduce, or possibly even altering your data so that there is always an empty value within each possible block. Those approaches seem basically "hacky" and in the mapReduce case is not going to provide the best performance or muliple results.
What I would suggest is that working with collection results for the MongoDB brain can be made simple. There is a neat little solution called neDB, which is billed as a kind of SQL Lite for MongoDB. It supports a subset of functionality and is therefore perfect for "in memory" manipulation of results with a MongoDB mindset:
var async = require('async'),
mongoose = require('mongoose'),
DataStore = require('nedb'),
Schema = mongoose.Schema;
var documentSchema = new Schema({
time: { type: Date, default: Date.now }
});
var Document = mongoose.model( "Document", documentSchema );
mongoose.connect('mongodb://localhost/test');
var getCount = function(timeBlock, start, end, callback) {
async.waterfall(
[
// Fill a blank series
function(callback) {
var db = new DataStore();
var current = start;
async.whilst(
function() { return current < end },
function(callback) {
var delta = end - current;
db.insert({ "_id": end - delta, "count": 0 },function(err,doc) {
//console.log( doc );
current += timeBlock;
callback(err);
});
},
function(err) {
callback(err,db);
}
);
},
// Get data and update
function(db,callback) {
var cursor = Document.collection.aggregate(
[
// Match documents
{ "$match": {
"time": {
"$gte": new Date(start),
"$lt": new Date(end)
}
}},
// Group. 1 step and less hacky
{ "$group": {
"_id": {
"$let": {
"vars": {
"delta": {
"$subtract": [
{ "$subtract": [ new Date(end), "$time" ] },
{ "$mod": [
{ "$subtract": [ new Date(end), "$time" ] },
timeBlock
]}
]
}
},
"in": { "$subtract": [ end, "$$delta" ] }
}
},
"count": { "$sum": 1 }
}}
],
{ "cursor": { "batchSize": 100 } }
);
cursor.on("data",function(item) {
cursor.pause();
console.log( "called" );
db.update(
{ "_id": item._id },
{ "$inc": { "count": item.count } },
{ "upsert": true },
function(err) {
cursor.resume();
}
);
});
cursor.on("end",function() {
console.log( "done" );
db.find({},function(err,result) {
callback(err,result);
});
});
}
],
function(err,result) {
callback(err,result);
}
);
}
mongoose.connection.on("open", function(err,conn) {
getCount(
1000 * 60 * 60, // each hour
new Date("2014-07-01").valueOf(), // start
new Date("2014-07-02").valueOf(), // end
function(err,result) {
if (err) throw err;
console.log( result );
}
);
});
So essentially create each interval as in memory collection and then just update those interval records with the actual data retrieved. I can't think of another way to do that where it would be more simple and natural to the way of thinking.
Just a footnote, the "interval" logic is just replicated from your question, but in fact the time periods are "rounded up" where 15 minutes would appear in hour 1. It usually is the practice to round down so that everything belongs to the interval it falls in and not the next one.
this is hacky fix I did for now:
var getCount = function(timeBlock, start, end, cb) {
Document.aggregate(
{
$match: {
time: {
$gte: new Date(start),
$lt: new Date(end)
}
}
},
{
$project: {
time: 1,
delta: { $subtract: [
new Date(end),
'$time'
]}
}
},
{
$project: {
time: 1,
delta: { $subtract: [
"$delta",
{ $mod: [
"$delta",
timeBlock
]}
]}
}
},
{
$group: {
_id: { $subtract: [
end,
"$delta"
]},
count: { $sum: 1 }
}
},
{
$project: {
time: "$_id",
count: 1,
_id: 0
}
},
{
$sort: {
time: 1
}
}, function(err, results) {
if (err) {
cb(err)
} else {
// really hacky way
var numOfTimeBlocks = ( end - start ) / timeBlock
// in case there is no 0s in the given period of time there is no need
// to iterate through all of the results
if ( results.length === numOfTimeBlocks ) {
cb(results);
} else {
var time = start;
var details = [];
var times = results.map(function(item) {
return item.time;
});
for( var i = 0; i < numOfTimeBlocks; i++) {
time += timeBlock;
var idx = times.indexOf(time);
if (idx > -1) {
details.push(results[idx]);
} else {
var documentCount = { count: 0, time: time };
details.push(documentCount);
}
}
cb(details);
}
}
})
}
I was also thinking about doing one query per time block, which gives the same result but I think is inefficient because you query the database N times.

Categories