| 2-Oct-2017 | Like this? Dislike this? Let me know |
MongoDB is document data model database and as such requires no formal schema for each document in a collection. If you are unfamiliar with these terms, just consider a document like a "row" but with rich structure and a collection as a table. Here is an example of a document as it might appear in the MongoDB command line interface:
{
"_id" : ObjectId("59d2465474d1e2d00ebebee9"),
"device":"D1",
"created": ISODate("2017-10-02T13:59:48.194Z"),
"participants": [ "A1","A8","A45" ],
"data": {
"color":"red",
"maxAmps":10,
"maxVolts":240
}
}
There are a number of design benefits of a schema-free document model but for this article we will focus on two:
{
"_id" : ObjectId("59d24a3c74d1e2d00ebebef4"),
"device" : "D1",
"ts" : ISODate("2017-10-02T14:16:28.028Z"),
"data" : { "color" : "red", "maxAmps" : 10, "maxVolts" : 240 }
}
{
"_id" : ObjectId("59d24a3c74d1e2d00ebebef5"),
"device" : "D2",
"ts" : ISODate("2017-10-02T14:16:28.028Z"),
"data" : { "color" : "green", "maxAmps" : 10, "maxVolts" : 240 }
}
{
"_id" : ObjectId("59d24a3c74d1e2d00ebebef6"),
"device" : "D3",
"ts" : ISODate("2017-10-02T14:16:28.028Z"),
"data" : { "weight" : 12, "maxAmps" : 10, "maxVolts" : 240 }
}
{
"_id" : ObjectId("59d24a3c74d1e2d00ebebef7"),
"device" : "D3",
"ts" : ISODate("2017-10-02T14:16:28.028Z"),
"data" : { "weight" : 12, "maxAmps" : 10, "maxVolts" : 120 }
}
{
"_id" : ObjectId("59d24a3c74d1e2d00ebebef8"),
"device" : "D4",
"ts" : ISODate("2017-10-02T14:16:28.028Z"),
"data" : { "weight" : 12 }
}
{
"_id" : ObjectId("59d24a3c74d1e2d00ebebef9"),
"device" : "D5",
"ts" : ISODate("2017-10-02T14:16:28.028Z"),
"data" : { "maxAmps" : 10, "temp" : 77 }
}
{
"_id" : ObjectId("59d24a3c74d1e2d00ebebefa"),
"device" : "D6",
"ts" : ISODate("2017-10-02T14:16:28.028Z"),
"data" : { "maxAmps" : 10, "temp" : 90, "fish" : "cod" }
}
{
"_id" : ObjectId("59d24a3c74d1e2d00ebebefb"),
"device" : "D7",
"ts" : ISODate("2017-10-02T14:16:28.028Z"),
"data" : { "OS" : "Linux", "language" : "Java" }
}
{
"_id" : ObjectId("59d24a3c74d1e2d00ebebefc"),
"device" : "D8",
"ts" : ISODate("2017-10-02T14:16:28.028Z"),
"data" : { "OS" : "Linux", "language" : "Python" }
}
{
"_id" : ObjectId("59d24a3c74d1e2d00ebebefd"),
"device" : "D9",
"ts" : ISODate("2017-10-02T14:16:28.028Z"),
"data" : { "OS" : "Windows", "language" : "Python" }
}
db.foo.aggregate([
{$group: {_id: null,
"distinctColor":{$addToSet: "$data.color"},
"distinctMaxAmps":{$addToSet: "$data.maxAmps"},
"distinctMaxVolts":{$addToSet: "$data.maxVolts"} }}
]);
{
"_id" : null,
"distinctColor" : [ "green", "red" ],
"distinctMaxAmps" : [ 10 ],
"distinctMaxVolts" : [ 120, 240 ]
}
// This is the data field:
> db.foo.aggregate([{$project: {_id:0, z: "$data"}} ]);
{ "z" : { "x" : "A", "y" : 1 } }
// How do we dynamically fetch keys x and y from "data"? Use $objectToArray!
> db.foo.aggregate([{$project: {_id:0, z: {$objectToArray: "$data"}}} ]);
{ "z" : [ { "k" : "x", "v" : "A" }, { "k" : "y", "v" : 1 } ] }
// ... and going in the other direction.
> db.foo.aggregate([{$project: {_id:0, z: "$data2"}} ]);
{ "z" : [ [ "crop", "corn" ], [ "amount", 113 ] ] }
// $arrayToObject supports two input forms:
// array of arrays: [ [fname1,val1], [fname2,val2], ... ]
// array of k-v objects: [ {k:"fname1",v:val1}, {k:"fname2",v:val2}, etc. ]
> db.foo.aggregate([{$project: {_id:0, z: {$arrayToObject: "$data2"}}} ]);
{ "z" : { "crop" : "corn", "amount" : 113 } }
Let's begin to answer our question by first "breaking down" the fields inside the data substructure.
db.foo.aggregate([
{$project: {x: {$objectToArray: "$data"}}}
]);
{
"_id" : ObjectId("59d253b074d1e2d00ebebf2a"),
"x" : [
{ "k" : "color", "v" : "red" },
{ "k" : "maxAmps", "v" : 10 },
{ "k" : "maxVolts", "v" : 240 }
]
}
{
"_id" : ObjectId("59d253b074d1e2d00ebebf2a"),
"x" : [
{ "k" : "color", "v" : "green" },
{ "k" : "maxAmps", "v" : 10 },
{ "k" : "maxVolts", "v" : 240 }
]
}
// etc.....
db.foo.aggregate([
{$project: {x: {$objectToArray: "$data"}}}
,{$unwind: "$x"}
]);
{
"_id" : ObjectId("59d253b074d1e2d00ebebf2a"),
"x" : { "k" : "color", "v" : "red" }
}
{
"_id" : ObjectId("59d253b074d1e2d00ebebf2a"),
"x" : { "k" : "maxAmps", "v" : 10 }
}
{
"_id" : ObjectId("59d253b074d1e2d00ebebf2a"),
"x" : { "k" : "maxVolts", "v" : 240 }
}
{
"_id" : ObjectId("59d2549f74d1e2d00ebebf35"),
"x" : { "k" : "color", "v" : "green" }
}
// etc.....
db.foo.aggregate([
{$project: {x: {$objectToArray: "$data"}}}
,{$unwind: "$x"}
,{$group: {_id: "$x.k", distinctVals: {$addToSet: "$x.v"}}}
]);
{ "_id" : "language", "distinctVals" : [ "Python", "Java" ] }
{ "_id" : "OS", "distinctVals" : [ "Windows", "Linux" ] }
{ "_id" : "temp", "distinctVals" : [ 90, 77 ] }
{ "_id" : "weight", "distinctVals" : [ 12 ] }
{ "_id" : "maxVolts", "distinctVals" : [ 120, 240 ] }
{ "_id" : "fish", "distinctVals" : [ "cod" ] }
{ "_id" : "maxAmps", "distinctVals" : [ 10 ] }
{ "_id" : "color", "distinctVals" : [ "green", "red" ] }
We can use either form of $arrayToObject to turn the RHS of each doc into an LHS:
// For k-v, must have two fields k and v only (no _id or anything else)
,{$project: {_id:0, k: "$_id", v:"$distinctVals"}} //
// Now take the resulting {k: "fname", v: value}, wrap it in an array
// (note the [] wrapping $$CURRENT) and pass to $arrayToObject:
,{$project: {xx2: {$arrayToObject: [ [ "$$CURRENT"] ] }}}
// For array of array, perhaps a little simpler because not using $$CURRENT:
,{$project: { _id:0, xx: [ [ "$_id", "$distinctVals" ] ] }}
,{$project: {xx2: {$arrayToObject: "$xx"}}}
db.foo.aggregate([
{$project: {x: {$objectToArray: "$data"}}}
,{$unwind: "$x"}
,{$group: {_id: "$x.k", distinctVals: {$addToSet: "$x.v"}}}
,{$project: { _id:0, xx: [ [ {$concat: [ "distinct_", "$_id"]}, "$distinctVals" ] ] }}
]);
{ "xx" : [ [ "distinct_language", [ "Python", "Java" ] ] ] }
{ "xx" : [ [ "distinct_OS", [ "Windows", "Linux" ] ] ] }
{ "xx" : [ [ "distinct_temp", [ 90, 77 ] ] ] }
{ "xx" : [ [ "distinct_weight", [ 12 ] ] ] }
{ "xx" : [ [ "distinct_maxVolts", [ 120, 240 ] ] ] }
{ "xx" : [ [ "distinct_fish", [ "cod" ] ] ] }
{ "xx" : [ [ "distinct_maxAmps", [ 10 ] ] ] }
{ "xx" : [ [ "distinct_color", [ "green", "red" ] ] ] }
// Note that typically we'd merge the last $project into $replaceRoot for
// efficiency and a shorter pipeline, e.g.
// ,{$replaceRoot: { newRoot: {$arrayToObject: "$xx"}}}
// but it's expanded out below so you can see what's going on.
db.foo.aggregate([
{$project: {x: {$objectToArray: "$data"}}}
,{$unwind: "$x"}
,{$group: {_id: "$x.k", distinctVals: {$addToSet: "$x.v"}}}
,{$project: { _id:0, xx: [ [ {$concat: [ "distinct_", "$_id"]}, "$distinctVals" ] ] }}
,{$project: {xx2: {$arrayToObject: "$xx"}}}
,{$replaceRoot: { newRoot: "$xx2"}}
]);
{ "distinct_language" : [ "Python", "Java" ] }
{ "distinct_OS" : [ "Windows", "Linux" ] }
{ "distinct_temp" : [ 90, 77 ] }
{ "distinct_weight" : [ 12 ] }
{ "distinct_maxVolts" : [ 120, 240 ] }
{ "distinct_fish" : [ "cod" ] }
{ "distinct_maxAmps" : [ 10 ] }
{ "distinct_color" : [ "green", "red" ] }
If you understood how $$CURRENT was used in the example above, then you should also realize that you don't necessarily need to start with a substructure (although it's always a really good idea to separate truly dynamic fields into a substructure). If your data looked like this:
{
"_id" : ObjectId("59d2465474d1e2d00ebebee9"),
"name":"buzz",
"city":"NYC",
"favoriteCuisine":"Italian"
}
{
"_id" : ObjectId("59d2465474d1e2d00ebebeea"),
"name":"buzz",
"city":"Miami",
"favoriteCuisine":"Cuban"
}
db.foo.aggregate([
{$project: {x: {$objectToArray: "$$CURRENT"}}}
,{$unwind: "$x"}
,{$match: {"x.k": {$ne: "_id"}}}
// etc....
// Note! Holding back _id will NOT work:
// {$project: {_id:0, x: {$objectToArray: "$$CURRENT"}}}
// because $objectToArray will have already operated upon $$CURRENT and
// {k:"_id", v:ObjectId("...")} is now "buried" in the output array.
Like this? Dislike this? Let me know