[关闭]
@a5635268 2015-09-12T20:23:17.000000Z 字数 7449 阅读 2168

【mongoDB高级篇①】聚集运算之group,aggregate

mongoDB


group

语法

  1. db.collection.group({
  2. key:{field:1},//按什么字段进行分组
  3. initial:{count:0},//进行分组前变量初始化,该处声明的变量可以在以下回调函数中作为result的属性使用
  4. cond:{},//类似mysql中的having,分组后的查询返回
  5. reduce: function ( curr, result ) { }, //The function takes two arguments: the current document and an aggregation result document for that group.先迭代出分组,然后再迭代分组中的文档,即curr变量就代表当前分组中此刻迭代到的文档,result变量就代表当前分组。
  6. keyffunction(doc){},//keyf和key二选一,传入的参数doc代表当前文档,如果分组的字段是经过运算后的字段用到,作用类似mysql中的group by left('2015-09-12 14:05:22',10);
  7. finalize:function(result) {}//该result也就是reduce的result,都是代表当前分组,这个函数是在走完当前分组结束后回调;
  8. })

除了分组的key字段外,就只返回有result参数的回调函数中的操作的属性字段;

实例

  1. # 表结构如下
  2. {
  3. _id: ObjectId("5085a95c8fada716c89d0021"),
  4. ord_dt: ISODate("2012-07-01T04:00:00Z"),
  5. ship_dt: ISODate("2012-07-02T04:00:00Z"),
  6. item: { sku: "abc123",
  7. price: 1.99,
  8. uom: "pcs",
  9. qty: 25 }
  10. }
  1. #Example1
  2. SELECT ord_dt, item_sku
  3. FROM orders
  4. WHERE ord_dt > '01/01/2012'
  5. GROUP BY ord_dt, item_sku
  6. ↓↓↓↓
  7. db.orders.group(
  8. {
  9. key: { ord_dt: 1, 'item.sku': 1 },
  10. cond: { ord_dt: { $gt: new Date( '01/01/2012' ) } },
  11. reduce: function ( curr, result ) { },
  12. initial: { }
  13. }
  14. )
  15. #Example2
  16. SELECT ord_dt, item_sku, SUM(item_qty) as total
  17. FROM orders
  18. WHERE ord_dt > '01/01/2012'
  19. GROUP BY ord_dt, item_sku
  20. ↓↓↓↓
  21. db.orders.group(
  22. {
  23. key: { ord_dt: 1, 'item.sku': 1 },
  24. cond: { ord_dt: { $gt: new Date( '01/01/2012' ) } },
  25. reduce: function( curr, result ) {
  26. result.total += curr.item.qty;
  27. },
  28. initial: { total : 0 }
  29. }
  30. )
  31. #Example3
  32. db.orders.group(
  33. {
  34. keyf: function(doc) {
  35. return { day_of_week: doc.ord_dt.getDay() };
  36. },
  37. cond: { ord_dt: { $gt: new Date( '01/01/2012' ) } },
  38. reduce: function( curr, result ) {
  39. result.total += curr.item.qty;
  40. result.count++;
  41. },
  42. initial: { total : 0, count: 0 },
  43. finalize: function(result) {
  44. var weekdays = [
  45. "Sunday", "Monday", "Tuesday",
  46. "Wednesday", "Thursday",
  47. "Friday", "Saturday"
  48. ];
  49. result.day_of_week = weekdays[result.day_of_week];
  50. result.avg = Math.round(result.total / result.count);
  51. }
  52. }
  53. )
  54. [
  55. { "day_of_week" : "Sunday", "total" : 70, "count" : 4, "avg" : 18 },
  56. { "day_of_week" : "Friday", "total" : 110, "count" : 6, "avg" : 18 },
  57. { "day_of_week" : "Tuesday", "total" : 70, "count" : 3, "avg" : 23 }
  58. ]

工作中用到的实例

  1. #查询每个栏目最贵的商品价格, max()操作
  2. {
  3. key:{cat_id:1},
  4. cond:{},
  5. reduce:function(curr , result) {
  6. if(curr.shop_price > result.max) {
  7. result.max = curr.shop_price;
  8. }
  9. },
  10. initial:{max:0}
  11. }
  12. #查询每个栏目下商品的平均价格
  13. {
  14. key:{cat_id:1},
  15. cond:{},
  16. reduce:function(curr , result) {
  17. result.cnt += 1;
  18. result.sum += curr.shop_price;
  19. },
  20. initial:{sum:0,cnt:0},
  21. finalize:function(result) {
  22. result.avg = result.sum/result.cnt; //在每次分组完毕后进行运算
  23. }
  24. }

group其实略微有点鸡肋,因为既然用到了mongodb,那复制集和分片是避无可免的,而group是不支持分片的运算

Aggregation

聚合管道是一个基于数据处理管道概念的框架。通过使用一个多阶段的管道,将一组文档转换为最终的聚合结果。

aggregation-pipeline.png-78.1kB

语法

参考手册: http://docs.mongoing.com/manual-zh/core/aggregation-pipeline.html

  1. db.collection.aggregate(pipeline, options);
  2. pipeline Array
  3. # 与mysql中的字段对比说明
  4. $project # 返回哪些字段,select,说它像select其实是不太准确的,因为aggregate是一个阶段性管道操作符,$project是取出哪些数据进入下一个阶段管道操作,真正的最终数据返回还是在group等操作中;
  5. $match # 放在group前相当于where使用,放在group后面相当于having使用
  6. $sort # 排序1升-1降 sort一般放在group后,也就是说得到结果后再排序,如果先排序再分组没什么意义;
  7. $limit # 相当于limit m,不能设置偏移量
  8. $skip # 跳过第几个文档
  9. $unwind # 把文档中的数组元素打开,并形成多个文档,参考Example1
  10. $group: { _id: <expression>, <field1>: { <accumulator1> : <expression1> }, ... # 按什么字段分组,注意所有字段名前面都要加$,否则mongodb就为以为不加$的是普通常量,其中accumulator又包括以下几个操作符
  11. # $sum,$avg,$first,$last,$max,$min,$push,$addToSet
  12. #如果group by null就是 count(*)的效果
  13. $geoNear # 取某一点的最近或最远,在LBS地理位置中有用
  14. $out # 把结果写进新的集合中。注意1,不能写进一个分片集合中。注意2,不能写进

实例

Example1: unwind

  1. > db.test.insert({ "_id" : 1, "item" : "ABC1", sizes: [ "S", "M", "L"] });
  2. WriteResult({ "nInserted" : 1 })
  3. > db.test.aggregate( [ { $unwind : "$sizes" } ] )
  4. { "_id" : 1, "item" : "ABC1", "sizes" : "S" }
  5. { "_id" : 1, "item" : "ABC1", "sizes" : "M" }
  6. { "_id" : 1, "item" : "ABC1", "sizes" : "L" }
  7. db.test.insert({ "_id" : 2, "item" : "ABC1", sizes: [ "S", "M", "L",["XXL",'XL']] });
  8. WriteResult({ "nInserted" : 1 })
  9. > db.test.aggregate( [ { $unwind : "$sizes" } ] )
  10. { "_id" : 1, "item" : "ABC1", "sizes" : "S" }
  11. { "_id" : 1, "item" : "ABC1", "sizes" : "M" }
  12. { "_id" : 1, "item" : "ABC1", "sizes" : "L" }
  13. { "_id" : 2, "item" : "ABC1", "sizes" : "S" }
  14. { "_id" : 2, "item" : "ABC1", "sizes" : "M" }
  15. { "_id" : 2, "item" : "ABC1", "sizes" : "L" }
  16. { "_id" : 2, "item" : "ABC1", "sizes" : [ "XXL", "XL" ] } # 只能打散一维数组

Example2

  1. #数据源
  2. { "_id" : 1, "item" : "abc", "price" : 10, "quantity" : 2, "date" : ISODate("2014-03-01T08:00:00Z") }
  3. { "_id" : 2, "item" : "jkl", "price" : 20, "quantity" : 1, "date" : ISODate("2014-03-01T09:00:00Z") }
  4. { "_id" : 3, "item" : "xyz", "price" : 5, "quantity" : 10, "date" : ISODate("2014-03-15T09:00:00Z") }
  5. { "_id" : 4, "item" : "xyz", "price" : 5, "quantity" : 20, "date" : ISODate("2014-04-04T11:21:39.736Z") }
  6. { "_id" : 5, "item" : "abc", "price" : 10, "quantity" : 10, "date" : ISODate("2014-04-04T21:23:13.331Z") }
  7. # 综合示例
  8. db.sales.aggregate([
  9. # 由上到下,分阶段的进行,注意该数组中的顺序是有意义的
  10. {
  11. $project:{item:1,price:1,quantity:1} # 1.取出什么元素待操作;
  12. },
  13. {
  14. $group:{ # 2. 对已取出的元素进行聚合运算;
  15. _id:"$item", # 根据什么来分组
  16. quantityCount:{$sum:'$quantity'},
  17. priceTotal:{$sum:'$price'}
  18. }
  19. },
  20. {
  21. $sort:{
  22. quantityCount:1 #3.升序
  23. }
  24. },
  25. # 4.基于上面的结果,取倒数第二名
  26. {
  27. $skip: 2
  28. },
  29. {
  30. $limit:1
  31. },
  32. # 5.然后把结果写到result集合中
  33. {
  34. $out:'result'
  35. }
  36. ])
  37. #表达式$month,$dayOfMonth,$year,$sum,$avg
  38. db.sales.aggregate(
  39. [
  40. {
  41. $group : {
  42. _id : { month: { $month: "$date" }, day: { $dayOfMonth: "$date" }, year: { $year: "$date" } }, #按月日年分组
  43. totalPrice: { $sum: { $multiply: [ "$price", "$quantity" ] } },
  44. averageQuantity: { $avg: "$quantity" },
  45. count: { $sum: 1 }
  46. }
  47. }
  48. ]
  49. )
  50. #结果
  51. { "_id" : { "month" : 3, "day" : 15, "year" : 2014 }, "totalPrice" : 50, "averageQuantity" : 10, "count" : 1 }
  52. { "_id" : { "month" : 4, "day" : 4, "year" : 2014 }, "totalPrice" : 200, "averageQuantity" : 15, "count" : 2 }
  53. { "_id" : { "month" : 3, "day" : 1, "year" : 2014 }, "totalPrice" : 40, "averageQuantity" : 1.5, "count" : 2 }
  54. #
  55. #
  56. # 表达式$push
  57. db.sales.aggregate(
  58. [
  59. {
  60. $group:
  61. {
  62. _id: { day: { $dayOfYear: "$date"}, year: { $year: "$date" } },
  63. itemsSold: { $push: { item: "$item", quantity: "$quantity" } }
  64. }
  65. }
  66. ]
  67. )
  68. # result
  69. {
  70. "_id" : { "day" : 46, "year" : 2014 },
  71. "itemsSold" : [
  72. { "item" : "abc", "quantity" : 10 },
  73. { "item" : "xyz", "quantity" : 10 },
  74. { "item" : "xyz", "quantity" : 5 },
  75. { "item" : "xyz", "quantity" : 10 }
  76. ]
  77. }
  78. {
  79. "_id" : { "day" : 34, "year" : 2014 },
  80. "itemsSold" : [
  81. { "item" : "jkl", "quantity" : 1 },
  82. { "item" : "xyz", "quantity" : 5 }
  83. ]
  84. }
  85. {
  86. "_id" : { "day" : 1, "year" : 2014 },
  87. "itemsSold" : [ { "item" : "abc", "quantity" : 2 } ]
  88. }
  89. #
  90. #
  91. # 表达式$addToSet
  92. db.sales.aggregate(
  93. [
  94. {
  95. $group:
  96. {
  97. _id: { day: { $dayOfYear: "$date"}, year: { $year: "$date" } },
  98. itemsSold: { $addToSet: "$item" }
  99. }
  100. }
  101. ]
  102. )
  103. #result
  104. { "_id" : { "day" : 46, "year" : 2014 }, "itemsSold" : [ "xyz", "abc" ] }
  105. { "_id" : { "day" : 34, "year" : 2014 }, "itemsSold" : [ "xyz", "jkl" ] }
  106. { "_id" : { "day" : 1, "year" : 2014 }, "itemsSold" : [ "abc" ] }
  107. #
  108. #
  109. # 表达式 $first
  110. db.sales.aggregate(
  111. [
  112. { $sort: { item: 1, date: 1 } },
  113. {
  114. $group:
  115. {
  116. _id: "$item",
  117. firstSalesDate: { $first: "$date" }
  118. }
  119. }
  120. ]
  121. )
  122. # result
  123. { "_id" : "xyz", "firstSalesDate" : ISODate("2014-02-03T09:05:00Z") }
  124. { "_id" : "jkl", "firstSalesDate" : ISODate("2014-02-03T09:00:00Z") }
  125. { "_id" : "abc", "firstSalesDate" : ISODate("2014-01-01T08:00:00Z") }

Example3

  1. db.sales.aggregate(
  2. [
  3. {
  4. $group : {
  5. _id : null, # 如果为null,就统计出全部
  6. totalPrice: { $sum: { $multiply: [ "$price", "$quantity" ] } },
  7. averageQuantity: { $avg: "$quantity" },
  8. count: { $sum: 1 }
  9. }
  10. }
  11. ]
  12. )

Example4

  1. # 数据源
  2. { "_id" : 8751, "title" : "The Banquet", "author" : "Dante", "copies" : 2 }
  3. { "_id" : 8752, "title" : "Divine Comedy", "author" : "Dante", "copies" : 1 }
  4. { "_id" : 8645, "title" : "Eclogues", "author" : "Dante", "copies" : 2 }
  5. { "_id" : 7000, "title" : "The Odyssey", "author" : "Homer", "copies" : 10 }
  6. { "_id" : 7020, "title" : "Iliad", "author" : "Homer", "copies" : 10 }
  7. # 根据作者分组,获得其著多少书籍
  8. db.books.aggregate(
  9. [
  10. { $group : { _id : "$author", books: { $push: "$title" } } }
  11. ]
  12. )
  13. # result
  14. { "_id" : "Homer", "books" : [ "The Odyssey", "Iliad" ] }
  15. { "_id" : "Dante", "books" : [ "The Banquet", "Divine Comedy", "Eclogues" ] }
  16. # 通过系统变量$$ROOT(当前的根文档)来分组
  17. db.books.aggregate(
  18. [
  19. { $group : { _id : "$author", books: { $push: "$$ROOT" } } }
  20. ]
  21. )
  22. # result
  23. {
  24. "_id" : "Homer",
  25. "books" :
  26. [
  27. { "_id" : 7000, "title" : "The Odyssey", "author" : "Homer", "copies" : 10 },
  28. { "_id" : 7020, "title" : "Iliad", "author" : "Homer", "copies" : 10 }
  29. ]
  30. }
  31. {
  32. "_id" : "Dante",
  33. "books" :
  34. [
  35. { "_id" : 8751, "title" : "The Banquet", "author" : "Dante", "copies" : 2 },
  36. { "_id" : 8752, "title" : "Divine Comedy", "author" : "Dante", "copies" : 1 },
  37. { "_id" : 8645, "title" : "Eclogues", "author" : "Dante", "copies" : 2 }
  38. ]
  39. }

邮政编码数据集的聚合实例: http://docs.mongoing.com/manual-zh/tutorial/aggregation-zip-code-data-set.html

对用户爱好数据做聚合实例:
http://docs.mongoing.com/manual-zh/tutorial/aggregation-with-user-preference-data.html

添加新批注
在作者公开此批注前,只有你和作者可见。
回复批注