魏长东

weichangdong

东邪

mongodb实现多列的group

一、mongodb实现多列的group(方法1)。
类似如下SQL:
select PhoneId, hour, count(*)
from tianyc_test8
where PhoneId in ('xx','yy','zz')
group by PhoneId, hour;


1. 构建测试数据
在集合tianyc_test8中,每个PhoneId在每小时有多条数据,下面统计每个PhoneId在每小时的记录数。
> db.tianyc_test8.find()
{ "_id" : ObjectId("50d0a5bdd03"), "PhoneId" : "460001201624570", "Time" : ISODate("2012-12-18T17:21:11Z") }
{ "_id" : ObjectId("50d0a5bdd0c"), "PhoneId" : "460001201521818", "Time" : ISODate("2012-12-18T17:21:12Z") }
{ "_id" : ObjectId("50d0a19194a"), "PhoneId" : "460001204101466", "Time" : ISODate("2012-12-18T17:23:01Z") }
{ "_id" : ObjectId("50d0a5bdd11"), "PhoneId" : "460001202749862", "Time" : ISODate("2012-12-18T17:19:43Z") }
{ "_id" : ObjectId("50d0a5bdd13"), "PhoneId" : "460001202749862", "Time" : ISODate("2012-12-18T17:21:13Z") }
{ "_id" : ObjectId("50d0a5bdd14"), "PhoneId" : "460001202749919", "Time" : ISODate("2012-12-18T17:21:14Z") }
{ "_id" : ObjectId("50d0a5bdd17"), "PhoneId" : "460001202445292", "Time" : ISODate("2012-12-18T17:21:14Z") }
{ "_id" : ObjectId("50d0a5bdd20"), "PhoneId" : "460001201829623", "Time" : ISODate("2012-12-18T17:21:15Z") }
{ "_id" : ObjectId("50d0a5bdd22"), "PhoneId" : "460001202441775", "Time" : ISODate("2012-12-18T17:21:15Z") }
{ "_id" : ObjectId("50d0a5bdd25"), "PhoneId" : "460001202441680", "Time" : ISODate("2012-12-18T17:21:14Z") }
{ "_id" : ObjectId("50d0a5bdd2a"), "PhoneId" : "460001204608783", "Time" : ISODate("2012-12-18T17:21:15Z") }
{ "_id" : ObjectId("50d0a5bdd38"), "PhoneId" : "460001202747843", "Time" : ISODate("2012-12-18T17:21:18Z") }
{ "_id" : ObjectId("50d0a5bdd3b"), "PhoneId" : "460001201624758", "Time" : ISODate("2012-12-18T17:21:18Z") }
{ "_id" : ObjectId("50d0a5bdd3d"), "PhoneId" : "460001201930213", "Time" : ISODate("2012-12-18T17:21:19Z") }
{ "_id" : ObjectId("50d0a2f1e84"), "PhoneId" : "460001203762426", "Time" : ISODate("2012-12-18T17:23:50Z") }
{ "_id" : ObjectId("50d0a0f0852"), "PhoneId" : "460001201624766", "Time" : ISODate("2012-12-18T17:23:05Z") }
{ "_id" : ObjectId("50d0a5bdd47"), "PhoneId" : "460001204101332", "Time" : ISODate("2012-12-18T17:20:54Z") }
{ "_id" : ObjectId("50d0c302f99"), "PhoneId" : "460001201624784", "Time" : ISODate("2012-12-18T19:36:16Z") }
{ "_id" : ObjectId("50d0c100588"), "PhoneId" : "460001204305245", "Time" : ISODate("2012-12-18T19:35:27Z") }
{ "_id" : ObjectId("50d0c100591"), "PhoneId" : "460001204305240", "Time" : ISODate("2012-12-18T19:35:30Z") }
has more
2. 编写map函数,将需要groupby的两列使用“-”拼接在一起,作为key。
 

 var m = function(){
var hour=String(this.Time.getHours());
if(hour.length==1) {
hour='0'+hour
};
key=this.PhoneId+'-'+hour;
emit(key,{count:1});
}

3. 编写reduce函数,将输出结果进行拆分。
 

 var r = function (key,values) {
var total = 0;
var locate = key.indexOf('-');
for (var i=0; i
total += values[i].count;
}
return {PhoneId:key.substr(0,locate), hour:key.substr(locate+1,2), count : total};
};

4. 进行mapreduce,加入查询条件
 

var res = db.tianyc_test8.mapReduce(
m,
r,
{query:{PhoneId:{$in:["460001203762426","460001201521818","460001202749919"]}},
out:{replace:'tianyc_mr_Result1'}}
)

5. 查看输出结果
> db.tianyc_mr_Result1.find()
{ "_id" : "460001201521818-00", "value" : { "PhoneId" : "460001201521818", "hour" : "00", "count" : 112 } }
{ "_id" : "460001201521818-01", "value" : { "PhoneId" : "460001201521818", "hour" : "01", "count" : 65 } }
{ "_id" : "460001201521818-02", "value" : { "PhoneId" : "460001201521818", "hour" : "02", "count" : 52 } }
{ "_id" : "460001201521818-03", "value" : { "PhoneId" : "460001201521818", "hour" : "03", "count" : 98 } }
{ "_id" : "460001201521818-04", "value" : { "PhoneId" : "460001201521818", "hour" : "04", "count" : 112 } }
{ "_id" : "460001201521818-05", "value" : { "PhoneId" : "460001201521818", "hour" : "05", "count" : 112 } }
{ "_id" : "460001201521818-06", "value" : { "PhoneId" : "460001201521818", "hour" : "06", "count" : 111 } }
{ "_id" : "460001201521818-07", "value" : { "PhoneId" : "460001201521818", "hour" : "07", "count" : 111 } }
{ "_id" : "460001201521818-08", "value" : { "PhoneId" : "460001201521818", "hour" : "08", "count" : 105 } }
{ "_id" : "460001201521818-09", "value" : { "PhoneId" : "460001201521818", "hour" : "09", "count" : 114 } }
{ "_id" : "460001201521818-10", "value" : { "PhoneId" : "460001201521818", "hour" : "10", "count" : 107 } }
{ "_id" : "460001201521818-11", "value" : { "PhoneId" : "460001201521818", "hour" : "11", "count" : 115 } }
{ "_id" : "460001201521818-12", "value" : { "PhoneId" : "460001201521818", "hour" : "12", "count" : 89 } }
{ "_id" : "460001201521818-13", "value" : { "PhoneId" : "460001201521818", "hour" : "13", "count" : 114 } }
{ "_id" : "460001201521818-14", "value" : { "PhoneId" : "460001201521818", "hour" : "14", "count" : 114 } }
{ "_id" : "460001201521818-15", "value" : { "PhoneId" : "460001201521818", "hour" : "15", "count" : 115 } }
{ "_id" : "460001201521818-16", "value" : { "PhoneId" : "460001201521818", "hour" : "16", "count" : 115 } }
{ "_id" : "460001201521818-17", "value" : { "PhoneId" : "460001201521818", "hour" : "17", "count" : 112 } }
{ "_id" : "460001201521818-18", "value" : { "PhoneId" : "460001201521818", "hour" : "18", "count" : 115 } }
{ "_id" : "460001201521818-19", "value" : { "PhoneId" : "460001201521818", "hour" : "19", "count" : 70 } }

二、实现多列的group by操作(方法2)。
在map函数中,将key设置为内嵌的集合。此时MR过程与单个列的group by相同。
 

var m = function(){
var hour=String(this.Time.getHours());
if(hour.length==1) {hour='0'+hour};
key={PhoneId:this.PhoneId,hour:hour};
emit(key,{count:1});
}
 var r = function (key,values) {
var total = 0;
for (var i=0; i
total += values[i].count;
}
return {count : total};
};
var res = db.tianyc_test8.mapReduce(
m,
r,
{query:{PhoneId:{$in:["460001203762426","460001201521818","460001202749919"]}},
out:{replace:'tianyc_mr_Result1'}}
)


> db.tianyc_mr_Result1.find()
{ "_id" : { "PhoneId" : "460001201521818", "hour" : "00" }, "value" : { "count" : 112 } }
{ "_id" : { "PhoneId" : "460001201521818", "hour" : "01" }, "value" : { "count" : 65 } }
{ "_id" : { "PhoneId" : "460001201521818", "hour" : "02" }, "value" : { "count" : 52 } }
{ "_id" : { "PhoneId" : "460001201521818", "hour" : "03" }, "value" : { "count" : 98 } }
{ "_id" : { "PhoneId" : "460001201521818", "hour" : "04" }, "value" : { "count" : 112 } }
{ "_id" : { "PhoneId" : "460001201521818", "hour" : "05" }, "value" : { "count" : 112 } }
{ "_id" : { "PhoneId" : "460001201521818", "hour" : "06" }, "value" : { "count" : 111 } }
{ "_id" : { "PhoneId" : "460001201521818", "hour" : "07" }, "value" : { "count" : 111 } }
{ "_id" : { "PhoneId" : "460001201521818", "hour" : "08" }, "value" : { "count" : 105 } }
{ "_id" : { "PhoneId" : "460001201521818", "hour" : "09" }, "value" : { "count" : 114 } }
{ "_id" : { "PhoneId" : "460001201521818", "hour" : "10" }, "value" : { "count" : 107 } }
{ "_id" : { "PhoneId" : "460001201521818", "hour" : "11" }, "value" : { "count" : 115 } }
{ "_id" : { "PhoneId" : "460001201521818", "hour" : "12" }, "value" : { "count" : 89 } }
{ "_id" : { "PhoneId" : "460001201521818", "hour" : "13" }, "value" : { "count" : 114 } }
{ "_id" : { "PhoneId" : "460001201521818", "hour" : "14" }, "value" : { "count" : 114 } }
{ "_id" : { "PhoneId" : "460001201521818", "hour" : "15" }, "value" : { "count" : 115 } }
{ "_id" : { "PhoneId" : "460001201521818", "hour" : "16" }, "value" : { "count" : 115 } }
{ "_id" : { "PhoneId" : "460001201521818", "hour" : "17" }, "value" : { "count" : 112 } }
{ "_id" : { "PhoneId" : "460001201521818", "hour" : "18" }, "value" : { "count" : 115 } }
{ "_id" : { "PhoneId" : "460001201521818", "hour" : "19" }, "value" : { "count" : 70 } }
has more
三、实现多列的group by having操作。
类似如下SQL:
select PhoneId, hour, count(*)
from tianyc_test8
where PhoneId in ('xx','yy','zz')
group by PhoneId, hour having count(*)>=100;


> var m = function(){
var hour=String(this.Time.getHours());
if(hour.length==1) {
hour='0'+hour
};
key=this.PhoneId+'-'+hour;
emit(key,{count:1});
}
3. 编写reduce函数,将输出结果进行拆分。
> var r = function (key,values) {
var total = 0;
var locate = key.indexOf('-');
for (var i=0; i
total += values[i].count;
}
return {PhoneId:key.substr(0,locate), hour:key.substr(locate+1,2), count : total};
};

 

1. 使用第一种group by方法进行测试。map和reduce函数不变,增加finalize函数,设置指示列get。
> var f = function(key, rvalues){
if (rvalues.count>=100){rvalues.get='true';}
else {rvalues.get='false'};
return rvalues;
}
2. 使用runCommand来执行mapredurce。
> var res = db.runCommand({
mapreduce:'tianyc_test8',
map:m,
reduce:r,
finalize:f,
query:{PhoneId:{$in:["460001203762426","460001201521818","460001202749919"]}},
out:'tianyc_mr_Result1'
}
)
3. 查询结果。
使用get列来过滤需要的数据,并隐藏该列:
> db.tianyc_mr_Result1.find({'value.get':'true'},{'value.get':0})
{ "_id" : "460001201521818-00", "value" : { "PhoneId" : "460001201521818", "hour" : "00", "count" : 112 } }
{ "_id" : "460001201521818-04", "value" : { "PhoneId" : "460001201521818", "hour" : "04", "count" : 112 } }
{ "_id" : "460001201521818-05", "value" : { "PhoneId" : "460001201521818", "hour" : "05", "count" : 112 } }
{ "_id" : "460001201521818-06", "value" : { "PhoneId" : "460001201521818", "hour" : "06", "count" : 111 } }
{ "_id" : "460001201521818-07", "value" : { "PhoneId" : "460001201521818", "hour" : "07", "count" : 111 } }
{ "_id" : "460001201521818-08", "value" : { "PhoneId" : "460001201521818", "hour" : "08", "count" : 105 } }
{ "_id" : "460001201521818-09", "value" : { "PhoneId" : "460001201521818", "hour" : "09", "count" : 114 } }
{ "_id" : "460001201521818-10", "value" : { "PhoneId" : "460001201521818", "hour" : "10", "count" : 107 } }
{ "_id" : "460001201521818-11", "value" : { "PhoneId" : "460001201521818", "hour" : "11", "count" : 115 } }
{ "_id" : "460001201521818-13", "value" : { "PhoneId" : "460001201521818", "hour" : "13", "count" : 114 } }
{ "_id" : "460001201521818-14", "value" : { "PhoneId" : "460001201521818", "hour" : "14", "count" : 114 } }
{ "_id" : "460001201521818-15", "value" : { "PhoneId" : "460001201521818", "hour" : "15", "count" : 115 } }
{ "_id" : "460001201521818-16", "value" : { "PhoneId" : "460001201521818", "hour" : "16", "count" : 115 } }
{ "_id" : "460001201521818-17", "value" : { "PhoneId" : "460001201521818", "hour" : "17", "count" : 112 } }
{ "_id" : "460001201521818-18", "value" : { "PhoneId" : "460001201521818", "hour" : "18", "count" : 115 } }
{ "_id" : "460001201521818-21", "value" : { "PhoneId" : "460001201521818", "hour" : "21", "count" : 118 } }
{ "_id" : "460001201521818-22", "value" : { "PhoneId" : "460001201521818", "hour" : "22", "count" : 116 } }
{ "_id" : "460001201521818-23", "value" : { "PhoneId" : "460001201521818", "hour" : "23", "count" : 112 } }
{ "_id" : "460001202749919-00", "value" : { "PhoneId" : "460001202749919", "hour" : "00", "count" : 112 } }
{ "_id" : "460001202749919-01", "value" : { "PhoneId" : "460001202749919", "hour" : "01", "count" : 111 } }
has more
>

=================以下是我自己搞的=====================================

var m = function(){
var soft_type=this.soft_type;
key=this.pkg_name+'-'+soft_type;
emit(key,{count:1});
}
3. 编写reduce函数,将输出结果进行拆分。
var r = function (key,values) {
var total = 0;
var locate = key.indexOf('-');
for (var i=0; i<values.length; i++){
total += values[i].count;
}
return {pkg_name:key.substr(0,locate), soft_type:key.substr(locate+1,2), count:total}
};
4. 进行mapreduce,加入查询条件
var f = function(key, rvalues){
 if (rvalues.count>=1){
  return rvalues;
 }
}
2. 使用runCommand来执行mapredurce。
var res = db.runCommand({
mapreduce:'clear_info',
map:m,
reduce:r,
finalize:f,
out:'test0728'
}
)