mongodb联合索引
1. 需求
- 对mongo的数据进行分类,比如按照type进行分类;比如type的value: v1,v2,v3
- 需要支持按照主键进行排序输出;主要是因为要支持列举
2. 索引方案
// 为type和_id来创建联合索引,type放在前面
db.v4.createIndex({type:1, _id:1})
3. 实际效果
数据分布:
- 总量为:1000w
- type=v1: 100w
- type=v2: 600w
- type=v3: 300w
3.1 查询type=v1并且按照_id进行排序
查询语句:
db.v4.find({type:"v1"}).sort({type:1,_id:1}).explain("allPlansExecution")
执行计划和执行详情
"winningPlan" : {
"stage" : "FETCH",
"inputStage" : {
"stage" : "IXSCAN",
"keyPattern" : {
"type" : 1,
"_id" : 1
},
"indexName" : "type_1__id_1",
"isMultiKey" : false,
"multiKeyPaths" : {
"type" : [ ],
"_id" : [ ]
},
"isUnique" : false,
"isSparse" : false,
"isPartial" : false,
"indexVersion" : 2,
"direction" : "forward",
"indexBounds" : {
"type" : [
"[\"v1\", \"v1\"]"
],
"_id" : [
"[MinKey, MaxKey]"
]
}
}
}
"executionStats" : {
"executionSuccess" : true,
"nReturned" : 1000000,
"executionTimeMillis" : 3990,
"totalKeysExamined" : 1000000,
"totalDocsExamined" : 1000000,
"executionStages" : {
"stage" : "FETCH",
"nReturned" : 1000000,
"executionTimeMillisEstimate" : 3874,
"works" : 1000001,
"advanced" : 1000000,
"needTime" : 0,
"needYield" : 0,
"saveState" : 7829,
"restoreState" : 7829,
"isEOF" : 1,
"invalidates" : 0,
"docsExamined" : 1000000,
"alreadyHasObj" : 0,
"inputStage" : {
"stage" : "IXSCAN",
"nReturned" : 1000000,
"executionTimeMillisEstimate" : 767,
"works" : 1000001,
"advanced" : 1000000,
"needTime" : 0,
"needYield" : 0,
"saveState" : 7829,
"restoreState" : 7829,
"isEOF" : 1,
"invalidates" : 0,
"keyPattern" : {
"type" : 1,
"_id" : 1
},
"indexName" : "type_1__id_1",
"isMultiKey" : false,
"multiKeyPaths" : {
"type" : [ ],
"_id" : [ ]
},
"isUnique" : false,
"isSparse" : false,
"isPartial" : false,
"indexVersion" : 2,
"direction" : "forward",
"indexBounds" : {
"type" : [
"[\"v1\", \"v1\"]"
],
"_id" : [
"[MinKey, MaxKey]"
]
},
"keysExamined" : 1000000,
"seeks" : 1,
"dupsTested" : 0,
"dupsDropped" : 0,
"seenInvalidated" : 0
}
},
"allPlansExecution" : [ ]
},
结论: 通过type + _id组合索引可以达到我们的需求,排序本身在遍历的过程中就已经完成,不需要二次排序
3.2 查询type=v1并且按照_id逆序
查询语句
//这边有两种方式来进行查询,区别在于type和_id的排序顺序问题
db.v4.find({type:"v1"}).sort({type:1,_id:-1}).explain("allPlansExecution")
db.v4.find({type:"v1"}).sort({type:-1,_id:-1}).explain("allPlansExecution")
执行计划
// plan-1
"winningPlan" : {
"stage" : "SORT",
"sortPattern" : {
"type" : 1,
"_id" : -1
},
"inputStage" : {
"stage" : "SORT_KEY_GENERATOR",
"inputStage" : {
"stage" : "FETCH",
"inputStage" : {
"stage" : "IXSCAN",
"keyPattern" : {
"type" : 1,
"_id" : 1
},
"indexName" : "type_1__id_1",
"isMultiKey" : false,
"multiKeyPaths" : {
"type" : [ ],
"_id" : [ ]
},
"isUnique" : false,
"isSparse" : false,
"isPartial" : false,
"indexVersion" : 2,
"direction" : "forward",
"indexBounds" : {
"type" : [
"[\"v1\", \"v1\"]"
],
"_id" : [
"[MinKey, MaxKey]"
]
}
}
}
}
}
// plan-2
"winningPlan" : {
"stage" : "FETCH",
"inputStage" : {
"stage" : "IXSCAN",
"keyPattern" : {
"type" : 1,
"_id" : 1
},
"indexName" : "type_1__id_1",
"isMultiKey" : false,
"multiKeyPaths" : {
"type" : [ ],
"_id" : [ ]
},
"isUnique" : false,
"isSparse" : false,
"isPartial" : false,
"indexVersion" : 2,
"direction" : "backward",
"indexBounds" : {
"type" : [
"[\"v1\", \"v1\"]"
],
"_id" : [
"[MaxKey, MinKey]"
]
}
}
},
结论: type和_id的排序方向需要保持一致(同为正序或同为逆序),这样才能直接利用索引的顺序(plan-2),否则会多出一个SORT阶段(plan-1);在我们的场景中type其实是确定的,所以只要查询语句里两个字段的排序方向写成一致,就是符合要求的
3.3 type=v1 && _id > xxx
这个查询的需求来自list请求:每个请求会带上_id,来保证下一次请求可以接着上次的位置继续查询
查询语句
db.v4.find({type:"v1",_id:{"$gte":"0c82507c-d67e-11ed-8157-6c92bf361376"}}).sort({type:1,_id:1}).explain()
执行计划
"winningPlan" : {
"stage" : "FETCH",
"inputStage" : {
"stage" : "IXSCAN",
"keyPattern" : {
"type" : 1,
"_id" : 1
},
"indexName" : "type_1__id_1",
"isMultiKey" : false,
"multiKeyPaths" : {
"type" : [ ],
"_id" : [ ]
},
"isUnique" : false,
"isSparse" : false,
"isPartial" : false,
"indexVersion" : 2,
"direction" : "forward",
"indexBounds" : {
"type" : [
"[\"v1\", \"v1\"]"
],
"_id" : [
"[\"0c82507c-d67e-11ed-8157-6c92bf361376\", {})"
]
}
}
},
"rejectedPlans" : [
{
"stage" : "SORT",
"sortPattern" : {
"type" : 1,
"_id" : 1
},
"inputStage" : {
"stage" : "SORT_KEY_GENERATOR",
"inputStage" : {
"stage" : "FETCH",
"filter" : {
"type" : {
"$eq" : "v1"
}
},
"inputStage" : {
"stage" : "IXSCAN",
"keyPattern" : {
"_id" : 1
},
"indexName" : "_id_",
"isMultiKey" : false,
"multiKeyPaths" : {
"_id" : [ ]
},
"isUnique" : true,
"isSparse" : false,
"isPartial" : false,
"indexVersion" : 2,
"direction" : "forward",
"indexBounds" : {
"_id" : [
"[\"0c82507c-d67e-11ed-8157-6c92bf361376\", {})"
]
}
}
}
}
}
]
//这边出现了另外一个执行计划,想走_id索引,然后过滤type,再排序; 不过从最终执行计划的得分发现,这种性能会比较差
//这个是被拒绝的计划的得分
score(1.0002) = baseScore(1) + productivity((0 advanced)/(101 works) = 0) + tieBreakers(0.0001 noFetchBonus + 0 noSortBonus + 0.0001 noIxisectBonus = 0.0002)
//这个是走联合索引的得分;在我们的场景中这个索引的得分一定会更高,因为底层得到的数据就是有效数据并且已经排好序
score(2.0003) = baseScore(1) + productivity((101 advanced)/(101 works) = 1) + tieBreakers(0.0001 noFetchBonus + 0.0001 noSortBonus + 0.0001 noIxisectBonus = 0.0003)
结论: 符合要求
3.4 type=v1 && _id按照某一个前缀
查询语句
db.v4.find({type:"v1",_id:{"$regex":"^0c82507c"}}).sort({type:1,_id:1}).explain()
执行计划
"winningPlan" : {
"stage" : "FETCH",
"inputStage" : {
"stage" : "IXSCAN",
"keyPattern" : {
"type" : 1,
"_id" : 1
},
"indexName" : "type_1__id_1",
"isMultiKey" : false,
"multiKeyPaths" : {
"type" : [ ],
"_id" : [ ]
},
"isUnique" : false,
"isSparse" : false,
"isPartial" : false,
"indexVersion" : 2,
"direction" : "forward",
"indexBounds" : {
"type" : [
"[\"v1\", \"v1\"]"
],
"_id" : [
"[\"0c82507c\", \"0c82507d\")",
"[/^0c82507c/, /^0c82507c/]"
]
}
}
}
结论:符合预期,场景可能想找某一个共同前缀的信息吧;
4. 联合索引的存储成本
hd_listexporter_shard1:PRIMARY> db.v4.stats()
{
"ns" : "test.v4",
"size" : 850000000,
"count" : 10000000,
"avgObjSize" : 85,
"storageSize" : 849999872,
"capped" : false,
"nindexes" : 2,
"totalIndexSize" : 1371037969,
"indexSizes" : {
"_id_" : 430000000,
"type_1__id_1" : 941037969
},
"ok" : 1
}
新加的联合索引目前是_id_索引的两倍多;在现实场景中,这个比例可能会随着type本身的长度不同而有偏差
5. 结论
联合索引在我看来,其实本质上就是将type + _id两个字段按照某一种组合再次创建索引的过程,而且联合索引本身是有顺序的,比如对A,B两个字段创建了联合索引,你单独查询B是不能命中该索引的,需要单独对B创建索引来解决这样的问题。不过看到最终的存储成本,发现远比我想象中的要大一些,先留个疑问,下次看源码的时候再来解释。