mongodb 一次索引调研
1. 需求
本来是有业务需求,所以开始调研mongodb的索引方案;需求主要是:为元数据添加tags信息,这样通过tags信息可以来过滤对应的信息;这次做方案的时候,自己有点一开始就被限制住,因为当时给我这个需求调研的时候,就已经给了一个粗略的方案;所以我就开始往索引的方向走,总是在思考怎么样的索引方案才是最好的,最优秀的,性能上最好的;
但是最终搞了很久才发现,不管索引如何设计,在业务场景上性能也是不可能好的,这是一个不可能的任务的,而且在分析竞品的产品的时候,发现对方在这个场景上根本就不需要索引,所以我做了半个月的方案从一开始就没用。不过竟然做了分析,我就总结一下;
需求:支持用户数据按照标签能查询的功能;比如用户数据可以添加tags:A,B,后期可以根据tags 包含A这种请求的方法来查询到对应的数据;
为什么这个功能最终实现不了呢?
db.user.find({tags:{$all:["A"]}'}) 从mongodb的角度来说,通过cursor来接连获得数据;但是但是当用户通过http请求来list数据的时候,就不行了;因为每个请求都是独立的,没办法对接到cursor上去,即使我们最终把cursor对接上了,如果cursor失效了,那后面怎么样才能续上;正常情况下的话可能需要按照某一个字段排序,然后下次一个请求加上_id > xxx这样的方式来进行紧接上面的查询,但是这样的话就需要对_id排序,这样最终的性能即使对tags进行了索引,最终也会因为要sort而变差很多很多;
其实后来思考,在数据库领域做分页其实的确不好做;尤其是当数据量极大的情况下,需要按照场景来进行索引优化,但是代价基本都是空间换时间的。
2. 索引设计
总共设计了三个:
db.v1.createIndex({itbl:1, "tags.k":1, "tags.v":1})
db.v2.createIndex({itbl:1, tags:1})
db.v3.createIndex({tags:1})
//v1的数据:
{
"_id":"f3aa3be7-d524-11ed-9a6a-6c92bf361376",
"itbl":"123456",
"tags":[
{
"k":"0c1",
"v":"d1"
},
{
"k":"1f1",
"v":"d1"
},
{
"k":"2b1",
"v":"e1"
}
]
}
// v2数据
{
"_id":"f3aa3273-d524-11ed-9a6a-6c92bf361376",
"itbl":"123456",
"tags":[
"0f1@e1",
"1b1@b1",
"2b1@b1"
]
}
// v3数据
{
"_id":"f3aa3273-d524-11ed-9a6a-6c92bf361376",
"tags":[
"123456@0f1@e1",
"123456@1b1@b1",
"123456@2b1@b1"
]
}
其实第一个索引和第二个索引本质上是差不多的,都属于联合索引,而第三个索引属于multikey index;
3. 相同查询不同索引的执行计划
3.1 指定itbl|tagk|tagv查询
db.v1.find({"itbl":"123456","tags":{$elemMatch:{"k":"0f1","v":"e1"}}}).explain("executionStats")
// 看执行计划,命中了索引
{
"executionStats":{
"executionSuccess":true,
"nReturned":27759,
"executionTimeMillis":444,
"totalKeysExamined":27759,
"totalDocsExamined":27759,
"executionStages":{
"stage":"FETCH",
"filter":{
"tags":{
"$elemMatch":{
"$and":[
{
"k":{
"$eq":"0f1"
}
},
{
"v":{
"$eq":"e1"
}
}
]
}
}
},
"nReturned":27759,
"executionTimeMillisEstimate":424,
"works":27760,
"advanced":27759,
"needTime":0,
"needYield":0,
"saveState":219,
"restoreState":219,
"isEOF":1,
"invalidates":0,
"docsExamined":27759,
"alreadyHasObj":0,
"inputStage":{
"stage":"IXSCAN",
"nReturned":27759,
"executionTimeMillisEstimate":31,
"works":27760,
"advanced":27759,
"needTime":0,
"needYield":0,
"saveState":219,
"restoreState":219,
"isEOF":1,
"invalidates":0,
"keyPattern":{
"itbl":1,
"tags.k":1,
"tags.v":1
},
"indexName":"itbl_1_tags.k_1_tags.v_1",
"isMultiKey":true,
"multiKeyPaths":{
"itbl":[
],
"tags.k":[
"tags"
],
"tags.v":[
"tags"
]
},
"isUnique":false,
"isSparse":true,
"isPartial":false,
"indexVersion":2,
"direction":"forward",
"indexBounds":{
"itbl":[
"[\"123456\", \"123456\"]"
],
"tags.k":[
"[\"0f1\", \"0f1\"]"
],
"tags.v":[
"[\"e1\", \"e1\"]"
]
},
"keysExamined":27759,
"seeks":1,
"dupsTested":27759,
"dupsDropped":0,
"seenInvalidated":0
}
}
}
}
db.v2.find({itbl:"123456", tags:{"$all":["0f1@e1"]}}).explain("executionStats")
// 整体执行时间比第一个索引要好,
{
"executionStats":{
"executionSuccess":true,
"nReturned":27759,
"executionTimeMillis":351,
"totalKeysExamined":27759,
"totalDocsExamined":27759,
"executionStages":{
"stage":"FETCH",
"nReturned":27759,
"executionTimeMillisEstimate":353,
"works":27760,
"advanced":27759,
"needTime":0,
"needYield":0,
"saveState":217,
"restoreState":217,
"isEOF":1,
"invalidates":0,
"docsExamined":27759,
"alreadyHasObj":0,
"inputStage":{
"stage":"IXSCAN",
"nReturned":27759,
"executionTimeMillisEstimate":31,
"works":27760,
"advanced":27759,
"needTime":0,
"needYield":0,
"saveState":217,
"restoreState":217,
"isEOF":1,
"invalidates":0,
"keyPattern":{
"itbl":1,
"tags":1
},
"indexName":"itbl_1_tags_1",
"isMultiKey":true,
"multiKeyPaths":{
"itbl":[
],
"tags":[
"tags"
]
},
"isUnique":false,
"isSparse":true,
"isPartial":false,
"indexVersion":2,
"direction":"forward",
"indexBounds":{
"itbl":[
"[\"123456\", \"123456\"]"
],
"tags":[
"[\"0f1@e1\", \"0f1@e1\"]"
]
},
"keysExamined":27759,
"seeks":1,
"dupsTested":27759,
"dupsDropped":0,
"seenInvalidated":0
}
}
}
}
db.v3.find({tags:{$all:["123456@0f1@e1"]}}).explain("executionStats")
{
"executionStats":{
"executionSuccess":true,
"nReturned":27759,
"executionTimeMillis":349,
"totalKeysExamined":27759,
"totalDocsExamined":27759,
"executionStages":{
"stage":"FETCH",
"nReturned":27759,
"executionTimeMillisEstimate":323,
"works":27760,
"advanced":27759,
"needTime":0,
"needYield":0,
"saveState":218,
"restoreState":218,
"isEOF":1,
"invalidates":0,
"docsExamined":27759,
"alreadyHasObj":0,
"inputStage":{
"stage":"IXSCAN",
"nReturned":27759,
"executionTimeMillisEstimate":41,
"works":27760,
"advanced":27759,
"needTime":0,
"needYield":0,
"saveState":218,
"restoreState":218,
"isEOF":1,
"invalidates":0,
"keyPattern":{
"tags":1
},
"indexName":"tags_1",
"isMultiKey":true,
"multiKeyPaths":{
"tags":[
"tags"
]
},
"isUnique":false,
"isSparse":true,
"isPartial":false,
"indexVersion":2,
"direction":"forward",
"indexBounds":{
"tags":[
"[\"123456@0f1@e1\", \"123456@0f1@e1\"]"
]
},
"keysExamined":27759,
"seeks":1,
"dupsTested":27759,
"dupsDropped":0,
"seenInvalidated":0
}
}
}
}
结论: 功能符合预期,但是第一个索引整体性能貌似差一点
3.2 tags匹配1个以上的查询
db.v1.find({"itbl":"123456","$and":[{"tags":{$elemMatch:{"k":"0f1","v":"e1"}}}, {"tags":{$elemMatch:{"k":"1e1", "v":"c1"}}}]}).explain("executionStats")
{
"executionStats":{
"executionSuccess":true,
"nReturned":779,
"executionTimeMillis":267,
"totalKeysExamined":27759,
"totalDocsExamined":27759,
"executionStages":{
"stage":"FETCH",
"filter":{
"$and":[
{
"tags":{
"$elemMatch":{
"$and":[
{
"k":{
"$eq":"0f1"
}
},
{
"v":{
"$eq":"e1"
}
}
]
}
}
},
{
"tags":{
"$elemMatch":{
"$and":[
{
"k":{
"$eq":"1e1"
}
},
{
"v":{
"$eq":"c1"
}
}
]
}
}
}
]
},
"nReturned":779,
"executionTimeMillisEstimate":262,
"works":27760,
"advanced":779,
"needTime":26980,
"needYield":0,
"saveState":218,
"restoreState":218,
"isEOF":1,
"invalidates":0,
"docsExamined":27759,
"alreadyHasObj":0,
"inputStage":{
"stage":"IXSCAN",
"nReturned":27759,
"executionTimeMillisEstimate":60,
"works":27760,
"advanced":27759,
"needTime":0,
"needYield":0,
"saveState":218,
"restoreState":218,
"isEOF":1,
"invalidates":0,
"keyPattern":{
"itbl":1,
"tags.k":1,
"tags.v":1
},
"indexName":"itbl_1_tags.k_1_tags.v_1",
"isMultiKey":true,
"multiKeyPaths":{
"itbl":[
],
"tags.k":[
"tags"
],
"tags.v":[
"tags"
]
},
"isUnique":false,
"isSparse":true,
"isPartial":false,
"indexVersion":2,
"direction":"forward",
"indexBounds":{
"itbl":[
"[\"123456\", \"123456\"]"
],
"tags.k":[
"[\"0f1\", \"0f1\"]"
],
"tags.v":[
"[\"e1\", \"e1\"]"
]
},
"keysExamined":27759,
"seeks":1,
"dupsTested":27759,
"dupsDropped":0,
"seenInvalidated":0
}
}
}
}
db.v2.find({itbl:"123456", tags:{$all:["0f1@e1","1e1@c1"]}}).explain("executionStats")
{
"executionStats":{
"executionSuccess":true,
"nReturned":779,
"executionTimeMillis":251,
"totalKeysExamined":27759,
"totalDocsExamined":27759,
"executionStages":{
"stage":"FETCH",
"filter":{
"tags":{
"$eq":"1e1@c1"
}
},
"nReturned":779,
"executionTimeMillisEstimate":252,
"works":27760,
"advanced":779,
"needTime":26980,
"needYield":0,
"saveState":218,
"restoreState":218,
"isEOF":1,
"invalidates":0,
"docsExamined":27759,
"alreadyHasObj":0,
"inputStage":{
"stage":"IXSCAN",
"nReturned":27759,
"executionTimeMillisEstimate":50,
"works":27760,
"advanced":27759,
"needTime":0,
"needYield":0,
"saveState":218,
"restoreState":218,
"isEOF":1,
"invalidates":0,
"keyPattern":{
"itbl":1,
"tags":1
},
"indexName":"itbl_1_tags_1",
"isMultiKey":true,
"multiKeyPaths":{
"itbl":[
],
"tags":[
"tags"
]
},
"isUnique":false,
"isSparse":true,
"isPartial":false,
"indexVersion":2,
"direction":"forward",
"indexBounds":{
"itbl":[
"[\"123456\", \"123456\"]"
],
"tags":[
"[\"0f1@e1\", \"0f1@e1\"]"
]
},
"keysExamined":27759,
"seeks":1,
"dupsTested":27759,
"dupsDropped":0,
"seenInvalidated":0
}
}
}
}
db.v3.find({tags:{$all:["123456@0f1@e1","123456@1e1@c1"]}}).explain("executionStats")
{
"executionStats":{
"executionSuccess":true,
"nReturned":779,
"executionTimeMillis":267,
"totalKeysExamined":27759,
"totalDocsExamined":27759,
"executionStages":{
"stage":"FETCH",
"filter":{
"tags":{
"$eq":"123456@1e1@c1"
}
},
"nReturned":779,
"executionTimeMillisEstimate":232,
"works":27760,
"advanced":779,
"needTime":26980,
"needYield":0,
"saveState":274,
"restoreState":274,
"isEOF":1,
"invalidates":0,
"docsExamined":27759,
"alreadyHasObj":0,
"inputStage":{
"stage":"IXSCAN",
"nReturned":27759,
"executionTimeMillisEstimate":20,
"works":27760,
"advanced":27759,
"needTime":0,
"needYield":0,
"saveState":274,
"restoreState":274,
"isEOF":1,
"invalidates":0,
"keyPattern":{
"tags":1
},
"indexName":"tags_1",
"isMultiKey":true,
"multiKeyPaths":{
"tags":[
"tags"
]
},
"isUnique":false,
"isSparse":true,
"isPartial":false,
"indexVersion":2,
"direction":"forward",
"indexBounds":{
"tags":[
"[\"123456@0f1@e1\", \"123456@0f1@e1\"]"
]
},
"keysExamined":27759,
"seeks":1,
"dupsTested":27759,
"dupsDropped":0,
"seenInvalidated":0
}
}
}
}
结论: 从这三个执行计划看的出来,基本上都差不多,从结果来看是符合需求的;
3.3 查询包含某一个tagk的数据
db.v1.find({itbl:"123456", "tags.k":"0f1"}).explain("executionStats")
{
"executionStats":{
"executionSuccess":true,
"nReturned":166668,
"executionTimeMillis":1038,
"totalKeysExamined":166668,
"totalDocsExamined":166668,
"executionStages":{
"stage":"FETCH",
"nReturned":166668,
"executionTimeMillisEstimate":1010,
"works":166669,
"advanced":166668,
"needTime":0,
"needYield":0,
"saveState":1307,
"restoreState":1307,
"isEOF":1,
"invalidates":0,
"docsExamined":166668,
"alreadyHasObj":0,
"inputStage":{
"stage":"IXSCAN",
"nReturned":166668,
"executionTimeMillisEstimate":192,
"works":166669,
"advanced":166668,
"needTime":0,
"needYield":0,
"saveState":1307,
"restoreState":1307,
"isEOF":1,
"invalidates":0,
"keyPattern":{
"itbl":1,
"tags.k":1,
"tags.v":1
},
"indexName":"itbl_1_tags.k_1_tags.v_1",
"isMultiKey":true,
"multiKeyPaths":{
"itbl":[
],
"tags.k":[
"tags"
],
"tags.v":[
"tags"
]
},
"isUnique":false,
"isSparse":true,
"isPartial":false,
"indexVersion":2,
"direction":"forward",
"indexBounds":{
"itbl":[
"[\"123456\", \"123456\"]"
],
"tags.k":[
"[\"0f1\", \"0f1\"]"
],
"tags.v":[
"[MinKey, MaxKey]"
]
},
"keysExamined":166668,
"seeks":1,
"dupsTested":166668,
"dupsDropped":0,
"seenInvalidated":0
}
}
}
}
db.v2.find({itbl:"123456", tags:{$regex:"^0f1@"}}).explain("executionStats")
{
"executionStats":{
"executionSuccess":true,
"nReturned":166668,
"executionTimeMillis":1156,
"totalKeysExamined":166669,
"totalDocsExamined":166668,
"executionStages":{
"stage":"FETCH",
"nReturned":166668,
"executionTimeMillisEstimate":1140,
"works":166670,
"advanced":166668,
"needTime":1,
"needYield":0,
"saveState":1308,
"restoreState":1308,
"isEOF":1,
"invalidates":0,
"docsExamined":166668,
"alreadyHasObj":0,
"inputStage":{
"stage":"IXSCAN",
"nReturned":166668,
"executionTimeMillisEstimate":213,
"works":166670,
"advanced":166668,
"needTime":1,
"needYield":0,
"saveState":1308,
"restoreState":1308,
"isEOF":1,
"invalidates":0,
"keyPattern":{
"itbl":1,
"tags":1
},
"indexName":"itbl_1_tags_1",
"isMultiKey":true,
"multiKeyPaths":{
"itbl":[
],
"tags":[
"tags"
]
},
"isUnique":false,
"isSparse":true,
"isPartial":false,
"indexVersion":2,
"direction":"forward",
"indexBounds":{
"itbl":[
"[\"123456\", \"123456\"]"
],
"tags":[
"[\"0f1@\", \"0f1A\")",
"[/^0f1@/, /^0f1@/]"
]
},
"keysExamined":166669,
"seeks":2,
"dupsTested":166668,
"dupsDropped":0,
"seenInvalidated":0
}
}
}
}
db.v3.find({tags:{$regex:"^123456@0f1@"}}).explain("executionStats")
{
"executionStats":{
"executionSuccess":true,
"nReturned":0,
"executionTimeMillis":1468,
"totalKeysExamined":166669,
"totalDocsExamined":166668,
"executionStages":{
"stage":"FETCH",
"filter":{
"itbl":{
"$eq":"123456"
}
},
"nReturned":0,
"executionTimeMillisEstimate":1412,
"works":166670,
"advanced":0,
"needTime":166669,
"needYield":0,
"saveState":1308,
"restoreState":1308,
"isEOF":1,
"invalidates":0,
"docsExamined":166668,
"alreadyHasObj":0,
"inputStage":{
"stage":"IXSCAN",
"nReturned":166668,
"executionTimeMillisEstimate":333,
"works":166670,
"advanced":166668,
"needTime":1,
"needYield":0,
"saveState":1308,
"restoreState":1308,
"isEOF":1,
"invalidates":0,
"keyPattern":{
"tags":1
},
"indexName":"tags_1",
"isMultiKey":true,
"multiKeyPaths":{
"tags":[
"tags"
]
},
"isUnique":false,
"isSparse":true,
"isPartial":false,
"indexVersion":2,
"direction":"forward",
"indexBounds":{
"tags":[
"[\"123456@0f1@\", \"123456@0f1A\")",
"[/^123456@0f1@/, /^123456@0f1@/]"
]
},
"keysExamined":166669,
"seeks":2,
"dupsTested":166668,
"dupsDropped":0,
"seenInvalidated":0
}
}
}
}
结论:整体符合预期,并且相差不大;
4. 优缺点
- 存储成本
{
"ns" : "test.v1",
"size" : 774048290,
"count" : 10073870,
"avgObjSize" : 76,
"storageSize" : 774048256,
"capped" : false,
"nindexes" : 2,
"totalIndexSize" : 510683591,
"indexSizes" : {
"_id_" : 433176410,
"itbl_1_tags.k_1_tags.v_1" : 77507181
},
"ok" : 1
}
{
"ns" : "test.v2",
"size" : 729003089,
"count" : 10073867,
"avgObjSize" : 72,
"storageSize" : 729003008,
"capped" : false,
"nindexes" : 2,
"totalIndexSize" : 507680453,
"indexSizes" : {
"_id_" : 433176281,
"itbl_1_tags_1" : 74504172
},
"ok" : 1
}
{
"ns" : "test.v3",
"size" : 733006888,
"count" : 10073864,
"avgObjSize" : 72,
"storageSize" : 733006848,
"capped" : false,
"nindexes" : 2,
"totalIndexSize" : 504677315,
"indexSizes" : {
"_id_" : 433176152,
"tags_1" : 71501163
},
"ok" : 1
}
结论:
- 索引1的
avgObjSize会比较大,当然这边看差别不大的;但是之前测试的时候差别会很大的 索引1的索引成本也会比较大的,索引2和索引3本质上差不多的;
索引性能
从上面的结果来说,其实3个的索引性能都差不多的,但是第一个第二索引有一个致命的缺点,就是当查询匹配多个tags的时候,第一个第二个索引因为本质上是联合索引,所以索引的顺序会按照tags查询的时候
array的顺序有关的;但是第三个索引mongodb会按照实际情况去找最优的索引,和array顺序无关;# 第一索引和第二索引类似,所以以第二索引来做例子 db.v2.find({itbl:"123456", tags:{$all:["0f1@e1","1e1@c1"]}}).explain() db.v2.find({itbl:"123456", tags:{$all:["1e1@c1","0f1@e1"]}}).explain()// 第一过滤的索引是:0f1@e1 { "winningPlan":{ "stage":"FETCH", "filter":{ "tags":{ "$eq":"1e1@c1" } }, "inputStage":{ "stage":"IXSCAN", "keyPattern":{ "itbl":1, "tags":1 }, "indexName":"itbl_1_tags_1", "isMultiKey":true, "multiKeyPaths":{ "itbl":[ ], "tags":[ "tags" ] }, "isUnique":false, "isSparse":true, "isPartial":false, "indexVersion":2, "direction":"forward", "indexBounds":{ "itbl":[ "[\"123456\", \"123456\"]" ], "tags":[ "[\"0f1@e1\", \"0f1@e1\"]" ] } } } } // 第一个过滤索引为:1e1@c1 { "winningPlan":{ "stage":"FETCH", "filter":{ "tags":{ "$eq":"0f1@e1" } }, "inputStage":{ "stage":"IXSCAN", "keyPattern":{ "itbl":1, "tags":1 }, "indexName":"itbl_1_tags_1", "isMultiKey":true, "multiKeyPaths":{ "itbl":[ ], "tags":[ "tags" ] }, "isUnique":false, "isSparse":true, "isPartial":false, "indexVersion":2, "direction":"forward", "indexBounds":{ "itbl":[ "[\"123456\", \"123456\"]" ], "tags":[ "[\"1e1@c1\", \"1e1@c1\"]" ] } } } }# 这两个查询的计划是一样的 db.v3.find({tags:{$all:["123456@0f1@e1","123456@1e1@c1"]}}).explain() db.v3.find({tags:{$all:["123456@1e1@c1","123456@0f1@e1"]}}).explain(){ // 优先索引:123456@0f1@e1 "winningPlan":{ "stage":"FETCH", "filter":{ "tags":{ "$eq":"123456@1e1@c1" } }, "inputStage":{ "stage":"IXSCAN", "keyPattern":{ "tags":1 }, "indexName":"tags_1", "isMultiKey":true, "multiKeyPaths":{ "tags":[ "tags" ] }, "isUnique":false, "isSparse":true, "isPartial":false, "indexVersion":2, "direction":"forward", "indexBounds":{ "tags":[ "[\"123456@0f1@e1\", \"123456@0f1@e1\"]" ] } } }, // 有其他两个query plan通过自己的查询积分逻辑来进行选择,下面两个是被拒绝了 "rejectedPlans":[ { "stage":"FETCH", "filter":{ "tags":{ "$eq":"123456@0f1@e1" } }, "inputStage":{ "stage":"IXSCAN", "keyPattern":{ "tags":1 }, "indexName":"tags_1", "isMultiKey":true, "multiKeyPaths":{ "tags":[ "tags" ] }, "isUnique":false, "isSparse":true, "isPartial":false, "indexVersion":2, "direction":"forward", "indexBounds":{ "tags":[ "[\"123456@1e1@c1\", \"123456@1e1@c1\"]" ] } } }, { "stage":"FETCH", "filter":{ "$and":[ { "tags":{ "$eq":"123456@0f1@e1" } }, { "tags":{ "$eq":"123456@1e1@c1" } } ] }, "inputStage":{ "stage":"AND_SORTED", "inputStages":[ { "stage":"IXSCAN", "keyPattern":{ "tags":1 }, "indexName":"tags_1", "isMultiKey":true, "multiKeyPaths":{ "tags":[ "tags" ] }, "isUnique":false, "isSparse":true, "isPartial":false, "indexVersion":2, "direction":"forward", "indexBounds":{ "tags":[ "[\"123456@0f1@e1\", \"123456@0f1@e1\"]" ] } }, { "stage":"IXSCAN", "keyPattern":{ "tags":1 }, "indexName":"tags_1", "isMultiKey":true, "multiKeyPaths":{ "tags":[ "tags" ] }, "isUnique":false, "isSparse":true, "isPartial":false, "indexVersion":2, "direction":"forward", "indexBounds":{ "tags":[ "[\"123456@1e1@c1\", \"123456@1e1@c1\"]" ] } } ] } } ] }结论:
- 第一第二索引会和查询顺序关系,这样会导致查询过程需要用户关心不必要的细节,
第三个索引,mongodb会产生3个
query plan,并且mongodb内部存在对执行计划有评分策略的,整个评分会随着数据的变化去选择它认为最优的查询计划,当然并不是每次都是最优的,但是当索引之间的数据量相差很大的时候,就会选择最优的5. 总结
最终会选择第三个索引,使用mongodb内存的查询评分策略可以找到最优执行计划,这样会在数据动态变化过程中持续去调整的,能达到一个比较好的性能;特别是用户不需要关心不必要的细节的;