【发布时间】:2016-01-05 17:04:05
【问题描述】:
我在 mongo 集合中有一个包含 3000 万行的数据集。一个示例记录集是:
{"_id" : ObjectId("568bc0f2f7cd2653e163a9e4"),
"EmailAddress" : "1234@ab.com",
"FlightNumber" : 1043,
"FlightTime" : "10:00"},
{"_id" : ObjectId("568bc0f2f7cd2653e163a9e5"),
"EmailAddress" : "1234@ab.com",
"FlightNumber" : 1045,
"FlightTime" : "12:00"},
{"_id" : ObjectId("568bc0f2f7cd2653e163a9e6"),
"EmailAddress" : "5678@ab.com",
"FlightNumber" : 1045,
"FlightTime" : "12:00"},
这是直接从 SQL 服务器导入的,因此具有关系式数据的性质。
我怎样才能最好地将这些数据映射到另一个集合,以便所有数据都按 EmailAddress 分组,并嵌套 FlightNumbers?输出的一个例子是:
{"_id" : ObjectId("can be new id"),
"EmailAddress" : "1234@ab.com",
"Flights" : [{"Number":1043, "Time":"10:00"},{"Number":1045, "Time":"12:00"}]},
{"_id" : ObjectId("can be new id"),
"EmailAddress" : "5678@ab.com",
"Flights" : [{"Number":1045, "Time":"12:00"}]},
我一直在编写一个导入例程,它遍历源集合中的每条记录,然后批量插入到第二个集合中。这工作正常,但不允许我对数据进行分组,除非我回头逐条查找已处理过的记录,而这会给导入例程增加大量时间开销。
代码如下:
// Copies every document of the source CRM collection into the destination
// Contacts and Identifiers collections, buffering documents and inserting
// them in batches. Progress is printed every PROGRESS_INTERVAL documents.
var sourceDb = db.getSiblingDB("collectionSource");
var destinationDb = db.getSiblingDB("collectionDestination");
var externalUsers = sourceDb.CRM.find();
var index = 0;
var contactArray = [];
var identifierArray = [];
// Reference point for the "s from start" figure in the progress messages.
// The original script read startDate without ever defining it.
var startDate = new Date();
var PROGRESS_INTERVAL = 1000;
var BATCH_SIZE = 5000;

// Inserts the buffered documents into the destination collections and
// resets both buffers. Does nothing when the buffers are empty, so it is
// safe to call once more after the loop.
function flushBatch() {
    if (contactArray.length === 0) {
        return;
    }
    destinationDb.Contacts.insert(contactArray);
    destinationDb.Identifiers.insert(identifierArray);
    contactArray = [];
    identifierArray = [];
}

externalUsers.forEach(function (doc) {
    // library code for NewGuid omitted
    var guid = NewGuid();
    // buildContact and buildIdentifier simply create 2 js objects based on the parameters
    contactArray.push(buildContact(guid, doc.EmailAddress, doc.FlightNumber));
    identifierArray.push(buildIdentifier(guid, doc.EmailAddress));
    index++;

    if (index % PROGRESS_INTERVAL === 0) {
        var elapsedSeconds = Math.abs((new Date().getTime() - startDate.getTime()) / 1000);
        print("Written " + index + " items (" + elapsedSeconds + "s from start)");
    }

    // bulk insert in batches
    if (index % BATCH_SIZE === 0) {
        flushBatch();
    }
});

// Insert the final partial batch; without this, up to BATCH_SIZE - 1
// trailing documents would be silently dropped.
flushBatch();
在此先感谢
【问题讨论】:
标签: mongodb