【发布时间】:2014-09-17 15:31:28
【问题描述】:
我有一个 csv 格式的“大”文件 (159 G),我使用 Sqlite3 和 C 将它插入到数据库表中。现在,实际的 csv 文件有 1,232,799,308 行(10 亿以上)。然而,只插入了 (123,254,689)。我有三个问题:
1) 如何将插入的数据与 csv 文件中的实际数据进行比较?我的意思是我想知道感兴趣的数据是来自一个块(即从第 1 行到第 100,000,000 行)还是随机的?
2) 有没有办法知道插入的行是否代表整个总体,就采样率(平均值、标准差和误差范围)而言?
3) 只插入了 10% 的行的代码有问题吗!?
提前感谢大家的帮助:)
代码在 C:
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <string.h>
#include <sqlite3.h>
#define INPUTDATA "/nobackup/DSS/clusterdata-2011-1/working_data/task_usage.csv"
#define DATABASE "/nobackup/DSS/clusterdata-2011-1/working_data/google.db"
#define TABLE "CREATE TABLE IF NOT EXISTS task_usageBS(startendtime TEXT, jobID TEXT, taskindex TEXT, machineID TEXT, CPUusage TEXT, MEMusage TEXT, assignedMEM TEXT, unmappedpagecacheMEMusage TEXT, pagecacheMEMusage TEXT, maxMEMusage TEXT, diskIOtimeMEAN TEXT, localdiskspaceusedMEAN TEXT, CPUusagemax TEXT, diskIOtimemax TEXT, cyclesperinstructionCPI TEXT, MEMaccessesperinstructionMAI TEXT, samplingrate TEXT, aggregationtype TEXT, PRIMARY KEY (startendtime, jobID, taskindex), FOREIGN KEY(jobID) REFERENCES job_events(jobID), FOREIGN KEY(taskindex) REFERENCES task_events(taskindex), FOREIGN KEY(machineID) REFERENCES machine_events(machineID))"
#define BUFFER_SIZE 1024
int main(int argc, char **argv) {
sqlite3 * db;
sqlite3_stmt * stmt;
char * sErrMsg = 0;
const char * tail = 0;
int nRetCode;
int n = 0;
clock_t cStartClock;
FILE * pFile;
char sInputBuf [BUFFER_SIZE] = "\0";
char * sTS = 0; /* startendtime */
char * sJI = 0; /* jobID */
char * sTI = 0; /* taskindex */
char * sMI = 0; /* machineid */
char * sCUM = 0; /* CPUusageMEAN */
char * sMU = 0; /* MEMusage */
char * sAM = 0; /* assignedMEM */
char * sUPCMU = 0; /* unmappedpagecacheMEMusage */
char * sPCMU = 0; /* pagecacheMEMusage */
char * sMMU = 0; /* maxMEMusage */
char * sDIOTM = 0; /* diskIOtimeMEAN */
char * sLDSUM = 0; /* localdiskspaceusedMEAN */
char * sCUMAX = 0; /* CPUusagemax */
char * sDIOTMAX = 0; /* diskIOtimemax */
char * sCPI = 0; /* cyclesperinstructionCPI */
char * sMAPI = 0; /* MEMaccessesperinstructionMAI */
char * sSR = 0; /* samplingrate */
char * sAT = 0; /* aggregationtype */
char sSQL [BUFFER_SIZE] = "\0";
/*********************************************/
/* Open the Database and create the Schema */
sqlite3_open(DATABASE, &db);
sqlite3_exec(db, TABLE, NULL, NULL, &sErrMsg);
/*********************************************/
/* Open input file and import into Database*/
cStartClock = clock();
sprintf(sSQL, "INSERT INTO task_usageBS VALUES (@TS, @JI, @TI, @MI, @CUM, @MU, @AM, @UPCMU, @PCMU, @MMU, @DIOTM, @LDSUM, @CUMAX, @DIOTMAX, @CPI, @MAPI, @SR, @AT)");
sqlite3_prepare_v2(db, sSQL, BUFFER_SIZE, &stmt, &tail);
//sqlite3_exec(db, sSQL, NULL, NULL, &sErrMsg);
sqlite3_exec(db, "BEGIN TRANSACTION", NULL, NULL, &sErrMsg);
pFile = fopen (INPUTDATA,"r");
while (!feof(pFile)) {
fgets (sInputBuf, BUFFER_SIZE, pFile);
sTS = strtok (sInputBuf, ",");
sJI = strtok (NULL, ",");
sTI = strtok (NULL, ",");
sMI = strtok (NULL, ",");
sCUM = strtok (NULL, ",");
sMU = strtok (NULL, ",");
sAM = strtok (NULL, ",");
sUPCMU = strtok (NULL, ",");
sPCMU = strtok (NULL, ",");
sMMU = strtok (NULL, ",");
sDIOTM = strtok (NULL, ",");
sLDSUM = strtok (NULL, ",");
sCUMAX = strtok (NULL, ",");
sDIOTMAX = strtok (NULL, ",");
sCPI = strtok (NULL, ",");
sMAPI = strtok (NULL, ",");
sSR = strtok (NULL, ",");
sAT = strtok (NULL, ",");
sqlite3_bind_text(stmt, 1, sTS, -1, SQLITE_TRANSIENT);
sqlite3_bind_text(stmt, 2, sJI, -1, SQLITE_TRANSIENT);
sqlite3_bind_text(stmt, 3, sTI, -1, SQLITE_TRANSIENT);
sqlite3_bind_text(stmt, 4, sMI, -1, SQLITE_TRANSIENT);
sqlite3_bind_text(stmt, 5, sCUM, -1, SQLITE_TRANSIENT);
sqlite3_bind_text(stmt, 6, sMU, -1, SQLITE_TRANSIENT);
sqlite3_bind_text(stmt, 7, sAM, -1, SQLITE_TRANSIENT);
sqlite3_bind_text(stmt, 8, sUPCMU, -1, SQLITE_TRANSIENT);
sqlite3_bind_text(stmt, 9, sPCMU, -1, SQLITE_TRANSIENT);
sqlite3_bind_text(stmt, 10, sMMU, -1, SQLITE_TRANSIENT);
sqlite3_bind_text(stmt, 11, sDIOTM, -1, SQLITE_TRANSIENT);
sqlite3_bind_text(stmt, 12, sLDSUM, -1, SQLITE_TRANSIENT);
sqlite3_bind_text(stmt, 13, sCUMAX, -1, SQLITE_TRANSIENT);
sqlite3_bind_text(stmt, 14, sDIOTMAX, -1, SQLITE_TRANSIENT);
sqlite3_bind_text(stmt, 15, sCPI, -1, SQLITE_TRANSIENT);
sqlite3_bind_text(stmt, 16, sMAPI, -1, SQLITE_TRANSIENT);
sqlite3_bind_text(stmt, 17, sSR, -1, SQLITE_TRANSIENT);
sqlite3_bind_text(stmt, 18, sAT, -1, SQLITE_TRANSIENT);
sqlite3_step(stmt);
sqlite3_clear_bindings(stmt);
sqlite3_reset(stmt);
n++;
}
fclose (pFile);
sqlite3_exec(db, "END TRANSACTION", NULL, NULL, &sErrMsg);
printf("Imported %d records in %4.2f seconds\n", n, (clock() - cStartClock) / (double)CLOCKS_PER_SEC);
sqlite3_finalize(stmt);
sqlite3_close(db);
return 0;
}
【问题讨论】:
-
你为什么不在sqlite3中使用.import?
-
我该怎么做? @AmokHuginnsson
-
你必须检查
sqlite3_prepare_v2和sqlite3_step的返回值;见sqlite3_errmsg。