我认为您对 data.table 及其键(key)有几点误解:
- 除非您显式设置(例如使用 setkey),否则 data.table 没有键。
- data.table 的键不必是唯一的。
您可以编写一个函数来检查某些列是否可以为数据集创建唯一标识符。
这里我使用了 data.table,并特意在 data.table 的无键副本上调用 unique(在旧版本的 data.table 中,unique 默认只按键列去重)。
这种做法效率并不高。
#' Find the smallest set of columns that uniquely identifies every row
#'
#' Columns containing `NA` are excluded from the search. The remaining
#' candidates are tried in combinations of increasing size (1 column, then
#' 2, ...) and the search stops at the first size for which at least one
#' combination has no duplicated rows.
#'
#' @param columns Character vector of column names to consider.
#' @param data A data.table, or an object that `data.table()` can convert.
#' @param verbose If `TRUE`, emit a message for every candidate column that
#'   is dropped because it contains `NA` values.
#' @return The first combination of columns (in `combn()` order) that
#'   uniquely identifies the rows, or `NULL` (with a warning) when no
#'   combination does. A warning is also raised when several combinations of
#'   the same size are valid.
isid <- function(columns, data, verbose = TRUE) {
  # An existing data.table is only read below, never modified, so no copy is
  # needed; anything else is converted once.
  dt <- if (is.data.table(data)) data else data.table(data)

  # Fail early with a clear message instead of a subsetting error later on.
  if (is.character(columns)) {
    unknown <- setdiff(columns, names(dt))
    if (length(unknown) > 0L) {
      stop("Column(s) not found in data: ", paste(unknown, collapse = ", "))
    }
  }

  # NA values don't work in keys for data.tables
  na_cols <- Filter(function(col) anyNA(dt[[col]]), columns)
  if (verbose) {
    for (col in na_cols) {
      message(sprintf("Column %s contains NA values", col))
    }
  }
  candidates <- setdiff(columns, na_cols)

  # Try combinations of 1 column, then 2, ... until one of them is unique.
  combos <- list()
  is_unique <- logical(0)
  found <- FALSE
  size <- 1L
  while (!found && size <= length(candidates)) {
    combos <- combn(candidates, size, simplify = FALSE)
    # An explicit `by` makes the check independent of any key set on `dt`
    # and avoids materialising a de-duplicated copy per combination.
    is_unique <- vapply(
      combos,
      function(combo) anyDuplicated(dt, by = combo) == 0L,
      logical(1)
    )
    found <- any(is_unique)
    size <- size + 1L
  }

  if (!found) {
    warning("No combinations are unique")
    return(NULL)
  }

  valid_combinations <- combos[is_unique]
  if (length(valid_combinations) > 1L) {
    warning("More than one combination valid, returning the first only")
  }
  valid_combinations[[1L]]
}
下面是一些使用示例:
# Exactly one column ("b") is unique on its own.
oneU <- data.table(
  a = c(2, 1, 2, 2),
  b = c(1, 2, 3, 4)
)
isid(names(oneU), oneU)
# [1] "b"

# Both columns are unique on their own, so the first is returned with a
# warning.
twoU <- data.table(
  a = 1:4,
  b = letters[1:4]
)
isid(names(twoU), twoU)
# [1] "a"
# Warning message:
# In isid(names(twoU), twoU) :
# More than one combination valid, returning the first only

# Neither column is unique alone; only the pair identifies a row.
bothU <- data.table(
  a = letters[1:2],
  b = rep(letters[1:2], each = 2)
)
isid(names(bothU), bothU)
# [1] "a" "b"

# Column "a" contains NA, so it is skipped and "b" is used.
someNA <- data.table(
  a = c(1, 2, 3, NA),
  b = 1:4
)
isid(names(someNA), someNA)
# Column a contains NA values
# [1] "b"

# examples with no valid identifiers
isid("a", someNA)
## Column a contains NA values
## NULL
## Warning message:
## In isid("a", someNA) : No combinations are unique

isid("a", oneU)
## NULL
## Warning message:
## In isid("a", oneU) : No combinations are unique