【问题标题】:Generate dummy variables from all possible combinations of variables(从所有可能的变量组合生成虚拟变量)
【发布时间】:2019-06-15 20:35:01
【问题描述】:

我有 5 个条件,每个条件可以是存在 (=1) 或不存在 (=0):

# Reproducible toy data: five binary condition columns (0 = absent,
# 1 = present), 30 observations each. Columns are drawn in the listed order.
set.seed(101)
df <- as.data.frame(
  sapply(
    c("alfa", "beta", "gamma", "delta", "epsilon"),
    function(condition) sample(c(0, 1), 30, replace = TRUE),
    simplify = FALSE
  )
)

我想从这些条件存在的所有可能组合 (2^5 = 32) 中生成一组虚拟变量。具体来说,我想验证以下条件的同时存在:

  • 1)alfa + beta;
  • 2)alfa + gamma;
  • 3)alfa + delta;
  • 4) alfa + epsilon;
  • 5)alfa + beta + gamma;
  • [...]
  • 27) alfa + beta + gamma + delta + epsilon;

生成所有可能组合的矩阵

# Logical matrix with one row per TRUE/FALSE selection over the columns of
# df (2^ncol(df) rows, including the all-FALSE row).
v <- as.matrix(
  expand.grid(replicate(ncol(df), c(FALSE, TRUE), simplify = FALSE))
)

所有组合:

 head(v)
      Var1  Var2  Var3  Var4  Var5
[1,]  TRUE FALSE FALSE FALSE FALSE
[2,] FALSE  TRUE FALSE FALSE FALSE
[3,]  TRUE  TRUE FALSE FALSE FALSE
[4,] FALSE FALSE  TRUE FALSE FALSE
[5,]  TRUE FALSE  TRUE FALSE FALSE
[6,] FALSE  TRUE  TRUE FALSE FALSE

将矩阵转换为列索引列表

# One logical column selector per row of v, each named after the selected
# df columns joined with "." (the all-FALSE row gets the empty name "").
indexes <- setNames(
  lapply(seq_len(nrow(v)), function(i) v[i, ]),
  apply(v, 1, function(selected) paste(names(df)[selected], collapse = "."))
)

不幸的是,我被困在这里。
我需要根据上述组合生成27个虚拟变量(32-5)。

编辑:我用这种肮脏的方式解决了问题:

# my df
set.seed(101)
df <- data.frame(
  alfa = sample(c(0, 1), 30, replace = TRUE),
  beta = sample(c(0, 1), 30, replace = TRUE),
  gamma = sample(c(0, 1), 30, replace = TRUE),
  delta = sample(c(0, 1), 30, replace = TRUE),
  epsilon = sample(c(0, 1), 30, replace = TRUE)
)

# Names of the binary condition columns, captured before any derived column
# is appended so later steps never depend on column positions.
condition_cols <- names(df)

# Count the number of coexistent conditions:
df$n <- rowSums(df[condition_cols], na.rm = TRUE)

#' Append one 0/1 dummy column per exclusive combination of conditions.
#'
#' A row scores 1 for a combination only when exactly the listed conditions
#' equal 1 and every other condition equals 0. Columns are named by joining
#' the present conditions with "." and are appended in `expand.grid()` order
#' (the first condition varies fastest).
#'
#' @param data A data frame holding the 0/1 condition columns.
#' @param conds Character vector with the names of the condition columns.
#' @param min_present Smallest number of simultaneously present conditions
#'   for which a dummy is generated (the default, 2, skips single conditions).
#' @return `data` with the dummy columns appended.
add_exclusive_dummies <- function(data, conds, min_present = 2) {
  stopifnot(
    is.data.frame(data),
    is.character(conds),
    length(conds) >= 1,
    all(conds %in% names(data)),
    min_present >= 1
  )

  # Every 0/1 pattern over the conditions, keeping only patterns with enough
  # conditions present.
  patterns <- as.matrix(expand.grid(rep(list(c(0, 1)), length(conds))))
  patterns <- patterns[rowSums(patterns) >= min_present, , drop = FALSE]

  for (i in seq_len(nrow(patterns))) {
    pattern <- patterns[i, ]
    # A row matches only if every condition equals its value in the pattern;
    # NA handling follows the elementwise `&` chain.
    exact_match <- Reduce(
      `&`,
      lapply(seq_along(conds), function(j) data[[conds[j]]] == pattern[[j]])
    )
    dummy_name <- paste(conds[pattern == 1], collapse = ".")
    data[[dummy_name]] <- as.numeric(exact_match)
  }

  data
}

# Compute all exclusive multi-condition combinations programmatically:
df <- add_exclusive_dummies(df, condition_cols)

【问题讨论】:

  • ?我不确定你的问题是什么。我认为expand.grid() 步骤已经是您问题的答案。所以,如果没有。您的问题/疑问是什么?
  • @petermeissner 我需要根据上述组合生成额外的 27 个虚拟变量。我相应地更新了问题。

标签: r dataframe combinations dummy-variable


【解决方案1】:

也许您可以通过在 stats::model.matrix 的模型公式中使用 ^ 运算符来返回所有(高阶)变量交互,从而实现您的目标:

library(stats)

# Five logical condition columns with 30 random draws each. No seed is set
# here, so the values differ from run to run.
df <- as.data.frame(
  sapply(
    c("alfa", "beta", "gamma", "delta", "epsilon"),
    function(condition) sample(c(TRUE, FALSE), 30, replace = TRUE),
    simplify = FALSE
  )
)

# `.^5` expands to every main effect plus all interactions up to order five,
# giving an intercept column and 31 term columns.
df_dummy <- stats::model.matrix(object = ~ .^5, data = df)

colnames(df_dummy)
#>  [1] "(Intercept)"                                      
#>  [2] "alfaTRUE"                                         
#>  [3] "betaTRUE"                                         
#>  [4] "gammaTRUE"                                        
#>  [5] "deltaTRUE"                                        
#>  [6] "epsilonTRUE"                                      
#>  [7] "alfaTRUE:betaTRUE"                                
#>  [8] "alfaTRUE:gammaTRUE"                               
#>  [9] "alfaTRUE:deltaTRUE"                               
#> [10] "alfaTRUE:epsilonTRUE"                             
#> [11] "betaTRUE:gammaTRUE"                               
#> [12] "betaTRUE:deltaTRUE"                               
#> [13] "betaTRUE:epsilonTRUE"                             
#> [14] "gammaTRUE:deltaTRUE"                              
#> [15] "gammaTRUE:epsilonTRUE"                            
#> [16] "deltaTRUE:epsilonTRUE"                            
#> [17] "alfaTRUE:betaTRUE:gammaTRUE"                      
#> [18] "alfaTRUE:betaTRUE:deltaTRUE"                      
#> [19] "alfaTRUE:betaTRUE:epsilonTRUE"                    
#> [20] "alfaTRUE:gammaTRUE:deltaTRUE"                     
#> [21] "alfaTRUE:gammaTRUE:epsilonTRUE"                   
#> [22] "alfaTRUE:deltaTRUE:epsilonTRUE"                   
#> [23] "betaTRUE:gammaTRUE:deltaTRUE"                     
#> [24] "betaTRUE:gammaTRUE:epsilonTRUE"                   
#> [25] "betaTRUE:deltaTRUE:epsilonTRUE"                   
#> [26] "gammaTRUE:deltaTRUE:epsilonTRUE"                  
#> [27] "alfaTRUE:betaTRUE:gammaTRUE:deltaTRUE"            
#> [28] "alfaTRUE:betaTRUE:gammaTRUE:epsilonTRUE"          
#> [29] "alfaTRUE:betaTRUE:deltaTRUE:epsilonTRUE"          
#> [30] "alfaTRUE:gammaTRUE:deltaTRUE:epsilonTRUE"         
#> [31] "betaTRUE:gammaTRUE:deltaTRUE:epsilonTRUE"         
#> [32] "alfaTRUE:betaTRUE:gammaTRUE:deltaTRUE:epsilonTRUE"

reprex package (v0.3.0) 于 2019 年 6 月 16 日创建

扩展的 data.frame df_dummy 为所有 31 种可能的交互组合(截距除外)返回 one-hot 编码列。要去掉截距,请将模型公式替换为 ~ .^5 + 0 或 ~ .^5 - 1。请注意,只需将 5 替换为 df 中的列数,即可轻松扩展到更多变量。


编辑:上述代码不会给出互斥(exclusive)变量组合是否存在的指示列(即类似于您编辑后的问题中手动编码的那种)。为此,您可以尝试:

# Keep only the fifth-order term by removing everything up to order four and
# the intercept; the result has one indicator column per exact TRUE/FALSE
# combination of the five logical columns.
df_dummy <- stats::model.matrix(
  object = ~ .^5 - .^4 - 1,
  data = df
)

colnames(df_dummy)
#>  [1] "alfaFALSE:betaFALSE:gammaFALSE:deltaFALSE:epsilonFALSE"
#>  [2] "alfaTRUE:betaFALSE:gammaFALSE:deltaFALSE:epsilonFALSE" 
#>  [3] "alfaFALSE:betaTRUE:gammaFALSE:deltaFALSE:epsilonFALSE" 
#>  [4] "alfaTRUE:betaTRUE:gammaFALSE:deltaFALSE:epsilonFALSE"  
#>  [5] "alfaFALSE:betaFALSE:gammaTRUE:deltaFALSE:epsilonFALSE" 
#>  [6] "alfaTRUE:betaFALSE:gammaTRUE:deltaFALSE:epsilonFALSE"  
#>  [7] "alfaFALSE:betaTRUE:gammaTRUE:deltaFALSE:epsilonFALSE"  
#>  [8] "alfaTRUE:betaTRUE:gammaTRUE:deltaFALSE:epsilonFALSE"   
#>  [9] "alfaFALSE:betaFALSE:gammaFALSE:deltaTRUE:epsilonFALSE" 
#> [10] "alfaTRUE:betaFALSE:gammaFALSE:deltaTRUE:epsilonFALSE"  
#> [11] "alfaFALSE:betaTRUE:gammaFALSE:deltaTRUE:epsilonFALSE"  
#> [12] "alfaTRUE:betaTRUE:gammaFALSE:deltaTRUE:epsilonFALSE"   
#> [13] "alfaFALSE:betaFALSE:gammaTRUE:deltaTRUE:epsilonFALSE"  
#> [14] "alfaTRUE:betaFALSE:gammaTRUE:deltaTRUE:epsilonFALSE"   
#> [15] "alfaFALSE:betaTRUE:gammaTRUE:deltaTRUE:epsilonFALSE"   
#> [16] "alfaTRUE:betaTRUE:gammaTRUE:deltaTRUE:epsilonFALSE"    
#> [17] "alfaFALSE:betaFALSE:gammaFALSE:deltaFALSE:epsilonTRUE" 
#> [18] "alfaTRUE:betaFALSE:gammaFALSE:deltaFALSE:epsilonTRUE"  
#> [19] "alfaFALSE:betaTRUE:gammaFALSE:deltaFALSE:epsilonTRUE"  
#> [20] "alfaTRUE:betaTRUE:gammaFALSE:deltaFALSE:epsilonTRUE"   
#> [21] "alfaFALSE:betaFALSE:gammaTRUE:deltaFALSE:epsilonTRUE"  
#> [22] "alfaTRUE:betaFALSE:gammaTRUE:deltaFALSE:epsilonTRUE"   
#> [23] "alfaFALSE:betaTRUE:gammaTRUE:deltaFALSE:epsilonTRUE"   
#> [24] "alfaTRUE:betaTRUE:gammaTRUE:deltaFALSE:epsilonTRUE"    
#> [25] "alfaFALSE:betaFALSE:gammaFALSE:deltaTRUE:epsilonTRUE"  
#> [26] "alfaTRUE:betaFALSE:gammaFALSE:deltaTRUE:epsilonTRUE"   
#> [27] "alfaFALSE:betaTRUE:gammaFALSE:deltaTRUE:epsilonTRUE"   
#> [28] "alfaTRUE:betaTRUE:gammaFALSE:deltaTRUE:epsilonTRUE"    
#> [29] "alfaFALSE:betaFALSE:gammaTRUE:deltaTRUE:epsilonTRUE"   
#> [30] "alfaTRUE:betaFALSE:gammaTRUE:deltaTRUE:epsilonTRUE"    
#> [31] "alfaFALSE:betaTRUE:gammaTRUE:deltaTRUE:epsilonTRUE"    
#> [32] "alfaTRUE:betaTRUE:gammaTRUE:deltaTRUE:epsilonTRUE"

reprex package (v0.3.0) 于 2019 年 6 月 16 日创建

【讨论】:

    猜你喜欢
    • 2020-06-22
    • 2018-05-28
    • 1970-01-01
    • 2018-11-19
    • 2021-11-08
    • 1970-01-01
    • 1970-01-01
    • 2012-08-10
    相关资源
    最近更新 更多