每 2 列添加 1 个空白列答案

【问题标题】：Add 1 blank column every 2 columns每 2 列添加 1 个空白列
【发布时间】：2021-08-09 17:11:25
【问题描述】：

我有一个这样的数据框“df”：

col1 col2 col3 col4 col5 col6
 1    2    2    3    5    7
 2    4    6    4    8    2
 5    9    7    3    2    5
 3    4    5    6    8    1

我想创建一个新的数据框“new_df”，其中每 2 列有 1 个空白列（称为“空”），如下所示：

empty col1 col2 empty col3 col4 empty col5 col6
 NA    1    2    NA    2    3    NA    5    7
 NA    2    4    NA    6    4    NA    8    2
 NA    5    9    NA    7    3    NA    2    5
 NA    3    4    NA    5    6    NA    8    1

如何以这种方式添加空白列？我试过使用：

n = length(df)
empty <- NA

for (i in seq(1,n-2,2))
  {
  new_df <- add_column(df, empty, .before=i)
  }

但它只记住最后一步，给我这个结果：

col1 col2 col3 col4 empty col5 col6
 1    2     2    3   NA    5    7
 2    4     6    4   NA    8    2
 5    9     7    3   NA    2    5
 3    4     5    6   NA    8    1

【问题讨论】：

它只记得最后一步的原因是每次制作 new_df 时都在使用原始 df。

标签： r dataframe for-loop add

【解决方案1】：

另一种基本 R 解决方案

tmp1=seq(1,ncol(df),3)
tmp2=!(1:ncol(df) %in% tmp1)

df2=data.frame(matrix(NA,nrow(df),ncol(df)+ncol(df)/2))
df2[tmp2]=df

colnames(df2)[tmp1]=paste0("empty",1:length(tmp1))
colnames(df2)[tmp2]=colnames(df)

  empty1 col1 col2 empty2 col3 col4 empty3 col5 col6
1     NA    1    2     NA    2    3     NA    5    7
2     NA    2    4     NA    6    4     NA    8    2
3     NA    5    9     NA    7    3     NA    2    5
4     NA    3    4     NA    5    6     NA    8    1

【讨论】：

【解决方案2】：

基本的 R 解决方案是：

do.call(cbind, lapply(seq(1, ncol(df), by = 2), function(i) cbind(empty = rep(NA, nrow(df)), df[, seq(i, i+1)])))

#  empty col1 col2 empty col3 col4 empty col5 col6
#1    NA    1    2    NA    2    3    NA    5    7
#2    NA    2    4    NA    6    4    NA    8    2
#3    NA    5    9    NA    7    3    NA    2    5
#4    NA    3    4    NA    5    6    NA    8    1

“整洁”的解决方案可能是：

library(tidyverse)

map_dfc(seq(from = 1, to = ncol(df), by = 2),
        ~df %>%
          mutate(empty = NA) %>%
          select(empty, .x, .x+1))

#New names:
#* empty -> empty...1
#* empty -> empty...4
#* empty -> empty...7
#empty...1 col1 col2 empty...4 col3 col4 empty...7 col5 col6
#1        NA    1    2        NA    2    3        NA    5    7
#2        NA    2    4        NA    6    4        NA    8    2
#3        NA    5    9        NA    7    3        NA    2    5
#4        NA    3    4        NA    5    6        NA    8    1

【讨论】：

【解决方案3】：

使用append()。

for (i in 0:2*ncol(dat)/2) dat <- as.data.frame(append(dat, list(emp=NA), i))
dat
#   emp col1 col2 emp.1 col3 col4 emp.2 col5 col6
# 1  NA    1    2    NA    2    3    NA    5    7
# 2  NA    2    4    NA    6    4    NA    8    2
# 3  NA    5    9    NA    7    3    NA    2    5
# 4  NA    3    4    NA    5    6    NA    8    1

数据：

dat <- structure(list(col1 = c(1L, 2L, 5L, 3L), col2 = c(2L, 4L, 9L, 
4L), col3 = c(2L, 6L, 7L, 5L), col4 = c(3L, 4L, 3L, 6L), col5 = c(5L, 
8L, 2L, 8L), col6 = c(7L, 2L, 5L, 1L)), class = "data.frame", row.names = c(NA, 
-4L))

来了……

微基准测试

# Unit: microseconds
# expr      min        lq      mean   median       uq      max neval    cld
#  ronak()  969.707  990.9945 1001.4807 1012.282 1017.368 1022.453     3    d  
#   user()  349.937  358.0145  364.3877  366.092  371.613  377.134     3 a     
#    jay() 2098.003 2100.8540 2115.7640 2103.705 2124.644 2145.584     3     e 
# groth1() 2164.896 2262.5745 2363.6133 2360.253 2462.972 2565.691     3      f
# groth2()  424.546  438.0185  455.0820  451.491  470.350  489.209     3 ab    
# groth3()  722.551  728.0910  733.1910  733.631  738.511  743.391     3   c   
# r.user()  612.432  619.6570  636.9573  626.882  649.220  671.558     3  bc   


## and with the usual expanded data frame:
set.seed(42)
dat <- dat[sample(nrow(dat), 1e6, replace=T), ]
microbenchmark::microbenchmark(ronak(), user(), jay(), groth1(), groth2(), groth3(), r.user(), times=3L)
# Unit: milliseconds
# expr         min          lq        mean      median         uq        max neval cld
#  ronak() 1375.139030 1456.858743 1564.509886 1538.578457 1659.19531 1779.81217     3   c
#   user()   89.017416  200.845539  251.548652  312.673662  332.81427  352.95488     3 a  
#    jay()    7.655812    8.382333    9.941684    9.108855   11.08462   13.06039     3 a  
# groth1()  501.263785  514.097103  621.755474  526.930421  682.00132  837.07222     3  b 
# groth2()  143.438836  147.783741  189.033391  152.128645  211.83067  271.53269     3 a  
# groth3() 1387.314877 1406.898863 1469.493158 1426.482849 1510.58230 1594.68175     3   c
# r.user() 1469.543881 1472.770464 1483.834022 1475.997046 1490.97909 1505.96114     3   c

代码：

ronak <- \() {
  split_data <- split.default(dat,rep(seq_along(dat), each=2, length.out=ncol(dat)))
  do.call(cbind, Map(function(x, y) cbind(setNames(data.frame(NA), paste0('empty', x)), y), 
                     seq_along(split_data), split_data))
}

user <- \() {
  tmp1=seq(1, 9,3);tmp2=!(1:9 %in% tmp1);dat2=data.frame(matrix(NA,nrow(dat),ncol(dat)+ncol(dat)/2))
  dat2[tmp2]=dat;colnames(dat2)[tmp1]=paste0("empty",1:length(tmp1))
  colnames(dat2)[tmp2]=colnames(dat);dat2
}

jay <- \() {for (i in 0:2*ncol(dat)/2) dat <- as.data.frame(append(dat, list(emp=NA), i));dat}

groth1 <- \() suppressMessages({
  require(dplyr):require(purrr)
  dat %>% split.default(as.numeric(gl(ncol(.), 2, ncol(.)))) %>% map(~ bind_cols(empty=NA, .)) %>%
    bind_cols
})

groth2 <- \() {
  ix <- cumsum(seq_along(dat) %% 2 + 1);dat2 <- replace(data.frame(matrix(NA, nrow(dat), max(ix))), ix, dat)
  names(dat2) <- replace(rep("empty", ncol(dat2)), ix, names(dat));dat2
}

groth3 <- \() {
  ix <- as.numeric(gl(ncol(dat), 2, ncol(dat)))  # 1 1 2 2 3 3
  do.call("cbind", Map(cbind, empty = NA, split.default(dat, ix)))
}

r.user <- \() do.call(cbind, lapply(seq(1, ncol(dat), by=2), function(i)
  cbind(empty=rep(NA, nrow(dat)), dat[, seq(i, i+1)])))

【讨论】：

@G.Grothendieck 我在编辑中考虑到了这一点，谢谢！

【解决方案4】：

!) dplyr/purrr 拆分数据框 DF，在每个组件之前绑定一个 NA 列，然后将生成的组件重新绑定在一起。在多个列中使用与问题的示例输出中相同的列名存在无法按名称识别列的问题，因此使用唯一名称。

library(dplyr)
library(purrr)

DF %>%
  split.default(as.numeric(gl(ncol(.), 2, ncol(.)))) %>%
  map(~ bind_cols(empty = NA, .)) %>%
  bind_cols

给予：

  empty...1 col1 col2 empty...4 col3 col4 empty...7 col5 col6
1        NA    1    2        NA    2    3        NA    5    7
2        NA    2    4        NA    6    4        NA    8    2
3        NA    5    9        NA    7    3        NA    2    5
4        NA    3    4        NA    5    6        NA    8    1

2) Base R 创建一个向量 ix，它在结果数据帧中给出原始数据帧的索引，然后创建一个空结果并将 DF 及其名称复制到其中。

ix <- cumsum(seq_along(DF) %% 2 + 1)    # 2 3 5 6 8 9
DF2 <- replace(data.frame(matrix(NA, nrow(DF), max(ix))), ix, DF)
names(DF2) <- replace(rep("empty", ncol(DF2)), ix, names(DF))

DF2

给予：

  empty col1 col2 empty col3 col4 empty col5 col6
1    NA    1    2    NA    2    3    NA    5    7
2    NA    2    4    NA    6    4    NA    8    2
3    NA    5    9    NA    7    3    NA    2    5
4    NA    3    4    NA    5    6    NA    8    1

3) Base R 这是另一个 Base R 解决方案。它粗略地将 (1) 转换为 Base R。它给出的结果与 (2) 相同。

ix <- as.numeric(gl(ncol(DF), 2, ncol(DF)))  # 1 1 2 2 3 3
do.call("cbind", Map(cbind, empty = NA, split.default(DF, ix)))

4) eList eList 包可用于特别短的解决方案。

library(eList)

DF(for(i in seq(1, ncol(DF), 2)) list(empty = NA, DF[seq(i, len = 2)]))

给予：

  empty col1 col2 empty.1 col3 col4 empty.2 col5 col6
1    NA    1    2      NA    2    3      NA    5    7
2    NA    2    4      NA    6    4      NA    8    2
3    NA    5    9      NA    7    3      NA    2    5
4    NA    3    4      NA    5    6      NA    8    1

注意

可重现形式的输入。

Lines <- "col1 col2 col3 col4 col5 col6
 1    2    2    3    5    7
 2    4    6    4    8    2
 5    9    7    3    2    5
 3    4    5    6    8    1"
DF <- read.table(text = Lines, header = TRUE)

【讨论】：

我使用的是 4.0，但 4.1 似乎确实有所不同。我已将 c(...) 更改为 as.numeric(...) ，现在这两个版本应该相同。

【解决方案5】：

这是一个基本的 R 选项 -

我们可以将每 2 列的数据拆分为数据框列表，并使用Map 在每个数据框中添加一个带有NA 的新列。

split_data <- split.default(df,rep(seq_along(df), each = 2, length.out = ncol(df)))

do.call(cbind, Map(function(x, y) 
  cbind(setNames(data.frame(NA), paste0('empty', x)), y), 
  seq_along(split_data), split_data)) -> result

result

#  empty1 col1 col2 empty2 col3 col4 empty3 col5 col6
#1     NA    1    2     NA    2    3     NA    5    7
#2     NA    2    4     NA    6    4     NA    8    2
#3     NA    5    9     NA    7    3     NA    2    5
#4     NA    3    4     NA    5    6     NA    8    1

在数据框中有重复的列名不是一个好习惯，因此我将它们命名为empty1、empty2 等。

数据

df <- structure(list(col1 = c(1L, 2L, 5L, 3L), col2 = c(2L, 4L, 9L, 
4L), col3 = c(2L, 6L, 7L, 5L), col4 = c(3L, 4L, 3L, 6L), col5 = c(5L, 
8L, 2L, 8L), col6 = c(7L, 2L, 5L, 1L)), 
class = "data.frame", row.names = c(NA, -4L))

【讨论】：