【问题标题】:Retain rows in list of data.frames according to conditions(根据条件保留 data.frames 列表中的行)
【发布时间】:2015-02-17 01:41:54
【问题描述】:

我有一个data.frame 数据如下。

# Example data: one row per individual (ind), with an identifier (id),
# a group label (gp) and a name (nm).
ind <- c(
  "22", "58", "57", "43", "23", "90", "45", "21", "27", "67",
  "17", "50", "71", "49", "92", "36", "62", "83", "02", "95"
)

gp <- c(
  "G1", "G1", "G1", "G23", "G23", "G28", "G28", "G29", "G53",
  "G56", "G56", "G67", "G77", "G77", "G79", "G79", "G79",
  "G82", "G82", "G82"
)
id <- c(
  "T297170", "T304934", "T437551", "T572358", "T572359", "T118839",
  "T304962", "T594651", "T113085", "T304969", "T444487", "T296315",
  "T305008", "T437400", "T113089", "T305032", "T557004", "T445004",
  "T445004", "T78642"
)
nm <- c(
  "MaskedMarvel", "DecemberSnowflakes", "MaskedMarvel",
  "WarmPuppy", "WarmPuppy", "SpringDance", "SpringDance",
  "RoastedMarshmallows", "TrickorTreat", "FrisbeeSailing",
  "FrisbeeSailing", "GreatPumpkin", "PumpkinHelmet", "PumpkinHelmet",
  "GoodSport", "GoodSport", "GoodSport", "GiftGettingSeason",
  "EasterBeagle", "EasterBeagle"
)

# The code further down calls levels(data$gp) and droplevels(), i.e. it
# expects factor columns. Since R 4.0 data.frame() no longer converts strings
# to factors by default, so request the conversion explicitly.
data <- data.frame(ind, id, gp, nm, stringsAsFactors = TRUE)
data
   ind      id  gp                  nm
1   22 T297170  G1        MaskedMarvel
2   58 T304934  G1  DecemberSnowflakes
3   57 T437551  G1        MaskedMarvel
4   43 T572358 G23           WarmPuppy
5   23 T572359 G23           WarmPuppy
6   90 T118839 G28         SpringDance
7   45 T304962 G28         SpringDance
8   21 T594651 G29 RoastedMarshmallows
9   27 T113085 G53        TrickorTreat
10  67 T304969 G56      FrisbeeSailing
11  17 T444487 G56      FrisbeeSailing
12  50 T296315 G67        GreatPumpkin
13  71 T305008 G77       PumpkinHelmet
14  49 T437400 G77       PumpkinHelmet
15  92 T113089 G79           GoodSport
16  36 T305032 G79           GoodSport
17  62 T557004 G79           GoodSport
18  83 T445004 G82   GiftGettingSeason
19  02 T445004 G82        EasterBeagle
20  95  T78642 G82        EasterBeagle

我只想保留那些在 nm 列中具有不同元素的组(由 gp 指定)的行。nm 中只有单一元素的组不应保留。

我通过下面的代码得到了想要的结果。

# Split the data.frame into a list of per-group data.frames. split() accepts
# both factor and character gp; levels() returns NULL for a character column
# (the data.frame() default since R 4.0), which would silently yield no groups.
# unname() keeps the original row names after rbind() (no "G1." prefixes).
data <- unname(split(data, data$gp, drop = TRUE))
# Keep only groups with more than one distinct value in nm. This also removes
# single-row groups, so a separate nrow() check is unnecessary.
data <- Filter(function(grp) length(unique(grp$nm)) > 1L, data)
# rbind the per-group data.frames back into a single data.frame
data <- do.call(rbind, data)
# Drop factor levels that no longer occur (no-op for character columns)
data <- droplevels(data)
data
   ind      id  gp                 nm    rm
1   22 T297170  G1       MaskedMarvel  TRUE
2   58 T304934  G1 DecemberSnowflakes FALSE
3   57 T437551  G1       MaskedMarvel  TRUE
18  83 T445004 G82  GiftGettingSeason FALSE
19  02 T445004 G82       EasterBeagle  TRUE
20  95  T78642 G82       EasterBeagle  TRUE

有没有更优雅的方法来实现这个结果,也许是在基础R 中的一个步骤?

【问题讨论】:

    标签: r list dataframe apply


    【解决方案1】:

    你可以试试

    # ave() returns a vector of the same type as its first argument, so with a
    # character input the per-group counts come back as strings. Convert them
    # to integer before comparing instead of relying on a string comparison.
    data1 <- data[with(data, as.integer(ave(as.character(nm), gp,
      FUN = function(x) length(unique(x)))) > 1L), ]
    # Flag rows whose nm value occurs more than once within its gp group.
    # as.logical() turns the "TRUE"/"FALSE" strings produced by ave() back into
    # a real logical column.
    transform(data1, rm = as.logical(ave(as.character(nm), gp,
      FUN = function(x) duplicated(x) | duplicated(x, fromLast = TRUE))))
    #   ind      id  gp                 nm    rm
    #1   22 T297170  G1       MaskedMarvel  TRUE
    #2   58 T304934  G1 DecemberSnowflakes FALSE
    #3   57 T437551  G1       MaskedMarvel  TRUE
    #18  83 T445004 G82  GiftGettingSeason FALSE
    #19  02 T445004 G82       EasterBeagle  TRUE
    #20  95  T78642 G82       EasterBeagle  TRUE
    

    或使用data.table

     library(data.table)
     # setDT() converts `data` to a data.table by reference (no copy is made).
     # Step 1: return a group's rows only when nm has more than one distinct
     #         value; groups for which j evaluates to NULL are dropped.
     # Step 2: within each group, flag rows whose nm value occurs more than once.
     # The trailing [] prints the result, because := returns invisibly.
     setDT(data)[, if (uniqueN(nm) > 1L) .SD, by = gp][
       , rm := duplicated(nm) | duplicated(nm, fromLast = TRUE), by = gp][]
     #    gp ind      id                 nm    rm
     #1:  G1  22 T297170       MaskedMarvel  TRUE
     #2:  G1  58 T304934 DecemberSnowflakes FALSE
     #3:  G1  57 T437551       MaskedMarvel  TRUE
     #4: G82  83 T445004  GiftGettingSeason FALSE
     #5: G82  02 T445004       EasterBeagle  TRUE
     #6: G82  95  T78642       EasterBeagle  TRUE
    

    或使用dplyr

     library(dplyr)
     # Keep only groups whose nm column has more than one distinct value, then
     # flag rows whose nm value occurs more than once within the group.
     data %>%
       group_by(gp) %>%
       filter(n_distinct(nm) > 1) %>%
       mutate(rm = duplicated(nm) | duplicated(nm, fromLast = TRUE)) %>%
       # Drop the grouping so later verbs do not silently operate per group.
       ungroup()
    

    【讨论】:

      猜你喜欢
      • 2021-01-24
      • 2022-11-20
      • 1970-01-01
      • 2020-02-25
      • 1970-01-01
      • 2021-08-08
      • 1970-01-01
      • 2021-12-21
      • 2018-06-10
      相关资源
      最近更新 更多