【问题标题】:Correspondance between values in two df R两个df R中的值之间的对应关系
【发布时间】:2023-04-05 13:31:01
【问题描述】:

我有两个df要对抗。我的第一个df是“sum”

> head(sum)
   File_pdb  Res1      Chain1     Res2      Chain2
1:  7LD1_CM  GLN 81      M       ASN 501      C
2:  7LD1_CM  TYR 128     M       PHE 377      C
3:  7LD1_CM  ILE 78      M       SER 375      C
4:  7LD1_CM  ASN 76      M       ALA 372      C
5:  7LD1_CM  THR 20      M       TYR 369      C
6:  7LD1_CM  ARG 408     C       LEU 131      M

第二个是“mut”

> head(mut)
   RefAA  MutAA LineagesCount
1  VAL 3  GLY 3             1
2  LEU 5  PHE 5             2
3  LEU 8  VAL 8             1
4 SER 13 ILE 13             2
5 LEU 18 PHE 18             5
6 THR 20 ILE 20             1

我必须检查 sum$res1 和 sum$res2 中是否有等于 mut$refAA 的值。如果是这样,我需要在 sum$res1 或 sum$res2 附近添加整行 mut$refAA。

这里是一个例子:

    File_pdb  Res1      Chain1     Res2      Chain2 RefAA  MutAA  LineagesCount
1:  7LD1_CM  GLN 81      M       ASN 501      C
2:  7LD1_CM  TYR 128     M       PHE 377      C
3:  7LD1_CM  ILE 78      M       SER 375      C
4:  7LD1_CM  ASN 76      M       ALA 372      C
5:  7LD1_CM  THR 20      M       TYR 369      C     THR 20   ILE 20     1
6:  7LD1_CM  ARG 408     C       LEU 131      M

我该怎么做?我正在尝试使用合并和连接功能,但我没有那么有经验,所以我需要更多练习。有人能帮我吗?谢谢!

【问题讨论】:

    标签: r dataframe join merge correspondence


    【解决方案1】:

    我不得不稍微修正一下数据,以便轻松导入数据。那你可以试试tidyverse

    library(tidyverse)
    SUM %>% 
      mutate(index = 1:n()) %>% 
      pivot_longer(c(Res1, Res2)) %>%   
      left_join(mutate(MUT, value=RefAA), by = "value") %>%  
      group_by(index) %>% 
      fill(MutAA, RefAA, LineagesCount, .direction = "downup") %>% 
      ungroup() %>% 
      pivot_wider(names_from = name, values_from = value, values_fn = toString) %>% 
      mutate(which_Res = ifelse(RefAA == Res1, "Res1", "Res2"))
    # A tibble: 6 x 10
      File_pdb Chain1 Chain2 index RefAA MutAA LineagesCount Res1   Res2   which_Res
      <chr>    <chr>  <chr>  <int> <chr> <chr>         <int> <chr>  <chr>  <chr>    
    1 7LD1_CM  M      C          1 NA    NA               NA GLN81  ASN501 NA       
    2 7LD1_CM  M      C          2 NA    NA               NA TYR128 PHE377 NA       
    3 7LD1_CM  M      C          3 NA    NA               NA ILE78  SER375 NA       
    4 7LD1_CM  M      C          4 NA    NA               NA ASN76  ALA372 NA       
    5 7LD1_CM  M      C          5 THR20 ILE20             1 THR20  TYR369 Res1     
    6 7LD1_CM  C      M          6 NA    NA               NA ARG408 LEU131 NA   
    

    数据

    SUM <- read.table(text = "   File_pdb  Res1      Chain1     Res2      Chain2
    1:  7LD1_CM  GLN81      M       ASN501      C
    2:  7LD1_CM  TYR128     M       PHE377      C
    3:  7LD1_CM  ILE78      M       SER375      C
    4:  7LD1_CM  ASN76      M       ALA372      C
    5:  7LD1_CM  THR20      M       TYR369      C
    6:  7LD1_CM  ARG408     C       LEU131      M") 
    SUM
    
    MUT <- read.table(text = " RefAA  MutAA LineagesCount
    1  VAL3  GLY3             1
    2  LEU5  PHE5             2
    3  LEU8  VAL8             1
    4 SER13 ILE13             2
    5 LEU18 PHE18             5
    6 THR20 ILE20             1")
    

    【讨论】:

    • 没关系,但我需要知道我的 MutAA 是指 res1 还是 res2。在最终的df中报告RefAA也会有所帮助
    【解决方案2】:

    希望这会有所帮助

    do.call(
      dplyr::coalesce,
      lapply(
        c("Res1", "Res2"),
        function(x) merge(SUM, MUT, by.x = x, by.y = "RefAA", all.x = TRUE)
      )
    )
    

    给了

        Res1 File_pdb Chain1   Res2 Chain2 MutAA LineagesCount
    1 ARG408  7LD1_CM      C LEU131      M  <NA>            NA
    2  ASN76  7LD1_CM      M ALA372      C  <NA>            NA
    3  GLN81  7LD1_CM      M ASN501      C  <NA>            NA
    4  ILE78  7LD1_CM      M SER375      C  <NA>            NA
    5  THR20  7LD1_CM      M TYR369      C ILE20             1
    6 TYR128  7LD1_CM      M PHE377      C  <NA>            NA
    

    数据

    > dput(SUM)
    structure(list(File_pdb = c("7LD1_CM", "7LD1_CM", "7LD1_CM",
    "7LD1_CM", "7LD1_CM", "7LD1_CM"), Res1 = c("GLN81", "TYR128",
    "ILE78", "ASN76", "THR20", "ARG408"), Chain1 = c("M", "M", "M",
    "M", "M", "C"), Res2 = c("ASN501", "PHE377", "SER375", "ALA372",
    "TYR369", "LEU131"), Chain2 = c("C", "C", "C", "C", "C", "M")), class = "data.frame", row.names = c("1:",
    "2:", "3:", "4:", "5:", "6:"))
    
    > dput(MUT)
    structure(list(RefAA = c("VAL3", "LEU5", "LEU8", "SER13", "LEU18",
    "THR20"), MutAA = c("GLY3", "PHE5", "VAL8", "ILE13", "PHE18", 
    "ILE20"), LineagesCount = c(1L, 2L, 1L, 2L, 5L, 1L)), class = "data.frame", row.names = c("1",
    "2", "3", "4", "5", "6"))
    

    【讨论】:

      猜你喜欢
      • 1970-01-01
      • 2020-09-25
      • 1970-01-01
      • 1970-01-01
      • 1970-01-01
      • 1970-01-01
      • 1970-01-01
      • 1970-01-01
      • 2022-01-07
      相关资源
      最近更新 更多