【问题标题】:How to keep only the highest duplicated value in data frame?如何仅保留数据框中的最高重复值?
【发布时间】:2020-07-13 19:54:12
【问题描述】:

我有以下代码:

library(tidyverse)
# Collapse `astronauts` to one row per distinct combination of name,
# nationality and total_hrs_sum: summarise() with no arguments returns only
# the grouping columns, as the printed output shows.
astronauts %>% 
  group_by(name, nationality, total_hrs_sum) %>% 
  summarise() 

输出:

name                                 nationality         total_hrs_sum
<chr>                                <chr>               <dbl>
Acaba, Joseph M.                     U.S.                7272.23        
Acton, Loren Wilbur                  U.S.                190.94     
Adamson, James C.                    U.S.                334.00     
Afanasyev, Viktor Mikhaylovich       U.S.S.R/Russia      13338.55       
Aidyn (Aydyn) Akanovich Aimbetov     Kazakhstan          236.23     
Akers, Thomas D.                     U.S.                814.00     
Akiyama, Toyohiro                    Japan               189.90     
Aksyonov, Vladimir                   U.S.S.R/Russia      284.18     
Al Mansoori, Hazzaa                  UAE                 189.00     
Al-saud, Sultan bin Salman           Saudi Arabia        170.00

我的问题:

我想进一步过滤此数据框,使每个国籍只保留 1 个姓名,即该国籍中 total_hrs_sum 值最高的那一个。我更倾向于 dplyr 的解决方案,但也欢迎其他可行的方法。

样本数据:

structure(list(name = c("Acaba, Joseph M.", "Acton, Loren Wilbur", 
"Adamson, James C.", "Afanasyev, Viktor Mikhaylovich", "Aidyn (Aydyn) Akanovich Aimbetov", 
"Akers, Thomas D.", "Akiyama, Toyohiro", "Aksyonov, Vladimir", 
"Al Mansoori, Hazzaa", "Al-saud, Sultan bin Salman", "Aldrin, Edwin Eugene, Jr.", 
"Aleksandrov, Aleksandr", "Aleksandrov, Aleksandr", "Allen, Andrew M.", 
"Allen, Joseph P.", "Altman, Scott D.", "Anders, William Alison", 
"Anderson, Clayton C.", "Anderson, Michael P.", "André-Deshays, Claudie (Haigneré)", 
"Ansari, Anousheh", "Antonelli, Dominic A.", "Apt, Jerome", "Archambault, Lee J.", 
"Armstrong, Neil A.", "Arnaldo Tamayo Mendez", "Arnold, Richard R., II", 
"Artemyev, Oleg", "Artsebarsky, Anatoly", "Artyukhin, Yuri", 
"Ashby, Jeffrey S.", "Atkov, Oleg", "Aubakirov, Toktar", "Auñón-Chancellor, Serena", 
"Avdeyev, Sergei", "Bagian, James P.", "Baker, Ellen S.", "Baker, Michael A.", 
"Balandin, Aleksandr", "Barratt, Michael R.", "Barry, Daniel T.", 
"Bartoe, John-David Francis", "Baturin, Yuri", "Baudry, Patrick", 
"Bean, Alan Lavern", "Behnken, Robert L.", "Bella, Ivan", "Belyayev, Pavel", 
"Beregovoi, Georgi", "Berezovoy, Anatoly"), nationality = c("U.S.", 
"U.S.", "U.S.", "U.S.S.R/Russia", "Kazakhstan", "U.S.", "Japan", 
"U.S.S.R/Russia", "UAE", "Saudi Arabia", "U.S.", "Bulgaria", 
"U.S.S.R/Russia", "U.S.", "U.S.", "U.S.", "U.S.", "U.S.", "U.S.", 
"France", "U.S.", "U.S.", "U.S.", "U.S.", "U.S.", "Cuba", "U.S.", 
"U.S.S.R/Russia", "U.S.S.R/Russia", "U.S.S.R/Russia", "U.S.", 
"U.S.S.R/Russia", "U.S.S.R/Russia", "U.S.", "U.S.S.R/Russia", 
"U.S.", "U.S.", "U.S.", "U.S.S.R/Russia", "U.S.", "U.S.", "U.S.", 
"U.S.S.R/Russia", "France", "U.S.", "U.S.", "Slovakia", "U.S.S.R/Russia", 
"U.S.S.R/Russia", "U.S.S.R/Russia"), total_hrs_sum = c(7272.23, 
190.94, 334, 13338.55, 236.23, 814, 189.9, 284.18, 189, 170, 
289, 47, 7434.03, 904, 314, 1224, 147, 4046, 593, 614.37, 261.525, 
579, 847, 639.5, 206, 188.71, 307, 8784, 3471.35, 377.5, 664, 
5686.82, 190.2, 4722, 17942.23, 338, 686, 965, 4297.28, 5085, 
734, 190.94, 473.75, 169.63, 1671.75, 708, 190, 26.03, 94.83, 
5073.07)), row.names = c(NA, -50L), groups = structure(list(name = c("Acaba, Joseph M.", 
"Acton, Loren Wilbur", "Adamson, James C.", "Afanasyev, Viktor Mikhaylovich", 
"Aidyn (Aydyn) Akanovich Aimbetov", "Akers, Thomas D.", "Akiyama, Toyohiro", 
"Aksyonov, Vladimir", "Al Mansoori, Hazzaa", "Al-saud, Sultan bin Salman", 
"Aldrin, Edwin Eugene, Jr.", "Aleksandrov, Aleksandr", "Aleksandrov, Aleksandr", 
"Allen, Andrew M.", "Allen, Joseph P.", "Altman, Scott D.", "Anders, William Alison", 
"Anderson, Clayton C.", "Anderson, Michael P.", "André-Deshays, Claudie (Haigneré)", 
"Ansari, Anousheh", "Antonelli, Dominic A.", "Apt, Jerome", "Archambault, Lee J.", 
"Armstrong, Neil A.", "Arnaldo Tamayo Mendez", "Arnold, Richard R., II", 
"Artemyev, Oleg", "Artsebarsky, Anatoly", "Artyukhin, Yuri", 
"Ashby, Jeffrey S.", "Atkov, Oleg", "Aubakirov, Toktar", "Auñón-Chancellor, Serena", 
"Avdeyev, Sergei", "Bagian, James P.", "Baker, Ellen S.", "Baker, Michael A.", 
"Balandin, Aleksandr", "Barratt, Michael R.", "Barry, Daniel T.", 
"Bartoe, John-David Francis", "Baturin, Yuri", "Baudry, Patrick", 
"Bean, Alan Lavern", "Behnken, Robert L.", "Bella, Ivan", "Belyayev, Pavel", 
"Beregovoi, Georgi", "Berezovoy, Anatoly"), nationality = c("U.S.", 
"U.S.", "U.S.", "U.S.S.R/Russia", "Kazakhstan", "U.S.", "Japan", 
"U.S.S.R/Russia", "UAE", "Saudi Arabia", "U.S.", "Bulgaria", 
"U.S.S.R/Russia", "U.S.", "U.S.", "U.S.", "U.S.", "U.S.", "U.S.", 
"France", "U.S.", "U.S.", "U.S.", "U.S.", "U.S.", "Cuba", "U.S.", 
"U.S.S.R/Russia", "U.S.S.R/Russia", "U.S.S.R/Russia", "U.S.", 
"U.S.S.R/Russia", "U.S.S.R/Russia", "U.S.", "U.S.S.R/Russia", 
"U.S.", "U.S.", "U.S.", "U.S.S.R/Russia", "U.S.", "U.S.", "U.S.", 
"U.S.S.R/Russia", "France", "U.S.", "U.S.", "Slovakia", "U.S.S.R/Russia", 
"U.S.S.R/Russia", "U.S.S.R/Russia"), .rows = structure(list(1L, 
    2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 11L, 12L, 13L, 14L, 
    15L, 16L, 17L, 18L, 19L, 20L, 21L, 22L, 23L, 24L, 25L, 26L, 
    27L, 28L, 29L, 30L, 31L, 32L, 33L, 34L, 35L, 36L, 37L, 38L, 
    39L, 40L, 41L, 42L, 43L, 44L, 45L, 46L, 47L, 48L, 49L, 50L), ptype = integer(0), class = c("vctrs_list_of", 
"vctrs_vctr", "list"))), row.names = c(NA, 50L), class = c("tbl_df", 
"tbl", "data.frame"), .drop = TRUE), class = c("grouped_df", 
"tbl_df", "tbl", "data.frame"))
    

【问题讨论】:

    标签: r dataframe dplyr tidyverse


    【解决方案1】:

    我们可以先用 arrange 按 “nationality” 以及 “total_hrs_sum” 的降序排列,再按 “nationality” 分组,然后用 slice 取每组的第一行

    library(dplyr)
    # Order rows by nationality with the largest total_hrs_sum first, then
    # keep only the leading row of each nationality group.
    df %>%        
        arrange(nationality, desc(total_hrs_sum)) %>%
        group_by(nationality) %>%
        slice(1)
    

    或使用top_n

    # Per nationality, keep the row(s) with the largest total_hrs_sum.
    # NOTE(review): top_n() keeps ties, so a nationality could return more
    # than one row; it is also reported as superseded by slice_max() in
    # dplyr >= 1.0.0 -- confirm against the installed dplyr version.
    df %>%
        group_by(nationality) %>%
        top_n(n=1, total_hrs_sum)
    # A tibble: 10 x 3
    # Groups:   nationality [10]
    #   name                              nationality    total_hrs_sum
    #   <chr>                             <chr>                  <dbl>
    # 1 Acaba, Joseph M.                  U.S.                   7272.
    # 2 Aidyn (Aydyn) Akanovich Aimbetov  Kazakhstan              236.
    # 3 Akiyama, Toyohiro                 Japan                   190.
    # 4 Al Mansoori, Hazzaa               UAE                     189 
    # 5 Al-saud, Sultan bin Salman        Saudi Arabia            170 
    # 6 Aleksandrov, Aleksandr            Bulgaria                 47 
    # 7 André-Deshays, Claudie (Haigneré) France                  614.
    # 8 Arnaldo Tamayo Mendez             Cuba                    189.
    # 9 Avdeyev, Sergei                   U.S.S.R/Russia        17942.
    #10 Bella, Ivan                       Slovakia                190 
    

    【讨论】:

      【解决方案2】:

      试试这个:

      library(dplyr)

      # Sort the rows by total_hrs_sum, largest first, then within each
      # nationality group keep only the first occurrence of that nationality,
      # i.e. the row holding the group's highest total_hrs_sum.
      df %>%
        group_by(nationality) %>%
        arrange(desc(total_hrs_sum)) %>%
        filter(!duplicated(nationality))
      
      # A tibble: 10 x 3
      # Groups:   nationality [10]
         name                              nationality    total_hrs_sum
         <chr>                             <chr>                  <dbl>
       1 Avdeyev, Sergei                   U.S.S.R/Russia        17942.
       2 Acaba, Joseph M.                  U.S.                   7272.
       3 André-Deshays, Claudie (Haigneré) France                  614.
       4 Aidyn (Aydyn) Akanovich Aimbetov  Kazakhstan              236.
       5 Bella, Ivan                       Slovakia                190 
       6 Akiyama, Toyohiro                 Japan                   190.
       7 Al Mansoori, Hazzaa               UAE                     189 
       8 Arnaldo Tamayo Mendez             Cuba                    189.
       9 Al-saud, Sultan bin Salman        Saudi Arabia            170 
      10 Aleksandrov, Aleksandr            Bulgaria                 47 
      

      【讨论】:

        【解决方案3】:

        你可以这样过滤:

        # Keep exactly one row per nationality: the astronaut with the largest
        # total_hrs_sum in that group.
        astronauts %>%
          group_by(nationality) %>%
          # max() is evaluated per group here, so this keeps the row(s) that
          # hold each nationality's maximum.
          filter(total_hrs_sum == max(total_hrs_sum)) %>%
          # Several astronauts can tie for the maximum; keep only the first of
          # them so every nationality yields a single name. filter() preserves
          # row order and grouping, so the rest of the result is unchanged.
          filter(row_number() == 1)
        

        注意:在经过 group_by 分组的数据框中,max 计算的是每个组内的最大值。

        这是输出:

        # A tibble: 10 x 3
        # Groups:   nationality [10]
           name                              nationality    total_hrs_sum
           <chr>                             <chr>                  <dbl>
         1 Acaba, Joseph M.                  U.S.                   7272.
         2 Aidyn (Aydyn) Akanovich Aimbetov  Kazakhstan              236.
         3 Akiyama, Toyohiro                 Japan                   190.
         4 Al Mansoori, Hazzaa               UAE                     189 
         5 Al-saud, Sultan bin Salman        Saudi Arabia            170 
         6 Aleksandrov, Aleksandr            Bulgaria                 47 
         7 André-Deshays, Claudie (Haigneré) France                  614.
         8 Arnaldo Tamayo Mendez             Cuba                    189.
         9 Avdeyev, Sergei                   U.S.S.R/Russia        17942.
        10 Bella, Ivan                       Slovakia                190 
        

        【讨论】:

          猜你喜欢
          • 2021-12-23
          • 1970-01-01
          • 2016-10-15
          • 1970-01-01
          • 2019-12-21
          • 2013-07-10
          • 1970-01-01
          • 2021-12-27
          • 2020-02-17
          相关资源
          最近更新 更多