【问题标题】:Mixed sorting in R(R 中的混合排序)
【发布时间】:2015-03-04 05:12:29
【问题描述】:

我在重新调整字母数字列时遇到了一些问题,我正在寻找一些技巧(首选 base R)。

考虑以下几点:

# Sample data: quarterly revenue for three companies.
structure(
  list(
    Company = structure(
      c(2L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 3L, 3L, 3L, 3L),
      levels = c("ABC Inc", "ACME", "Handy Co"),
      class = "factor"
    ),
    Quarter = structure(
      c(2L, 3L, 4L, 5L, 6L, 1L, 2L, 3L, 2L, 3L, 4L, 5L, 6L, 1L, 3L, 4L, 5L, 6L),
      levels = c("1Q 2013", "1Q 2014", "1Q 2015", "2Q 2014", "3Q 2014", "4Q 2014"),
      class = "factor"
    ),
    Revenue = c(
      5000L, 6000L, 3200L, 1200L, 7200L, 10000L,
      2500L, 4100L, 1250L, 8100L, 2300L, 3700L, 1100L, 1600L,
      8000L, 9000L, 10000L, 12000L
    )
  ),
  names = c("Company", "Quarter", "Revenue"),
  class = "data.frame",
  row.names = c(NA, -18L)
)

我希望重新调整 Quarter 列,使其按年份和季度排序(例如 c("1Q 2013", "1Q 2014", "2Q 2014", ..., "1Q 2015"))。假设这只是一个更大的数据集的样本,该数据集包含可追溯至数十年前的各个季度。

【问题讨论】:

    标签: r sorting mixed


    【解决方案1】:

    我认为您应该将 Quarter 分成两个字段:quarter 和 year。这是使用 dplyr 的解决方案:

    library(dplyr)
    
    # Split labels such as "1Q 2014" into numeric quarter and year columns and
    # sort the rows chronologically.
    # NOTE(review): assumes `df` holds the question's sample data frame with a
    # Quarter column formatted as "<digit>Q <4-digit year>" -- confirm.
    df %>%
      mutate(
        quarter = as.numeric(substr(Quarter, 1, 1)), # leading digit, e.g. 1
        year = as.numeric(substr(Quarter, 4, 7))     # four-digit year, e.g. 2014
      ) %>%
      select(-Quarter) %>% # original field no longer needed
      # Year must be the primary key; sorting by quarter first would place
      # every 1Q (2013, 2014, 2015) ahead of 2Q 2014.
      arrange(year, quarter)
    

    使用tidyr::separate()可以得到更简洁的解决方案:

    library(tidyr)
    
    # separate() splits Quarter at the space into character columns
    # "quarter" (e.g. "1Q") and "year" (e.g. "2014").
    # NOTE(review): assumes `df` holds the question's sample data frame -- confirm.
    df %>%
      separate(Quarter, c("quarter", "year")) %>%
      # Year is the primary key so rows come out chronologically; quarter only
      # breaks ties within a year.
      arrange(year, quarter)
    

    【讨论】:

    • 我通常同意将它们分开。不幸的是,在我的具体情况下,我最终需要稍后在我的代码中重新组合它们。
    【解决方案2】:

    这里有两种使用 base R 的方法:

    # Sample data from the question: one row per company and quarter.
    # Factor levels are listed explicitly so the integer codes match the
    # alphabetical level order of the original data.
    dat <- data.frame(
      Company = factor(
        rep(c("ACME", "ABC Inc", "Handy Co"), times = c(6L, 8L, 4L)),
        levels = c("ABC Inc", "ACME", "Handy Co")
      ),
      Quarter = factor(
        c(
          "1Q 2014", "1Q 2015", "2Q 2014", "3Q 2014", "4Q 2014", "1Q 2013",
          "1Q 2014", "1Q 2015", "1Q 2014", "1Q 2015", "2Q 2014", "3Q 2014",
          "4Q 2014", "1Q 2013",
          "1Q 2015", "2Q 2014", "3Q 2014", "4Q 2014"
        ),
        levels = c("1Q 2013", "1Q 2014", "1Q 2015", "2Q 2014", "3Q 2014", "4Q 2014")
      ),
      Revenue = c(
        5000L, 6000L, 3200L, 1200L, 7200L, 10000L,
        2500L, 4100L, 1250L, 8100L, 2300L, 3700L, 1100L, 1600L,
        8000L, 9000L, 10000L, 12000L
      )
    )
    

    第一个是直截了当的,但使用正则表达式来分隔季度和年份

    # Group 1 captures the two-character quarter code, group 2 the four-digit
    # year; the whitespace between them is matched but not captured.
    quarter_year_re <- '(..)(?:\\s+)(\\d{4})'
    
    # Quarter codes
    x <- gsub(quarter_year_re, '\\1', dat$Quarter)
    x
    # [1] "1Q" "1Q" "2Q" "3Q" "4Q" "1Q" "1Q" "1Q" "1Q" "1Q" "2Q" "3Q" "4Q" "1Q" "1Q"
    # [16] "2Q" "3Q" "4Q"
    
    # Years
    y <- gsub(quarter_year_re, '\\2', dat$Quarter)
    y
    # [1] "2014" "2015" "2014" "2014" "2014" "2013" "2014" "2015" "2014" "2015"
    # [11] "2014" "2014" "2014" "2013" "2015" "2014" "2014" "2014"
    

    然后按组合排序:

    # Year is the primary sort key; the quarter code breaks ties within a year.
    row_order <- order(y, x)
    dat[row_order, ]
    #     Company Quarter Revenue
    # 6      ACME 1Q 2013   10000
    # 14  ABC Inc 1Q 2013    1600
    # 1      ACME 1Q 2014    5000
    # 7   ABC Inc 1Q 2014    2500
    # 9   ABC Inc 1Q 2014    1250
    # 3      ACME 2Q 2014    3200
    # 11  ABC Inc 2Q 2014    2300
    # 16 Handy Co 2Q 2014    9000
    # 4      ACME 3Q 2014    1200
    # 12  ABC Inc 3Q 2014    3700
    # 17 Handy Co 3Q 2014   10000
    # 5      ACME 4Q 2014    7200
    # 13  ABC Inc 4Q 2014    1100
    # 18 Handy Co 4Q 2014   12000
    # 2      ACME 1Q 2015    6000
    # 8   ABC Inc 1Q 2015    4100
    # 10  ABC Inc 1Q 2015    8100
    # 15 Handy Co 1Q 2015    8000
    

    第二种方法把季度和年份拆分到一个临时数据框的两列中(而不是创建两个独立的向量),然后照常用 order() 排序:

    # Split each "<quarter> <year>" label on the space, then stack the pieces
    # row-wise: column X1 holds the quarter codes and X2 the years.
    quarter_parts <- strsplit(as.character(dat$Quarter), ' ')
    tmp <- data.frame(do.call('rbind', quarter_parts), stringsAsFactors = FALSE)
    tmp
    #    X1   X2
    # 1  1Q 2014
    # 2  1Q 2015
    # 3  2Q 2014
    # 4  3Q 2014
    # 5  4Q 2014
    # 6  1Q 2013
    # 7  1Q 2014
    # 8  1Q 2015
    # 9  1Q 2014
    # 10 1Q 2015
    # 11 2Q 2014
    # 12 3Q 2014
    # 13 4Q 2014
    # 14 1Q 2013
    # 15 1Q 2015
    # 16 2Q 2014
    # 17 3Q 2014
    # 18 4Q 2014
    
    # Sort by the year column (2nd) first, then by the quarter column (1st).
    dat[order(tmp[[2]], tmp[[1]]), ]
    
    #     Company Quarter Revenue
    # 6      ACME 1Q 2013   10000
    # 14  ABC Inc 1Q 2013    1600
    # 1      ACME 1Q 2014    5000
    # 7   ABC Inc 1Q 2014    2500
    # 9   ABC Inc 1Q 2014    1250
    # 3      ACME 2Q 2014    3200
    # 11  ABC Inc 2Q 2014    2300
    # 16 Handy Co 2Q 2014    9000
    # 4      ACME 3Q 2014    1200
    # 12  ABC Inc 3Q 2014    3700
    # 17 Handy Co 3Q 2014   10000
    # 5      ACME 4Q 2014    7200
    # 13  ABC Inc 4Q 2014    1100
    # 18 Handy Co 4Q 2014   12000
    # 2      ACME 1Q 2015    6000
    # 8   ABC Inc 1Q 2015    4100
    # 10  ABC Inc 1Q 2015    8100
    # 15 Handy Co 1Q 2015    8000
    

    【讨论】:

    • 欣赏细节!
    最近更新 更多