【问题标题】:Reshaping a file that isn't time varying重塑不随时间变化的文件
【发布时间】:2021-11-13 14:39:06
【问题描述】:

我想重塑一个文件,但我认为 reshape 的典型用法并不适用于我的问题。我可以手动完成,但这样很容易出错,而且也学不到如何用 R 来完成这项工作。所以在阅读了 reshape 和 tidyr 的资料之后,我决定来这里提问。

我有一份电力公司及其运营所在县的档案。我希望能够重塑它,使其成为一个县列表以及每个县内的电力公司。我还有一个名为“InData”的变量,用于标记我在另一个数据集中是否有该电力公司的数据。然后我会在此基础上确定每个县的数据缺失程度。

列名是: UtilityName, InData, County1,County2,County3,County4... County12(下面的示例数据还包含 UtilityType 列,且县列只到 County10)

有没有一种简单的方法来重塑它,还是我需要创建某种循环遍历每一列?

以下是数据示例:

dput(utility_county)
structure(list(UtilityName = c("Alder Mutual Light Co Inc", "Avista Corporation", 
"Asotin County PUD", "Benton County PUD", "Blaine City Light", 
"Centralia City Light", "Chelan County PUD", "City of Cashmere", 
"City of Chewelah, Electric Department", "City of Cheney", "City of Coulee Dam Light Dept", 
"City of Ellensburg", "City of McCleary", "City of Milton", "City of Richland", 
"City of Sumas", "Clallam County PUD", "Clark County PUD", "Clearwater Power", 
"Columbia Rural Electric", "Cowlitz County PUD", "Douglas County PUD", 
"Elmhurst Power & Light Co", "Ferry County PUD", "Franklin County PUD", 
"Grant County PUD", "Grays Harbor County PUD", "Inland Power & Light", 
"Jefferson County PUD", "Kittitas County PUD", "Klickitat County PUD", 
"Kootenai Electric Cooperative Inc", "Lakeview Light & Power", 
"Lewis County PUD", "Mason County PUD1", "Mason County PUD3", 
"Modern Electric Water Company", "Nespelem Valley Electric Cooperative", 
"Ohop Mutual Light Co", "Okanogan PUD", "Orcas Power and Light Coop", 
"Pacific County PUD", "Pacific Power", "Parkland Light & Power", 
"Pend Oreille PUD", "Peninsula Light Company", "Port Angeles City Light", 
"Puget Sound Energy", "Seattle City Light", "Skamania PUD", "Snohomish County PUD", 
"Tacoma Public Utilities", "Tanner Electric Cooperative", "Town of Eatonville", 
"Town of Ruston", "Town of Steilacoom", "Vera Water & Power", 
"Wahkiakum County PUD", "Whatcom County PUD", "Big Bend Electric", 
"Northern Lights Inc"), InData = c(0L, 1L, 0L, 1L, 0L, 0L, 0L, 
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 1L, 1L, 0L, 1L, 1L, 0L, 
0L, 0L, 0L, 0L, 1L, 1L, 0L, 0L, 1L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 
0L, 1L, 0L, 1L, 0L, 0L, 1L, 0L, 1L, 1L, 0L, 1L, 1L, 1L, 0L, 0L, 
0L, 0L, 0L, 0L, 0L, 1L), UtilityType = c("Coop", "Private", "Public", 
"Public", "Public", "Public", "Public", "Public", "Public", "Public", 
"Public", "Public", "Public", "Public", "Public", "Public", "Public", 
"Public", "Coop", "Coop", "Public", "Public", "Coop", "Public", 
"Public", "Public", "Public", "Coop", "Public", "Public", "Public", 
"Coop", "Coop", "Public", "Public", "Public", "Public", "Coop", 
"Coop", "Public", "Coop", "Public", "Private", "Coop", "Public", 
"Coop", "Public", "Private", "Public", "Public", "Public", "Public", 
"Coop", "Public", "Public", "Public", "Coop", "Public", "Public", 
"Coop", "Coop"), County1 = c("Pierce", "Whitman", "Asotin", "Benton", 
"Whatcom", "Lewis", "Chelan", "Chelan", "Stevens", "Spokane", 
"Grant", "Kittitas", "Grays Harbor", "Fulton County", "Benton County", 
"Whatcom", "Clallam", "Clark", "Asotin  ", "Walla Walla", "Cowlitz", 
"Douglas", "Pierce", "Ferry", "Franklin", "Grant", "Grays Harbor", 
"Spokane", "Jefferson", "Kittitas", "Klickitat", "Spokane", "Grant", 
"Lewis", "Mason", "Mason", "Spokane", "Okanogan", "Pierce", "Okanogan", 
"San Juan", "Pacific", "Yakima", "Pierce", "Pend Oreille", "Pierce", 
"Clallam", "Island", "King", "Skamania", "Snohomish", "Pierce", 
"King", "Pierce", "Pierce", "Pierce", "Spokane", "Wahkiakum", 
"Whatcom", "Adams", "Pend Oreille"), County2 = c("", "Spokane", 
"", "", "", "", "", "", "", "", "Douglas", "", "", "", "", "", 
"", "", "Whitman", "Columbia", "", "", "", "Okanogan", "", "", 
"", "Whitman", "", "", "", "", "", "", "", "", "", "", "", "", 
"", "", "Garfield", "", "", "", "", "King", "", "", "Island", 
"Mason", "Pierce", "", "", "", "", "", "", "", ""), County3 = c("", 
"Stevens", "", "", "", "", "", "", "", "", "Okanogan", "", "", 
"", "", "", "", "", "Garfield", "", "", "", "", "", "", "", "", 
"Lincoln", "", "", "", "", "", "", "", "", "", "", "", "", "", 
"", "Columbia", "", "", "", "", "Kitsap", "", "", "", "Grays Harbor", 
"", "", "", "", "", "", "", "", ""), County4 = c("", "Lincoln", 
"", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", 
"", "", "", "", "", "", "", "", "", "Stevens", "", "", "", "", 
"", "", "", "", "", "", "", "", "", "", "Walla Walla", "", "", 
"", "", "Kittitas", "", "", "", "Lewis", "", "", "", "", "", 
"", "", "", ""), County5 = c("", "Asotin", "", "", "", "", "", 
"", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", 
"", "", "", "", "Garfield", "", "", "", "", "", "", "", "", "", 
"", "", "", "", "", "Benton", "", "", "", "", "Pierce", "", "", 
"", "", "", "", "", "", "", "", "", "", ""), County6 = c("", 
"Adams", "", "", "", "", "", "", "", "", "", "", "", "", "", 
"", "", "", "", "", "", "", "", "", "", "", "", "Adams", "", 
"", "", "", "", "", "", "", "", "", "", "", "", "", "Kittitas", 
"", "", "", "", "Skagit", "", "", "", "", "", "", "", "", "", 
"", "", "", ""), County7 = c("", "Ferry", "", "", "", "", "", 
"", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", 
"", "", "", "", "Columbia", "", "", "", "", "", "", "", "", "", 
"", "", "", "", "", "", "", "", "", "", "Thurston", "", "", "", 
"", "", "", "", "", "", "", "", "", ""), County8 = c("", "Franklin", 
"", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", 
"", "", "", "", "", "", "", "", "", "Pend Orielle", "", "", "", 
"", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", 
"Whatcom", "", "", "", "", "", "", "", "", "", "", "", "", ""
), County9 = c("", "Grant", "", "", "", "", "", "", "", "", "", 
"", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", 
"Grant", "", "", "", "", "", "", "", "", "", "", "", "", "", 
"", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", 
"", "", "", ""), County10 = c("", "Pend Oreille", "", "", "", 
"", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", 
"", "", "", "", "", "", "Asotin", "", "", "", "", "", "", "", 
"", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", 
"", "", "", "", "", "", "", "", "", "")), row.names = c(NA, 61L
), class = "data.frame")

【问题讨论】:

  • 请使用dput(df) 的输出分享您的数据集的样例。另外,分享一下转换后的预期输出。
  • 我补充了,谢谢!

标签: r dplyr reshape tidyr


【解决方案1】:

使用reshape

## Work on a copy of the question's data frame so `utility_county` itself is
## left untouched.
dat <- utility_county

## Encode missings first: every empty string becomes NA, column by column.
dat[] <- lapply(dat, \(x) {x[x == ''] <- NA;x})

## Locate the County1..CountyN columns by name rather than by hard-coded
## position, so the call keeps working if columns are added or reordered.
county_cols <- grep('^County[0-9]+$', names(dat))

## Wide -> long: one row per (utility, county slot).
##   County : the slot number (1 for County1, 2 for County2, ...)
##   FOO    : the county name stored in that slot (NA where the slot was empty)
##   id     : row index of the utility in `dat`, added by reshape()
res <- reshape(dat, varying=county_cols, sep='', direction='long', timevar='County', 
               v.names='FOO')
head(res)
#                   UtilityName InData UtilityType County     FOO id
# 1.1 Alder Mutual Light Co Inc      0        Coop      1  Pierce  1
# 2.1        Avista Corporation      1     Private      1 Whitman  2
# 3.1         Asotin County PUD      0      Public      1  Asotin  3
# 4.1         Benton County PUD      1      Public      1  Benton  4
# 5.1         Blaine City Light      0      Public      1 Whatcom  5
# 6.1      Centralia City Light      0      Public      1   Lewis  6

## 61 utilities x 10 county slots = 610 rows; empty (NA) slots are kept.
dim(res)
# [1] 610   6

【讨论】:

    【解决方案2】:

    这是tidyr 的一种方法:

    # Packages providing %>%, filter/group_by/summarise (dplyr) and pivot_longer (tidyr).
    library(dplyr)
    library(tidyr)

    # Stack every column outside the UtilityName..UtilityType range (i.e. the
    # CountyN columns) into a single "County" column, discard the empty
    # placeholders, then report per county the utilities found there and how many
    # of them have InData == 0 (row count minus the sum of the 0/1 InData flag).
    utility_county %>%
      pivot_longer(cols = -c(UtilityName:UtilityType), values_to = "County") %>%
      filter(County != "") %>%
      group_by(County) %>%
      summarise(
        Utilities = toString(UtilityName),
        Missing   = n() - sum(InData)
      )
    ## A tibble: 43 x 3
    #   County          Utilities                                                    Missing
    #   <chr>           <chr>                                                          <int>
    # 1 "Adams"         Avista Corporation, Inland Power & Light, Big Bend Electric        1
    # 2 "Asotin"        Avista Corporation, Asotin County PUD, Inland Power & Light        1
    # 3 "Asotin  "      Clearwater Power                                                   0
    # 4 "Benton"        Benton County PUD, Pacific Power                                   0
    # 5 "Benton County" City of Richland                                                   1
    # 6 "Chelan"        Chelan County PUD, City of Cashmere                                2
    # 7 "Clallam"       Clallam County PUD, Port Angeles City Light                        1
    # 8 "Clark"         Clark County PUD                                                   0
    # 9 "Columbia"      Columbia Rural Electric, Inland Power & Light, Pacific Power       1
    #10 "Cowlitz"       Cowlitz County PUD                                                 0
    ## … with 33 more rows
    

    请注意,您的数据中有带尾随空格的 "Asotin  "(因此它在上面的结果中被单独归为一组);如果需要,可以先用 trimws() 去掉空格再进行汇总。

    【讨论】:

    • 这对我来说是一个很好的开始,谢谢!
    • 让我惊讶的是,代码不需要任何明确的指示就知道要提取县名。例如,它不需要包含这样的参数:cols = starts_with("County")
    • 您会注意到 pivot_longer 的第一个参数是数据,它是从上一行通过管道传入的。第二个参数是 cols =。在这种情况下,我们指定的是“除了 UtilityName 到 UtilityType 之间的列以外的所有列”。这就是 -c(UtilityName:UtilityType) 语法的含义。
    • 总是很高兴亲爱的伊恩。
    • 谢谢你,伊恩! :) 非常酷。
    【解决方案3】:
    # Split the column names once: CountyN columns get stacked, everything else
    # identifies a utility. The regex is anchored so only names of the exact form
    # "County<digits>" match.
    county_cols = grep('^County\\d+$', names(utility_county), value = TRUE)
    id_cols     = setdiff(names(utility_county), county_cols)
    
    # Wide -> long: one row per (utility, county slot).
    #   county_id   : slot number (1 for County1, 2 for County2, ...)
    #   county_name : the county stored in that slot
    utility_county_long = 
      reshape(utility_county, 
              direction = 'long', 
              varying = county_cols, 
              timevar = 'county_id', 
              v.names = 'county_name', 
              idvar = id_cols)
    
    rownames(utility_county_long) = NULL # reshape concatenates the idvars into rownames 
                                         # but we don't need them
    
    # remove white spaces FIRST, so values such as "Asotin  " are normalised and
    # whitespace-only cells collapse to "" before the missing-value step below
    utility_county_long = transform(utility_county_long,
              UtilityName = trimws(UtilityName, 'both'), 
              county_name = trimws(county_name))
    
    # make empties explicit NAs; %in% (unlike ==) never yields NA, so this
    # assignment is safe even if county_name already contains NA
    utility_county_long$county_name[utility_county_long$county_name %in% ""] = NA
    
    head(utility_county_long, 10)
    
    # number of utilities per county (aggregate() drops the NA county group)
    counts = aggregate(list(N=utility_county_long$UtilityName), 
                       list(county=utility_county_long$county_name),
                       length)
    
    # extra: add pasted-together utility names as in @Ian Campbell's answer
    counts$utilities = aggregate(list(utilities = utility_county_long$UtilityName), 
                                 list(county = utility_county_long$county_name), 
                                 \(x) paste(x, collapse = ", "))$utilities
    
    
    counts[, c("county", "N")]
              county  N
    1          Adams  3
    2         Asotin  4
    3         Benton  2
    4  Benton County  1
    5         Chelan  2
    6        Clallam  2
    7          Clark  1
    8       Columbia  3
    9        Cowlitz  1
    10       Douglas  2
    11         Ferry  2
    12      Franklin  2
    13 Fulton County  1
    14      Garfield  3
    15         Grant  5
    16  Grays Harbor  3
    17        Island  2
    18     Jefferson  1
    19          King  3
    20        Kitsap  1
    21      Kittitas  4
    22     Klickitat  1
    23         Lewis  3
    24       Lincoln  2
    25         Mason  3
    26      Okanogan  4
    27       Pacific  1
    28  Pend Oreille  3
    29  Pend Orielle  1
    30        Pierce 11
    31      San Juan  1
    32        Skagit  1
    33      Skamania  1
    34     Snohomish  1
    35       Spokane  6
    36       Stevens  3
    37      Thurston  1
    38     Wahkiakum  1
    39   Walla Walla  2
    40       Whatcom  4
    41       Whitman  3
    42        Yakima  1
    

    【讨论】:

    • 嗨,谢谢。这不完全是我想要的结果,但我想弄懂这段代码,因为我觉得稍加调整就能达到目的。grep 代码中的 \\d+ 有什么作用?
    • 编辑了我的答案。\d 是匹配任意一个数字的字符类,+ 是表示“前一项出现一次或多次”的量词。如果您需要了解更多信息,可以阅读正则表达式的相关资料。这是一个很好的起点:r4ds.had.co.nz/strings.html#character-classes-and-alternatives,或者阅读帮助页面 ?`regular-expressions`
    猜你喜欢
    • 2013-07-05
    • 1970-01-01
    • 1970-01-01
    • 1970-01-01
    • 2023-03-23
    • 1970-01-01
    • 2017-11-20
    • 1970-01-01
    • 2019-03-24
    相关资源
    最近更新 更多