【问题标题】:Tidyversing working R code with a for loop(将使用 for 循环的可运行 R 代码改写为 tidyverse 风格)
【发布时间】:2019-01-25 13:17:58
【问题描述】:

我有一对城市 V1 和 V2 的数据集。每个城市都有人口 v1_pop2015 和 v2_pop2015。

我想创建一个新数据集:对每一对城市,只保留其中人口较多的城市的城市代码,以及两个城市的人口之和。

我能够使用 for 循环创建我想要的输出。出于教育目的,我尝试使用 tidyverse 工具来做这件事,但没有成功。

这是一个工作示例

library(tidyverse)

## Sample dataset
# Six city pairs, one row per pair: the two city codes (cityCodeV1, cityCodeV2)
# and the matching populations (v1_pop2015, v2_pop2015). The class vector marks
# the object as a tibble.
pairs_pop <- structure(list(cityCodeV1 = c(20073, 20888, 20222, 22974, 23792, 
20779), cityCodeV2 = c(20063, 204024, 20183, 20406, 23586, 23595
), v1_pop2015 = c(414, 682, 497, 3639, 384, 596), v2_pop2015 = c(384, 
757, 5716, 315, 367, 1303)), row.names = c(NA, 6L), class = c("tbl_df", 
"tbl", "data.frame"))

pairs_pop
#> # A tibble: 6 x 4
#>   cityCodeV1 cityCodeV2 v1_pop2015 v2_pop2015
#> *      <dbl>      <dbl>      <dbl>      <dbl>
#> 1      20073      20063        414        384
#> 2      20888     204024        682        757
#> 3      20222      20183        497       5716
#> 4      22974      20406       3639        315
#> 5      23792      23586        384        367
#> 6      20779      23595        596       1303


#### This is working !!!
# Build the result one row at a time: for each pair keep the code of the more
# populous city together with the combined population of both cities.
# The columns are preallocated as typed NA vectors so that an empty input
# yields an empty, well-formed data frame.
clean_df <- data.frame(
  to_keep = rep(NA_real_, nrow(pairs_pop)),
  to_keep_pop = rep(NA_real_, nrow(pairs_pop))
)
# seq_len() gives an empty sequence for zero rows, whereas 1:0 would iterate
# over c(1, 0) and fail on the first comparison.
for (i in seq_len(nrow(pairs_pop))) {
  # Strict `>`: when both populations are equal, the V2 city is kept.
  if (pairs_pop$v1_pop2015[i] > pairs_pop$v2_pop2015[i]) {
    clean_df$to_keep[i] <- pairs_pop$cityCodeV1[i]
  } else {
    clean_df$to_keep[i] <- pairs_pop$cityCodeV2[i]
  }
  # The combined population does not depend on which city is kept.
  clean_df$to_keep_pop[i] <- pairs_pop$v1_pop2015[i] + pairs_pop$v2_pop2015[i]
}
clean_df 
#>   to_keep to_keep_pop
#> 1   20073         798
#> 2  204024        1439
#> 3   20183        6213
#> 4   22974        3954
#> 5   23792         751
#> 6   23595        1899

这就是我卡住的地方

### trying to tidy it  with rowwise, mutate and a function

# Predicate: TRUE when the V1 city is strictly more populous than the V2 city,
# FALSE otherwise.
# `x` must support `$v1_pop2015` and `$v2_pop2015` (e.g. a list or a one-row
# data frame). The argument is printed first, as a debugging aid.
v1_sup_tov2 <- function(x) {
  print(x)
  v1_is_larger <- x$v1_pop2015 > x$v2_pop2015
  if (v1_is_larger) TRUE else FALSE
}

# Attempted tidyverse version (this is the part that does not work).
# NOTE(review): mutate_if() applies its predicate to each column in order to
# choose which columns to transform; it does not test rows, so v1_sup_tov2
# does not receive a row with $v1_pop2015 / $v2_pop2015 here.
to_clean_df2 <- pairs_pop %>%
  rowwise() %>%
  mutate_if(v1_sup_tov2,
            to_keep = cityCodeV1,
            to_delete= cityCodeV2,
            to_keep_pop = v1_pop2015 + v2_pop2015)

预期的输出是一个包含 2 列的数据框,如下所示:to_keep:我要保留的城市的 cityCode;to_keep_pop:该城市合并后的人口(即两个城市的人口之和)。

clean_df 
#>   to_keep to_keep_pop
#> 1   20073         798
#> 2  204024        1439
#> 3   20183        6213
#> 4   22974        3954
#> 5   23792         751
#> 6   23595        1899

【问题讨论】:

    标签: r dplyr tidyverse


    【解决方案1】:

    这个怎么样?

    library(dplyr)
    
    ## Sample dataset
    # Six city pairs, one row per pair: the two city codes and the matching
    # populations. The class vector marks the object as a tibble.
    pairs_pop <- structure(
      list(cityCodeV1 = c(20073, 20888, 20222, 22974, 23792, 20779),
           cityCodeV2 = c(20063, 204024, 20183, 20406, 23586, 23595),
           v1_pop2015 = c(414, 682, 497, 3639, 384, 596),
           v2_pop2015 = c(384, 757, 5716, 315, 367, 1303)),
      row.names = c(NA, 6L), class = c("tbl_df", "tbl", "data.frame"))
    
    # For every pair keep the code of the more populous city (V2 when the
    # populations are equal) and the combined population of both cities.
    # transmute() drops all other columns.
    clean_df <- pairs_pop %>%
      transmute(
        to_keep = if_else(v1_pop2015 > v2_pop2015, cityCodeV1, cityCodeV2),
        to_keep_pop = v1_pop2015 + v2_pop2015
      )
    
    

    【讨论】:

      【解决方案2】:

      以防将来每组数据中包含两个以上的城市(v1、v2、v3……),别忘了把所有信息都保留在数据框中,这样才能知道每个值对应哪个城市——也就是使用一个整洁(tidy)的数据框。

      library(dplyr)
      
      ## Sample dataset
      # Six city pairs, one row per pair: the two city codes and the matching
      # populations. The class vector marks the object as a tibble.
      pairs_pop <- structure(
        list(cityCodeV1 = c(20073, 20888, 20222, 22974, 23792, 20779),
             cityCodeV2 = c(20063, 204024, 20183, 20406, 23586, 23595),
             v1_pop2015 = c(414, 682, 497, 3639, 384, 596),
             v2_pop2015 = c(384, 757, 5716, 315, 367, 1303)),
        row.names = c(NA, 6L), class = c("tbl_df", "tbl", "data.frame"))
      
      # Tidy dataset with all information that was in columns
      
      library(dplyr)
      library(tidyr)
      library(stringr)
      
      # Reshape to one row per (pair, city): `city` is the index of the pair,
      # `ville` the position of the city inside the pair (the digits after the
      # "v"/"V" in the column name), and `cityCode` / `pop` hold that city's
      # values.
      tidy_pairs <- pairs_pop %>%
        # row_number() is safe on zero-row input, unlike 1:n().
        mutate(city = row_number()) %>%
        pivot_longer(-city, names_to = "key", values_to = "value") %>%
        mutate(
          # Take every digit that follows the v/V marker so that v10, v11, ...
          # stay distinct from v1; the year in "pop2015" is not preceded by a
          # v/V and is therefore never matched.
          ville = str_extract(key, "(?<=[Vv])[[:digit:]]+"),
          key = case_when(
            grepl("cityCode", key) ~ "cityCode",
            grepl("pop", key) ~ "pop",
            TRUE ~ "other"
          )
        ) %>%
        pivot_wider(names_from = key, values_from = value)
      

      然后你可以应用你想要的测试

      # One row per pair: the code of the most populous city and the combined
      # population of all cities in the pair.
      tidy_pairs %>%
        group_by(city) %>%
        summarise(
          # `pop == max(pop)` matches several rows when populations tie, which
          # would hand summarise() more than one value. last() keeps a single
          # code and, like the strict `>` comparison used on the wide data,
          # favours the later city (V2) on a tie.
          to_keep = last(cityCode[pop == max(pop)]),
          to_keep_pop = sum(pop)
        )
      

      【讨论】:

        猜你喜欢
        • 1970-01-01
        • 1970-01-01
        • 2017-01-19
        • 2015-07-04
        • 2015-02-03
        • 1970-01-01
        • 1970-01-01
        • 1970-01-01
        • 1970-01-01
        相关资源
        最近更新 更多