【问题标题】:Split vector of string elements into list(将字符串元素的向量拆分为列表)
【发布时间】:2014-09-27 23:58:53
【问题描述】:

我有一个向量:

# Flat character vector: each country name is followed by that country's
# comma-separated records ("MM/YYYY" and then percentage fields).
my_vec <- c(
  "Iceland",
  "06/2010,60% ,38% ,1% ,1% ,0% ",
  "11/2010,63% ,36% ,1% ,0% ,0% ",
  "05/2011,59% ,38% ,2% ,1% ,0% ",
  "11/2011,56% ,40% ,3% ,0% ,1% ",
  "05/2012,60% ,36% ,2% ,2% ,0% ",
  "11/2012,59% ,40% ,1% ,0% ,0% ",
  "05/2013,60% ,38% ,1% ,0% ,1% ",
  "11/2013,55% ,43% ,2% ,0% ,0% ",
  "06/2014,59% ,39% ,2% ,0% ,0% ",
  "Montenegro",
  "05/2011,11% ,41% ,36% ,11% ,1% ",
  "11/2011,12% ,43% ,32% ,12% ,1% ",
  "05/2012,8% ,35% ,38% ,14% ,5% ",
  "11/2012,9% ,35% ,34% ,18% ,4% ",
  "05/2013,10% ,39% ,32% ,16% ,3% ",
  "11/2013,10% ,34% ,36% ,19% ,1% ",
  "06/2014,15% ,47% ,27% ,11% ,0% ",
  "Republic of Serbia ",
  "05/2012,3% ,31% ,43% ,20% ,3% ",
  "11/2012,5% ,28% ,43% ,21% ,3% ",
  "05/2013,6% ,29% ,44% ,18% ,3% ",
  "11/2013,7% ,34% ,39% ,18% ,2% ",
  "06/2014,11% ,40% ,33% ,16% "
)

该向量包含国家名称,以及跟在每个国家名称之后的若干条以逗号分隔的数据。我想把这个向量按国家名称拆分成一个列表。

我试过了:

library(stringr)

# NOTE: which() returns only the positions of the elements that contain a
# lowercase vowel -- here the three country names (1, 11 and 19). split()
# recycles that length-3 vector as the grouping vector, so the elements are
# dealt out cyclically into three groups instead of being grouped by country.
split(my_vec, which(str_detect(my_vec, "[aeiou]")))

但输出不正确:

$`1`
[1] "Iceland"                         "05/2011,59% ,38% ,2% ,1% ,0% "  
[3] "11/2012,59% ,40% ,1% ,0% ,0% "   "06/2014,59% ,39% ,2% ,0% ,0% "  
[5] "11/2011,12% ,43% ,32% ,12% ,1% " "05/2013,10% ,39% ,32% ,16% ,3% "
[7] "Republic of Serbia "             "05/2013,6% ,29% ,44% ,18% ,3% " 

$`11`
[1] "06/2010,60% ,38% ,1% ,1% ,0% "   "11/2011,56% ,40% ,3% ,0% ,1% "  
[3] "05/2013,60% ,38% ,1% ,0% ,1% "   "Montenegro"                     
[5] "05/2012,8% ,35% ,38% ,14% ,5% "  "11/2013,10% ,34% ,36% ,19% ,1% "
[7] "05/2012,3% ,31% ,43% ,20% ,3% "  "11/2013,7% ,34% ,39% ,18% ,2% " 

$`19`
[1] "11/2010,63% ,36% ,1% ,0% ,0% "   "05/2012,60% ,36% ,2% ,2% ,0% "  
[3] "11/2013,55% ,43% ,2% ,0% ,0% "   "05/2011,11% ,41% ,36% ,11% ,1% "
[5] "11/2012,9% ,35% ,34% ,18% ,4% "  "06/2014,15% ,47% ,27% ,11% ,0% "
[7] "11/2012,5% ,28% ,43% ,21% ,3% "  "06/2014,11% ,40% ,33% ,16% "  

每个列表元素都应该以国家名称命名。

【问题讨论】:

    标签: r list vector split


    【解决方案1】:

    更新:

    # Flag the elements that start with a letter, i.e. the country names.
    # grepl() returns one TRUE/FALSE per element, and logical indexing stays
    # correct even when nothing matches (x[-which(...)] would drop everything).
    is_name <- grepl("^[[:alpha:]]", my_vec)
    # cumsum() of the flags gives every element the running count of names seen
    # so far, i.e. an id for the country it belongs to. Split the data elements
    # on that id, then label the groups with the country names themselves.
    s <- setNames(split(my_vec[!is_name], cumsum(is_name)[!is_name]),
                  my_vec[is_name])
    

    结果:

    > s
    $Iceland
    [1] "06/2010,60% ,38% ,1% ,1% ,0% " "11/2010,63% ,36% ,1% ,0% ,0% "
    [3] "05/2011,59% ,38% ,2% ,1% ,0% " "11/2011,56% ,40% ,3% ,0% ,1% "
    [5] "05/2012,60% ,36% ,2% ,2% ,0% " "11/2012,59% ,40% ,1% ,0% ,0% "
    [7] "05/2013,60% ,38% ,1% ,0% ,1% " "11/2013,55% ,43% ,2% ,0% ,0% "
    [9] "06/2014,59% ,39% ,2% ,0% ,0% "
    
    $Montenegro
    [1] "05/2011,11% ,41% ,36% ,11% ,1% " "11/2011,12% ,43% ,32% ,12% ,1% "
    [3] "05/2012,8% ,35% ,38% ,14% ,5% "  "11/2012,9% ,35% ,34% ,18% ,4% " 
    [5] "05/2013,10% ,39% ,32% ,16% ,3% " "11/2013,10% ,34% ,36% ,19% ,1% "
    [7] "06/2014,15% ,47% ,27% ,11% ,0% "
    
    $`Republic of Serbia `
    [1] "05/2012,3% ,31% ,43% ,20% ,3% " "11/2012,5% ,28% ,43% ,21% ,3% "
    [3] "05/2013,6% ,29% ,44% ,18% ,3% " "11/2013,7% ,34% ,39% ,18% ,2% "
    [5] "06/2014,11% ,40% ,33% ,16% "
    

    您还可以轻松地将其转换为由 data.frame 组成的列表(参考 @AnandaMahto 的回答)。注意:`read.csv` 默认 `header = TRUE`,因此在下面的输出中,每组的第一条记录被当成了列名;若要保留全部记录,请加上 `header = FALSE`:

    > lapply(s, function(x) read.csv(text=x))
    $Iceland
      X06.2010 X60. X38. X1. X1..1 X0.
    1  11/2010 63%  36%  1%    0%  0% 
    2  05/2011 59%  38%  2%    1%  0% 
    3  11/2011 56%  40%  3%    0%  1% 
    4  05/2012 60%  36%  2%    2%  0% 
    5  11/2012 59%  40%  1%    0%  0% 
    6  05/2013 60%  38%  1%    0%  1% 
    7  11/2013 55%  43%  2%    0%  0% 
    8  06/2014 59%  39%  2%    0%  0% 
    
    $Montenegro
      X05.2011 X11. X41. X36. X11..1 X1.
    1  11/2011 12%  43%  32%    12%  1% 
    2  05/2012  8%  35%  38%    14%  5% 
    3  11/2012  9%  35%  34%    18%  4% 
    4  05/2013 10%  39%  32%    16%  3% 
    5  11/2013 10%  34%  36%    19%  1% 
    6  06/2014 15%  47%  27%    11%  0% 
    
    $`Republic of Serbia `
      X05.2012  X3. X31. X43. X20. X3..1
    1  11/2012  5%  28%  43%  21%    3% 
    2  05/2013  6%  29%  44%  18%    3% 
    3  11/2013  7%  34%  39%  18%    2% 
    4  06/2014 11%  40%  33%  16%       
    

    【讨论】:

    • 它接近我所追求的......但是......最后我需要列表名称是国家名称......并且在分割向量中没有国家名称。
    【解决方案2】:

    这并不完全是您所要求的,但可能更符合您接下来要做的事情。内容太长,放不进评论,所以我把它作为答案发布。

    我编写了一个名为 `read.mtable` 的函数,它是对 for 循环的封装,可以把(像您这里这样的)数据读入一个由 data.frame 组成的列表。它是我在 GitHub 上的 "SOfun" 包的一部分,您可以用以下方式安装它:

    library(devtools)
    # install_github() takes the repository as a single "user/repo" string;
    # the old two-argument (repo, username) form is no longer supported.
    install_github("mrdwab/SOfun") ## for `read.mtable`
    # Installing does not attach the package, so load it before calling
    # read.mtable().
    library(SOfun)
    

    对于你的样本向量,我会这样使用它:

    # Read my_vec through a text connection and split it into one data.frame
    # per country.
    # - chunkId: regex matching elements that start with a letter; judging by
    #   the output below, these are the lines that open a chunk and name it
    #   (read.mtable is external -- confirm against the SOfun documentation).
    # - header, fill, sep, strip.white: named like read.table() arguments and
    #   presumably forwarded to it; in the output, fill = TRUE shows up as the
    #   blank V6 in the last row and strip.white = TRUE as the trimmed "60%".
    read.mtable(textConnection(my_vec), 
                chunkId = "^[[:alpha:]]", 
                header = FALSE, fill = TRUE, 
                sep = ",", strip.white = TRUE)
    # $Iceland
    #        V1  V2  V3 V4 V5 V6
    # 1 06/2010 60% 38% 1% 1% 0%
    # 2 11/2010 63% 36% 1% 0% 0%
    # 3 05/2011 59% 38% 2% 1% 0%
    # 4 11/2011 56% 40% 3% 0% 1%
    # 5 05/2012 60% 36% 2% 2% 0%
    # 6 11/2012 59% 40% 1% 0% 0%
    # 7 05/2013 60% 38% 1% 0% 1%
    # 8 11/2013 55% 43% 2% 0% 0%
    # 9 06/2014 59% 39% 2% 0% 0%
    # 
    # $Montenegro
    #        V1  V2  V3  V4  V5 V6
    # 1 05/2011 11% 41% 36% 11% 1%
    # 2 11/2011 12% 43% 32% 12% 1%
    # 3 05/2012  8% 35% 38% 14% 5%
    # 4 11/2012  9% 35% 34% 18% 4%
    # 5 05/2013 10% 39% 32% 16% 3%
    # 6 11/2013 10% 34% 36% 19% 1%
    # 7 06/2014 15% 47% 27% 11% 0%
    # 
    # $`Republic of Serbia `
    #        V1  V2  V3  V4  V5 V6
    # 1 05/2012  3% 31% 43% 20% 3%
    # 2 11/2012  5% 28% 43% 21% 3%
    # 3 05/2013  6% 29% 44% 18% 3%
    # 4 11/2013  7% 34% 39% 18% 2%
    # 5 06/2014 11% 40% 33% 16%   
    

    【讨论】:

    • 这超出了我的要求。谢谢。
    【解决方案3】:

    你可以试试:

     # Positions of the country-name elements: those that start with a letter
     # or a space.
     indx <- grep(pattern = "^[A-Za-z ]", x = my_vec)

     # Number of data elements that follow each name: the distance to the next
     # name (or to one past the end of the vector), not counting the name itself.
     next_start <- c(indx[-1], length(my_vec) + 1)
     indx2 <- next_start - indx - 1
    

    去掉国家名称元素后,用按记录条数重复的国家名称作为分组来拆分向量:

    # split() turns its grouping vector into a factor, whose levels are sorted
    # alphabetically by default. Fixing the levels to the order in which the
    # names appear keeps the result in input order for any set of names, not
    # only for ones that happen to be alphabetical already.
    split(my_vec[-indx],
          factor(rep(my_vec[indx], indx2), levels = unique(my_vec[indx])))
    #$Iceland
    #[1] "06/2010,60% ,38% ,1% ,1% ,0% " "11/2010,63% ,36% ,1% ,0% ,0% "
    #[3] "05/2011,59% ,38% ,2% ,1% ,0% " "11/2011,56% ,40% ,3% ,0% ,1% "
    #[5] "05/2012,60% ,36% ,2% ,2% ,0% " "11/2012,59% ,40% ,1% ,0% ,0% "
    #[7] "05/2013,60% ,38% ,1% ,0% ,1% " "11/2013,55% ,43% ,2% ,0% ,0% "
    #[9] "06/2014,59% ,39% ,2% ,0% ,0% "
    
    #$Montenegro
    #[1] "05/2011,11% ,41% ,36% ,11% ,1% " "11/2011,12% ,43% ,32% ,12% ,1% "
    #[3] "05/2012,8% ,35% ,38% ,14% ,5% "  "11/2012,9% ,35% ,34% ,18% ,4% " 
    #[5] "05/2013,10% ,39% ,32% ,16% ,3% " "11/2013,10% ,34% ,36% ,19% ,1% "
    #[7] "06/2014,15% ,47% ,27% ,11% ,0% "
    
    #$`Republic of Serbia `
    #[1] "05/2012,3% ,31% ,43% ,20% ,3% " "11/2012,5% ,28% ,43% ,21% ,3% "
    #[3] "05/2013,6% ,29% ,44% ,18% ,3% " "11/2013,7% ,34% ,39% ,18% ,2% "
    #[5] "06/2014,11% ,40% ,33% ,16% "   
    

    将其转换为 data.frames 列表

    # Build one data.frame per country. The explicit factor levels keep the
    # countries in input order (split() alone would sort them alphabetically).
    # TRUE/FALSE are spelled out because T and F are ordinary variables that
    # can be reassigned.
    lapply(split(my_vec[-indx],
                 factor(rep(my_vec[indx], indx2), levels = unique(my_vec[indx]))),
           function(x) {
             # fill = TRUE pads records that have fewer fields than the others.
             read.table(text = x, sep = ",", header = FALSE,
                        stringsAsFactors = FALSE, fill = TRUE)
           })
    

    【讨论】:

      猜你喜欢
      • 1970-01-01
      • 2020-11-08
      • 1970-01-01
      • 1970-01-01
      • 1970-01-01
      • 2014-01-16
      • 2018-05-16
      • 1970-01-01
      • 2019-12-24
      相关资源
      最近更新 更多