【问题标题】:Parse multiple element XML values into a R dataframe将多个元素 XML 值解析为 R 数据框
【发布时间】:2022-01-23 19:38:22
【问题描述】:

我有一个类似的 XML:

    <Trait ID="4711" Type="Disease">
    <!--  each phenotype -->
    <Name>
    <ElementValue Type="Preferred">Breast-ovarian cancer, familial 1</ElementValue>
    <XRef ID="Breast-ovarian+cancer%2C+familial+1/7865" DB="Genetic Alliance"/>
    </Name>
    <Name>
    <ElementValue Type="Alternate">BREAST-OVARIAN CANCER, FAMILIAL, SUSCEPTIBILITY TO, 1</ElementValue>
    <XRef Type="MIM" ID="604370" DB="OMIM"/>
    <XRef Type="Allelic variant" ID="113705.0001" DB="OMIM"/>
    <XRef Type="Allelic variant" ID="113705.0002" DB="OMIM"/>
    <XRef Type="Allelic variant" ID="113705.0003" DB="OMIM"/>
    <XRef Type="Allelic variant" ID="113705.0004" DB="OMIM"/>
    <XRef Type="Allelic variant" ID="113705.0005" DB="OMIM"/>
    <XRef Type="Allelic variant" ID="113705.0006" DB="OMIM"/>
    <XRef Type="Allelic variant" ID="113705.0007" DB="OMIM"/>
    <XRef Type="Allelic variant" ID="113705.0008" DB="OMIM"/>
    <XRef Type="Allelic variant" ID="113705.0009" DB="OMIM"/>
    <XRef Type="Allelic variant" ID="113705.0010" DB="OMIM"/>
    <XRef Type="Allelic variant" ID="113705.0011" DB="OMIM"/>
    <XRef Type="Allelic variant" ID="113705.0012" DB="OMIM"/>
    <XRef Type="Allelic variant" ID="113705.0013" DB="OMIM"/>
    <XRef Type="Allelic variant" ID="113705.0014" DB="OMIM"/>
    <XRef Type="Allelic variant" ID="113705.0015" DB="OMIM"/>
    <XRef Type="Allelic variant" ID="113705.0016" DB="OMIM"/>
    <XRef Type="Allelic variant" ID="113705.0017" DB="OMIM"/>
    <XRef Type="Allelic variant" ID="113705.0018" DB="OMIM"/>
    <XRef Type="Allelic variant" ID="113705.0019" DB="OMIM"/>
    <XRef Type="Allelic variant" ID="113705.0020" DB="OMIM"/>
    <XRef Type="Allelic variant" ID="113705.0021" DB="OMIM"/>
    <XRef Type="Allelic variant" ID="113705.0022" DB="OMIM"/>
    <XRef Type="Allelic variant" ID="113705.0023" DB="OMIM"/>
    <XRef Type="Allelic variant" ID="113705.0024" DB="OMIM"/>
    <XRef Type="Allelic variant" ID="113705.0025" DB="OMIM"/>
    <XRef Type="Allelic variant" ID="113705.0026" DB="OMIM"/>
    <XRef Type="Allelic variant" ID="113705.0027" DB="OMIM"/>
    <XRef Type="Allelic variant" ID="113705.0028" DB="OMIM"/>
    <XRef Type="Allelic variant" ID="113705.0029" DB="OMIM"/>
    <XRef Type="Allelic variant" ID="113705.0030" DB="OMIM"/>
    <XRef Type="Allelic variant" ID="113705.0031" DB="OMIM"/>
    <XRef Type="Allelic variant" ID="113705.0032" DB="OMIM"/>
    <XRef Type="Allelic variant" ID="113705.0033" DB="OMIM"/>
    <XRef Type="Allelic variant" ID="113705.0034" DB="OMIM"/>
    <XRef Type="Allelic variant" ID="113705.0035" DB="OMIM"/>
    <XRef Type="Allelic variant" ID="113705.0036" DB="OMIM"/>
    <XRef Type="Allelic variant" ID="113705.0037" DB="OMIM"/>
    </Name>
    <Name>
    <ElementValue Type="Alternate">OVARIAN CANCER, SUSCEPTIBILITY TO</ElementValue>
    <XRef Type="Allelic variant" ID="602667.0001" DB="OMIM"/>
    </Name>
    <Name>
    <ElementValue Type="Alternate">BREAST CANCER, FAMILIAL, SUSCEPTIBILITY TO, 1</ElementValue>
    <XRef ID="604370" DB="OMIM"/>
    </Name>
    <Name>
    <ElementValue Type="Alternate">Breast cancer, familial 1</ElementValue>
    </Name>
    <Name>
    <ElementValue Type="Alternate">Breast-ovarian cancer, familial 1 and 2</ElementValue>
    <XRef ID="GTR000310494" DB="Laboratory of Genetics,HUSLAB"/>
    </Name>
    <Name>
    <ElementValue Type="Alternate">BRCA1 Gene Mutation</ElementValue>
    <XRef ID="GTR000501743" DB="Myriad Genetic Laboratories,Myriad Genetic Laboratories, Inc."/>
    </Name>
</Trait>

我想将 XML 解析为如下数据框:

enter image description here

我想使用 r 包 XML,但问题是我的 XREF 属性值比特征值的名称多。 我可以用 for 循环解决这个问题,但这通常不是“R 方式”。我想知道是否有更简单的解决方案? (例如,使用 xpath 查询)。

我正在尝试这样的事情:

x <- do.call(rbind, xpathApply(xml_1, "//TraitSet/Trait[@ID='4711']/Name", function(node) {
  
  trait <- xmlValue(node[["ElementValue"]])
  
  xp <- "//TraitSet/Trait[@ID='4711']/Name/XRef"
  DB <- sapply(c("ID","DB"), function(x) xpathSApply(xmltop, '//TraitSet/Trait/Name/XRef', xmlGetAttr, x))
  if (is.null(DB)) DB <- NA
  
  data.frame(trait, DB, stringsAsFactors = FALSE)
  
}))

但是记录被错误地相乘。

我将不胜感激!谢谢

【问题讨论】:

    标签: r xml xpath


    【解决方案1】:

    我认为这是一个使用 xml2 的解决方案,以及一种快速而冗长的整洁诗歌方法。

    在导入嵌套的 xml 数据时,编写紧凑的代码并不总是那么容易。

    
    library(xml2)
    library(dplyr)
    library(tidyr)
    library(purrr)
    
    # read the xml file
    te <- xml2::read_xml("~/Desktop/test_so.xml")
    
    # to move from node to node
    cursor <- xml2::xml_find_all(te, ".//Name")
    
    # i <- 1L
    seq_along(cursor) %>% 
      # map to move along cursor
      purrr::map_df(function(i){
        print(i)
        x <- cursor[i]
        # first part is Name and Type (Alternate/Disease/Preferred)
        dplyr::tibble(Type = xml2::xml_attr(xml_find_all(x, './/ElementValue'), 'Type'),
             Trait = xml2::xml_text(x)) -> temp
        
        # second part is not always here so test if it exists before, then
        # if it exists, extract and compact it (nest)
        if (!is.na(xml2::xml_text(xml2::xml_find_first(x, './/XRef')))){
          Details <- dplyr::tibble(
             DB = xml2::xml_find_all(x, './/XRef') %>% 
               xml2::xml_attr('DB'),
             ID = xml2::xml_find_all(x, './/XRef') %>% 
               xml2::xml_attr('ID'),
             Type_ = xml2::xml_find_all(x, './/XRef') %>% 
               xml2::xml_attr('Type')) %>% tidyr::nest(data = c('DB', 'ID', 'Type_'))
          } else {
          # if it doesn't exist, fill with an empty df and compact it too (nest)
          Details <- dplyr::tibble(DB = NA, ID = NA, Type_ = NA) %>% 
            tidyr::nest(data = c('DB', 'ID', 'Type_'))
          }
        # add this new "df" column to temporary object
        temp <- temp %>% dplyr::mutate(Details = Details$data)
        # return
        temp
      }) -> te2
    
    result <- te2 %>% 
      unnest(Details)
    
    # A tibble: 44 × 5
       Type      Trait                                                 DB               ID                                       Type_          
       <chr>     <chr>                                                 <chr>            <chr>                                    <chr>          
     1 Preferred Breast-ovarian cancer, familial 1                     Genetic Alliance Breast-ovarian+cancer%2C+familial+1/7865 NA             
     2 Alternate BREAST-OVARIAN CANCER, FAMILIAL, SUSCEPTIBILITY TO, 1 OMIM             604370                                   MIM            
     3 Alternate BREAST-OVARIAN CANCER, FAMILIAL, SUSCEPTIBILITY TO, 1 OMIM             113705.0001                              Allelic variant
     4 Alternate BREAST-OVARIAN CANCER, FAMILIAL, SUSCEPTIBILITY TO, 1 OMIM             113705.0002                              Allelic variant
     5 Alternate BREAST-OVARIAN CANCER, FAMILIAL, SUSCEPTIBILITY TO, 1 OMIM             113705.0003                              Allelic variant
     6 Alternate BREAST-OVARIAN CANCER, FAMILIAL, SUSCEPTIBILITY TO, 1 OMIM             113705.0004                              Allelic variant
     7 Alternate BREAST-OVARIAN CANCER, FAMILIAL, SUSCEPTIBILITY TO, 1 OMIM             113705.0005                              Allelic variant
     8 Alternate BREAST-OVARIAN CANCER, FAMILIAL, SUSCEPTIBILITY TO, 1 OMIM             113705.0006                              Allelic variant
     9 Alternate BREAST-OVARIAN CANCER, FAMILIAL, SUSCEPTIBILITY TO, 1 OMIM             113705.0007                              Allelic variant
    10 Alternate BREAST-OVARIAN CANCER, FAMILIAL, SUSCEPTIBILITY TO, 1 OMIM             113705.0008                              Allelic variant
    # … with 34 more rows
    >
    

    【讨论】:

    • 这是一个出色的分辨率@Guillaume!这是非常可读的。谢谢!
    • 感谢您的反馈。当我必须在工作中做同样的事情时,这段代码和你的案例也会对我有所帮助。
    猜你喜欢
    • 1970-01-01
    • 1970-01-01
    • 2014-05-05
    • 2013-06-16
    • 1970-01-01
    • 1970-01-01
    • 1970-01-01
    • 1970-01-01
    • 1970-01-01
    相关资源
    最近更新 更多