【问题标题】:r xml to dataframe from attributes(R:由 XML 属性生成数据框)
【发布时间】:2016-02-19 05:11:00
【问题描述】:

我有一个以下格式的 xml 数据集。

<?xml version="1.0" encoding="utf-8"?>
<!DOCTYPE population SYSTEM "http://www.matsim.org/files/dtd/population_v5.dtd">

<population>

<!-- ====================================================================== -->

	<person id="10000061">
		<plan score="219.62581874242716" selected="yes">
			<act type="home" link="21258" x="334867.243653" y="3126570.70778" start_time="03:00:00" end_time="15:07:00" />
			<leg mode="ride" dep_time="15:07:00" trav_time="00:03:27" arr_time="15:10:27">
				<route type="links">21258 14045 13977 13939 13925 13919 13905 13904</route>
			</leg>
			<act type="shop" link="13904" x="332634.86999" y="3127078.96383" start_time="15:12:00" end_time="16:21:00" />
			<leg mode="car" dep_time="16:21:00" trav_time="00:09:44" arr_time="16:30:44">
				<route type="links">13904 21207 21208 13980 21187 21188 14148 14144 14130 14129</route>
			</leg>
			<act type="shop" link="14129" x="331666.364904" y="3129306.48785" start_time="16:25:00" end_time="17:37:00" />
			<leg mode="ride" dep_time="17:37:00" trav_time="00:09:46" arr_time="17:46:46">
				<route type="links">14129 14143 14147 14161 14171 14189 14195 14120 14106 14051 13941 13938 13976 14044 21259 21258</route>
			</leg>
			<act type="home" link="21258" x="334867.243653" y="3126570.70778" start_time="17:45:00" end_time="26:59:00" />
		</plan>

		<plan score="218.9756035020247" selected="no">
			<act type="home" link="21258" x="334867.243653" y="3126570.70778" start_time="03:00:00" end_time="15:07:00" />
			<leg mode="ride" dep_time="15:07:00" trav_time="00:03:26" arr_time="15:10:26">
				<route type="links">21258 14045 13977 13939 13925 13919 13905 13904</route>
			</leg>
			<act type="shop" link="13904" x="332634.86999" y="3127078.96383" start_time="15:12:00" end_time="16:21:00" />
			<leg mode="car" dep_time="16:21:00" trav_time="00:08:46" arr_time="16:29:46">
				<route type="links">13904 13905 13891 13855 21239 21240 13887 13885 13869 13870 13920 13974 14070 14075 14103 14109 14123 14129</route>
			</leg>
			<act type="shop" link="14129" x="331666.364904" y="3129306.48785" start_time="16:25:00" end_time="17:37:00" />
			<leg mode="ride" dep_time="17:37:00" trav_time="00:11:06" arr_time="17:48:06">
				<route type="links">14129 14143 14147 14161 14150 14098 14094 14095 14113 14106 14051 13941 13938 13976 14044 21259 21258</route>
			</leg>
			<act type="home" link="21258" x="334867.243653" y="3126570.70778" start_time="17:45:00" end_time="26:59:00" />
		</plan>

		<plan score="218.5148700010285" selected="no">
			<act type="home" link="21258" x="334867.243653" y="3126570.70778" start_time="03:00:00" end_time="15:07:00" />
			<leg mode="ride" dep_time="15:07:00" trav_time="00:03:26" arr_time="15:10:26">
				<route type="links">21258 14045 13977 13939 13925 13919 13905 13904</route>
			</leg>
			<act type="shop" link="13904" x="332634.86999" y="3127078.96383" start_time="15:12:00" end_time="16:21:00" />
			<leg mode="car" dep_time="16:21:00" trav_time="00:08:15" arr_time="16:29:15">
				<route type="links">13904 13905 13906 13980 21187 21188 14148 14144 14130 14129</route>
			</leg>
			<act type="shop" link="14129" x="331666.364904" y="3129306.48785" start_time="16:25:00" end_time="17:37:00" />
			<leg mode="ride" dep_time="17:37:00" trav_time="00:11:18" arr_time="17:48:18">
				<route type="links">14129 14130 14124 14110 14104 14077 14071 13975 13921 13871 13868 13884 13886 13888 13894 13904 13918 13924 13938 13976 14044 21259 21258</route>
			</leg>
			<act type="home" link="21258" x="334867.243653" y="3126570.70778" start_time="17:45:00" end_time="26:59:00" />
		</plan>

	</person>

<!-- ====================================================================== -->

	<person id="10000302">
		<plan score="209.66504470021556" selected="yes">
			<act type="home" link="21256" x="334598.361546" y="3126269.05167" start_time="03:00:00" end_time="07:56:00" />
			<leg mode="car" dep_time="07:56:00" trav_time="00:03:00" arr_time="07:59:00">
				<route type="links">21256 13966 14056 14057</route>
			</leg>
			<act type="work" link="14057" x="335957.065395" y="3128105.16619" start_time="08:04:00" end_time="10:28:00" />
			<leg mode="car" dep_time="10:28:00" trav_time="00:08:20" arr_time="10:36:20">
				<route type="links">14057 14049 14045 13977 13939 13925 13919 21207 21208 13980 14046 14095 21191</route>
			</leg>
			<act type="social" link="21191" x="333032.807855" y="3128759.66141" start_time="10:33:00" end_time="11:52:00" />
			<leg mode="car" dep_time="11:52:00" trav_time="00:08:33" arr_time="12:00:33">
				<route type="links">21191 21194 14189 14195 14197 14210 14212 14234 14246 14215 14192 14178 14057 13967 21257 21256</route>
			</leg>
			<act type="home" link="21256" x="334598.361546" y="3126269.05167" start_time="11:59:00" end_time="12:11:00" />
			<leg mode="car" dep_time="12:11:00" trav_time="00:06:35" arr_time="12:17:35">
				<route type="links">21256 21257 21258 14045 13977 13939 13925 13919 13905 13906</route>
			</leg>
			<act type="social" link="13906" x="332302.159169" y="3127536.46778" start_time="12:17:00" end_time="13:30:00" />
			<leg mode="car" dep_time="13:30:00" trav_time="00:05:30" arr_time="13:35:30">
				<route type="links">13906 13907 13904 13918 13924 13938 13976 14044 21259 21256</route>
			</leg>
			<act type="home" link="21256" x="334598.361546" y="3126269.05167" start_time="13:36:00" end_time="26:59:00" />
		</plan>

		<plan score="205.5456839457717" selected="no">
			<act type="home" link="21256" x="334598.361546" y="3126269.05167" start_time="03:00:00" end_time="07:56:00" />
			<leg mode="car" dep_time="07:56:00" trav_time="00:02:15" arr_time="07:58:15">
				<route type="links">21256 13966 14056 14057</route>
			</leg>
			<act type="work" link="14057" x="335957.065395" y="3128105.16619" start_time="08:04:00" end_time="10:28:00" />
			<leg mode="car" dep_time="10:28:00" trav_time="00:06:51" arr_time="10:34:51">
				<route type="links">14057 14056 14177 14191 14214 14247 14235 14213 14211 14198 14120 14114 21191</route>
			</leg>
			<act type="social" link="21191" x="333032.807855" y="3128759.66141" start_time="10:33:00" end_time="11:52:00" />
			<leg mode="car" dep_time="11:52:00" trav_time="00:07:45" arr_time="11:59:45">
				<route type="links">21191 21194 14189 14195 14197 14210 14212 14234 14246 14215 14192 14178 14057 13967 21257 21256</route>
			</leg>
			<act type="home" link="21256" x="334598.361546" y="3126269.05167" start_time="11:59:00" end_time="12:11:00" />
			<leg mode="car" dep_time="12:11:00" trav_time="00:07:51" arr_time="12:18:51">
				<route type="links">21256 13915 13823 13767 13743 13731 13732 13837 13831 13819 13820 13854 13890 13906</route>
			</leg>
			<act type="social" link="13906" x="332302.159169" y="3127536.46778" start_time="12:17:00" end_time="13:30:00" />
			<leg mode="car" dep_time="13:30:00" trav_time="00:08:54" arr_time="13:38:54">
				<route type="links">13906 13907 13904 13918 13924 13938 13976 14044 21259 21256</route>
			</leg>
			<act type="home" link="21256" x="334598.361546" y="3126269.05167" start_time="13:36:00" end_time="26:59:00" />
		</plan>

		<plan score="203.4205865037132" selected="no">
			<act type="home" link="21256" x="334598.361546" y="3126269.05167" start_time="03:00:00" end_time="07:56:00" />
			<leg mode="car" dep_time="07:56:00" trav_time="00:03:15" arr_time="07:59:15">
				<route type="links">21256 13966 14056 14057</route>
			</leg>
			<act type="work" link="14057" x="335957.065395" y="3128105.16619" start_time="08:04:00" end_time="10:28:00" />
			<leg mode="car" dep_time="10:28:00" trav_time="00:06:41" arr_time="10:34:41">
				<route type="links">14057 14049 14045 13977 13939 13940 14050 14105 14114 21191</route>
			</leg>
			<act type="social" link="21191" x="333032.807855" y="3128759.66141" start_time="10:33:00" end_time="11:52:00" />
			<leg mode="car" dep_time="11:52:00" trav_time="00:09:12" arr_time="12:01:12">
				<route type="links">21191 21194 14189 14195 14197 14210 14212 14234 14246 14215 14192 14178 14057 13967 21257 21256</route>
			</leg>
			<act type="home" link="21256" x="334598.361546" y="3126269.05167" start_time="11:59:00" end_time="12:11:00" />
			<leg mode="car" dep_time="12:11:00" trav_time="00:05:10" arr_time="12:16:10">
				<route type="links">21256 13966 14049 14045 13977 13939 13925 13919 13905 13906</route>
			</leg>
			<act type="social" link="13906" x="332302.159169" y="3127536.46778" start_time="12:17:00" end_time="13:30:00" />
			<leg mode="car" dep_time="13:30:00" trav_time="00:05:30" arr_time="13:35:30">
				<route type="links">13906 13907 13904 13918 13924 13938 13976 14044 21259 21256</route>
			</leg>
			<act type="home" link="21256" x="334598.361546" y="3126269.05167" start_time="13:36:00" end_time="26:59:00" />
		</plan>

	</person>

<!-- ====================================================================== -->

</population>

从这个 XML 数据中,我尝试生成具有以下结构的数据框。

    person  score       selected    act.typ act.x       act.y   act_start   act_end     leg.mod  leg_dep    leg_trav    leg_arr
10000061    219.6258    yes         home    334867.2    3126571 3:00:00     15:07:00    ride     15:07:00   0:03:27     15:10:27
10000061    219.6258    yes         shop    332634.9    3127079 15:12:00    16:21:00    car      16:21:00   0:09:44     16:30:44
10000061    219.6258    yes         shop    331666.4    3129306 16:25:00    17:37:00    ride     17:37:00   0:09:46     17:46:46
10000061    219.6258    yes         home    334867.2    3126571 17:45:00    26:59:00    NA       NA         NA          NA
10000302    209.665     yes         home    334598.4    3126269 3:00:00     7:56:00     car      7:56:00    0:03:00     7:59:00
10000302    209.665     yes         work    335957.1    3128105 8:04:00     10:28:00    car      10:28:00   0:08:20     10:36:20
10000302    209.665     yes         social  333032.8    3128760 10:33:00    11:52:00    car      11:52:00   0:08:33     12:00:33
10000302    209.665     yes         home    334598.4    3126269 11:59:00    12:11:00    car      12:11:00   0:06:35     12:17:35
10000302    209.665     yes         social  332302.2    3127536 12:17:00    13:30:00    car      13:30:00   0:05:30     13:35:30
10000302    209.665     yes         home    334598.4    3126269 13:36:00    26:59:00    NA       NA         NA          NA

我查看了 XML 库及其各种函数,例如 xpathApply、xmlGetAttr,并且能够在每个节点层级分别取出列表和数据框。但是,如您所见,我需要在保持父子关系的前提下,把这些不同的数据子集合并到一个数据框中。此外,我只想选择属性 selected 值为“yes”的 plan 及其子节点的属性。我也不需要最末级节点(即 route)的值。我在这里只粘贴了两个人的信息,实际数据包含 300 万人的信息。

【问题讨论】:

    标签: xml r


    【解决方案1】:

    每当处理复杂的 XML 文件时,请考虑使用 XSLT 解决方案。 XSLT 是一种声明性的专用语言(与 SQL 相同的类型),旨在转换 XML 文档。因为数据集需要行和列这两个维度,XSLT 可以转换嵌套节点和属性以满足这种结构。

    虽然 R 没有通用 XSLT 库,但 R 可以使用 system() 调用外部程序来处理 XSLT 转换。下面是使用开源 Python 的示例,但实际上任何通用语言(Java、C#、PHP、Perl,甚至 Excel VBA)、专用的可执行处理器(如 Xalan 和 Saxon)或命令行程序(PowerShell、Bash)都可以处理 XSLT。

    XSLT 脚本(另存为 .xsl 或 .xslt)

    <xsl:transform xmlns:xsl="http://www.w3.org/1999/XSL/Transform" version="1.0">
    <!--
      Flattens a population document (person > plan > act/leg) into one
      <person> record per <act>, carrying the owning person's id, the plan's
      score/selected flag, the act's own attributes and the attributes of the
      leg that directly follows the act.  Only plans with selected="yes" are
      emitted; <route> content is dropped.
    -->
    <xsl:output version="1.0" encoding="UTF-8" indent="yes" />
    <xsl:strip-space elements="*"/>
    
      <!-- Wrap all generated records in a single <data> root. -->
      <xsl:template match="/">
        <data>
          <xsl:apply-templates select="*"/>
        </data>
      </xsl:template>
    
      <!-- Containers produce no output themselves; just descend. -->
      <xsl:template match="person|plan">
        <xsl:apply-templates />  
      </xsl:template>
    
      <!-- Skip every plan that is not the selected one.  The predicate gives
           this rule a higher default priority than the generic "plan" rule. -->
      <xsl:template match="plan[not(@selected='yes')]"/>
    
      <!-- One output record per activity. -->
      <xsl:template match="act">
        <person>      
          <id><xsl:value-of select="ancestor::person/@id"/></id>
          <score><xsl:value-of select="ancestor::plan/@score"/></score>
          <selected><xsl:value-of select="ancestor::plan/@selected"/></selected>
          <type><xsl:value-of select="@type"/></type>
          <link><xsl:value-of select="@link"/></link>
          <x><xsl:value-of select="@x"/></x>
          <y><xsl:value-of select="@y"/></y>    
          <start_time><xsl:value-of select="@start_time"/></start_time>
          <end_time><xsl:value-of select="@end_time"/></end_time>
          <!-- Take the leg only when it is the element immediately after this
               act, so an act without its own leg (e.g. the last one in a plan)
               yields empty values instead of borrowing a later leg. -->
          <mode><xsl:value-of select="following-sibling::*[1][self::leg]/@mode"/></mode>
          <dep_time><xsl:value-of select="following-sibling::*[1][self::leg]/@dep_time"/></dep_time>
          <trav_time><xsl:value-of select="following-sibling::*[1][self::leg]/@trav_time"/></trav_time>
          <arr_time><xsl:value-of select="following-sibling::*[1][self::leg]/@arr_time"/></arr_time>
        </person>
      </xsl:template>
    
      <!-- Route link lists are not wanted in the output. -->
      <xsl:template match="route"/>
    </xsl:transform>
    

    Python脚本(使用lxml模块,解析上述脚本)

    import lxml.etree as ET

    # Load the source document and the XSLT stylesheet from the working directory.
    dom = ET.parse('Input.xml')
    xslt = ET.parse('XSLTScript.xsl')

    # Compile the stylesheet and apply it to the source document.
    transform = ET.XSLT(xslt)
    newdom = transform(dom)

    # Serialize the result as pretty-printed UTF-8 bytes with an XML declaration.
    tree_out = ET.tostring(newdom, encoding='UTF-8', pretty_print=True,  xml_declaration=True)

    # Binary mode because tostring() with an explicit encoding returns bytes;
    # the context manager closes the file even if the write fails.
    with open('Output.xml', 'wb') as xmlfile:
        xmlfile.write(tree_out)
    

    XML 转换后的输出

    <?xml version='1.0' encoding='UTF-8'?>
    <data>
      <person>
        <id>10000061</id>
        <score>219.62581874242716</score>
        <selected>yes</selected>
        <type>home</type>
        <link>21258</link>
        <x>334867.243653</x>
        <y>3126570.70778</y>
        <start_time>03:00:00</start_time>
        <end_time>15:07:00</end_time>
        <mode>ride</mode>
        <dep_time>15:07:00</dep_time>
        <trav_time>00:03:27</trav_time>
        <arr_time>15:10:27</arr_time>
      </person>
      <person>
        <id>10000061</id>
        <score>219.62581874242716</score>
        <selected>yes</selected>
        <type>shop</type>
        <link>13904</link>
        <x>332634.86999</x>
        <y>3127078.96383</y>
        <start_time>15:12:00</start_time>
        <end_time>16:21:00</end_time>
        <mode>car</mode>
        <dep_time>16:21:00</dep_time>
        <trav_time>00:09:44</trav_time>
        <arr_time>16:30:44</arr_time>
      </person>
      <person>
        <id>10000061</id>
        <score>219.62581874242716</score>
        <selected>yes</selected>
        <type>shop</type>
        <link>14129</link>
        <x>331666.364904</x>
        <y>3129306.48785</y>
        <start_time>16:25:00</start_time>
        <end_time>17:37:00</end_time>
        <mode>ride</mode>
        <dep_time>17:37:00</dep_time>
        <trav_time>00:09:46</trav_time>
        <arr_time>17:46:46</arr_time>
      </person>
      ...
    

    R脚本(在命令行调用上面的Python脚本)

    library(XML)

    # Run the Python/XSLT step. system() returns the command's exit status, so
    # stop here rather than parse a stale or missing output file.
    status <- system('python "C:/Path/To/Python/Script.py"')              # TRANSFORMS INPUT
    if (status != 0) {
      stop("XSLT transformation failed with exit status ", status, call. = FALSE)
    }

    plans <- xmlParse('C:/Path/To/Transformed/Output.xml')      # PARSES OUTPUT
    df <- xmlToDataFrame(nodes = getNodeSet(plans, "//person")) # MIGRATES OUTPUT TO DF
    

    【讨论】:

      【解决方案2】:

      自从发布这个问题后,我做了一些研究,发现了这篇帖子(post),并据此写出了下面的答案。下面是我的代码。我确信还有效率高得多的实现方式,因此,在有人审阅并认可它或给出更好的方案之前,我不会把它采纳为最终答案。

      library(XML)
      file1 <- "C:/Users/s/Desktop/plans.xml"
      plans <- xmlParse(file1)

      # Attributes collected from each <act> and from the <leg> that follows it.
      act_fields <- c("type", "link", "x", "y", "start_time", "end_time")
      leg_fields <- c("mode", "dep_time", "trav_time", "arr_time")

      # Return the requested attributes of `node` as a character vector of
      # length(fields); attributes that are absent (or a NULL node) become NA.
      read_attrs <- function(node, fields) {
        attrs <- if (is.null(node)) NULL else xmlAttrs(node)
        as.character(attrs)[match(fields, names(attrs))]
      }

      # All activities of the selected plans, found in a single pass over the
      # document instead of one whole-document search per person.
      act_nodes <- getNodeSet(plans, "//person[@id]//plan[@selected='yes']//act")

      rows <- lapply(act_nodes, function(act) {
        person <- getNodeSet(act, "./ancestor::person[1]")[[1]]
        # The leg belonging to this act is the element directly after it; an
        # act with no such leg (e.g. the last act of a plan) gets NA leg values.
        next_leg <- getNodeSet(act, "./following-sibling::*[1][self::leg]")
        leg <- if (length(next_leg) > 0) next_leg[[1]] else NULL
        c(
          read_attrs(act, act_fields),
          xmlGetAttr(person, "id"),
          read_attrs(leg, leg_fields)
        )
      })

      # One row per activity; every column is character, as read from the XML.
      col_names <- c(act_fields, "id", leg_fields)
      data1 <- as.data.frame(
        matrix(
          as.character(unlist(rows)),
          ncol = length(col_names),
          byrow = TRUE,
          dimnames = list(NULL, col_names)
        ),
        stringsAsFactors = FALSE
      )

      【讨论】:

        猜你喜欢
        • 1970-01-01
        • 1970-01-01
        • 1970-01-01
        • 2017-06-22
        • 2016-07-14
        • 1970-01-01
        • 1970-01-01
        • 2015-10-07
        • 1970-01-01
        相关资源
        最近更新 更多