这是一个相当复杂的操作,所以我会尝试解释,但不容易消化:
from lxml import etree
import elementpath  # the core piece here is the XPath "intersect" operator; it is XPath 2.0, and lxml only supports XPath 1.0, so we need this library, which supports XPath 2.0
import pandas as pd
# Build one dataframe row per actor from a flat list of <meta> elements.
#
# Each actor's group starts at a <meta name="actor"> element; the <meta>
# elements between it and the next actor hold that actor's other values.
# NOTE(review): the XML itself is not shown here, so the XPath expressions
# below are kept exactly as written -- verify them against the real document.

movie = """[your xml above]"""  # placeholder: paste the XML document here
root = etree.XML(movie)

columns = ['Name', 'Sex', 'Age', 'Awards']  # dataframe column labels
rows = []  # one list of values per actor

# Predicate marking the <meta> that opens an actor's group; named once so the
# longer XPath expressions below stay readable.
anchor = '@name="actor"'
actor_count = elementpath.select(root, f'count(//meta[{anchor}])')  # number of actors
meta_count = elementpath.select(root, 'count(//meta)')  # number of <meta> elements

# XPath positions are 1-based, so iterate 1..actor_count rather than 0-based.
for position in range(1, actor_count + 1):
    # Looking down: this actor's <meta> plus every <meta> sibling after it.
    top_down = f'//meta[{anchor}][{position}]/(self::meta,following-sibling::meta)'
    # Looking up: take the boundary for this group -- either an actor <meta>
    # that directly follows a non-actor <meta>, or the <meta> that has
    # meta_count preceding siblings -- then select that boundary itself when
    # it is not an actor, plus every <meta> sibling before it.
    bottom_up = (
        f'(//meta[{anchor}][preceding-sibling::meta[1][not({anchor})]]'
        f',//meta[count(./preceding-sibling::*) = {meta_count}])[{position}]'
        f'/(self::meta[not({anchor})],'
        'preceding-sibling::meta)'
    )
    # "intersect" keeps only the nodes present in both selections, i.e. the
    # <meta> elements between the two boundaries.
    src_exp = f'{top_down} intersect {bottom_up}'
    entries = elementpath.select(root, src_exp)

    # Values are taken in document order and padding is appended at the end,
    # so a value missing from the middle of a group shifts later values left.
    row = [entry.attrib['content'] for entry in entries]
    # Pad short rows to the column count; a non-positive multiplier adds nothing.
    row += ['NA'] * (len(columns) - len(row))
    rows.append(row)

# Bind and print the frame: a bare expression shows nothing when run as a script.
df = pd.DataFrame(rows, columns=columns)
print(df)
输出:
Name Sex Age Awards
0 Joseph Male 32 Yrs NA
1 Alex Male NA NA
2 John Male 32 Yrs 3 awards