问题:将每一行细分为正确的品牌、颜色和车型。
# Sample contents of "file A": a header line ("Item Name") followed by one
# item title per line. Each title mixes a colour, a brand and a car type, and
# the word order is not uniform ("Honda Accord Navy" ends with the colour).
A = """Item Name
Black Toyota Camry L
Honda Accord Navy
Grey Toyota Corolla
Black Nissan Murano
Silver Toyota Camry LE
"""
# Sample contents of "file B": CSV text whose header row lists the brands and
# whose remaining rows list one car type per brand column. Trailing cells are
# left empty where a brand has fewer types than the others.
B = """Toyota,Nissan,Honda
Camry L,Murano,Accord
Corolla,Rogue,Civic
Avalon,Pathfinder,CR-V
Highlander,Maxima,HR-V
Prius,Altima,
Camry LE,,
"""
- 最简单的方法:按空格直接拆分文件 A 中的每一行。
文件 B 中只用到包含品牌的第一行,用来识别顺序不同的行,例如:
Honda Accord Navy。
import io
def simple_split_title(titles_text=None, brands_text=None):
    """Split item titles into brand, colour and type by word position.

    A title is read as either ``<colour> <brand> <type...>`` or
    ``<brand> <type...> <colour>``; the second layout is recognised by its
    first word being a known brand.

    Args:
        titles_text: Contents of file A (a header line, then one title per
            line). Defaults to the module-level ``A``.
        brands_text: Contents of file B. Only its first line, the
            comma-separated brand names, is used. Defaults to the
            module-level ``B``.

    Returns:
        The list of result dicts (keys ``title``, ``brand``, ``color`` and
        ``type``) in input order. Every dict is also printed, as before.
        Blank lines are skipped; a missing part is reported as ``''``.
    """
    if titles_text is None:
        titles_text = A
    if brands_text is None:
        brands_text = B

    # For a real file use: with open(<name of your file B>) as fh:
    with io.StringIO(brands_text) as fh:
        header = fh.readline()
    # A set gives O(1) membership tests; empty names (trailing commas) are dropped.
    brands = {name.strip() for name in header.split(',')} - {''}

    results = []
    # For a real file use: with open(<name of your file A>) as fh:
    with io.StringIO(titles_text) as fh:
        next(fh, None)  # skip the header line without failing on empty input
        for line in fh:
            title = line.strip()
            # split() without an argument collapses repeated whitespace, so a
            # double space cannot shift the word positions.
            words = title.split()
            if not words:
                continue  # blank line: nothing to parse
            if words[0] in brands:
                # Layout "<brand> <type...> <colour>"
                brand = words[0]
                color = words[-1] if len(words) > 1 else ''
                type_words = words[1:-1]
            else:
                # Layout "<colour> <brand> <type...>"
                color = words[0]
                brand = words[1] if len(words) > 1 else ''
                type_words = words[2:]
            result = {'title': title,
                      'brand': brand,
                      'color': color,
                      'type': ' '.join(type_words)}
            print(result)
            results.append(result)
    return results
- 开销最大的方法。对文件 A 中的每一行,都需要把由文件 B 生成的 dict 列表遍历两次。
要识别顺序不同的行(例如 Honda Accord Navy),需要做两次字符串比较。
import io, csv
def _find_phrase(words, phrase):
    """Return the index at which the word list *phrase* occurs in *words*, or -1."""
    size = len(phrase)
    if size == 0:
        return -1
    for start in range(len(words) - size + 1):
        if words[start:start + size] == phrase:
            return start
    return -1


def looping_dict(titles_text=None, brands_text=None):
    """Split item titles by looking up brand, then type, in the file B table.

    The brand is the first header name of file B found in the title. The type
    is the longest entry in that brand's column that sits at the start or the
    end of the remaining words. Whatever is left over is the colour.

    Matching is done on whole words, so "Camry L" no longer matches a title
    containing "Camry LE", and empty table cells are ignored.

    Args:
        titles_text: Contents of file A (a header line, then one title per
            line). Defaults to the module-level ``A``.
        brands_text: CSV contents of file B (brands in the header row, one
            car type per brand column below). Defaults to the module-level
            ``B``.

    Returns:
        The list of result dicts (keys ``title``, ``brand``, ``color`` and
        ``type``) in input order. Every dict is also printed, as before.
        Blank lines are skipped. When no brand is recognised, ``brand`` and
        ``type`` stay ``''`` and the whole title ends up in ``color``.
    """
    if titles_text is None:
        titles_text = A
    if brands_text is None:
        brands_text = B

    # For a real file use: with open(<name of your file B>) as fh:
    with io.StringIO(brands_text) as fh:
        reader = csv.DictReader(fh)
        car_type = list(reader)
        # Take the brands from the header so a table without data rows
        # still works (car_type[0] would not exist).
        brands = [name for name in (reader.fieldnames or []) if name]

    results = []
    # For a real file use: with open(<name of your file A>) as fh:
    with io.StringIO(titles_text) as fh:
        next(fh, None)  # skip the header line without failing on empty input
        for line in fh:
            title = line.strip()
            if not title:
                continue  # blank line: nothing to parse
            result = {'title': title, 'brand': '', 'color': '', 'type': ''}
            words = title.split()

            # Get brand
            brand = None
            for name in brands:
                brand_words = name.split()
                at = _find_phrase(words, brand_words)
                if at >= 0:
                    brand = name
                    result['brand'] = ' '.join(brand_words)
                    del words[at:at + len(brand_words)]
                    break

            # Get type -- only when a brand was found, so an unrelated
            # brand's column is never consulted.
            if brand is not None:
                best = []
                for row in car_type:
                    # Short rows yield None cells; treat them like empty ones.
                    candidate = (row.get(brand) or '').split()
                    if len(candidate) <= len(best):
                        continue  # empty cell, or not more specific than best
                    if (words[:len(candidate)] == candidate
                            or words[-len(candidate):] == candidate):
                        best = candidate
                if best:
                    result['type'] = ' '.join(best)
                    if words[:len(best)] == best:
                        words = words[len(best):]
                    else:
                        words = words[:-len(best)]

            # Get color
            result['color'] = ' '.join(words)
            print(result)
            results.append(result)
    return results
- 数学方法:使用集合论。
对文件 A 中的每一行,car_type 列表只需遍历一次。
识别顺序不同的行(例如 Honda Accord Navy)不需要任何额外条件。
如果由标题中各个单词组成的集合是 car_type[x].set 的超集,就得到了匹配。
import io, csv
from collections import namedtuple
def theory_of_sets(titles_text=None, brands_text=None):
    """Split item titles by set comparison against every brand/type pair.

    Each non-empty cell of file B becomes a set holding the words of its
    brand and of its car type. A title matches an entry when the set of the
    title's words is a superset of that entry's set. Among several matching
    entries the one with the most words wins (the first such entry in file
    order on ties), so a more specific type beats a shorter one. The colour
    is made of the title words outside the matched set, kept in their
    original order.

    Args:
        titles_text: Contents of file A (a header line, then one title per
            line). Defaults to the module-level ``A``.
        brands_text: CSV contents of file B (brands in the header row, one
            car type per brand column below). Defaults to the module-level
            ``B``.

    Returns:
        A list with one entry per non-blank title, in input order: a dict
        with the keys ``title``, ``brand``, ``color`` and ``type``, or
        ``None`` when no brand/type pair matches. Every entry is also
        printed, as before.
    """
    if titles_text is None:
        titles_text = A
    if brands_text is None:
        brands_text = B

    CarType = namedtuple('CarType', 'words brand type')
    car_type = []
    # For a real file use: with open(<name of your file B>) as fh:
    with io.StringIO(brands_text) as fh:
        for row in csv.DictReader(fh):
            for brand, _type in row.items():
                # Long rows produce a None key and short rows None values;
                # neither those nor empty cells describe a car type.
                if not brand or not _type:
                    continue
                type_words = _type.split()
                if not type_words:
                    continue
                car_type.append(CarType(words=set(brand.split()) | set(type_words),
                                        brand=brand.strip(),
                                        type=' '.join(type_words)))

    results = []
    # For a real file use: with open(<name of your file A>) as fh:
    with io.StringIO(titles_text) as fh:
        next(fh, None)  # skip the header line without failing on empty input
        for line in fh:
            title = line.strip()
            if not title:
                continue  # blank line: nothing to parse
            words = title.split()
            items = set(words)
            # max() returns the first of equally sized candidates, so file
            # order still breaks ties.
            match = max((ct for ct in car_type if items >= ct.words),
                        key=lambda ct: len(ct.words),
                        default=None)
            result = None
            if match is not None:
                # Join the leftover words in title order instead of popping
                # an arbitrary one from a set: this copes with a missing
                # colour and keeps multi-word colours whole.
                color = ' '.join(word for word in words if word not in match.words)
                result = {'title': title,
                          'brand': match.brand,
                          'color': color,
                          'type': match.type}
            print(result)
            results.append(result)
    return results
输出:所有三个示例都打印相同的输出。
{'title': 'Black Toyota Camry L', 'brand': 'Toyota', 'color': 'Black', 'type': 'Camry L'}
{'title': 'Honda Accord Navy', 'brand': 'Honda', 'color': 'Navy', 'type': 'Accord'}
{'title': 'Grey Toyota Corolla', 'brand': 'Toyota', 'color': 'Grey', 'type': 'Corolla'}
{'title': 'Black Nissan Murano', 'brand': 'Nissan', 'color': 'Black', 'type': 'Murano'}
{'title': 'Silver Toyota Camry LE', 'brand': 'Toyota', 'color': 'Silver', 'type': 'Camry LE'}