听起来,文本在页面上的位置信息会对您有很大帮助。我建议使用 PyMuPDF 提取带有位置数据的文本,这样您就可以根据坐标定位到具体的行。
下面是一个代码示例,用于将带有位置信息的文本提取到 *.csv 文件中。希望它能帮助您开始使用 Python 提取所需的信息。
#!python3.3
""" Use PyMuPDF to extract text to *.csv file. """
import csv
import json
import os
import sys
import fitz
# Validate the command line explicitly instead of with ``assert``, because
# assertions are stripped when Python runs with -O.
if len(sys.argv) != 2:
    sys.exit('Pass file name as parameter')
srcfilename = sys.argv[1]
if not os.path.isfile(srcfilename):
    sys.exit('File {} does not exist'.format(srcfilename))

# The output path is the input path with ".csv" appended.
dstfilename = '{}.csv'.format(srcfilename)
with open(dstfilename, 'w', encoding='utf-8', errors='ignore', newline='') as dstfile:
    writer = csv.writer(dstfile)
    writer.writerow([
        'PAGE',
        'X1',
        'Y1',
        'X2',
        'Y2',
        'TEXT',
    ])
    document = fitz.open(srcfilename)
    try:
        # NOTE(review): pageCount / getPageText are the legacy camelCase
        # PyMuPDF names; confirm they exist in the installed version.
        for page_number in range(document.pageCount):
            text_dict = json.loads(
                document.getPageText(page_number, output='json')
            )
            for block in text_dict['blocks']:
                # Only text blocks carry lines and spans.
                if block['type'] != 'text':
                    continue
                for line in block['lines']:
                    for span in line['spans']:
                        # One CSV row per span: zero-based page number, the
                        # four bbox values, then the span text.
                        x1, y1, x2, y2 = span['bbox'][:4]
                        writer.writerow([
                            page_number,
                            x1,
                            y1,
                            x2,
                            y2,
                            span['text'],
                        ])
    finally:
        # Release the document even if extraction fails part-way through.
        document.close()
下面是我编写的一些代码,用于从您的 PDF 中提取数据,并将内容写入格式更规整的 *.csv 文件:
#!python3.3
import collections
import csv
import json
import os
import fitz # PyMuPDF package
class MemberEligibility(object):
    """One row of the Member Eligibility Data Contents Guide table."""

    def __init__(self):
        """
        Create a row whose fields are all empty strings.

        Every field is stored as a string; some of them may be better
        represented as dates or integers.
        """
        # Identification columns
        self.col = self.element = self.data_element_name = ''
        # Modification date, format and length columns
        self.date_modified = self.fmt = self.length = ''
        # Free-text columns
        self.description = self.comments = self.condition = ''
        self.recommended_threshold = ''
def get_sorted_list(document, page_number):
    """
    Return the text spans of one page ordered by position.

    The page text is requested from ``document`` as JSON. Every span of
    every text block becomes a ``(y, x, text)`` tuple, where ``y`` and ``x``
    are the first two bbox values swapped and truncated to integers, so that
    spans with slightly different y-coordinates line up. The tuples are
    returned sorted, i.e. top-to-bottom and then left-to-right.
    """
    page = json.loads(document.getPageText(page_number, output='json'))
    # Flatten blocks -> lines -> spans, keeping text blocks only.
    spans = (
        span
        for block in page['blocks']
        if block['type'] == 'text'
        for line in block['lines']
        for span in line['spans']
    )
    return sorted(
        (
            int(span['bbox'][1]),  # Top-left y-coordinate
            int(span['bbox'][0]),  # Top-left x-coordinate
            span['text'],          # Text itself
        )
        for span in spans
    )
def main():
    """
    Mine the Member Eligibility table from the PDF and write it to *.csv.

    The PDF is read from the folder containing this script, and
    ``EligibilityDataContentsGuide.csv`` is written to the same folder.
    Only page index 10 (p. 11 of the PDF) is processed.
    """
    # Downloaded PDF to same folder as this script
    script_dir = os.path.dirname(os.path.abspath(__file__))
    pdf_filename = os.path.join(
        script_dir,
        'CT_DSG_-12132014_version_1.2_(with_clarifications).pdf'
    )

    # (lower x, upper x, MemberEligibility attribute) for every column after
    # the first one. Text in these columns is appended to the current row.
    # NOTE(review): both bounds are exclusive, so text whose truncated
    # x-coordinate equals a boundary (e.g. 72) matches no column and is
    # dropped. Confirm this is intended.
    continuation_columns = (
        (72, 118, 'element'),
        (118, 175, 'data_element_name'),
        (175, 221, 'date_modified'),
        (221, 268, 'fmt'),
        (268, 315, 'length'),
        (315, 390, 'description'),
        (390, 633, 'comments'),
        (633, 709, 'condition'),
        (709, 765, 'recommended_threshold'),
    )

    # Mine PDF for data
    # Using OrderedDict so iteration will occur in same order as rows appear
    # in PDF
    member_eligibility_dict = collections.OrderedDict()
    # No row exists until the first "Col" cell has been seen.
    row = None
    document = fitz.open(pdf_filename)
    try:
        for page_number in range(document.pageCount):
            # Page numbers are zero-based. Only p. 11 of the PDF is read.
            if not 10 <= page_number <= 10:
                continue
            for y, x, text in get_sorted_list(document, page_number):
                # Only look at text whose y-coordinates are within the data
                # portion of the table
                if not 115 < y < 575:
                    continue
                if 25 < x < 72:
                    # Assuming one row of text per cell in this column but
                    # this doesn't appear to hold on p. 10 of PDF so may
                    # need to be modified if you're going to do whole table
                    row = MemberEligibility()
                    row.col = text
                    member_eligibility_dict[row.col] = row
                    continue
                if row is None:
                    # Text that sorts before the first "Col" cell has no row
                    # to attach to. Skip it rather than fail on an unbound
                    # ``row``.
                    continue
                for lower, upper, attribute in continuation_columns:
                    if lower < x < upper:
                        setattr(row, attribute, getattr(row, attribute) + text)
                        break
    finally:
        # Release the document even if mining fails part-way through.
        document.close()

    # Write data to *.csv
    # (CSV header, MemberEligibility attribute) in output column order.
    csv_columns = (
        ('Col', 'col'),
        ('Element', 'element'),
        ('Data Element Name', 'data_element_name'),
        ('Date Modified', 'date_modified'),
        ('Format', 'fmt'),
        ('Length', 'length'),
        ('Description', 'description'),
        ('Element Submission Guideline Comments', 'comments'),
        ('Condition (Denominator)', 'condition'),
        ('Recommended Threshold', 'recommended_threshold'),
    )
    csv_filename = os.path.join(script_dir, 'EligibilityDataContentsGuide.csv')
    with open(csv_filename, 'w', encoding='utf-8', errors='ignore', newline='') as f:
        writer = csv.writer(f)
        writer.writerow([header for header, _ in csv_columns])
        for row in member_eligibility_dict.values():
            writer.writerow(
                [getattr(row, attribute) for _, attribute in csv_columns]
            )
# Run main() only when this file is executed as a script, not when imported.
if __name__ == '__main__':
    main()
您可能还需要做一些额外的工作,才能得到想要的结果。