这里尝试使用OpenCV,思路是:
-
获取二值图像。我们加载图像,使用放大
imutils.resize帮助获得更好的OCR结果(见Tesseract improve quality),转换为灰度,然后Otsu's threshold获得二值图像(1通道)。
-
删除表格网格线。我们创建一个horizontal and vertical kernels,然后执行morphological operations 将相邻的文本轮廓组合成一个轮廓。这个想法是将 ROI 行作为一个整体提取到 OCR。
-
提取行 ROI。 我们 find contours 然后使用 imutils.contours.sort_contours 从上到下排序。这确保我们以正确的顺序遍历每一行。从这里我们遍历轮廓,使用 Numpy 切片提取行 ROI,使用 Pytesseract 进行 OCR,然后解析数据。
这是每个步骤的可视化:
输入图像
二值图像
变形关闭
遍历每一行的可视化
提取的行 ROI
输出发票数据结果:
{'line': '0', 'tariff': '85444290', 'quantity': '3', 'amount': '258.93'}
{'line': '1', 'tariff': '85444290', 'quantity': '4', 'amount': '548.32'}
{'line': '2', 'tariff': '76109090', 'quantity': '5', 'amount': '412.30'}
不幸的是,在尝试第二张和第三张图片时,我得到了不同的结果。这种方法在其他图像上不会产生很好的效果,因为发票的布局都不同。但是,这种方法表明,假设您有固定的发票布局,可以使用传统的图像处理技术来提取发票信息。
代码
import cv2
import numpy as np
import pytesseract
from imutils import contours
import imutils
pytesseract.pytesseract.tesseract_cmd = r"C:\Program Files\Tesseract-OCR\tesseract.exe"
# Load image, enlarge, convert to grayscale, Otsu's threshold
image = cv2.imread('1.png')
image = imutils.resize(image, width=1000)
height, width = image.shape[:2]
gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
thresh = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)[1]
# Remove horizontal lines
horizontal_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (50,1))
detect_horizontal = cv2.morphologyEx(thresh, cv2.MORPH_OPEN, horizontal_kernel, iterations=2)
cnts = cv2.findContours(detect_horizontal, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
cnts = cnts[0] if len(cnts) == 2 else cnts[1]
for c in cnts:
cv2.drawContours(thresh, [c], -1, 0, -1)
# Remove vertical lines
vertical_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (1,50))
detect_vertical = cv2.morphologyEx(thresh, cv2.MORPH_OPEN, vertical_kernel, iterations=2)
cnts = cv2.findContours(detect_vertical, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
cnts = cnts[0] if len(cnts) == 2 else cnts[1]
for c in cnts:
cv2.drawContours(thresh, [c], -1, 0, -1)
# Morph close to combine adjacent contours into a single contour
invoice_data = []
kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (85,5))
close = cv2.morphologyEx(thresh, cv2.MORPH_CLOSE, kernel, iterations=3)
# Find contours, sort from top-to-bottom
# Iterate through contours, extract row ROI, OCR, and parse data
cnts = cv2.findContours(close, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
cnts = cnts[0] if len(cnts) == 2 else cnts[1]
(cnts, _) = contours.sort_contours(cnts, method="top-to-bottom")
row = 0
for c in cnts:
x,y,w,h = cv2.boundingRect(c)
ROI = image[y:y+h, 0:width]
ROI = cv2.GaussianBlur(ROI, (3,3), 0)
data = pytesseract.image_to_string(ROI, lang='eng', config='--psm 6')
parsed = [word.lower() for word in data.split()]
if 'tariff' in parsed or 'number' in parsed:
row_data = {}
row_data['line'] = str(row)
row_data['tariff'] = parsed[-1]
row_data['quantity'] = parsed[2]
row_data['amount'] = str(max(parsed[10], parsed[11]))
row += 1
print(row_data)
invoice_data.append(row_data)
# Visualize row extraction
'''
mask = np.zeros(image.shape, dtype=np.uint8)
cv2.rectangle(mask, (0, y), (width, y + h), (255,255,255), -1)
display_row = cv2.bitwise_and(image, mask)
cv2.imshow('ROI', ROI)
cv2.imshow('display_row', display_row)
cv2.waitKey(1000)
'''
print(invoice_data)
cv2.imshow('thresh', thresh)
cv2.imshow('close', close)
cv2.waitKey()