You cannot select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
ambar2/Pipeline/parsers/ocrproxy.py

65 lines
1.7 KiB
Python

from PIL import Image
import pyocr
import pyocr.builders
import sys
import re
import io
class OCRProxyResponse:
def __init__(self):
self.text = ''
self.success = False
self.message = ''
class OCRProxy:
def __init__(self):
self.ocr = None
self.lang = None
for tool in pyocr.get_available_tools():
if tool.get_name() == 'Tesseract (sh)':
self.ocr = tool
break
if self.ocr:
self.lang = '+'.join(self.ocr.get_available_languages())
def PerformOCR(self, ImageData):
resp = OCRProxyResponse()
if not self.ocr:
resp.success = False
resp.message = 'No OCR foud!'
return resp
try:
image = Image.open(io.BytesIO(ImageData))
if 'compression' in image.info and image.info['compression']=='tiff_jpeg':
resp.success = False
resp.message = 'Unsupported compression type'
return resp
text = self.ocr.image_to_string(
image,
lang = self.lang,
builder = pyocr.builders.TextBuilder()
)
resp.text = text
resp.success = True
except Exception as ex:
resp.success = False
resp.message = str(ex)
return resp
@classmethod
def GetLanguages(cls):
lang = ''
for tool in pyocr.get_available_tools():
if tool.get_name() == 'Tesseract (sh)':
ocr = tool
break
if ocr:
lang = '+'.join(ocr.get_available_languages())
return lang