Use paragraph bbox for gvision if only 1 line

owocr/ocr.py (286 changed lines)

@@ -417,60 +417,30 @@ class GoogleVision:
         except:
             logger.warning('Error parsing Google credentials, Google Vision will not work!')
 
-    def __call__(self, img):
-        img, is_path = input_to_pil_image(img)
-        if not img:
-            return (False, 'Invalid image provided')
-
-        image_bytes = self._preprocess(img)
-        image = vision.Image(content=image_bytes)
-
-        try:
-            response = self.client.document_text_detection(image=image)
-        except ServiceUnavailable:
-            return (False, 'Connection error!')
-        except Exception as e:
-            return (False, 'Unknown error!')
-
-        ocr_result = self._to_generic_result(response.full_text_annotation, img.width, img.height)
-        x = (True, ocr_result)
-
-        if is_path:
-            img.close()
-        return x
-
-    def _to_generic_result(self, full_text_annotation, img_width, img_height):
-        paragraphs = []
-
-        if full_text_annotation:
-            for page in full_text_annotation.pages:
-                if page.width == img_width and page.height == img_height:
-                    for block in page.blocks:
-                        for google_paragraph in block.paragraphs:
-                            p_bbox = self._convert_bbox(google_paragraph.bounding_box, img_width, img_height)
-                            lines = self._create_lines_from_google_paragraph(google_paragraph, img_width, img_height)
-                            paragraph = Paragraph(bounding_box=p_bbox, lines=lines)
-                            paragraphs.append(paragraph)
-
-        return OcrResult(
-            image_properties=ImageProperties(width=img_width, height=img_height),
-            paragraphs=paragraphs
+    def _break_type_to_char(self, break_type):
+        if break_type == vision.TextAnnotation.DetectedBreak.BreakType.SPACE:
+            return ' '
+        elif break_type == vision.TextAnnotation.DetectedBreak.BreakType.SURE_SPACE:
+            return ' '
+        elif break_type == vision.TextAnnotation.DetectedBreak.BreakType.EOL_SURE_SPACE:
+            return '\n'
+        elif break_type == vision.TextAnnotation.DetectedBreak.BreakType.HYPHEN:
+            return '-'
+        elif break_type == vision.TextAnnotation.DetectedBreak.BreakType.LINE_BREAK:
+            return '\n'
+        return ''
+
+    def _convert_bbox(self, quad, img_width, img_height):
+        vertices = quad.vertices
+
+        return quad_to_bounding_box(
+            vertices[0].x, vertices[0].y,
+            vertices[1].x, vertices[1].y,
+            vertices[2].x, vertices[2].y,
+            vertices[3].x, vertices[3].y,
+            img_width, img_height
         )
 
-    def _create_lines_from_google_paragraph(self, google_paragraph, img_width, img_height):
-        lines = []
-        words = []
-        for google_word in google_paragraph.words:
-            word = self._create_word_from_google_word(google_word, img_width, img_height)
-            words.append(word)
-            if word.separator == '\n':
-                l_bbox = merge_bounding_boxes(words, True)
-                line = Line(bounding_box=l_bbox, words=words)
-                lines.append(line)
-                words = []
-
-        return lines
-
     def _create_word_from_google_word(self, google_word, img_width, img_height):
         w_bbox = self._convert_bbox(google_word.bounding_box, img_width, img_height)
 
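
Note: the relocated _break_type_to_char above flattens Google Vision's per-symbol detected breaks into plain separator characters; these separators are what the line-splitting code below checks (a word whose separator is '\n' ends a line). A standalone sketch of the same mapping as a dict lookup rather than an elif chain (break_type_to_char and _BREAK_CHARS are names chosen here, not from the codebase; the enum path is the one used in the diff):

    from google.cloud import vision

    BreakType = vision.TextAnnotation.DetectedBreak.BreakType

    # Same mapping as _break_type_to_char in the hunk above:
    # spaces collapse to ' ', end-of-line breaks to '\n', hyphens to '-'.
    _BREAK_CHARS = {
        BreakType.SPACE: ' ',
        BreakType.SURE_SPACE: ' ',
        BreakType.EOL_SURE_SPACE: '\n',
        BreakType.HYPHEN: '-',
        BreakType.LINE_BREAK: '\n',
    }

    def break_type_to_char(break_type):
        # Unrecognized break types contribute no separator.
        return _BREAK_CHARS.get(break_type, '')
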
@@ -497,30 +467,66 @@ class GoogleVision:
             separator=w_separator
         )
 
-    def _break_type_to_char(self, break_type):
-        if break_type == vision.TextAnnotation.DetectedBreak.BreakType.SPACE:
-            return ' '
-        elif break_type == vision.TextAnnotation.DetectedBreak.BreakType.SURE_SPACE:
-            return ' '
-        elif break_type == vision.TextAnnotation.DetectedBreak.BreakType.EOL_SURE_SPACE:
-            return '\n'
-        elif break_type == vision.TextAnnotation.DetectedBreak.BreakType.HYPHEN:
-            return '-'
-        elif break_type == vision.TextAnnotation.DetectedBreak.BreakType.LINE_BREAK:
-            return '\n'
-        return ''
-
-    def _convert_bbox(self, quad, img_width, img_height):
-        vertices = quad.vertices
-
-        return quad_to_bounding_box(
-            vertices[0].x, vertices[0].y,
-            vertices[1].x, vertices[1].y,
-            vertices[2].x, vertices[2].y,
-            vertices[3].x, vertices[3].y,
-            img_width, img_height
+    def _create_lines_from_google_paragraph(self, google_paragraph, p_bbox, img_width, img_height):
+        lines = []
+        words = []
+        for google_word in google_paragraph.words:
+            word = self._create_word_from_google_word(google_word, img_width, img_height)
+            words.append(word)
+            if word.separator == '\n':
+                line = Line(bounding_box=BoundingBox(0,0,0,0), words=words)
+                lines.append(line)
+                words = []
+
+        if len(lines) == 1:
+            lines[0].bounding_box = p_bbox
+        else:
+            for line in lines:
+                l_bbox = merge_bounding_boxes(line.words, True)
+                line.bounding_box = l_bbox
+
+        return lines
+
+    def _to_generic_result(self, full_text_annotation, img_width, img_height):
+        paragraphs = []
+
+        if full_text_annotation:
+            for page in full_text_annotation.pages:
+                if page.width == img_width and page.height == img_height:
+                    for block in page.blocks:
+                        for google_paragraph in block.paragraphs:
+                            p_bbox = self._convert_bbox(google_paragraph.bounding_box, img_width, img_height)
+                            lines = self._create_lines_from_google_paragraph(google_paragraph, p_bbox, img_width, img_height)
+                            paragraph = Paragraph(bounding_box=p_bbox, lines=lines)
+                            paragraphs.append(paragraph)
+
+        return OcrResult(
+            image_properties=ImageProperties(width=img_width, height=img_height),
+            paragraphs=paragraphs
         )
 
+    def __call__(self, img):
+        img, is_path = input_to_pil_image(img)
+        if not img:
+            return (False, 'Invalid image provided')
+
+        image_bytes = self._preprocess(img)
+        image = vision.Image(content=image_bytes)
+
+        try:
+            response = self.client.document_text_detection(image=image)
+        except ServiceUnavailable:
+            return (False, 'Connection error!')
+        except Exception as e:
+            return (False, 'Unknown error!')
+
+        ocr_result = self._to_generic_result(response.full_text_annotation, img.width, img.height)
+        x = (True, ocr_result)
+
+        if is_path:
+            img.close()
+        return x
+
     def _preprocess(self, img):
         return pil_image_to_bytes(img)
 
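
This hunk carries the change named in the commit title: _create_lines_from_google_paragraph now receives the paragraph's bounding box, builds each Line with a placeholder BoundingBox(0,0,0,0), and decides afterwards what box each line gets. When a paragraph contains exactly one line, Google's own paragraph bbox is reused for that line; only multi-line paragraphs fall back to merging word boxes. A sketch of just that decision, assuming the Line objects and merge_bounding_boxes helper defined elsewhere in this file (assign_line_bboxes is a hypothetical name, not the committed code):

    # Sketch: assign line bboxes once all lines of a paragraph are known.
    def assign_line_bboxes(lines, p_bbox):
        if len(lines) == 1:
            # Single-line paragraph: the paragraph bbox from Google Vision
            # already bounds the line, so reuse it instead of merging
            # the individual word boxes.
            lines[0].bounding_box = p_bbox
        else:
            for line in lines:
                line.bounding_box = merge_bounding_boxes(line.words, True)
        return lines
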
@@ -1322,24 +1328,14 @@ class AzureImageAnalysis:
         except:
             logger.warning('Error parsing Azure credentials, Azure Image Analysis will not work!')
 
-    def __call__(self, img):
-        img, is_path = input_to_pil_image(img)
-        if not img:
-            return (False, 'Invalid image provided')
-
-        try:
-            read_result = self.client.analyze(image_data=self._preprocess(img), visual_features=[VisualFeatures.READ])
-        except ServiceRequestError:
-            return (False, 'Connection error!')
-        except:
-            return (False, 'Unknown error!')
-
-        ocr_result = self._to_generic_result(read_result, img.width, img.height)
-        x = (True, ocr_result)
-
-        if is_path:
-            img.close()
-        return x
+    def _convert_bbox(self, rect, img_width, img_height):
+        return quad_to_bounding_box(
+            rect[0]['x'], rect[0]['y'],
+            rect[1]['x'], rect[1]['y'],
+            rect[2]['x'], rect[2]['y'],
+            rect[3]['x'], rect[3]['y'],
+            img_width, img_height
+        )
 
     def _to_generic_result(self, read_result, img_width, img_height):
         paragraphs = []
@@ -1374,14 +1370,24 @@ class AzureImageAnalysis:
             paragraphs=paragraphs
         )
 
-    def _convert_bbox(self, rect, img_width, img_height):
-        return quad_to_bounding_box(
-            rect[0]['x'], rect[0]['y'],
-            rect[1]['x'], rect[1]['y'],
-            rect[2]['x'], rect[2]['y'],
-            rect[3]['x'], rect[3]['y'],
-            img_width, img_height
-        )
+    def __call__(self, img):
+        img, is_path = input_to_pil_image(img)
+        if not img:
+            return (False, 'Invalid image provided')
+
+        try:
+            read_result = self.client.analyze(image_data=self._preprocess(img), visual_features=[VisualFeatures.READ])
+        except ServiceRequestError:
+            return (False, 'Connection error!')
+        except:
+            return (False, 'Unknown error!')
+
+        ocr_result = self._to_generic_result(read_result, img.width, img.height)
+        x = (True, ocr_result)
+
+        if is_path:
+            img.close()
+        return x
 
     def _preprocess(self, img):
         min_pixel_size = 50
@@ -1601,6 +1607,50 @@ class OCRSpace:
         else:
             return 'auto'
 
+    def _convert_bbox(self, word_data, img_width, img_height):
+        left = word_data['Left'] / img_width
+        top = word_data['Top'] / img_height
+        width = word_data['Width'] / img_width
+        height = word_data['Height'] / img_height
+
+        center_x = left + width / 2
+        center_y = top + height / 2
+
+        return BoundingBox(
+            center_x=center_x,
+            center_y=center_y,
+            width=width,
+            height=height
+        )
+
+    def _to_generic_result(self, api_result, img_width, img_height, og_img_width, og_img_height):
+        parsed_result = api_result['ParsedResults'][0]
+        text_overlay = parsed_result.get('TextOverlay', {})
+
+        image_props = ImageProperties(width=og_img_width, height=og_img_height)
+        ocr_result = OcrResult(image_properties=image_props)
+
+        lines_data = text_overlay.get('Lines', [])
+
+        lines = []
+        for line_data in lines_data:
+            words = []
+            for word_data in line_data.get('Words', []):
+                w_bbox = self._convert_bbox(word_data, img_width, img_height)
+                words.append(Word(text=word_data['WordText'], bounding_box=w_bbox))
+
+            l_bbox = merge_bounding_boxes(words)
+            lines.append(Line(bounding_box=l_bbox, words=words))
+
+        if lines:
+            p_bbox = merge_bounding_boxes(lines)
+            paragraph = Paragraph(bounding_box=p_bbox, lines=lines)
+            ocr_result.paragraphs = [paragraph]
+        else:
+            ocr_result.paragraphs = []
+
+        return ocr_result
+
     def __call__(self, img):
         img, is_path = input_to_pil_image(img)
         if not img:
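
The new OCRSpace _convert_bbox converts the API's pixel-space Left/Top/Width/Height overlay fields into the normalized, center-based BoundingBox used across this module: divide by the dimensions of the image that was actually sent (which _preprocess may have resized via limit_image_size, hence the separate og_img_width/og_img_height kept in ImageProperties), then shift the top-left corner to the center. A worked example with made-up numbers:

    # Hypothetical word overlay on a 200x100 px image sent to OCRSpace.
    word_data = {'Left': 20, 'Top': 10, 'Width': 40, 'Height': 20}
    img_width, img_height = 200, 100

    left = word_data['Left'] / img_width        # 0.1
    top = word_data['Top'] / img_height         # 0.1
    width = word_data['Width'] / img_width      # 0.2
    height = word_data['Height'] / img_height   # 0.2

    center_x = left + width / 2                 # 0.2
    center_y = top + height / 2                 # 0.2
    # -> BoundingBox(center_x=0.2, center_y=0.2, width=0.2, height=0.2)
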
@@ -1644,49 +1694,5 @@ class OCRSpace:
             img.close()
         return x
 
-    def _to_generic_result(self, api_result, img_width, img_height, og_img_width, og_img_height):
-        parsed_result = api_result['ParsedResults'][0]
-        text_overlay = parsed_result.get('TextOverlay', {})
-
-        image_props = ImageProperties(width=og_img_width, height=og_img_height)
-        ocr_result = OcrResult(image_properties=image_props)
-
-        lines_data = text_overlay.get('Lines', [])
-
-        lines = []
-        for line_data in lines_data:
-            words = []
-            for word_data in line_data.get('Words', []):
-                w_bbox = self._convert_bbox(word_data, img_width, img_height)
-                words.append(Word(text=word_data['WordText'], bounding_box=w_bbox))
-
-            l_bbox = merge_bounding_boxes(words)
-            lines.append(Line(bounding_box=l_bbox, words=words))
-
-        if lines:
-            p_bbox = merge_bounding_boxes(lines)
-            paragraph = Paragraph(bounding_box=p_bbox, lines=lines)
-            ocr_result.paragraphs = [paragraph]
-        else:
-            ocr_result.paragraphs = []
-
-        return ocr_result
-
-    def _convert_bbox(self, word_data, img_width, img_height):
-        left = word_data['Left'] / img_width
-        top = word_data['Top'] / img_height
-        width = word_data['Width'] / img_width
-        height = word_data['Height'] / img_height
-
-        center_x = left + width / 2
-        center_y = top + height / 2
-
-        return BoundingBox(
-            center_x=center_x,
-            center_y=center_y,
-            width=width,
-            height=height
-        )
-
     def _preprocess(self, img):
         return limit_image_size(img, self.max_byte_size)