From f724d5c41fd03797efc1c8fd65adcbc8bcf3cc77 Mon Sep 17 00:00:00 2001
From: AuroraWright
Date: Thu, 16 Oct 2025 08:18:37 +0200
Subject: [PATCH] Use paragraph bbox for gvision if only 1 line

---
 owocr/ocr.py | 286 ++++++++++++++++++++++++++-------------------
 1 file changed, 146 insertions(+), 140 deletions(-)

diff --git a/owocr/ocr.py b/owocr/ocr.py
index 5c52079..b79b2bb 100644
--- a/owocr/ocr.py
+++ b/owocr/ocr.py
@@ -417,60 +417,30 @@ class GoogleVision:
         except:
             logger.warning('Error parsing Google credentials, Google Vision will not work!')
 
-    def __call__(self, img):
-        img, is_path = input_to_pil_image(img)
-        if not img:
-            return (False, 'Invalid image provided')
+    def _break_type_to_char(self, break_type):
+        if break_type == vision.TextAnnotation.DetectedBreak.BreakType.SPACE:
+            return ' '
+        elif break_type == vision.TextAnnotation.DetectedBreak.BreakType.SURE_SPACE:
+            return ' '
+        elif break_type == vision.TextAnnotation.DetectedBreak.BreakType.EOL_SURE_SPACE:
+            return '\n'
+        elif break_type == vision.TextAnnotation.DetectedBreak.BreakType.HYPHEN:
+            return '-'
+        elif break_type == vision.TextAnnotation.DetectedBreak.BreakType.LINE_BREAK:
+            return '\n'
+        return ''
 
-        image_bytes = self._preprocess(img)
-        image = vision.Image(content=image_bytes)
+    def _convert_bbox(self, quad, img_width, img_height):
+        vertices = quad.vertices
 
-        try:
-            response = self.client.document_text_detection(image=image)
-        except ServiceUnavailable:
-            return (False, 'Connection error!')
-        except Exception as e:
-            return (False, 'Unknown error!')
-
-        ocr_result = self._to_generic_result(response.full_text_annotation, img.width, img.height)
-        x = (True, ocr_result)
-
-        if is_path:
-            img.close()
-        return x
-
-    def _to_generic_result(self, full_text_annotation, img_width, img_height):
-        paragraphs = []
-
-        if full_text_annotation:
-            for page in full_text_annotation.pages:
-                if page.width == img_width and page.height == img_height:
-                    for block in page.blocks:
-                        for google_paragraph in block.paragraphs:
-                            p_bbox = self._convert_bbox(google_paragraph.bounding_box, img_width, img_height)
-                            lines = self._create_lines_from_google_paragraph(google_paragraph, img_width, img_height)
-                            paragraph = Paragraph(bounding_box=p_bbox, lines=lines)
-                            paragraphs.append(paragraph)
-
-        return OcrResult(
-            image_properties=ImageProperties(width=img_width, height=img_height),
-            paragraphs=paragraphs
+        return quad_to_bounding_box(
+            vertices[0].x, vertices[0].y,
+            vertices[1].x, vertices[1].y,
+            vertices[2].x, vertices[2].y,
+            vertices[3].x, vertices[3].y,
+            img_width, img_height
         )
 
-    def _create_lines_from_google_paragraph(self, google_paragraph, img_width, img_height):
-        lines = []
-        words = []
-        for google_word in google_paragraph.words:
-            word = self._create_word_from_google_word(google_word, img_width, img_height)
-            words.append(word)
-            if word.separator == '\n':
-                l_bbox = merge_bounding_boxes(words, True)
-                line = Line(bounding_box=l_bbox, words=words)
-                lines.append(line)
-                words = []
-
-        return lines
-
     def _create_word_from_google_word(self, google_word, img_width, img_height):
         w_bbox = self._convert_bbox(google_word.bounding_box, img_width, img_height)
 
@@ -497,30 +467,66 @@ class GoogleVision:
             separator=w_separator
         )
 
-    def _break_type_to_char(self, break_type):
-        if break_type == vision.TextAnnotation.DetectedBreak.BreakType.SPACE:
-            return ' '
-        elif break_type == vision.TextAnnotation.DetectedBreak.BreakType.SURE_SPACE:
-            return ' '
-        elif break_type == vision.TextAnnotation.DetectedBreak.BreakType.EOL_SURE_SPACE:
-            return '\n'
-        elif break_type == vision.TextAnnotation.DetectedBreak.BreakType.HYPHEN:
-            return '-'
-        elif break_type == vision.TextAnnotation.DetectedBreak.BreakType.LINE_BREAK:
-            return '\n'
-        return ''
+    def _create_lines_from_google_paragraph(self, google_paragraph, p_bbox, img_width, img_height):
+        lines = []
+        words = []
+        for google_word in google_paragraph.words:
+            word = self._create_word_from_google_word(google_word, img_width, img_height)
+            words.append(word)
+            if word.separator == '\n':
+                line = Line(bounding_box=BoundingBox(0,0,0,0), words=words)
+                lines.append(line)
+                words = []
 
-    def _convert_bbox(self, quad, img_width, img_height):
-        vertices = quad.vertices
+        if len(lines) == 1:
+            lines[0].bounding_box = p_bbox
+        else:
+            for line in lines:
+                l_bbox = merge_bounding_boxes(line.words, True)
+                line.bounding_box = l_bbox
 
-        return quad_to_bounding_box(
-            vertices[0].x, vertices[0].y,
-            vertices[1].x, vertices[1].y,
-            vertices[2].x, vertices[2].y,
-            vertices[3].x, vertices[3].y,
-            img_width, img_height
+        return lines
+
+    def _to_generic_result(self, full_text_annotation, img_width, img_height):
+        paragraphs = []
+
+        if full_text_annotation:
+            for page in full_text_annotation.pages:
+                if page.width == img_width and page.height == img_height:
+                    for block in page.blocks:
+                        for google_paragraph in block.paragraphs:
+                            p_bbox = self._convert_bbox(google_paragraph.bounding_box, img_width, img_height)
+                            lines = self._create_lines_from_google_paragraph(google_paragraph, p_bbox, img_width, img_height)
+                            paragraph = Paragraph(bounding_box=p_bbox, lines=lines)
+                            paragraphs.append(paragraph)
+
+        return OcrResult(
+            image_properties=ImageProperties(width=img_width, height=img_height),
+            paragraphs=paragraphs
         )
 
+    def __call__(self, img):
+        img, is_path = input_to_pil_image(img)
+        if not img:
+            return (False, 'Invalid image provided')
+
+        image_bytes = self._preprocess(img)
+        image = vision.Image(content=image_bytes)
+
+        try:
+            response = self.client.document_text_detection(image=image)
+        except ServiceUnavailable:
+            return (False, 'Connection error!')
+        except Exception as e:
+            return (False, 'Unknown error!')
+
+        ocr_result = self._to_generic_result(response.full_text_annotation, img.width, img.height)
+        x = (True, ocr_result)
+
+        if is_path:
+            img.close()
+        return x
+
     def _preprocess(self, img):
         return pil_image_to_bytes(img)
 
@@ -1322,24 +1328,14 @@ class AzureImageAnalysis:
         except:
             logger.warning('Error parsing Azure credentials, Azure Image Analysis will not work!')
 
-    def __call__(self, img):
-        img, is_path = input_to_pil_image(img)
-        if not img:
-            return (False, 'Invalid image provided')
-
-        try:
-            read_result = self.client.analyze(image_data=self._preprocess(img), visual_features=[VisualFeatures.READ])
-        except ServiceRequestError:
-            return (False, 'Connection error!')
-        except:
-            return (False, 'Unknown error!')
-
-        ocr_result = self._to_generic_result(read_result, img.width, img.height)
-        x = (True, ocr_result)
-
-        if is_path:
-            img.close()
-        return x
+    def _convert_bbox(self, rect, img_width, img_height):
+        return quad_to_bounding_box(
+            rect[0]['x'], rect[0]['y'],
+            rect[1]['x'], rect[1]['y'],
+            rect[2]['x'], rect[2]['y'],
+            rect[3]['x'], rect[3]['y'],
+            img_width, img_height
+        )
 
     def _to_generic_result(self, read_result, img_width, img_height):
         paragraphs = []
@@ -1374,14 +1370,24 @@ class AzureImageAnalysis:
             paragraphs=paragraphs
         )
 
-    def _convert_bbox(self, rect, img_width, img_height):
-        return quad_to_bounding_box(
-            rect[0]['x'], rect[0]['y'],
-            rect[1]['x'], rect[1]['y'],
-            rect[2]['x'], rect[2]['y'],
-            rect[3]['x'], rect[3]['y'],
-            img_width, img_height
-        )
+    def __call__(self, img):
+        img, is_path = input_to_pil_image(img)
+        if not img:
+            return (False, 'Invalid image provided')
+
+        try:
+            read_result = self.client.analyze(image_data=self._preprocess(img), visual_features=[VisualFeatures.READ])
+        except ServiceRequestError:
+            return (False, 'Connection error!')
+        except:
+            return (False, 'Unknown error!')
+
+        ocr_result = self._to_generic_result(read_result, img.width, img.height)
+        x = (True, ocr_result)
+
+        if is_path:
+            img.close()
+        return x
 
     def _preprocess(self, img):
         min_pixel_size = 50
@@ -1601,6 +1607,50 @@ class OCRSpace:
         else:
             return 'auto'
 
+    def _convert_bbox(self, word_data, img_width, img_height):
+        left = word_data['Left'] / img_width
+        top = word_data['Top'] / img_height
+        width = word_data['Width'] / img_width
+        height = word_data['Height'] / img_height
+
+        center_x = left + width / 2
+        center_y = top + height / 2
+
+        return BoundingBox(
+            center_x=center_x,
+            center_y=center_y,
+            width=width,
+            height=height
+        )
+
+    def _to_generic_result(self, api_result, img_width, img_height, og_img_width, og_img_height):
+        parsed_result = api_result['ParsedResults'][0]
+        text_overlay = parsed_result.get('TextOverlay', {})
+
+        image_props = ImageProperties(width=og_img_width, height=og_img_height)
+        ocr_result = OcrResult(image_properties=image_props)
+
+        lines_data = text_overlay.get('Lines', [])
+
+        lines = []
+        for line_data in lines_data:
+            words = []
+            for word_data in line_data.get('Words', []):
+                w_bbox = self._convert_bbox(word_data, img_width, img_height)
+                words.append(Word(text=word_data['WordText'], bounding_box=w_bbox))
+
+            l_bbox = merge_bounding_boxes(words)
+            lines.append(Line(bounding_box=l_bbox, words=words))
+
+        if lines:
+            p_bbox = merge_bounding_boxes(lines)
+            paragraph = Paragraph(bounding_box=p_bbox, lines=lines)
+            ocr_result.paragraphs = [paragraph]
+        else:
+            ocr_result.paragraphs = []
+
+        return ocr_result
+
     def __call__(self, img):
         img, is_path = input_to_pil_image(img)
         if not img:
@@ -1644,49 +1694,5 @@ class OCRSpace:
             img.close()
         return x
 
-    def _to_generic_result(self, api_result, img_width, img_height, og_img_width, og_img_height):
-        parsed_result = api_result['ParsedResults'][0]
-        text_overlay = parsed_result.get('TextOverlay', {})
-
-        image_props = ImageProperties(width=og_img_width, height=og_img_height)
-        ocr_result = OcrResult(image_properties=image_props)
-
-        lines_data = text_overlay.get('Lines', [])
-
-        lines = []
-        for line_data in lines_data:
-            words = []
-            for word_data in line_data.get('Words', []):
-                w_bbox = self._convert_bbox(word_data, img_width, img_height)
-                words.append(Word(text=word_data['WordText'], bounding_box=w_bbox))
-
-            l_bbox = merge_bounding_boxes(words)
-            lines.append(Line(bounding_box=l_bbox, words=words))
-
-        if lines:
-            p_bbox = merge_bounding_boxes(lines)
-            paragraph = Paragraph(bounding_box=p_bbox, lines=lines)
-            ocr_result.paragraphs = [paragraph]
-        else:
-            ocr_result.paragraphs = []
-
-        return ocr_result
-
-    def _convert_bbox(self, word_data, img_width, img_height):
-        left = word_data['Left'] / img_width
-        top = word_data['Top'] / img_height
-        width = word_data['Width'] / img_width
-        height = word_data['Height'] / img_height
-
-        center_x = left + width / 2
-        center_y = top + height / 2
-
-        return BoundingBox(
-            center_x=center_x,
-            center_y=center_y,
-            width=width,
-            height=height
-        )
-
     def _preprocess(self, img):
         return limit_image_size(img, self.max_byte_size)
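
Reviewer note (outside the diff): apart from _create_lines_from_google_paragraph,
every hunk above is a pure method reorder. The one behavioral change is that line
bounding boxes are now assigned after all of a paragraph's lines have been
collected, so a paragraph that yields exactly one line reuses Google's
paragraph-level bbox instead of a merge of its word boxes. A minimal standalone
sketch of that logic (assign_line_bboxes is a hypothetical name; Line,
BoundingBox and merge_bounding_boxes are the helpers already defined in
owocr/ocr.py):

    def assign_line_bboxes(lines, p_bbox):
        if len(lines) == 1:
            # Single-line paragraph: adopt the paragraph bbox wholesale.
            lines[0].bounding_box = p_bbox
        else:
            # Multi-line paragraph: derive each line's bbox from its words,
            # mirroring the merge_bounding_boxes(line.words, True) call above.
            for line in lines:
                line.bounding_box = merge_bounding_boxes(line.words, True)
        return lines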
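For reference, OCRSpace._convert_bbox (moved verbatim by the last two hunks) maps
the API's pixel-space Left/Top/Width/Height rectangle onto the normalized,
center-based BoundingBox used by the rest of the pipeline. A worked example with
made-up numbers, assuming an 800x600 input image:

    img_w, img_h = 800, 600
    # Hypothetical API rectangle: Left=100, Top=40, Width=200, Height=20
    left, top = 100 / img_w, 40 / img_h        # 0.125, ~0.0667
    width, height = 200 / img_w, 20 / img_h    # 0.25, ~0.0333
    center_x = left + width / 2                # 0.25
    center_y = top + height / 2                # ~0.0833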