Add new JSON (text + coordinates) output format for Bing, Google Lens and OneOCR as a proof of concept

rtr46
2025-08-16 08:19:18 +02:00
parent db5d4bc023
commit 54042163ea
3 changed files with 309 additions and 56 deletions


@@ -5,10 +5,12 @@ from pathlib import Path
import sys
import platform
import logging
from math import sqrt
from math import sqrt, sin, cos, atan2
import json
import base64
from urllib.parse import urlparse, parse_qs
from dataclasses import dataclass, field, asdict
from typing import List, Optional
import jaconv
import numpy as np
@@ -83,6 +85,50 @@ try:
except:
    optimized_png_encode = False


@dataclass
class BoundingBox:
    """
    Represents the normalized coordinates of a detected element.
    All values are floats between 0.0 and 1.0.
    """
    center_x: float
    center_y: float
    width: float
    height: float
    rotation_z: Optional[float] = None  # Optional rotation in radians


@dataclass
class Word:
    """Represents a single recognized word and its properties."""
    text: str
    bounding_box: BoundingBox
    separator: Optional[str] = None  # The character(s) that follow the word, e.g., a space


@dataclass
class Line:
    """Represents a single line of text, composed of words."""
    bounding_box: BoundingBox
    words: List[Word] = field(default_factory=list)


@dataclass
class Paragraph:
    """Represents a block of text, composed of lines."""
    bounding_box: BoundingBox
    lines: List[Line] = field(default_factory=list)
    writing_direction: Optional[str] = None  # Optional: e.g., "LEFT_TO_RIGHT"


@dataclass
class ImageProperties:
    """Stores the original dimensions of the processed image."""
    width: int
    height: int


@dataclass
class OcrResult:
    """The root object for a complete OCR analysis of an image."""
    image_properties: ImageProperties
    paragraphs: List[Paragraph] = field(default_factory=list)


def empty_post_process(text):
    return text
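Since asdict and json are imported above, the new JSON (text + coordinates) output can presumably be produced by serializing an OcrResult directly. A minimal sketch of that usage (assumed, not shown in this diff), with made-up values for a single recognized word:

    # Assumed usage: dataclasses.asdict converts the nested dataclasses to plain dicts,
    # which json.dumps can then emit as the text+coordinates JSON format.
    result = OcrResult(
        image_properties=ImageProperties(width=1000, height=500),
        paragraphs=[Paragraph(
            bounding_box=BoundingBox(center_x=0.5, center_y=0.5, width=0.8, height=0.4),
            lines=[Line(
                bounding_box=BoundingBox(center_x=0.5, center_y=0.5, width=0.8, height=0.4),
                words=[Word(text='例', separator=None,
                            bounding_box=BoundingBox(center_x=0.5, center_y=0.5, width=0.1, height=0.4))]
            )]
        )]
    )
    print(json.dumps(asdict(result), ensure_ascii=False))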
@@ -243,6 +289,62 @@ class GoogleLens:
        self.available = True
        logger.info('Google Lens ready')

    def _to_generic_result(self, response, img_width, img_height):
        paragraphs = []

        if 'objects_response' in response and 'text' in response['objects_response']:
            text_data = response['objects_response']['text']
            if 'text_layout' in text_data:
                for p in text_data['text_layout'].get('paragraphs', []):
                    lines = []
                    for l in p.get('lines', []):
                        words = []
                        for w in l.get('words', []):
                            w_bbox = w.get('geometry', {}).get('bounding_box', {})
                            word = Word(
                                text=w.get('plain_text', ''),
                                separator=w.get('text_separator'),
                                bounding_box=BoundingBox(
                                    center_x=w_bbox.get('center_x'),
                                    center_y=w_bbox.get('center_y'),
                                    width=w_bbox.get('width'),
                                    height=w_bbox.get('height'),
                                    rotation_z=w_bbox.get('rotation_z')
                                )
                            )
                            words.append(word)

                        l_bbox = l.get('geometry', {}).get('bounding_box', {})
                        line = Line(
                            bounding_box=BoundingBox(
                                center_x=l_bbox.get('center_x'),
                                center_y=l_bbox.get('center_y'),
                                width=l_bbox.get('width'),
                                height=l_bbox.get('height'),
                                rotation_z=l_bbox.get('rotation_z')
                            ),
                            words=words
                        )
                        lines.append(line)

                    p_bbox = p.get('geometry', {}).get('bounding_box', {})
                    paragraph = Paragraph(
                        bounding_box=BoundingBox(
                            center_x=p_bbox.get('center_x'),
                            center_y=p_bbox.get('center_y'),
                            width=p_bbox.get('width'),
                            height=p_bbox.get('height'),
                            rotation_z=p_bbox.get('rotation_z')
                        ),
                        lines=lines,
                        writing_direction=p.get('writing_direction')
                    )
                    paragraphs.append(paragraph)

        return OcrResult(
            image_properties=ImageProperties(width=img_width, height=img_height),
            paragraphs=paragraphs
        )

    def __call__(self, img):
        img, is_path = input_to_pil_image(img)
        if not img:
@@ -272,7 +374,7 @@ class GoogleLens:
        image_data = self._preprocess(img)
        request.objects_request.image_data.payload.image_bytes = image_data[0]
        request.objects_request.image_data.image_metadata.width = image_data[1]
        request.objects_request.image_data.image_metadata.height = image_data[2]
        payload = request.SerializeToString()
@@ -302,17 +404,8 @@ class GoogleLens:
        response_proto = LensOverlayServerResponse().FromString(res.content)
        response_dict = response_proto.to_dict(betterproto.Casing.SNAKE)

        res = ''
        text = response_dict['objects_response']['text']
        if 'text_layout' in text:
            paragraphs = text['text_layout']['paragraphs']
            for paragraph in paragraphs:
                for line in paragraph['lines']:
                    for word in line['words']:
                        res += word['plain_text'] + word['text_separator']
                res += '\n'
        x = (True, res)
        ocr_result = self._to_generic_result(response_dict, img.width, img.height)
        x = (True, ocr_result)

        if is_path:
            img.close()
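With this change the Google Lens handler (and, below, Bing and OneOCR) returns a structured OcrResult instead of the concatenated string the removed loop used to build. A consumer that still needs plain text could flatten the result by walking paragraphs, lines and words and honouring the stored separators, roughly like this (flatten_to_text is a hypothetical helper, not part of this commit; it adds one newline per paragraph on top of the word-level separators):

    def flatten_to_text(result: OcrResult) -> str:
        # Concatenate each word with whatever separator the engine reported,
        # then break after every paragraph.
        out = ''
        for paragraph in result.paragraphs:
            for line in paragraph.lines:
                for word in line.words:
                    out += word.text + (word.separator or '')
            out += '\n'
        return out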
@@ -433,6 +526,69 @@ class Bing:
        self.available = True
        logger.info('Bing ready')

    def _quad_to_center_bbox(self, quad):
        center_x = (quad['topLeft']['x'] + quad['topRight']['x'] + quad['bottomRight']['x'] + quad['bottomLeft']['x']) / 4
        center_y = (quad['topLeft']['y'] + quad['topRight']['y'] + quad['bottomRight']['y'] + quad['bottomLeft']['y']) / 4

        width1 = sqrt((quad['topRight']['x'] - quad['topLeft']['x'])**2 + (quad['topRight']['y'] - quad['topLeft']['y'])**2)
        width2 = sqrt((quad['bottomRight']['x'] - quad['bottomLeft']['x'])**2 + (quad['bottomRight']['y'] - quad['bottomLeft']['y'])**2)
        avg_width = (width1 + width2) / 2

        height1 = sqrt((quad['bottomLeft']['x'] - quad['topLeft']['x'])**2 + (quad['bottomLeft']['y'] - quad['topLeft']['y'])**2)
        height2 = sqrt((quad['bottomRight']['x'] - quad['topRight']['x'])**2 + (quad['bottomRight']['y'] - quad['topRight']['y'])**2)
        avg_height = (height1 + height2) / 2

        return BoundingBox(center_x=center_x, center_y=center_y, width=avg_width, height=avg_height)
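This helper averages the edge lengths but leaves rotation_z unset; the sin, cos and atan2 added to the math imports are not used in the hunks shown here. If the quad's orientation were wanted, one way to derive it from the method's quad argument (an assumption, not code from this commit) is the angle of the top edge:

    # Assumed sketch: rotation of the quad's top edge in radians, suitable for rotation_z.
    rotation_z = atan2(
        quad['topRight']['y'] - quad['topLeft']['y'],
        quad['topRight']['x'] - quad['topLeft']['x']
    )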
    def _to_generic_result(self, response, img_width, img_height):
        paragraphs = []

        text_tag = None
        for tag in response.get('tags', []):
            if tag.get('displayName') == '##TextRecognition':
                text_tag = tag
                break

        if text_tag:
            text_action = None
            for action in text_tag.get('actions', []):
                if action.get('_type') == 'ImageKnowledge/TextRecognitionAction':
                    text_action = action
                    break

            if text_action:
                for p in text_action.get('data', {}).get('regions', []):
                    lines = []
                    for l in p.get('lines', []):
                        words = []
                        for w in l.get('words', []):
                            word = Word(
                                text=w.get('text', ''),
                                bounding_box=self._quad_to_center_bbox(w['boundingBox']),
                                separator=" "
                            )
                            words.append(word)

                        line = Line(
                            bounding_box=self._quad_to_center_bbox(l['boundingBox']),
                            words=words
                        )
                        lines.append(line)

                    # Bing doesn't provide paragraph-level separators, so we add a newline
                    if lines and lines[-1].words:
                        lines[-1].words[-1].separator = '\n'

                    paragraph = Paragraph(
                        bounding_box=self._quad_to_center_bbox(p['boundingBox']),
                        lines=lines
                    )
                    paragraphs.append(paragraph)

        return OcrResult(
            image_properties=ImageProperties(width=img_width, height=img_height),
            paragraphs=paragraphs
        )

    def __call__(self, img):
        img, is_path = input_to_pil_image(img)
        if not img:
@@ -510,26 +666,9 @@ class Bing:
            return (False, 'Unknown error!')

        data = res.json()

        res = ''
        text_tag = None
        for tag in data['tags']:
            if tag.get('displayName') == '##TextRecognition':
                text_tag = tag
                break
        if text_tag:
            text_action = None
            for action in text_tag['actions']:
                if action.get('_type') == 'ImageKnowledge/TextRecognitionAction':
                    text_action = action
                    break
            if text_action:
                regions = text_action['data'].get('regions', [])
                for region in regions:
                    for line in region.get('lines', []):
                        res += line['text'] + '\n'
        x = (True, res)
        ocr_result = self._to_generic_result(data, img.width, img.height)
        x = (True, ocr_result)

        if is_path:
            img.close()
@@ -763,6 +902,67 @@ class OneOCR:
            except:
                logger.warning('Error reading URL from config, OneOCR will not work!')

    def _pixel_quad_to_center_bbox(self, rect, img_width, img_height):
        x_coords = [rect['x1'], rect['x2'], rect['x3'], rect['x4']]
        y_coords = [rect['y1'], rect['y2'], rect['y3'], rect['y4']]

        center_x_px = sum(x_coords) / 4
        center_y_px = sum(y_coords) / 4

        width_px = (abs(rect['x2'] - rect['x1']) + abs(rect['x3'] - rect['x4'])) / 2
        height_px = (abs(rect['y4'] - rect['y1']) + abs(rect['y3'] - rect['y2'])) / 2

        return BoundingBox(
            center_x=center_x_px / img_width,
            center_y=center_y_px / img_height,
            width=width_px / img_width,
            height=height_px / img_height
        )
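As a quick sanity check with hypothetical values (not from the diff): a word whose bounding_rect corners are (100, 40), (300, 40), (300, 90) and (100, 90) in a 1000x500 px image yields BoundingBox(center_x=0.2, center_y=0.13, width=0.2, height=0.1).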
    def _to_generic_result(self, response, img_width, img_height):
        lines = []
        for l in response.get('lines', []):
            words = []
            for i, w in enumerate(l.get('words', [])):
                separator = " " if i < len(l.get('words', [])) - 1 else None
                word = Word(
                    text=w.get('text', ''),
                    separator=separator,
                    bounding_box=self._pixel_quad_to_center_bbox(w['bounding_rect'], img_width, img_height)
                )
                words.append(word)

            line = Line(
                bounding_box=self._pixel_quad_to_center_bbox(l['bounding_rect'], img_width, img_height),
                words=words
            )
            lines.append(line)

        # Create a single paragraph to hold all lines
        if lines:
            # Approximate paragraph bbox by combining all line bboxes
            all_line_bboxes = [l.bounding_box for l in lines]
            min_x = min(b.center_x - b.width / 2 for b in all_line_bboxes)
            max_x = max(b.center_x + b.width / 2 for b in all_line_bboxes)
            min_y = min(b.center_y - b.height / 2 for b in all_line_bboxes)
            max_y = max(b.center_y + b.height / 2 for b in all_line_bboxes)

            p_bbox = BoundingBox(
                center_x=(min_x + max_x) / 2,
                center_y=(min_y + max_y) / 2,
                width=max_x - min_x,
                height=max_y - min_y
            )
            paragraph = Paragraph(bounding_box=p_bbox, lines=lines)
            paragraphs = [paragraph]
        else:
            paragraphs = []

        return OcrResult(
            image_properties=ImageProperties(width=img_width, height=img_height),
            paragraphs=paragraphs
        )

    def __call__(self, img):
        img, is_path = input_to_pil_image(img)
        if not img:
@@ -770,7 +970,7 @@ class OneOCR:
        if sys.platform == 'win32':
            try:
                res = self.model.recognize_pil(img)['text']
                raw_res = self.model.recognize_pil(img)
            except RuntimeError as e:
                return (False, e)
        else:
@@ -784,9 +984,10 @@ class OneOCR:
            if res.status_code != 200:
                return (False, 'Unknown error!')

            res = res.json()['text']
            raw_res = res.json()

        x = (True, res)
        ocr_response = self._to_generic_result(raw_res, img.width, img.height)
        x = (True, ocr_response)

        if is_path:
            img.close()