Overhaul paragraph/line reordering (it's been 3,000 years...)

2025-11-22 16:47:00 +01:00
parent e34ac60a57
commit 1870c04574
2 changed files with 663 additions and 285 deletions
--- a/owocr/ocr.py
+++ b/owocr/ocr.py
@@ -100,6 +100,22 @@ class BoundingBox:
    height: float
    rotation_z: Optional[float] = None  # Optional rotation in radians

+    @property
+    def left(self) -> float:  # x coordinate of the left edge
+        return self.center_x - self.width / 2  # center minus half the width; rotation_z is not applied
+
+    @property
+    def right(self) -> float:  # x coordinate of the right edge
+        return self.center_x + self.width / 2  # center plus half the width; rotation_z is not applied
+
+    @property
+    def top(self) -> float:  # the smaller-y edge of the box
+        return self.center_y - self.height / 2  # center minus half the height; rotation_z is not applied
+
+    @property
+    def bottom(self) -> float:  # the larger-y edge of the box
+        return self.center_y + self.height / 2  # center plus half the height; rotation_z is not applied
+
@dataclass
 class Word:
    """Represents a single recognized word and its properties."""
@@ -127,15 +143,29 @@ class ImageProperties:
    width: int
    height: int

+@dataclass
+class EngineCapabilities:
+    """
+    Represents the features natively supported by the OCR engine.
+    """
+    words: bool  # engine reports word-level results
+    word_bounding_boxes: bool  # engine reports a bounding box per word
+    lines: bool  # engine reports line-level results
+    line_bounding_boxes: bool  # engine reports a bounding box per line
+    paragraphs: bool  # engine reports paragraph-level results
+    paragraph_bounding_boxes: bool  # engine reports a bounding box per paragraph
+
@dataclass
 class OcrResult:
    """The root object for a complete OCR analysis of an image."""
    image_properties: ImageProperties
+    engine_capabilities: EngineCapabilities  # required; engines pass their class-level `capabilities` here
    paragraphs: List[Paragraph] = field(default_factory=list)


 def initialize_manga_ocr(pretrained_model_name_or_path, force_cpu):
    def empty_post_process(text):
+        text = re.sub(r'\s+', '', text)  # drop every run of whitespace from the text
        return text

    global manga_ocr_model
@@ -386,6 +416,14 @@ class MangaOcrSegmented:
    manual_language = False
    coordinate_support = True
    threading_support = True
+    capabilities = EngineCapabilities(
+        words=False,
+        word_bounding_boxes=False,
+        lines=True,
+        line_bounding_boxes=True,
+        paragraphs=True,
+        paragraph_bounding_boxes=True
+    )

    def __init__(self, config={}):
        if 'manga_ocr' not in sys.modules:
@@ -537,7 +575,8 @@ class MangaOcrSegmented:

        return OcrResult(
            image_properties=ImageProperties(width=img_width, height=img_height),
-            paragraphs=paragraphs
+            paragraphs=paragraphs,
+            engine_capabilities=self.capabilities
        )

    def __call__(self, img):
@@ -566,6 +605,14 @@ class MangaOcr:
    manual_language = False
    coordinate_support = False
    threading_support = True
+    capabilities = EngineCapabilities(
+        words=False,
+        word_bounding_boxes=False,
+        lines=True,
+        line_bounding_boxes=False,
+        paragraphs=False,
+        paragraph_bounding_boxes=False
+    )

    def __init__(self, config={}):
        if 'manga_ocr' not in sys.modules:
@@ -598,6 +645,14 @@ class GoogleVision:
    manual_language = False
    coordinate_support = True
    threading_support = True
+    capabilities = EngineCapabilities(  # features natively supported by this engine
+        words=True,
+        word_bounding_boxes=True,
+        lines=True,
+        line_bounding_boxes=False,
+        paragraphs=True,
+        paragraph_bounding_boxes=True
+    )

    def __init__(self):
        if 'google.cloud' not in sys.modules:
@@ -698,7 +753,8 @@ class GoogleVision:

        return OcrResult(
            image_properties=ImageProperties(width=img_width, height=img_height),
-            paragraphs=paragraphs
+            paragraphs=paragraphs,
+            engine_capabilities=self.capabilities
        )

    def __call__(self, img):
@@ -736,6 +792,14 @@ class GoogleLens:
    manual_language = False
    coordinate_support = True
    threading_support = True
+    capabilities = EngineCapabilities(
+        words=True,
+        word_bounding_boxes=True,
+        lines=True,
+        line_bounding_boxes=True,
+        paragraphs=True,
+        paragraph_bounding_boxes=True
+    )

    def __init__(self):
        if 'betterproto' not in sys.modules:
@@ -797,7 +861,8 @@ class GoogleLens:

        return OcrResult(
            image_properties=ImageProperties(width=img_width, height=img_height),
-            paragraphs=paragraphs
+            paragraphs=paragraphs,
+            engine_capabilities=self.capabilities
        )

    def __call__(self, img):
@@ -885,6 +950,14 @@ class Bing:
    manual_language = False
    coordinate_support = True
    threading_support = True
+    capabilities = EngineCapabilities(
+        words=True,
+        word_bounding_boxes=True,
+        lines=True,
+        line_bounding_boxes=True,
+        paragraphs=True,
+        paragraph_bounding_boxes=True
+    )

    def __init__(self):
        self.requests_session = requests.Session()
@@ -941,7 +1014,8 @@ class Bing:

        return OcrResult(
            image_properties=ImageProperties(width=og_img_width, height=og_img_height),
-            paragraphs=paragraphs
+            paragraphs=paragraphs,
+            engine_capabilities=self.capabilities
        )

    def __call__(self, img):
@@ -1058,6 +1132,14 @@ class AppleVision:
    manual_language = True
    coordinate_support = True
    threading_support = True
+    capabilities = EngineCapabilities(
+        words=False,
+        word_bounding_boxes=False,
+        lines=True,
+        line_bounding_boxes=True,
+        paragraphs=False,
+        paragraph_bounding_boxes=False
+    )

    def __init__(self, language='ja', config={}):
        if sys.platform != 'darwin':
@@ -1105,7 +1187,8 @@ class AppleVision:

        return OcrResult(
            image_properties=ImageProperties(width=img_width, height=img_height),
-            paragraphs=paragraphs
+            paragraphs=paragraphs,
+            engine_capabilities=self.capabilities
        )

    def __call__(self, img):
@@ -1150,6 +1233,14 @@ class AppleLiveText:
    manual_language = True
    coordinate_support = True
    threading_support = False
+    capabilities = EngineCapabilities(
+        words=True,
+        word_bounding_boxes=True,
+        lines=True,
+        line_bounding_boxes=True,
+        paragraphs=False,
+        paragraph_bounding_boxes=False
+    )

    def __init__(self, language='ja'):
        if sys.platform != 'darwin':
@@ -1212,7 +1303,8 @@ class AppleLiveText:

        ocr_result = OcrResult(
            image_properties=ImageProperties(width=img.width, height=img.height),
-            paragraphs=self.result
+            paragraphs=self.result,
+            engine_capabilities=self.capabilities
        )
        x = (True, ocr_result)

@@ -1278,6 +1370,14 @@ class WinRTOCR:
    manual_language = True
    coordinate_support = True
    threading_support = True
+    capabilities = EngineCapabilities(
+        words=True,
+        word_bounding_boxes=True,
+        lines=True,
+        line_bounding_boxes=False,
+        paragraphs=False,
+        paragraph_bounding_boxes=False
+    )

    def __init__(self, config={}, language='ja'):
        if sys.platform == 'win32':
@@ -1343,7 +1443,8 @@ class WinRTOCR:

        return OcrResult(
            image_properties=ImageProperties(width=img_width, height=img_height),
-            paragraphs=paragraphs
+            paragraphs=paragraphs,
+            engine_capabilities=self.capabilities
        )

    def __call__(self, img):
@@ -1387,6 +1488,14 @@ class OneOCR:
    manual_language = False
    coordinate_support = True
    threading_support = True
+    capabilities = EngineCapabilities(
+        words=True,
+        word_bounding_boxes=True,
+        lines=True,
+        line_bounding_boxes=True,
+        paragraphs=False,
+        paragraph_bounding_boxes=False
+    )

    def __init__(self, config={}):
        if sys.platform == 'win32':
@@ -1446,7 +1555,8 @@ class OneOCR:

        return OcrResult(
            image_properties=ImageProperties(width=og_img_width, height=og_img_height),
-            paragraphs=paragraphs
+            paragraphs=paragraphs,
+            engine_capabilities=self.capabilities
        )

    def __call__(self, img):
@@ -1517,6 +1627,14 @@ class AzureImageAnalysis:
    manual_language = False
    coordinate_support = True
    threading_support = True
+    capabilities = EngineCapabilities(
+        words=True,
+        word_bounding_boxes=True,
+        lines=True,
+        line_bounding_boxes=True,
+        paragraphs=False,
+        paragraph_bounding_boxes=False
+    )

    def __init__(self, config={}):
        if 'azure.ai.vision.imageanalysis' not in sys.modules:
@@ -1569,7 +1687,8 @@ class AzureImageAnalysis:

        return OcrResult(
            image_properties=ImageProperties(width=img_width, height=img_height),
-            paragraphs=paragraphs
+            paragraphs=paragraphs,
+            engine_capabilities=self.capabilities
        )

    def __call__(self, img):
@@ -1619,6 +1738,14 @@ class EasyOCR:
    manual_language = True
    coordinate_support = True
    threading_support = True
+    capabilities = EngineCapabilities(
+        words=False,
+        word_bounding_boxes=False,
+        lines=True,
+        line_bounding_boxes=True,
+        paragraphs=False,
+        paragraph_bounding_boxes=False
+    )

    def __init__(self, config={}, language='ja'):
        if 'easyocr' not in sys.modules:
@@ -1660,7 +1787,8 @@ class EasyOCR:

        return OcrResult(
            image_properties=ImageProperties(width=img_width, height=img_height),
-            paragraphs=paragraphs
+            paragraphs=paragraphs,
+            engine_capabilities=self.capabilities
        )

    def __call__(self, img):
@@ -1689,6 +1817,14 @@ class RapidOCR:
    manual_language = True
    coordinate_support = True
    threading_support = True
+    capabilities = EngineCapabilities(
+        words=False,
+        word_bounding_boxes=False,
+        lines=True,
+        line_bounding_boxes=True,
+        paragraphs=False,
+        paragraph_bounding_boxes=False
+    )

    def __init__(self, config={}, language='ja'):
        if 'rapidocr' not in sys.modules:
@@ -1756,7 +1892,8 @@ class RapidOCR:

        return OcrResult(
            image_properties=ImageProperties(width=img_width, height=img_height),
-            paragraphs=paragraphs
+            paragraphs=paragraphs,
+            engine_capabilities=self.capabilities
        )

    def __call__(self, img):
@@ -1785,6 +1922,14 @@ class OCRSpace:
    manual_language = True
    coordinate_support = True
    threading_support = True
+    capabilities = EngineCapabilities(
+        words=True,
+        word_bounding_boxes=True,
+        lines=True,
+        line_bounding_boxes=False,
+        paragraphs=False,
+        paragraph_bounding_boxes=False
+    )

    def __init__(self, config={}, language='ja'):
        try:
@@ -1855,7 +2000,8 @@ class OCRSpace:

        return OcrResult(
            image_properties=ImageProperties(width=og_img_width, height=og_img_height),
-            paragraphs=paragraphs
+            paragraphs=paragraphs,
+            engine_capabilities=self.capabilities
        )

    def __call__(self, img):
--- a/owocr/run.py
+++ b/owocr/run.py