Remove more duplication/fix rotation for microsoft stuff

2025-10-15 06:38:20 +02:00
parent e88ee3006c
commit c21bdef848
1 changed files with 108 additions and 75 deletions
--- a/owocr/ocr.py
+++ b/owocr/ocr.py
@@ -191,12 +191,78 @@ def limit_image_size(img, max_size):

    return False, '', (None, None)

+def quad_to_bounding_box(x1, y1, x2, y2, x3, y3, x4, y4, img_width=None, img_height=None):
+    center_x = (x1 + x2 + x3 + x4) / 4
+    center_y = (y1 + y2 + y3 + y4) / 4
+
+    # Calculate widths using Euclidean distance
+    width1 = sqrt((x2 - x1)**2 + (y2 - y1)**2)
+    width2 = sqrt((x3 - x4)**2 + (y3 - y4)**2)
+    avg_width = (width1 + width2) / 2
+
+    # Calculate heights using Euclidean distance
+    height1 = sqrt((x4 - x1)**2 + (y4 - y1)**2)
+    height2 = sqrt((x3 - x2)**2 + (y3 - y2)**2)
+    avg_height = (height1 + height2) / 2
+
+    # Calculate rotation angle from the first edge
+    dx = x2 - x1
+    dy = y2 - y1
+    angle = atan2(dy, dx)
+
+    if img_width and img_height:
+        center_x = center_x / img_width
+        center_y = center_y / img_height
+        avg_width = avg_width / img_width
+        avg_height = avg_height / img_height
+
+    return BoundingBox(
+        center_x=center_x,
+        center_y=center_y,
+        width=avg_width,
+        height=avg_height,
+        rotation_z=angle
+    )
+
 def merge_bounding_boxes(ocr_element_list):
-    all_bboxes = [e.bounding_box for e in ocr_element_list]
-    min_x = min(b.center_x - b.width / 2 for b in all_bboxes)
-    max_x = max(b.center_x + b.width / 2 for b in all_bboxes)
-    min_y = min(b.center_y - b.height / 2 for b in all_bboxes)
-    max_y = max(b.center_y + b.height / 2 for b in all_bboxes)
+    all_corners = []
+
+    for element in ocr_element_list:
+        bbox = element.bounding_box
+        angle = bbox.rotation_z
+        hw = bbox.width / 2
+        hh = bbox.height / 2
+
+        if not angle:
+            corners = [
+                (bbox.center_x - hw, bbox.center_y - hh),  # Top-left
+                (bbox.center_x + hw, bbox.center_y - hh),  # Top-right  
+                (bbox.center_x + hw, bbox.center_y + hh),  # Bottom-right
+                (bbox.center_x - hw, bbox.center_y + hh)   # Bottom-left
+            ]
+            all_corners.extend(corners)
+        else:
+            local_corners = [
+                (-hw, -hh),  # Top-left
+                ( hw, -hh),  # Top-right
+                ( hw,  hh),  # Bottom-right
+                (-hw,  hh)   # Bottom-left
+            ]
+
+            # Rotate and translate corners
+            cos_angle = cos(angle)
+            sin_angle = sin(angle)
+
+            for x_local, y_local in local_corners:
+                x_rotated = x_local * cos_angle - y_local * sin_angle
+                y_rotated = x_local * sin_angle + y_local * cos_angle
+                x_global = bbox.center_x + x_rotated
+                y_global = bbox.center_y + y_rotated
+                all_corners.append((x_global, y_global))
+
+    xs, ys = zip(*all_corners)
+    min_x, max_x = min(xs), max(xs)
+    min_y, max_y = min(ys), max(ys)

    return BoundingBox(
        center_x=(min_x + max_x) / 2,
@@ -205,7 +271,6 @@ def merge_bounding_boxes(ocr_element_list):
        height=max_y - min_y
    )

-
 class MangaOcr:
    name = 'mangaocr'
    readable_name = 'Manga OCR'
@@ -549,19 +614,13 @@ class Bing:
        self.available = True
        logger.info('Bing ready')

-    def _quad_to_center_bbox(self, quad):
-        center_x = (quad['topLeft']['x'] + quad['topRight']['x'] + quad['bottomRight']['x'] + quad['bottomLeft']['x']) / 4
-        center_y = (quad['topLeft']['y'] + quad['topRight']['y'] + quad['bottomRight']['y'] + quad['bottomLeft']['y']) / 4
-
-        width1 = sqrt((quad['topRight']['x'] - quad['topLeft']['x'])**2 + (quad['topRight']['y'] - quad['topLeft']['y'])**2)
-        width2 = sqrt((quad['bottomRight']['x'] - quad['bottomLeft']['x'])**2 + (quad['bottomRight']['y'] - quad['bottomLeft']['y'])**2)
-        avg_width = (width1 + width2) / 2
-
-        height1 = sqrt((quad['bottomLeft']['x'] - quad['topLeft']['x'])**2 + (quad['bottomLeft']['y'] - quad['topLeft']['y'])**2)
-        height2 = sqrt((quad['bottomRight']['x'] - quad['topRight']['x'])**2 + (quad['bottomRight']['y'] - quad['topRight']['y'])**2)
-        avg_height = (height1 + height2) / 2
-
-        return BoundingBox(center_x=center_x, center_y=center_y, width=avg_width, height=avg_height)
+    def _convert_bbox(self, quad):
+        return quad_to_bounding_box(
+            quad['topLeft']['x'], quad['topLeft']['y'],
+            quad['topRight']['x'], quad['topRight']['y'],
+            quad['bottomRight']['x'], quad['bottomRight']['y'],
+            quad['bottomLeft']['x'], quad['bottomLeft']['y']
+        )

    def _to_generic_result(self, response, img_width, img_height, og_img_width, og_img_height):
        paragraphs = []
@@ -586,19 +645,19 @@ class Bing:
                        for w in l.get('words', []):
                            word = Word(
                                text=w.get('text', ''),
-                                bounding_box=self._quad_to_center_bbox(w['boundingBox'])
+                                bounding_box=self._convert_bbox(w['boundingBox'])
                            )
                            words.append(word)

                        line = Line(
                            text=l.get('text', ''),
-                            bounding_box=self._quad_to_center_bbox(l['boundingBox']),
+                            bounding_box=self._convert_bbox(l['boundingBox']),
                            words=words
                        )
                        lines.append(line)

                    paragraph = Paragraph(
-                        bounding_box=self._quad_to_center_bbox(p['boundingBox']),
+                        bounding_box=self._convert_bbox(p['boundingBox']),
                        lines=lines
                    )
                    paragraphs.append(paragraph)
@@ -870,11 +929,11 @@ class AppleLiveText:
        if self.result == None:
            return (False, 'Unknown error!')

-        ocr_response = OcrResult(
+        ocr_result = OcrResult(
            image_properties=ImageProperties(width=img.width, height=img.height),
            paragraphs=self.result
        )
-        x = (True, ocr_response)
+        x = (True, ocr_result)

        if is_path:
            img.close()
@@ -1070,21 +1129,13 @@ class OneOCR:
            except:
                logger.warning('Error reading URL from config, OneOCR will not work!')

-    def _pixel_quad_to_center_bbox(self, rect, img_width, img_height):
-        x_coords = [rect['x1'], rect['x2'], rect['x3'], rect['x4']]
-        y_coords = [rect['y1'], rect['y2'], rect['y3'], rect['y4']]
-
-        center_x_px = sum(x_coords) / 4
-        center_y_px = sum(y_coords) / 4
-
-        width_px = (abs(rect['x2'] - rect['x1']) + abs(rect['x3'] - rect['x4'])) / 2
-        height_px = (abs(rect['y4'] - rect['y1']) + abs(rect['y3'] - rect['y2'])) / 2
-
-        return BoundingBox(
-            center_x=center_x_px / img_width,
-            center_y=center_y_px / img_height,
-            width=width_px / img_width,
-            height=height_px / img_height
+    def _convert_bbox(self, rect, img_width, img_height):
+        return quad_to_bounding_box(
+            rect['x1'], rect['y1'], 
+            rect['x2'], rect['y2'], 
+            rect['x3'], rect['y3'], 
+            rect['x4'], rect['y4'],
+            img_width, img_height
        )

    def _to_generic_result(self, response, img_width, img_height, og_img_width, og_img_height):
@@ -1094,13 +1145,13 @@ class OneOCR:
            for i, w in enumerate(l.get('words', [])):
                word = Word(
                    text=w.get('text', ''),
-                    bounding_box=self._pixel_quad_to_center_bbox(w['bounding_rect'], img_width, img_height)
+                    bounding_box=self._convert_bbox(w['bounding_rect'], img_width, img_height)
                )
                words.append(word)

            line = Line(
                text=l.get('text', ''),
-                bounding_box=self._pixel_quad_to_center_bbox(l['bounding_rect'], img_width, img_height),
+                bounding_box=self._convert_bbox(l['bounding_rect'], img_width, img_height),
                words=words
            )
            lines.append(line)
@@ -1147,8 +1198,8 @@ class OneOCR:
        if 'error' in raw_res:
            return (False, raw_res['error'])

-        ocr_response = self._to_generic_result(raw_res, img_width, img_height, img.width, img.height)
-        x = (True, ocr_response)
+        ocr_result = self._to_generic_result(raw_res, img_width, img_height, img.width, img.height)
+        x = (True, ocr_result)

        if is_path:
            img.close()
@@ -1262,22 +1313,13 @@ class EasyOCR:
            self.available = True
            logger.info('EasyOCR ready')

-    def _pixel_quad_to_center_bbox(self, rect, img_width, img_height):
-        x_coords = [float(point[0]) for point in rect]
-        y_coords = [float(point[1]) for point in rect]
+    def _convert_bbox(self, rect, img_width, img_height):
+        x1, y1 = float(rect[0][0]), float(rect[0][1])
+        x2, y2 = float(rect[1][0]), float(rect[1][1])
+        x3, y3 = float(rect[2][0]), float(rect[2][1])
+        x4, y4 = float(rect[3][0]), float(rect[3][1])

-        center_x_px = sum(x_coords) / 4
-        center_y_px = sum(y_coords) / 4
-
-        width_px = (abs(float(rect[1][0]) - float(rect[0][0])) + abs(float(rect[2][0]) - float(rect[3][0]))) / 2
-        height_px = (abs(float(rect[3][1]) - float(rect[0][1])) + abs(float(rect[2][1]) - float(rect[1][1]))) / 2
-
-        return BoundingBox(
-            center_x=center_x_px / img_width,
-            center_y=center_y_px / img_height,
-            width=width_px / img_width,
-            height=height_px / img_height
-        )
+        return quad_to_bounding_box(x1, y1, x2, y2, x3, y3, x4, y4, img_width, img_height)

    def _to_generic_result(self, response, img_width, img_height):
        lines = []
@@ -1286,7 +1328,7 @@ class EasyOCR:
            quad_coords = detection[0]
            text = detection[1]

-            bbox = self._pixel_quad_to_center_bbox(quad_coords, img_width, img_height)
+            bbox = self._convert_bbox(quad_coords, img_width, img_height)
            word = Word(text=text, bounding_box=bbox)
            line = Line(bounding_box=bbox, words=[word], text=text)
            lines.append(line)
@@ -1365,22 +1407,13 @@ class RapidOCR:
        else:
            return LangRec.LATIN

-    def _pixel_quad_to_center_bbox(self, rect, img_width, img_height):
-        x_coords = [float(point[0]) for point in rect]
-        y_coords = [float(point[1]) for point in rect]
+    def _convert_bbox(self, rect, img_width, img_height):
+        x1, y1 = float(rect[0][0]), float(rect[0][1])
+        x2, y2 = float(rect[1][0]), float(rect[1][1])
+        x3, y3 = float(rect[2][0]), float(rect[2][1])
+        x4, y4 = float(rect[3][0]), float(rect[3][1])

-        center_x_px = sum(x_coords) / 4
-        center_y_px = sum(y_coords) / 4
-
-        width_px = (abs(float(rect[1][0]) - float(rect[0][0])) + abs(float(rect[2][0]) - float(rect[3][0]))) / 2
-        height_px = (abs(float(rect[3][1]) - float(rect[0][1])) + abs(float(rect[2][1]) - float(rect[1][1]))) / 2
-
-        return BoundingBox(
-            center_x=center_x_px / img_width,
-            center_y=center_y_px / img_height,
-            width=width_px / img_width,
-            height=height_px / img_height
-        )
+        return quad_to_bounding_box(x1, y1, x2, y2, x3, y3, x4, y4, img_width, img_height)

    def _to_generic_result(self, response, img_width, img_height):
        lines = []
@@ -1388,7 +1421,7 @@ class RapidOCR:
        for i in range(len(response.boxes)):
            box = response.boxes[i]
            text = response.txts[i]
-            bbox = self._pixel_quad_to_center_bbox(box, img_width, img_height)
+            bbox = self._convert_bbox(box, img_width, img_height)
            word = Word(text=text, bounding_box=bbox)
            line = Line(bounding_box=bbox, words=[word], text=text)
            lines.append(line)