Remove more duplication/fix rotation for microsoft stuff

2025-10-15 06:38:20 +02:00
parent e88ee3006c
commit c21bdef848
1 changed files with 108 additions and 75 deletions
--- a/owocr/ocr.py
+++ b/owocr/ocr.py
@@ -191,12 +191,78 @@ def limit_image_size(img, max_size):
    return False, '', (None, None)
 def quad_to_bounding_box(x1, y1, x2, y2, x3, y3, x4, y4, img_width=None, img_height=None):
    center_x = (x1 + x2 + x3 + x4) / 4
    center_y = (y1 + y2 + y3 + y4) / 4
    # Calculate widths using Euclidean distance
    width1 = sqrt((x2 - x1)**2 + (y2 - y1)**2)
    width2 = sqrt((x3 - x4)**2 + (y3 - y4)**2)
    avg_width = (width1 + width2) / 2
    # Calculate heights using Euclidean distance
    height1 = sqrt((x4 - x1)**2 + (y4 - y1)**2)
    height2 = sqrt((x3 - x2)**2 + (y3 - y2)**2)
    avg_height = (height1 + height2) / 2
    # Calculate rotation angle from the first edge
    dx = x2 - x1
    dy = y2 - y1
    angle = atan2(dy, dx)
    if img_width and img_height:
        center_x = center_x / img_width
        center_y = center_y / img_height
        avg_width = avg_width / img_width
        avg_height = avg_height / img_height
    return BoundingBox(
        center_x=center_x,
        center_y=center_y,
        width=avg_width,
        height=avg_height,
        rotation_z=angle
    )
 def merge_bounding_boxes(ocr_element_list):
-    all_bboxes = [e.bounding_box for e in ocr_element_list]
+    all_corners = []
-    min_x = min(b.center_x - b.width / 2 for b in all_bboxes)
+
-    max_x = max(b.center_x + b.width / 2 for b in all_bboxes)
+    for element in ocr_element_list:
-    min_y = min(b.center_y - b.height / 2 for b in all_bboxes)
+        bbox = element.bounding_box
-    max_y = max(b.center_y + b.height / 2 for b in all_bboxes)
+        angle = bbox.rotation_z
        hw = bbox.width / 2
        hh = bbox.height / 2
        if not angle:
            corners = [
                (bbox.center_x - hw, bbox.center_y - hh),  # Top-left
                (bbox.center_x + hw, bbox.center_y - hh),  # Top-right  
                (bbox.center_x + hw, bbox.center_y + hh),  # Bottom-right
                (bbox.center_x - hw, bbox.center_y + hh)   # Bottom-left
            ]
            all_corners.extend(corners)
        else:
            local_corners = [
                (-hw, -hh),  # Top-left
                ( hw, -hh),  # Top-right
                ( hw,  hh),  # Bottom-right
                (-hw,  hh)   # Bottom-left
            ]
            # Rotate and translate corners
            cos_angle = cos(angle)
            sin_angle = sin(angle)
            for x_local, y_local in local_corners:
                x_rotated = x_local * cos_angle - y_local * sin_angle
                y_rotated = x_local * sin_angle + y_local * cos_angle
                x_global = bbox.center_x + x_rotated
                y_global = bbox.center_y + y_rotated
                all_corners.append((x_global, y_global))
    xs, ys = zip(*all_corners)
    min_x, max_x = min(xs), max(xs)
    min_y, max_y = min(ys), max(ys)
    return BoundingBox(
        center_x=(min_x + max_x) / 2,
@@ -205,7 +271,6 @@ def merge_bounding_boxes(ocr_element_list):
        height=max_y - min_y
    )
 class MangaOcr:
    name = 'mangaocr'
    readable_name = 'Manga OCR'
@@ -549,19 +614,13 @@ class Bing:
        self.available = True
        logger.info('Bing ready')
-    def _quad_to_center_bbox(self, quad):
+    def _convert_bbox(self, quad):
-        center_x = (quad['topLeft']['x'] + quad['topRight']['x'] + quad['bottomRight']['x'] + quad['bottomLeft']['x']) / 4
+        return quad_to_bounding_box(
-        center_y = (quad['topLeft']['y'] + quad['topRight']['y'] + quad['bottomRight']['y'] + quad['bottomLeft']['y']) / 4
+            quad['topLeft']['x'], quad['topLeft']['y'],
-
+            quad['topRight']['x'], quad['topRight']['y'],
-        width1 = sqrt((quad['topRight']['x'] - quad['topLeft']['x'])**2 + (quad['topRight']['y'] - quad['topLeft']['y'])**2)
+            quad['bottomRight']['x'], quad['bottomRight']['y'],
-        width2 = sqrt((quad['bottomRight']['x'] - quad['bottomLeft']['x'])**2 + (quad['bottomRight']['y'] - quad['bottomLeft']['y'])**2)
+            quad['bottomLeft']['x'], quad['bottomLeft']['y']
-        avg_width = (width1 + width2) / 2
+        )
        height1 = sqrt((quad['bottomLeft']['x'] - quad['topLeft']['x'])**2 + (quad['bottomLeft']['y'] - quad['topLeft']['y'])**2)
        height2 = sqrt((quad['bottomRight']['x'] - quad['topRight']['x'])**2 + (quad['bottomRight']['y'] - quad['topRight']['y'])**2)
        avg_height = (height1 + height2) / 2
        return BoundingBox(center_x=center_x, center_y=center_y, width=avg_width, height=avg_height)
    def _to_generic_result(self, response, img_width, img_height, og_img_width, og_img_height):
        paragraphs = []
@@ -586,19 +645,19 @@ class Bing:
                        for w in l.get('words', []):
                            word = Word(
                                text=w.get('text', ''),
-                                bounding_box=self._quad_to_center_bbox(w['boundingBox'])
+                                bounding_box=self._convert_bbox(w['boundingBox'])
                            )
                            words.append(word)
                        line = Line(
                            text=l.get('text', ''),
-                            bounding_box=self._quad_to_center_bbox(l['boundingBox']),
+                            bounding_box=self._convert_bbox(l['boundingBox']),
                            words=words
                        )
                        lines.append(line)
                    paragraph = Paragraph(
-                        bounding_box=self._quad_to_center_bbox(p['boundingBox']),
+                        bounding_box=self._convert_bbox(p['boundingBox']),
                        lines=lines
                    )
                    paragraphs.append(paragraph)
@@ -870,11 +929,11 @@ class AppleLiveText:
        if self.result == None:
            return (False, 'Unknown error!')
-        ocr_response = OcrResult(
+        ocr_result = OcrResult(
            image_properties=ImageProperties(width=img.width, height=img.height),
            paragraphs=self.result
        )
-        x = (True, ocr_response)
+        x = (True, ocr_result)
        if is_path:
            img.close()
@@ -1070,21 +1129,13 @@ class OneOCR:
            except:
                logger.warning('Error reading URL from config, OneOCR will not work!')
-    def _pixel_quad_to_center_bbox(self, rect, img_width, img_height):
+    def _convert_bbox(self, rect, img_width, img_height):
-        x_coords = [rect['x1'], rect['x2'], rect['x3'], rect['x4']]
+        return quad_to_bounding_box(
-        y_coords = [rect['y1'], rect['y2'], rect['y3'], rect['y4']]
+            rect['x1'], rect['y1'], 
-
+            rect['x2'], rect['y2'], 
-        center_x_px = sum(x_coords) / 4
+            rect['x3'], rect['y3'], 
-        center_y_px = sum(y_coords) / 4
+            rect['x4'], rect['y4'],
-
+            img_width, img_height
        width_px = (abs(rect['x2'] - rect['x1']) + abs(rect['x3'] - rect['x4'])) / 2
        height_px = (abs(rect['y4'] - rect['y1']) + abs(rect['y3'] - rect['y2'])) / 2
        return BoundingBox(
            center_x=center_x_px / img_width,
            center_y=center_y_px / img_height,
            width=width_px / img_width,
            height=height_px / img_height
        )
    def _to_generic_result(self, response, img_width, img_height, og_img_width, og_img_height):
@@ -1094,13 +1145,13 @@ class OneOCR:
            for i, w in enumerate(l.get('words', [])):
                word = Word(
                    text=w.get('text', ''),
-                    bounding_box=self._pixel_quad_to_center_bbox(w['bounding_rect'], img_width, img_height)
+                    bounding_box=self._convert_bbox(w['bounding_rect'], img_width, img_height)
                )
                words.append(word)
            line = Line(
                text=l.get('text', ''),
-                bounding_box=self._pixel_quad_to_center_bbox(l['bounding_rect'], img_width, img_height),
+                bounding_box=self._convert_bbox(l['bounding_rect'], img_width, img_height),
                words=words
            )
            lines.append(line)
@@ -1147,8 +1198,8 @@ class OneOCR:
        if 'error' in raw_res:
            return (False, raw_res['error'])
-        ocr_response = self._to_generic_result(raw_res, img_width, img_height, img.width, img.height)
+        ocr_result = self._to_generic_result(raw_res, img_width, img_height, img.width, img.height)
-        x = (True, ocr_response)
+        x = (True, ocr_result)
        if is_path:
            img.close()
@@ -1262,22 +1313,13 @@ class EasyOCR:
            self.available = True
            logger.info('EasyOCR ready')
-    def _pixel_quad_to_center_bbox(self, rect, img_width, img_height):
+    def _convert_bbox(self, rect, img_width, img_height):
-        x_coords = [float(point[0]) for point in rect]
+        x1, y1 = float(rect[0][0]), float(rect[0][1])
-        y_coords = [float(point[1]) for point in rect]
+        x2, y2 = float(rect[1][0]), float(rect[1][1])
        x3, y3 = float(rect[2][0]), float(rect[2][1])
        x4, y4 = float(rect[3][0]), float(rect[3][1])
-        center_x_px = sum(x_coords) / 4
+        return quad_to_bounding_box(x1, y1, x2, y2, x3, y3, x4, y4, img_width, img_height)
        center_y_px = sum(y_coords) / 4
        width_px = (abs(float(rect[1][0]) - float(rect[0][0])) + abs(float(rect[2][0]) - float(rect[3][0]))) / 2
        height_px = (abs(float(rect[3][1]) - float(rect[0][1])) + abs(float(rect[2][1]) - float(rect[1][1]))) / 2
        return BoundingBox(
            center_x=center_x_px / img_width,
            center_y=center_y_px / img_height,
            width=width_px / img_width,
            height=height_px / img_height
        )
    def _to_generic_result(self, response, img_width, img_height):
        lines = []
@@ -1286,7 +1328,7 @@ class EasyOCR:
            quad_coords = detection[0]
            text = detection[1]
-            bbox = self._pixel_quad_to_center_bbox(quad_coords, img_width, img_height)
+            bbox = self._convert_bbox(quad_coords, img_width, img_height)
            word = Word(text=text, bounding_box=bbox)
            line = Line(bounding_box=bbox, words=[word], text=text)
            lines.append(line)
@@ -1365,22 +1407,13 @@ class RapidOCR:
        else:
            return LangRec.LATIN
-    def _pixel_quad_to_center_bbox(self, rect, img_width, img_height):
+    def _convert_bbox(self, rect, img_width, img_height):
-        x_coords = [float(point[0]) for point in rect]
+        x1, y1 = float(rect[0][0]), float(rect[0][1])
-        y_coords = [float(point[1]) for point in rect]
+        x2, y2 = float(rect[1][0]), float(rect[1][1])
        x3, y3 = float(rect[2][0]), float(rect[2][1])
        x4, y4 = float(rect[3][0]), float(rect[3][1])
-        center_x_px = sum(x_coords) / 4
+        return quad_to_bounding_box(x1, y1, x2, y2, x3, y3, x4, y4, img_width, img_height)
        center_y_px = sum(y_coords) / 4
        width_px = (abs(float(rect[1][0]) - float(rect[0][0])) + abs(float(rect[2][0]) - float(rect[3][0]))) / 2
        height_px = (abs(float(rect[3][1]) - float(rect[0][1])) + abs(float(rect[2][1]) - float(rect[1][1]))) / 2
        return BoundingBox(
            center_x=center_x_px / img_width,
            center_y=center_y_px / img_height,
            width=width_px / img_width,
            height=height_px / img_height
        )
    def _to_generic_result(self, response, img_width, img_height):
        lines = []
@@ -1388,7 +1421,7 @@ class RapidOCR:
        for i in range(len(response.boxes)):
            box = response.boxes[i]
            text = response.txts[i]
-            bbox = self._pixel_quad_to_center_bbox(box, img_width, img_height)
+            bbox = self._convert_bbox(box, img_width, img_height)
            word = Word(text=text, bounding_box=bbox)
            line = Line(bounding_box=bbox, words=[word], text=text)
            lines.append(line)