From 169ca15e9f38d8e60c8d1e84ae143e377daa3b27 Mon Sep 17 00:00:00 2001
From: AuroraWright <AuroraWright@users.noreply.github.com>
Date: Fri, 19 Sep 2025 11:31:28 +0200
Subject: [PATCH] Add coordinate support to Live Text, remove space separators

---
 owocr/ocr.py | 84 ++++++++++++++++++++++++++++++++++++++++++----------
 1 file changed, 69 insertions(+), 15 deletions(-)

diff --git a/owocr/ocr.py b/owocr/ocr.py
index 71f5c3c..4651c47 100644
--- a/owocr/ocr.py
+++ b/owocr/ocr.py
@@ -568,8 +568,7 @@ class Bing:
                         for w in l.get('words', []):
                             word = Word(
                                 text=w.get('text', ''),
-                                bounding_box=self._quad_to_center_bbox(w['boundingBox']),
-                                separator=" "
+                                bounding_box=self._quad_to_center_bbox(w['boundingBox'])
                             )
                             words.append(word)
 
@@ -752,7 +751,7 @@ class AppleLiveText:
     readable_name = 'Apple Live Text'
     key = 'd'
     available = False
-    coordinate_support = False
+    coordinate_support = True
 
     def __init__(self):
         if sys.platform != 'darwin':
@@ -799,27 +798,84 @@ class AppleLiveText:
         if not img:
             return (False, 'Invalid image provided')
 
+        self.result = None
+
         with objc.autorelease_pool():
             analyzer = self.VKCImageAnalyzer.alloc().init()
             req = self.VKCImageAnalyzerRequest.alloc().initWithImage_requestType_(self._preprocess(img), 1) #VKAnalysisTypeText
             req.setLocales_(['ja','en'])
-            self.result = None
             analyzer.processRequest_progressHandler_completionHandler_(req, lambda progress: None, self._process)
 
             CFRunLoopRunInMode(kCFRunLoopDefaultMode, 10.0, False)
 
-            if self.result == None:
-                return (False, 'Unknown error!')
-            return (True, self.result)
+        if self.result == None:
+            return (False, 'Unknown error!')
+
+        ocr_response = OcrResult(
+            image_properties=ImageProperties(width=img.width, height=img.height),
+            paragraphs=self.result
+        )
+        x = (True, ocr_response)
+
+        if is_path:
+            img.close()
+        return x
 
     def _process(self, analysis, error):
-        res = ''
-        lines = analysis.allLines()
+        lines = []
+        response_lines = analysis.allLines()
+        if response_lines:
+            for l in response_lines:
+                words = []
+                for i, w in enumerate(l.children()):
+                    w_bbox = w.quad().boundingBox()
+                    word = Word(
+                        text=w.string(),
+                        bounding_box=BoundingBox(
+                            width=w_bbox.size.width,
+                            height=w_bbox.size.height,
+                            center_x=w_bbox.origin.x + (w_bbox.size.width / 2),
+                            center_y=w_bbox.origin.y + (w_bbox.size.height / 2),
+                            rotation_z=0.0
+                        )
+                    )
+                    words.append(word)
+                
+                l_bbox = l.quad().boundingBox()
+                line = Line(
+                    bounding_box=BoundingBox(
+                        width=l_bbox.size.width,
+                        height=l_bbox.size.height,
+                        center_x=l_bbox.origin.x + (l_bbox.size.width / 2),
+                        center_y=l_bbox.origin.y + (l_bbox.size.height / 2),
+                        rotation_z=0.0
+                    ),
+                    words=words
+                )
+                lines.append(line)
+
+        # Create a single paragraph to hold all lines
         if lines:
-            for line in lines:
-                res += line.string() + '\n'
-        self.result = res
-        CFRunLoopStop(CFRunLoopGetCurrent())
+            # Approximate paragraph bbox by combining all line bboxes
+            all_line_bboxes = [l.bounding_box for l in lines]
+            min_x = min(b.center_x - b.width / 2 for b in all_line_bboxes)
+            max_x = max(b.center_x + b.width / 2 for b in all_line_bboxes)
+            min_y = min(b.center_y - b.height / 2 for b in all_line_bboxes)
+            max_y = max(b.center_y + b.height / 2 for b in all_line_bboxes)
+            
+            p_bbox = BoundingBox(
+                center_x=(min_x + max_x) / 2,
+                center_y=(min_y + max_y) / 2,
+                width=max_x - min_x,
+                height=max_y - min_y
+            )
+            paragraph = Paragraph(bounding_box=p_bbox, lines=lines)
+            paragraphs = [paragraph]
+        else:
+            paragraphs = []
+
+        self.result = paragraphs
+        CFRunLoopStop(CFRunLoopGetCurrent())        
 
     def _preprocess(self, img):
         image_bytes = pil_image_to_bytes(img, 'tiff')
@@ -933,10 +989,8 @@ class OneOCR:
         for l in response.get('lines', []):
             words = []
             for i, w in enumerate(l.get('words', [])):
-                separator = " " if i < len(l.get('words', [])) - 1 else None
                 word = Word(
                     text=w.get('text', ''),
-                    separator=separator,
                     bounding_box=self._pixel_quad_to_center_bbox(w['bounding_rect'], img_width, img_height)
                 )
                 words.append(word)