From 54b41d00a5d73ef79a15290a3cfaf1b8f599465e Mon Sep 17 00:00:00 2001 From: AuroraWright Date: Wed, 15 Oct 2025 02:58:05 +0200 Subject: [PATCH] Adapt remaining local engines to json format --- owocr/ocr.py | 313 +++++++++++++++++++++++++++++++++++++++++++-------- owocr/run.py | 2 +- 2 files changed, 266 insertions(+), 49 deletions(-) diff --git a/owocr/ocr.py b/owocr/ocr.py index 776b3b4..0fb87dd 100644 --- a/owocr/ocr.py +++ b/owocr/ocr.py @@ -314,7 +314,7 @@ class GoogleLens: ) ) words.append(word) - + l_bbox = l.get('geometry', {}).get('bounding_box', {}) line = Line( bounding_box=BoundingBox( @@ -538,7 +538,7 @@ class Bing: def _quad_to_center_bbox(self, quad): center_x = (quad['topLeft']['x'] + quad['topRight']['x'] + quad['bottomRight']['x'] + quad['bottomLeft']['x']) / 4 center_y = (quad['topLeft']['y'] + quad['topRight']['y'] + quad['bottomRight']['y'] + quad['bottomLeft']['y']) / 4 - + width1 = sqrt((quad['topRight']['x'] - quad['topLeft']['x'])**2 + (quad['topRight']['y'] - quad['topLeft']['y'])**2) width2 = sqrt((quad['bottomRight']['x'] - quad['bottomLeft']['x'])**2 + (quad['bottomRight']['y'] - quad['bottomLeft']['y'])**2) avg_width = (width1 + width2) / 2 @@ -546,24 +546,24 @@ class Bing: height1 = sqrt((quad['bottomLeft']['x'] - quad['topLeft']['x'])**2 + (quad['bottomLeft']['y'] - quad['topLeft']['y'])**2) height2 = sqrt((quad['bottomRight']['x'] - quad['topRight']['x'])**2 + (quad['bottomRight']['y'] - quad['topRight']['y'])**2) avg_height = (height1 + height2) / 2 - + return BoundingBox(center_x=center_x, center_y=center_y, width=avg_width, height=avg_height) - def _to_generic_result(self, response, img_width, img_height): + def _to_generic_result(self, response, img_width, img_height, og_img_width, og_img_height): paragraphs = [] text_tag = None for tag in response.get('tags', []): if tag.get('displayName') == '##TextRecognition': text_tag = tag break - + if text_tag: text_action = None for action in text_tag.get('actions', []): if action.get('_type') == 'ImageKnowledge/TextRecognitionAction': text_action = action break - + if text_action: for p in text_action.get('data', {}).get('regions', []): lines = [] @@ -582,10 +582,6 @@ class Bing: words=words ) lines.append(line) - - # Bing doesn't provide paragraph-level separators, so we add a newline - if lines and lines[-1].words: - lines[-1].words[-1].separator = '\n' paragraph = Paragraph( bounding_box=self._quad_to_center_bbox(p['boundingBox']), @@ -594,7 +590,7 @@ class Bing: paragraphs.append(paragraph) return OcrResult( - image_properties=ImageProperties(width=img_width, height=img_height), + image_properties=ImageProperties(width=og_img_width, height=og_img_height), paragraphs=paragraphs ) @@ -677,7 +673,7 @@ class Bing: data = res.json() img_width, img_height = img_size - ocr_result = self._to_generic_result(data, img_width, img_height) + ocr_result = self._to_generic_result(data, img_width, img_height, img.width, img.height) x = (True, ocr_result) if is_path: @@ -709,7 +705,7 @@ class AppleVision: available = False local = True manual_language = True - coordinate_support = False + coordinate_support = True threading_support = True def __init__(self, language='ja'): @@ -722,6 +718,56 @@ class AppleVision: self.language = [language, 'en'] logger.info('Apple Vision ready') + def _to_generic_result(self, response, img_width, img_height): + lines = [] + for l in response: + bbox_raw = l.boundingBox() + bbox = BoundingBox( + width=bbox_raw.size.width, + height=bbox_raw.size.height, + center_x=bbox_raw.origin.x + (bbox_raw.size.width / 2), + center_y=(1 - bbox_raw.origin.y - bbox_raw.size.height / 2) + ) + + word = Word( + text=l.text(), + bounding_box=bbox + ) + words = [word] + + line = Line( + text=l.text(), + bounding_box=bbox, + words=words + ) + + lines.append(line) + + if lines: + # Approximate paragraph bbox by combining all line bboxes + all_line_bboxes = [l.bounding_box for l in lines] + min_x = min(b.center_x - b.width / 2 for b in all_line_bboxes) + max_x = max(b.center_x + b.width / 2 for b in all_line_bboxes) + min_y = min(b.center_y - b.height / 2 for b in all_line_bboxes) + max_y = max(b.center_y + b.height / 2 for b in all_line_bboxes) + + p_bbox = BoundingBox( + center_x=(min_x + max_x) / 2, + center_y=(min_y + max_y) / 2, + width=max_x - min_x, + height=max_y - min_y + ) + + paragraph = Paragraph(bounding_box=p_bbox, lines=lines) + paragraphs = [paragraph] + else: + paragraphs = [] + + return OcrResult( + image_properties=ImageProperties(width=img_width, height=img_height), + paragraphs=paragraphs + ) + def __call__(self, img): img, is_path = input_to_pil_image(img) if not img: @@ -742,9 +788,8 @@ class AppleVision: success = handler.performRequests_error_([req], None) res = [] if success[0]: - for result in req.results(): - res.append(result.text()) - x = (True, res) + ocr_result = self._to_generic_result(req.results(), img.width, img.height) + x = (True, ocr_result) else: x = (False, 'Unknown error!') @@ -848,12 +893,11 @@ class AppleLiveText: width=w_bbox.size.width, height=w_bbox.size.height, center_x=w_bbox.origin.x + (w_bbox.size.width / 2), - center_y=w_bbox.origin.y + (w_bbox.size.height / 2), - rotation_z=0.0 + center_y=w_bbox.origin.y + (w_bbox.size.height / 2) ) ) words.append(word) - + l_bbox = l.quad().boundingBox() line = Line( text=l.string(), @@ -861,8 +905,7 @@ class AppleLiveText: width=l_bbox.size.width, height=l_bbox.size.height, center_x=l_bbox.origin.x + (l_bbox.size.width / 2), - center_y=l_bbox.origin.y + (l_bbox.size.height / 2), - rotation_z=0.0 + center_y=l_bbox.origin.y + (l_bbox.size.height / 2) ), words=words ) @@ -876,7 +919,7 @@ class AppleLiveText: max_x = max(b.center_x + b.width / 2 for b in all_line_bboxes) min_y = min(b.center_y - b.height / 2 for b in all_line_bboxes) max_y = max(b.center_y + b.height / 2 for b in all_line_bboxes) - + p_bbox = BoundingBox( center_x=(min_x + max_x) / 2, center_y=(min_y + max_y) / 2, @@ -889,7 +932,7 @@ class AppleLiveText: paragraphs = [] self.result = paragraphs - CFRunLoopStop(CFRunLoopGetCurrent()) + CFRunLoopStop(CFRunLoopGetCurrent()) def _preprocess(self, img): image_bytes = pil_image_to_bytes(img, 'tiff') @@ -904,7 +947,7 @@ class WinRTOCR: available = False local = True manual_language = True - coordinate_support = False + coordinate_support = True threading_support = True def __init__(self, config={}, language='ja'): @@ -926,13 +969,86 @@ class WinRTOCR: except: logger.warning('Error reading URL from config, WinRT OCR will not work!') + def _normalize_bbox(self, rect, img_width, img_height): + x_norm = rect['x'] / img_width + y_norm = rect['y'] / img_height + width_norm = rect['width'] / img_width + height_norm = rect['height'] / img_height + + # Calculate center coordinates + center_x = x_norm + (width_norm / 2) + center_y = y_norm + (height_norm / 2) + + return BoundingBox( + center_x=center_x, + center_y=center_y, + width=width_norm, + height=height_norm + ) + + def _to_generic_result(self, response, img_width, img_height): + lines = [] + for l in response.get('lines', []): + words = [] + for i, w in enumerate(l.get('words', [])): + word = Word( + text=w.get('text', ''), + bounding_box=self._normalize_bbox(w['bounding_rect'], img_width, img_height) + ) + words.append(word) + + # Approximate line bbox by combining all word bboxes + all_word_bboxes = [w.bounding_box for w in words] + min_x = min(b.center_x - b.width / 2 for b in all_word_bboxes) + max_x = max(b.center_x + b.width / 2 for b in all_word_bboxes) + min_y = min(b.center_y - b.height / 2 for b in all_word_bboxes) + max_y = max(b.center_y + b.height / 2 for b in all_word_bboxes) + + l_bbox = BoundingBox( + center_x=(min_x + max_x) / 2, + center_y=(min_y + max_y) / 2, + width=max_x - min_x, + height=max_y - min_y + ) + line = Line( + text=l.get('text', ''), + bounding_box=l_bbox, + words=words + ) + lines.append(line) + + # Create a single paragraph to hold all lines + if lines: + # Approximate paragraph bbox by combining all line bboxes + all_line_bboxes = [l.bounding_box for l in lines] + min_x = min(b.center_x - b.width / 2 for b in all_line_bboxes) + max_x = max(b.center_x + b.width / 2 for b in all_line_bboxes) + min_y = min(b.center_y - b.height / 2 for b in all_line_bboxes) + max_y = max(b.center_y + b.height / 2 for b in all_line_bboxes) + + p_bbox = BoundingBox( + center_x=(min_x + max_x) / 2, + center_y=(min_y + max_y) / 2, + width=max_x - min_x, + height=max_y - min_y + ) + paragraph = Paragraph(bounding_box=p_bbox, lines=lines) + paragraphs = [paragraph] + else: + paragraphs = [] + + return OcrResult( + image_properties=ImageProperties(width=img_width, height=img_height), + paragraphs=paragraphs + ) + def __call__(self, img): img, is_path = input_to_pil_image(img) if not img: return (False, 'Invalid image provided') if sys.platform == 'win32': - res = winocr.recognize_pil_sync(img, lang=self.language)['text'] + res = winocr.recognize_pil_sync(img, lang=self.language) else: params = {'lang': self.language} try: @@ -945,9 +1061,10 @@ class WinRTOCR: if res.status_code != 200: return (False, 'Unknown error!') - res = res.json()['text'] + res = res.json() - x = (True, res) + ocr_result = self._to_generic_result(res, img.width, img.height) + x = (True, ocr_result) if is_path: img.close() @@ -994,7 +1111,7 @@ class OneOCR: center_x_px = sum(x_coords) / 4 center_y_px = sum(y_coords) / 4 - + width_px = (abs(rect['x2'] - rect['x1']) + abs(rect['x3'] - rect['x4'])) / 2 height_px = (abs(rect['y4'] - rect['y1']) + abs(rect['y3'] - rect['y2'])) / 2 @@ -1005,7 +1122,7 @@ class OneOCR: height=height_px / img_height ) - def _to_generic_result(self, response, img_width, img_height): + def _to_generic_result(self, response, img_width, img_height, og_img_width, og_img_height): lines = [] for l in response.get('lines', []): words = [] @@ -1015,7 +1132,7 @@ class OneOCR: bounding_box=self._pixel_quad_to_center_bbox(w['bounding_rect'], img_width, img_height) ) words.append(word) - + line = Line( text=l.get('text', ''), bounding_box=self._pixel_quad_to_center_bbox(l['bounding_rect'], img_width, img_height), @@ -1031,7 +1148,7 @@ class OneOCR: max_x = max(b.center_x + b.width / 2 for b in all_line_bboxes) min_y = min(b.center_y - b.height / 2 for b in all_line_bboxes) max_y = max(b.center_y + b.height / 2 for b in all_line_bboxes) - + p_bbox = BoundingBox( center_x=(min_x + max_x) / 2, center_y=(min_y + max_y) / 2, @@ -1044,7 +1161,7 @@ class OneOCR: paragraphs = [] return OcrResult( - image_properties=ImageProperties(width=img_width, height=img_height), + image_properties=ImageProperties(width=og_img_width, height=og_img_height), paragraphs=paragraphs ) @@ -1077,7 +1194,7 @@ class OneOCR: if 'error' in raw_res: return (False, raw_res['error']) - ocr_response = self._to_generic_result(raw_res, img_width, img_height) + ocr_response = self._to_generic_result(raw_res, img_width, img_height, img.width, img.height) x = (True, ocr_response) if is_path: @@ -1179,7 +1296,7 @@ class EasyOCR: available = False local = True manual_language = True - coordinate_support = False + coordinate_support = True threading_support = True def __init__(self, config={'gpu': True}, language='ja'): @@ -1192,17 +1309,68 @@ class EasyOCR: self.available = True logger.info('EasyOCR ready') + def _pixel_quad_to_center_bbox(self, rect, img_width, img_height): + x_coords = [float(point[0]) for point in rect] + y_coords = [float(point[1]) for point in rect] + + center_x_px = sum(x_coords) / 4 + center_y_px = sum(y_coords) / 4 + + width_px = (abs(float(rect[1][0]) - float(rect[0][0])) + abs(float(rect[2][0]) - float(rect[3][0]))) / 2 + height_px = (abs(float(rect[3][1]) - float(rect[0][1])) + abs(float(rect[2][1]) - float(rect[1][1]))) / 2 + + return BoundingBox( + center_x=center_x_px / img_width, + center_y=center_y_px / img_height, + width=width_px / img_width, + height=height_px / img_height + ) + + def _to_generic_result(self, response, img_width, img_height): + lines = [] + + for detection in response: + quad_coords = detection[0] + text = detection[1] + + bbox = self._pixel_quad_to_center_bbox(quad_coords, img_width, img_height) + word = Word(text=text, bounding_box=bbox) + line = Line(bounding_box=bbox, words=[word], text=text) + lines.append(line) + + if lines: + # Approximate paragraph bbox by combining all line bboxes + all_line_bboxes = [l.bounding_box for l in lines] + min_x = min(b.center_x - b.width / 2 for b in all_line_bboxes) + max_x = max(b.center_x + b.width / 2 for b in all_line_bboxes) + min_y = min(b.center_y - b.height / 2 for b in all_line_bboxes) + max_y = max(b.center_y + b.height / 2 for b in all_line_bboxes) + + p_bbox = BoundingBox( + center_x=(min_x + max_x) / 2, + center_y=(min_y + max_y) / 2, + width=max_x - min_x, + height=max_y - min_y + ) + + paragraph = Paragraph(bounding_box=p_bbox, lines=lines) + paragraphs = [paragraph] + else: + paragraphs = [] + + return OcrResult( + image_properties=ImageProperties(width=img_width, height=img_height), + paragraphs=paragraphs + ) + def __call__(self, img): img, is_path = input_to_pil_image(img) if not img: return (False, 'Invalid image provided') - res = [] - read_result = self.model.readtext(self._preprocess(img), detail=0) - for text in read_result: - res.append(text) - - x = (True, res) + read_results = self.model.readtext(self._preprocess(img)) + ocr_result = self._to_generic_result(read_results, img.width, img.height) + x = (True, ocr_result) if is_path: img.close() @@ -1218,7 +1386,7 @@ class RapidOCR: available = False local = True manual_language = True - coordinate_support = False + coordinate_support = True threading_support = True def __init__(self, config={'high_accuracy_detection': False, 'high_accuracy_recognition': True}, language='ja'): @@ -1257,18 +1425,67 @@ class RapidOCR: else: return LangRec.LATIN + def _pixel_quad_to_center_bbox(self, rect, img_width, img_height): + x_coords = [float(point[0]) for point in rect] + y_coords = [float(point[1]) for point in rect] + + center_x_px = sum(x_coords) / 4 + center_y_px = sum(y_coords) / 4 + + width_px = (abs(float(rect[1][0]) - float(rect[0][0])) + abs(float(rect[2][0]) - float(rect[3][0]))) / 2 + height_px = (abs(float(rect[3][1]) - float(rect[0][1])) + abs(float(rect[2][1]) - float(rect[1][1]))) / 2 + + return BoundingBox( + center_x=center_x_px / img_width, + center_y=center_y_px / img_height, + width=width_px / img_width, + height=height_px / img_height + ) + + def _to_generic_result(self, response, img_width, img_height): + lines = [] + + for i in range(len(response.boxes)): + box = response.boxes[i] + text = response.txts[i] + bbox = self._pixel_quad_to_center_bbox(box, img_width, img_height) + word = Word(text=text, bounding_box=bbox) + line = Line(bounding_box=bbox, words=[word], text=text) + lines.append(line) + + if lines: + # Approximate paragraph bbox by combining all line bboxes + all_line_bboxes = [l.bounding_box for l in lines] + min_x = min(b.center_x - b.width / 2 for b in all_line_bboxes) + max_x = max(b.center_x + b.width / 2 for b in all_line_bboxes) + min_y = min(b.center_y - b.height / 2 for b in all_line_bboxes) + max_y = max(b.center_y + b.height / 2 for b in all_line_bboxes) + + p_bbox = BoundingBox( + center_x=(min_x + max_x) / 2, + center_y=(min_y + max_y) / 2, + width=max_x - min_x, + height=max_y - min_y + ) + + paragraph = Paragraph(bounding_box=p_bbox, lines=lines) + paragraphs = [paragraph] + else: + paragraphs = [] + + return OcrResult( + image_properties=ImageProperties(width=img_width, height=img_height), + paragraphs=paragraphs + ) + def __call__(self, img): img, is_path = input_to_pil_image(img) if not img: return (False, 'Invalid image provided') - res = [] read_results = self.model(self._preprocess(img)) - if read_results: - for read_result in read_results.txts: - res.append(read_result) - - x = (True, res) + ocr_result = self._to_generic_result(read_results, img.width, img.height) + x = (True, ocr_result) if is_path: img.close() @@ -1356,5 +1573,5 @@ class OCRSpace: img.close() return x - def _preprocess(self, img): + def _preprocess(self, img): return limit_image_size(img, self.max_byte_size) diff --git a/owocr/run.py b/owocr/run.py index 00e8c5e..d0f14c7 100644 --- a/owocr/run.py +++ b/owocr/run.py @@ -1142,7 +1142,7 @@ class ScreenshotThread(threading.Thread): class AutopauseTimer: def __init__(self): self.timeout = config.get_general('auto_pause') - self.timer_thread = threading.Thread(target=self._countdown) + self.timer_thread = threading.Thread(target=self._countdown, daemon=True) self.running = True self.countdown_active = threading.Event() self.allow_auto_pause = threading.Event()