Vertical furigana filter, rename option

2025-10-17 01:43:24 +02:00
parent 87f7ea6069
commit e59ceb7ae4
4 changed files with 101 additions and 46 deletions
--- a/owocr/config.py
+++ b/owocr/config.py
@@ -52,14 +52,14 @@ parser.add_argument('-sf', '--screen_capture_frame_stabilization', type=float, d
                    help="When reading with screen capture, delay to wait until text is stable before processing it. -1 waits for two OCR results to be the same. 0 to disable.")
 parser.add_argument('-sl', '--screen_capture_line_recovery', type=str2bool, nargs='?', const=True, default=argparse.SUPPRESS,
                    help="When reading with screen capture and frame stabilization is on, try to recover missed lines from unstable frames. Can lead to increased glitches.")
-parser.add_argument('-sff', '--screen_capture_furigana_filter', type=str2bool, nargs='?', const=True, default=argparse.SUPPRESS,
-                    help="When reading with screen capture, try to filter furigana lines.")
 parser.add_argument('-sc', '--screen_capture_combo', type=str, default=argparse.SUPPRESS,
                    help='When reading with screen capture, combo to wait on for taking a screenshot. If periodic screenshots are also enabled, any screenshot taken this way bypasses the filtering. Example value: "<ctrl>+<shift>+s". The list of keys can be found here: https://pynput.readthedocs.io/en/latest/keyboard.html#pynput.keyboard.Key')
 parser.add_argument('-scc', '--coordinate_selector_combo', type=str, default=argparse.SUPPRESS,
                    help='When reading with screen capture, combo to wait on for invoking the coordinate picker to change the screen/window area. Example value: "<ctrl>+<shift>+c". The list of keys can be found here: https://pynput.readthedocs.io/en/latest/keyboard.html#pynput.keyboard.Key')
 parser.add_argument('-l', '--language', type=str, default=argparse.SUPPRESS,
                    help='Two letter language code for filtering screencapture OCR results. Ex. "ja" for Japanese, "zh" for Chinese, "ko" for Korean, "ar" for Arabic, "ru" for Russian, "el" for Greek, "he" for Hebrew, "th" for Thai. Any other value will use Latin Extended (for most European languages and English).')
+parser.add_argument('-f', '--furigana_filter', type=str2bool, nargs='?', const=True, default=argparse.SUPPRESS,
+                    help="Try to filter furigana lines for Japanese.")
 parser.add_argument('-of', '--output_format', type=str, default=argparse.SUPPRESS,
                    help='The output format for OCR results. Can be "text" (default) or "json" (to include coordinates).')
 parser.add_argument('-wp', '--websocket_port', type=int, default=argparse.SUPPRESS,
@@ -99,7 +99,7 @@ class Config:
        'screen_capture_only_active_windows': True,
        'screen_capture_frame_stabilization': -1,
        'screen_capture_line_recovery': True,
-        'screen_capture_furigana_filter': True,
+        'furigana_filter': True,
        'screen_capture_combo': '',
        'coordinate_selector_combo': '',
        'screen_capture_old_macos_api': False,
--- a/owocr/ocr.py
+++ b/owocr/ocr.py
@@ -369,16 +369,18 @@ class MangaOcr:
    coordinate_support = False
    threading_support = True

-    def __init__(self, config={'pretrained_model_name_or_path':'kha-white/manga-ocr-base','force_cpu': False}):
+    def __init__(self, config={}):
        if 'manga_ocr' not in sys.modules:
            logger.warning('manga-ocr not available, Manga OCR will not work!')
        else:
+            pretrained_model_name_or_path = config.get('pretrained_model_name_or_path', 'kha-white/manga-ocr-base')
+            force_cpu = config.get('force_cpu', False)
            logger.disable('manga_ocr')
            logging.getLogger('transformers').setLevel(logging.ERROR) # silence transformers >=4.46 warnings
            from manga_ocr import ocr
            ocr.post_process = empty_post_process
            logger.info(f'Loading Manga OCR model')
-            self.model = MOCR(config['pretrained_model_name_or_path'], config['force_cpu'])
+            self.model = MOCR(pretrained_model_name_or_path, force_cpu)
            self.available = True
            logger.info('Manga OCR ready')

@@ -860,12 +862,14 @@ class AppleVision:
    coordinate_support = True
    threading_support = True

-    def __init__(self, language='ja'):
+    def __init__(self, language='ja', config={}):
        if sys.platform != 'darwin':
            logger.warning('Apple Vision is not supported on non-macOS platforms!')
        elif int(platform.mac_ver()[0].split('.')[0]) < 13:
            logger.warning('Apple Vision is not supported on macOS older than Ventura/13.0!')
        else:
+            self.recognition_level = Vision.VNRequestTextRecognitionLevelFast if config.get('fast_mode', False) else Vision.VNRequestTextRecognitionLevelAccurate
+            self.language_correction = config.get('language_correction', True)
            self.available = True
            self.language = [language, 'en']
            logger.info('Apple Vision ready')
@@ -916,8 +920,8 @@ class AppleVision:
            req = Vision.VNRecognizeTextRequest.alloc().init()

            req.setRevision_(Vision.VNRecognizeTextRequestRevision3)
-            req.setRecognitionLevel_(Vision.VNRequestTextRecognitionLevelAccurate)
-            req.setUsesLanguageCorrection_(True)
+            req.setRecognitionLevel_(self.recognition_level)
+            req.setUsesLanguageCorrection_(self.language_correction)
            req.setRecognitionLanguages_(self.language)

            handler = Vision.VNImageRequestHandler.alloc().initWithData_options_(
@@ -1050,7 +1054,6 @@ class AppleLiveText:
                )
                lines.append(line)

-        # Create a single paragraph to hold all lines
        if lines:
            p_bbox = merge_bounding_boxes(lines)
            paragraph = Paragraph(bounding_box=p_bbox, lines=lines)
@@ -1132,7 +1135,6 @@ class WinRTOCR:
            )
            lines.append(line)

-        # Create a single paragraph to hold all lines
        if lines:
            p_bbox = merge_bounding_boxes(lines)
            paragraph = Paragraph(bounding_box=p_bbox, lines=lines)
@@ -1235,7 +1237,6 @@ class OneOCR:
            )
            lines.append(line)

-        # Create a single paragraph to hold all lines
        if lines:
            p_bbox = merge_bounding_boxes(lines)
            paragraph = Paragraph(bounding_box=p_bbox, lines=lines)
@@ -1417,13 +1418,14 @@ class EasyOCR:
    coordinate_support = True
    threading_support = True

-    def __init__(self, config={'gpu': True}, language='ja'):
+    def __init__(self, config={}, language='ja'):
        if 'easyocr' not in sys.modules:
            logger.warning('easyocr not available, EasyOCR will not work!')
        else:
            logger.info('Loading EasyOCR model')
+            gpu = config.get('gpu', True)
            logging.getLogger('easyocr.easyocr').setLevel(logging.ERROR)
-            self.model = easyocr.Reader([language,'en'], gpu=config['gpu'])
+            self.model = easyocr.Reader([language,'en'], gpu=gpu)
            self.available = True
            logger.info('EasyOCR ready')

@@ -1485,20 +1487,22 @@ class RapidOCR:
    coordinate_support = True
    threading_support = True

-    def __init__(self, config={'high_accuracy_detection': False, 'high_accuracy_recognition': True}, language='ja'):
+    def __init__(self, config={}, language='ja'):
        if 'rapidocr' not in sys.modules:
            logger.warning('rapidocr not available, RapidOCR will not work!')
        else:
            logger.info('Loading RapidOCR model')
+            high_accuracy_detection = config.get('high_accuracy_detection', False)
+            high_accuracy_recognition = config.get('high_accuracy_recognition', True)
            lang_rec = self.language_to_model_language(language)
            self.model = ROCR(params={
                'Det.engine_type': EngineType.ONNXRUNTIME,
                'Det.lang_type': LangDet.CH,
-                'Det.model_type': ModelType.SERVER if config['high_accuracy_detection'] else ModelType.MOBILE,
+                'Det.model_type': ModelType.SERVER if high_accuracy_detection else ModelType.MOBILE,
                'Det.ocr_version': OCRVersion.PPOCRV5,
                'Rec.engine_type': EngineType.ONNXRUNTIME,
                'Rec.lang_type': lang_rec,
-                'Rec.model_type': ModelType.SERVER if config['high_accuracy_recognition'] else ModelType.MOBILE,
+                'Rec.model_type': ModelType.SERVER if high_accuracy_recognition else ModelType.MOBILE,
                'Rec.ocr_version': OCRVersion.PPOCRV5,
                'Global.log_level': 'error'
            })
@@ -1626,10 +1630,6 @@ class OCRSpace:
    def _to_generic_result(self, api_result, img_width, img_height, og_img_width, og_img_height):
        parsed_result = api_result['ParsedResults'][0]
        text_overlay = parsed_result.get('TextOverlay', {})
-
-        image_props = ImageProperties(width=og_img_width, height=og_img_height)
-        ocr_result = OcrResult(image_properties=image_props)
-
        lines_data = text_overlay.get('Lines', [])

        lines = []
@@ -1645,11 +1645,14 @@ class OCRSpace:
        if lines:
            p_bbox = merge_bounding_boxes(lines)
            paragraph = Paragraph(bounding_box=p_bbox, lines=lines)
-            ocr_result.paragraphs = [paragraph]
+            paragraphs = [paragraph]
        else:
-            ocr_result.paragraphs = []
+            paragraphs = []

-        return ocr_result
+        return OcrResult(
+            image_properties=ImageProperties(width=og_img_width, height=og_img_height),
+            paragraphs=paragraphs
+        )

    def __call__(self, img):
        img, is_path = input_to_pil_image(img)
--- a/owocr/run.py
+++ b/owocr/run.py
@@ -294,7 +294,7 @@ class TextFiltering:
        self.language = config.get_general('language')
        self.frame_stabilization = 0 if config.get_general('screen_capture_delay_secs') == -1 else config.get_general('screen_capture_frame_stabilization')
        self.line_recovery = config.get_general('screen_capture_line_recovery')
-        self.furigana_filter = config.get_general('screen_capture_furigana_filter')
+        self.furigana_filter = self.language == 'ja' and config.get_general('furigana_filter')
        self.last_frame_data = (None, None)
        self.last_last_frame_data = (None, None)
        self.stable_frame_data = None
@@ -549,7 +549,7 @@ class TextFiltering:
        if all(not current_text_line for current_text_line in current_lines):
            return None

-        if self.furigana_filter and self.language == 'ja' and isinstance(current_result_ocr, OcrResult):
+        if self.furigana_filter and isinstance(current_result_ocr, OcrResult):
            for p in current_result_ocr.paragraphs:
                current_lines_ocr.extend(p.lines)

@@ -607,34 +607,57 @@ class TextFiltering:
                            if not current_lines[j]:
                                continue

-                            below_line_bbox = current_lines_ocr[j].bounding_box
-                            below_line_text = current_lines[j]
+                            other_line_bbox = current_lines_ocr[j].bounding_box
+                            other_line_text = current_lines[j]

-                            logger.opt(colors=True).debug(f"<magenta>Furigana check against line: '{below_line_text}'</magenta>")
+                            if len(current_text) <= len(other_line_text):
+                                is_vertical = other_line_bbox.height > other_line_bbox.width
+                            else:
+                                is_vertical = current_line_bbox.height > current_line_bbox.width
+
+                            logger.opt(colors=True).debug(f"<magenta>Furigana check against line: '{other_line_text}'</magenta>")
+
+                            if is_vertical:
+                                width_threshold = other_line_bbox.width * 0.7
+                                is_smaller = current_line_bbox.width < width_threshold
+                                logger.opt(colors=True).debug(f"<magenta>Vertical furigana check width: '{other_line_bbox.width}' '{current_line_bbox.width}'</magenta>")
+                            else:
+                                height_threshold = other_line_bbox.height * 0.7
+                                is_smaller = current_line_bbox.height < height_threshold
+                                logger.opt(colors=True).debug(f"<magenta>Horizontal furigana check height: '{other_line_bbox.height}' '{current_line_bbox.height}'</magenta>")

-                            # Check if the line is taller
-                            height_threshold = below_line_bbox.height * 0.7
-                            is_smaller = current_line_bbox.height < height_threshold
-                            logger.opt(colors=True).debug(f"<magenta>Furigana check height: '{below_line_bbox.height}' '{current_line_bbox.height}'</magenta>")
                            if not is_smaller:
                                continue

                            # Check if the line has kanji
-                            below_has_kanji = self.kanji_regex.search(below_line_text)
-                            if not below_has_kanji:
+                            other_has_kanji = self.kanji_regex.search(other_line_text)
+                            if not other_has_kanji:
                                continue

-                            vertical_threshold = below_line_bbox.height + current_line_bbox.height
-                            vertical_distance = below_line_bbox.center_y - current_line_bbox.center_y
-                            horizontal_overlap = self._check_horizontal_overlap(current_line_bbox, below_line_bbox)
+                            if is_vertical:
+                                horizontal_threshold = current_line_bbox.width + other_line_bbox.width
+                                horizontal_distance = current_line_bbox.center_x - other_line_bbox.center_x
+                                vertical_overlap = self._check_vertical_overlap(current_line_bbox, other_line_bbox)

-                            logger.opt(colors=True).debug(f"<magenta>Furigana check position: '{vertical_threshold}' '{vertical_distance}' '{horizontal_overlap}'</magenta>")
+                                logger.opt(colors=True).debug(f"<magenta>Vertical furigana check position: '{horizontal_threshold}' '{horizontal_distance}' '{vertical_overlap}'</magenta>")

-                            # If vertically close and horizontally aligned, it's likely furigana
-                            if (0 < vertical_distance < vertical_threshold and horizontal_overlap > 0.5):
-                                is_furigana = True
-                                logger.opt(colors=True).debug(f"<magenta>Skipping furigana line: '{current_text}' above line: '{below_line_text}'</magenta>")
-                                break
+                                # If horizontally close and vertically aligned, it's likely furigana
+                                if (0 < horizontal_distance < horizontal_threshold and vertical_overlap > 0.5):
+                                    is_furigana = True
+                                    logger.opt(colors=True).debug(f"<magenta>Skipping vertical furigana line: '{current_text}' next to line: '{other_line_text}'</magenta>")
+                                    break
+                            else:
+                                vertical_threshold = other_line_bbox.height + current_line_bbox.height
+                                vertical_distance = other_line_bbox.center_y - current_line_bbox.center_y
+                                horizontal_overlap = self._check_horizontal_overlap(current_line_bbox, other_line_bbox)
+
+                                logger.opt(colors=True).debug(f"<magenta>Horizontal furigana check position: '{vertical_threshold}' '{vertical_distance}' '{horizontal_overlap}'</magenta>")
+
+                                # If vertically close and horizontally aligned, it's likely furigana
+                                if (0 < vertical_distance < vertical_threshold and horizontal_overlap > 0.5):
+                                    is_furigana = True
+                                    logger.opt(colors=True).debug(f"<magenta>Skipping horizontal furigana line: '{current_text}' above line: '{other_line_text}'</magenta>")
+                                    break

                        if is_furigana:
                            continue
@@ -652,6 +675,9 @@ class TextFiltering:

        return changed_lines

+    def _standalone_furigana_filter(self, result, result_ocr):  # Thin wrapper: forwards result and result_ocr to _find_changed_lines_text_impl with fixed remaining arguments
+        return self._find_changed_lines_text_impl(result, result_ocr, 0, [], None, False, 0)  # NOTE(review): presumably these constants leave only the furigana check active - confirm against the callee's signature
+
    def _find_overlap(self, previous_text, current_text):
        min_overlap_length = 3
        max_overlap_length = min(len(previous_text), len(current_text))
@@ -705,6 +731,25 @@ class TextFiltering:

        return overlap_width / smaller_width if smaller_width > 0 else 0.0

+    def _check_vertical_overlap(self, bbox1, bbox2):  # Returns the y-axis overlap of two boxes as a fraction of the shorter box's height (0.0 when there is none)
+        # Derive the top and bottom edges of each box from its center_y and height
+        top1 = bbox1.center_y - bbox1.height / 2
+        bottom1 = bbox1.center_y + bbox1.height / 2
+        top2 = bbox2.center_y - bbox2.height / 2
+        bottom2 = bbox2.center_y + bbox2.height / 2
+
+        # The shared span runs from the larger top value to the smaller bottom value
+        overlap_top = max(top1, top2)
+        overlap_bottom = min(bottom1, bottom2)
+
+        if overlap_bottom <= overlap_top:  # empty or zero-length shared span: no overlap
+            return 0.0
+
+        overlap_height = overlap_bottom - overlap_top
+        smaller_height = min(bbox1.height, bbox2.height)  # normalize by the shorter of the two boxes
+
+        return overlap_height / smaller_height if smaller_height > 0 else 0.0  # the > 0 check avoids dividing by a zero height
+
    def _create_changed_regions_image(self, pil_image, changed_lines, pil_image_2, changed_lines_2, margin=5):
        def crop_image(image, lines):
            img_width, img_height = image.size
@@ -1339,6 +1384,8 @@ class OutputResult:
                    return
                output_string = self._post_process(text_to_process, True)
            else:
+                if self.filtering.furigana_filter and isinstance(result_data, OcrResult):
+                    result_data_text = self.filtering._standalone_furigana_filter(result_data_text, result_data)
                output_string = self._post_process(result_data_text, False)
            log_message = output_string

--- a/owocr_config.ini
+++ b/owocr_config.ini
@@ -87,8 +87,8 @@
 ;recover missed lines from unstable frames. Can lead to increased glitches.
 ;screen_capture_line_recovery = True

-;When reading with screen capture, try to filter furigana lines.
-;screen_capture_furigana_filter = True
+;Try to filter furigana lines for Japanese.
+;furigana_filter = True

 ;When reading with screen capture, combo to wait on for taking a screenshot.
 ;If periodic screenshots are also enabled, any screenshot taken this way
@@ -147,4 +147,9 @@
 ;[rapidocr]
 ;high_accuracy_detection = False

-;high_accuracy_recognition = True
+;high_accuracy_recognition = True
+
+;[avision]
+;fast_mode = False
+
+;language_correction = True