Improve filtering quality

AuroraWright
2025-10-07 18:47:54 +02:00
parent ed9b05d2e0
commit 878f164533


@@ -303,13 +303,30 @@ class RequestHandler(socketserver.BaseRequestHandler):
class TextFiltering:
    def __init__(self):
        from pysbd import Segmenter
        import langid
        self.language = config.get_general('language')
        self.segmenter = Segmenter(language=self.language, clean=True)
        self.classify = langid.classify
        self.regex = self.get_regex()
        self.last_result = ([], engine_index)
        self.kana_variants = {
            'ぁ': ['ぁ', 'あ'], 'あ': ['ぁ', 'あ'],
            'ぃ': ['ぃ', 'い'], 'い': ['ぃ', 'い'],
            'ぅ': ['ぅ', 'う'], 'う': ['ぅ', 'う'],
            'ぇ': ['ぇ', 'え'], 'え': ['ぇ', 'え'],
            'ぉ': ['ぉ', 'お'], 'お': ['ぉ', 'お'],
            'っ': ['っ', 'つ'], 'つ': ['っ', 'つ'],
            'ゃ': ['ゃ', 'や'], 'や': ['ゃ', 'や'],
            'ゅ': ['ゅ', 'ゆ'], 'ゆ': ['ゅ', 'ゆ'],
            'ょ': ['ょ', 'よ'], 'よ': ['ょ', 'よ'],
            'ゎ': ['ゎ', 'わ'], 'わ': ['ゎ', 'わ'],
            'ァ': ['ァ', 'ア'], 'ア': ['ァ', 'ア'],
            'ィ': ['ィ', 'イ'], 'イ': ['ィ', 'イ'],
            'ゥ': ['ゥ', 'ウ'], 'ウ': ['ゥ', 'ウ'],
            'ェ': ['ェ', 'エ'], 'エ': ['ェ', 'エ'],
            'ォ': ['ォ', 'オ'], 'オ': ['ォ', 'オ'],
            'ッ': ['ッ', 'ツ'], 'ツ': ['ッ', 'ツ'],
            'ャ': ['ャ', 'ヤ'], 'ヤ': ['ャ', 'ヤ'],
            'ュ': ['ュ', 'ユ'], 'ユ': ['ュ', 'ユ'],
            'ョ': ['ョ', 'ヨ'], 'ヨ': ['ョ', 'ヨ'],
            'ヮ': ['ヮ', 'ワ'], 'ワ': ['ヮ', 'ワ']
        }
    def get_regex(self):
        if self.language == 'ja':
@@ -334,52 +351,9 @@ class TextFiltering:
                r'[a-zA-Z\u00C0-\u00FF\u0100-\u017F\u0180-\u024F\u0250-\u02AF\u1D00-\u1D7F\u1D80-\u1DBF\u1E00-\u1EFF\u2C60-\u2C7F\uA720-\uA7FF\uAB30-\uAB6F]')
    def convert_small_kana_to_big(self, text):
        small_to_big = {
            # Hiragana
            'ぁ': 'あ', 'ぃ': 'い', 'ぅ': 'う', 'ぇ': 'え', 'ぉ': 'お',
            'っ': 'つ', 'ゃ': 'や', 'ゅ': 'ゆ', 'ょ': 'よ', 'ゎ': 'わ',
            # Katakana
            'ァ': 'ア', 'ィ': 'イ', 'ゥ': 'ウ', 'ェ': 'エ', 'ォ': 'オ',
            'ッ': 'ツ', 'ャ': 'ヤ', 'ュ': 'ユ', 'ョ': 'ヨ', 'ヮ': 'ワ'
        }
        converted_text = ''.join(small_to_big.get(char, char) for char in text)
        converted_text = ''.join(self.kana_variants.get(char, [char])[-1] for char in text)
        return converted_text
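As a quick illustration of the normalization above, here is a minimal standalone sketch; the kana_variants subset and the sample string are assumptions, and the last element of each variant list is taken to be the big form:

# Assumed subset: both the small and the big form of a kana key the same [small, big] pair.
kana_variants = {
    'ょ': ['ょ', 'よ'], 'よ': ['ょ', 'よ'],
    'っ': ['っ', 'つ'], 'つ': ['っ', 'つ'],
}

def normalize(text):
    # Last element of the variant list is the big form; unknown characters pass through.
    return ''.join(kana_variants.get(char, [char])[-1] for char in text)

print(normalize('ちょっと'))  # -> 'ちよつと'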
    def __call__(self, text):
        orig_text = self.segmenter.segment(text)
        orig_text_filtered = []
        for block in orig_text:
            block_filtered = self.regex.findall(block)
            if self.language == 'ja':
                block_filtered = self.convert_small_kana_to_big(block_filtered)
            if block_filtered:
                orig_text_filtered.append(''.join(block_filtered))
            else:
                orig_text_filtered.append(None)
        if self.last_result[1] == engine_index:
            last_text = self.last_result[0]
        else:
            last_text = []
        new_blocks = []
        for idx, block in enumerate(orig_text):
            if orig_text_filtered[idx] and (orig_text_filtered[idx] not in last_text):
                new_blocks.append(block)
        final_blocks = []
        for block in new_blocks:
            # This only looks at language IF language is ja or zh, otherwise it keeps all text
            if self.language not in ['ja', 'zh'] or self.classify(block)[0] in ['ja', 'zh'] or block == "\n":
                final_blocks.append(block)
        text = '\n'.join(final_blocks)
        self.last_result = (orig_text_filtered, engine_index)
        return text
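The dedup step in __call__ can be pictured in isolation as follows; this minimal sketch leaves out pysbd segmentation, the regex filter and the langid check, and the sample blocks are made up:

previous_filtered = ['こんにちは', 'はい']          # filtered blocks kept from the last call
current_blocks = ['こんにちは', '新しいセリフです']  # blocks produced by the current call

# Only blocks whose filtered form was not seen in the previous result survive.
kept = [block for block in current_blocks if block not in previous_filtered]
print(kept)  # ['新しいセリフです']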
class ScreenshotThread(threading.Thread):
    def __init__(self, screen_capture_on_combo):
@@ -711,6 +685,7 @@ class OutputResult:
        self.filtering = TextFiltering() if init_filtering else None
        self.cj_regex = re.compile(r'[\u3041-\u3096\u30A1-\u30FA\u4E00-\u9FFF]')
        self.previous_result = None
        self.previous_result_text = None
    def _coordinate_format_to_string(self, result_data):
        full_text_parts = []
@@ -745,7 +720,7 @@ class OutputResult:
                text_parts.append(' ')
        return ''.join(text_parts)
    def _compare_text(self, current_text, prev_text, threshold=80):
    def _compare_text(self, current_text, prev_text, threshold=82):
        if current_text in prev_text:
            return True
        if len(prev_text) > len(current_text):
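The rest of _compare_text lies outside this hunk, so the scorer below is only a stand-in (difflib is used here purely for illustration, not necessarily what the module uses); it shows how the raised threshold of 82 treats a near-match that scores 80:

from difflib import SequenceMatcher

def compare_text(current_text, prev_text, threshold=82):
    # Exact containment is always a match, as in the hunk above.
    if current_text in prev_text:
        return True
    # Stand-in similarity score on a 0-100 scale; the real scoring code is not shown here.
    score = SequenceMatcher(None, current_text, prev_text).ratio() * 100
    return score >= threshold

print(compare_text('こんにちわ', 'こんにちは'))  # scores 80.0 with this scorer -> False at threshold 82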
@@ -776,27 +751,155 @@ class OutputResult:
        for p in previous_result.paragraphs:
            previous_lines.extend(p.lines)
        all_previous_text = ''
        all_previous_text_spliced = []
        for prev_line in previous_lines:
            prev_text = self._get_line_text(prev_line)
            prev_text = ''.join(self.filtering.regex.findall(prev_text))
            if self.filtering.language == 'ja':
                prev_text = self.filtering.convert_small_kana_to_big(prev_text)
            all_previous_text += prev_text
            all_previous_text_spliced.append(prev_text)
        all_previous_text = ''.join(all_previous_text_spliced)
        logger.debug(f"Previous text: '{all_previous_text_spliced}'")
        first = True
        for current_line in current_lines:
            current_text = self._get_line_text(current_line)
            current_text = ''.join(self.filtering.regex.findall(current_text))
            if not current_text:
                continue
            if self.filtering.language == 'ja':
                current_text = self.filtering.convert_small_kana_to_big(current_text)
            # For the first line, check if it contains the end of previous text
            if first and all_previous_text:
                overlap = self._find_overlap(all_previous_text, current_text)
                if overlap and len(current_text) > len(overlap):
                    logger.debug(f"Found overlap: '{overlap}'")
                    changed_lines.append(current_line)
                    first = False
                    continue
            if len(current_text) < 3:
                text_similar = current_text in all_previous_text_spliced
            else:
                text_similar = self._compare_text(current_text, all_previous_text)
            logger.debug(f"Current line: '{current_text}' Similar: '{text_similar}'")
            if not text_similar:
                changed_lines.append(current_line)
            if len(current_text) >= 3:
                first = False
        return changed_lines
    def _find_overlap(self, previous_text, current_text):
        """Find the overlapping portion between the end of previous_text and start of current_text."""
        # Try different overlap lengths, starting from the maximum possible
        min_overlap_length = 3  # Minimum overlap to consider meaningful
        max_overlap_length = min(len(previous_text), len(current_text))
        for overlap_length in range(max_overlap_length, min_overlap_length - 1, -1):
            previous_end = previous_text[-overlap_length:]
            current_start = current_text[:overlap_length]
            if previous_end == current_start:
                return previous_end
        return None
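Rewritten standalone for illustration (sample strings assumed), the scan above returns the longest suffix of the previous text that also starts the new line, provided it is at least three characters long:

def find_overlap(previous_text, current_text, min_overlap_length=3):
    # Same suffix/prefix scan as above, longest candidate first.
    for length in range(min(len(previous_text), len(current_text)), min_overlap_length - 1, -1):
        if previous_text[-length:] == current_text[:length]:
            return previous_text[-length:]
    return None

print(find_overlap('今日はいい天気です', 'いい天気ですね'))  # -> 'いい天気です'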
    def _cut_at_overlap(self, current_line, overlap):
        pattern_parts = []
        for char in overlap:
            # Check if character is kana and has small/big variants
            if char in self.filtering.kana_variants:
                # Use character class that matches both small and big variants
                variants = self.filtering.kana_variants[char]
                pattern_parts.append(f'[{"".join(variants)}]')
            else:
                # Escape regex special characters for regular characters
                pattern_parts.append(re.escape(char))
        # Create pattern: overlap characters with any characters (0 or more) between them
        overlap_pattern = r'.*?'.join(pattern_parts)
        # Also allow any characters at the beginning
        full_pattern = r'^.*?' + overlap_pattern
        logger.debug(f"Cut regex: '{full_pattern}'")
        # Find the match
        match = re.search(full_pattern, current_line)
        if match:
            # Cut after the matched overlapping portion
            cut_position = match.end()
            return current_line[cut_position:]
        return current_line
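A standalone sketch of the tolerant cut above: the overlap is computed on filtered, big-kana text, so the pattern has to locate it again in the raw line, where small kana and punctuation may still be present. The kana_variants subset and the sample strings are assumptions:

import re

kana_variants = {'よ': ['ょ', 'よ'], 'つ': ['っ', 'つ']}  # assumed subset

overlap = 'ちよつと'           # normalized overlap found against the previous text
raw_line = 'ちょっと、待って'   # raw OCR line to cut

# Kana in the overlap match either variant; everything else is matched literally,
# with lazy gaps allowed between the overlap characters and at the start of the line.
parts = [f'[{"".join(kana_variants[c])}]' if c in kana_variants else re.escape(c)
         for c in overlap]
pattern = r'^.*?' + r'.*?'.join(parts)
match = re.search(pattern, raw_line)
print(raw_line[match.end():])  # -> '、待って'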
    def _find_changed_lines_text(self, current_result):
        # Split both results into lines
        current_lines = current_result.split('\n')
        # If no previous result, all lines are considered changed
        if self.previous_result_text is None:
            self.previous_result_text = current_lines[-10:]  # Keep only last 10 lines
            return current_result
        changed_lines = []
        all_previous_text_spliced = []
        for prev_line in self.previous_result_text:
            prev_text = ''.join(self.filtering.regex.findall(prev_line))
            if self.filtering.language == 'ja':
                prev_text = self.filtering.convert_small_kana_to_big(prev_text)
            all_previous_text_spliced.append(prev_text)
        all_previous_text = ''.join(all_previous_text_spliced)
        logger.debug(f"Previous text: '{all_previous_text_spliced}'")
        first = True
        # Check each current line against the combined previous text
        for current_line in current_lines:
            current_text = ''.join(self.filtering.regex.findall(current_line))
            if not current_text:
                continue
            if self.filtering.language == 'ja':
                current_text = self.filtering.convert_small_kana_to_big(current_text)
            # For the first line, check if it contains the end of previous text
            if first and all_previous_text:
                overlap = self._find_overlap(all_previous_text, current_text)
                if overlap and len(current_text) > len(overlap):
                    logger.debug(f"Found overlap: '{overlap}'")
                    # Cut the current_line to remove the overlapping part
                    current_line = self._cut_at_overlap(current_line, overlap)
                    logger.debug(f"After cutting: '{current_line}'")
                    changed_lines.append(current_line)
                    first = False
                    continue
            if len(current_text) < 3:
                text_similar = current_text in all_previous_text_spliced
            else:
                text_similar = self._compare_text(current_text, all_previous_text)
            logger.debug(f"Current line: '{current_text}' Similar: '{text_similar}'")
            if not text_similar:
                changed_lines.append(current_line)
            if len(current_text) >= 3:
                first = False
        # Update cache with current lines, keeping only the last 10
        self.previous_result_text.extend(current_lines)
        self.previous_result_text = self.previous_result_text[-10:]
        return '\n'.join(changed_lines)
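A rough usage sketch of the method above against a stubbed filtering object; FakeFiltering, the sample frames and the expected output are assumptions, and the real _compare_text body is not shown in this diff:

import re

class FakeFiltering:
    language = 'ja'
    regex = re.compile(r'[\u3041-\u3096\u30A1-\u30FA\u4E00-\u9FFF]')
    kana_variants = {}
    def convert_small_kana_to_big(self, text):
        return text

out = OutputResult.__new__(OutputResult)  # bypass __init__ for this sketch
out.filtering = FakeFiltering()
out.previous_result_text = None

print(out._find_changed_lines_text('今日はいい天気です'))
# first frame: cached and returned as-is
print(out._find_changed_lines_text('いい天気ですね\n散歩に行こう'))
# expected: only text not already on screen survives, roughly 'ね\n散歩に行こう'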
    def _create_changed_regions_image(self, pil_image, changed_lines, margin=5):
        img_width, img_height = pil_image.size
@@ -896,7 +999,7 @@ class OutputResult:
            if output_format == 'json':
                logger.warning(f"Engine '{engine_instance.name}' does not support JSON output. Falling back to text.")
            if filter_text:
                text_to_process = self.filtering(result_data_text)
                text_to_process = self._find_changed_lines_text(result_data_text)
                output_string = self._post_process(text_to_process, True)
            else:
                output_string = self._post_process(result_data_text, False)
@@ -1021,7 +1124,7 @@ def on_screenshot_combo():
def run():
    logger.configure(handlers=[{'sink': sys.stderr, 'format': config.get_general('logger_format')}])
    logger.configure(handlers=[{'sink': sys.stderr, 'format': config.get_general('logger_format'), 'level': 'INFO'}])
    if config.has_config:
        logger.info('Parsed config file')