Add manual regex filter for screen capture

2025-10-23 14:53:19 +02:00
parent 07ce077dec
commit e5aab4e2a5
3 changed files with 25 additions and 5 deletions
--- a/owocr/config.py
+++ b/owocr/config.py
@@ -49,9 +49,11 @@ parser.add_argument('-sd', '--screen_capture_delay_secs', type=float, default=ar
 parser.add_argument('-sw', '--screen_capture_only_active_windows', type=str2bool, nargs='?', const=True, default=argparse.SUPPRESS,
                    help="When reading with screen capture and screen_capture_area is a window name, only target the window while it's active.")
 parser.add_argument('-sf', '--screen_capture_frame_stabilization', type=float, default=argparse.SUPPRESS,
-                    help="When reading with screen capture, delay to wait until text is stable before processing it. -1 waits for two OCR results to be the same. 0 to disable.")
+                    help='When reading with screen capture, delay to wait until text is stable before processing it. -1 waits for two OCR results to be the same. 0 to disable.')
 parser.add_argument('-sl', '--screen_capture_line_recovery', type=str2bool, nargs='?', const=True, default=argparse.SUPPRESS,
-                    help="When reading with screen capture and frame stabilization is on, try to recover missed lines from unstable frames. Can lead to increased glitches.")
+                    help='When reading with screen capture and frame stabilization is on, try to recover missed lines from unstable frames. Can lead to increased glitches.')
+parser.add_argument('-sr', '--screen_capture_regex_filter', type=str, default=argparse.SUPPRESS,
+                    help='When reading with screen capture, regex to filter unwanted text from the output. Example value: "▶|♥|・" to remove either of those characters.')
 parser.add_argument('-sc', '--screen_capture_combo', type=str, default=argparse.SUPPRESS,
                    help='When reading with screen capture, combo to wait on for taking a screenshot. If periodic screenshots are also enabled, any screenshot taken this way bypasses the filtering. Example value: "<ctrl>+<shift>+s". The list of keys can be found here: https://pynput.readthedocs.io/en/latest/keyboard.html#pynput.keyboard.Key')
 parser.add_argument('-scc', '--coordinate_selector_combo', type=str, default=argparse.SUPPRESS,
@@ -59,11 +61,11 @@ parser.add_argument('-scc', '--coordinate_selector_combo', type=str, default=arg
 parser.add_argument('-l', '--language', type=str, default=argparse.SUPPRESS,
                    help='Two letter language code to use for some engines and for filtering screen capture OCR results. Ex. "ja" for Japanese, "zh" for Chinese, "ko" for Korean, "ar" for Arabic, "ru" for Russian, "el" for Greek, "he" for Hebrew, "th" for Thai. Any other value will use Latin Extended (for most European languages and English).')
 parser.add_argument('-j', '--join_lines', type=str2bool, nargs='?', const=True, default=argparse.SUPPRESS,
-                    help="Display lines in the text output without a space between them.")
+                    help='Display lines in the text output without a space between them.')
 parser.add_argument('-jp', '--join_paragraphs', type=str2bool, nargs='?', const=True, default=argparse.SUPPRESS,
-                    help="Display paragraphs in the text output without a space between them.")
+                    help='Display paragraphs in the text output without a space between them.')
 parser.add_argument('-f', '--furigana_filter', type=str2bool, nargs='?', const=True, default=argparse.SUPPRESS,
-                    help="Try to filter furigana lines for Japanese.")
+                    help='Try to filter furigana lines for Japanese.')
 parser.add_argument('-of', '--output_format', type=str, default=argparse.SUPPRESS,
                    help='The output format for OCR results. Can be "text" (default) or "json" (to include coordinates).')
 parser.add_argument('-wp', '--websocket_port', type=int, default=argparse.SUPPRESS,
@@ -103,6 +105,7 @@ class Config:
        'screen_capture_only_active_windows': True,
        'screen_capture_frame_stabilization': -1,
        'screen_capture_line_recovery': True,
+        'screen_capture_regex_filter': '',
        'join_lines': False,
        'furigana_filter': True,
        'screen_capture_combo': '',
--- a/owocr/run.py
+++ b/owocr/run.py
@@ -319,6 +319,7 @@ class TextFiltering:
        self.cj_regex = re.compile(r'[\u3041-\u3096\u30A1-\u30FA\u4E01-\u9FFF]')
        self.kanji_regex = re.compile(r'[\u4E00-\u9FFF]')
        self.regex = self._get_regex()
+        self.manual_regex_filter = self._get_manual_regex_filter()
        self.kana_variants = {
            'ぁ': ['ぁ', 'あ'], 'あ': ['ぁ', 'あ'],
            'ぃ': ['ぃ', 'い'], 'い': ['ぃ', 'い'],
@@ -364,6 +365,15 @@ class TextFiltering:
            return re.compile(
            r'[a-zA-Z\u00C0-\u00FF\u0100-\u017F\u0180-\u024F\u0250-\u02AF\u1D00-\u1D7F\u1D80-\u1DBF\u1E00-\u1EFF\u2C60-\u2C7F\uA720-\uA7FF\uAB30-\uAB6F]')

+    def _get_manual_regex_filter(self):  # Build the compiled user regex filter from the 'screen_capture_regex_filter' setting.
+        manual_regex_filter = config.get_general('screen_capture_regex_filter').strip()  # surrounding whitespace is dropped
+        if manual_regex_filter:  # an empty value (the configured default) means no manual filter
+            try:
+                return re.compile(manual_regex_filter)
+            except re.error as e:
+                logger.warning(f'Invalid screen capture regex filter: {e}')  # bad pattern: warn and fall through
+        return None  # no usable pattern; the call site skips substitution when this is falsy
+
    def _convert_small_kana_to_big(self, text):
        converted_text = ''.join(self.kana_variants.get(char, [char])[-1] for char in text)
        return converted_text
@@ -625,6 +635,9 @@ class TextFiltering:
                        logger.opt(colors=True).debug(f"<magenta>Found overlap: '{overlap}'</magenta>")
                        changed_line = self._cut_at_overlap(changed_line, overlap)
                        logger.opt(colors=True).debug(f"<magenta>After cutting: '{changed_line}'</magenta>")
+
+            if self.manual_regex_filter:
+                changed_line = self.manual_regex_filter.sub('', changed_line)
            changed_lines.append(changed_line)
            changed_lines_count += 1

--- a/owocr_config.ini
+++ b/owocr_config.ini
@@ -91,6 +91,10 @@
 ;recover missed lines from unstable frames. Can lead to increased glitches.
 ;screen_capture_line_recovery = True

+;When reading with screen capture, regex to filter unwanted text from the output.
+;Example value: "▶|♥|・" to remove any of those characters.
+;screen_capture_regex_filter =
+
 ;Display lines in the text output without a space between them.
 ;join_lines = False