Add TextFiltering Support for more languages

2025-07-28 12:07:15 -04:00
parent 01f63cd402
commit 1c2e844d7a
3 changed files with 39 additions and 8 deletions
@@ -48,6 +48,8 @@ parser.add_argument('-sw', '--screen_capture_only_active_windows', type=str2bool
                    help="When reading with screen capture and screen_capture_area is a window name, only target the window while it's active.")
 parser.add_argument('-sc', '--screen_capture_combo', type=str, default=argparse.SUPPRESS,
                    help='When reading with screen capture, combo to wait on for taking a screenshot instead of using the delay. As an example: "<ctrl>+<shift>+s". The list of keys can be found here: https://pynput.readthedocs.io/en/latest/keyboard.html#pynput.keyboard.Key')
 parser.add.argument('-l', '--language', type=str, default=argparse.SUPPRESS,
                    help='Two letter language code for filtering screencapture OCR results. Ex. "ja" for Japanese, "zh" for Chinese, "ko" for Korean, "ar" for Arabic, "ru" for Russian, "el" for Greek, "he" for Hebrew, "th" for Thai. Any other value will use Latin Extended (for most European languages and English).')
 class Config:
    has_config = False
@@ -76,7 +78,8 @@ class Config:
        'screen_capture_delay_secs': 3,
        'screen_capture_only_active_windows': True,
        'screen_capture_combo': '',
-        'screen_capture_old_macos_api': False
+        'screen_capture_old_macos_api': False,
        'language': 'ja'
    }
    def __parse(self, value):
@@ -299,10 +299,11 @@ class RequestHandler(socketserver.BaseRequestHandler):
 class TextFiltering:
    accurate_filtering = False
-    def __init__(self):
+    def __init__(self, lang='ja'):
        from pysbd import Segmenter
-        self.segmenter = Segmenter(language='ja', clean=True)
+        self.segmenter = Segmenter(language=lang, clean=True)
-        self.kana_kanji_regex = re.compile(r'[\u3041-\u3096\u30A1-\u30FA\u4E00-\u9FFF]')
+        self.regex = self.get_regex(lang)
        try:
            from transformers import pipeline, AutoTokenizer
            import torch
@@ -326,12 +327,34 @@ class TextFiltering:
            import langid
            self.classify = langid.classify
    def get_regex(self, lang):
        if lang == 'ja':
            return re.compile(r'[\u3041-\u3096\u30A1-\u30FA\u4E00-\u9FFF]')
        elif lang == 'zh':
            return re.compile(r'[\u4E00-\u9FFF]')
        elif lang == 'ko':
            return re.compile(r'[\uAC00-\uD7AF]')
        elif lang == 'ar':
            return re.compile(r'[\u0600-\u06FF\u0750-\u077F\u08A0-\u08FF\uFB50-\uFDFF\uFE70-\uFEFF]')
        elif lang == 'ru':
            return re.compile(r'[\u0400-\u04FF\u0500-\u052F\u2DE0-\u2DFF\uA640-\uA69F\u1C80-\u1C8F]')
        elif lang == 'el':
            return re.compile(r'[\u0370-\u03FF\u1F00-\u1FFF]')
        elif lang == 'he':
            return re.compile(r'[\u0590-\u05FF\uFB1D-\uFB4F]')
        elif lang == 'th':
            return re.compile(r'[\u0E00-\u0E7F]')
        else:
            # Latin Extended regex for many European languages/English
            return re.compile(
            r'[a-zA-Z\u00C0-\u00FF\u0100-\u017F\u0180-\u024F\u0250-\u02AF\u1D00-\u1D7F\u1D80-\u1DBF\u1E00-\u1EFF\u2C60-\u2C7F\uA720-\uA7FF\uAB30-\uAB6F]')
    def __call__(self, text, last_result):
        orig_text = self.segmenter.segment(text)
        orig_text_filtered = []
        for block in orig_text:
-            block_filtered = self.kana_kanji_regex.findall(block)
+            block_filtered = self.regex.findall(block)
            if block_filtered:
                orig_text_filtered.append(''.join(block_filtered))
            else:
@@ -352,12 +375,13 @@ class TextFiltering:
            detection_results = self.pipe(new_blocks, top_k=3, truncation=True)
            for idx, block in enumerate(new_blocks):
                for result in detection_results[idx]:
-                    if result['label'] == 'ja':
+                    if result['label'] == self.language:
                        final_blocks.append(block)
                        break
        else:
            for block in new_blocks:
-                if self.classify(block)[0] in ('ja', 'zh'):
+                # This only looks at language IF language is ja or zh, otherwise it keeps all text
                if self.language not in ["ja", "zh"] or self.classify(block)[0] in ['ja', 'zh'] or block == "\n":
                    final_blocks.append(block)
        text = '\n'.join(final_blocks)
@@ -29,6 +29,10 @@
 ;note: this specifies a combo to wait on for taking a screenshot instead of using the delay. As an example: <ctrl>+<shift>+s. The list of keys can be found here: https://pynput.readthedocs.io/en/latest/keyboard.html#pynput.keyboard.Key
 ;screen_capture_combo = <ctrl>+<shift>+s
 ;screen_capture_old_macos_api = False
 ;language = ja
 ;language = zh
 ;note: This specifies the language to use for text filtering while using "screencapture". Valid values: ja: (Japanese) zh: (Chinese) ko: (Korean) ar: (Arabic) ru: (Russian) el: (Greek) he: (Hebrew) th: (Thai)
 ;Any other value will use Latin Extended (for most European languages and English).
 ;[winrtocr]
 ;url = http://aaa.xxx.yyy.zzz:8000
 ;[oneocr]