From 1c2e844d7a7a84e6d4a1da2703dfe3f3a00df22a Mon Sep 17 00:00:00 2001
From: Beangate
Date: Mon, 28 Jul 2025 12:07:15 -0400
Subject: [PATCH 1/3] Add TextFiltering Support for more languages

---
Review notes (text between "---" and the diffstat is ignored by git am):
- config.py: `parser.add.argument` raised AttributeError at import time;
  it is now `parser.add_argument`.
- run.py: TextFiltering.__call__ reads self.language, which __init__ never
  assigned; it is now stored.
- run.py: Segmenter() is wrapped so that a language code pysbd has no rules
  for falls back to English segmentation instead of raising ValueError
  (believed to affect ko/he/th and most "other" codes; verify against the
  installed pysbd).
- Blob ids on the `index` lines come from the original series and are stale
  for the files touched above; regenerate with `git format-patch` before
  relying on `git am -3`.

 owocr/config.py  |  5 ++++-
 owocr/run.py     | 43 ++++++++++++++++++++++++++++++++++++-------
 owocr_config.ini |  4 ++++
 3 files changed, 44 insertions(+), 8 deletions(-)

diff --git a/owocr/config.py b/owocr/config.py
index 018e05f..c99a4d6 100644
--- a/owocr/config.py
+++ b/owocr/config.py
@@ -48,6 +48,8 @@ parser.add_argument('-sw', '--screen_capture_only_active_windows', type=str2bool
                     help="When reading with screen capture and screen_capture_area is a window name, only target the window while it's active.")
 parser.add_argument('-sc', '--screen_capture_combo', type=str, default=argparse.SUPPRESS,
                     help='When reading with screen capture, combo to wait on for taking a screenshot instead of using the delay. As an example: "<ctrl>+<shift>+s". The list of keys can be found here: https://pynput.readthedocs.io/en/latest/keyboard.html#pynput.keyboard.Key')
+parser.add_argument('-l', '--language', type=str, default=argparse.SUPPRESS,
+                    help='Two letter language code for filtering screencapture OCR results. Ex. "ja" for Japanese, "zh" for Chinese, "ko" for Korean, "ar" for Arabic, "ru" for Russian, "el" for Greek, "he" for Hebrew, "th" for Thai. Any other value will use Latin Extended (for most European languages and English).')
 
 class Config:
     has_config = False
@@ -76,7 +78,8 @@ class Config:
         'screen_capture_delay_secs': 3,
         'screen_capture_only_active_windows': True,
         'screen_capture_combo': '',
-        'screen_capture_old_macos_api': False
+        'screen_capture_old_macos_api': False,
+        'language': 'ja'
     }
 
     def __parse(self, value):
diff --git a/owocr/run.py b/owocr/run.py
index 6e8c219..e8823bf 100644
--- a/owocr/run.py
+++ b/owocr/run.py
@@ -299,10 +299,16 @@ class RequestHandler(socketserver.BaseRequestHandler):
 class TextFiltering:
     accurate_filtering = False
 
-    def __init__(self):
+    def __init__(self, lang='ja'):
         from pysbd import Segmenter
-        self.segmenter = Segmenter(language='ja', clean=True)
-        self.kana_kanji_regex = re.compile(r'[\u3041-\u3096\u30A1-\u30FA\u4E00-\u9FFF]')
+        self.language = lang  # read by __call__ when filtering detected blocks
+        try:
+            self.segmenter = Segmenter(language=lang, clean=True)
+        except ValueError:
+            # pysbd has no segmentation rules for this code; use its English rules
+            self.segmenter = Segmenter(language='en', clean=True)
+        self.regex = self.get_regex(lang)
+
         try:
             from transformers import pipeline, AutoTokenizer
             import torch
@@ -325,13 +331,35 @@ class TextFiltering:
         except:
             import langid
             self.classify = langid.classify
+
+    def get_regex(self, lang):
+        if lang == 'ja':
+            return re.compile(r'[\u3041-\u3096\u30A1-\u30FA\u4E00-\u9FFF]')
+        elif lang == 'zh':
+            return re.compile(r'[\u4E00-\u9FFF]')
+        elif lang == 'ko':
+            return re.compile(r'[\uAC00-\uD7AF]')
+        elif lang == 'ar':
+            return re.compile(r'[\u0600-\u06FF\u0750-\u077F\u08A0-\u08FF\uFB50-\uFDFF\uFE70-\uFEFF]')
+        elif lang == 'ru':
+            return re.compile(r'[\u0400-\u04FF\u0500-\u052F\u2DE0-\u2DFF\uA640-\uA69F\u1C80-\u1C8F]')
+        elif lang == 'el':
+            return re.compile(r'[\u0370-\u03FF\u1F00-\u1FFF]')
+        elif lang == 'he':
+            return re.compile(r'[\u0590-\u05FF\uFB1D-\uFB4F]')
+        elif lang == 'th':
+            return re.compile(r'[\u0E00-\u0E7F]')
+        else:
+            # Latin Extended regex for many European languages/English
+            return re.compile(
+                r'[a-zA-Z\u00C0-\u00FF\u0100-\u017F\u0180-\u024F\u0250-\u02AF\u1D00-\u1D7F\u1D80-\u1DBF\u1E00-\u1EFF\u2C60-\u2C7F\uA720-\uA7FF\uAB30-\uAB6F]')
 
     def __call__(self, text, last_result):
         orig_text = self.segmenter.segment(text)
-
         orig_text_filtered = []
         for block in orig_text:
-            block_filtered = self.kana_kanji_regex.findall(block)
+            block_filtered = self.regex.findall(block)
+
             if block_filtered:
                 orig_text_filtered.append(''.join(block_filtered))
             else:
@@ -352,12 +380,13 @@ class TextFiltering:
             detection_results = self.pipe(new_blocks, top_k=3, truncation=True)
             for idx, block in enumerate(new_blocks):
                 for result in detection_results[idx]:
-                    if result['label'] == 'ja':
+                    if result['label'] == self.language:
                         final_blocks.append(block)
                         break
         else:
             for block in new_blocks:
-                if self.classify(block)[0] in ('ja', 'zh'):
+                # This only looks at language IF language is ja or zh, otherwise it keeps all text
+                if self.language not in ["ja", "zh"] or self.classify(block)[0] in ['ja', 'zh'] or block == "\n":
                     final_blocks.append(block)
 
         text = '\n'.join(final_blocks)
diff --git a/owocr_config.ini b/owocr_config.ini
index 386ca1a..f12869b 100644
--- a/owocr_config.ini
+++ b/owocr_config.ini
@@ -29,6 +29,10 @@
 ;note: this specifies a combo to wait on for taking a screenshot instead of using the delay. As an example: <ctrl>+<shift>+s. The list of keys can be found here: https://pynput.readthedocs.io/en/latest/keyboard.html#pynput.keyboard.Key
 ;screen_capture_combo = <ctrl>+<shift>+s
 ;screen_capture_old_macos_api = False
+;language = ja
+;language = zh
+;note: This specifies the language to use for text filtering while using "screencapture". Valid values: ja: (Japanese) zh: (Chinese) ko: (Korean) ar: (Arabic) ru: (Russian) el: (Greek) he: (Hebrew) th: (Thai)
+;Any other value will use Latin Extended (for most European languages and English).
 ;[winrtocr]
 ;url = http://aaa.xxx.yyy.zzz:8000
 ;[oneocr]

From c458b960321d9448bd7220fd2d8559ff5cb6cc11 Mon Sep 17 00:00:00 2001
From: Beangate
Date: Mon, 28 Jul 2025 12:10:01 -0400
Subject: [PATCH 2/3] Actually pass the language to TextFiltering..

---
 owocr/run.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/owocr/run.py b/owocr/run.py
index e8823bf..ce997a3 100644
--- a/owocr/run.py
+++ b/owocr/run.py
@@ -901,6 +901,7 @@ def run():
     terminated = False
     paused = config.get_general('pause_at_startup')
     auto_pause = config.get_general('auto_pause')
+    language = config.get_general('language')
     clipboard_thread = None
     websocket_server_thread = None
     screenshot_thread = None
@@ -944,7 +945,7 @@ def run():
         screenshot_event = threading.Event()
         screenshot_thread = ScreenshotThread(screen_capture_on_combo)
         screenshot_thread.start()
-        filtering = TextFiltering()
+        filtering = TextFiltering(lang=language)
         read_from_readable.append('screen capture')
     if 'websocket' in (read_from, read_from_secondary):
         read_from_readable.append('websocket')

From fdca1df137c3da8863f0b976de5ede5b89661de4 Mon Sep 17 00:00:00 2001
From: Beangate
Date: Mon, 28 Jul 2025 12:13:00 -0400
Subject: [PATCH 3/3] use "language" instead of "lang"

---
Review note (ignored by git am): the first hunk also renames the
self.language assignment and the guarded Segmenter() call introduced by the
review fixes in patch 1/3.

 owocr/run.py | 28 ++++++++++++++--------------
 1 file changed, 14 insertions(+), 14 deletions(-)

diff --git a/owocr/run.py b/owocr/run.py
index ce997a3..cf003c1 100644
--- a/owocr/run.py
+++ b/owocr/run.py
@@ -299,15 +299,15 @@ class RequestHandler(socketserver.BaseRequestHandler):
 class TextFiltering:
     accurate_filtering = False
 
-    def __init__(self, lang='ja'):
+    def __init__(self, language='ja'):
         from pysbd import Segmenter
-        self.language = lang  # read by __call__ when filtering detected blocks
+        self.language = language  # read by __call__ when filtering detected blocks
         try:
-            self.segmenter = Segmenter(language=lang, clean=True)
+            self.segmenter = Segmenter(language=language, clean=True)
         except ValueError:
             # pysbd has no segmentation rules for this code; use its English rules
             self.segmenter = Segmenter(language='en', clean=True)
-        self.regex = self.get_regex(lang)
+        self.regex = self.get_regex(language)
 
         try:
             from transformers import pipeline, AutoTokenizer
@@ -332,22 +332,22 @@ class TextFiltering:
             import langid
             self.classify = langid.classify
 
-    def get_regex(self, lang):
-        if lang == 'ja':
+    def get_regex(self, language):
+        if language == 'ja':
             return re.compile(r'[\u3041-\u3096\u30A1-\u30FA\u4E00-\u9FFF]')
-        elif lang == 'zh':
+        elif language == 'zh':
             return re.compile(r'[\u4E00-\u9FFF]')
-        elif lang == 'ko':
+        elif language == 'ko':
             return re.compile(r'[\uAC00-\uD7AF]')
-        elif lang == 'ar':
+        elif language == 'ar':
             return re.compile(r'[\u0600-\u06FF\u0750-\u077F\u08A0-\u08FF\uFB50-\uFDFF\uFE70-\uFEFF]')
-        elif lang == 'ru':
+        elif language == 'ru':
             return re.compile(r'[\u0400-\u04FF\u0500-\u052F\u2DE0-\u2DFF\uA640-\uA69F\u1C80-\u1C8F]')
-        elif lang == 'el':
+        elif language == 'el':
             return re.compile(r'[\u0370-\u03FF\u1F00-\u1FFF]')
-        elif lang == 'he':
+        elif language == 'he':
             return re.compile(r'[\u0590-\u05FF\uFB1D-\uFB4F]')
-        elif lang == 'th':
+        elif language == 'th':
             return re.compile(r'[\u0E00-\u0E7F]')
         else:
             # Latin Extended regex for many European languages/English
@@ -945,7 +945,7 @@ def run():
         screenshot_event = threading.Event()
         screenshot_thread = ScreenshotThread(screen_capture_on_combo)
         screenshot_thread.start()
-        filtering = TextFiltering(lang=language)
+        filtering = TextFiltering(language=language)
         read_from_readable.append('screen capture')
     if 'websocket' in (read_from, read_from_secondary):
         read_from_readable.append('websocket')