Read the language setting inside TextFiltering instead of passing it in from run() (nitpick from a while back)

This commit is contained in:
AuroraWright
2025-09-19 09:42:09 +02:00
parent 6552c884b7
commit 094559c68a

View File

@@ -302,10 +302,11 @@ class RequestHandler(socketserver.BaseRequestHandler):
class TextFiltering: class TextFiltering:
accurate_filtering = False accurate_filtering = False
def __init__(self, language='ja'): def __init__(self):
from pysbd import Segmenter from pysbd import Segmenter
self.segmenter = Segmenter(language=language, clean=True) self.language = config.get_general('language')
self.regex = self.get_regex(language) self.segmenter = Segmenter(language=self.language, clean=True)
self.regex = self.get_regex()
try: try:
from transformers import pipeline, AutoTokenizer from transformers import pipeline, AutoTokenizer
@@ -330,22 +331,22 @@ class TextFiltering:
import langid import langid
self.classify = langid.classify self.classify = langid.classify
def get_regex(self, language): def get_regex(self):
if language == 'ja': if self.language == 'ja':
return re.compile(r'[\u3041-\u3096\u30A1-\u30FA\u4E00-\u9FFF]') return re.compile(r'[\u3041-\u3096\u30A1-\u30FA\u4E00-\u9FFF]')
elif language == 'zh': elif self.language == 'zh':
return re.compile(r'[\u4E00-\u9FFF]') return re.compile(r'[\u4E00-\u9FFF]')
elif language == 'ko': elif self.language == 'ko':
return re.compile(r'[\uAC00-\uD7AF]') return re.compile(r'[\uAC00-\uD7AF]')
elif language == 'ar': elif self.language == 'ar':
return re.compile(r'[\u0600-\u06FF\u0750-\u077F\u08A0-\u08FF\uFB50-\uFDFF\uFE70-\uFEFF]') return re.compile(r'[\u0600-\u06FF\u0750-\u077F\u08A0-\u08FF\uFB50-\uFDFF\uFE70-\uFEFF]')
elif language == 'ru': elif self.language == 'ru':
return re.compile(r'[\u0400-\u04FF\u0500-\u052F\u2DE0-\u2DFF\uA640-\uA69F\u1C80-\u1C8F]') return re.compile(r'[\u0400-\u04FF\u0500-\u052F\u2DE0-\u2DFF\uA640-\uA69F\u1C80-\u1C8F]')
elif language == 'el': elif self.language == 'el':
return re.compile(r'[\u0370-\u03FF\u1F00-\u1FFF]') return re.compile(r'[\u0370-\u03FF\u1F00-\u1FFF]')
elif language == 'he': elif self.language == 'he':
return re.compile(r'[\u0590-\u05FF\uFB1D-\uFB4F]') return re.compile(r'[\u0590-\u05FF\uFB1D-\uFB4F]')
elif language == 'th': elif self.language == 'th':
return re.compile(r'[\u0E00-\u0E7F]') return re.compile(r'[\u0E00-\u0E7F]')
else: else:
# Latin Extended regex for many European languages/English # Latin Extended regex for many European languages/English
@@ -937,7 +938,6 @@ def run():
terminated = False terminated = False
paused = config.get_general('pause_at_startup') paused = config.get_general('pause_at_startup')
auto_pause = config.get_general('auto_pause') auto_pause = config.get_general('auto_pause')
language = config.get_general('language')
output_format = config.get_general('output_format') output_format = config.get_general('output_format')
clipboard_thread = None clipboard_thread = None
websocket_server_thread = None websocket_server_thread = None
@@ -982,7 +982,7 @@ def run():
screenshot_event = threading.Event() screenshot_event = threading.Event()
screenshot_thread = ScreenshotThread(screen_capture_on_combo) screenshot_thread = ScreenshotThread(screen_capture_on_combo)
screenshot_thread.start() screenshot_thread.start()
filtering = TextFiltering(language=language) filtering = TextFiltering()
read_from_readable.append('screen capture') read_from_readable.append('screen capture')
if 'websocket' in (read_from, read_from_secondary): if 'websocket' in (read_from, read_from_secondary):
read_from_readable.append('websocket') read_from_readable.append('websocket')