Improve filtering quality

AuroraWright
2025-10-07 18:47:54 +02:00
parent ed9b05d2e0
commit 878f164533


@@ -303,13 +303,30 @@ class RequestHandler(socketserver.BaseRequestHandler):
class TextFiltering:
    def __init__(self):
        from pysbd import Segmenter
        import langid
        self.language = config.get_general('language')
        self.segmenter = Segmenter(language=self.language, clean=True)
        self.classify = langid.classify
        self.regex = self.get_regex()
        self.last_result = ([], engine_index)
        self.kana_variants = {
            'ぁ': ['ぁ', 'あ'], 'あ': ['ぁ', 'あ'],
            'ぃ': ['ぃ', 'い'], 'い': ['ぃ', 'い'],
            'ぅ': ['ぅ', 'う'], 'う': ['ぅ', 'う'],
            'ぇ': ['ぇ', 'え'], 'え': ['ぇ', 'え'],
            'ぉ': ['ぉ', 'お'], 'お': ['ぉ', 'お'],
            'っ': ['っ', 'つ'], 'つ': ['っ', 'つ'],
            'ゃ': ['ゃ', 'や'], 'や': ['ゃ', 'や'],
            'ゅ': ['ゅ', 'ゆ'], 'ゆ': ['ゅ', 'ゆ'],
            'ょ': ['ょ', 'よ'], 'よ': ['ょ', 'よ'],
            'ゎ': ['ゎ', 'わ'], 'わ': ['ゎ', 'わ'],
            'ァ': ['ァ', 'ア'], 'ア': ['ァ', 'ア'],
            'ィ': ['ィ', 'イ'], 'イ': ['ィ', 'イ'],
            'ゥ': ['ゥ', 'ウ'], 'ウ': ['ゥ', 'ウ'],
            'ェ': ['ェ', 'エ'], 'エ': ['ェ', 'エ'],
            'ォ': ['ォ', 'オ'], 'オ': ['ォ', 'オ'],
            'ッ': ['ッ', 'ツ'], 'ツ': ['ッ', 'ツ'],
            'ャ': ['ャ', 'ヤ'], 'ヤ': ['ャ', 'ヤ'],
            'ュ': ['ュ', 'ユ'], 'ユ': ['ュ', 'ユ'],
            'ョ': ['ョ', 'ヨ'], 'ヨ': ['ョ', 'ヨ'],
            'ヮ': ['ヮ', 'ワ'], 'ワ': ['ヮ', 'ワ']
        }
    def get_regex(self):
        if self.language == 'ja':
@@ -334,52 +351,9 @@ class TextFiltering:
                r'[a-zA-Z\u00C0-\u00FF\u0100-\u017F\u0180-\u024F\u0250-\u02AF\u1D00-\u1D7F\u1D80-\u1DBF\u1E00-\u1EFF\u2C60-\u2C7F\uA720-\uA7FF\uAB30-\uAB6F]')
    def convert_small_kana_to_big(self, text):
        small_to_big = {
            # Hiragana
            'ぁ': 'あ', 'ぃ': 'い', 'ぅ': 'う', 'ぇ': 'え', 'ぉ': 'お',
            'っ': 'つ', 'ゃ': 'や', 'ゅ': 'ゆ', 'ょ': 'よ', 'ゎ': 'わ',
            # Katakana
            'ァ': 'ア', 'ィ': 'イ', 'ゥ': 'ウ', 'ェ': 'エ', 'ォ': 'オ',
            'ッ': 'ツ', 'ャ': 'ヤ', 'ュ': 'ユ', 'ョ': 'ヨ', 'ヮ': 'ワ'
        }
        converted_text = ''.join(small_to_big.get(char, char) for char in text)
        converted_text = ''.join(self.kana_variants.get(char, [char])[-1] for char in text)
        return converted_text
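As a quick illustration of the normalization above, here is a minimal standalone sketch; the kana_variants subset and the sample string are assumptions, and the last element of each variant list is taken to be the big form:

# Assumed subset: both the small and the big form of a kana key the same [small, big] pair.
kana_variants = {
    'ょ': ['ょ', 'よ'], 'よ': ['ょ', 'よ'],
    'っ': ['っ', 'つ'], 'つ': ['っ', 'つ'],
}

def normalize(text):
    # Last element of the variant list is the big form; unknown characters pass through.
    return ''.join(kana_variants.get(char, [char])[-1] for char in text)

print(normalize('ちょっと'))  # -> 'ちよつと'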
    def __call__(self, text):
        orig_text = self.segmenter.segment(text)
        orig_text_filtered = []
        for block in orig_text:
            block_filtered = self.regex.findall(block)
            if self.language == 'ja':
                block_filtered = self.convert_small_kana_to_big(block_filtered)
            if block_filtered:
                orig_text_filtered.append(''.join(block_filtered))
            else:
                orig_text_filtered.append(None)
        if self.last_result[1] == engine_index:
            last_text = self.last_result[0]
        else:
            last_text = []
        new_blocks = []
        for idx, block in enumerate(orig_text):
            if orig_text_filtered[idx] and (orig_text_filtered[idx] not in last_text):
                new_blocks.append(block)
        final_blocks = []
        for block in new_blocks:
            # This only looks at language IF language is ja or zh, otherwise it keeps all text
            if self.language not in ['ja', 'zh'] or self.classify(block)[0] in ['ja', 'zh'] or block == "\n":
                final_blocks.append(block)
        text = '\n'.join(final_blocks)
        self.last_result = (orig_text_filtered, engine_index)
        return text
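The dedup step in __call__ can be pictured in isolation as follows; this minimal sketch leaves out pysbd segmentation, the regex filter and the langid check, and the sample blocks are made up:

previous_filtered = ['こんにちは', 'はい']          # filtered blocks kept from the last call
current_blocks = ['こんにちは', '新しいセリフです']  # blocks produced by the current call

# Only blocks whose filtered form was not seen in the previous result survive.
kept = [block for block in current_blocks if block not in previous_filtered]
print(kept)  # ['新しいセリフです']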
class ScreenshotThread(threading.Thread):
    def __init__(self, screen_capture_on_combo):
@@ -711,6 +685,7 @@ class OutputResult:
        self.filtering = TextFiltering() if init_filtering else None
        self.cj_regex = re.compile(r'[\u3041-\u3096\u30A1-\u30FA\u4E00-\u9FFF]')
        self.previous_result = None
        self.previous_result_text = None
    def _coordinate_format_to_string(self, result_data):
        full_text_parts = []
@@ -745,7 +720,7 @@ class OutputResult:
                text_parts.append(' ')
        return ''.join(text_parts)
    def _compare_text(self, current_text, prev_text, threshold=80):
    def _compare_text(self, current_text, prev_text, threshold=82):
        if current_text in prev_text:
            return True
        if len(prev_text) > len(current_text):
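The rest of _compare_text lies outside this hunk, so the scorer below is only a stand-in (difflib is used here purely for illustration, not necessarily what the module uses); it shows how the raised threshold of 82 treats a near-match that scores 80:

from difflib import SequenceMatcher

def compare_text(current_text, prev_text, threshold=82):
    # Exact containment is always a match, as in the hunk above.
    if current_text in prev_text:
        return True
    # Stand-in similarity score on a 0-100 scale; the real scoring code is not shown here.
    score = SequenceMatcher(None, current_text, prev_text).ratio() * 100
    return score >= threshold

print(compare_text('こんにちわ', 'こんにちは'))  # scores 80.0 with this scorer -> False at threshold 82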
@@ -776,27 +751,155 @@ class OutputResult:
        for p in previous_result.paragraphs:
            previous_lines.extend(p.lines)
        all_previous_text = ''
        all_previous_text_spliced = []
        for prev_line in previous_lines:
            prev_text = self._get_line_text(prev_line)
            prev_text = ''.join(self.filtering.regex.findall(prev_text))
            if self.filtering.language == 'ja':
                prev_text = self.filtering.convert_small_kana_to_big(prev_text)
            all_previous_text += prev_text
            all_previous_text_spliced.append(prev_text)
        all_previous_text = ''.join(all_previous_text_spliced)
        logger.debug(f"Previous text: '{all_previous_text_spliced}'")
        first = True
        for current_line in current_lines:
            current_text = self._get_line_text(current_line)
            current_text = ''.join(self.filtering.regex.findall(current_text))
            if not current_text:
                continue
            if self.filtering.language == 'ja':
                current_text = self.filtering.convert_small_kana_to_big(current_text)
            # For the first line, check if it contains the end of previous text
            if first and all_previous_text:
                overlap = self._find_overlap(all_previous_text, current_text)
                if overlap and len(current_text) > len(overlap):
                    logger.debug(f"Found overlap: '{overlap}'")
                    changed_lines.append(current_line)
                    first = False
                    continue
            if len(current_text) < 3:
                text_similar = current_text in all_previous_text_spliced
            else:
                text_similar = self._compare_text(current_text, all_previous_text)
            logger.debug(f"Current line: '{current_text}' Similar: '{text_similar}'")
            if not text_similar:
                changed_lines.append(current_line)
            if len(current_text) >= 3:
                first = False
        return changed_lines
    def _find_overlap(self, previous_text, current_text):
        """Find the overlapping portion between the end of previous_text and start of current_text."""
        # Try different overlap lengths, starting from the maximum possible
        min_overlap_length = 3  # Minimum overlap to consider meaningful
        max_overlap_length = min(len(previous_text), len(current_text))
        for overlap_length in range(max_overlap_length, min_overlap_length - 1, -1):
            previous_end = previous_text[-overlap_length:]
            current_start = current_text[:overlap_length]
            if previous_end == current_start:
                return previous_end
        return None
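Rewritten standalone for illustration (sample strings assumed), the scan above returns the longest suffix of the previous text that also starts the new line, provided it is at least three characters long:

def find_overlap(previous_text, current_text, min_overlap_length=3):
    # Same suffix/prefix scan as above, longest candidate first.
    for length in range(min(len(previous_text), len(current_text)), min_overlap_length - 1, -1):
        if previous_text[-length:] == current_text[:length]:
            return previous_text[-length:]
    return None

print(find_overlap('今日はいい天気です', 'いい天気ですね'))  # -> 'いい天気です'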
    def _cut_at_overlap(self, current_line, overlap):
        pattern_parts = []
        for char in overlap:
            # Check if character is kana and has small/big variants
            if char in self.filtering.kana_variants:
                # Use character class that matches both small and big variants
                variants = self.filtering.kana_variants[char]
                pattern_parts.append(f'[{"".join(variants)}]')
            else:
                # Escape regex special characters for regular characters
                pattern_parts.append(re.escape(char))
        # Create pattern: overlap characters with any characters (0 or more) between them
        overlap_pattern = r'.*?'.join(pattern_parts)
        # Also allow any characters at the beginning
        full_pattern = r'^.*?' + overlap_pattern
        logger.debug(f"Cut regex: '{full_pattern}'")
        # Find the match
        match = re.search(full_pattern, current_line)
        if match:
            # Cut after the matched overlapping portion
            cut_position = match.end()
            return current_line[cut_position:]
        return current_line
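A standalone sketch of the tolerant cut above: the overlap is computed on filtered, big-kana text, so the pattern has to locate it again in the raw line, where small kana and punctuation may still be present. The kana_variants subset and the sample strings are assumptions:

import re

kana_variants = {'よ': ['ょ', 'よ'], 'つ': ['っ', 'つ']}  # assumed subset

overlap = 'ちよつと'           # normalized overlap found against the previous text
raw_line = 'ちょっと、待って'   # raw OCR line to cut

# Kana in the overlap match either variant; everything else is matched literally,
# with lazy gaps allowed between the overlap characters and at the start of the line.
parts = [f'[{"".join(kana_variants[c])}]' if c in kana_variants else re.escape(c)
         for c in overlap]
pattern = r'^.*?' + r'.*?'.join(parts)
match = re.search(pattern, raw_line)
print(raw_line[match.end():])  # -> '、待って'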
    def _find_changed_lines_text(self, current_result):
        # Split both results into lines
        current_lines = current_result.split('\n')
        # If no previous result, all lines are considered changed
        if self.previous_result_text is None:
            self.previous_result_text = current_lines[-10:]  # Keep only last 10 lines
            return current_result
        changed_lines = []
        all_previous_text_spliced = []
        for prev_line in self.previous_result_text:
            prev_text = ''.join(self.filtering.regex.findall(prev_line))
            if self.filtering.language == 'ja':
                prev_text = self.filtering.convert_small_kana_to_big(prev_text)
            all_previous_text_spliced.append(prev_text)
        all_previous_text = ''.join(all_previous_text_spliced)
        logger.debug(f"Previous text: '{all_previous_text_spliced}'")
        first = True
        # Check each current line against the combined previous text
        for current_line in current_lines:
            current_text = ''.join(self.filtering.regex.findall(current_line))
            if not current_text:
                continue
            if self.filtering.language == 'ja':
                current_text = self.filtering.convert_small_kana_to_big(current_text)
            # For the first line, check if it contains the end of previous text
            if first and all_previous_text:
                overlap = self._find_overlap(all_previous_text, current_text)
                if overlap and len(current_text) > len(overlap):
                    logger.debug(f"Found overlap: '{overlap}'")
                    # Cut the current_line to remove the overlapping part
                    current_line = self._cut_at_overlap(current_line, overlap)
                    logger.debug(f"After cutting: '{current_line}'")
                    changed_lines.append(current_line)
                    first = False
                    continue
            if len(current_text) < 3:
                text_similar = current_text in all_previous_text_spliced
            else:
                text_similar = self._compare_text(current_text, all_previous_text)
            logger.debug(f"Current line: '{current_text}' Similar: '{text_similar}'")
            if not text_similar:
                changed_lines.append(current_line)
            if len(current_text) >= 3:
                first = False
        # Update cache with current lines, keeping only the last 10
        self.previous_result_text.extend(current_lines)
        self.previous_result_text = self.previous_result_text[-10:]
        return '\n'.join(changed_lines)
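A rough usage sketch of the method above against a stubbed filtering object; FakeFiltering, the sample frames and the expected output are assumptions, and the real _compare_text body is not shown in this diff:

import re

class FakeFiltering:
    language = 'ja'
    regex = re.compile(r'[\u3041-\u3096\u30A1-\u30FA\u4E00-\u9FFF]')
    kana_variants = {}
    def convert_small_kana_to_big(self, text):
        return text

out = OutputResult.__new__(OutputResult)  # bypass __init__ for this sketch
out.filtering = FakeFiltering()
out.previous_result_text = None

print(out._find_changed_lines_text('今日はいい天気です'))
# first frame: cached and returned as-is
print(out._find_changed_lines_text('いい天気ですね\n散歩に行こう'))
# expected: only text not already on screen survives, roughly 'ね\n散歩に行こう'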
    def _create_changed_regions_image(self, pil_image, changed_lines, margin=5):
        img_width, img_height = pil_image.size
@@ -896,7 +999,7 @@ class OutputResult:
            if output_format == 'json':
                logger.warning(f"Engine '{engine_instance.name}' does not support JSON output. Falling back to text.")
            if filter_text:
                text_to_process = self.filtering(result_data_text)
                text_to_process = self._find_changed_lines_text(result_data_text)
                output_string = self._post_process(text_to_process, True)
            else:
                output_string = self._post_process(result_data_text, False)
@@ -1021,7 +1124,7 @@ def on_screenshot_combo():
def run():
    logger.configure(handlers=[{'sink': sys.stderr, 'format': config.get_general('logger_format')}])
    logger.configure(handlers=[{'sink': sys.stderr, 'format': config.get_general('logger_format'), 'level': 'INFO'}])
    if config.has_config:
        logger.info('Parsed config file')