diff --git a/owocr/run.py b/owocr/run.py index a39ebde..49eb88e 100644 --- a/owocr/run.py +++ b/owocr/run.py @@ -303,13 +303,30 @@ class RequestHandler(socketserver.BaseRequestHandler): class TextFiltering: def __init__(self): - from pysbd import Segmenter - import langid self.language = config.get_general('language') - self.segmenter = Segmenter(language=self.language, clean=True) - self.classify = langid.classify self.regex = self.get_regex() - self.last_result = ([], engine_index) + self.kana_variants = { + 'ぁ': ['ぁ', 'あ'], 'あ': ['ぁ', 'あ'], + 'ぃ': ['ぃ', 'い'], 'い': ['ぃ', 'い'], + 'ぅ': ['ぅ', 'う'], 'う': ['ぅ', 'う'], + 'ぇ': ['ぇ', 'え'], 'え': ['ぇ', 'え'], + 'ぉ': ['ぉ', 'お'], 'お': ['ぉ', 'お'], + 'ァ': ['ァ', 'ア'], 'ア': ['ァ', 'ア'], + 'ィ': ['ィ', 'イ'], 'イ': ['ィ', 'イ'], + 'ゥ': ['ゥ', 'ウ'], 'ウ': ['ゥ', 'ウ'], + 'ェ': ['ェ', 'エ'], 'エ': ['ェ', 'エ'], + 'ォ': ['ォ', 'オ'], 'オ': ['ォ', 'オ'], + 'ゃ': ['ゃ', 'や'], 'や': ['ゃ', 'や'], + 'ゅ': ['ゅ', 'ゆ'], 'ゆ': ['ゅ', 'ゆ'], + 'ょ': ['ょ', 'よ'], 'よ': ['ょ', 'よ'], + 'ャ': ['ャ', 'ヤ'], 'ヤ': ['ャ', 'ヤ'], + 'ュ': ['ュ', 'ユ'], 'ユ': ['ュ', 'ユ'], + 'ョ': ['ョ', 'ヨ'], 'ヨ': ['ョ', 'ヨ'], + 'っ': ['っ', 'つ'], 'つ': ['っ', 'つ'], + 'ッ': ['ッ', 'ツ'], 'ツ': ['ッ', 'ツ'], + 'ゎ': ['ゎ', 'わ'], 'わ': ['ゎ', 'わ'], + 'ヮ': ['ヮ', 'ワ'], 'ワ': ['ヮ', 'ワ'] + } def get_regex(self): if self.language == 'ja': @@ -334,52 +351,9 @@ class TextFiltering: r'[a-zA-Z\u00C0-\u00FF\u0100-\u017F\u0180-\u024F\u0250-\u02AF\u1D00-\u1D7F\u1D80-\u1DBF\u1E00-\u1EFF\u2C60-\u2C7F\uA720-\uA7FF\uAB30-\uAB6F]') def convert_small_kana_to_big(self, text): - small_to_big = { - # Hiragana - 'ぁ': 'あ', 'ぃ': 'い', 'ぅ': 'う', 'ぇ': 'え', 'ぉ': 'お', - 'っ': 'つ', 'ゃ': 'や', 'ゅ': 'ゆ', 'ょ': 'よ', 'ゎ': 'わ', - # Katakana - 'ァ': 'ア', 'ィ': 'イ', 'ゥ': 'ウ', 'ェ': 'エ', 'ォ': 'オ', - 'ッ': 'ツ', 'ャ': 'ヤ', 'ュ': 'ユ', 'ョ': 'ヨ', 'ヮ': 'ワ' - } - - converted_text = ''.join(small_to_big.get(char, char) for char in text) + converted_text = ''.join(self.kana_variants.get(char, [char])[-1] for char in text) return converted_text - def __call__(self, text): - orig_text = self.segmenter.segment(text) - orig_text_filtered = [] - for block in orig_text: - block_filtered = self.regex.findall(block) - if self.language == 'ja': - block_filtered = self.convert_small_kana_to_big(block_filtered) - - if block_filtered: - orig_text_filtered.append(''.join(block_filtered)) - else: - orig_text_filtered.append(None) - - if self.last_result[1] == engine_index: - last_text = self.last_result[0] - else: - last_text = [] - - new_blocks = [] - for idx, block in enumerate(orig_text): - if orig_text_filtered[idx] and (orig_text_filtered[idx] not in last_text): - new_blocks.append(block) - - final_blocks = [] - for block in new_blocks: - # This only looks at language IF language is ja or zh, otherwise it keeps all text - if self.language not in ['ja', 'zh'] or self.classify(block)[0] in ['ja', 'zh'] or block == "\n": - final_blocks.append(block) - - text = '\n'.join(final_blocks) - - self.last_result = (orig_text_filtered, engine_index) - return text - class ScreenshotThread(threading.Thread): def __init__(self, screen_capture_on_combo): @@ -711,6 +685,7 @@ class OutputResult: self.filtering = TextFiltering() if init_filtering else None self.cj_regex = re.compile(r'[\u3041-\u3096\u30A1-\u30FA\u4E00-\u9FFF]') self.previous_result = None + self.previous_result_text = None def _coordinate_format_to_string(self, result_data): full_text_parts = [] @@ -745,7 +720,7 @@ class OutputResult: text_parts.append(' ') return ''.join(text_parts) - def _compare_text(self, current_text, prev_text, threshold=80): + def _compare_text(self, current_text, prev_text, threshold=82): if current_text in prev_text: return True if len(prev_text) > len(current_text): @@ -776,27 +751,155 @@ class OutputResult: for p in previous_result.paragraphs: previous_lines.extend(p.lines) - all_previous_text = '' + all_previous_text_spliced = [] for prev_line in previous_lines: prev_text = self._get_line_text(prev_line) prev_text = ''.join(self.filtering.regex.findall(prev_text)) if self.filtering.language == 'ja': prev_text = self.filtering.convert_small_kana_to_big(prev_text) - all_previous_text += prev_text + all_previous_text_spliced.append(prev_text) + all_previous_text = ''.join(all_previous_text_spliced) + + logger.debug(f"Previous text: '{all_previous_text_spliced}'") + + first = True for current_line in current_lines: current_text = self._get_line_text(current_line) current_text = ''.join(self.filtering.regex.findall(current_text)) + if not current_text: + continue if self.filtering.language == 'ja': current_text = self.filtering.convert_small_kana_to_big(current_text) - text_similar = self._compare_text(current_text, all_previous_text) + # For the first line, check if it contains the end of previous text + if first and all_previous_text: + overlap = self._find_overlap(all_previous_text, current_text) + if overlap and len(current_text) > len(overlap): + logger.debug(f"Found overlap: '{overlap}'") + changed_lines.append(current_line) + first = False + continue + + if len(current_text) < 3: + text_similar = current_text in all_previous_text_spliced + else: + text_similar = self._compare_text(current_text, all_previous_text) + + logger.debug(f"Current line: '{current_text}' Similar: '{text_similar}'") if not text_similar: changed_lines.append(current_line) + if len(current_text) >= 3: + first = False return changed_lines + def _find_overlap(self, previous_text, current_text): + """Find the overlapping portion between the end of previous_text and start of current_text.""" + # Try different overlap lengths, starting from the maximum possible + min_overlap_length = 3 # Minimum overlap to consider meaningful + max_overlap_length = min(len(previous_text), len(current_text)) + + for overlap_length in range(max_overlap_length, min_overlap_length - 1, -1): + previous_end = previous_text[-overlap_length:] + current_start = current_text[:overlap_length] + + if previous_end == current_start: + return previous_end + + return None + + def _cut_at_overlap(self, current_line, overlap): + pattern_parts = [] + for char in overlap: + # Check if character is kana and has small/big variants + if char in self.filtering.kana_variants: + # Use character class that matches both small and big variants + variants = self.filtering.kana_variants[char] + pattern_parts.append(f'[{"".join(variants)}]') + else: + # Escape regex special characters for regular characters + pattern_parts.append(re.escape(char)) + + # Create pattern: overlap characters with any characters (0 or more) between them + overlap_pattern = r'.*?'.join(pattern_parts) + + # Also allow any characters at the beginning + full_pattern = r'^.*?' + overlap_pattern + + logger.debug(f"Cut regex: '{full_pattern}'") + + # Find the match + match = re.search(full_pattern, current_line) + if match: + # Cut after the matched overlapping portion + cut_position = match.end() + return current_line[cut_position:] + + return current_line + + def _find_changed_lines_text(self, current_result): + # Split both results into lines + current_lines = current_result.split('\n') + + # If no previous result, all lines are considered changed + if self.previous_result_text is None: + self.previous_result_text = current_lines[-10:] # Keep only last 10 lines + return current_result + + changed_lines = [] + all_previous_text_spliced = [] + + for prev_line in self.previous_result_text: + prev_text = ''.join(self.filtering.regex.findall(prev_line)) + if self.filtering.language == 'ja': + prev_text = self.filtering.convert_small_kana_to_big(prev_text) + all_previous_text_spliced.append(prev_text) + + all_previous_text = ''.join(all_previous_text_spliced) + + logger.debug(f"Previous text: '{all_previous_text_spliced}'") + + first = True + # Check each current line against the combined previous text + for current_line in current_lines: + current_text = ''.join(self.filtering.regex.findall(current_line)) + if not current_text: + continue + if self.filtering.language == 'ja': + current_text = self.filtering.convert_small_kana_to_big(current_text) + + # For the first line, check if it contains the end of previous text + if first and all_previous_text: + overlap = self._find_overlap(all_previous_text, current_text) + if overlap and len(current_text) > len(overlap): + logger.debug(f"Found overlap: '{overlap}'") + # Cut the current_line to remove the overlapping part + current_line = self._cut_at_overlap(current_line, overlap) + logger.debug(f"After cutting: '{current_line}'") + changed_lines.append(current_line) + first = False + continue + + if len(current_text) < 3: + text_similar = current_text in all_previous_text_spliced + else: + text_similar = self._compare_text(current_text, all_previous_text) + + logger.debug(f"Current line: '{current_text}' Similar: '{text_similar}'") + + if not text_similar: + changed_lines.append(current_line) + if len(current_text) >= 3: + first = False + + # Update cache with current lines, keeping only the last 10 + self.previous_result_text.extend(current_lines) + self.previous_result_text = self.previous_result_text[-10:] + + return '\n'.join(changed_lines) + def _create_changed_regions_image(self, pil_image, changed_lines, margin=5): img_width, img_height = pil_image.size @@ -896,7 +999,7 @@ class OutputResult: if output_format == 'json': logger.warning(f"Engine '{engine_instance.name}' does not support JSON output. Falling back to text.") if filter_text: - text_to_process = self.filtering(result_data_text) + text_to_process = self._find_changed_lines_text(result_data_text) output_string = self._post_process(text_to_process, True) else: output_string = self._post_process(result_data_text, False) @@ -1021,7 +1124,7 @@ def on_screenshot_combo(): def run(): - logger.configure(handlers=[{'sink': sys.stderr, 'format': config.get_general('logger_format')}]) + logger.configure(handlers=[{'sink': sys.stderr, 'format': config.get_general('logger_format'), 'level': 'INFO'}]) if config.has_config: logger.info('Parsed config file')