Vertical furigana filter, rename option

This commit is contained in:
AuroraWright
2025-10-17 01:43:24 +02:00
parent 87f7ea6069
commit e59ceb7ae4
4 changed files with 101 additions and 46 deletions

View File

@@ -52,14 +52,14 @@ parser.add_argument('-sf', '--screen_capture_frame_stabilization', type=float, d
help="When reading with screen capture, delay to wait until text is stable before processing it. -1 waits for two OCR results to be the same. 0 to disable.") help="When reading with screen capture, delay to wait until text is stable before processing it. -1 waits for two OCR results to be the same. 0 to disable.")
parser.add_argument('-sl', '--screen_capture_line_recovery', type=str2bool, nargs='?', const=True, default=argparse.SUPPRESS, parser.add_argument('-sl', '--screen_capture_line_recovery', type=str2bool, nargs='?', const=True, default=argparse.SUPPRESS,
help="When reading with screen capture and frame stabilization is on, try to recover missed lines from unstable frames. Can lead to increased glitches.") help="When reading with screen capture and frame stabilization is on, try to recover missed lines from unstable frames. Can lead to increased glitches.")
parser.add_argument('-sff', '--screen_capture_furigana_filter', type=str2bool, nargs='?', const=True, default=argparse.SUPPRESS,
help="When reading with screen capture, try to filter furigana lines.")
parser.add_argument('-sc', '--screen_capture_combo', type=str, default=argparse.SUPPRESS, parser.add_argument('-sc', '--screen_capture_combo', type=str, default=argparse.SUPPRESS,
help='When reading with screen capture, combo to wait on for taking a screenshot. If periodic screenshots are also enabled, any screenshot taken this way bypasses the filtering. Example value: "<ctrl>+<shift>+s". The list of keys can be found here: https://pynput.readthedocs.io/en/latest/keyboard.html#pynput.keyboard.Key') help='When reading with screen capture, combo to wait on for taking a screenshot. If periodic screenshots are also enabled, any screenshot taken this way bypasses the filtering. Example value: "<ctrl>+<shift>+s". The list of keys can be found here: https://pynput.readthedocs.io/en/latest/keyboard.html#pynput.keyboard.Key')
parser.add_argument('-scc', '--coordinate_selector_combo', type=str, default=argparse.SUPPRESS, parser.add_argument('-scc', '--coordinate_selector_combo', type=str, default=argparse.SUPPRESS,
help='When reading with screen capture, combo to wait on for invoking the coordinate picker to change the screen/window area. Example value: "<ctrl>+<shift>+c". The list of keys can be found here: https://pynput.readthedocs.io/en/latest/keyboard.html#pynput.keyboard.Key') help='When reading with screen capture, combo to wait on for invoking the coordinate picker to change the screen/window area. Example value: "<ctrl>+<shift>+c". The list of keys can be found here: https://pynput.readthedocs.io/en/latest/keyboard.html#pynput.keyboard.Key')
parser.add_argument('-l', '--language', type=str, default=argparse.SUPPRESS, parser.add_argument('-l', '--language', type=str, default=argparse.SUPPRESS,
help='Two letter language code for filtering screencapture OCR results. Ex. "ja" for Japanese, "zh" for Chinese, "ko" for Korean, "ar" for Arabic, "ru" for Russian, "el" for Greek, "he" for Hebrew, "th" for Thai. Any other value will use Latin Extended (for most European languages and English).') help='Two letter language code for filtering screencapture OCR results. Ex. "ja" for Japanese, "zh" for Chinese, "ko" for Korean, "ar" for Arabic, "ru" for Russian, "el" for Greek, "he" for Hebrew, "th" for Thai. Any other value will use Latin Extended (for most European languages and English).')
parser.add_argument('-f', '--furigana_filter', type=str2bool, nargs='?', const=True, default=argparse.SUPPRESS,
help="Try to filter furigana lines for Japanese.")
parser.add_argument('-of', '--output_format', type=str, default=argparse.SUPPRESS, parser.add_argument('-of', '--output_format', type=str, default=argparse.SUPPRESS,
help='The output format for OCR results. Can be "text" (default) or "json" (to include coordinates).') help='The output format for OCR results. Can be "text" (default) or "json" (to include coordinates).')
parser.add_argument('-wp', '--websocket_port', type=int, default=argparse.SUPPRESS, parser.add_argument('-wp', '--websocket_port', type=int, default=argparse.SUPPRESS,
@@ -99,7 +99,7 @@ class Config:
'screen_capture_only_active_windows': True, 'screen_capture_only_active_windows': True,
'screen_capture_frame_stabilization': -1, 'screen_capture_frame_stabilization': -1,
'screen_capture_line_recovery': True, 'screen_capture_line_recovery': True,
'screen_capture_furigana_filter': True, 'furigana_filter': True,
'screen_capture_combo': '', 'screen_capture_combo': '',
'coordinate_selector_combo': '', 'coordinate_selector_combo': '',
'screen_capture_old_macos_api': False, 'screen_capture_old_macos_api': False,

View File

@@ -369,16 +369,18 @@ class MangaOcr:
coordinate_support = False coordinate_support = False
threading_support = True threading_support = True
def __init__(self, config={'pretrained_model_name_or_path':'kha-white/manga-ocr-base','force_cpu': False}): def __init__(self, config={}):
if 'manga_ocr' not in sys.modules: if 'manga_ocr' not in sys.modules:
logger.warning('manga-ocr not available, Manga OCR will not work!') logger.warning('manga-ocr not available, Manga OCR will not work!')
else: else:
pretrained_model_name_or_path = config.get('pretrained_model_name_or_path', 'kha-white/manga-ocr-base')
force_cpu = config.get('force_cpu', False)
logger.disable('manga_ocr') logger.disable('manga_ocr')
logging.getLogger('transformers').setLevel(logging.ERROR) # silence transformers >=4.46 warnings logging.getLogger('transformers').setLevel(logging.ERROR) # silence transformers >=4.46 warnings
from manga_ocr import ocr from manga_ocr import ocr
ocr.post_process = empty_post_process ocr.post_process = empty_post_process
logger.info(f'Loading Manga OCR model') logger.info(f'Loading Manga OCR model')
self.model = MOCR(config['pretrained_model_name_or_path'], config['force_cpu']) self.model = MOCR(pretrained_model_name_or_path, force_cpu)
self.available = True self.available = True
logger.info('Manga OCR ready') logger.info('Manga OCR ready')
@@ -860,12 +862,14 @@ class AppleVision:
coordinate_support = True coordinate_support = True
threading_support = True threading_support = True
def __init__(self, language='ja'): def __init__(self, language='ja', config={}):
if sys.platform != 'darwin': if sys.platform != 'darwin':
logger.warning('Apple Vision is not supported on non-macOS platforms!') logger.warning('Apple Vision is not supported on non-macOS platforms!')
elif int(platform.mac_ver()[0].split('.')[0]) < 13: elif int(platform.mac_ver()[0].split('.')[0]) < 13:
logger.warning('Apple Vision is not supported on macOS older than Ventura/13.0!') logger.warning('Apple Vision is not supported on macOS older than Ventura/13.0!')
else: else:
self.recognition_level = Vision.VNRequestTextRecognitionLevelFast if config.get('fast_mode', False) else Vision.VNRequestTextRecognitionLevelAccurate
self.language_correction = config.get('language_correction', True)
self.available = True self.available = True
self.language = [language, 'en'] self.language = [language, 'en']
logger.info('Apple Vision ready') logger.info('Apple Vision ready')
@@ -916,8 +920,8 @@ class AppleVision:
req = Vision.VNRecognizeTextRequest.alloc().init() req = Vision.VNRecognizeTextRequest.alloc().init()
req.setRevision_(Vision.VNRecognizeTextRequestRevision3) req.setRevision_(Vision.VNRecognizeTextRequestRevision3)
req.setRecognitionLevel_(Vision.VNRequestTextRecognitionLevelAccurate) req.setRecognitionLevel_(self.recognition_level)
req.setUsesLanguageCorrection_(True) req.setUsesLanguageCorrection_(self.language_correction)
req.setRecognitionLanguages_(self.language) req.setRecognitionLanguages_(self.language)
handler = Vision.VNImageRequestHandler.alloc().initWithData_options_( handler = Vision.VNImageRequestHandler.alloc().initWithData_options_(
@@ -1050,7 +1054,6 @@ class AppleLiveText:
) )
lines.append(line) lines.append(line)
# Create a single paragraph to hold all lines
if lines: if lines:
p_bbox = merge_bounding_boxes(lines) p_bbox = merge_bounding_boxes(lines)
paragraph = Paragraph(bounding_box=p_bbox, lines=lines) paragraph = Paragraph(bounding_box=p_bbox, lines=lines)
@@ -1132,7 +1135,6 @@ class WinRTOCR:
) )
lines.append(line) lines.append(line)
# Create a single paragraph to hold all lines
if lines: if lines:
p_bbox = merge_bounding_boxes(lines) p_bbox = merge_bounding_boxes(lines)
paragraph = Paragraph(bounding_box=p_bbox, lines=lines) paragraph = Paragraph(bounding_box=p_bbox, lines=lines)
@@ -1235,7 +1237,6 @@ class OneOCR:
) )
lines.append(line) lines.append(line)
# Create a single paragraph to hold all lines
if lines: if lines:
p_bbox = merge_bounding_boxes(lines) p_bbox = merge_bounding_boxes(lines)
paragraph = Paragraph(bounding_box=p_bbox, lines=lines) paragraph = Paragraph(bounding_box=p_bbox, lines=lines)
@@ -1417,13 +1418,14 @@ class EasyOCR:
coordinate_support = True coordinate_support = True
threading_support = True threading_support = True
def __init__(self, config={'gpu': True}, language='ja'): def __init__(self, config={}, language='ja'):
if 'easyocr' not in sys.modules: if 'easyocr' not in sys.modules:
logger.warning('easyocr not available, EasyOCR will not work!') logger.warning('easyocr not available, EasyOCR will not work!')
else: else:
logger.info('Loading EasyOCR model') logger.info('Loading EasyOCR model')
gpu = config.get('gpu', True)
logging.getLogger('easyocr.easyocr').setLevel(logging.ERROR) logging.getLogger('easyocr.easyocr').setLevel(logging.ERROR)
self.model = easyocr.Reader([language,'en'], gpu=config['gpu']) self.model = easyocr.Reader([language,'en'], gpu=gpu)
self.available = True self.available = True
logger.info('EasyOCR ready') logger.info('EasyOCR ready')
@@ -1485,20 +1487,22 @@ class RapidOCR:
coordinate_support = True coordinate_support = True
threading_support = True threading_support = True
def __init__(self, config={'high_accuracy_detection': False, 'high_accuracy_recognition': True}, language='ja'): def __init__(self, config={}, language='ja'):
if 'rapidocr' not in sys.modules: if 'rapidocr' not in sys.modules:
logger.warning('rapidocr not available, RapidOCR will not work!') logger.warning('rapidocr not available, RapidOCR will not work!')
else: else:
logger.info('Loading RapidOCR model') logger.info('Loading RapidOCR model')
high_accuracy_detection = config.get('high_accuracy_detection', False)
high_accuracy_recognition = config.get('high_accuracy_recognition', True)
lang_rec = self.language_to_model_language(language) lang_rec = self.language_to_model_language(language)
self.model = ROCR(params={ self.model = ROCR(params={
'Det.engine_type': EngineType.ONNXRUNTIME, 'Det.engine_type': EngineType.ONNXRUNTIME,
'Det.lang_type': LangDet.CH, 'Det.lang_type': LangDet.CH,
'Det.model_type': ModelType.SERVER if config['high_accuracy_detection'] else ModelType.MOBILE, 'Det.model_type': ModelType.SERVER if high_accuracy_detection else ModelType.MOBILE,
'Det.ocr_version': OCRVersion.PPOCRV5, 'Det.ocr_version': OCRVersion.PPOCRV5,
'Rec.engine_type': EngineType.ONNXRUNTIME, 'Rec.engine_type': EngineType.ONNXRUNTIME,
'Rec.lang_type': lang_rec, 'Rec.lang_type': lang_rec,
'Rec.model_type': ModelType.SERVER if config['high_accuracy_recognition'] else ModelType.MOBILE, 'Rec.model_type': ModelType.SERVER if high_accuracy_recognition else ModelType.MOBILE,
'Rec.ocr_version': OCRVersion.PPOCRV5, 'Rec.ocr_version': OCRVersion.PPOCRV5,
'Global.log_level': 'error' 'Global.log_level': 'error'
}) })
@@ -1626,10 +1630,6 @@ class OCRSpace:
def _to_generic_result(self, api_result, img_width, img_height, og_img_width, og_img_height): def _to_generic_result(self, api_result, img_width, img_height, og_img_width, og_img_height):
parsed_result = api_result['ParsedResults'][0] parsed_result = api_result['ParsedResults'][0]
text_overlay = parsed_result.get('TextOverlay', {}) text_overlay = parsed_result.get('TextOverlay', {})
image_props = ImageProperties(width=og_img_width, height=og_img_height)
ocr_result = OcrResult(image_properties=image_props)
lines_data = text_overlay.get('Lines', []) lines_data = text_overlay.get('Lines', [])
lines = [] lines = []
@@ -1645,11 +1645,14 @@ class OCRSpace:
if lines: if lines:
p_bbox = merge_bounding_boxes(lines) p_bbox = merge_bounding_boxes(lines)
paragraph = Paragraph(bounding_box=p_bbox, lines=lines) paragraph = Paragraph(bounding_box=p_bbox, lines=lines)
ocr_result.paragraphs = [paragraph] paragraphs = [paragraph]
else: else:
ocr_result.paragraphs = [] paragraphs = []
return ocr_result return OcrResult(
image_properties=ImageProperties(width=og_img_width, height=og_img_height),
paragraphs=paragraphs
)
def __call__(self, img): def __call__(self, img):
img, is_path = input_to_pil_image(img) img, is_path = input_to_pil_image(img)

View File

@@ -294,7 +294,7 @@ class TextFiltering:
self.language = config.get_general('language') self.language = config.get_general('language')
self.frame_stabilization = 0 if config.get_general('screen_capture_delay_secs') == -1 else config.get_general('screen_capture_frame_stabilization') self.frame_stabilization = 0 if config.get_general('screen_capture_delay_secs') == -1 else config.get_general('screen_capture_frame_stabilization')
self.line_recovery = config.get_general('screen_capture_line_recovery') self.line_recovery = config.get_general('screen_capture_line_recovery')
self.furigana_filter = config.get_general('screen_capture_furigana_filter') self.furigana_filter = self.language == 'ja' and config.get_general('furigana_filter')
self.last_frame_data = (None, None) self.last_frame_data = (None, None)
self.last_last_frame_data = (None, None) self.last_last_frame_data = (None, None)
self.stable_frame_data = None self.stable_frame_data = None
@@ -549,7 +549,7 @@ class TextFiltering:
if all(not current_text_line for current_text_line in current_lines): if all(not current_text_line for current_text_line in current_lines):
return None return None
if self.furigana_filter and self.language == 'ja' and isinstance(current_result_ocr, OcrResult): if self.furigana_filter and isinstance(current_result_ocr, OcrResult):
for p in current_result_ocr.paragraphs: for p in current_result_ocr.paragraphs:
current_lines_ocr.extend(p.lines) current_lines_ocr.extend(p.lines)
@@ -607,34 +607,57 @@ class TextFiltering:
if not current_lines[j]: if not current_lines[j]:
continue continue
below_line_bbox = current_lines_ocr[j].bounding_box other_line_bbox = current_lines_ocr[j].bounding_box
below_line_text = current_lines[j] other_line_text = current_lines[j]
logger.opt(colors=True).debug(f"<magenta>Furigana check against line: '{below_line_text}'</magenta>") if len(current_text) <= len(other_line_text):
is_vertical = other_line_bbox.height > other_line_bbox.width
else:
is_vertical = current_line_bbox.height > current_line_bbox.width
logger.opt(colors=True).debug(f"<magenta>Furigana check against line: '{other_line_text}'</magenta>")
if is_vertical:
width_threshold = other_line_bbox.width * 0.7
is_smaller = current_line_bbox.width < width_threshold
logger.opt(colors=True).debug(f"<magenta>Vertical furigana check width: '{other_line_bbox.width}' '{current_line_bbox.width}'</magenta>")
else:
height_threshold = other_line_bbox.height * 0.7
is_smaller = current_line_bbox.height < height_threshold
logger.opt(colors=True).debug(f"<magenta>Horizontal furigana check height: '{other_line_bbox.height}' '{current_line_bbox.height}'</magenta>")
# Check if the line is taller
height_threshold = below_line_bbox.height * 0.7
is_smaller = current_line_bbox.height < height_threshold
logger.opt(colors=True).debug(f"<magenta>Furigana check height: '{below_line_bbox.height}' '{current_line_bbox.height}'</magenta>")
if not is_smaller: if not is_smaller:
continue continue
# Check if the line has kanji # Check if the line has kanji
below_has_kanji = self.kanji_regex.search(below_line_text) other_has_kanji = self.kanji_regex.search(other_line_text)
if not below_has_kanji: if not other_has_kanji:
continue continue
vertical_threshold = below_line_bbox.height + current_line_bbox.height if is_vertical:
vertical_distance = below_line_bbox.center_y - current_line_bbox.center_y horizontal_threshold = current_line_bbox.width + other_line_bbox.width
horizontal_overlap = self._check_horizontal_overlap(current_line_bbox, below_line_bbox) horizontal_distance = current_line_bbox.center_x - other_line_bbox.center_x
vertical_overlap = self._check_vertical_overlap(current_line_bbox, other_line_bbox)
logger.opt(colors=True).debug(f"<magenta>Furigana check position: '{vertical_threshold}' '{vertical_distance}' '{horizontal_overlap}'</magenta>") logger.opt(colors=True).debug(f"<magenta>Vertical furigana check position: '{horizontal_threshold}' '{horizontal_distance}' '{vertical_overlap}'</magenta>")
# If vertically close and horizontally aligned, it's likely furigana # If horizontally close and vertically aligned, it's likely furigana
if (0 < vertical_distance < vertical_threshold and horizontal_overlap > 0.5): if (0 < horizontal_distance < horizontal_threshold and vertical_overlap > 0.5):
is_furigana = True is_furigana = True
logger.opt(colors=True).debug(f"<magenta>Skipping furigana line: '{current_text}' above line: '{below_line_text}'</magenta>") logger.opt(colors=True).debug(f"<magenta>Skipping vertical furigana line: '{current_text}' next to line: '{other_line_text}'</magenta>")
break break
else:
vertical_threshold = other_line_bbox.height + current_line_bbox.height
vertical_distance = other_line_bbox.center_y - current_line_bbox.center_y
horizontal_overlap = self._check_horizontal_overlap(current_line_bbox, other_line_bbox)
logger.opt(colors=True).debug(f"<magenta>Horizontal furigana check position: '{vertical_threshold}' '{vertical_distance}' '{horizontal_overlap}'</magenta>")
# If vertically close and horizontally aligned, it's likely furigana
if (0 < vertical_distance < vertical_threshold and horizontal_overlap > 0.5):
is_furigana = True
logger.opt(colors=True).debug(f"<magenta>Skipping horizontal furigana line: '{current_text}' above line: '{other_line_text}'</magenta>")
break
if is_furigana: if is_furigana:
continue continue
@@ -652,6 +675,9 @@ class TextFiltering:
return changed_lines return changed_lines
def _standalone_furigana_filter(self, result, result_ocr):
return self._find_changed_lines_text_impl(result, result_ocr, 0, [], None, False, 0)
def _find_overlap(self, previous_text, current_text): def _find_overlap(self, previous_text, current_text):
min_overlap_length = 3 min_overlap_length = 3
max_overlap_length = min(len(previous_text), len(current_text)) max_overlap_length = min(len(previous_text), len(current_text))
@@ -705,6 +731,25 @@ class TextFiltering:
return overlap_width / smaller_width if smaller_width > 0 else 0.0 return overlap_width / smaller_width if smaller_width > 0 else 0.0
def _check_vertical_overlap(self, bbox1, bbox2):
# Calculate top and bottom boundaries for both boxes
top1 = bbox1.center_y - bbox1.height / 2
bottom1 = bbox1.center_y + bbox1.height / 2
top2 = bbox2.center_y - bbox2.height / 2
bottom2 = bbox2.center_y + bbox2.height / 2
# Calculate overlap
overlap_top = max(top1, top2)
overlap_bottom = min(bottom1, bottom2)
if overlap_bottom <= overlap_top:
return 0.0
overlap_height = overlap_bottom - overlap_top
smaller_height = min(bbox1.height, bbox2.height)
return overlap_height / smaller_height if smaller_height > 0 else 0.0
def _create_changed_regions_image(self, pil_image, changed_lines, pil_image_2, changed_lines_2, margin=5): def _create_changed_regions_image(self, pil_image, changed_lines, pil_image_2, changed_lines_2, margin=5):
def crop_image(image, lines): def crop_image(image, lines):
img_width, img_height = image.size img_width, img_height = image.size
@@ -1339,6 +1384,8 @@ class OutputResult:
return return
output_string = self._post_process(text_to_process, True) output_string = self._post_process(text_to_process, True)
else: else:
if self.filtering.furigana_filter and isinstance(result_data, OcrResult):
result_data_text = self.filtering._standalone_furigana_filter(result_data_text, result_data)
output_string = self._post_process(result_data_text, False) output_string = self._post_process(result_data_text, False)
log_message = output_string log_message = output_string

View File

@@ -87,8 +87,8 @@
;recover missed lines from unstable frames. Can lead to increased glitches. ;recover missed lines from unstable frames. Can lead to increased glitches.
;screen_capture_line_recovery = True ;screen_capture_line_recovery = True
;When reading with screen capture, try to filter furigana lines. ;Try to filter furigana lines for Japanese.
;screen_capture_furigana_filter = True ;furigana_filter = True
;When reading with screen capture, combo to wait on for taking a screenshot. ;When reading with screen capture, combo to wait on for taking a screenshot.
;If periodic screenshots are also enabled, any screenshot taken this way ;If periodic screenshots are also enabled, any screenshot taken this way
@@ -148,3 +148,8 @@
;high_accuracy_detection = False ;high_accuracy_detection = False
;high_accuracy_recognition = True ;high_accuracy_recognition = True
;[avision]
;fast_mode = False
;language_correction = True