Vertical furigana filter, rename option

This commit is contained in:
AuroraWright
2025-10-17 01:43:24 +02:00
parent 87f7ea6069
commit e59ceb7ae4
4 changed files with 101 additions and 46 deletions

View File

@@ -52,14 +52,14 @@ parser.add_argument('-sf', '--screen_capture_frame_stabilization', type=float, d
help="When reading with screen capture, delay to wait until text is stable before processing it. -1 waits for two OCR results to be the same. 0 to disable.")
parser.add_argument('-sl', '--screen_capture_line_recovery', type=str2bool, nargs='?', const=True, default=argparse.SUPPRESS,
help="When reading with screen capture and frame stabilization is on, try to recover missed lines from unstable frames. Can lead to increased glitches.")
parser.add_argument('-sff', '--screen_capture_furigana_filter', type=str2bool, nargs='?', const=True, default=argparse.SUPPRESS,
help="When reading with screen capture, try to filter furigana lines.")
parser.add_argument('-sc', '--screen_capture_combo', type=str, default=argparse.SUPPRESS,
help='When reading with screen capture, combo to wait on for taking a screenshot. If periodic screenshots are also enabled, any screenshot taken this way bypasses the filtering. Example value: "<ctrl>+<shift>+s". The list of keys can be found here: https://pynput.readthedocs.io/en/latest/keyboard.html#pynput.keyboard.Key')
parser.add_argument('-scc', '--coordinate_selector_combo', type=str, default=argparse.SUPPRESS,
help='When reading with screen capture, combo to wait on for invoking the coordinate picker to change the screen/window area. Example value: "<ctrl>+<shift>+c". The list of keys can be found here: https://pynput.readthedocs.io/en/latest/keyboard.html#pynput.keyboard.Key')
parser.add_argument('-l', '--language', type=str, default=argparse.SUPPRESS,
help='Two letter language code for filtering screencapture OCR results. Ex. "ja" for Japanese, "zh" for Chinese, "ko" for Korean, "ar" for Arabic, "ru" for Russian, "el" for Greek, "he" for Hebrew, "th" for Thai. Any other value will use Latin Extended (for most European languages and English).')
parser.add_argument('-f', '--furigana_filter', type=str2bool, nargs='?', const=True, default=argparse.SUPPRESS,
help="Try to filter furigana lines for Japanese.")
parser.add_argument('-of', '--output_format', type=str, default=argparse.SUPPRESS,
help='The output format for OCR results. Can be "text" (default) or "json" (to include coordinates).')
parser.add_argument('-wp', '--websocket_port', type=int, default=argparse.SUPPRESS,
@@ -99,7 +99,7 @@ class Config:
'screen_capture_only_active_windows': True,
'screen_capture_frame_stabilization': -1,
'screen_capture_line_recovery': True,
'screen_capture_furigana_filter': True,
'furigana_filter': True,
'screen_capture_combo': '',
'coordinate_selector_combo': '',
'screen_capture_old_macos_api': False,

View File

@@ -369,16 +369,18 @@ class MangaOcr:
coordinate_support = False
threading_support = True
def __init__(self, config={'pretrained_model_name_or_path':'kha-white/manga-ocr-base','force_cpu': False}):
def __init__(self, config={}):
if 'manga_ocr' not in sys.modules:
logger.warning('manga-ocr not available, Manga OCR will not work!')
else:
pretrained_model_name_or_path = config.get('pretrained_model_name_or_path', 'kha-white/manga-ocr-base')
force_cpu = config.get('force_cpu', False)
logger.disable('manga_ocr')
logging.getLogger('transformers').setLevel(logging.ERROR) # silence transformers >=4.46 warnings
from manga_ocr import ocr
ocr.post_process = empty_post_process
logger.info(f'Loading Manga OCR model')
self.model = MOCR(config['pretrained_model_name_or_path'], config['force_cpu'])
self.model = MOCR(pretrained_model_name_or_path, force_cpu)
self.available = True
logger.info('Manga OCR ready')
@@ -860,12 +862,14 @@ class AppleVision:
coordinate_support = True
threading_support = True
def __init__(self, language='ja'):
def __init__(self, language='ja', config={}):
if sys.platform != 'darwin':
logger.warning('Apple Vision is not supported on non-macOS platforms!')
elif int(platform.mac_ver()[0].split('.')[0]) < 13:
logger.warning('Apple Vision is not supported on macOS older than Ventura/13.0!')
else:
self.recognition_level = Vision.VNRequestTextRecognitionLevelFast if config.get('fast_mode', False) else Vision.VNRequestTextRecognitionLevelAccurate
self.language_correction = config.get('language_correction', True)
self.available = True
self.language = [language, 'en']
logger.info('Apple Vision ready')
@@ -916,8 +920,8 @@ class AppleVision:
req = Vision.VNRecognizeTextRequest.alloc().init()
req.setRevision_(Vision.VNRecognizeTextRequestRevision3)
req.setRecognitionLevel_(Vision.VNRequestTextRecognitionLevelAccurate)
req.setUsesLanguageCorrection_(True)
req.setRecognitionLevel_(self.recognition_level)
req.setUsesLanguageCorrection_(self.language_correction)
req.setRecognitionLanguages_(self.language)
handler = Vision.VNImageRequestHandler.alloc().initWithData_options_(
@@ -1050,7 +1054,6 @@ class AppleLiveText:
)
lines.append(line)
# Create a single paragraph to hold all lines
if lines:
p_bbox = merge_bounding_boxes(lines)
paragraph = Paragraph(bounding_box=p_bbox, lines=lines)
@@ -1132,7 +1135,6 @@ class WinRTOCR:
)
lines.append(line)
# Create a single paragraph to hold all lines
if lines:
p_bbox = merge_bounding_boxes(lines)
paragraph = Paragraph(bounding_box=p_bbox, lines=lines)
@@ -1235,7 +1237,6 @@ class OneOCR:
)
lines.append(line)
# Create a single paragraph to hold all lines
if lines:
p_bbox = merge_bounding_boxes(lines)
paragraph = Paragraph(bounding_box=p_bbox, lines=lines)
@@ -1417,13 +1418,14 @@ class EasyOCR:
coordinate_support = True
threading_support = True
def __init__(self, config={'gpu': True}, language='ja'):
def __init__(self, config={}, language='ja'):
if 'easyocr' not in sys.modules:
logger.warning('easyocr not available, EasyOCR will not work!')
else:
logger.info('Loading EasyOCR model')
gpu = config.get('gpu', True)
logging.getLogger('easyocr.easyocr').setLevel(logging.ERROR)
self.model = easyocr.Reader([language,'en'], gpu=config['gpu'])
self.model = easyocr.Reader([language,'en'], gpu=gpu)
self.available = True
logger.info('EasyOCR ready')
@@ -1485,20 +1487,22 @@ class RapidOCR:
coordinate_support = True
threading_support = True
def __init__(self, config={'high_accuracy_detection': False, 'high_accuracy_recognition': True}, language='ja'):
def __init__(self, config={}, language='ja'):
if 'rapidocr' not in sys.modules:
logger.warning('rapidocr not available, RapidOCR will not work!')
else:
logger.info('Loading RapidOCR model')
high_accuracy_detection = config.get('high_accuracy_detection', False)
high_accuracy_recognition = config.get('high_accuracy_recognition', True)
lang_rec = self.language_to_model_language(language)
self.model = ROCR(params={
'Det.engine_type': EngineType.ONNXRUNTIME,
'Det.lang_type': LangDet.CH,
'Det.model_type': ModelType.SERVER if config['high_accuracy_detection'] else ModelType.MOBILE,
'Det.model_type': ModelType.SERVER if high_accuracy_detection else ModelType.MOBILE,
'Det.ocr_version': OCRVersion.PPOCRV5,
'Rec.engine_type': EngineType.ONNXRUNTIME,
'Rec.lang_type': lang_rec,
'Rec.model_type': ModelType.SERVER if config['high_accuracy_recognition'] else ModelType.MOBILE,
'Rec.model_type': ModelType.SERVER if high_accuracy_recognition else ModelType.MOBILE,
'Rec.ocr_version': OCRVersion.PPOCRV5,
'Global.log_level': 'error'
})
@@ -1626,10 +1630,6 @@ class OCRSpace:
def _to_generic_result(self, api_result, img_width, img_height, og_img_width, og_img_height):
parsed_result = api_result['ParsedResults'][0]
text_overlay = parsed_result.get('TextOverlay', {})
image_props = ImageProperties(width=og_img_width, height=og_img_height)
ocr_result = OcrResult(image_properties=image_props)
lines_data = text_overlay.get('Lines', [])
lines = []
@@ -1645,11 +1645,14 @@ class OCRSpace:
if lines:
p_bbox = merge_bounding_boxes(lines)
paragraph = Paragraph(bounding_box=p_bbox, lines=lines)
ocr_result.paragraphs = [paragraph]
paragraphs = [paragraph]
else:
ocr_result.paragraphs = []
paragraphs = []
return ocr_result
return OcrResult(
image_properties=ImageProperties(width=og_img_width, height=og_img_height),
paragraphs=paragraphs
)
def __call__(self, img):
img, is_path = input_to_pil_image(img)

View File

@@ -294,7 +294,7 @@ class TextFiltering:
self.language = config.get_general('language')
self.frame_stabilization = 0 if config.get_general('screen_capture_delay_secs') == -1 else config.get_general('screen_capture_frame_stabilization')
self.line_recovery = config.get_general('screen_capture_line_recovery')
self.furigana_filter = config.get_general('screen_capture_furigana_filter')
self.furigana_filter = self.language == 'ja' and config.get_general('furigana_filter')
self.last_frame_data = (None, None)
self.last_last_frame_data = (None, None)
self.stable_frame_data = None
@@ -549,7 +549,7 @@ class TextFiltering:
if all(not current_text_line for current_text_line in current_lines):
return None
if self.furigana_filter and self.language == 'ja' and isinstance(current_result_ocr, OcrResult):
if self.furigana_filter and isinstance(current_result_ocr, OcrResult):
for p in current_result_ocr.paragraphs:
current_lines_ocr.extend(p.lines)
@@ -607,34 +607,57 @@ class TextFiltering:
if not current_lines[j]:
continue
below_line_bbox = current_lines_ocr[j].bounding_box
below_line_text = current_lines[j]
other_line_bbox = current_lines_ocr[j].bounding_box
other_line_text = current_lines[j]
logger.opt(colors=True).debug(f"<magenta>Furigana check against line: '{below_line_text}'</magenta>")
if len(current_text) <= len(other_line_text):
is_vertical = other_line_bbox.height > other_line_bbox.width
else:
is_vertical = current_line_bbox.height > current_line_bbox.width
logger.opt(colors=True).debug(f"<magenta>Furigana check against line: '{other_line_text}'</magenta>")
if is_vertical:
width_threshold = other_line_bbox.width * 0.7
is_smaller = current_line_bbox.width < width_threshold
logger.opt(colors=True).debug(f"<magenta>Vertical furigana check width: '{other_line_bbox.width}' '{current_line_bbox.width}'</magenta>")
else:
height_threshold = other_line_bbox.height * 0.7
is_smaller = current_line_bbox.height < height_threshold
logger.opt(colors=True).debug(f"<magenta>Horizontal furigana check height: '{other_line_bbox.height}' '{current_line_bbox.height}'</magenta>")
# Check if the line is taller
height_threshold = below_line_bbox.height * 0.7
is_smaller = current_line_bbox.height < height_threshold
logger.opt(colors=True).debug(f"<magenta>Furigana check height: '{below_line_bbox.height}' '{current_line_bbox.height}'</magenta>")
if not is_smaller:
continue
# Check if the line has kanji
below_has_kanji = self.kanji_regex.search(below_line_text)
if not below_has_kanji:
other_has_kanji = self.kanji_regex.search(other_line_text)
if not other_has_kanji:
continue
vertical_threshold = below_line_bbox.height + current_line_bbox.height
vertical_distance = below_line_bbox.center_y - current_line_bbox.center_y
horizontal_overlap = self._check_horizontal_overlap(current_line_bbox, below_line_bbox)
if is_vertical:
horizontal_threshold = current_line_bbox.width + other_line_bbox.width
horizontal_distance = current_line_bbox.center_x - other_line_bbox.center_x
vertical_overlap = self._check_vertical_overlap(current_line_bbox, other_line_bbox)
logger.opt(colors=True).debug(f"<magenta>Furigana check position: '{vertical_threshold}' '{vertical_distance}' '{horizontal_overlap}'</magenta>")
logger.opt(colors=True).debug(f"<magenta>Vertical furigana check position: '{horizontal_threshold}' '{horizontal_distance}' '{vertical_overlap}'</magenta>")
# If vertically close and horizontally aligned, it's likely furigana
if (0 < vertical_distance < vertical_threshold and horizontal_overlap > 0.5):
is_furigana = True
logger.opt(colors=True).debug(f"<magenta>Skipping furigana line: '{current_text}' above line: '{below_line_text}'</magenta>")
break
# If horizontally close and vertically aligned, it's likely furigana
if (0 < horizontal_distance < horizontal_threshold and vertical_overlap > 0.5):
is_furigana = True
logger.opt(colors=True).debug(f"<magenta>Skipping vertical furigana line: '{current_text}' next to line: '{other_line_text}'</magenta>")
break
else:
vertical_threshold = other_line_bbox.height + current_line_bbox.height
vertical_distance = other_line_bbox.center_y - current_line_bbox.center_y
horizontal_overlap = self._check_horizontal_overlap(current_line_bbox, other_line_bbox)
logger.opt(colors=True).debug(f"<magenta>Horizontal furigana check position: '{vertical_threshold}' '{vertical_distance}' '{horizontal_overlap}'</magenta>")
# If vertically close and horizontally aligned, it's likely furigana
if (0 < vertical_distance < vertical_threshold and horizontal_overlap > 0.5):
is_furigana = True
logger.opt(colors=True).debug(f"<magenta>Skipping horizontal furigana line: '{current_text}' above line: '{other_line_text}'</magenta>")
break
if is_furigana:
continue
@@ -652,6 +675,9 @@ class TextFiltering:
return changed_lines
def _standalone_furigana_filter(self, result, result_ocr):
return self._find_changed_lines_text_impl(result, result_ocr, 0, [], None, False, 0)
def _find_overlap(self, previous_text, current_text):
min_overlap_length = 3
max_overlap_length = min(len(previous_text), len(current_text))
@@ -705,6 +731,25 @@ class TextFiltering:
return overlap_width / smaller_width if smaller_width > 0 else 0.0
def _check_vertical_overlap(self, bbox1, bbox2):
# Calculate top and bottom boundaries for both boxes
top1 = bbox1.center_y - bbox1.height / 2
bottom1 = bbox1.center_y + bbox1.height / 2
top2 = bbox2.center_y - bbox2.height / 2
bottom2 = bbox2.center_y + bbox2.height / 2
# Calculate overlap
overlap_top = max(top1, top2)
overlap_bottom = min(bottom1, bottom2)
if overlap_bottom <= overlap_top:
return 0.0
overlap_height = overlap_bottom - overlap_top
smaller_height = min(bbox1.height, bbox2.height)
return overlap_height / smaller_height if smaller_height > 0 else 0.0
def _create_changed_regions_image(self, pil_image, changed_lines, pil_image_2, changed_lines_2, margin=5):
def crop_image(image, lines):
img_width, img_height = image.size
@@ -1339,6 +1384,8 @@ class OutputResult:
return
output_string = self._post_process(text_to_process, True)
else:
if self.filtering.furigana_filter and isinstance(result_data, OcrResult):
result_data_text = self.filtering._standalone_furigana_filter(result_data_text, result_data)
output_string = self._post_process(result_data_text, False)
log_message = output_string

View File

@@ -87,8 +87,8 @@
;recover missed lines from unstable frames. Can lead to increased glitches.
;screen_capture_line_recovery = True
;When reading with screen capture, try to filter furigana lines.
;screen_capture_furigana_filter = True
;Try to filter furigana lines for Japanese.
;furigana_filter = True
;When reading with screen capture, combo to wait on for taking a screenshot.
;If periodic screenshots are also enabled, any screenshot taken this way
@@ -147,4 +147,9 @@
;[rapidocr]
;high_accuracy_detection = False
;high_accuracy_recognition = True
;high_accuracy_recognition = True
;[avision]
;fast_mode = False
;language_correction = True