From c98fb181e3e1f55b843abdbfd6fa192f1871a7fb Mon Sep 17 00:00:00 2001 From: AuroraWright Date: Wed, 22 Oct 2025 17:04:30 +0200 Subject: [PATCH] Have segmented and non segmented manga OCR as separate engines (comic text detector doesn't work well on small text areas) --- README.md | 2 +- owocr/config.py | 2 +- owocr/ocr.py | 75 ++++++++++++++++++++++++++++++++++++------------ owocr_config.ini | 18 ++++++++---- pyproject.toml | 2 +- 5 files changed, 71 insertions(+), 28 deletions(-) diff --git a/README.md b/README.md index be0eb7f..18b144e 100644 --- a/README.md +++ b/README.md @@ -46,7 +46,7 @@ The command-line options/config file allow you to configure OCR providers, hotke # Supported engines ## Local -- [Manga OCR](https://github.com/kha-white/manga-ocr) (with [comic-text-detector](https://github.com/dmMaze/comic-text-detector) as segmenter) - install: `pip install owocr[mangaocr]` → key: `m` +- [Manga OCR](https://github.com/kha-white/manga-ocr) (with optional [comic-text-detector](https://github.com/dmMaze/comic-text-detector) as segmenter) - install: `pip install owocr[mangaocr]` → keys: `m` (regular, ideal for small text areas), `n` (segmented, ideal for manga panels/larger images with multiple text areas) - [EasyOCR](https://github.com/JaidedAI/EasyOCR) - install: `pip install owocr[easyocr]` → key: `e` - [RapidOCR](https://github.com/RapidAI/RapidOCR) - install: `pip install owocr[rapidocr]` → key: `r` - Apple Vision framework - Probably the best local engine to date. **macOS only - Recommended (pre-installed)** → key: `a` diff --git a/owocr/config.py b/owocr/config.py index a87beab..6454ef6 100644 --- a/owocr/config.py +++ b/owocr/config.py @@ -25,7 +25,7 @@ parser.add_argument('-rs', '--read_from_secondary', type=str, default=argparse.S parser.add_argument('-w', '--write_to', type=str, default=argparse.SUPPRESS, help='Where to save recognized texts to. 
Can be either "clipboard", "websocket", or a path to a text file.') parser.add_argument('-e', '--engine', type=str, default=argparse.SUPPRESS, - help='OCR engine to use. Available: "mangaocr", "glens", "bing", "gvision", "avision", "alivetext", "azure", "winrtocr", "oneocr", "easyocr", "rapidocr", "ocrspace".') + help='OCR engine to use. Available: "mangaocr", "mangaocrs", "glens", "bing", "gvision", "avision", "alivetext", "azure", "winrtocr", "oneocr", "easyocr", "rapidocr", "ocrspace".') parser.add_argument('-es', '--engine_secondary', type=str, default=argparse.SUPPRESS, help='Local OCR engine to use for two-pass screen capture processing.') parser.add_argument('-p', '--pause_at_startup', type=str2bool, nargs='?', const=True, default=argparse.SUPPRESS, diff --git a/owocr/ocr.py b/owocr/ocr.py index 5da0710..4829b27 100644 --- a/owocr/ocr.py +++ b/owocr/ocr.py @@ -85,6 +85,8 @@ try: except: optimized_png_encode = False +manga_ocr_model = None + @dataclass class BoundingBox: @@ -132,8 +134,18 @@ class OcrResult: paragraphs: List[Paragraph] = field(default_factory=list) -def empty_post_process(text): - return text +def initialize_manga_ocr(pretrained_model_name_or_path, force_cpu): + def empty_post_process(text): + return text + + global manga_ocr_model + if not manga_ocr_model: + logger.disable('manga_ocr') + logging.getLogger('transformers').setLevel(logging.ERROR) # silence transformers >=4.46 warnings + from manga_ocr import ocr + ocr.post_process = empty_post_process + logger.info(f'Loading Manga OCR model') + manga_ocr_model = MOCR(pretrained_model_name_or_path, force_cpu) def input_to_pil_image(img): is_path = False @@ -243,7 +255,7 @@ def merge_bounding_boxes(ocr_element_list, rotated=False): cos_a, sin_a = np.cos(angle), np.sin(angle) rot = np.array([[cos_a, -sin_a], [sin_a, cos_a]]) corners.append(local @ rot.T + [cx, cy]) - + return np.vstack(corners) if corners else np.empty((0, 2)) def _convex_hull(points): @@ -364,10 +376,10 @@ def 
merge_bounding_boxes(ocr_element_list, rotated=False): ) -class MangaOcr: - name = 'mangaocr' - readable_name = 'Manga OCR' - key = 'm' +class MangaOcrSegmented: + name = 'mangaocrs' + readable_name = 'Manga OCR (segmented)' + key = 'n' available = False local = True manual_language = False @@ -376,9 +388,9 @@ class MangaOcr: def __init__(self, config={}): if 'manga_ocr' not in sys.modules: - logger.warning('manga-ocr not available, Manga OCR will not work!') + logger.warning('manga-ocr not available, Manga OCR (segmented) will not work!') elif 'scipy' not in sys.modules: - logger.warning('scipy not available, Manga OCR will not work!') + logger.warning('scipy not available, Manga OCR (segmented) will not work!') else: comic_text_detector_path = Path.home() / ".cache" / "manga-ocr" comic_text_detector_file = comic_text_detector_path / "comictextdetector.pt" @@ -389,18 +401,12 @@ class MangaOcr: try: urllib.request.urlretrieve('https://github.com/zyddnys/manga-image-translator/releases/download/beta-0.3/comictextdetector.pt', str(comic_text_detector_file)) except: - logger.warning('Download failed. Manga OCR will not work!') + logger.warning('Download failed. 
Manga OCR (segmented) will not work!') return pretrained_model_name_or_path = config.get('pretrained_model_name_or_path', 'kha-white/manga-ocr-base') force_cpu = config.get('force_cpu', False) - - logger.disable('manga_ocr') - logging.getLogger('transformers').setLevel(logging.ERROR) # silence transformers >=4.46 warnings - from manga_ocr import ocr - ocr.post_process = empty_post_process - logger.info(f'Loading Manga OCR model') - self.model = MOCR(pretrained_model_name_or_path, force_cpu) + initialize_manga_ocr(pretrained_model_name_or_path, force_cpu) if not force_cpu and torch.cuda.is_available(): device = 'cuda' @@ -412,7 +418,7 @@ class MangaOcr: self.text_detector_model = TextDetector(model_path=comic_text_detector_file, input_size=1024, device=device, act='leaky') self.available = True - logger.info('Manga OCR ready') + logger.info('Manga OCR (segmented) ready') def _convert_line_bbox(self, rect, img_width, img_height): x1, y1 = float(rect[0][0]), float(rect[0][1]) @@ -505,7 +511,7 @@ class MangaOcr: for line_crop in line_crops: if blk.vertical: line_crop = cv2.rotate(line_crop, cv2.ROTATE_90_CLOCKWISE) - l_text += self.model(Image.fromarray(line_crop)) + l_text += manga_ocr_model(Image.fromarray(line_crop)) l_bbox = self._convert_line_bbox(line.tolist(), img_width, img_height) word = Word( @@ -549,6 +555,37 @@ class MangaOcr: img.close() return x +class MangaOcr: + name = 'mangaocr' + readable_name = 'Manga OCR' + key = 'm' + available = False + local = True + manual_language = False + coordinate_support = False + threading_support = True + + def __init__(self, config={}): + if 'manga_ocr' not in sys.modules: + logger.warning('manga-ocr not available, Manga OCR will not work!') + else: + pretrained_model_name_or_path = config.get('pretrained_model_name_or_path', 'kha-white/manga-ocr-base') + force_cpu = config.get('force_cpu', False) + initialize_manga_ocr(pretrained_model_name_or_path, force_cpu) + self.available = True + logger.info('Manga OCR ready') + 
+ def __call__(self, img): + img, is_path = input_to_pil_image(img) + if not img: + return (False, 'Invalid image provided') + + x = (True, [manga_ocr_model(img)]) + + if is_path: + img.close() + return x + class GoogleVision: name = 'gvision' readable_name = 'Google Vision' diff --git a/owocr_config.ini b/owocr_config.ini index 94f207f..12ca863 100644 --- a/owocr_config.ini +++ b/owocr_config.ini @@ -14,9 +14,10 @@ ;a path to a text file. ;write_to = clipboard -;OCR engine to use. Available: "mangaocr", "glens", "bing","gvision", "avision", -;"alivetext", "azure", "winrtocr", "oneocr", "easyocr", "rapidocr", "ocrspace". -;engine = +;OCR engine to use. Available: "mangaocr", "mangaocrs", "glens", "bing", "gvision", +;"avision", "alivetext", "azure", "winrtocr", "oneocr", "easyocr", "rapidocr", +;"ocrspace". +;engine = ;OCR engine to use for two-pass processing. ;engine_secondary = @@ -31,9 +32,9 @@ ;Delete image files after processing when reading from a directory. ;delete_images = False -;Restricts engines to load. Available: -;avision,alivetext,bing,glens,gvision,azure,mangaocr,winrtocr,oneocr,easyocr,rapidocr,ocrspace -;engines = avision,alivetext,bing,glens,gvision,azure,mangaocr,winrtocr,oneocr,easyocr,rapidocr,ocrspace +;Restrict engines to load. 
Available: avision,alivetext,bing,glens,gvision,azure, +;mangaocr,mangaocrs,winrtocr,oneocr,easyocr,rapidocr,ocrspace +;engines = avision,alivetext,bing,glens,gvision,azure,mangaocr,mangaocrs,winrtocr,oneocr,easyocr,rapidocr,ocrspace ;logger_format = {time:HH:mm:ss.SSS} | {message} @@ -146,6 +147,11 @@ ;force_cpu = False +;[mangaocrs] +;pretrained_model_name_or_path = kha-white/manga-ocr-base + +;force_cpu = False + ;[easyocr] ;gpu = True diff --git a/pyproject.toml b/pyproject.toml index e326984..1d14f58 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" [project] name = "owocr" -version = "1.19" +version = "1.19.1" description = "Japanese OCR" readme = "README.md" requires-python = ">=3.11"