From c98fb181e3e1f55b843abdbfd6fa192f1871a7fb Mon Sep 17 00:00:00 2001
From: AuroraWright <AuroraWright@users.noreply.github.com>
Date: Wed, 22 Oct 2025 17:04:30 +0200
Subject: [PATCH] Have segmented and non segmented manga OCR as separate
 engines (comic text detector doesn't work well on small text areas)

---
 README.md        |  2 +-
 owocr/config.py  |  2 +-
 owocr/ocr.py     | 75 ++++++++++++++++++++++++++++++++++++------------
 owocr_config.ini | 18 ++++++++----
 pyproject.toml   |  2 +-
 5 files changed, 71 insertions(+), 28 deletions(-)

diff --git a/README.md b/README.md
index be0eb7f..18b144e 100644
--- a/README.md
+++ b/README.md
@@ -46,7 +46,7 @@ The command-line options/config file allow you to configure OCR providers, hotke
 # Supported engines
 
 ## Local
-- [Manga OCR](https://github.com/kha-white/manga-ocr) (with [comic-text-detector](https://github.com/dmMaze/comic-text-detector) as segmenter) - install: `pip install owocr[mangaocr]` → key: `m`
+- [Manga OCR](https://github.com/kha-white/manga-ocr) (with optional [comic-text-detector](https://github.com/dmMaze/comic-text-detector) as segmenter) - install: `pip install owocr[mangaocr]` → keys: `m` (regular, ideal for small text areas), `n` (segmented, ideal for manga panels/larger images with multiple text areas)
 - [EasyOCR](https://github.com/JaidedAI/EasyOCR) - install: `pip install owocr[easyocr]` → key: `e`
 - [RapidOCR](https://github.com/RapidAI/RapidOCR) - install: `pip install owocr[rapidocr]` → key: `r`
 - Apple Vision framework - Probably the best local engine to date. **macOS only - Recommended (pre-installed)** → key: `a`
diff --git a/owocr/config.py b/owocr/config.py
index a87beab..6454ef6 100644
--- a/owocr/config.py
+++ b/owocr/config.py
@@ -25,7 +25,7 @@ parser.add_argument('-rs', '--read_from_secondary', type=str, default=argparse.S
 parser.add_argument('-w', '--write_to', type=str, default=argparse.SUPPRESS,
                     help='Where to save recognized texts to. Can be either "clipboard", "websocket", or a path to a text file.')
 parser.add_argument('-e', '--engine', type=str, default=argparse.SUPPRESS,
-                    help='OCR engine to use. Available: "mangaocr", "glens", "bing", "gvision", "avision", "alivetext", "azure", "winrtocr", "oneocr", "easyocr", "rapidocr", "ocrspace".')
+                    help='OCR engine to use. Available: "mangaocr", "mangaocrs", "glens", "bing", "gvision", "avision", "alivetext", "azure", "winrtocr", "oneocr", "easyocr", "rapidocr", "ocrspace".')
 parser.add_argument('-es', '--engine_secondary', type=str, default=argparse.SUPPRESS,
                     help='Local OCR engine to use for two-pass screen capture processing.')
 parser.add_argument('-p', '--pause_at_startup', type=str2bool, nargs='?', const=True, default=argparse.SUPPRESS,
diff --git a/owocr/ocr.py b/owocr/ocr.py
index 5da0710..4829b27 100644
--- a/owocr/ocr.py
+++ b/owocr/ocr.py
@@ -85,6 +85,8 @@ try:
 except:
     optimized_png_encode = False
 
+manga_ocr_model = None  # shared Manga OCR model instance; populated by initialize_manga_ocr()
+
 
 @dataclass
 class BoundingBox:
@@ -132,8 +134,18 @@ class OcrResult:
     paragraphs: List[Paragraph] = field(default_factory=list)
 
 
-def empty_post_process(text):
-    return text
+def initialize_manga_ocr(pretrained_model_name_or_path, force_cpu):
+    """Load the shared Manga OCR model on first call; later calls reuse it and ignore their arguments."""
+    global manga_ocr_model
+    def empty_post_process(text):
+        return text
+    if manga_ocr_model is None:
+        logger.disable('manga_ocr')
+        logging.getLogger('transformers').setLevel(logging.ERROR) # silence transformers >=4.46 warnings
+        from manga_ocr import ocr
+        ocr.post_process = empty_post_process
+        logger.info('Loading Manga OCR model')
+        manga_ocr_model = MOCR(pretrained_model_name_or_path, force_cpu)
 
 def input_to_pil_image(img):
     is_path = False
@@ -243,7 +255,7 @@ def merge_bounding_boxes(ocr_element_list, rotated=False):
                 cos_a, sin_a = np.cos(angle), np.sin(angle)
                 rot = np.array([[cos_a, -sin_a], [sin_a, cos_a]])
                 corners.append(local @ rot.T + [cx, cy])
-        
+
         return np.vstack(corners) if corners else np.empty((0, 2))
 
     def _convex_hull(points):
@@ -364,10 +376,10 @@ def merge_bounding_boxes(ocr_element_list, rotated=False):
     )
 
 
-class MangaOcr:
-    name = 'mangaocr'
-    readable_name = 'Manga OCR'
-    key = 'm'
+class MangaOcrSegmented:
+    name = 'mangaocrs'
+    readable_name = 'Manga OCR (segmented)'
+    key = 'n'
     available = False
     local = True
     manual_language = False
@@ -376,9 +388,9 @@ class MangaOcr:
 
     def __init__(self, config={}):
         if 'manga_ocr' not in sys.modules:
-            logger.warning('manga-ocr not available, Manga OCR will not work!')
+            logger.warning('manga-ocr not available, Manga OCR (segmented) will not work!')
         elif 'scipy' not in sys.modules:
-            logger.warning('scipy not available, Manga OCR will not work!')
+            logger.warning('scipy not available, Manga OCR (segmented) will not work!')
         else:
             comic_text_detector_path = Path.home() / ".cache" / "manga-ocr"
             comic_text_detector_file = comic_text_detector_path / "comictextdetector.pt"
@@ -389,18 +401,12 @@ class MangaOcr:
                 try:
                     urllib.request.urlretrieve('https://github.com/zyddnys/manga-image-translator/releases/download/beta-0.3/comictextdetector.pt', str(comic_text_detector_file))
                 except:
-                    logger.warning('Download failed. Manga OCR will not work!')
+                    logger.warning('Download failed. Manga OCR (segmented) will not work!')
                     return
 
             pretrained_model_name_or_path = config.get('pretrained_model_name_or_path', 'kha-white/manga-ocr-base')
             force_cpu = config.get('force_cpu', False)
-
-            logger.disable('manga_ocr')
-            logging.getLogger('transformers').setLevel(logging.ERROR) # silence transformers >=4.46 warnings
-            from manga_ocr import ocr
-            ocr.post_process = empty_post_process
-            logger.info(f'Loading Manga OCR model')
-            self.model = MOCR(pretrained_model_name_or_path, force_cpu)
+            initialize_manga_ocr(pretrained_model_name_or_path, force_cpu)
 
             if not force_cpu and torch.cuda.is_available():
                 device = 'cuda'
@@ -412,7 +418,7 @@ class MangaOcr:
             self.text_detector_model = TextDetector(model_path=comic_text_detector_file, input_size=1024, device=device, act='leaky')
 
             self.available = True
-            logger.info('Manga OCR ready')
+            logger.info('Manga OCR (segmented) ready')
 
     def _convert_line_bbox(self, rect, img_width, img_height):
         x1, y1 = float(rect[0][0]), float(rect[0][1])
@@ -505,7 +511,7 @@ class MangaOcr:
                 for line_crop in line_crops:
                     if blk.vertical:
                         line_crop = cv2.rotate(line_crop, cv2.ROTATE_90_CLOCKWISE)
-                    l_text += self.model(Image.fromarray(line_crop))
+                    l_text += manga_ocr_model(Image.fromarray(line_crop))
                 l_bbox = self._convert_line_bbox(line.tolist(), img_width, img_height)
 
                 word = Word(
@@ -549,6 +555,37 @@ class MangaOcr:
             img.close()
         return x
 
+class MangaOcr:
+    """Manga OCR engine that runs the model once on the whole image, without text segmentation."""
+    name = 'mangaocr'
+    readable_name = 'Manga OCR'
+    key = 'm'
+    available = False
+    local = True
+    manual_language = False
+    coordinate_support = False
+    threading_support = True
+
+    def __init__(self, config={}):
+        # Stays unavailable (the class default) when the manga_ocr module was not imported.
+        if 'manga_ocr' not in sys.modules:
+            logger.warning('manga-ocr not available, Manga OCR will not work!')
+            return
+        model_source = config.get('pretrained_model_name_or_path', 'kha-white/manga-ocr-base')
+        initialize_manga_ocr(model_source, config.get('force_cpu', False))
+        self.available = True
+        logger.info('Manga OCR ready')
+
+    def __call__(self, img):
+        """Return (True, [text]) for a usable image, otherwise (False, error message)."""
+        image, opened_from_path = input_to_pil_image(img)
+        if not image:
+            return (False, 'Invalid image provided')
+        outcome = (True, [manga_ocr_model(image)])
+        if opened_from_path:  # close only when input_to_pil_image flagged the input as a path
+            image.close()
+        return outcome
+
 class GoogleVision:
     name = 'gvision'
     readable_name = 'Google Vision'
diff --git a/owocr_config.ini b/owocr_config.ini
index 94f207f..12ca863 100644
--- a/owocr_config.ini
+++ b/owocr_config.ini
@@ -14,9 +14,10 @@
 ;a path to a text file.
 ;write_to = clipboard
 
-;OCR engine to use. Available: "mangaocr", "glens", "bing","gvision", "avision",
-;"alivetext", "azure", "winrtocr", "oneocr", "easyocr", "rapidocr", "ocrspace".
-;engine = 
+;OCR engine to use. Available: "mangaocr", "mangaocrs", "glens", "bing", "gvision",
+;"avision", "alivetext", "azure", "winrtocr", "oneocr", "easyocr", "rapidocr",
+;"ocrspace".
+;engine =
 
 ;OCR engine to use for two-pass processing.
 ;engine_secondary = 
@@ -31,9 +32,9 @@
 ;Delete image files after processing when reading from a directory.
 ;delete_images = False
 
-;Restricts engines to load. Available:
-;avision,alivetext,bing,glens,gvision,azure,mangaocr,winrtocr,oneocr,easyocr,rapidocr,ocrspace
-;engines = avision,alivetext,bing,glens,gvision,azure,mangaocr,winrtocr,oneocr,easyocr,rapidocr,ocrspace
+;Restrict engines to load. Available: avision,alivetext,bing,glens,gvision,azure,
+;mangaocr,mangaocrs,winrtocr,oneocr,easyocr,rapidocr,ocrspace
+;engines = avision,alivetext,bing,glens,gvision,azure,mangaocr,mangaocrs,winrtocr,oneocr,easyocr,rapidocr,ocrspace
 
 ;logger_format = <green>{time:HH:mm:ss.SSS}</green> | <level>{message}</level>
 
@@ -146,6 +147,11 @@
 
 ;force_cpu = False
 
+;[mangaocrs]
+;pretrained_model_name_or_path = kha-white/manga-ocr-base
+
+;force_cpu = False
+
 ;[easyocr]
 ;gpu = True
 
diff --git a/pyproject.toml b/pyproject.toml
index e326984..1d14f58 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "owocr"
-version = "1.19"
+version = "1.19.1"
 description = "Japanese OCR"
 readme = "README.md"
 requires-python = ">=3.11"