import sys import time import threading import os from pathlib import Path import fire import numpy as np import pyperclip from PIL import Image from PIL import UnidentifiedImageError from loguru import logger from pynput import keyboard from manga_ocr import * def are_images_identical(img1, img2): if None in (img1, img2): return img1 == img2 img1 = np.array(img1) img2 = np.array(img2) return (img1.shape == img2.shape) and (img1 == img2).all() def process_and_write_results(engine_instance, engine_name, img_or_path, write_to): t0 = time.time() text = engine_instance(img_or_path) t1 = time.time() logger.opt(ansi=True).info(f"Text recognized in {t1 - t0:0.03f}s using {engine_name}: {text}") if write_to == 'clipboard': pyperclip.copy(text) else: write_to = Path(write_to) if write_to.suffix != '.txt': raise ValueError('write_to must be either "clipboard" or a path to a text file') with write_to.open('a', encoding="utf-8") as f: f.write(text + '\n') def get_path_key(path): return path, path.lstat().st_mtime def run(read_from='clipboard', write_to='clipboard', pretrained_model_name_or_path='kha-white/manga-ocr-base', force_cpu=False, delay_secs=0.5, engine='mangaocr', start_paused=False, verbose=False ): """ Run OCR in the background, waiting for new images to appear either in system clipboard, or a directory. Recognized texts can be either saved to system clipboard, or appended to a text file. :param read_from: Specifies where to read input images from. Can be either "clipboard", or a path to a directory. :param write_to: Specifies where to save recognized texts to. Can be either "clipboard", or a path to a text file. :param pretrained_model_name_or_path: Path to a trained model, either local or from Transformers' model hub. :param force_cpu: If True, OCR will use CPU even if GPU is available. :param delay_secs: How often to check for new images, in seconds. :param engine: OCR engine to use. Available: "mangaocr", "gvision", "avision", "azure", "easyocr", "paddleocr". :param start_paused: Pause at startup. :param verbose: If True, unhides all warnings. """ fmt = "{time:HH:mm:ss.SSS} | {message}" config = { "handlers": [ {"sink": sys.stderr, "format": fmt}, ], } logger.configure(**config) avision = AppleVision() gvision = GoogleVision() azure = AzureComputerVision() mangaocr = MangaOcr(pretrained_model_name_or_path, force_cpu) easyocr = EasyOCR() paddleocr = PaddleOCR() engines = ['avision', 'gvision', 'azure', 'mangaocr', 'easyocr', 'paddleocr'] engine_names = ['Apple Vision', 'Google Vision', 'Azure Computer Vision', 'Manga OCR', 'EasyOCR', 'PaddleOCR'] engine_instances = [avision, gvision, azure, mangaocr, easyocr, paddleocr] engine_keys = 'agvmeo' def get_engine_name(engine): return engine_names[engines.index(engine)] if engine not in engines: msg = 'Unknown OCR engine!' raise NotImplementedError(msg) if sys.platform not in ('darwin', 'win32') and write_to == 'clipboard': # Check if the system is using Wayland import os if os.environ.get('WAYLAND_DISPLAY'): # Check if the wl-clipboard package is installed if os.system("which wl-copy > /dev/null") == 0: pyperclip.set_clipboard("wl-clipboard") else: msg = 'Your session uses wayland and does not have wl-clipboard installed. ' \ 'Install wl-clipboard for write in clipboard to work.' raise NotImplementedError(msg) if read_from == 'clipboard': from PIL import ImageGrab global just_unpaused global tmp_paused paused = start_paused just_unpaused = True tmp_paused = False img = None logger.opt(ansi=True).info(f"Reading from clipboard using {get_engine_name(engine)}{' (paused)' if paused else ''}") def on_key_press(key): global tmp_paused if key == keyboard.Key.cmd_r or key == keyboard.Key.ctrl_r: tmp_paused = True def on_key_release(key): global tmp_paused global just_unpaused if key == keyboard.Key.cmd_r or key == keyboard.Key.ctrl_r: tmp_paused = False just_unpaused = True tmp_paused_listener = keyboard.Listener( on_press=on_key_press, on_release=on_key_release) tmp_paused_listener.start() else: read_from = Path(read_from) if not read_from.is_dir(): raise ValueError('read_from must be either "clipboard" or a path to a directory') logger.opt(ansi=True).info(f'Reading from directory {read_from} using {get_engine_name(engine)}') old_paths = set() for path in read_from.iterdir(): old_paths.add(get_path_key(path)) def getchar_thread(): global user_input import os if os.name == 'nt': # how it works on windows import msvcrt while True: user_input = msvcrt.getch() if user_input.lower() in 'tq': break else: import tty, termios, sys fd = sys.stdin.fileno() old_settings = termios.tcgetattr(fd) try: tty.setcbreak(sys.stdin.fileno()) while True: user_input = sys.stdin.read(1) if user_input.lower() in 'tq': break finally: termios.tcsetattr(fd, termios.TCSADRAIN, old_settings) global user_input user_input = '' user_input_thread = threading.Thread(target=getchar_thread, daemon=True) user_input_thread.start() while True: if user_input != '': if user_input.lower() in 'tq': if read_from == 'clipboard': tmp_paused_listener.stop() user_input_thread.join() logger.info('Terminated!') break if read_from == 'clipboard' and user_input.lower() == 'p': if paused: logger.info('Unpaused!') just_unpaused = True else: logger.info('Paused!') paused = not paused elif user_input.lower() == 's': if engine == engines[-1]: engine = engines[0] else: engine = engines[engines.index(engine) + 1] logger.opt(ansi=True).info(f"Switched to {get_engine_name(engine)}!") elif user_input.lower() in engine_keys: new_engine = engines[engine_keys.find(user_input.lower())] if engine != new_engine: engine = new_engine logger.opt(ansi=True).info(f"Switched to {get_engine_name(engine)}!") user_input = '' if read_from == 'clipboard': if not paused and not tmp_paused: old_img = img try: img = ImageGrab.grabclipboard() except OSError as error: if not verbose and "cannot identify image file" in str(error): # Pillow error when clipboard hasn't changed since last grab (Linux) pass elif not verbose and "target image/png not available" in str(error): # Pillow error when clipboard contains text (Linux, X11) pass else: logger.warning('Error while reading from clipboard ({})'.format(error)) else: if not just_unpaused and isinstance(img, Image.Image) and not are_images_identical(img, old_img): process_and_write_results(engine_instances[engines.index(engine)], get_engine_name(engine), img, write_to) if just_unpaused: just_unpaused = False else: for path in read_from.iterdir(): path_key = get_path_key(path) if path_key not in old_paths: old_paths.add(path_key) try: img = Image.open(path) img.load() except (UnidentifiedImageError, OSError) as e: logger.warning(f'Error while reading file {path}: {e}') else: process_and_write_results(engine_instances[engines.index(engine)], get_engine_name(engine), img, write_to) time.sleep(delay_secs) if __name__ == '__main__': fire.Fire(run)