Push my mod

AuroraWright
2023-09-15 12:03:43 +02:00
parent 1a3ffca7c8
commit 9bfc265192
6 changed files with 504 additions and 209 deletions

.DS_Store (new binary file, not shown)

manga_ocr/__init__.py

@@ -1,3 +1,6 @@
-__version__ = '0.1.11'
+__version__ = '0.1.10'
 
 from manga_ocr.ocr import MangaOcr
+from manga_ocr.ocr import GoogleVision
+from manga_ocr.ocr import AppleVision
+from manga_ocr.ocr import AzureComputerVision
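
With these exports in place, all four engines can be constructed and called the same way: build the object once, then call it with a path or a PIL.Image to get the recognized text. A minimal usage sketch (assuming this fork is installed as the manga_ocr package; the image path is a placeholder):

from PIL import Image
from manga_ocr import MangaOcr, AppleVision

mocr = MangaOcr()             # local Transformers model
avision = AppleVision()       # only usable on macOS 13+ with pyobjc installed

img = Image.open('page.jpg')  # placeholder image path
print(mocr(img))
if avision.available:
    print(avision(img))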

manga_ocr/__main__.py

@@ -1,11 +1,11 @@
import fire

from manga_ocr.run import run


def main():
    fire.Fire(run)


if __name__ == '__main__':
    main()

manga_ocr/ocr.py

@@ -1,61 +1,231 @@
import re
import os
import io
from pathlib import Path
import warnings
import configparser
import time
import sys
import platform

import jaconv
import torch
from PIL import Image
from loguru import logger
from transformers import ViTImageProcessor, AutoTokenizer, VisionEncoderDecoderModel

try:
    import Vision
    import objc
except ImportError:
    pass

try:
    from google.cloud import vision
    from google.oauth2 import service_account
except ImportError:
    pass

try:
    from azure.cognitiveservices.vision.computervision import ComputerVisionClient
    from azure.cognitiveservices.vision.computervision.models import OperationStatusCodes
    from msrest.authentication import CognitiveServicesCredentials
except ImportError:
    pass


class MangaOcr:
    def __init__(self, pretrained_model_name_or_path='kha-white/manga-ocr-base', force_cpu=False):
        logger.info(f'Loading OCR model from {pretrained_model_name_or_path}')
        self.processor = ViTImageProcessor.from_pretrained(pretrained_model_name_or_path)
        self.tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path)
        self.model = VisionEncoderDecoderModel.from_pretrained(pretrained_model_name_or_path)

        if not force_cpu and torch.cuda.is_available():
            logger.info('Using CUDA')
            self.model.cuda()
        elif not force_cpu and torch.backends.mps.is_available():
            logger.info('Using MPS')
            warnings.filterwarnings("ignore", message=".*MPS: no support.*")
            self.model.to('mps')
        else:
            logger.info('Using CPU')

        logger.info('Manga OCR ready')

    def __call__(self, img_or_path):
        if isinstance(img_or_path, str) or isinstance(img_or_path, Path):
            img = Image.open(img_or_path)
        elif isinstance(img_or_path, Image.Image):
            img = img_or_path
        else:
            raise ValueError(f'img_or_path must be a path or PIL.Image, instead got: {img_or_path}')

        img = img.convert('L').convert('RGB')
        x = self._preprocess(img)
        x = self.model.generate(x[None].to(self.model.device), max_length=300)[0].cpu()
        x = self.tokenizer.decode(x, skip_special_tokens=True)
        x = post_process(x)
        return x

    def _preprocess(self, img):
        pixel_values = self.processor(img, return_tensors="pt").pixel_values
        return pixel_values.squeeze()


class GoogleVision:
    def __init__(self):
        if 'google.cloud' not in sys.modules:
            logger.warning('google-cloud-vision not available, Google Vision will not work!')
            self.available = False
        else:
            logger.info('Parsing Google credentials')
            google_credentials_file = os.path.join(os.path.expanduser('~'), '.config', 'google_vision.json')
            try:
                google_credentials = service_account.Credentials.from_service_account_file(google_credentials_file)
                self.client = vision.ImageAnnotatorClient(credentials=google_credentials)
                self.available = True
                logger.info('Google Vision ready')
            except:
                logger.warning('Error parsing Google credentials, Google Vision will not work!')
                self.available = False

    def __call__(self, img_or_path):
        if not self.available:
            return "Engine not available!"

        if isinstance(img_or_path, str) or isinstance(img_or_path, Path):
            img = Image.open(img_or_path)
        elif isinstance(img_or_path, Image.Image):
            img = img_or_path
        else:
            raise ValueError(f'img_or_path must be a path or PIL.Image, instead got: {img_or_path}')

        image_bytes = self._preprocess(img)
        image = vision.Image(content=image_bytes)
        response = self.client.text_detection(image=image)
        texts = response.text_annotations
        x = post_process(texts[0].description)
        return x

    def _preprocess(self, img):
        image_bytes = io.BytesIO()
        img.save(image_bytes, format=img.format)
        return image_bytes.getvalue()


class AppleVision:
    def __init__(self):
        if sys.platform != "darwin":
            logger.warning('Apple Vision is not supported on non-macOS platforms!')
            self.available = False
        elif int(platform.mac_ver()[0].split('.')[0]) < 13:
            logger.warning('Apple Vision is not supported on macOS older than Ventura/13.0!')
            self.available = False
        else:
            if 'objc' not in sys.modules:
                logger.warning('pyobjc not available, Apple Vision will not work!')
                self.available = False
            else:
                self.available = True
                logger.info('Apple Vision ready')

    def __call__(self, img_or_path):
        if not self.available:
            return "Engine not available!"

        if isinstance(img_or_path, str) or isinstance(img_or_path, Path):
            img = Image.open(img_or_path)
        elif isinstance(img_or_path, Image.Image):
            img = img_or_path
        else:
            raise ValueError(f'img_or_path must be a path or PIL.Image, instead got: {img_or_path}')

        with objc.autorelease_pool():
            req = Vision.VNRecognizeTextRequest.alloc().init()
            req.setRecognitionLevel_(0)
            req.setRecognitionLanguages_(['ja', 'en'])
            handler = Vision.VNImageRequestHandler.alloc().initWithData_options_(
                self._preprocess(img), None
            )

            success = handler.performRequests_error_([req], None)
            res = ''
            if success:
                for result in req.results():
                    res += result.text() + ' '

            req.dealloc()
            handler.dealloc()

        x = post_process(res)
        return x

    def _preprocess(self, img):
        image_bytes = io.BytesIO()
        img.save(image_bytes, format=img.format)
        return image_bytes.getvalue()


class AzureComputerVision:
    def __init__(self):
        if 'azure.cognitiveservices.vision.computervision' not in sys.modules:
            logger.warning('azure-cognitiveservices-vision-computervision not available, Azure Computer Vision will not work!')
            self.available = False
        else:
            logger.info('Parsing Azure credentials')
            azure_credentials_file = os.path.join(os.path.expanduser('~'), '.config', 'azure_computer_vision.ini')
            try:
                azure_credentials = configparser.ConfigParser()
                azure_credentials.read(azure_credentials_file)
                self.client = ComputerVisionClient(azure_credentials['config']['endpoint'], CognitiveServicesCredentials(azure_credentials['config']['api_key']))
                self.available = True
                logger.info('Azure Computer Vision ready')
            except:
                logger.warning('Error parsing Azure credentials, Azure Computer Vision will not work!')
                self.available = False

    def __call__(self, img_or_path):
        if not self.available:
            return "Engine not available!"

        if isinstance(img_or_path, str) or isinstance(img_or_path, Path):
            img = Image.open(img_or_path)
        elif isinstance(img_or_path, Image.Image):
            img = img_or_path
        else:
            raise ValueError(f'img_or_path must be a path or PIL.Image, instead got: {img_or_path}')

        image_io = self._preprocess(img)
        read_response = self.client.read_in_stream(image_io, raw=True)
        read_operation_location = read_response.headers["Operation-Location"]
        operation_id = read_operation_location.split("/")[-1]

        while True:
            read_result = self.client.get_read_result(operation_id)
            if read_result.status.lower() not in ['notstarted', 'running']:
                break
            time.sleep(0.3)

        res = ''
        if read_result.status == OperationStatusCodes.succeeded:
            for text_result in read_result.analyze_result.read_results:
                for line in text_result.lines:
                    res += line.text + ' '

        x = post_process(res)
        return x

    def _preprocess(self, img):
        image_io = io.BytesIO()
        img.save(image_io, format=img.format)
        image_io.seek(0)
        return image_io


def post_process(text):
    text = ''.join(text.split())
    text = text.replace('…', '...')
    text = re.sub('[・.]{2,}', lambda x: (x.end() - x.start()) * '.', text)
    text = jaconv.h2z(text, ascii=True, digit=True)

    return text
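
The two cloud engines read their credentials from fixed paths under ~/.config: GoogleVision expects a service-account key file at google_vision.json, and AzureComputerVision reads azure_computer_vision.ini with configparser, looking up endpoint and api_key in a [config] section. A minimal sketch of writing the Azure file from Python (endpoint and key values are placeholders):

import configparser
import os

azure_ini = configparser.ConfigParser()
azure_ini['config'] = {
    'endpoint': 'https://<your-resource>.cognitiveservices.azure.com/',  # placeholder
    'api_key': '<your-key>',                                             # placeholder
}
with open(os.path.join(os.path.expanduser('~'), '.config', 'azure_computer_vision.ini'), 'w') as f:
    azure_ini.write(f)

google_vision.json is the standard JSON key file generated for a Google Cloud service account, saved as ~/.config/google_vision.json.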

manga_ocr/run.py

@@ -1,134 +1,252 @@
import sys
import time
import threading
import os
from pathlib import Path

import fire
import numpy as np
import pyperclip
from PIL import Image
from PIL import UnidentifiedImageError
from loguru import logger
from pynput import keyboard

from manga_ocr import MangaOcr
from manga_ocr import GoogleVision
from manga_ocr import AppleVision
from manga_ocr import AzureComputerVision

engines = ['avision', 'gvision', 'azure', 'mangaocr']


def get_engine_name(engine):
    engine_names = ['Apple Vision', 'Google Vision', 'Azure Computer Vision', 'Manga OCR']
    return engine_names[engines.index(engine)]


def are_images_identical(img1, img2):
    if None in (img1, img2):
        return img1 == img2

    img1 = np.array(img1)
    img2 = np.array(img2)

    return (img1.shape == img2.shape) and (img1 == img2).all()


def process_and_write_results(mocr, avision, gvision, azure, img_or_path, write_to, engine):
    t0 = time.time()
    if engine == 'gvision':
        text = gvision(img_or_path)
    elif engine == 'avision':
        text = avision(img_or_path)
    elif engine == 'azure':
        text = azure(img_or_path)
    else:
        text = mocr(img_or_path)
    t1 = time.time()

    logger.opt(ansi=True).info(f"Text recognized in {t1 - t0:0.03f}s using <cyan>{get_engine_name(engine)}</cyan>: {text}")

    if write_to == 'clipboard':
        pyperclip.copy(text)
    else:
        write_to = Path(write_to)
        if write_to.suffix != '.txt':
            raise ValueError('write_to must be either "clipboard" or a path to a text file')

        with write_to.open('a', encoding="utf-8") as f:
            f.write(text + '\n')


def get_path_key(path):
    return path, path.lstat().st_mtime


def run(read_from='clipboard',
        write_to='clipboard',
        pretrained_model_name_or_path='kha-white/manga-ocr-base',
        force_cpu=False,
        delay_secs=0.5,
        engine='mangaocr',
        verbose=False
        ):
    """
    Run OCR in the background, waiting for new images to appear either in system clipboard, or a directory.
    Recognized texts can be either saved to system clipboard, or appended to a text file.

    :param read_from: Specifies where to read input images from. Can be either "clipboard", or a path to a directory.
    :param write_to: Specifies where to save recognized texts to. Can be either "clipboard", or a path to a text file.
    :param pretrained_model_name_or_path: Path to a trained model, either local or from Transformers' model hub.
    :param force_cpu: If True, OCR will use CPU even if GPU is available.
    :param delay_secs: How often to check for new images, in seconds.
    :param engine: OCR engine to use. Available: "mangaocr", "gvision", "avision", "azure".
    :param verbose: If True, unhides all warnings.
    """
    fmt = "<green>{time:HH:mm:ss.SSS}</green> | <level>{message}</level>"
    config = {
        "handlers": [
            {"sink": sys.stderr, "format": fmt},
        ],
    }
    logger.configure(**config)

    mocr = MangaOcr(pretrained_model_name_or_path, force_cpu)
    gvision = GoogleVision()
    azure = AzureComputerVision()
    avision = AppleVision()

    if engine not in engines:
        msg = 'Unknown OCR engine!'
        raise NotImplementedError(msg)

    if sys.platform not in ('darwin', 'win32') and write_to == 'clipboard':
        # Check if the system is using Wayland
        import os
        if os.environ.get('WAYLAND_DISPLAY'):
            # Check if the wl-clipboard package is installed
            if os.system("which wl-copy > /dev/null") == 0:
                pyperclip.set_clipboard("wl-clipboard")
            else:
                msg = 'Your session uses wayland and does not have wl-clipboard installed. ' \
                      'Install wl-clipboard for write in clipboard to work.'
                raise NotImplementedError(msg)

    if read_from == 'clipboard':
        from PIL import ImageGrab
        logger.info('Reading from clipboard')

        paused = False
        global just_unpaused
        just_unpaused = True
        img = None

        def on_key_press(key):
            global tmp_paused
            if key == keyboard.Key.cmd_r or key == keyboard.Key.ctrl_r:
                tmp_paused = True

        def on_key_release(key):
            global tmp_paused
            global just_unpaused
            if key == keyboard.Key.cmd_r or key == keyboard.Key.ctrl_r:
                tmp_paused = False
                just_unpaused = True

        global tmp_paused
        tmp_paused = False
        tmp_paused_listener = keyboard.Listener(
            on_press=on_key_press,
            on_release=on_key_release)
        tmp_paused_listener.start()
    else:
        read_from = Path(read_from)
        if not read_from.is_dir():
            raise ValueError('read_from must be either "clipboard" or a path to a directory')

        logger.info(f'Reading from directory {read_from}')

        old_paths = set()
        for path in read_from.iterdir():
            old_paths.add(get_path_key(path))

    def getchar_thread():
        global user_input
        import os
        if os.name == 'nt':  # how it works on windows
            import msvcrt
            while True:
                user_input = msvcrt.getch()
                if user_input.lower() in 'tq':
                    break
        else:
            import tty, termios, sys
            fd = sys.stdin.fileno()
            old_settings = termios.tcgetattr(fd)
            try:
                tty.setcbreak(sys.stdin.fileno())
                while True:
                    user_input = sys.stdin.read(1)
                    if user_input.lower() in 'tq':
                        break
            finally:
                termios.tcsetattr(fd, termios.TCSADRAIN, old_settings)

    global user_input
    user_input = ''
    user_input_thread = threading.Thread(target=getchar_thread, daemon=True)
    user_input_thread.start()

    while True:
        if user_input != '':
            if user_input.lower() in 'tq':
                if read_from == 'clipboard':
                    tmp_paused_listener.stop()
                user_input_thread.join()
                logger.info('Terminated!')
                break
            if read_from == 'clipboard' and user_input.lower() == 'p':
                if paused:
                    logger.info('Unpaused!')
                    just_unpaused = True
                else:
                    logger.info('Paused!')
                paused = not paused
            elif user_input.lower() == 's':
                if engine == engines[-1]:
                    engine = engines[0]
                else:
                    engine = engines[engines.index(engine) + 1]
                logger.opt(ansi=True).info(f"Switched to <cyan>{get_engine_name(engine)}</cyan>!")
            elif user_input.lower() in 'agvm':
                new_engine = engines['agvm'.find(user_input.lower())]
                if engine != new_engine:
                    engine = new_engine
                    logger.opt(ansi=True).info(f"Switched to <cyan>{get_engine_name(engine)}</cyan>!")
            user_input = ''

        if read_from == 'clipboard':
            if not paused and not tmp_paused:
                old_img = img
                try:
                    img = ImageGrab.grabclipboard()
                except OSError as error:
                    if not verbose and "cannot identify image file" in str(error):
                        # Pillow error when clipboard hasn't changed since last grab (Linux)
                        pass
                    elif not verbose and "target image/png not available" in str(error):
                        # Pillow error when clipboard contains text (Linux, X11)
                        pass
                    else:
                        logger.warning('Error while reading from clipboard ({})'.format(error))
                else:
                    if not just_unpaused and isinstance(img, Image.Image) and not are_images_identical(img, old_img):
                        process_and_write_results(mocr, avision, gvision, azure, img, write_to, engine)

                if just_unpaused:
                    just_unpaused = False
        else:
            for path in read_from.iterdir():
                path_key = get_path_key(path)
                if path_key not in old_paths:
                    old_paths.add(path_key)

                    try:
                        img = Image.open(path)
                        img.load()
                    except (UnidentifiedImageError, OSError) as e:
                        logger.warning(f'Error while reading file {path}: {e}')
                    else:
                        process_and_write_results(mocr, avision, gvision, azure, img, write_to, engine)

        time.sleep(delay_secs)


if __name__ == '__main__':
    fire.Fire(run)
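
Because __main__.py hands run() to fire.Fire, every keyword argument doubles as a command-line flag, and the same entry point can be called directly from Python. A small sketch using the directory-watching mode (the directory path is a placeholder); while it runs, 'a'/'g'/'v'/'m' select Apple Vision, Google Vision, Azure, or Manga OCR, 's' cycles through the engines, 'p' pauses clipboard mode, and 't' or 'q' terminates:

from manga_ocr.run import run

run(read_from='/path/to/screenshots',   # placeholder directory to watch
    write_to='ocr_output.txt',          # must end in .txt when not 'clipboard'
    engine='avision',                   # one of 'mangaocr', 'gvision', 'avision', 'azure'
    delay_secs=0.5)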

requirements.txt

@@ -8,3 +8,7 @@ pyperclip
 torch>=1.0
 transformers>=4.25.0
 unidic_lite
+google-cloud-vision
+azure-cognitiveservices-vision-computervision
+pyobjc
+pynput