Replace PaddleOCR with RapidOCR, handle errors when offline

2024-01-30 11:11:07 +01:00
parent 52c822272e
commit c7565b0eed
5 changed files with 105 additions and 58 deletions
--- a/README.md
+++ b/README.md
@@ -11,7 +11,7 @@ This has been tested with Python 3.11. Newer/older versions might work. It can b
 ## Local providers
 - [Manga OCR](https://github.com/kha-white/manga-ocr): refer to the readme for installation ("m" key)
 - [EasyOCR](https://github.com/JaidedAI/EasyOCR): refer to the readme for installation ("e" key)
- [PaddleOCR](https://github.com/PaddlePaddle/PaddleOCR): refer to the [wiki](https://github.com/PaddlePaddle/PaddleOCR/blob/release/2.7/doc/doc_en/quickstart_en.md) for installation ("o" key)
+- [RapidOCR](https://github.com/RapidAI/RapidOCR): refer to the readme for installation ("r" key)
 - Apple Vision framework: this will work on macOS Ventura or later. In my experience, it is the best of the local providers for horizontal text ("a" key)
 - WinRT OCR: this will work on Windows 10 or later if winocr (`pip install winocr`) is installed. It can also be used remotely: install winocr on a Windows virtual machine and run the server (`winocr_serve`), then install requests (`pip install requests`) and specify the IP address of the Windows VM/machine in the config file (see below) ("w" key)
--- a/owocr/ocr.py
+++ b/owocr/ocr.py
@@ -5,6 +5,7 @@ from pathlib import Path
 import time
 import sys
 import platform
 import logging
 from math import sqrt
 import jaconv
@@ -27,6 +28,7 @@ except ImportError:
 try:
    from google.cloud import vision
    from google.oauth2 import service_account
    from google.api_core.exceptions import ServiceUnavailable
 except ImportError:
    pass
@@ -34,6 +36,7 @@ try:
    from azure.cognitiveservices.vision.computervision import ComputerVisionClient
    from azure.cognitiveservices.vision.computervision.models import OperationStatusCodes
    from msrest.authentication import CognitiveServicesCredentials
    from msrest.exceptions import ClientRequestError
 except ImportError:
    pass
@@ -43,7 +46,7 @@ except ImportError:
    pass
 try:
-    from paddleocr import PaddleOCR as POCR
+    from rapidocr_onnxruntime import RapidOCR as ROCR
 except ImportError:
    pass
@@ -96,7 +99,7 @@ class MangaOcr:
        else:
            raise ValueError(f'img_or_path must be a path or PIL.Image, instead got: {img_or_path}')
-        x = self.model(img)
+        x = (True, self.model(img))
        return x
 class GoogleVision:
@@ -129,9 +132,14 @@ class GoogleVision:
        image_bytes = self._preprocess(img)
        image = vision.Image(content=image_bytes)
-        response = self.client.text_detection(image=image)
+        try:
            response = self.client.text_detection(image=image)
        except ServiceUnavailable:
            return (False, 'Connection error!')
        except:
            return (False, 'Unknown error!')
        texts = response.text_annotations
-        x = post_process(texts[0].description)
+        x = (True, post_process(texts[0].description))
        return x
    def _preprocess(self, img):
@@ -168,22 +176,30 @@ class GoogleLens:
        try:
            res = requests.post(url, files=files, timeout=20)
        except requests.exceptions.Timeout:
-            return 'Request timeout!'
+            return (False, 'Request timeout!')
        except requests.exceptions.ConnectionError:
            return (False, 'Connection error!')
-        x = ''
+        if res.status_code != 200:
-        if res.status_code == 200:
+            return (False, 'Unknown error!')
            regex = re.compile(r">AF_initDataCallback\(({key: 'ds:1'.*?)\);</script>")
            match = regex.search(res.text)
            if match != None:
                lens_object = pyjson5.loads(match.group(1))
                if not 'errorHasStatus' in lens_object:
                    text = lens_object['data'][3][4][0]
                    if len(text) > 0:
                        lines = text[0]
                        for line in lines:
                            x += line + ' '
                        x = post_process(x)
        regex = re.compile(r">AF_initDataCallback\(({key: 'ds:1'.*?)\);</script>")
        match = regex.search(res.text)
        if match == None:
            return (False, 'Regex error!')
        lens_object = pyjson5.loads(match.group(1))
        if 'errorHasStatus' in lens_object:
            return (False, 'Unknown Lens error!')
        res = ''
        text = lens_object['data'][3][4][0]
        if len(text) > 0:
            lines = text[0]
            for line in lines:
                res += line + ' '
        x = (True, post_process(res))
        return x
    def _preprocess(self, img):
@@ -238,9 +254,11 @@ class AppleVision:
                for result in req.results():
                    res += result.text() + ' '
                req.dealloc()
                x = (True, post_process(res))
            else:
                x = (False, 'Unknown error!')
            handler.dealloc()
            x = post_process(res)
            return x
    def _preprocess(self, img):
@@ -289,11 +307,16 @@ class WinRTOCR:
            try:
                res = requests.post(self.url, params=params, data=self._preprocess(img), timeout=3)
            except requests.exceptions.Timeout:
-                return 'Request timeout!'
+                return (False, 'Request timeout!')
            except requests.exceptions.ConnectionError:
                return (False, 'Connection error!')
            if res.status_code != 200:
                return (False, 'Unknown error!')
            res = json.loads(res.text)['text']
-        x = post_process(res)
+        x = (True, post_process(res))
        return x
    def _preprocess(self, img):
@@ -328,24 +351,33 @@ class AzureComputerVision:
            raise ValueError(f'img_or_path must be a path or PIL.Image, instead got: {img_or_path}')
        image_io = self._preprocess(img)
-        read_response = self.client.read_in_stream(image_io, raw=True)
+        logging.getLogger('urllib3.connectionpool').disabled = True
-        read_operation_location = read_response.headers['Operation-Location']
+        try:
-        operation_id = read_operation_location.split('/')[-1]
+            read_response = self.client.read_in_stream(image_io, raw=True)
-        while True:
+            read_operation_location = read_response.headers['Operation-Location']
-            read_result = self.client.get_read_result(operation_id)
+            operation_id = read_operation_location.split('/')[-1]
-            if read_result.status.lower() not in ['notstarted', 'running']:
+
-                break
+            while True:
-            time.sleep(0.3)
+                read_result = self.client.get_read_result(operation_id)
                if read_result.status.lower() not in [OperationStatusCodes.not_started, OperationStatusCodes.running]:
                    break
                time.sleep(0.3)
        except ClientRequestError:
            return (False, 'Connection error!')
        except:
            return (False, 'Unknown error!')
        res = ''
        if read_result.status == OperationStatusCodes.succeeded:
            for text_result in read_result.analyze_result.read_results:
                for line in text_result.lines:
                    res += line.text + ' '
        else:
            return (False, 'Unknown error!')
-        x = post_process(res)
+        x = (True, post_process(res))
        return x
    def _preprocess(self, img):
@@ -382,7 +414,7 @@ class EasyOCR:
        for text in read_result:
            res += text + ' '
-        x = post_process(res)
+        x = (True, post_process(res))
        return x
    def _preprocess(self, img):
@@ -390,20 +422,29 @@ class EasyOCR:
        img.save(image_bytes, format='png')
        return image_bytes.getvalue()
-class PaddleOCR:
+class RapidOCR:
-    name = 'paddleocr'
+    name = 'rapidocr'
-    readable_name = 'PaddleOCR'
+    readable_name = 'RapidOCR'
-    key = 'o'
+    key = 'r'
    available = False
    def __init__(self):
-        if 'paddleocr' not in sys.modules:
+        if 'rapidocr_onnxruntime' not in sys.modules:
-            logger.warning('paddleocr not available, PaddleOCR will not work!')
+            logger.warning('rapidocr_onnxruntime not available, RapidOCR will not work!')
        else:
-            logger.info('Loading PaddleOCR model')
+            rapidocr_model_file = os.path.join(os.path.expanduser('~'),'.cache','rapidocr_japan_PP-OCRv4_rec_infer.onnx')
-            self.model = POCR(use_angle_cls=True, show_log=False, lang='japan')
+            if not os.path.isfile(rapidocr_model_file):
                logger.info('Downloading RapidOCR model')
                try:
                    urllib.request.urlretrieve('https://raw.githubusercontent.com/AuroraWright/owocr/master/rapidocr_japan_PP-OCRv4_rec_infer.onnx', rapidocr_model_file)
                except Exception as inst:
                    logger.warning('Download failed. RapidOCR will not work!')
                    return
            logger.info('Loading RapidOCR model')
            self.model = ROCR(rec_model_path=rapidocr_model_file)
            self.available = True
-            logger.info('PaddleOCR ready')
+            logger.info('RapidOCR ready')
    def __call__(self, img_or_path):
        if isinstance(img_or_path, str) or isinstance(img_or_path, Path):
@@ -413,15 +454,16 @@ class PaddleOCR:
        else:
            raise ValueError(f'img_or_path must be a path or PIL.Image, instead got: {img_or_path}')
        logging.getLogger().disabled = True
        res = ''
-        read_results = self.model.ocr(self._preprocess(img), cls=True)
+        read_results, elapsed = self.model(self._preprocess(img))
-        for read_result in read_results:
+        if read_results:
-            if read_result:
+            for read_result in read_results:
-                for text in read_result:
+                res += read_result[1] + ' '
                    res += text[1][0] + ' '
-        x = post_process(res)
+        x = (True, post_process(res))
        return x
    def _preprocess(self, img):
        return np.array(img.convert('RGB'))
--- a/owocr/run.py
+++ b/owocr/run.py
@@ -93,6 +93,8 @@ class WebsocketServerThread(threading.Thread):
                        await websocket.send('False')
                    except websockets.exceptions.ConnectionClosedOK:
                        pass
        except websockets.exceptions.ConnectionClosedError:
            pass
        finally:
            self.clients.remove(websocket)
@@ -105,7 +107,7 @@ class WebsocketServerThread(threading.Thread):
    def run(self):
        asyncio.set_event_loop(self.loop)
-        start_server = websockets.serve(self.server_handler, '0.0.0.0', config.get_general('websocket_port'), max_size=50000000)
+        start_server = websockets.serve(self.server_handler, '0.0.0.0', config.get_general('websocket_port'), max_size=1000000000)
        self.server = start_server
        self.loop.run_until_complete(start_server)
        self.loop.run_forever()
@@ -244,17 +246,20 @@ def are_images_identical(img1, img2):
 def process_and_write_results(engine_instance, img_or_path, write_to):
    t0 = time.time()
-    text = engine_instance(img_or_path)
+    res, text = engine_instance(img_or_path)
    t1 = time.time()
    engine_color = config.get_general('engine_color')
-    logger.opt(ansi=True).info(f'Text recognized in {t1 - t0:0.03f}s using <{engine_color}>{engine_instance.readable_name}</{engine_color}>: {text}')
+    if res:
-    if config.get_general('notifications'):
+        logger.opt(ansi=True).info(f'Text recognized in {t1 - t0:0.03f}s using <{engine_color}>{engine_instance.readable_name}</{engine_color}>: {text}')
-        notification = Notify()
+        if config.get_general('notifications'):
-        notification.application_name = 'owocr'
+            notification = Notify()
-        notification.title = 'Text recognized:'
+            notification.application_name = 'owocr'
-        notification.message = text
+            notification.title = 'Text recognized:'
-        notification.send(block=False)
+            notification.message = text
            notification.send(block=False)
    else:
        logger.opt(ansi=True).info(f'<{engine_color}>{engine_instance.readable_name}</{engine_color}> reported an error after {t1 - t0:0.03f}s: {text}')
    if write_to == 'websocket':
        websocket_server_thread.send_text(text)
@@ -294,7 +299,7 @@ def run(read_from=None,
    :param read_from: Specifies where to read input images from. Can be either "clipboard", "websocket", "screencapture", or a path to a directory.
    :param write_to: Specifies where to save recognized texts to. Can be either "clipboard", "websocket", or a path to a text file.
    :param delay_secs: How often to check for new images, in seconds.
-    :param engine: OCR engine to use. Available: "mangaocr", "glens", "gvision", "avision", "azure", "winrtocr", "easyocr", "paddleocr".
+    :param engine: OCR engine to use. Available: "mangaocr", "glens", "gvision", "avision", "azure", "winrtocr", "easyocr", "rapidocr".
    :param pause_at_startup: Pause at startup.
    :param ignore_flag: Process flagged clipboard images (images that are copied to the clipboard with the *ocr_ignore* string).
    :param delete_images: Delete image files after processing when reading from a directory.
--- a/owocr_config.ini
+++ b/owocr_config.ini
@@ -1,5 +1,5 @@
 [general]
-;engines = avision,glens,gvision,azure,mangaocr,winrtocr,easyocr,paddleocr
+;engines = avision,glens,gvision,azure,mangaocr,winrtocr,easyocr,rapidocr
 ;engine = glens
 ;read_from = clipboard
 ;write_to = clipboard
--- a/rapidocr_japan_PP-OCRv4_rec_infer.onnx
+++ b/rapidocr_japan_PP-OCRv4_rec_infer.onnx