Big refactoring, allow second image source

This commit is contained in:
AuroraWright
2025-05-04 08:37:54 +02:00
parent 7a307f4cb9
commit e48f388755
3 changed files with 252 additions and 259 deletions
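In short: every OCR engine's `__call__` now funnels its argument through a shared `input_to_pil_image()` helper, so a `pathlib.Path`, a `PIL.Image`, or raw encoded bytes (the second image source) are all accepted, and an unreadable image makes the call return `(False, 'Invalid image provided')` instead of raising. Note that the old per-engine code also accepted plain `str` paths, while the new helper only matches `Path`. A minimal sketch of the new calling convention; the import path and file names are placeholders, not part of this commit:

from pathlib import Path

from PIL import Image

from owocr.ocr import MangaOcr  # import path is an assumption; any engine class behaves the same way

engine = MangaOcr()

ok, text = engine(Path('page.png'))                # filesystem path (pathlib.Path, not str)
ok, text = engine(Image.open('page.png'))          # already-opened PIL image
ok, text = engine(Path('page.png').read_bytes())   # raw encoded bytes, the new second source
ok, text = engine(Path('missing_or_corrupt.png'))  # -> (False, 'Invalid image provided')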


@@ -96,6 +96,22 @@ def post_process(text):
     return text

+def input_to_pil_image(img):
+    if isinstance(img, Image.Image):
+        pil_image = img
+    elif isinstance(img, (bytes, bytearray)):
+        pil_image = Image.open(io.BytesIO(img))
+    elif isinstance(img, Path):
+        try:
+            pil_image = Image.open(img)
+            pil_image.load()
+        except (UnidentifiedImageError, OSError) as e:
+            return None
+    else:
+        raise ValueError(f'img must be a path, PIL.Image or bytes object, instead got: {img}')
+    return pil_image

 def pil_image_to_bytes(img, img_format='png', png_compression=6, jpeg_quality=80, optimize=False):
     if img_format == 'png' and optimized_png_encode and not optimize:
         raw_data = img.convert('RGBA').tobytes()
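For clarity, a self-contained sketch that exercises the new helper's three accepted input types plus its two failure modes (`None` for an unreadable `Path`, `ValueError` for anything else, including a plain `str`). The helper body is copied from the hunk above; only the imports and the toy inputs are added here:

import io
from pathlib import Path

from PIL import Image, UnidentifiedImageError


def input_to_pil_image(img):
    if isinstance(img, Image.Image):
        pil_image = img
    elif isinstance(img, (bytes, bytearray)):
        pil_image = Image.open(io.BytesIO(img))
    elif isinstance(img, Path):
        try:
            pil_image = Image.open(img)
            pil_image.load()
        except (UnidentifiedImageError, OSError) as e:
            return None
    else:
        raise ValueError(f'img must be a path, PIL.Image or bytes object, instead got: {img}')
    return pil_image


if __name__ == '__main__':
    # a PIL.Image is passed through untouched
    blank = Image.new('RGB', (8, 8))
    assert input_to_pil_image(blank) is blank

    # bytes / bytearray are decoded through BytesIO
    buf = io.BytesIO()
    blank.save(buf, format='PNG')
    assert input_to_pil_image(buf.getvalue()).size == (8, 8)

    # a Path to a missing or undecodable file returns None instead of raising
    assert input_to_pil_image(Path('does_not_exist.png')) is None

    # anything else, including a plain str path, raises ValueError
    try:
        input_to_pil_image('page.png')
    except ValueError:
        print('str is rejected: wrap it in pathlib.Path first')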
@@ -157,15 +173,14 @@ class MangaOcr:
         self.available = True
         logger.info('Manga OCR ready')

-    def __call__(self, img_or_path):
-        if isinstance(img_or_path, str) or isinstance(img_or_path, Path):
-            img = Image.open(img_or_path)
-        elif isinstance(img_or_path, Image.Image):
-            img = img_or_path
-        else:
-            raise ValueError(f'img_or_path must be a path or PIL.Image, instead got: {img_or_path}')
+    def __call__(self, img):
+        img = input_to_pil_image(img)
+        if not img:
+            return (False, 'Invalid image provided')

         x = (True, self.model(img))
+        img.close()
         return x

 class GoogleVision:
@@ -188,13 +203,10 @@ class GoogleVision:
         except:
             logger.warning('Error parsing Google credentials, Google Vision will not work!')

-    def __call__(self, img_or_path):
-        if isinstance(img_or_path, str) or isinstance(img_or_path, Path):
-            img = Image.open(img_or_path)
-        elif isinstance(img_or_path, Image.Image):
-            img = img_or_path
-        else:
-            raise ValueError(f'img_or_path must be a path or PIL.Image, instead got: {img_or_path}')
+    def __call__(self, img):
+        img = input_to_pil_image(img)
+        if not img:
+            return (False, 'Invalid image provided')

         image_bytes = self._preprocess(img)
         image = vision.Image(content=image_bytes)
@@ -207,6 +219,8 @@ class GoogleVision:
         texts = response.text_annotations
         res = texts[0].description if len(texts) > 0 else ''

         x = (True, res)
+        img.close()
         return x

     def _preprocess(self, img):
@@ -225,13 +239,10 @@ class GoogleLens:
         self.available = True
         logger.info('Google Lens ready')

-    def __call__(self, img_or_path):
-        if isinstance(img_or_path, str) or isinstance(img_or_path, Path):
-            img = Image.open(img_or_path)
-        elif isinstance(img_or_path, Image.Image):
-            img = img_or_path
-        else:
-            raise ValueError(f'img_or_path must be a path or PIL.Image, instead got: {img_or_path}')
+    def __call__(self, img):
+        img = input_to_pil_image(img)
+        if not img:
+            return (False, 'Invalid image provided')

         request = LensOverlayServerRequest()
@@ -298,6 +309,8 @@ class GoogleLens:
             res += '\n'

         x = (True, res)
+        img.close()
         return x

     def _preprocess(self, img):
@@ -305,9 +318,10 @@ class GoogleLens:
         aspect_ratio = img.width / img.height
         new_w = int(sqrt(3000000 * aspect_ratio))
         new_h = int(new_w / aspect_ratio)
-        img = img.resize((new_w, new_h), Image.Resampling.LANCZOS)
+        img_resized = img.resize((new_w, new_h), Image.Resampling.LANCZOS)
+        img.close()

-        return (pil_image_to_bytes(img), img.width, img.height)
+        return (pil_image_to_bytes(img_resized), img_resized.width, img_resized.height)

 class GoogleLensWeb:
     name = 'glensweb'
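The Lens preprocessors also change shape here: instead of rebinding `img` to the resized copy, they keep it in a separate `img_resized`, close the original, and hand the copy to `pil_image_to_bytes()`. The target dimensions cap the image at roughly 3 megapixels while preserving the aspect ratio. A small arithmetic sketch of that sizing rule; `lens_target_size` is a made-up name, not a function in the codebase:

from math import sqrt


def lens_target_size(width, height):
    # a (new_w, new_h) pair with the same aspect ratio whose area stays at or
    # below 3,000,000 pixels -- mirrors the arithmetic in _preprocess above
    aspect_ratio = width / height
    new_w = int(sqrt(3000000 * aspect_ratio))
    new_h = int(new_w / aspect_ratio)
    return new_w, new_h


print(lens_target_size(4000, 3000))   # (2000, 1500) -> exactly 3.0 MP
print(lens_target_size(1920, 1080))   # (2309, 1298) -> just under 3 MP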
@@ -323,13 +337,10 @@ class GoogleLensWeb:
         self.available = True
         logger.info('Google Lens (web) ready')

-    def __call__(self, img_or_path):
-        if isinstance(img_or_path, str) or isinstance(img_or_path, Path):
-            img = Image.open(img_or_path)
-        elif isinstance(img_or_path, Image.Image):
-            img = img_or_path
-        else:
-            raise ValueError(f'img_or_path must be a path or PIL.Image, instead got: {img_or_path}')
+    def __call__(self, img):
+        img = input_to_pil_image(img)
+        if not img:
+            return (False, 'Invalid image provided')

         url = 'https://lens.google.com/v3/upload'
         files = {'encoded_image': ('image.png', self._preprocess(img), 'image/png')}
@@ -393,6 +404,8 @@ class GoogleLensWeb:
             res += '\n'

         x = (True, res)
+        img.close()
         return x

     def _preprocess(self, img):
@@ -400,9 +413,10 @@ class GoogleLensWeb:
         aspect_ratio = img.width / img.height
         new_w = int(sqrt(3000000 * aspect_ratio))
         new_h = int(new_w / aspect_ratio)
-        img = img.resize((new_w, new_h), Image.Resampling.LANCZOS)
+        img_resized = img.resize((new_w, new_h), Image.Resampling.LANCZOS)
+        img.close()

-        return pil_image_to_bytes(img)
+        return pil_image_to_bytes(img_resized)

 class Bing:
     name = 'bing'
@@ -415,13 +429,10 @@ class Bing:
         self.available = True
         logger.info('Bing ready')

-    def __call__(self, img_or_path):
-        if isinstance(img_or_path, str) or isinstance(img_or_path, Path):
-            img = Image.open(img_or_path)
-        elif isinstance(img_or_path, Image.Image):
-            img = img_or_path
-        else:
-            raise ValueError(f'img_or_path must be a path or PIL.Image, instead got: {img_or_path}')
+    def __call__(self, img):
+        img = input_to_pil_image(img)
+        if not img:
+            return (False, 'Invalid image provided')

         img_bytes = self._preprocess(img)
         if not img_bytes:
@@ -515,6 +526,8 @@ class Bing:
             res += line['text'] + '\n'

         x = (True, res)
+        img.close()
         return x

     def _preprocess(self, img):
@@ -526,9 +539,10 @@ class Bing:
         resize_factor = max(max_pixel_size / img.width, max_pixel_size / img.height)
         new_w = int(img.width * resize_factor)
         new_h = int(img.height * resize_factor)
-        img = img.resize((new_w, new_h), Image.Resampling.LANCZOS)
+        img_resized = img.resize((new_w, new_h), Image.Resampling.LANCZOS)
+        img.close()

-        img_bytes, _ = limit_image_size(img, max_byte_size)
+        img_bytes, _ = limit_image_size(img_resized, max_byte_size)

         if img_bytes:
             res = base64.b64encode(img_bytes).decode('utf-8')
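Bing's `_preprocess` follows the same pattern, then squeezes the resized copy under a byte limit with `limit_image_size` (an existing owocr helper; judging from this call site it takes an image plus `max_byte_size` and returns a pair whose first element is the encoded bytes, or something falsy on failure) before base64-encoding the payload. A rough, hypothetical stand-in for that last step, assuming nothing beyond Pillow:

import base64
import io

from PIL import Image


def limit_image_size_sketch(img, max_byte_size):
    # hypothetical stand-in for owocr's limit_image_size(): re-encode at
    # decreasing JPEG quality until the payload fits, give up otherwise
    for quality in (90, 80, 70, 60, 50):
        buf = io.BytesIO()
        img.convert('RGB').save(buf, format='JPEG', quality=quality)
        data = buf.getvalue()
        if len(data) <= max_byte_size:
            return data, quality
    return None, None


img_resized = Image.new('RGB', (800, 600), 'white')
max_byte_size = 1_000_000   # placeholder; the real cap is defined earlier in Bing._preprocess

img_bytes, _ = limit_image_size_sketch(img_resized, max_byte_size)
if img_bytes:
    res = base64.b64encode(img_bytes).decode('utf-8')   # the string Bing's request body carries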
@@ -550,13 +564,10 @@ class AppleVision:
         self.available = True
         logger.info('Apple Vision ready')

-    def __call__(self, img_or_path):
-        if isinstance(img_or_path, str) or isinstance(img_or_path, Path):
-            img = Image.open(img_or_path)
-        elif isinstance(img_or_path, Image.Image):
-            img = img_or_path
-        else:
-            raise ValueError(f'img_or_path must be a path or PIL.Image, instead got: {img_or_path}')
+    def __call__(self, img):
+        img = input_to_pil_image(img)
+        if not img:
+            return (False, 'Invalid image provided')

         with objc.autorelease_pool():
             req = Vision.VNRecognizeTextRequest.alloc().init()
@@ -579,6 +590,7 @@ class AppleVision:
             else:
                 x = (False, 'Unknown error!')

+        img.close()
         return x

     def _preprocess(self, img):
@@ -631,13 +643,10 @@ class AppleLiveText:
         self.available = True
         logger.info('Apple Live Text ready')

-    def __call__(self, img_or_path):
-        if isinstance(img_or_path, str) or isinstance(img_or_path, Path):
-            img = Image.open(img_or_path)
-        elif isinstance(img_or_path, Image.Image):
-            img = img_or_path
-        else:
-            raise ValueError(f'img_or_path must be a path or PIL.Image, instead got: {img_or_path}')
+    def __call__(self, img):
+        img = input_to_pil_image(img)
+        if not img:
+            return (False, 'Invalid image provided')

         with objc.autorelease_pool():
             analyzer = self.VKCImageAnalyzer.alloc().init()
@@ -691,13 +700,10 @@ class WinRTOCR:
         except:
             logger.warning('Error reading URL from config, WinRT OCR will not work!')

-    def __call__(self, img_or_path):
-        if isinstance(img_or_path, str) or isinstance(img_or_path, Path):
-            img = Image.open(img_or_path)
-        elif isinstance(img_or_path, Image.Image):
-            img = img_or_path
-        else:
-            raise ValueError(f'img_or_path must be a path or PIL.Image, instead got: {img_or_path}')
+    def __call__(self, img):
+        img = input_to_pil_image(img)
+        if not img:
+            return (False, 'Invalid image provided')

         if sys.platform == 'win32':
             res = winocr.recognize_pil_sync(img, lang='ja')['text']
@@ -716,6 +722,8 @@ class WinRTOCR:
             res = res.json()['text']

         x = (True, res)
+        img.close()
         return x

     def _preprocess(self, img):
@@ -749,13 +757,10 @@ class OneOCR:
         except:
             logger.warning('Error reading URL from config, OneOCR will not work!')

-    def __call__(self, img_or_path):
-        if isinstance(img_or_path, str) or isinstance(img_or_path, Path):
-            img = Image.open(img_or_path)
-        elif isinstance(img_or_path, Image.Image):
-            img = img_or_path
-        else:
-            raise ValueError(f'img_or_path must be a path or PIL.Image, instead got: {img_or_path}')
+    def __call__(self, img):
+        img = input_to_pil_image(img)
+        if not img:
+            return (False, 'Invalid image provided')

         if sys.platform == 'win32':
             try:
@@ -776,6 +781,8 @@ class OneOCR:
             res = res.json()['text']

         x = (True, res)
+        img.close()
         return x

     def _preprocess(self, img):
@@ -799,13 +806,10 @@ class AzureImageAnalysis:
         except:
             logger.warning('Error parsing Azure credentials, Azure Image Analysis will not work!')

-    def __call__(self, img_or_path):
-        if isinstance(img_or_path, str) or isinstance(img_or_path, Path):
-            img = Image.open(img_or_path)
-        elif isinstance(img_or_path, Image.Image):
-            img = img_or_path
-        else:
-            raise ValueError(f'img_or_path must be a path or PIL.Image, instead got: {img_or_path}')
+    def __call__(self, img):
+        img = input_to_pil_image(img)
+        if not img:
+            return (False, 'Invalid image provided')

         try:
             read_result = self.client.analyze(image_data=self._preprocess(img), visual_features=[VisualFeatures.READ])
@@ -823,6 +827,8 @@ class AzureImageAnalysis:
             return (False, 'Unknown error!')

         x = (True, res)
+        img.close()
         return x

     def _preprocess(self, img):
@@ -830,9 +836,10 @@ class AzureImageAnalysis:
         resize_factor = max(50 / img.width, 50 / img.height)
         new_w = int(img.width * resize_factor)
         new_h = int(img.height * resize_factor)
-        img = img.resize((new_w, new_h), Image.Resampling.LANCZOS)
+        img_resized = img.resize((new_w, new_h), Image.Resampling.LANCZOS)
+        img.close()

-        return pil_image_to_bytes(img)
+        return pil_image_to_bytes(img_resized)

 class EasyOCR:
     name = 'easyocr'
@@ -850,13 +857,10 @@ class EasyOCR:
         self.available = True
         logger.info('EasyOCR ready')

-    def __call__(self, img_or_path):
-        if isinstance(img_or_path, str) or isinstance(img_or_path, Path):
-            img = Image.open(img_or_path)
-        elif isinstance(img_or_path, Image.Image):
-            img = img_or_path
-        else:
-            raise ValueError(f'img_or_path must be a path or PIL.Image, instead got: {img_or_path}')
+    def __call__(self, img):
+        img = input_to_pil_image(img)
+        if not img:
+            return (False, 'Invalid image provided')

         res = ''
         read_result = self.model.readtext(self._preprocess(img), detail=0)
@@ -864,6 +868,8 @@ class EasyOCR:
             res += text + '\n'

         x = (True, res)
+        img.close()
         return x

     def _preprocess(self, img):
@@ -897,13 +903,10 @@ class RapidOCR:
         self.available = True
         logger.info('RapidOCR ready')

-    def __call__(self, img_or_path):
-        if isinstance(img_or_path, str) or isinstance(img_or_path, Path):
-            img = Image.open(img_or_path)
-        elif isinstance(img_or_path, Image.Image):
-            img = img_or_path
-        else:
-            raise ValueError(f'img_or_path must be a path or PIL.Image, instead got: {img_or_path}')
+    def __call__(self, img):
+        img = input_to_pil_image(img)
+        if not img:
+            return (False, 'Invalid image provided')

         res = ''
         read_results, elapsed = self.model(self._preprocess(img))
@@ -912,6 +915,8 @@ class RapidOCR:
             res += read_result[1] + '\n'

         x = (True, res)
+        img.close()
         return x

     def _preprocess(self, img):
@@ -932,13 +937,10 @@ class OCRSpace:
         except:
             logger.warning('Error reading API key from config, OCRSpace will not work!')

-    def __call__(self, img_or_path):
-        if isinstance(img_or_path, str) or isinstance(img_or_path, Path):
-            img = Image.open(img_or_path)
-        elif isinstance(img_or_path, Image.Image):
-            img = img_or_path
-        else:
-            raise ValueError(f'img_or_path must be a path or PIL.Image, instead got: {img_or_path}')
+    def __call__(self, img):
+        img = input_to_pil_image(img)
+        if not img:
+            return (False, 'Invalid image provided')

         img_bytes, img_extension = self._preprocess(img)
         if not img_bytes:
@@ -969,6 +971,8 @@ class OCRSpace:
         res = res['ParsedResults'][0]['ParsedText']

         x = (True, res)
+        img.close()
         return x

     def _preprocess(self, img):