From 9518272a5bbdf282ad3b4d2fb7aa46ff1bd8e173 Mon Sep 17 00:00:00 2001
From: AuroraWright
Date: Sat, 20 Jan 2024 02:45:29 +0100
Subject: [PATCH] Implement Google Lens (thanks Viola!)

---
 README.md         |  3 +++
 owocr/__init__.py |  8 +-------
 owocr/ocr.py      | 50 ++++++++++++++++++++++++++++++++++++++++++++++++++
 owocr/run.py      |  2 +-
 owocr_config.ini  |  2 +-
 5 files changed, 56 insertions(+), 9 deletions(-)

diff --git a/README.md b/README.md
index aa2a99f..0885e17 100644
--- a/README.md
+++ b/README.md
@@ -16,6 +16,7 @@ This has been tested with Python 3.11. Newer/older versions might work. For now
 - WinRT OCR: this will work on Windows 10 or later if winocr (`pip install winocr`) is installed. It can also be used by installing winocr on a Windows virtual machine and running the server (`winocr_serve`), installing requests (`pip install requests`) and specifying the IP address of the Windows VM/machine in the config file (see below) ("w" key)
 
 ## Cloud providers
+- Google Lens: Google Vision in disguise (no need for API keys!), however it needs to download a couple megabytes of data for each request. You need to install chompjs and requests (`pip install chompjs requests`) ("l" key)
 - Google Vision: you need a service account .json file named google_vision.json in `user directory/.config/` and installing google-cloud-vision (`pip install google-cloud-vision`) ("g" key)
 - Azure Computer Vision: you need to specify an api key and an endpoint in the config file (see below) and to install azure-cognitiveservices-vision-computervision (`pip install azure-cognitiveservices-vision-computervision`) ("v" key)
 
@@ -36,3 +37,5 @@ This uses code from/references these projects:
 - [Manga OCR](https://github.com/kha-white/manga-ocr)
 - [ocrmac](https://github.com/straussmaximilian/ocrmac) for the Apple Vision framework API
 - [NadeOCR](https://github.com/Natsume-197/NadeOCR) for the Google Vision API
+
+Thanks to viola for working on the Google Lens implementation!
\ No newline at end of file
diff --git a/owocr/__init__.py b/owocr/__init__.py
index 00e7eb9..657df77 100644
--- a/owocr/__init__.py
+++ b/owocr/__init__.py
@@ -1,9 +1,3 @@
 __version__ = '0.1.10'
 
-from owocr.ocr import MangaOcr
-from owocr.ocr import GoogleVision
-from owocr.ocr import AppleVision
-from owocr.ocr import WinRTOCR
-from owocr.ocr import AzureComputerVision
-from owocr.ocr import EasyOCR
-from owocr.ocr import PaddleOCR
+from owocr.ocr import *
diff --git a/owocr/ocr.py b/owocr/ocr.py
index 4b54b35..04d67d8 100644
--- a/owocr/ocr.py
+++ b/owocr/ocr.py
@@ -56,6 +56,11 @@ try:
 except ImportError:
     pass
 
+try:
+    import chompjs
+except ImportError:
+    pass
+
 
 def post_process(text):
     text = ''.join(text.split())
@@ -138,6 +143,51 @@ class GoogleVision:
         img.save(image_bytes, format=img.format)
         return image_bytes.getvalue()
 
+class GoogleLens:
+    name = "glens"
+    readable_name = "Google Lens"
+    key = "l"
+    available = False
+
+    def __init__(self):
+        if 'chompjs' not in sys.modules:
+            logger.warning('chompjs not available, Google Lens will not work!')
+        elif 'requests' not in sys.modules:
+            logger.warning('requests not available, Google Lens will not work!')
+        else:
+            self.available = True
+            logger.info('Google Lens ready')
+
+    def __call__(self, img_or_path):
+        if isinstance(img_or_path, str) or isinstance(img_or_path, Path):
+            img = Image.open(img_or_path)
+        elif isinstance(img_or_path, Image.Image):
+            img = img_or_path
+        else:
+            raise ValueError(f'img_or_path must be a path or PIL.Image, instead got: {img_or_path}')
+
+        timestamp = int(time.time() * 1000)
+        url = f"https://lens.google.com/v3/upload?stcs={timestamp}"
+        files = {"encoded_image": ('owo' + str(timestamp) + '.png', self._preprocess(img), 'image/png')}
+        res = requests.post(url, files=files)
+
+        x = ''
+        if res.status_code == 200:
+            regex = re.compile(r">AF_initDataCallback\(({key: 'ds:1'.*?);")
+            match = regex.search(res.text)
+            if match != None:
+                lines = chompjs.parse_js_object(match.group(1))["data"][3][4][0][0]
+                for line in lines:
+                    x += line + ' '
+        x = post_process(x)
+
+        return x
+
+    def _preprocess(self, img):
+        image_bytes = io.BytesIO()
+        img.save(image_bytes, format="png")
+        return image_bytes.getvalue()
+
 class AppleVision:
     name = "avision"
     readable_name = "Apple Vision"
diff --git a/owocr/run.py b/owocr/run.py
index 63e8676..5e37acd 100644
--- a/owocr/run.py
+++ b/owocr/run.py
@@ -156,7 +156,7 @@ def run(read_from='clipboard',
     :param read_from: Specifies where to read input images from. Can be either "clipboard", "websocket", or a path to a directory.
     :param write_to: Specifies where to save recognized texts to. Can be either "clipboard", "websocket", or a path to a text file.
     :param delay_secs: How often to check for new images, in seconds.
-    :param engine: OCR engine to use. Available: "mangaocr", "gvision", "avision", "azure", "winrtocr", "easyocr", "paddleocr".
+    :param engine: OCR engine to use. Available: "mangaocr", "glens", "gvision", "avision", "azure", "winrtocr", "easyocr", "paddleocr".
     :param pause_at_startup: Pause at startup.
     :param ignore_flag: Process flagged clipboard images (images that are copied to the clipboard with the *ocr_ignore* string).
     :param delete_images: Delete image files after processing when reading from a directory.
diff --git a/owocr_config.ini b/owocr_config.ini
index c15a8b0..4c1701f 100644
--- a/owocr_config.ini
+++ b/owocr_config.ini
@@ -1,5 +1,5 @@
 [general]
-;engines = avision,gvision,azure,mangaocr,winrtocr,easyocr,paddleocr
+;engines = avision,glens,gvision,azure,mangaocr,winrtocr,easyocr,paddleocr
 ;logger_format = {time:HH:mm:ss.SSS} | {message}
 ;engine_color = cyan
 ;websocket_port = 7331
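
---

For reviewers who want to exercise the new engine outside of `owocr/run.py`, below is a minimal sketch based only on the interface shown in the `GoogleLens` class added in `owocr/ocr.py` above: a no-argument constructor, an `available` flag, and a `__call__` that accepts a file path or a `PIL.Image`. It assumes `chompjs` and `requests` are installed as described in the README change (plus Pillow, which owocr already uses); the image filename is a placeholder.

```python
# Minimal usage sketch (not part of the patch): drive the new GoogleLens engine directly.
# Assumes `pip install chompjs requests` per the README change, plus Pillow.
from PIL import Image

from owocr.ocr import GoogleLens

engine = GoogleLens()
if engine.available:
    # __call__ accepts a path or a PIL.Image; 'screenshot.png' is a placeholder file.
    text = engine(Image.open('screenshot.png'))
    print(text)
else:
    print('chompjs or requests is missing, so the Google Lens engine is unavailable')
```

In normal use the engine would instead be selected with the new `"glens"` name, either via the `engines` line of `owocr_config.ini` or the `engine` parameter of `run()`, both of which this patch updates.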