Implement Google Lens (thanks Viola!)

This commit is contained in:
AuroraWright
2024-01-20 02:45:29 +01:00
parent 4a11dd1178
commit 9518272a5b
5 changed files with 56 additions and 9 deletions

View File

@@ -16,6 +16,7 @@ This has been tested with Python 3.11. Newer/older versions might work. For now
- WinRT OCR: this will work on Windows 10 or later if winocr (`pip install winocr`) is installed. It can also be used by installing winocr on a Windows virtual machine and running the server (`winocr_serve`), installing requests (`pip install requests`) and specifying the IP address of the Windows VM/machine in the config file (see below) ("w" key)
## Cloud providers
- Google Lens: Google Vision in disguise (no need for API keys!), however it needs to download a couple megabytes of data for each request. You need to install chompjs and requests (`pip install chompjs requests`) ("l" key)
- Google Vision: you need a service account .json file named google_vision.json in `user directory/.config/` and to install google-cloud-vision (`pip install google-cloud-vision`) ("g" key)
- Azure Computer Vision: you need to specify an api key and an endpoint in the config file (see below) and to install azure-cognitiveservices-vision-computervision (`pip install azure-cognitiveservices-vision-computervision`) ("v" key)
@@ -36,3 +37,5 @@ This uses code from/references these projects:
- [Manga OCR](https://github.com/kha-white/manga-ocr)
- [ocrmac](https://github.com/straussmaximilian/ocrmac) for the Apple Vision framework API
- [NadeOCR](https://github.com/Natsume-197/NadeOCR) for the Google Vision API
Thanks to Viola for working on the Google Lens implementation!

View File

@@ -1,9 +1,3 @@
__version__ = '0.1.10'
from owocr.ocr import MangaOcr
from owocr.ocr import GoogleVision
from owocr.ocr import AppleVision
from owocr.ocr import WinRTOCR
from owocr.ocr import AzureComputerVision
from owocr.ocr import EasyOCR
from owocr.ocr import PaddleOCR
from owocr.ocr import *

View File

@@ -56,6 +56,11 @@ try:
except ImportError:
pass
try:
import chompjs
except ImportError:
pass
def post_process(text):
text = ''.join(text.split())
@@ -138,6 +143,51 @@ class GoogleVision:
img.save(image_bytes, format=img.format)
return image_bytes.getvalue()
class GoogleLens:
    """OCR engine that uploads an image to Google Lens and returns the recognized text.

    The image is PNG-encoded, posted to the Lens upload endpoint, and the text
    lines are read out of the ``AF_initDataCallback`` data blob (key ``ds:1``)
    embedded in the returned HTML page.
    """

    # Engine identifier (the value used when selecting the engine by name).
    name = "glens"
    # Human-friendly engine name.
    readable_name = "Google Lens"
    # Single-character shortcut associated with this engine.
    key = "l"
    # Set to True per instance by __init__ once the optional dependencies are found.
    available = False

    def __init__(self):
        """Flag the engine as available if both chompjs and requests were imported."""
        # The optional imports are wrapped in try/except at module level, so
        # their presence is detected through sys.modules.
        if 'chompjs' not in sys.modules:
            logger.warning('chompjs not available, Google Lens will not work!')
        elif 'requests' not in sys.modules:
            logger.warning('requests not available, Google Lens will not work!')
        else:
            self.available = True
            logger.info('Google Lens ready')

    def __call__(self, img_or_path):
        """Run OCR on an image through Google Lens.

        :param img_or_path: Path to an image file (``str`` or ``Path``) or a ``PIL.Image``.
        :return: The post-processed recognized text, or an empty string when the
            request does not return HTTP 200, the data blob is not found in the
            page, or the blob does not have the expected layout.
        :raises ValueError: If ``img_or_path`` is neither a path nor a ``PIL.Image``.
        """
        if isinstance(img_or_path, (str, Path)):
            img = Image.open(img_or_path)
        elif isinstance(img_or_path, Image.Image):
            img = img_or_path
        else:
            raise ValueError(f'img_or_path must be a path or PIL.Image, instead got: {img_or_path}')

        # Millisecond timestamp, used both as the "stcs" query parameter and in
        # the uploaded file name.
        timestamp = int(time.time() * 1000)
        url = f"https://lens.google.com/v3/upload?stcs={timestamp}"
        files = {"encoded_image": (f'owo{timestamp}.png', self._preprocess(img), 'image/png')}
        res = requests.post(url, files=files)

        if res.status_code != 200:
            return ''

        # The result data is embedded in the page as a JavaScript object literal.
        regex = re.compile(r">AF_initDataCallback\(({key: 'ds:1'.*?);</script>")
        match = regex.search(res.text)
        if match is None:
            return ''

        parsed = chompjs.parse_js_object(match.group(1))
        try:
            lines = parsed["data"][3][4][0][0]
        except (KeyError, IndexError, TypeError):
            # The page layout is not a documented API; treat an unexpected
            # shape like the other failure paths instead of crashing the caller.
            logger.warning('Unexpected Google Lens response format, no text extracted')
            return ''

        # Each line is followed by a single space before post-processing.
        return post_process(''.join(line + ' ' for line in lines))

    def _preprocess(self, img):
        """Encode ``img`` as PNG and return the encoded bytes."""
        image_bytes = io.BytesIO()
        img.save(image_bytes, format="png")
        return image_bytes.getvalue()
class AppleVision:
name = "avision"
readable_name = "Apple Vision"

View File

@@ -156,7 +156,7 @@ def run(read_from='clipboard',
:param read_from: Specifies where to read input images from. Can be either "clipboard", "websocket", or a path to a directory.
:param write_to: Specifies where to save recognized texts to. Can be either "clipboard", "websocket", or a path to a text file.
:param delay_secs: How often to check for new images, in seconds.
:param engine: OCR engine to use. Available: "mangaocr", "gvision", "avision", "azure", "winrtocr", "easyocr", "paddleocr".
:param engine: OCR engine to use. Available: "mangaocr", "glens", "gvision", "avision", "azure", "winrtocr", "easyocr", "paddleocr".
:param pause_at_startup: Pause at startup.
:param ignore_flag: Process flagged clipboard images (images that are copied to the clipboard with the *ocr_ignore* string).
:param delete_images: Delete image files after processing when reading from a directory.

View File

@@ -1,5 +1,5 @@
[general]
;engines = avision,gvision,azure,mangaocr,winrtocr,easyocr,paddleocr
;engines = avision,glens,gvision,azure,mangaocr,winrtocr,easyocr,paddleocr
;logger_format = <green>{time:HH:mm:ss.SSS}</green> | <level>{message}</level>
;engine_color = cyan
;websocket_port = 7331