Add support for fpng-py, update Azure APIs

This commit is contained in:
AuroraWright
2024-02-02 15:27:55 +01:00
parent a6581dd3ff
commit c2feb52233
2 changed files with 48 additions and 49 deletions

View File

@@ -18,7 +18,7 @@ This has been tested with Python 3.11. Newer/older versions might work. It can b
## Cloud providers ## Cloud providers
- Google Lens: Google Vision in disguise (no need for API keys!), however it needs to download a couple megabytes of data for each request. You need to install pyjson5 and requests (`pip install pyjson5 requests`) ("l" key) - Google Lens: Google Vision in disguise (no need for API keys!), however it needs to download a couple megabytes of data for each request. You need to install pyjson5 and requests (`pip install pyjson5 requests`) ("l" key)
- Google Vision: you need a service account .json file named google_vision.json in `user directory/.config/` and to install google-cloud-vision (`pip install google-cloud-vision`) ("g" key) - Google Vision: you need a service account .json file named google_vision.json in `user directory/.config/` and to install google-cloud-vision (`pip install google-cloud-vision`) ("g" key)
- Azure Computer Vision: you need to specify an api key and an endpoint in the config file (see below) and to install azure-cognitiveservices-vision-computervision (`pip install azure-cognitiveservices-vision-computervision`) ("v" key) - Azure Image Analysis: you need to specify an api key and an endpoint in the config file (see below) and to install azure-ai-vision-imageanalysis (`pip install azure-ai-vision-imageanalysis`) ("v" key)
# Usage # Usage
@@ -31,6 +31,7 @@ However:
- holding ctrl or cmd at any time will pause image processing temporarily - holding ctrl or cmd at any time will pause image processing temporarily
- for systems where text can be copied to the clipboard at the same time as images, if `*ocr_ignore*` is copied with an image, the image will be ignored - for systems where text can be copied to the clipboard at the same time as images, if `*ocr_ignore*` is copied with an image, the image will be ignored
- optionally, notifications can be enabled in the config file to show the text with a native OS notification - optionally, notifications can be enabled in the config file to show the text with a native OS notification
- optionally, you can speed up the online providers by installing fpng-py: `pip install fpng-py` (requires a developer environment on some operating systems/Python versions)
- idle resource usage on macOS and Windows when reading from the clipboard has been eliminated using native OS polling - idle resource usage on macOS and Windows when reading from the clipboard has been eliminated using native OS polling
- a config file (to be created in `user directory/.config/owocr_config.ini`, on Windows `user directory` is the `C:\Users\yourusername` folder) can be used to configure the script, for example to limit providers (to reduce clutter/memory usage) or to specify provider settings such as API keys. A sample config file is provided [here](https://raw.githubusercontent.com/AuroraWright/owocr/master/owocr_config.ini) - a config file (to be created in `user directory/.config/owocr_config.ini`, on Windows `user directory` is the `C:\Users\yourusername` folder) can be used to configure the script, for example to limit providers (to reduce clutter/memory usage) or to specify provider settings such as API keys. A sample config file is provided [here](https://raw.githubusercontent.com/AuroraWright/owocr/master/owocr_config.ini)

View File

@@ -33,10 +33,10 @@ except ImportError:
pass pass
try: try:
from azure.cognitiveservices.vision.computervision import ComputerVisionClient from azure.ai.vision.imageanalysis import ImageAnalysisClient
from azure.cognitiveservices.vision.computervision.models import OperationStatusCodes from azure.ai.vision.imageanalysis.models import VisualFeatures
from msrest.authentication import CognitiveServicesCredentials from azure.core.credentials import AzureKeyCredential
from msrest.exceptions import ClientRequestError from azure.core.exceptions import ServiceRequestError
except ImportError: except ImportError:
pass pass
@@ -66,6 +66,12 @@ try:
except ImportError: except ImportError:
pass pass
# fpng-py is an optional dependency: record whether it could be imported so
# the PNG encoding code can decide which encoder to use.
try:
    import fpng_py
    optimized_png_encode = True
except ImportError:
    # Only a failed import means "not available"; a bare except would also
    # swallow KeyboardInterrupt/SystemExit raised while importing.
    optimized_png_encode = False
def empty_post_process(text): def empty_post_process(text):
return text return text
@@ -79,6 +85,22 @@ def post_process(text):
return text return text
def pil_image_to_bytes(img, img_format='png', png_compression=6):
    """Encode an image object into bytes of the requested format.

    When ``img_format`` is ``'png'`` and ``optimized_png_encode`` is true,
    the image is converted to RGBA and encoded with
    ``fpng_py.fpng_encode_image_to_memory``; ``png_compression`` is not used
    on that path. Otherwise the image is written to an in-memory buffer via
    ``img.save`` with ``format=img_format`` and
    ``compress_level=png_compression``.

    Args:
        img: Image object providing ``convert``, ``size``, ``tobytes`` and
            ``save`` (callers in this file pass PIL images).
        img_format: Target format name handed to ``img.save``.
        png_compression: Value forwarded as ``compress_level`` to ``img.save``.

    Returns:
        The encoded image as ``bytes``.
    """
    if img_format != 'png' or not optimized_png_encode:
        # Generic path: let the image object serialize itself into memory.
        with io.BytesIO() as buffer:
            img.save(buffer, format=img_format, compress_level=png_compression)
            return buffer.getvalue()

    # Fast path: hand the raw RGBA pixel data to fpng.
    rgba_pixels = img.convert('RGBA').tobytes()
    width, height = img.size
    return fpng_py.fpng_encode_image_to_memory(rgba_pixels, width, height)
def pil_image_to_numpy_array(img):
    """Return the image's data as a NumPy array after converting it to RGBA.

    Args:
        img: Image object providing ``convert`` (callers in this file pass
            PIL images).

    Returns:
        ``np.array`` built from the RGBA-converted image.
    """
    rgba_image = img.convert('RGBA')
    return np.array(rgba_image)
class MangaOcr: class MangaOcr:
name = 'mangaocr' name = 'mangaocr'
readable_name = 'Manga OCR' readable_name = 'Manga OCR'
@@ -149,9 +171,7 @@ class GoogleVision:
return x return x
def _preprocess(self, img): def _preprocess(self, img):
image_bytes = io.BytesIO() return pil_image_to_bytes(img)
img.save(image_bytes, format='png')
return image_bytes.getvalue()
class GoogleLens: class GoogleLens:
name = 'glens' name = 'glens'
@@ -216,9 +236,7 @@ class GoogleLens:
new_h = int(new_w / aspect_ratio) new_h = int(new_w / aspect_ratio)
img = img.resize((new_w, new_h), Image.LANCZOS) img = img.resize((new_w, new_h), Image.LANCZOS)
image_bytes = io.BytesIO() return pil_image_to_bytes(img)
img.save(image_bytes, format='png')
return image_bytes.getvalue()
class AppleVision: class AppleVision:
name = 'avision' name = 'avision'
@@ -268,9 +286,7 @@ class AppleVision:
return x return x
def _preprocess(self, img): def _preprocess(self, img):
image_bytes = io.BytesIO() return pil_image_to_bytes(img, 'tiff')
img.save(image_bytes, format='tiff')
return image_bytes.getvalue()
class WinRTOCR: class WinRTOCR:
name = 'winrtocr' name = 'winrtocr'
@@ -326,27 +342,25 @@ class WinRTOCR:
return x return x
def _preprocess(self, img): def _preprocess(self, img):
image_bytes = io.BytesIO() return pil_image_to_bytes(img, png_compression=1)
img.save(image_bytes, format='png', compress_level=1)
return image_bytes.getvalue()
class AzureComputerVision: class AzureImageAnalysis:
name = 'azure' name = 'azure'
readable_name = 'Azure Computer Vision' readable_name = 'Azure Image Analysis'
key = 'v' key = 'v'
available = False available = False
def __init__(self, config={}): def __init__(self, config={}):
if 'azure.cognitiveservices.vision.computervision' not in sys.modules: if 'azure.ai.vision.imageanalysis' not in sys.modules:
logger.warning('azure-cognitiveservices-vision-computervision not available, Azure Computer Vision will not work!') logger.warning('azure-ai-vision-imageanalysis not available, Azure Image Analysis will not work!')
else: else:
logger.info(f'Parsing Azure credentials') logger.info(f'Parsing Azure credentials')
try: try:
self.client = ComputerVisionClient(config['endpoint'], CognitiveServicesCredentials(config['api_key'])) self.client = ImageAnalysisClient(config['endpoint'], AzureKeyCredential(config['api_key']))
self.available = True self.available = True
logger.info('Azure Computer Vision ready') logger.info('Azure Image Analysis ready')
except: except:
logger.warning('Error parsing Azure credentials, Azure Computer Vision will not work!') logger.warning('Error parsing Azure credentials, Azure Image Analysis will not work!')
def __call__(self, img_or_path): def __call__(self, img_or_path):
if isinstance(img_or_path, str) or isinstance(img_or_path, Path): if isinstance(img_or_path, str) or isinstance(img_or_path, Path):
@@ -356,29 +370,16 @@ class AzureComputerVision:
else: else:
raise ValueError(f'img_or_path must be a path or PIL.Image, instead got: {img_or_path}') raise ValueError(f'img_or_path must be a path or PIL.Image, instead got: {img_or_path}')
image_io = self._preprocess(img)
logging.getLogger('urllib3.connectionpool').disabled = True
try: try:
read_response = self.client.read_in_stream(image_io, raw=True) read_result = self.client.analyze(image_data=self._preprocess(img), visual_features=[VisualFeatures.READ])
except ServiceRequestError:
read_operation_location = read_response.headers['Operation-Location']
operation_id = read_operation_location.split('/')[-1]
while True:
read_result = self.client.get_read_result(operation_id)
if read_result.status.lower() not in [OperationStatusCodes.not_started, OperationStatusCodes.running]:
break
time.sleep(0.3)
except ClientRequestError:
return (False, 'Connection error!') return (False, 'Connection error!')
except: except:
return (False, 'Unknown error!') return (False, 'Unknown error!')
res = '' res = ''
if read_result.status == OperationStatusCodes.succeeded: if read_result.read:
for text_result in read_result.analyze_result.read_results: for line in read_result.read.blocks[0].lines:
for line in text_result.lines:
res += line.text + ' ' res += line.text + ' '
else: else:
return (False, 'Unknown error!') return (False, 'Unknown error!')
@@ -387,10 +388,7 @@ class AzureComputerVision:
return x return x
def _preprocess(self, img): def _preprocess(self, img):
image_io = io.BytesIO() return pil_image_to_bytes(img)
img.save(image_io, format='png')
image_io.seek(0)
return image_io
class EasyOCR: class EasyOCR:
name = 'easyocr' name = 'easyocr'
@@ -424,7 +422,7 @@ class EasyOCR:
return x return x
def _preprocess(self, img): def _preprocess(self, img):
return np.array(img.convert('RGB')) return pil_image_to_numpy_array(img)
class RapidOCR: class RapidOCR:
name = 'rapidocr' name = 'rapidocr'
@@ -450,6 +448,7 @@ class RapidOCR:
logger.info('Loading RapidOCR model') logger.info('Loading RapidOCR model')
self.model = ROCR(rec_model_path=rapidocr_model_file) self.model = ROCR(rec_model_path=rapidocr_model_file)
logging.getLogger().disabled = True
self.available = True self.available = True
logger.info('RapidOCR ready') logger.info('RapidOCR ready')
@@ -461,7 +460,6 @@ class RapidOCR:
else: else:
raise ValueError(f'img_or_path must be a path or PIL.Image, instead got: {img_or_path}') raise ValueError(f'img_or_path must be a path or PIL.Image, instead got: {img_or_path}')
logging.getLogger().disabled = True
res = '' res = ''
read_results, elapsed = self.model(self._preprocess(img)) read_results, elapsed = self.model(self._preprocess(img))
if read_results: if read_results:
@@ -472,4 +470,4 @@ class RapidOCR:
return x return x
def _preprocess(self, img): def _preprocess(self, img):
return np.array(img.convert('RGB')) return pil_image_to_numpy_array(img)