Add coordinate support for the remaining engines, remove Lens Web as it's pointless and potentially confusing

This commit is contained in:
AuroraWright
2025-10-16 07:49:21 +02:00
parent c21bdef848
commit 0f86750b23
6 changed files with 339 additions and 180 deletions

View File

@@ -38,7 +38,6 @@ Additionally:
## Cloud providers
- Google Lens: Google Vision in disguise (no need for API keys!), install with `pip install owocr[lens]` ("l" key)
- Google Lens (web): alternative version of Lens (Google webpage version). Results should be the same but it's much slower. Install with `pip install owocr[lensweb]` ("k" key)
- Bing: Azure in disguise (no need for API keys!) ("b" key)
- Google Vision: install with `pip install owocr[gvision]`, you also need a service account .json file named google_vision.json in `user directory/.config/` ("g" key)
- Azure Image Analysis: install with `pip install owocr[azure]`, you also need to specify an api key and an endpoint in the config file ("v" key)

View File

@@ -25,7 +25,7 @@ parser.add_argument('-rs', '--read_from_secondary', type=str, default=argparse.S
parser.add_argument('-w', '--write_to', type=str, default=argparse.SUPPRESS,
help='Where to save recognized texts to. Can be either "clipboard", "websocket", or a path to a text file.')
parser.add_argument('-e', '--engine', type=str, default=argparse.SUPPRESS,
help='OCR engine to use. Available: "mangaocr", "glens", "glensweb", "bing", "gvision", "avision", "alivetext", "azure", "winrtocr", "oneocr", "easyocr", "rapidocr", "ocrspace".')
help='OCR engine to use. Available: "mangaocr", "glens", "bing", "gvision", "avision", "alivetext", "azure", "winrtocr", "oneocr", "easyocr", "rapidocr", "ocrspace".')
parser.add_argument('-es', '--engine_secondary', type=str, default=argparse.SUPPRESS,
help='OCR engine to use for two-pass processing.')
parser.add_argument('-p', '--pause_at_startup', type=str2bool, nargs='?', const=True, default=argparse.SUPPRESS,
@@ -62,6 +62,10 @@ parser.add_argument('-l', '--language', type=str, default=argparse.SUPPRESS,
help='Two letter language code for filtering screencapture OCR results. Ex. "ja" for Japanese, "zh" for Chinese, "ko" for Korean, "ar" for Arabic, "ru" for Russian, "el" for Greek, "he" for Hebrew, "th" for Thai. Any other value will use Latin Extended (for most European languages and English).')
parser.add_argument('-of', '--output_format', type=str, default=argparse.SUPPRESS,
help='The output format for OCR results. Can be "text" (default) or "json" (to include coordinates).')
parser.add_argument('-wp', '--websocket_port', type=int, default=argparse.SUPPRESS,
help='Websocket port to use if reading or writing to websocket.')
parser.add_argument('-ds', '--delay_seconds', type=float, default=argparse.SUPPRESS,
help='Delay (in seconds) between checks when reading from clipboard (on macOS/Linux) or a directory.')
parser.add_argument('-v', '--verbosity', type=int, default=argparse.SUPPRESS,
help='Terminal window verbosity. Can be -2 (all recognized text is showed whole, default), -1 (only timestamps are shown), 0 (nothing is shown but errors), or larger than 0 to cut displayed text to that amount of characters.')
parser.add_argument('--uwu', type=str2bool, nargs='?', const=True, default=argparse.SUPPRESS, help=argparse.SUPPRESS)

View File

@@ -67,11 +67,6 @@ try:
except ImportError:
pass
try:
import pyjson5
except ImportError:
pass
try:
import betterproto
from .lens_betterproto import *
@@ -224,53 +219,146 @@ def quad_to_bounding_box(x1, y1, x2, y2, x3, y3, x4, y4, img_width=None, img_hei
rotation_z=angle
)
def merge_bounding_boxes(ocr_element_list):
all_corners = []
def merge_bounding_boxes(ocr_element_list, rotated=False):
def _get_all_corners(ocr_element_list):
corners = []
for element in ocr_element_list:
bbox = element.bounding_box
angle = bbox.rotation_z
hw = bbox.width / 2
hh = bbox.height / 2
angle = bbox.rotation_z or 0.0
hw, hh = bbox.width / 2.0, bbox.height / 2.0
cx, cy = bbox.center_x, bbox.center_y
if not angle:
corners = [
(bbox.center_x - hw, bbox.center_y - hh), # Top-left
(bbox.center_x + hw, bbox.center_y - hh), # Top-right
(bbox.center_x + hw, bbox.center_y + hh), # Bottom-right
(bbox.center_x - hw, bbox.center_y + hh) # Bottom-left
]
all_corners.extend(corners)
# Local corner offsets
local = np.array([[-hw, -hh], [hw, -hh], [hw, hh], [-hw, hh]])
if abs(angle) < 1e-12:
corners.append(local + [cx, cy])
else:
local_corners = [
(-hw, -hh), # Top-left
( hw, -hh), # Top-right
( hw, hh), # Bottom-right
(-hw, hh) # Bottom-left
]
# Rotation matrix
cos_a, sin_a = np.cos(angle), np.sin(angle)
rot = np.array([[cos_a, -sin_a], [sin_a, cos_a]])
corners.append(local @ rot.T + [cx, cy])
# Rotate and translate corners
cos_angle = cos(angle)
sin_angle = sin(angle)
return np.vstack(corners) if corners else np.empty((0, 2))
for x_local, y_local in local_corners:
x_rotated = x_local * cos_angle - y_local * sin_angle
y_rotated = x_local * sin_angle + y_local * cos_angle
x_global = bbox.center_x + x_rotated
y_global = bbox.center_y + y_rotated
all_corners.append((x_global, y_global))
def _convex_hull(points):
if len(points) <= 3:
return points
xs, ys = zip(*all_corners)
min_x, max_x = min(xs), max(xs)
min_y, max_y = min(ys), max(ys)
pts = np.unique(points, axis=0)
pts = pts[np.lexsort((pts[:, 1], pts[:, 0]))]
if len(pts) <= 1:
return pts
def cross(o, a, b):
return (a[0] - o[0]) * (b[1] - o[1]) - (a[1] - o[1]) * (b[0] - o[0])
lower, upper = [], []
for p in pts:
while len(lower) >= 2 and cross(lower[-2], lower[-1], p) <= 0:
lower.pop()
lower.append(p)
for p in pts[::-1]:
while len(upper) >= 2 and cross(upper[-2], upper[-1], p) <= 0:
upper.pop()
upper.append(p)
return np.array(lower[:-1] + upper[:-1])
all_corners = _get_all_corners(ocr_element_list)
# Axis-aligned case
if not rotated:
min_pt, max_pt = all_corners.min(axis=0), all_corners.max(axis=0)
center = (min_pt + max_pt) / 2
size = max_pt - min_pt
return BoundingBox(
center_x=center[0],
center_y=center[1],
width=size[0],
height=size[1]
)
hull = _convex_hull(all_corners)
m = len(hull)
# Trivial cases
if m == 1:
return BoundingBox(
center_x=hull[0, 0],
center_y=hull[0, 1],
width=0.0,
height=0.0,
rotation_z=0.0
)
if m == 2:
diff = hull[1] - hull[0]
length = np.linalg.norm(diff)
center = hull.mean(axis=0)
return BoundingBox(
center_x=center[0],
center_y=center[1],
width=length,
height=0.0,
rotation_z=np.arctan2(diff[1], diff[0])
)
# Test each edge orientation
edges = np.roll(hull, -1, axis=0) - hull
edge_lengths = np.linalg.norm(edges, axis=1)
valid = edge_lengths > 1e-12
if not valid.any():
# Fallback to axis-aligned
min_pt, max_pt = all_corners.min(axis=0), all_corners.max(axis=0)
center = (min_pt + max_pt) / 2
size = max_pt - min_pt
return BoundingBox(
center_x=center[0],
center_y=center[1],
width=size[0],
height=size[1]
)
angles = np.arctan2(edges[valid, 1], edges[valid, 0])
best_area, best_idx = np.inf, -1
for idx, angle in enumerate(angles):
# Rotation matrix (rotate by -angle)
cos_a, sin_a = np.cos(angle), np.sin(angle)
rot = np.array([[cos_a, sin_a], [-sin_a, cos_a]])
rotated = hull @ rot.T
min_pt, max_pt = rotated.min(axis=0), rotated.max(axis=0)
area = np.prod(max_pt - min_pt)
if area < best_area:
best_area, best_idx = area, idx
best_bounds = (min_pt, max_pt, angle)
min_pt, max_pt, angle = best_bounds
width, height = max_pt - min_pt
center_rot = (min_pt + max_pt) / 2
# Rotate center back to global coordinates
cos_a, sin_a = np.cos(angle), np.sin(angle)
rot_back = np.array([[cos_a, -sin_a], [sin_a, cos_a]])
center = rot_back @ center_rot
# Normalize angle to [-π, π]
angle = np.mod(angle + np.pi, 2 * np.pi) - np.pi
return BoundingBox(
center_x=(min_x + max_x) / 2,
center_y=(min_y + max_y) / 2,
width=max_x - min_x,
height=max_y - min_y
center_x=center[0],
center_y=center[1],
width=width,
height=height,
rotation_z=angle
)
class MangaOcr:
name = 'mangaocr'
readable_name = 'Manga OCR'
@@ -312,7 +400,7 @@ class GoogleVision:
available = False
local = False
manual_language = False
coordinate_support = False
coordinate_support = True
threading_support = True
def __init__(self):
@@ -336,20 +424,103 @@ class GoogleVision:
image_bytes = self._preprocess(img)
image = vision.Image(content=image_bytes)
try:
response = self.client.text_detection(image=image)
response = self.client.document_text_detection(image=image)
except ServiceUnavailable:
return (False, 'Connection error!')
except:
except Exception as e:
return (False, 'Unknown error!')
texts = response.text_annotations
res = texts[0].description if len(texts) > 0 else ''
x = (True, res)
ocr_result = self._to_generic_result(response.full_text_annotation, img.width, img.height)
x = (True, ocr_result)
if is_path:
img.close()
return x
def _to_generic_result(self, full_text_annotation, img_width, img_height):
    """Convert a Google Vision full-text annotation into an OcrResult.

    Every paragraph of every block is turned into a Paragraph whose
    bounding box and lines are derived from the Google paragraph. Pages
    whose reported width/height differ from ``img_width``/``img_height``
    are skipped. A falsy annotation yields an OcrResult with no paragraphs.
    """
    collected = []
    pages = full_text_annotation.pages if full_text_annotation else ()
    for page in pages:
        # Only pages matching the submitted image's dimensions are used.
        if page.width != img_width or page.height != img_height:
            continue
        for block in page.blocks:
            for google_paragraph in block.paragraphs:
                collected.append(Paragraph(
                    bounding_box=self._convert_bbox(
                        google_paragraph.bounding_box, img_width, img_height
                    ),
                    lines=self._create_lines_from_google_paragraph(
                        google_paragraph, img_width, img_height
                    ),
                ))

    properties = ImageProperties(width=img_width, height=img_height)
    return OcrResult(image_properties=properties, paragraphs=collected)
def _create_lines_from_google_paragraph(self, google_paragraph, img_width, img_height):
    """Group the words of a Google paragraph into Line objects.

    Words are accumulated in order; whenever a word's separator is a
    newline, the accumulated words are closed off as one Line whose
    bounding box is computed with ``merge_bounding_boxes(words, True)``.

    Any words left over after the last newline separator are emitted as a
    final Line instead of being discarded, so a paragraph whose last word
    carries no line-break marker still keeps all of its text.

    Returns:
        A list of Line objects (empty when the paragraph has no words).
    """
    lines = []
    words = []
    for google_word in google_paragraph.words:
        word = self._create_word_from_google_word(google_word, img_width, img_height)
        words.append(word)
        if word.separator == '\n':
            lines.append(Line(bounding_box=merge_bounding_boxes(words, True), words=words))
            # Start a fresh list; the finished Line keeps the old one.
            words = []

    if words:
        # Trailing words without a terminating line break: flush them so
        # they are not silently lost.
        lines.append(Line(bounding_box=merge_bounding_boxes(words, True), words=words))

    return lines
def _create_word_from_google_word(self, google_word, img_width, img_height):
    """Build a Word from a Google Vision word.

    The word text is the concatenation of its symbols' text. A detected
    break on a non-final symbol is inserted into the text (when it maps to
    a non-empty character); a detected break on the final symbol becomes
    the word's ``separator`` instead.
    """
    bbox = self._convert_bbox(google_word.bounding_box, img_width, img_height)

    symbols = google_word.symbols
    final_index = len(symbols) - 1
    trailing_separator = ''
    pieces = []

    for index, symbol in enumerate(symbols):
        pieces.append(symbol.text)

        has_break = hasattr(symbol, 'property') and hasattr(symbol.property, 'detected_break')
        if not has_break:
            continue

        break_char = self._break_type_to_char(symbol.property.detected_break.type_)
        if index == final_index:
            # Break after the last symbol separates this word from the next.
            trailing_separator = break_char
        elif break_char:
            # Break inside the word is kept as part of its text.
            pieces.append(break_char)

    return Word(
        text=''.join(pieces),
        bounding_box=bbox,
        separator=trailing_separator
    )
def _break_type_to_char(self, break_type):
    """Map a Vision ``DetectedBreak.BreakType`` value to a separator string.

    SPACE and SURE_SPACE map to a space, EOL_SURE_SPACE and LINE_BREAK to a
    newline, HYPHEN to ``'-'``; any other value maps to an empty string.
    """
    kinds = vision.TextAnnotation.DetectedBreak.BreakType

    if break_type in (kinds.SPACE, kinds.SURE_SPACE):
        return ' '
    if break_type in (kinds.EOL_SURE_SPACE, kinds.LINE_BREAK):
        return '\n'
    if break_type == kinds.HYPHEN:
        return '-'
    return ''
def _convert_bbox(self, quad, img_width, img_height):
    """Convert a Google Vision quad into a bounding box.

    The x/y coordinates of the quad's first four vertices are passed, in
    order, to ``quad_to_bounding_box`` together with the image dimensions.
    """
    vertices = quad.vertices
    flat_coords = []
    for index in range(4):
        vertex = vertices[index]
        flat_coords.append(vertex.x)
        flat_coords.append(vertex.y)
    return quad_to_bounding_box(*flat_coords, img_width, img_height)
def _preprocess(self, img):
return pil_image_to_bytes(img)
@@ -501,104 +672,6 @@ class GoogleLens:
return (pil_image_to_bytes(img), img.width, img.height)
class GoogleLensWeb:
name = 'glensweb'
readable_name = 'Google Lens (web)'
key = 'k'
available = False
local = False
manual_language = False
coordinate_support = False
threading_support = True
def __init__(self):
if 'pyjson5' not in sys.modules:
logger.warning('pyjson5 not available, Google Lens (web) will not work!')
else:
self.requests_session = requests.Session()
self.available = True
logger.info('Google Lens (web) ready')
def __call__(self, img):
img, is_path = input_to_pil_image(img)
if not img:
return (False, 'Invalid image provided')
url = 'https://lens.google.com/v3/upload'
files = {'encoded_image': ('image.png', self._preprocess(img), 'image/png')}
headers = {
'Host': 'lens.google.com',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:136.0) Gecko/20100101 Firefox/136.0',
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
'Accept-Language': 'ja-JP;q=0.6,ja;q=0.5',
'Accept-Encoding': 'gzip, deflate, br, zstd',
'Referer': 'https://www.google.com/',
'Origin': 'https://www.google.com',
'Alt-Used': 'lens.google.com',
'Connection': 'keep-alive',
'Upgrade-Insecure-Requests': '1',
'Sec-Fetch-Dest': 'document',
'Sec-Fetch-Mode': 'navigate',
'Sec-Fetch-Site': 'same-site',
'Priority': 'u=0, i',
'TE': 'trailers'
}
cookies = {'SOCS': 'CAESEwgDEgk0ODE3Nzk3MjQaAmVuIAEaBgiA_LyaBg'}
try:
res = self.requests_session.post(url, files=files, headers=headers, cookies=cookies, timeout=20, allow_redirects=False)
except requests.exceptions.Timeout:
return (False, 'Request timeout!')
except requests.exceptions.ConnectionError:
return (False, 'Connection error!')
if res.status_code != 303:
return (False, 'Unknown error!')
redirect_url = res.headers.get('Location')
if not redirect_url:
return (False, 'Error getting redirect URL!')
parsed_url = urlparse(redirect_url)
query_params = parse_qs(parsed_url.query)
if ('vsrid' not in query_params) or ('gsessionid' not in query_params):
return (False, 'Unknown error!')
try:
res = self.requests_session.get(f"https://lens.google.com/qfmetadata?vsrid={query_params['vsrid'][0]}&gsessionid={query_params['gsessionid'][0]}", timeout=20)
except requests.exceptions.Timeout:
return (False, 'Request timeout!')
except requests.exceptions.ConnectionError:
return (False, 'Connection error!')
if (len(res.text.splitlines()) != 3):
return (False, 'Unknown error!')
lens_object = pyjson5.loads(res.text.splitlines()[2])
res = []
text = lens_object[0][2][0][0]
for paragraph in text:
for line in paragraph[1]:
for word in line[0]:
res.append(word[1] + word[2])
x = (True, res)
if is_path:
img.close()
return x
def _preprocess(self, img):
if img.width * img.height > 3000000:
aspect_ratio = img.width / img.height
new_w = int(sqrt(3000000 * aspect_ratio))
new_h = int(new_w / aspect_ratio)
img = img.resize((new_w, new_h), Image.Resampling.LANCZOS)
return pil_image_to_bytes(img)
class Bing:
name = 'bing'
readable_name = 'Bing'
@@ -1234,7 +1307,7 @@ class AzureImageAnalysis:
available = False
local = False
manual_language = False
coordinate_support = False
coordinate_support = True
threading_support = True
def __init__(self, config={}):
@@ -1261,20 +1334,55 @@ class AzureImageAnalysis:
except:
return (False, 'Unknown error!')
res = []
if read_result.read:
for block in read_result.read.blocks:
for line in block.lines:
res.append(line.text)
else:
return (False, 'Unknown error!')
x = (True, res)
ocr_result = self._to_generic_result(read_result, img.width, img.height)
x = (True, ocr_result)
if is_path:
img.close()
return x
def _to_generic_result(self, read_result, img_width, img_height):
    """Convert an Azure Image Analysis read result into an OcrResult.

    Each Azure block becomes one Paragraph. Every Azure line becomes a Line
    (with its own polygon-derived bounding box and its text), and every
    Azure word becomes a Word. The paragraph bounding box is the merge of
    its lines' bounding boxes.

    Blocks that contain no lines are skipped: merging an empty list of
    bounding boxes would reduce over a zero-size array and raise, and such a
    block carries no text anyway.

    Returns:
        An OcrResult holding the image dimensions and the paragraphs found
        (no paragraphs when ``read_result.read`` is falsy).
    """
    paragraphs = []

    if read_result.read:
        for block in read_result.read.blocks:
            lines = []
            for azure_line in block.lines:
                l_bbox = self._convert_bbox(azure_line.bounding_polygon, img_width, img_height)
                words = [
                    Word(
                        text=azure_word.text,
                        bounding_box=self._convert_bbox(
                            azure_word.bounding_polygon, img_width, img_height
                        )
                    )
                    for azure_word in azure_line.words
                ]
                lines.append(Line(
                    bounding_box=l_bbox,
                    words=words,
                    text=azure_line.text
                ))

            if not lines:
                # Nothing to merge; an empty merge would raise.
                continue

            p_bbox = merge_bounding_boxes(lines)
            paragraphs.append(Paragraph(bounding_box=p_bbox, lines=lines))

    return OcrResult(
        image_properties=ImageProperties(width=img_width, height=img_height),
        paragraphs=paragraphs
    )
def _convert_bbox(self, rect, img_width, img_height):
    """Convert an Azure bounding polygon into a bounding box.

    ``rect`` is indexed for its first four points; each point's ``'x'`` and
    ``'y'`` entries are passed, in order, to ``quad_to_bounding_box`` along
    with the image dimensions.
    """
    flat_coords = []
    for index in range(4):
        point = rect[index]
        flat_coords.append(point['x'])
        flat_coords.append(point['y'])
    return quad_to_bounding_box(*flat_coords, img_width, img_height)
def _preprocess(self, img):
min_pixel_size = 50
max_pixel_size = 10000
@@ -1461,7 +1569,7 @@ class OCRSpace:
available = False
local = False
manual_language = True
coordinate_support = False
coordinate_support = True
threading_support = True
def __init__(self, config={}, language='ja'):
@@ -1498,14 +1606,16 @@ class OCRSpace:
if not img:
return (False, 'Invalid image provided')
img_bytes, img_extension, _ = self._preprocess(img)
og_img_width, og_img_height = img.size
img_bytes, img_extension, img_size = self._preprocess(img)
if not img_bytes:
return (False, 'Image is too big!')
data = {
'apikey': self.api_key,
'language': self.language,
'OCREngine': str(self.engine_version)
'OCREngine': str(self.engine_version),
'isOverlayRequired': 'True'
}
files = {'file': ('image.' + img_extension, img_bytes, 'image/' + img_extension)}
@@ -1526,12 +1636,57 @@ class OCRSpace:
if res['IsErroredOnProcessing']:
return (False, res['ErrorMessage'])
res = res['ParsedResults'][0]['ParsedText']
x = (True, res)
img_width, img_height = img_size
ocr_result = self._to_generic_result(res, img_width, img_height, og_img_width, og_img_height)
x = (True, ocr_result)
if is_path:
img.close()
return x
def _to_generic_result(self, api_result, img_width, img_height, og_img_width, og_img_height):
    """Convert an OCR.space API response into an OcrResult.

    Word boxes from the first parsed result's ``TextOverlay`` are normalized
    against ``img_width``/``img_height`` (the size of the image that was
    actually uploaded), while the returned image properties report
    ``og_img_width``/``og_img_height``. All lines are placed in a single
    Paragraph; when there are no usable lines the result has no paragraphs.

    A missing or null ``TextOverlay``/``Lines``/``Words`` entry is treated
    as empty, and lines without any words are skipped, because merging an
    empty list of bounding boxes would raise.
    """
    parsed_result = api_result['ParsedResults'][0]
    # ``or {}`` / ``or []`` also cover keys that are present but null.
    text_overlay = parsed_result.get('TextOverlay') or {}
    lines_data = text_overlay.get('Lines') or []

    lines = []
    for line_data in lines_data:
        words = [
            Word(
                text=word_data['WordText'],
                bounding_box=self._convert_bbox(word_data, img_width, img_height)
            )
            for word_data in (line_data.get('Words') or [])
        ]
        if not words:
            # No boxes to merge for this line.
            continue
        lines.append(Line(bounding_box=merge_bounding_boxes(words), words=words))

    ocr_result = OcrResult(
        image_properties=ImageProperties(width=og_img_width, height=og_img_height)
    )
    if lines:
        p_bbox = merge_bounding_boxes(lines)
        ocr_result.paragraphs = [Paragraph(bounding_box=p_bbox, lines=lines)]
    else:
        ocr_result.paragraphs = []

    return ocr_result
def _convert_bbox(self, word_data, img_width, img_height):
    """Build a BoundingBox from an OCR.space word entry.

    The entry's ``Left``/``Width`` are divided by ``img_width`` and its
    ``Top``/``Height`` by ``img_height``; the box center is the scaled
    top-left corner plus half of the scaled size.
    """
    rel_width = word_data['Width'] / img_width
    rel_height = word_data['Height'] / img_height
    rel_left = word_data['Left'] / img_width
    rel_top = word_data['Top'] / img_height

    return BoundingBox(
        center_x=rel_left + rel_width / 2,
        center_y=rel_top + rel_height / 2,
        width=rel_width,
        height=rel_height
    )
def _preprocess(self, img):
return limit_image_size(img, self.max_byte_size)

View File

@@ -434,7 +434,7 @@ class TextFiltering:
self.frame_stabilization_timestamp = time.time()
return 0, 0, None
def _find_changed_lines_impl(self, current_result, previous_result, next_result = None):
def _find_changed_lines_impl(self, current_result, previous_result, next_result=None):
if not current_result:
return None
@@ -1341,8 +1341,6 @@ class OutputResult:
else:
output_string = self._post_process(result_data_text, False)
log_message = output_string
if output_format == 'json':
logger.opt(colors=True).warning(f"<{engine_color}>{engine_name}</{engine_color}> does not support JSON output. Falling back to text.")
if verbosity != 0:
if verbosity < -1:
@@ -1494,6 +1492,7 @@ def run():
global engine_instances
global engine_keys
output_format = config.get_general('output_format')
engine_instances = []
config_engines = []
engine_keys = []
@@ -1506,6 +1505,11 @@ def run():
for _,engine_class in sorted(inspect.getmembers(sys.modules[__name__], lambda x: hasattr(x, '__module__') and x.__module__ and __package__ + '.ocr' in x.__module__ and inspect.isclass(x) and hasattr(x, 'name'))):
if len(config_engines) == 0 or engine_class.name in config_engines:
if output_format == 'json' and not engine_class.coordinate_support:
logger.warning(f"Skipping {engine_class.readable_name} as it does not support JSON output.")
continue
if config.get_engine(engine_class.name) == None:
if engine_class.manual_language:
engine_instance = engine_class(language=config.get_general('language'))
@@ -1545,7 +1549,6 @@ def run():
read_from_path = None
read_from_readable = []
write_to = config.get_general('write_to')
output_format = config.get_general('output_format')
terminated = threading.Event()
paused = threading.Event()
if config.get_general('pause_at_startup'):

View File

@@ -11,9 +11,8 @@
;a path to a text file.
;write_to = clipboard
;OCR engine to use. Available: "mangaocr", "glens", "glensweb", "bing",
;"gvision", "avision", "alivetext", "azure", "winrtocr", "oneocr", "easyocr",
;"rapidocr", "ocrspace".
;OCR engine to use. Available: "mangaocr", "glens", "bing", "gvision",
;"avision", "alivetext", "azure", "winrtocr", "oneocr", "easyocr", "rapidocr",
;"ocrspace".
;engine =
;OCR engine to use for two-pass processing.
@@ -30,15 +29,18 @@
;delete_images = False
;Available:
;avision,alivetext,bing,glens,glensweb,gvision,azure,mangaocr,winrtocr,oneocr,easyocr,rapidocr,ocrspace
;engines = avision,alivetext,bing,glens,glensweb,gvision,azure,mangaocr,winrtocr,oneocr,easyocr,rapidocr,ocrspace
;avision,alivetext,bing,glens,gvision,azure,mangaocr,winrtocr,oneocr,easyocr,rapidocr,ocrspace
;engines = avision,alivetext,bing,glens,gvision,azure,mangaocr,winrtocr,oneocr,easyocr,rapidocr,ocrspace
;logger_format = <green>{time:HH:mm:ss.SSS}</green> | <level>{message}</level>
;engine_color = cyan
;Delay (in seconds) between checks when reading from clipboard (on macOS/Linux)
;or a directory.
;delay_secs = 0.5
;Websocket port to use if reading or writing to websocket.
;websocket_port = 7331
;Show an operating system notification with the detected text. Will be ignored

View File

@@ -61,10 +61,6 @@ lens = [
"betterproto==2.0.0b7"
]
lensweb = [
"pyjson5"
]
gvision = [
"google-cloud-vision"
]