Add coordinate support for the remaining engines, remove Lens Web as it's pointless and potentially confusing

AuroraWright
2025-10-16 07:49:21 +02:00
parent c21bdef848
commit 0f86750b23
6 changed files with 339 additions and 180 deletions


@@ -67,11 +67,6 @@ try:
 except ImportError:
     pass
-try:
-    import pyjson5
-except ImportError:
-    pass
 try:
     import betterproto
     from .lens_betterproto import *
@@ -224,53 +219,146 @@ def quad_to_bounding_box(x1, y1, x2, y2, x3, y3, x4, y4, img_width=None, img_height=None):
         rotation_z=angle
     )
 
-def merge_bounding_boxes(ocr_element_list):
-    all_corners = []
-
-    for element in ocr_element_list:
-        bbox = element.bounding_box
-        angle = bbox.rotation_z
-        hw = bbox.width / 2
-        hh = bbox.height / 2
-
-        if not angle:
-            corners = [
-                (bbox.center_x - hw, bbox.center_y - hh), # Top-left
-                (bbox.center_x + hw, bbox.center_y - hh), # Top-right
-                (bbox.center_x + hw, bbox.center_y + hh), # Bottom-right
-                (bbox.center_x - hw, bbox.center_y + hh)  # Bottom-left
-            ]
-            all_corners.extend(corners)
-        else:
-            local_corners = [
-                (-hw, -hh), # Top-left
-                ( hw, -hh), # Top-right
-                ( hw,  hh), # Bottom-right
-                (-hw,  hh)  # Bottom-left
-            ]
-
-            # Rotate and translate corners
-            cos_angle = cos(angle)
-            sin_angle = sin(angle)
-
-            for x_local, y_local in local_corners:
-                x_rotated = x_local * cos_angle - y_local * sin_angle
-                y_rotated = x_local * sin_angle + y_local * cos_angle
-                x_global = bbox.center_x + x_rotated
-                y_global = bbox.center_y + y_rotated
-                all_corners.append((x_global, y_global))
-
-    xs, ys = zip(*all_corners)
-    min_x, max_x = min(xs), max(xs)
-    min_y, max_y = min(ys), max(ys)
+def merge_bounding_boxes(ocr_element_list, rotated=False):
+    def _get_all_corners(ocr_element_list):
+        corners = []
+        for element in ocr_element_list:
+            bbox = element.bounding_box
+            angle = bbox.rotation_z or 0.0
+            hw, hh = bbox.width / 2.0, bbox.height / 2.0
+            cx, cy = bbox.center_x, bbox.center_y
+
+            # Local corner offsets
+            local = np.array([[-hw, -hh], [hw, -hh], [hw, hh], [-hw, hh]])
+
+            if abs(angle) < 1e-12:
+                corners.append(local + [cx, cy])
+            else:
+                # Rotation matrix
+                cos_a, sin_a = np.cos(angle), np.sin(angle)
+                rot = np.array([[cos_a, -sin_a], [sin_a, cos_a]])
+                corners.append(local @ rot.T + [cx, cy])
+        return np.vstack(corners) if corners else np.empty((0, 2))
+
+    def _convex_hull(points):
+        if len(points) <= 3:
+            return points
+
+        pts = np.unique(points, axis=0)
+        pts = pts[np.lexsort((pts[:, 1], pts[:, 0]))]
+
+        if len(pts) <= 1:
+            return pts
+
+        def cross(o, a, b):
+            return (a[0] - o[0]) * (b[1] - o[1]) - (a[1] - o[1]) * (b[0] - o[0])
+
+        lower, upper = [], []
+        for p in pts:
+            while len(lower) >= 2 and cross(lower[-2], lower[-1], p) <= 0:
+                lower.pop()
+            lower.append(p)
+        for p in pts[::-1]:
+            while len(upper) >= 2 and cross(upper[-2], upper[-1], p) <= 0:
+                upper.pop()
+            upper.append(p)
+        return np.array(lower[:-1] + upper[:-1])
+
+    all_corners = _get_all_corners(ocr_element_list)
+
+    # Axis-aligned case
+    if not rotated:
+        min_pt, max_pt = all_corners.min(axis=0), all_corners.max(axis=0)
+        center = (min_pt + max_pt) / 2
+        size = max_pt - min_pt
+        return BoundingBox(
+            center_x=center[0],
+            center_y=center[1],
+            width=size[0],
+            height=size[1]
+        )
+
+    hull = _convex_hull(all_corners)
+    m = len(hull)
+
+    # Trivial cases
+    if m == 1:
+        return BoundingBox(
+            center_x=hull[0, 0],
+            center_y=hull[0, 1],
+            width=0.0,
+            height=0.0,
+            rotation_z=0.0
+        )
+    if m == 2:
+        diff = hull[1] - hull[0]
+        length = np.linalg.norm(diff)
+        center = hull.mean(axis=0)
+        return BoundingBox(
+            center_x=center[0],
+            center_y=center[1],
+            width=length,
+            height=0.0,
+            rotation_z=np.arctan2(diff[1], diff[0])
+        )
+
+    # Test each edge orientation
+    edges = np.roll(hull, -1, axis=0) - hull
+    edge_lengths = np.linalg.norm(edges, axis=1)
+    valid = edge_lengths > 1e-12
+    if not valid.any():
+        # Fallback to axis-aligned
+        min_pt, max_pt = all_corners.min(axis=0), all_corners.max(axis=0)
+        center = (min_pt + max_pt) / 2
+        size = max_pt - min_pt
+        return BoundingBox(
+            center_x=center[0],
+            center_y=center[1],
+            width=size[0],
+            height=size[1]
+        )
+
+    angles = np.arctan2(edges[valid, 1], edges[valid, 0])
+    best_area, best_idx = np.inf, -1
+    for idx, angle in enumerate(angles):
+        # Rotation matrix (rotate by -angle)
+        cos_a, sin_a = np.cos(angle), np.sin(angle)
+        rot = np.array([[cos_a, sin_a], [-sin_a, cos_a]])
+        rotated = hull @ rot.T
+        min_pt, max_pt = rotated.min(axis=0), rotated.max(axis=0)
+        area = np.prod(max_pt - min_pt)
+        if area < best_area:
+            best_area, best_idx = area, idx
+            best_bounds = (min_pt, max_pt, angle)
+
+    min_pt, max_pt, angle = best_bounds
+    width, height = max_pt - min_pt
+    center_rot = (min_pt + max_pt) / 2
+
+    # Rotate center back to global coordinates
+    cos_a, sin_a = np.cos(angle), np.sin(angle)
+    rot_back = np.array([[cos_a, -sin_a], [sin_a, cos_a]])
+    center = rot_back @ center_rot
+
+    # Normalize angle to [-π, π]
+    angle = np.mod(angle + np.pi, 2 * np.pi) - np.pi
 
     return BoundingBox(
-        center_x=(min_x + max_x) / 2,
-        center_y=(min_y + max_y) / 2,
-        width=max_x - min_x,
-        height=max_y - min_y
+        center_x=center[0],
+        center_y=center[1],
+        width=width,
+        height=height,
+        rotation_z=angle
     )
 
 class MangaOcr:
     name = 'mangaocr'
     readable_name = 'Manga OCR'
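
Note: with rotated=True, merge_bounding_boxes now returns the minimum-area enclosing rectangle by testing each convex-hull edge orientation (rotating-calipers style) instead of the axis-aligned union. A minimal usage sketch, assuming the project's Word and BoundingBox types accept the keyword fields used throughout this diff; the sample values are made up:

    # Two word boxes, the second rotated ~30 degrees (0.52 rad)
    words = [
        Word(text='foo', bounding_box=BoundingBox(center_x=0.20, center_y=0.50, width=0.10, height=0.05)),
        Word(text='bar', bounding_box=BoundingBox(center_x=0.32, center_y=0.55, width=0.10, height=0.05, rotation_z=0.52)),
    ]
    aabb = merge_bounding_boxes(words)                 # axis-aligned union
    tight = merge_bounding_boxes(words, rotated=True)  # min-area rotated rectangle; rotation_z normalized to [-pi, pi]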
@@ -312,7 +400,7 @@ class GoogleVision:
     available = False
     local = False
     manual_language = False
-    coordinate_support = False
+    coordinate_support = True
     threading_support = True
 
     def __init__(self):
@@ -336,20 +424,103 @@ class GoogleVision:
         image_bytes = self._preprocess(img)
         image = vision.Image(content=image_bytes)
         try:
-            response = self.client.text_detection(image=image)
+            response = self.client.document_text_detection(image=image)
         except ServiceUnavailable:
             return (False, 'Connection error!')
-        except:
+        except Exception as e:
             return (False, 'Unknown error!')
-        texts = response.text_annotations
-        res = texts[0].description if len(texts) > 0 else ''
-        x = (True, res)
+        ocr_result = self._to_generic_result(response.full_text_annotation, img.width, img.height)
+        x = (True, ocr_result)
 
         if is_path:
             img.close()
         return x
 
+    def _to_generic_result(self, full_text_annotation, img_width, img_height):
+        paragraphs = []
+        if full_text_annotation:
+            for page in full_text_annotation.pages:
+                if page.width == img_width and page.height == img_height:
+                    for block in page.blocks:
+                        for google_paragraph in block.paragraphs:
+                            p_bbox = self._convert_bbox(google_paragraph.bounding_box, img_width, img_height)
+                            lines = self._create_lines_from_google_paragraph(google_paragraph, img_width, img_height)
+                            paragraph = Paragraph(bounding_box=p_bbox, lines=lines)
+                            paragraphs.append(paragraph)
+
+        return OcrResult(
+            image_properties=ImageProperties(width=img_width, height=img_height),
+            paragraphs=paragraphs
+        )
+
+    def _create_lines_from_google_paragraph(self, google_paragraph, img_width, img_height):
+        lines = []
+        words = []
+        for google_word in google_paragraph.words:
+            word = self._create_word_from_google_word(google_word, img_width, img_height)
+            words.append(word)
+            if word.separator == '\n':
+                l_bbox = merge_bounding_boxes(words, True)
+                line = Line(bounding_box=l_bbox, words=words)
+                lines.append(line)
+                words = []
+        return lines
+
+    def _create_word_from_google_word(self, google_word, img_width, img_height):
+        w_bbox = self._convert_bbox(google_word.bounding_box, img_width, img_height)
+        w_separator = ''
+        w_text_parts = []
+        for i, symbol in enumerate(google_word.symbols):
+            separator = None
+            if hasattr(symbol, 'property') and hasattr(symbol.property, 'detected_break'):
+                detected_break = symbol.property.detected_break
+                detected_separator = self._break_type_to_char(detected_break.type_)
+                if i == len(google_word.symbols) - 1:
+                    w_separator = detected_separator
+                else:
+                    separator = detected_separator
+            symbol_text = symbol.text
+            w_text_parts.append(symbol_text)
+            if separator:
+                w_text_parts.append(separator)
+
+        word_text = ''.join(w_text_parts)
+        return Word(
+            text=word_text,
+            bounding_box=w_bbox,
+            separator=w_separator
+        )
+
+    def _break_type_to_char(self, break_type):
+        if break_type == vision.TextAnnotation.DetectedBreak.BreakType.SPACE:
+            return ' '
+        elif break_type == vision.TextAnnotation.DetectedBreak.BreakType.SURE_SPACE:
+            return ' '
+        elif break_type == vision.TextAnnotation.DetectedBreak.BreakType.EOL_SURE_SPACE:
+            return '\n'
+        elif break_type == vision.TextAnnotation.DetectedBreak.BreakType.HYPHEN:
+            return '-'
+        elif break_type == vision.TextAnnotation.DetectedBreak.BreakType.LINE_BREAK:
+            return '\n'
+        return ''
+
+    def _convert_bbox(self, quad, img_width, img_height):
+        vertices = quad.vertices
+        return quad_to_bounding_box(
+            vertices[0].x, vertices[0].y,
+            vertices[1].x, vertices[1].y,
+            vertices[2].x, vertices[2].y,
+            vertices[3].x, vertices[3].y,
+            img_width, img_height
+        )
+
     def _preprocess(self, img):
         return pil_image_to_bytes(img)
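
For reference, the generic result can be flattened back into plain text by walking paragraphs → lines → words and appending each word's separator. A sketch, assuming Word.separator defaults to an empty string when an engine (e.g. the Azure path below) does not set it:

    def ocr_result_to_text(ocr_result):
        parts = []
        for paragraph in ocr_result.paragraphs:
            for line in paragraph.lines:
                for word in line.words:
                    parts.append(word.text + (word.separator or ''))
        return ''.join(parts)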
@@ -501,104 +672,6 @@ class GoogleLens:
         return (pil_image_to_bytes(img), img.width, img.height)
 
-class GoogleLensWeb:
-    name = 'glensweb'
-    readable_name = 'Google Lens (web)'
-    key = 'k'
-    available = False
-    local = False
-    manual_language = False
-    coordinate_support = False
-    threading_support = True
-
-    def __init__(self):
-        if 'pyjson5' not in sys.modules:
-            logger.warning('pyjson5 not available, Google Lens (web) will not work!')
-        else:
-            self.requests_session = requests.Session()
-            self.available = True
-            logger.info('Google Lens (web) ready')
-
-    def __call__(self, img):
-        img, is_path = input_to_pil_image(img)
-        if not img:
-            return (False, 'Invalid image provided')
-
-        url = 'https://lens.google.com/v3/upload'
-        files = {'encoded_image': ('image.png', self._preprocess(img), 'image/png')}
-        headers = {
-            'Host': 'lens.google.com',
-            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:136.0) Gecko/20100101 Firefox/136.0',
-            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
-            'Accept-Language': 'ja-JP;q=0.6,ja;q=0.5',
-            'Accept-Encoding': 'gzip, deflate, br, zstd',
-            'Referer': 'https://www.google.com/',
-            'Origin': 'https://www.google.com',
-            'Alt-Used': 'lens.google.com',
-            'Connection': 'keep-alive',
-            'Upgrade-Insecure-Requests': '1',
-            'Sec-Fetch-Dest': 'document',
-            'Sec-Fetch-Mode': 'navigate',
-            'Sec-Fetch-Site': 'same-site',
-            'Priority': 'u=0, i',
-            'TE': 'trailers'
-        }
-        cookies = {'SOCS': 'CAESEwgDEgk0ODE3Nzk3MjQaAmVuIAEaBgiA_LyaBg'}
-        try:
-            res = self.requests_session.post(url, files=files, headers=headers, cookies=cookies, timeout=20, allow_redirects=False)
-        except requests.exceptions.Timeout:
-            return (False, 'Request timeout!')
-        except requests.exceptions.ConnectionError:
-            return (False, 'Connection error!')
-
-        if res.status_code != 303:
-            return (False, 'Unknown error!')
-
-        redirect_url = res.headers.get('Location')
-        if not redirect_url:
-            return (False, 'Error getting redirect URL!')
-
-        parsed_url = urlparse(redirect_url)
-        query_params = parse_qs(parsed_url.query)
-        if ('vsrid' not in query_params) or ('gsessionid' not in query_params):
-            return (False, 'Unknown error!')
-
-        try:
-            res = self.requests_session.get(f"https://lens.google.com/qfmetadata?vsrid={query_params['vsrid'][0]}&gsessionid={query_params['gsessionid'][0]}", timeout=20)
-        except requests.exceptions.Timeout:
-            return (False, 'Request timeout!')
-        except requests.exceptions.ConnectionError:
-            return (False, 'Connection error!')
-
-        if (len(res.text.splitlines()) != 3):
-            return (False, 'Unknown error!')
-
-        lens_object = pyjson5.loads(res.text.splitlines()[2])
-
-        res = []
-        text = lens_object[0][2][0][0]
-        for paragraph in text:
-            for line in paragraph[1]:
-                for word in line[0]:
-                    res.append(word[1] + word[2])
-
-        x = (True, res)
-        if is_path:
-            img.close()
-        return x
-
-    def _preprocess(self, img):
-        if img.width * img.height > 3000000:
-            aspect_ratio = img.width / img.height
-            new_w = int(sqrt(3000000 * aspect_ratio))
-            new_h = int(new_w / aspect_ratio)
-            img = img.resize((new_w, new_h), Image.Resampling.LANCZOS)
-        return pil_image_to_bytes(img)
 
 class Bing:
     name = 'bing'
     readable_name = 'Bing'
@@ -1131,9 +1204,9 @@ class OneOCR:
     def _convert_bbox(self, rect, img_width, img_height):
         return quad_to_bounding_box(
-            rect['x1'], rect['y1'],
-            rect['x2'], rect['y2'],
-            rect['x3'], rect['y3'],
+            rect['x1'], rect['y1'],
+            rect['x2'], rect['y2'],
+            rect['x3'], rect['y3'],
             rect['x4'], rect['y4'],
             img_width, img_height
         )
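
quad_to_bounding_box itself sits outside this diff; only its signature and the rotation_z field are visible above. For orientation, a plausible sketch of what such a quad-to-box conversion does — an assumption, not the project's exact implementation:

    import numpy as np

    def quad_to_bounding_box_sketch(x1, y1, x2, y2, x3, y3, x4, y4, img_width=None, img_height=None):
        # Corners assumed in order: top-left, top-right, bottom-right, bottom-left
        pts = np.array([[x1, y1], [x2, y2], [x3, y3], [x4, y4]], dtype=float)
        if img_width and img_height:
            pts /= [img_width, img_height]   # normalize to 0..1, as the OCRSpace path below does
        center = pts.mean(axis=0)
        top_edge = pts[1] - pts[0]
        left_edge = pts[3] - pts[0]
        angle = np.arctan2(top_edge[1], top_edge[0])  # rotation of the top edge
        return dict(center_x=center[0], center_y=center[1],
                    width=np.linalg.norm(top_edge), height=np.linalg.norm(left_edge),
                    rotation_z=angle)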
@@ -1234,7 +1307,7 @@ class AzureImageAnalysis:
     available = False
     local = False
     manual_language = False
-    coordinate_support = False
+    coordinate_support = True
     threading_support = True
 
     def __init__(self, config={}):
@@ -1261,20 +1334,55 @@ class AzureImageAnalysis:
         except:
             return (False, 'Unknown error!')
 
-        res = []
-        if read_result.read:
-            for block in read_result.read.blocks:
-                for line in block.lines:
-                    res.append(line.text)
-        else:
-            return (False, 'Unknown error!')
-
-        x = (True, res)
+        ocr_result = self._to_generic_result(read_result, img.width, img.height)
+        x = (True, ocr_result)
         if is_path:
             img.close()
         return x
 
+    def _to_generic_result(self, read_result, img_width, img_height):
+        paragraphs = []
+        if read_result.read:
+            for block in read_result.read.blocks:
+                lines = []
+                for azure_line in block.lines:
+                    l_bbox = self._convert_bbox(azure_line.bounding_polygon, img_width, img_height)
+                    words = []
+                    for azure_word in azure_line.words:
+                        w_bbox = self._convert_bbox(azure_word.bounding_polygon, img_width, img_height)
+                        word = Word(
+                            text=azure_word.text,
+                            bounding_box=w_bbox
+                        )
+                        words.append(word)
+                    line = Line(
+                        bounding_box=l_bbox,
+                        words=words,
+                        text=azure_line.text
+                    )
+                    lines.append(line)
+                p_bbox = merge_bounding_boxes(lines)
+                paragraph = Paragraph(bounding_box=p_bbox, lines=lines)
+                paragraphs.append(paragraph)
+
+        return OcrResult(
+            image_properties=ImageProperties(width=img_width, height=img_height),
+            paragraphs=paragraphs
+        )
+
+    def _convert_bbox(self, rect, img_width, img_height):
+        return quad_to_bounding_box(
+            rect[0]['x'], rect[0]['y'],
+            rect[1]['x'], rect[1]['y'],
+            rect[2]['x'], rect[2]['y'],
+            rect[3]['x'], rect[3]['y'],
+            img_width, img_height
+        )
+
     def _preprocess(self, img):
         min_pixel_size = 50
         max_pixel_size = 10000
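
As the rect[i]['x'] indexing above suggests, Azure's bounding_polygon is consumed here as four point mappings in pixel coordinates, ordered clockwise from the top-left. An illustrative sample (values made up):

    bounding_polygon = [
        {'x': 10, 'y': 20},    # top-left
        {'x': 110, 'y': 22},   # top-right
        {'x': 109, 'y': 52},   # bottom-right
        {'x': 9, 'y': 50},     # bottom-left
    ]
    # _convert_bbox forwards these eight values plus the image size to quad_to_bounding_box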
@@ -1461,7 +1569,7 @@ class OCRSpace:
     available = False
     local = False
     manual_language = True
-    coordinate_support = False
+    coordinate_support = True
     threading_support = True
 
     def __init__(self, config={}, language='ja'):
@@ -1498,14 +1606,16 @@ class OCRSpace:
         if not img:
             return (False, 'Invalid image provided')
 
-        img_bytes, img_extension, _ = self._preprocess(img)
+        og_img_width, og_img_height = img.size
+        img_bytes, img_extension, img_size = self._preprocess(img)
         if not img_bytes:
             return (False, 'Image is too big!')
 
         data = {
             'apikey': self.api_key,
             'language': self.language,
-            'OCREngine': str(self.engine_version)
+            'OCREngine': str(self.engine_version),
+            'isOverlayRequired': 'True'
         }
         files = {'file': ('image.' + img_extension, img_bytes, 'image/' + img_extension)}
@@ -1526,12 +1636,57 @@ class OCRSpace:
         if res['IsErroredOnProcessing']:
             return (False, res['ErrorMessage'])
 
-        res = res['ParsedResults'][0]['ParsedText']
-        x = (True, res)
+        img_width, img_height = img_size
+        ocr_result = self._to_generic_result(res, img_width, img_height, og_img_width, og_img_height)
+        x = (True, ocr_result)
         if is_path:
             img.close()
         return x
 
+    def _to_generic_result(self, api_result, img_width, img_height, og_img_width, og_img_height):
+        parsed_result = api_result['ParsedResults'][0]
+        text_overlay = parsed_result.get('TextOverlay', {})
+
+        image_props = ImageProperties(width=og_img_width, height=og_img_height)
+        ocr_result = OcrResult(image_properties=image_props)
+
+        lines_data = text_overlay.get('Lines', [])
+        lines = []
+        for line_data in lines_data:
+            words = []
+            for word_data in line_data.get('Words', []):
+                w_bbox = self._convert_bbox(word_data, img_width, img_height)
+                words.append(Word(text=word_data['WordText'], bounding_box=w_bbox))
+            l_bbox = merge_bounding_boxes(words)
+            lines.append(Line(bounding_box=l_bbox, words=words))
+
+        if lines:
+            p_bbox = merge_bounding_boxes(lines)
+            paragraph = Paragraph(bounding_box=p_bbox, lines=lines)
+            ocr_result.paragraphs = [paragraph]
+        else:
+            ocr_result.paragraphs = []
+
+        return ocr_result
+
+    def _convert_bbox(self, word_data, img_width, img_height):
+        left = word_data['Left'] / img_width
+        top = word_data['Top'] / img_height
+        width = word_data['Width'] / img_width
+        height = word_data['Height'] / img_height
+        center_x = left + width / 2
+        center_y = top + height / 2
+        return BoundingBox(
+            center_x=center_x,
+            center_y=center_y,
+            width=width,
+            height=height
+        )
+
     def _preprocess(self, img):
         return limit_image_size(img, self.max_byte_size)
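
For orientation, a trimmed sketch of the OCR.space response shape parsed above (values illustrative). Left/Top/Width/Height are pixels on the image actually uploaded, which _preprocess may have downscaled — hence the division by img_size rather than by the original dimensions:

    sample_response = {
        'IsErroredOnProcessing': False,
        'ParsedResults': [{
            'ParsedText': 'Hello world',
            'TextOverlay': {'Lines': [{
                'Words': [
                    {'WordText': 'Hello', 'Left': 10, 'Top': 8, 'Width': 62, 'Height': 20},
                    {'WordText': 'world', 'Left': 80, 'Top': 8, 'Width': 64, 'Height': 20},
                ],
            }]},
        }],
    }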