add new json (text+coordinates) output format for bing, glens and oneocr as a proof-of-concept
owocr/ocr.py | 271
@@ -5,10 +5,12 @@ from pathlib import Path
 import sys
 import platform
 import logging
-from math import sqrt
+from math import sqrt, sin, cos, atan2
 import json
 import base64
 from urllib.parse import urlparse, parse_qs
+from dataclasses import dataclass, field, asdict
+from typing import List, Optional
 
 import jaconv
 import numpy as np
@@ -83,6 +85,50 @@ try:
 except:
     optimized_png_encode = False
 
+@dataclass
+class BoundingBox:
+    """
+    Represents the normalized coordinates of a detected element.
+    All values are floats between 0.0 and 1.0.
+    """
+    center_x: float
+    center_y: float
+    width: float
+    height: float
+    rotation_z: Optional[float] = None  # Optional rotation in radians
+
+@dataclass
+class Word:
+    """Represents a single recognized word and its properties."""
+    text: str
+    bounding_box: BoundingBox
+    separator: Optional[str] = None  # The character(s) that follow the word, e.g., a space
+
+@dataclass
+class Line:
+    """Represents a single line of text, composed of words."""
+    bounding_box: BoundingBox
+    words: List[Word] = field(default_factory=list)
+
+@dataclass
+class Paragraph:
+    """Represents a block of text, composed of lines."""
+    bounding_box: BoundingBox
+    lines: List[Line] = field(default_factory=list)
+    writing_direction: Optional[str] = None  # Optional: e.g., "LEFT_TO_RIGHT"
+
+@dataclass
+class ImageProperties:
+    """Stores the original dimensions of the processed image."""
+    width: int
+    height: int
+
+@dataclass
+class OcrResult:
+    """The root object for a complete OCR analysis of an image."""
+    image_properties: ImageProperties
+    paragraphs: List[Paragraph] = field(default_factory=list)
+
+
 def empty_post_process(text):
     return text
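
Note: the asdict import added above is what turns these dataclasses into the new JSON (text + coordinates) output. A minimal sketch of the serialization, using hand-made sample values rather than a real OCR response (the dataclasses are the ones defined in the hunk above):

import json
from dataclasses import asdict

# Hypothetical result, built by hand only to show the JSON shape.
sample = OcrResult(
    image_properties=ImageProperties(width=1280, height=720),
    paragraphs=[Paragraph(
        bounding_box=BoundingBox(center_x=0.5, center_y=0.1, width=0.4, height=0.05),
        lines=[Line(
            bounding_box=BoundingBox(center_x=0.5, center_y=0.1, width=0.4, height=0.05),
            words=[Word(text='こんにちは', bounding_box=BoundingBox(0.5, 0.1, 0.4, 0.05))],
        )],
    )],
)

# asdict() recurses through the nested dataclasses, so the whole tree
# becomes plain dicts/lists that json.dumps can serialize directly.
print(json.dumps(asdict(sample), ensure_ascii=False, indent=2))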
@@ -243,6 +289,62 @@ class GoogleLens:
         self.available = True
         logger.info('Google Lens ready')
 
+    def _to_generic_result(self, response, img_width, img_height):
+        paragraphs = []
+        if 'objects_response' in response and 'text' in response['objects_response']:
+            text_data = response['objects_response']['text']
+            if 'text_layout' in text_data:
+                for p in text_data['text_layout'].get('paragraphs', []):
+                    lines = []
+                    for l in p.get('lines', []):
+                        words = []
+                        for w in l.get('words', []):
+                            w_bbox = w.get('geometry', {}).get('bounding_box', {})
+                            word = Word(
+                                text=w.get('plain_text', ''),
+                                separator=w.get('text_separator'),
+                                bounding_box=BoundingBox(
+                                    center_x=w_bbox.get('center_x'),
+                                    center_y=w_bbox.get('center_y'),
+                                    width=w_bbox.get('width'),
+                                    height=w_bbox.get('height'),
+                                    rotation_z=w_bbox.get('rotation_z')
+                                )
+                            )
+                            words.append(word)
+
+                        l_bbox = l.get('geometry', {}).get('bounding_box', {})
+                        line = Line(
+                            bounding_box=BoundingBox(
+                                center_x=l_bbox.get('center_x'),
+                                center_y=l_bbox.get('center_y'),
+                                width=l_bbox.get('width'),
+                                height=l_bbox.get('height'),
+                                rotation_z=l_bbox.get('rotation_z')
+                            ),
+                            words=words
+                        )
+                        lines.append(line)
+
+                    p_bbox = p.get('geometry', {}).get('bounding_box', {})
+                    paragraph = Paragraph(
+                        bounding_box=BoundingBox(
+                            center_x=p_bbox.get('center_x'),
+                            center_y=p_bbox.get('center_y'),
+                            width=p_bbox.get('width'),
+                            height=p_bbox.get('height'),
+                            rotation_z=p_bbox.get('rotation_z')
+                        ),
+                        lines=lines,
+                        writing_direction=p.get('writing_direction')
+                    )
+                    paragraphs.append(paragraph)
+
+        return OcrResult(
+            image_properties=ImageProperties(width=img_width, height=img_height),
+            paragraphs=paragraphs
+        )
+
     def __call__(self, img):
         img, is_path = input_to_pil_image(img)
         if not img:
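
Note: Lens reports normalized center/size boxes with an optional rotation_z in radians, which is presumably why sin and cos were added to the math import. A sketch of a helper (not part of this commit) that maps such a box back to pixel-space corners:

from math import sin, cos

def bbox_to_pixel_corners(bbox, img_width, img_height):
    # Center and half-extents in pixels.
    cx = bbox.center_x * img_width
    cy = bbox.center_y * img_height
    hw = bbox.width * img_width / 2
    hh = bbox.height * img_height / 2
    angle = bbox.rotation_z or 0.0

    # Rotate each corner offset around the center by rotation_z.
    return [
        (cx + dx * cos(angle) - dy * sin(angle),
         cy + dx * sin(angle) + dy * cos(angle))
        for dx, dy in ((-hw, -hh), (hw, -hh), (hw, hh), (-hw, hh))
    ]  # top-left, top-right, bottom-right, bottom-left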
@@ -272,7 +374,7 @@ class GoogleLens:
         image_data = self._preprocess(img)
         request.objects_request.image_data.payload.image_bytes = image_data[0]
         request.objects_request.image_data.image_metadata.width = image_data[1]
-        request.objects_request.image_data.image_metadata.height = image_data[2]
+        request.objects_request.image_data.image_metadata.height = image_data[2]
 
         payload = request.SerializeToString()
 
@@ -302,17 +404,8 @@ class GoogleLens:
         response_proto = LensOverlayServerResponse().FromString(res.content)
         response_dict = response_proto.to_dict(betterproto.Casing.SNAKE)
 
-        res = ''
-        text = response_dict['objects_response']['text']
-        if 'text_layout' in text:
-            paragraphs = text['text_layout']['paragraphs']
-            for paragraph in paragraphs:
-                for line in paragraph['lines']:
-                    for word in line['words']:
-                        res += word['plain_text'] + word['text_separator']
-                res += '\n'
-
-        x = (True, res)
+        ocr_result = self._to_generic_result(response_dict, img.width, img.height)
+        x = (True, ocr_result)
 
         if is_path:
             img.close()
@@ -433,6 +526,69 @@ class Bing:
         self.available = True
         logger.info('Bing ready')
 
+    def _quad_to_center_bbox(self, quad):
+        center_x = (quad['topLeft']['x'] + quad['topRight']['x'] + quad['bottomRight']['x'] + quad['bottomLeft']['x']) / 4
+        center_y = (quad['topLeft']['y'] + quad['topRight']['y'] + quad['bottomRight']['y'] + quad['bottomLeft']['y']) / 4
+
+        width1 = sqrt((quad['topRight']['x'] - quad['topLeft']['x'])**2 + (quad['topRight']['y'] - quad['topLeft']['y'])**2)
+        width2 = sqrt((quad['bottomRight']['x'] - quad['bottomLeft']['x'])**2 + (quad['bottomRight']['y'] - quad['bottomLeft']['y'])**2)
+        avg_width = (width1 + width2) / 2
+
+        height1 = sqrt((quad['bottomLeft']['x'] - quad['topLeft']['x'])**2 + (quad['bottomLeft']['y'] - quad['topLeft']['y'])**2)
+        height2 = sqrt((quad['bottomRight']['x'] - quad['topRight']['x'])**2 + (quad['bottomRight']['y'] - quad['topRight']['y'])**2)
+        avg_height = (height1 + height2) / 2
+
+        return BoundingBox(center_x=center_x, center_y=center_y, width=avg_width, height=avg_height)
+
+    def _to_generic_result(self, response, img_width, img_height):
+        paragraphs = []
+        text_tag = None
+        for tag in response.get('tags', []):
+            if tag.get('displayName') == '##TextRecognition':
+                text_tag = tag
+                break
+
+        if text_tag:
+            text_action = None
+            for action in text_tag.get('actions', []):
+                if action.get('_type') == 'ImageKnowledge/TextRecognitionAction':
+                    text_action = action
+                    break
+
+            if text_action:
+                for p in text_action.get('data', {}).get('regions', []):
+                    lines = []
+                    for l in p.get('lines', []):
+                        words = []
+                        for w in l.get('words', []):
+                            word = Word(
+                                text=w.get('text', ''),
+                                bounding_box=self._quad_to_center_bbox(w['boundingBox']),
+                                separator=" "
+                            )
+                            words.append(word)
+
+                        line = Line(
+                            bounding_box=self._quad_to_center_bbox(l['boundingBox']),
+                            words=words
+                        )
+                        lines.append(line)
+
+                    # Bing doesn't provide paragraph-level separators, so we add a newline
+                    if lines and lines[-1].words:
+                        lines[-1].words[-1].separator = '\n'
+
+                    paragraph = Paragraph(
+                        bounding_box=self._quad_to_center_bbox(p['boundingBox']),
+                        lines=lines
+                    )
+                    paragraphs.append(paragraph)
+
+        return OcrResult(
+            image_properties=ImageProperties(width=img_width, height=img_height),
+            paragraphs=paragraphs
+        )
+
     def __call__(self, img):
         img, is_path = input_to_pil_image(img)
         if not img:
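
Note: _quad_to_center_bbox averages the quad's opposite edges into an axis-aligned box and leaves rotation_z unset. If rotation mattered for Bing too, the atan2 now imported from math could estimate it from the top edge; a hypothetical extension, not in this commit:

from math import atan2

def quad_rotation(quad):
    # Angle of the top edge relative to the horizontal, in radians.
    dx = quad['topRight']['x'] - quad['topLeft']['x']
    dy = quad['topRight']['y'] - quad['topLeft']['y']
    return atan2(dy, dx)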
@@ -510,26 +666,9 @@ class Bing:
                 return (False, 'Unknown error!')
 
         data = res.json()
 
-        res = ''
-        text_tag = None
-        for tag in data['tags']:
-            if tag.get('displayName') == '##TextRecognition':
-                text_tag = tag
-                break
-        if text_tag:
-            text_action = None
-            for action in text_tag['actions']:
-                if action.get('_type') == 'ImageKnowledge/TextRecognitionAction':
-                    text_action = action
-                    break
-            if text_action:
-                regions = text_action['data'].get('regions', [])
-                for region in regions:
-                    for line in region.get('lines', []):
-                        res += line['text'] + '\n'
-
-        x = (True, res)
+        ocr_result = self._to_generic_result(data, img.width, img.height)
+        x = (True, ocr_result)
 
         if is_path:
             img.close()
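
Note: the deleted plain-text loop above is still recoverable from the structured result, since every Word carries its trailing separator (with '\n' injected at Bing paragraph ends). A sketch of flattening an OcrResult back into the old string output:

def ocr_result_to_text(result):
    # Join each word with whatever separator the engine reported.
    parts = []
    for paragraph in result.paragraphs:
        for line in paragraph.lines:
            for word in line.words:
                parts.append(word.text)
                if word.separator:
                    parts.append(word.separator)
    return ''.join(parts)

This is only approximate for Bing: the old code added a newline after every line, while the new format carries a newline only at the end of each paragraph.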
@@ -763,6 +902,67 @@ class OneOCR:
             except:
                 logger.warning('Error reading URL from config, OneOCR will not work!')
 
+    def _pixel_quad_to_center_bbox(self, rect, img_width, img_height):
+        x_coords = [rect['x1'], rect['x2'], rect['x3'], rect['x4']]
+        y_coords = [rect['y1'], rect['y2'], rect['y3'], rect['y4']]
+
+        center_x_px = sum(x_coords) / 4
+        center_y_px = sum(y_coords) / 4
+
+        width_px = (abs(rect['x2'] - rect['x1']) + abs(rect['x3'] - rect['x4'])) / 2
+        height_px = (abs(rect['y4'] - rect['y1']) + abs(rect['y3'] - rect['y2'])) / 2
+
+        return BoundingBox(
+            center_x=center_x_px / img_width,
+            center_y=center_y_px / img_height,
+            width=width_px / img_width,
+            height=height_px / img_height
+        )
+
+    def _to_generic_result(self, response, img_width, img_height):
+        lines = []
+        for l in response.get('lines', []):
+            words = []
+            for i, w in enumerate(l.get('words', [])):
+                separator = " " if i < len(l.get('words', [])) - 1 else None
+                word = Word(
+                    text=w.get('text', ''),
+                    separator=separator,
+                    bounding_box=self._pixel_quad_to_center_bbox(w['bounding_rect'], img_width, img_height)
+                )
+                words.append(word)
+
+            line = Line(
+                bounding_box=self._pixel_quad_to_center_bbox(l['bounding_rect'], img_width, img_height),
+                words=words
+            )
+            lines.append(line)
+
+        # Create a single paragraph to hold all lines
+        if lines:
+            # Approximate paragraph bbox by combining all line bboxes
+            all_line_bboxes = [l.bounding_box for l in lines]
+            min_x = min(b.center_x - b.width / 2 for b in all_line_bboxes)
+            max_x = max(b.center_x + b.width / 2 for b in all_line_bboxes)
+            min_y = min(b.center_y - b.height / 2 for b in all_line_bboxes)
+            max_y = max(b.center_y + b.height / 2 for b in all_line_bboxes)
+
+            p_bbox = BoundingBox(
+                center_x=(min_x + max_x) / 2,
+                center_y=(min_y + max_y) / 2,
+                width=max_x - min_x,
+                height=max_y - min_y
+            )
+            paragraph = Paragraph(bounding_box=p_bbox, lines=lines)
+            paragraphs = [paragraph]
+        else:
+            paragraphs = []
+
+        return OcrResult(
+            image_properties=ImageProperties(width=img_width, height=img_height),
+            paragraphs=paragraphs
+        )
+
     def __call__(self, img):
         img, is_path = input_to_pil_image(img)
         if not img:
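
Note: OneOCR's bounding_rect gives pixel corner coordinates (x1..y4, apparently clockwise from the top-left, judging by the width/height math above), which _pixel_quad_to_center_bbox normalizes against the image size. A quick sanity check with a hand-made rect (hypothetical values; the helper never touches self, so None suffices here):

# Word spanning pixels (100,50)-(300,90) in a 1000x500 image.
rect = {'x1': 100, 'y1': 50, 'x2': 300, 'y2': 50,
        'x3': 300, 'y3': 90, 'x4': 100, 'y4': 90}

bbox = OneOCR._pixel_quad_to_center_bbox(None, rect, 1000, 500)
print(bbox)
# BoundingBox(center_x=0.2, center_y=0.14, width=0.2, height=0.08, rotation_z=None)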
@@ -770,7 +970,7 @@ class OneOCR:
 
         if sys.platform == 'win32':
             try:
-                res = self.model.recognize_pil(img)['text']
+                raw_res = self.model.recognize_pil(img)
             except RuntimeError as e:
                 return (False, e)
         else:
@@ -784,9 +984,10 @@ class OneOCR:
             if res.status_code != 200:
                 return (False, 'Unknown error!')
 
-            res = res.json()['text']
+            raw_res = res.json()
 
-        x = (True, res)
+        ocr_response = self._to_generic_result(raw_res, img.width, img.height)
+        x = (True, ocr_response)
 
         if is_path:
             img.close()