Add coordinate support for the remaining engines; remove Lens Web, as it duplicates the regular Lens engine's results (only slower) and is potentially confusing
@@ -38,7 +38,6 @@ Additionally:

 ## Cloud providers
 - Google Lens: Google Vision in disguise (no need for API keys!), install with `pip install owocr[lens]` ("l" key)
-- Google Lens (web): alternative version of Lens (Google webpage version). Results should be the same but it's much slower. Install with `pip install owocr[lensweb]` ("k" key)
 - Bing: Azure in disguise (no need for API keys!) ("b" key)
 - Google Vision: install with `pip install owocr[gvision]`, you also need a service account .json file named google_vision.json in `user directory/.config/` ("g" key)
 - Azure Image Analysis: install with `pip install owocr[azure]`, you also need to specify an api key and an endpoint in the config file ("v" key)

@@ -25,7 +25,7 @@ parser.add_argument('-rs', '--read_from_secondary', type=str, default=argparse.S
 parser.add_argument('-w', '--write_to', type=str, default=argparse.SUPPRESS,
                     help='Where to save recognized texts to. Can be either "clipboard", "websocket", or a path to a text file.')
 parser.add_argument('-e', '--engine', type=str, default=argparse.SUPPRESS,
-                    help='OCR engine to use. Available: "mangaocr", "glens", "glensweb", "bing", "gvision", "avision", "alivetext", "azure", "winrtocr", "oneocr", "easyocr", "rapidocr", "ocrspace".')
+                    help='OCR engine to use. Available: "mangaocr", "glens", "bing", "gvision", "avision", "alivetext", "azure", "winrtocr", "oneocr", "easyocr", "rapidocr", "ocrspace".')
 parser.add_argument('-es', '--engine_secondary', type=str, default=argparse.SUPPRESS,
                     help='OCR engine to use for two-pass processing.')
 parser.add_argument('-p', '--pause_at_startup', type=str2bool, nargs='?', const=True, default=argparse.SUPPRESS,

@@ -62,6 +62,10 @@ parser.add_argument('-l', '--language', type=str, default=argparse.SUPPRESS,
                     help='Two letter language code for filtering screencapture OCR results. Ex. "ja" for Japanese, "zh" for Chinese, "ko" for Korean, "ar" for Arabic, "ru" for Russian, "el" for Greek, "he" for Hebrew, "th" for Thai. Any other value will use Latin Extended (for most European languages and English).')
+parser.add_argument('-of', '--output_format', type=str, default=argparse.SUPPRESS,
+                    help='The output format for OCR results. Can be "text" (default) or "json" (to include coordinates).')
 parser.add_argument('-wp', '--websocket_port', type=int, default=argparse.SUPPRESS,
                     help='Websocket port to use if reading or writing to websocket.')
 parser.add_argument('-ds', '--delay_seconds', type=float, default=argparse.SUPPRESS,
                     help='Delay (in seconds) between checks when reading from clipboard (on macOS/Linux) or a directory.')
 parser.add_argument('-v', '--verbosity', type=int, default=argparse.SUPPRESS,
                     help='Terminal window verbosity. Can be -2 (all recognized text is showed whole, default), -1 (only timestamps are shown), 0 (nothing is shown but errors), or larger than 0 to cut displayed text to that amount of characters.')
 parser.add_argument('--uwu', type=str2bool, nargs='?', const=True, default=argparse.SUPPRESS, help=argparse.SUPPRESS)
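
Reviewer note: the new -of/--output_format flag is the user-visible switch for the coordinate support this commit adds. A quick sketch of exercising it end to end; the `owocr` console entry point, the one-JSON-object-per-line framing, and the key names are my assumptions (derived from the dataclass field names in the diff below), not verified against this revision:

    import json
    import subprocess

    # Illustrative invocation: coordinate-capable engine, JSON output,
    # results written to a text file (flags as documented above).
    # owocr keeps running and watching its input until interrupted.
    proc = subprocess.Popen(['owocr', '-e', 'glens', '-of', 'json', '-w', '/tmp/results.txt'])

    # ...later, parse what was written (framing/keys assumed, see above):
    with open('/tmp/results.txt', encoding='utf-8') as f:
        for raw in f:
            result = json.loads(raw)
            for paragraph in result.get('paragraphs', []):
                print(paragraph['bounding_box'], len(paragraph['lines']))
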
owocr/ocr.py

@@ -67,11 +67,6 @@ try:
 except ImportError:
     pass

-try:
-    import pyjson5
-except ImportError:
-    pass
-
 try:
     import betterproto
     from .lens_betterproto import *

@@ -224,53 +219,146 @@ def quad_to_bounding_box(x1, y1, x2, y2, x3, y3, x4, y4, img_width=None, img_hei
         rotation_z=angle
     )

-def merge_bounding_boxes(ocr_element_list):
-    all_corners = []
-    for element in ocr_element_list:
-        bbox = element.bounding_box
-        angle = bbox.rotation_z
-        hw = bbox.width / 2
-        hh = bbox.height / 2
-
-        if not angle:
-            corners = [
-                (bbox.center_x - hw, bbox.center_y - hh), # Top-left
-                (bbox.center_x + hw, bbox.center_y - hh), # Top-right
-                (bbox.center_x + hw, bbox.center_y + hh), # Bottom-right
-                (bbox.center_x - hw, bbox.center_y + hh) # Bottom-left
-            ]
-            all_corners.extend(corners)
-        else:
-            local_corners = [
-                (-hw, -hh), # Top-left
-                ( hw, -hh), # Top-right
-                ( hw, hh), # Bottom-right
-                (-hw, hh) # Bottom-left
-            ]
-            # Rotate and translate corners
-            cos_angle = cos(angle)
-            sin_angle = sin(angle)
-
-            for x_local, y_local in local_corners:
-                x_rotated = x_local * cos_angle - y_local * sin_angle
-                y_rotated = x_local * sin_angle + y_local * cos_angle
-                x_global = bbox.center_x + x_rotated
-                y_global = bbox.center_y + y_rotated
-                all_corners.append((x_global, y_global))
-
-    xs, ys = zip(*all_corners)
-    min_x, max_x = min(xs), max(xs)
-    min_y, max_y = min(ys), max(ys)
-
-    return BoundingBox(
-        center_x=(min_x + max_x) / 2,
-        center_y=(min_y + max_y) / 2,
-        width=max_x - min_x,
-        height=max_y - min_y
-    )
+def merge_bounding_boxes(ocr_element_list, rotated=False):
+    def _get_all_corners(ocr_element_list):
+        corners = []
+        for element in ocr_element_list:
+            bbox = element.bounding_box
+            angle = bbox.rotation_z or 0.0
+            hw, hh = bbox.width / 2.0, bbox.height / 2.0
+            cx, cy = bbox.center_x, bbox.center_y
+
+            # Local corner offsets
+            local = np.array([[-hw, -hh], [hw, -hh], [hw, hh], [-hw, hh]])
+
+            if abs(angle) < 1e-12:
+                corners.append(local + [cx, cy])
+            else:
+                # Rotation matrix
+                cos_a, sin_a = np.cos(angle), np.sin(angle)
+                rot = np.array([[cos_a, -sin_a], [sin_a, cos_a]])
+                corners.append(local @ rot.T + [cx, cy])
+
+        return np.vstack(corners) if corners else np.empty((0, 2))
+
+    def _convex_hull(points):
+        if len(points) <= 3:
+            return points
+
+        pts = np.unique(points, axis=0)
+        pts = pts[np.lexsort((pts[:, 1], pts[:, 0]))]
+
+        if len(pts) <= 1:
+            return pts
+
+        def cross(o, a, b):
+            return (a[0] - o[0]) * (b[1] - o[1]) - (a[1] - o[1]) * (b[0] - o[0])
+
+        lower, upper = [], []
+        for p in pts:
+            while len(lower) >= 2 and cross(lower[-2], lower[-1], p) <= 0:
+                lower.pop()
+            lower.append(p)
+        for p in pts[::-1]:
+            while len(upper) >= 2 and cross(upper[-2], upper[-1], p) <= 0:
+                upper.pop()
+            upper.append(p)
+
+        return np.array(lower[:-1] + upper[:-1])
+
+    all_corners = _get_all_corners(ocr_element_list)
+
+    # Axis-aligned case
+    if not rotated:
+        min_pt, max_pt = all_corners.min(axis=0), all_corners.max(axis=0)
+        center = (min_pt + max_pt) / 2
+        size = max_pt - min_pt
+        return BoundingBox(
+            center_x=center[0],
+            center_y=center[1],
+            width=size[0],
+            height=size[1]
+        )
+
+    hull = _convex_hull(all_corners)
+    m = len(hull)
+
+    # Trivial cases
+    if m == 1:
+        return BoundingBox(
+            center_x=hull[0, 0],
+            center_y=hull[0, 1],
+            width=0.0,
+            height=0.0,
+            rotation_z=0.0
+        )
+
+    if m == 2:
+        diff = hull[1] - hull[0]
+        length = np.linalg.norm(diff)
+        center = hull.mean(axis=0)
+        return BoundingBox(
+            center_x=center[0],
+            center_y=center[1],
+            width=length,
+            height=0.0,
+            rotation_z=np.arctan2(diff[1], diff[0])
+        )
+
+    # Test each edge orientation
+    edges = np.roll(hull, -1, axis=0) - hull
+    edge_lengths = np.linalg.norm(edges, axis=1)
+    valid = edge_lengths > 1e-12
+
+    if not valid.any():
+        # Fallback to axis-aligned
+        min_pt, max_pt = all_corners.min(axis=0), all_corners.max(axis=0)
+        center = (min_pt + max_pt) / 2
+        size = max_pt - min_pt
+        return BoundingBox(
+            center_x=center[0],
+            center_y=center[1],
+            width=size[0],
+            height=size[1]
+        )
+
+    angles = np.arctan2(edges[valid, 1], edges[valid, 0])
+    best_area, best_idx = np.inf, -1
+
+    for idx, angle in enumerate(angles):
+        # Rotation matrix (rotate by -angle)
+        cos_a, sin_a = np.cos(angle), np.sin(angle)
+        rot = np.array([[cos_a, sin_a], [-sin_a, cos_a]])
+        rotated = hull @ rot.T
+
+        min_pt, max_pt = rotated.min(axis=0), rotated.max(axis=0)
+        area = np.prod(max_pt - min_pt)
+
+        if area < best_area:
+            best_area, best_idx = area, idx
+            best_bounds = (min_pt, max_pt, angle)
+
+    min_pt, max_pt, angle = best_bounds
+    width, height = max_pt - min_pt
+    center_rot = (min_pt + max_pt) / 2
+
+    # Rotate center back to global coordinates
+    cos_a, sin_a = np.cos(angle), np.sin(angle)
+    rot_back = np.array([[cos_a, -sin_a], [sin_a, cos_a]])
+    center = rot_back @ center_rot
+
+    # Normalize angle to [-π, π]
+    angle = np.mod(angle + np.pi, 2 * np.pi) - np.pi
+
+    return BoundingBox(
+        center_x=center[0],
+        center_y=center[1],
+        width=width,
+        height=height,
+        rotation_z=angle
+    )


 class MangaOcr:
     name = 'mangaocr'
     readable_name = 'Manga OCR'
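
Reviewer note on the rotated branch: this is the classic minimum-area enclosing rectangle search. The optimal rectangle must be flush with some edge of the convex hull, so trying every hull-edge angle, rotating the hull into that frame, and keeping the smallest axis-aligned bounds is exhaustive. A standalone sketch of the same idea (illustration only, not code from this commit):

    import numpy as np

    def min_area_rect(hull):
        """Smallest enclosing rectangle of an ordered convex polygon: (center, w, h, angle)."""
        edges = np.roll(hull, -1, axis=0) - hull
        angles = np.arctan2(edges[:, 1], edges[:, 0])
        best = None
        for angle in angles:
            c, s = np.cos(angle), np.sin(angle)
            rot = np.array([[c, s], [-s, c]])     # rotates points by -angle
            r = hull @ rot.T
            lo, hi = r.min(axis=0), r.max(axis=0)
            area = np.prod(hi - lo)
            if best is None or area < best[0]:
                center = rot.T @ ((lo + hi) / 2)  # rotate center back by +angle
                best = (area, center, hi - lo, angle)
        _, center, size, angle = best
        return center, size[0], size[1], angle

    square = np.array([[0, 0], [2, 0], [2, 1], [0, 1]], dtype=float)
    print(min_area_rect(square))  # center ~[1.0, 0.5], w=2.0, h=1.0, angle=0.0
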
@@ -312,7 +400,7 @@ class GoogleVision:
     available = False
     local = False
     manual_language = False
-    coordinate_support = False
+    coordinate_support = True
     threading_support = True

     def __init__(self):

@@ -336,20 +424,103 @@ class GoogleVision:

         image_bytes = self._preprocess(img)
         image = vision.Image(content=image_bytes)

         try:
-            response = self.client.text_detection(image=image)
+            response = self.client.document_text_detection(image=image)
         except ServiceUnavailable:
             return (False, 'Connection error!')
-        except:
+        except Exception as e:
             return (False, 'Unknown error!')
-        texts = response.text_annotations
-        res = texts[0].description if len(texts) > 0 else ''
-        x = (True, res)
+
+        ocr_result = self._to_generic_result(response.full_text_annotation, img.width, img.height)
+        x = (True, ocr_result)

         if is_path:
             img.close()
         return x

+    def _to_generic_result(self, full_text_annotation, img_width, img_height):
+        paragraphs = []
+
+        if full_text_annotation:
+            for page in full_text_annotation.pages:
+                if page.width == img_width and page.height == img_height:
+                    for block in page.blocks:
+                        for google_paragraph in block.paragraphs:
+                            p_bbox = self._convert_bbox(google_paragraph.bounding_box, img_width, img_height)
+                            lines = self._create_lines_from_google_paragraph(google_paragraph, img_width, img_height)
+                            paragraph = Paragraph(bounding_box=p_bbox, lines=lines)
+                            paragraphs.append(paragraph)
+
+        return OcrResult(
+            image_properties=ImageProperties(width=img_width, height=img_height),
+            paragraphs=paragraphs
+        )
+
+    def _create_lines_from_google_paragraph(self, google_paragraph, img_width, img_height):
+        lines = []
+        words = []
+        for google_word in google_paragraph.words:
+            word = self._create_word_from_google_word(google_word, img_width, img_height)
+            words.append(word)
+            if word.separator == '\n':
+                l_bbox = merge_bounding_boxes(words, True)
+                line = Line(bounding_box=l_bbox, words=words)
+                lines.append(line)
+                words = []
+
+        return lines
+
+    def _create_word_from_google_word(self, google_word, img_width, img_height):
+        w_bbox = self._convert_bbox(google_word.bounding_box, img_width, img_height)
+
+        w_separator = ''
+        w_text_parts = []
+        for i, symbol in enumerate(google_word.symbols):
+            separator = None
+            if hasattr(symbol, 'property') and hasattr(symbol.property, 'detected_break'):
+                detected_break = symbol.property.detected_break
+                detected_separator = self._break_type_to_char(detected_break.type_)
+                if i == len(google_word.symbols) - 1:
+                    w_separator = detected_separator
+                else:
+                    separator = detected_separator
+            symbol_text = symbol.text
+            w_text_parts.append(symbol_text)
+            if separator:
+                w_text_parts.append(separator)
+        word_text = ''.join(w_text_parts)
+
+        return Word(
+            text=word_text,
+            bounding_box=w_bbox,
+            separator=w_separator
+        )
+
+    def _break_type_to_char(self, break_type):
+        if break_type == vision.TextAnnotation.DetectedBreak.BreakType.SPACE:
+            return ' '
+        elif break_type == vision.TextAnnotation.DetectedBreak.BreakType.SURE_SPACE:
+            return ' '
+        elif break_type == vision.TextAnnotation.DetectedBreak.BreakType.EOL_SURE_SPACE:
+            return '\n'
+        elif break_type == vision.TextAnnotation.DetectedBreak.BreakType.HYPHEN:
+            return '-'
+        elif break_type == vision.TextAnnotation.DetectedBreak.BreakType.LINE_BREAK:
+            return '\n'
+        return ''
+
+    def _convert_bbox(self, quad, img_width, img_height):
+        vertices = quad.vertices
+
+        return quad_to_bounding_box(
+            vertices[0].x, vertices[0].y,
+            vertices[1].x, vertices[1].y,
+            vertices[2].x, vertices[2].y,
+            vertices[3].x, vertices[3].y,
+            img_width, img_height
+        )
+
     def _preprocess(self, img):
         return pil_image_to_bytes(img)
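
Reviewer note: the separator plumbing above is what makes line splitting work — the detected break after a word's last symbol is stored on Word.separator, and _create_lines_from_google_paragraph closes a Line whenever that separator is '\n'. A toy, standalone illustration of the reassembly (simplified stand-ins, not the owocr dataclasses):

    from dataclasses import dataclass

    @dataclass
    class Word:
        text: str
        separator: str = ''

    def rebuild_lines(words):
        lines, current = [], []
        for w in words:
            current.append(w)
            if w.separator == '\n':  # line-ending break closes the current line
                text = ''.join(x.text + (x.separator if x.separator != '\n' else '')
                               for x in current)
                lines.append(text)
                current = []
        return lines

    words = [Word('Hello', ' '), Word('world', '\n'), Word('second', '\n')]
    print(rebuild_lines(words))  # ['Hello world', 'second']
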
@@ -501,104 +672,6 @@ class GoogleLens:

         return (pil_image_to_bytes(img), img.width, img.height)

-class GoogleLensWeb:
-    name = 'glensweb'
-    readable_name = 'Google Lens (web)'
-    key = 'k'
-    available = False
-    local = False
-    manual_language = False
-    coordinate_support = False
-    threading_support = True
-
-    def __init__(self):
-        if 'pyjson5' not in sys.modules:
-            logger.warning('pyjson5 not available, Google Lens (web) will not work!')
-        else:
-            self.requests_session = requests.Session()
-            self.available = True
-            logger.info('Google Lens (web) ready')
-
-    def __call__(self, img):
-        img, is_path = input_to_pil_image(img)
-        if not img:
-            return (False, 'Invalid image provided')
-
-        url = 'https://lens.google.com/v3/upload'
-        files = {'encoded_image': ('image.png', self._preprocess(img), 'image/png')}
-        headers = {
-            'Host': 'lens.google.com',
-            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:136.0) Gecko/20100101 Firefox/136.0',
-            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
-            'Accept-Language': 'ja-JP;q=0.6,ja;q=0.5',
-            'Accept-Encoding': 'gzip, deflate, br, zstd',
-            'Referer': 'https://www.google.com/',
-            'Origin': 'https://www.google.com',
-            'Alt-Used': 'lens.google.com',
-            'Connection': 'keep-alive',
-            'Upgrade-Insecure-Requests': '1',
-            'Sec-Fetch-Dest': 'document',
-            'Sec-Fetch-Mode': 'navigate',
-            'Sec-Fetch-Site': 'same-site',
-            'Priority': 'u=0, i',
-            'TE': 'trailers'
-        }
-        cookies = {'SOCS': 'CAESEwgDEgk0ODE3Nzk3MjQaAmVuIAEaBgiA_LyaBg'}
-
-        try:
-            res = self.requests_session.post(url, files=files, headers=headers, cookies=cookies, timeout=20, allow_redirects=False)
-        except requests.exceptions.Timeout:
-            return (False, 'Request timeout!')
-        except requests.exceptions.ConnectionError:
-            return (False, 'Connection error!')
-
-        if res.status_code != 303:
-            return (False, 'Unknown error!')
-
-        redirect_url = res.headers.get('Location')
-        if not redirect_url:
-            return (False, 'Error getting redirect URL!')
-
-        parsed_url = urlparse(redirect_url)
-        query_params = parse_qs(parsed_url.query)
-
-        if ('vsrid' not in query_params) or ('gsessionid' not in query_params):
-            return (False, 'Unknown error!')
-
-        try:
-            res = self.requests_session.get(f"https://lens.google.com/qfmetadata?vsrid={query_params['vsrid'][0]}&gsessionid={query_params['gsessionid'][0]}", timeout=20)
-        except requests.exceptions.Timeout:
-            return (False, 'Request timeout!')
-        except requests.exceptions.ConnectionError:
-            return (False, 'Connection error!')
-
-        if (len(res.text.splitlines()) != 3):
-            return (False, 'Unknown error!')
-
-        lens_object = pyjson5.loads(res.text.splitlines()[2])
-
-        res = []
-        text = lens_object[0][2][0][0]
-        for paragraph in text:
-            for line in paragraph[1]:
-                for word in line[0]:
-                    res.append(word[1] + word[2])
-
-        x = (True, res)
-
-        if is_path:
-            img.close()
-        return x
-
-    def _preprocess(self, img):
-        if img.width * img.height > 3000000:
-            aspect_ratio = img.width / img.height
-            new_w = int(sqrt(3000000 * aspect_ratio))
-            new_h = int(new_w / aspect_ratio)
-            img = img.resize((new_w, new_h), Image.Resampling.LANCZOS)
-
-        return pil_image_to_bytes(img)
-
 class Bing:
     name = 'bing'
     readable_name = 'Bing'

@@ -1234,7 +1307,7 @@ class AzureImageAnalysis:
     available = False
     local = False
     manual_language = False
-    coordinate_support = False
+    coordinate_support = True
     threading_support = True

     def __init__(self, config={}):

@@ -1261,20 +1334,55 @@ class AzureImageAnalysis:
         except:
             return (False, 'Unknown error!')

-        res = []
-        if read_result.read:
-            for block in read_result.read.blocks:
-                for line in block.lines:
-                    res.append(line.text)
-        else:
-            return (False, 'Unknown error!')
-
-        x = (True, res)
+        ocr_result = self._to_generic_result(read_result, img.width, img.height)
+        x = (True, ocr_result)

         if is_path:
             img.close()
         return x

+    def _to_generic_result(self, read_result, img_width, img_height):
+        paragraphs = []
+        if read_result.read:
+            for block in read_result.read.blocks:
+                lines = []
+                for azure_line in block.lines:
+                    l_bbox = self._convert_bbox(azure_line.bounding_polygon, img_width, img_height)
+
+                    words = []
+                    for azure_word in azure_line.words:
+                        w_bbox = self._convert_bbox(azure_word.bounding_polygon, img_width, img_height)
+                        word = Word(
+                            text=azure_word.text,
+                            bounding_box=w_bbox
+                        )
+                        words.append(word)
+
+                    line = Line(
+                        bounding_box=l_bbox,
+                        words=words,
+                        text=azure_line.text
+                    )
+                    lines.append(line)
+
+                p_bbox = merge_bounding_boxes(lines)
+                paragraph = Paragraph(bounding_box=p_bbox, lines=lines)
+                paragraphs.append(paragraph)
+
+        return OcrResult(
+            image_properties=ImageProperties(width=img_width, height=img_height),
+            paragraphs=paragraphs
+        )
+
+    def _convert_bbox(self, rect, img_width, img_height):
+        return quad_to_bounding_box(
+            rect[0]['x'], rect[0]['y'],
+            rect[1]['x'], rect[1]['y'],
+            rect[2]['x'], rect[2]['y'],
+            rect[3]['x'], rect[3]['y'],
+            img_width, img_height
+        )
+
     def _preprocess(self, img):
         min_pixel_size = 50
         max_pixel_size = 10000
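
Reviewer note: like Google Vision, Azure reports each element as a 4-point polygon, so everything funnels through quad_to_bounding_box to get a center/size/rotation box. A simplified stand-in for that kind of quad-to-box conversion (illustration only; the real quad_to_bounding_box also takes img_width/img_height, presumably for normalization):

    import numpy as np

    def quad_to_box(quad):
        """quad: 4x2 array ordered TL, TR, BR, BL -> (center, width, height, angle)."""
        quad = np.asarray(quad, dtype=float)
        center = quad.mean(axis=0)
        width = (np.linalg.norm(quad[1] - quad[0]) + np.linalg.norm(quad[2] - quad[3])) / 2
        height = (np.linalg.norm(quad[3] - quad[0]) + np.linalg.norm(quad[2] - quad[1])) / 2
        top = quad[1] - quad[0]                    # top edge direction gives the tilt
        angle = np.arctan2(top[1], top[0])
        return center, width, height, angle

    print(quad_to_box([[0, 0], [4, 0], [4, 2], [0, 2]]))
    # (array([2., 1.]), 4.0, 2.0, 0.0)
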
@@ -1461,7 +1569,7 @@ class OCRSpace:
     available = False
     local = False
     manual_language = True
-    coordinate_support = False
+    coordinate_support = True
     threading_support = True

     def __init__(self, config={}, language='ja'):

@@ -1498,14 +1606,16 @@ class OCRSpace:
         if not img:
             return (False, 'Invalid image provided')

-        img_bytes, img_extension, _ = self._preprocess(img)
+        og_img_width, og_img_height = img.size
+        img_bytes, img_extension, img_size = self._preprocess(img)
         if not img_bytes:
             return (False, 'Image is too big!')

         data = {
             'apikey': self.api_key,
             'language': self.language,
-            'OCREngine': str(self.engine_version)
+            'OCREngine': str(self.engine_version),
+            'isOverlayRequired': 'True'
         }
         files = {'file': ('image.' + img_extension, img_bytes, 'image/' + img_extension)}

@@ -1526,12 +1636,57 @@ class OCRSpace:
         if res['IsErroredOnProcessing']:
             return (False, res['ErrorMessage'])

-        res = res['ParsedResults'][0]['ParsedText']
-        x = (True, res)
+        img_width, img_height = img_size
+        ocr_result = self._to_generic_result(res, img_width, img_height, og_img_width, og_img_height)
+        x = (True, ocr_result)

         if is_path:
             img.close()
         return x

+    def _to_generic_result(self, api_result, img_width, img_height, og_img_width, og_img_height):
+        parsed_result = api_result['ParsedResults'][0]
+        text_overlay = parsed_result.get('TextOverlay', {})
+
+        image_props = ImageProperties(width=og_img_width, height=og_img_height)
+        ocr_result = OcrResult(image_properties=image_props)
+
+        lines_data = text_overlay.get('Lines', [])
+
+        lines = []
+        for line_data in lines_data:
+            words = []
+            for word_data in line_data.get('Words', []):
+                w_bbox = self._convert_bbox(word_data, img_width, img_height)
+                words.append(Word(text=word_data['WordText'], bounding_box=w_bbox))
+
+            l_bbox = merge_bounding_boxes(words)
+            lines.append(Line(bounding_box=l_bbox, words=words))
+
+        if lines:
+            p_bbox = merge_bounding_boxes(lines)
+            paragraph = Paragraph(bounding_box=p_bbox, lines=lines)
+            ocr_result.paragraphs = [paragraph]
+        else:
+            ocr_result.paragraphs = []
+
+        return ocr_result
+
+    def _convert_bbox(self, word_data, img_width, img_height):
+        left = word_data['Left'] / img_width
+        top = word_data['Top'] / img_height
+        width = word_data['Width'] / img_width
+        height = word_data['Height'] / img_height
+
+        center_x = left + width / 2
+        center_y = top + height / 2
+
+        return BoundingBox(
+            center_x=center_x,
+            center_y=center_y,
+            width=width,
+            height=height
+        )
+
     def _preprocess(self, img):
         return limit_image_size(img, self.max_byte_size)
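
Reviewer note: OCR.space only returns per-word rectangles when 'isOverlayRequired' is set, which is why the request body changes above. The overlay shape the converter consumes looks roughly like this (a hand-written fixture, not a captured response), and _convert_bbox normalizes to fractions of the uploaded image, which may have been downscaled by _preprocess — hence og_img_width/og_img_height being tracked separately:

    word_data = {'WordText': '日本語', 'Left': 10, 'Top': 20, 'Width': 60, 'Height': 16}
    img_width, img_height = 200, 100  # size of the (possibly resized) uploaded image

    # Same arithmetic as _convert_bbox above:
    width = word_data['Width'] / img_width
    height = word_data['Height'] / img_height
    center_x = word_data['Left'] / img_width + width / 2
    center_y = word_data['Top'] / img_height + height / 2
    print(center_x, center_y, width, height)  # 0.2 0.28 0.3 0.16
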
@@ -1341,8 +1341,6 @@ class OutputResult:
         else:
             output_string = self._post_process(result_data_text, False)
             log_message = output_string
-            if output_format == 'json':
-                logger.opt(colors=True).warning(f"<{engine_color}>{engine_name}</{engine_color}> does not support JSON output. Falling back to text.")

         if verbosity != 0:
             if verbosity < -1:

@@ -1494,6 +1492,7 @@ def run():

     global engine_instances
     global engine_keys
+    output_format = config.get_general('output_format')
     engine_instances = []
     config_engines = []
     engine_keys = []

@@ -1506,6 +1505,11 @@ def run():

     for _,engine_class in sorted(inspect.getmembers(sys.modules[__name__], lambda x: hasattr(x, '__module__') and x.__module__ and __package__ + '.ocr' in x.__module__ and inspect.isclass(x) and hasattr(x, 'name'))):
         if len(config_engines) == 0 or engine_class.name in config_engines:
+
+            if output_format == 'json' and not engine_class.coordinate_support:
+                logger.warning(f"Skipping {engine_class.readable_name} as it does not support JSON output.")
+                continue
+
             if config.get_engine(engine_class.name) == None:
                 if engine_class.manual_language:
                     engine_instance = engine_class(language=config.get_general('language'))

@@ -1545,7 +1549,6 @@ def run():
     read_from_path = None
     read_from_readable = []
     write_to = config.get_general('write_to')
-    output_format = config.get_general('output_format')
     terminated = threading.Event()
     paused = threading.Event()
     if config.get_general('pause_at_startup'):

@@ -11,9 +11,8 @@
 ;a path to a text file.
 ;write_to = clipboard

-;OCR engine to use. Available: "mangaocr", "glens", "glensweb", "bing",
-;"gvision", "avision", "alivetext", "azure", "winrtocr", "oneocr", "easyocr",
-;"rapidocr", "ocrspace".
+;OCR engine to use. Available: "mangaocr", "glens", "bing","gvision", "avision",
+;"alivetext", "azure", "winrtocr", "oneocr", "easyocr", "rapidocr", "ocrspace".
 ;engine =

 ;OCR engine to use for two-pass processing.

@@ -30,15 +29,18 @@
 ;delete_images = False

 ;Available:
-;avision,alivetext,bing,glens,glensweb,gvision,azure,mangaocr,winrtocr,oneocr,easyocr,rapidocr,ocrspace
-;engines = avision,alivetext,bing,glens,glensweb,gvision,azure,mangaocr,winrtocr,oneocr,easyocr,rapidocr,ocrspace
+;avision,alivetext,bing,glens,gvision,azure,mangaocr,winrtocr,oneocr,easyocr,rapidocr,ocrspace
+;engines = avision,alivetext,bing,glens,gvision,azure,mangaocr,winrtocr,oneocr,easyocr,rapidocr,ocrspace

 ;logger_format = <green>{time:HH:mm:ss.SSS}</green> | <level>{message}</level>

 ;engine_color = cyan

+;Delay (in seconds) between checks when reading from clipboard (on macOS/Linux)
+;or a directory.
+;delay_secs = 0.5
+
 ;Websocket port to use if reading or writing to websocket.
 ;websocket_port = 7331

 ;Show an operating system notification with the detected text. Will be ignored

@@ -61,10 +61,6 @@ lens = [
     "betterproto==2.0.0b7"
 ]

-lensweb = [
-    "pyjson5"
-]
-
 gvision = [
     "google-cloud-vision"
 ]