Add EasyOCR/PaddleOCR, remove unneeded stuff
@@ -1 +0,0 @@
|
|||||||
include assets/example.jpg
|
|
||||||
|
Before Width: | Height: | Size: 55 KiB |
|
Before Width: | Height: | Size: 9.2 KiB |
|
Before Width: | Height: | Size: 34 KiB |
|
Before Width: | Height: | Size: 2.8 KiB |
|
Before Width: | Height: | Size: 18 KiB |
|
Before Width: | Height: | Size: 10 KiB |
|
Before Width: | Height: | Size: 3.8 KiB |
|
Before Width: | Height: | Size: 9.2 KiB |
|
Before Width: | Height: | Size: 15 KiB |
|
Before Width: | Height: | Size: 6.9 KiB |
|
Before Width: | Height: | Size: 6.2 KiB |
|
Before Width: | Height: | Size: 3.4 KiB |
|
Before Width: | Height: | Size: 15 KiB |
|
Before Width: | Height: | Size: 304 KiB |
|
Before Width: | Height: | Size: 405 KiB |
@@ -1,251 +0,0 @@
|
|||||||
len,p
|
|
||||||
1,0.014734972701616804
|
|
||||||
2,0.05048222747773489
|
|
||||||
3,0.05624961536094529
|
|
||||||
4,0.05972235654062228
|
|
||||||
5,0.05244278768803355
|
|
||||||
6,0.05518581363248727
|
|
||||||
7,0.046578690556781516
|
|
||||||
8,0.04875025276280738
|
|
||||||
9,0.04442471185039959
|
|
||||||
10,0.04181356215327536
|
|
||||||
11,0.040160713186745564
|
|
||||||
12,0.041162972666449804
|
|
||||||
13,0.03785727473339019
|
|
||||||
14,0.03527250028573187
|
|
||||||
15,0.03326798132632338
|
|
||||||
16,0.0307271656277749
|
|
||||||
17,0.028151182929938547
|
|
||||||
18,0.025794993977651372
|
|
||||||
19,0.024731192249193356
|
|
||||||
20,0.021856290057410126
|
|
||||||
21,0.021135366572008825
|
|
||||||
22,0.019113264112956403
|
|
||||||
23,0.017073578154260045
|
|
||||||
24,0.015992192926158093
|
|
||||||
25,0.013952506967461734
|
|
||||||
26,0.012572202245412905
|
|
||||||
27,0.011288606771405713
|
|
||||||
28,0.009758842302383443
|
|
||||||
29,0.008993960067872309
|
|
||||||
30,0.008176327334429372
|
|
||||||
31,0.0072356101034788955
|
|
||||||
32,0.006919107109888081
|
|
||||||
33,0.005978389878937605
|
|
||||||
34,0.004712377904574347
|
|
||||||
35,0.00467721090528648
|
|
||||||
36,0.004220039914544191
|
|
||||||
37,0.003463949429855024
|
|
||||||
38,0.003358448431991419
|
|
||||||
39,0.003059528938044539
|
|
||||||
40,0.00263752494659012
|
|
||||||
41,0.0021891457056697995
|
|
||||||
42,0.002364980702109141
|
|
||||||
43,0.002013310709230458
|
|
||||||
44,0.0019341849608327545
|
|
||||||
45,0.0013099707234730928
|
|
||||||
46,0.0013363459729389942
|
|
||||||
47,0.001204469725609488
|
|
||||||
48,0.0011341357270337517
|
|
||||||
49,0.0008967584818406407
|
|
||||||
50,0.000914341981484575
|
|
||||||
51,0.000914341981484575
|
|
||||||
52,0.0007736739843331018
|
|
||||||
53,0.0006505894868255629
|
|
||||||
54,0.0006681729864694971
|
|
||||||
55,0.0005011297398521228
|
|
||||||
56,0.0005714637384278593
|
|
||||||
57,0.00044837924092032037
|
|
||||||
58,0.000395628741988518
|
|
||||||
59,0.00031650299359081436
|
|
||||||
60,0.00031650299359081436
|
|
||||||
61,0.0002813359943029461
|
|
||||||
62,0.00026375249465901196
|
|
||||||
63,0.0002725442444809791
|
|
||||||
64,0.00020221024590524252
|
|
||||||
65,0.00032529474341278143
|
|
||||||
66,0.00023737724519311078
|
|
||||||
67,0.00023737724519311078
|
|
||||||
68,0.00022858549537114374
|
|
||||||
69,0.00020221024590524252
|
|
||||||
70,0.00012308449750753894
|
|
||||||
71,0.00010550099786360479
|
|
||||||
72,8.791749821967066e-05
|
|
||||||
73,0.00012308449750753894
|
|
||||||
74,0.00011429274768557187
|
|
||||||
75,7.912574839770359e-05
|
|
||||||
76,3.516699928786826e-05
|
|
||||||
77,7.033399857573652e-05
|
|
||||||
78,8.791749821967066e-05
|
|
||||||
79,3.516699928786826e-05
|
|
||||||
80,2.6375249465901198e-05
|
|
||||||
81,6.154224875376947e-05
|
|
||||||
82,0.00011429274768557187
|
|
||||||
83,7.033399857573652e-05
|
|
||||||
84,5.2750498931802396e-05
|
|
||||||
85,4.395874910983533e-05
|
|
||||||
86,3.516699928786826e-05
|
|
||||||
87,8.791749821967066e-05
|
|
||||||
88,6.154224875376947e-05
|
|
||||||
89,1.758349964393413e-05
|
|
||||||
90,1.758349964393413e-05
|
|
||||||
91,1.758349964393413e-05
|
|
||||||
92,8.791749821967065e-06
|
|
||||||
93,3.516699928786826e-05
|
|
||||||
94,2.6375249465901198e-05
|
|
||||||
95,2.6375249465901198e-05
|
|
||||||
96,1.758349964393413e-05
|
|
||||||
97,1.758349964393413e-05
|
|
||||||
98,4.395874910983533e-05
|
|
||||||
99,4.395874910983533e-05
|
|
||||||
100,8.791749821967065e-06
|
|
||||||
101,8.791749821967065e-06
|
|
||||||
102,2.6375249465901198e-05
|
|
||||||
103,2.6375249465901198e-05
|
|
||||||
104,8.791749821967065e-06
|
|
||||||
105,8.791749821967065e-06
|
|
||||||
106,1.758349964393413e-05
|
|
||||||
107,1.758349964393413e-05
|
|
||||||
108,8.791749821967065e-06
|
|
||||||
109,8.791749821967065e-06
|
|
||||||
110,8.791749821967065e-06
|
|
||||||
111,8.791749821967065e-06
|
|
||||||
112,8.791749821967065e-06
|
|
||||||
113,8.791749821967065e-06
|
|
||||||
114,3.516699928786826e-05
|
|
||||||
115,2.6375249465901198e-05
|
|
||||||
116,2.6375249465901198e-05
|
|
||||||
117,2.6375249465901198e-05
|
|
||||||
118,8.791749821967065e-06
|
|
||||||
119,8.791749821967065e-06
|
|
||||||
120,8.791749821967065e-06
|
|
||||||
121,8.791749821967065e-06
|
|
||||||
122,1.758349964393413e-05
|
|
||||||
123,8.791749821967065e-06
|
|
||||||
124,8.791749821967065e-06
|
|
||||||
125,8.791749821967065e-06
|
|
||||||
126,1.758349964393413e-05
|
|
||||||
127,1.758349964393413e-05
|
|
||||||
128,1.758349964393413e-05
|
|
||||||
129,1.758349964393413e-05
|
|
||||||
130,1.758349964393413e-05
|
|
||||||
131,8.791749821967065e-06
|
|
||||||
132,1.758349964393413e-05
|
|
||||||
133,8.791749821967065e-06
|
|
||||||
134,8.791749821967065e-06
|
|
||||||
135,8.791749821967065e-06
|
|
||||||
136,8.791749821967065e-06
|
|
||||||
137,8.791749821967065e-06
|
|
||||||
138,8.791749821967065e-06
|
|
||||||
139,8.791749821967065e-06
|
|
||||||
140,8.791749821967065e-06
|
|
||||||
141,8.791749821967065e-06
|
|
||||||
142,8.791749821967065e-06
|
|
||||||
143,8.791749821967065e-06
|
|
||||||
144,8.791749821967065e-06
|
|
||||||
145,8.791749821967065e-06
|
|
||||||
146,8.791749821967065e-06
|
|
||||||
147,8.791749821967065e-06
|
|
||||||
148,8.791749821967065e-06
|
|
||||||
149,8.791749821967065e-06
|
|
||||||
150,8.791749821967065e-06
|
|
||||||
151,8.791749821967065e-06
|
|
||||||
152,8.791749821967065e-06
|
|
||||||
153,8.791749821967065e-06
|
|
||||||
154,8.791749821967065e-06
|
|
||||||
155,8.791749821967065e-06
|
|
||||||
156,8.791749821967065e-06
|
|
||||||
157,8.791749821967065e-06
|
|
||||||
158,8.791749821967065e-06
|
|
||||||
159,8.791749821967065e-06
|
|
||||||
160,8.791749821967065e-06
|
|
||||||
161,8.791749821967065e-06
|
|
||||||
162,8.791749821967065e-06
|
|
||||||
163,8.791749821967065e-06
|
|
||||||
164,8.791749821967065e-06
|
|
||||||
165,8.791749821967065e-06
|
|
||||||
166,8.791749821967065e-06
|
|
||||||
167,8.791749821967065e-06
|
|
||||||
168,8.791749821967065e-06
|
|
||||||
169,8.791749821967065e-06
|
|
||||||
170,8.791749821967065e-06
|
|
||||||
171,8.791749821967065e-06
|
|
||||||
172,8.791749821967065e-06
|
|
||||||
173,8.791749821967065e-06
|
|
||||||
174,8.791749821967065e-06
|
|
||||||
175,8.791749821967065e-06
|
|
||||||
176,8.791749821967065e-06
|
|
||||||
177,8.791749821967065e-06
|
|
||||||
178,8.791749821967065e-06
|
|
||||||
179,8.791749821967065e-06
|
|
||||||
180,8.791749821967065e-06
|
|
||||||
181,8.791749821967065e-06
|
|
||||||
182,8.791749821967065e-06
|
|
||||||
183,8.791749821967065e-06
|
|
||||||
184,8.791749821967065e-06
|
|
||||||
185,8.791749821967065e-06
|
|
||||||
186,8.791749821967065e-06
|
|
||||||
187,8.791749821967065e-06
|
|
||||||
188,8.791749821967065e-06
|
|
||||||
189,8.791749821967065e-06
|
|
||||||
190,8.791749821967065e-06
|
|
||||||
191,8.791749821967065e-06
|
|
||||||
192,8.791749821967065e-06
|
|
||||||
193,8.791749821967065e-06
|
|
||||||
194,8.791749821967065e-06
|
|
||||||
195,8.791749821967065e-06
|
|
||||||
196,8.791749821967065e-06
|
|
||||||
197,8.791749821967065e-06
|
|
||||||
198,8.791749821967065e-06
|
|
||||||
199,8.791749821967065e-06
|
|
||||||
200,8.791749821967065e-06
|
|
||||||
201,8.791749821967065e-06
|
|
||||||
202,8.791749821967065e-06
|
|
||||||
203,8.791749821967065e-06
|
|
||||||
204,8.791749821967065e-06
|
|
||||||
205,8.791749821967065e-06
|
|
||||||
206,8.791749821967065e-06
|
|
||||||
207,8.791749821967065e-06
|
|
||||||
208,8.791749821967065e-06
|
|
||||||
209,8.791749821967065e-06
|
|
||||||
210,8.791749821967065e-06
|
|
||||||
211,8.791749821967065e-06
|
|
||||||
212,8.791749821967065e-06
|
|
||||||
213,8.791749821967065e-06
|
|
||||||
214,8.791749821967065e-06
|
|
||||||
215,8.791749821967065e-06
|
|
||||||
216,8.791749821967065e-06
|
|
||||||
217,8.791749821967065e-06
|
|
||||||
218,8.791749821967065e-06
|
|
||||||
219,8.791749821967065e-06
|
|
||||||
220,8.791749821967065e-06
|
|
||||||
221,8.791749821967065e-06
|
|
||||||
222,8.791749821967065e-06
|
|
||||||
223,8.791749821967065e-06
|
|
||||||
224,8.791749821967065e-06
|
|
||||||
225,8.791749821967065e-06
|
|
||||||
226,8.791749821967065e-06
|
|
||||||
227,8.791749821967065e-06
|
|
||||||
228,8.791749821967065e-06
|
|
||||||
229,8.791749821967065e-06
|
|
||||||
230,8.791749821967065e-06
|
|
||||||
231,8.791749821967065e-06
|
|
||||||
232,8.791749821967065e-06
|
|
||||||
233,8.791749821967065e-06
|
|
||||||
234,8.791749821967065e-06
|
|
||||||
235,8.791749821967065e-06
|
|
||||||
236,8.791749821967065e-06
|
|
||||||
237,8.791749821967065e-06
|
|
||||||
238,8.791749821967065e-06
|
|
||||||
239,8.791749821967065e-06
|
|
||||||
240,8.791749821967065e-06
|
|
||||||
241,8.791749821967065e-06
|
|
||||||
242,8.791749821967065e-06
|
|
||||||
243,8.791749821967065e-06
|
|
||||||
244,8.791749821967065e-06
|
|
||||||
245,8.791749821967065e-06
|
|
||||||
246,8.791749821967065e-06
|
|
||||||
247,8.791749821967065e-06
|
|
||||||
248,8.791749821967065e-06
|
|
||||||
249,8.791749821967065e-06
|
|
||||||
250,8.791749821967065e-06
|
|
||||||
|
@@ -1,6 +0,0 @@
|
|||||||
source,id,line
|
|
||||||
cc-100,cc-100_446088,発展を遂げた貨幣経済に対して、後戻りする形の改革が、民衆に受け入れられるはずもありません。
|
|
||||||
cc-100,cc-100_446387,東京都渋谷区本町1丁目4−14 ホームヘルパー(パート:茂原)
|
|
||||||
cc-100,cc-100_446430,同時に、発表しあう場を増やしたいです。まず、自分の考えを発表するためには、しっかりと自分の考えを持っていなくてはいけません。そのために、ますますノートの必要性を感じることでしょう。また、質問や意見に答えることで、考えが深まります。友達の意見を聞くことが、より理解を深めることを実感してほしいです。
|
|
||||||
cc-100,cc-100_446493,※特典の数に限りがございますので、対象商品はお早めにお買い求めください。特典は無くなり次第終了となります。
|
|
||||||
cc-100,cc-100_446543,ハリウッドスターってもっと豪華な生活を送っているのかと思えば、キアヌ・リーブスってかなり質素なんですね。
|
|
||||||
|
5451
assets/vocab.csv
@@ -4,3 +4,5 @@ from manga_ocr.ocr import MangaOcr
|
|||||||
from manga_ocr.ocr import GoogleVision
|
from manga_ocr.ocr import GoogleVision
|
||||||
from manga_ocr.ocr import AppleVision
|
from manga_ocr.ocr import AppleVision
|
||||||
from manga_ocr.ocr import AzureComputerVision
|
from manga_ocr.ocr import AzureComputerVision
|
||||||
|
from manga_ocr.ocr import EasyOCR
|
||||||
|
from manga_ocr.ocr import PaddleOCR
|
||||||
|
|||||||
@@ -10,6 +10,7 @@ import platform
|
|||||||
|
|
||||||
import jaconv
|
import jaconv
|
||||||
import torch
|
import torch
|
||||||
|
import numpy as np
|
||||||
from PIL import Image
|
from PIL import Image
|
||||||
from loguru import logger
|
from loguru import logger
|
||||||
from transformers import ViTImageProcessor, AutoTokenizer, VisionEncoderDecoderModel
|
from transformers import ViTImageProcessor, AutoTokenizer, VisionEncoderDecoderModel
|
||||||
@@ -33,9 +34,19 @@ try:
|
|||||||
except ImportError:
|
except ImportError:
|
||||||
pass
|
pass
|
||||||
|
|
||||||
|
try:
|
||||||
|
import easyocr
|
||||||
|
except ImportError:
|
||||||
|
pass
|
||||||
|
|
||||||
|
try:
|
||||||
|
from paddleocr import PaddleOCR as POCR
|
||||||
|
except ImportError:
|
||||||
|
pass
|
||||||
|
|
||||||
class MangaOcr:
|
class MangaOcr:
|
||||||
def __init__(self, pretrained_model_name_or_path='kha-white/manga-ocr-base', force_cpu=False):
|
def __init__(self, pretrained_model_name_or_path='kha-white/manga-ocr-base', force_cpu=False):
|
||||||
logger.info(f'Loading OCR model from {pretrained_model_name_or_path}')
|
logger.info(f'Loading Manga OCR model from {pretrained_model_name_or_path}')
|
||||||
self.processor = ViTImageProcessor.from_pretrained(pretrained_model_name_or_path)
|
self.processor = ViTImageProcessor.from_pretrained(pretrained_model_name_or_path)
|
||||||
self.tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path)
|
self.tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path)
|
||||||
self.model = VisionEncoderDecoderModel.from_pretrained(pretrained_model_name_or_path)
|
self.model = VisionEncoderDecoderModel.from_pretrained(pretrained_model_name_or_path)
|
||||||
@@ -222,6 +233,76 @@ class AzureComputerVision:
|
|||||||
image_io.seek(0)
|
image_io.seek(0)
|
||||||
return image_io
|
return image_io
|
||||||
|
|
||||||
|
class EasyOCR:
|
||||||
|
def __init__(self):
|
||||||
|
if 'easyocr' not in sys.modules:
|
||||||
|
logger.warning('easyocr not available, EasyOCR will not work!')
|
||||||
|
self.available = False
|
||||||
|
else:
|
||||||
|
logger.info('Loading EasyOCR model')
|
||||||
|
self.model = easyocr.Reader(['ja','en'])
|
||||||
|
self.available = True
|
||||||
|
logger.info('EasyOCR ready')
|
||||||
|
|
||||||
|
def __call__(self, img_or_path):
|
||||||
|
if not self.available:
|
||||||
|
return "Engine not available!"
|
||||||
|
|
||||||
|
if isinstance(img_or_path, str) or isinstance(img_or_path, Path):
|
||||||
|
img = Image.open(img_or_path)
|
||||||
|
elif isinstance(img_or_path, Image.Image):
|
||||||
|
img = img_or_path
|
||||||
|
else:
|
||||||
|
raise ValueError(f'img_or_path must be a path or PIL.Image, instead got: {img_or_path}')
|
||||||
|
|
||||||
|
res = ''
|
||||||
|
read_result = self.model.readtext(self._preprocess(img), detail=0)
|
||||||
|
for text in read_result:
|
||||||
|
res += text + ' '
|
||||||
|
|
||||||
|
x = post_process(res)
|
||||||
|
return x
|
||||||
|
|
||||||
|
def _preprocess(self, img):
|
||||||
|
image_bytes = io.BytesIO()
|
||||||
|
img.save(image_bytes, format=img.format)
|
||||||
|
return image_bytes.getvalue()
|
||||||
|
|
||||||
|
class PaddleOCR:
|
||||||
|
def __init__(self):
|
||||||
|
if 'paddleocr' not in sys.modules:
|
||||||
|
logger.warning('easyocr not available, PaddleOCR will not work!')
|
||||||
|
self.available = False
|
||||||
|
else:
|
||||||
|
logger.info('Loading PaddleOCR model')
|
||||||
|
self.model = POCR(use_angle_cls=True, show_log=False, lang='japan')
|
||||||
|
self.available = True
|
||||||
|
logger.info('PaddleOCR ready')
|
||||||
|
|
||||||
|
def __call__(self, img_or_path):
|
||||||
|
if not self.available:
|
||||||
|
return "Engine not available!"
|
||||||
|
|
||||||
|
if isinstance(img_or_path, str) or isinstance(img_or_path, Path):
|
||||||
|
img = Image.open(img_or_path)
|
||||||
|
elif isinstance(img_or_path, Image.Image):
|
||||||
|
img = img_or_path
|
||||||
|
else:
|
||||||
|
raise ValueError(f'img_or_path must be a path or PIL.Image, instead got: {img_or_path}')
|
||||||
|
|
||||||
|
res = ''
|
||||||
|
read_results = self.model.ocr(self._preprocess(img), cls=True)
|
||||||
|
for read_result in read_results:
|
||||||
|
if read_result:
|
||||||
|
for text in read_result:
|
||||||
|
res += text[1][0] + ' '
|
||||||
|
|
||||||
|
x = post_process(res)
|
||||||
|
return x
|
||||||
|
|
||||||
|
def _preprocess(self, img):
|
||||||
|
return np.array(img.convert('RGB'))
|
||||||
|
|
||||||
|
|
||||||
def post_process(text):
|
def post_process(text):
|
||||||
text = ''.join(text.split())
|
text = ''.join(text.split())
|
||||||
|
|||||||
@@ -12,17 +12,7 @@ from PIL import UnidentifiedImageError
|
|||||||
from loguru import logger
|
from loguru import logger
|
||||||
from pynput import keyboard
|
from pynput import keyboard
|
||||||
|
|
||||||
from manga_ocr import MangaOcr
|
from manga_ocr import *
|
||||||
from manga_ocr import GoogleVision
|
|
||||||
from manga_ocr import AppleVision
|
|
||||||
from manga_ocr import AzureComputerVision
|
|
||||||
|
|
||||||
engines = ['avision', 'gvision', 'azure', 'mangaocr']
|
|
||||||
|
|
||||||
|
|
||||||
def get_engine_name(engine):
|
|
||||||
engine_names = ['Apple Vision', 'Google Vision', 'Azure Computer Vision', 'Manga OCR']
|
|
||||||
return engine_names[engines.index(engine)]
|
|
||||||
|
|
||||||
|
|
||||||
def are_images_identical(img1, img2):
|
def are_images_identical(img1, img2):
|
||||||
@@ -35,19 +25,12 @@ def are_images_identical(img1, img2):
|
|||||||
return (img1.shape == img2.shape) and (img1 == img2).all()
|
return (img1.shape == img2.shape) and (img1 == img2).all()
|
||||||
|
|
||||||
|
|
||||||
def process_and_write_results(mocr, avision, gvision, azure, img_or_path, write_to, engine):
|
def process_and_write_results(engine_instance, engine_name, img_or_path, write_to):
|
||||||
t0 = time.time()
|
t0 = time.time()
|
||||||
if engine == 'gvision':
|
text = engine_instance(img_or_path)
|
||||||
text = gvision(img_or_path)
|
|
||||||
elif engine == 'avision':
|
|
||||||
text = avision(img_or_path)
|
|
||||||
elif engine == 'azure':
|
|
||||||
text = azure(img_or_path)
|
|
||||||
else:
|
|
||||||
text = mocr(img_or_path)
|
|
||||||
t1 = time.time()
|
t1 = time.time()
|
||||||
|
|
||||||
logger.opt(ansi=True).info(f"Text recognized in {t1 - t0:0.03f}s using <cyan>{get_engine_name(engine)}</cyan>: {text}")
|
logger.opt(ansi=True).info(f"Text recognized in {t1 - t0:0.03f}s using <cyan>{engine_name}</cyan>: {text}")
|
||||||
|
|
||||||
if write_to == 'clipboard':
|
if write_to == 'clipboard':
|
||||||
pyperclip.copy(text)
|
pyperclip.copy(text)
|
||||||
@@ -81,7 +64,7 @@ def run(read_from='clipboard',
|
|||||||
:param pretrained_model_name_or_path: Path to a trained model, either local or from Transformers' model hub.
|
:param pretrained_model_name_or_path: Path to a trained model, either local or from Transformers' model hub.
|
||||||
:param force_cpu: If True, OCR will use CPU even if GPU is available.
|
:param force_cpu: If True, OCR will use CPU even if GPU is available.
|
||||||
:param delay_secs: How often to check for new images, in seconds.
|
:param delay_secs: How often to check for new images, in seconds.
|
||||||
:param engine: OCR engine to use. Available: "mangaocr", "gvision", "avision", "azure".
|
:param engine: OCR engine to use. Available: "mangaocr", "gvision", "avision", "azure", "easyocr", "paddleocr".
|
||||||
:param verbose: If True, unhides all warnings.
|
:param verbose: If True, unhides all warnings.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
@@ -93,10 +76,20 @@ def run(read_from='clipboard',
|
|||||||
}
|
}
|
||||||
logger.configure(**config)
|
logger.configure(**config)
|
||||||
|
|
||||||
mocr = MangaOcr(pretrained_model_name_or_path, force_cpu)
|
avision = AppleVision()
|
||||||
gvision = GoogleVision()
|
gvision = GoogleVision()
|
||||||
azure = AzureComputerVision()
|
azure = AzureComputerVision()
|
||||||
avision = AppleVision()
|
mangaocr = MangaOcr(pretrained_model_name_or_path, force_cpu)
|
||||||
|
easyocr = EasyOCR()
|
||||||
|
paddleocr = PaddleOCR()
|
||||||
|
|
||||||
|
engines = ['avision', 'gvision', 'azure', 'mangaocr', 'easyocr', 'paddleocr']
|
||||||
|
engine_names = ['Apple Vision', 'Google Vision', 'Azure Computer Vision', 'Manga OCR', 'EasyOCR', 'PaddleOCR']
|
||||||
|
engine_instances = [avision, gvision, azure, mangaocr, easyocr, paddleocr]
|
||||||
|
engine_keys = 'agvmeo'
|
||||||
|
|
||||||
|
def get_engine_name(engine):
|
||||||
|
return engine_names[engines.index(engine)]
|
||||||
|
|
||||||
if engine not in engines:
|
if engine not in engines:
|
||||||
msg = 'Unknown OCR engine!'
|
msg = 'Unknown OCR engine!'
|
||||||
@@ -203,8 +196,8 @@ def run(read_from='clipboard',
|
|||||||
engine = engines[engines.index(engine) + 1]
|
engine = engines[engines.index(engine) + 1]
|
||||||
|
|
||||||
logger.opt(ansi=True).info(f"Switched to <cyan>{get_engine_name(engine)}</cyan>!")
|
logger.opt(ansi=True).info(f"Switched to <cyan>{get_engine_name(engine)}</cyan>!")
|
||||||
elif user_input.lower() in 'agvm':
|
elif user_input.lower() in engine_keys:
|
||||||
new_engine = engines['agvm'.find(user_input.lower())]
|
new_engine = engines[engine_keys.find(user_input.lower())]
|
||||||
if engine != new_engine:
|
if engine != new_engine:
|
||||||
engine = new_engine
|
engine = new_engine
|
||||||
logger.opt(ansi=True).info(f"Switched to <cyan>{get_engine_name(engine)}</cyan>!")
|
logger.opt(ansi=True).info(f"Switched to <cyan>{get_engine_name(engine)}</cyan>!")
|
||||||
@@ -228,7 +221,7 @@ def run(read_from='clipboard',
|
|||||||
logger.warning('Error while reading from clipboard ({})'.format(error))
|
logger.warning('Error while reading from clipboard ({})'.format(error))
|
||||||
else:
|
else:
|
||||||
if not just_unpaused and isinstance(img, Image.Image) and not are_images_identical(img, old_img):
|
if not just_unpaused and isinstance(img, Image.Image) and not are_images_identical(img, old_img):
|
||||||
process_and_write_results(mocr, avision, gvision, azure, img, write_to, engine)
|
process_and_write_results(engine_instances[engines.index(engine)], get_engine_name(engine), img, write_to)
|
||||||
|
|
||||||
if just_unpaused:
|
if just_unpaused:
|
||||||
just_unpaused = False
|
just_unpaused = False
|
||||||
@@ -244,7 +237,7 @@ def run(read_from='clipboard',
|
|||||||
except (UnidentifiedImageError, OSError) as e:
|
except (UnidentifiedImageError, OSError) as e:
|
||||||
logger.warning(f'Error while reading file {path}: {e}')
|
logger.warning(f'Error while reading file {path}: {e}')
|
||||||
else:
|
else:
|
||||||
process_and_write_results(mocr, avision, gvision, azure, img, write_to, engine)
|
process_and_write_results(engine_instances[engines.index(engine)], get_engine_name(engine), img, write_to)
|
||||||
|
|
||||||
time.sleep(delay_secs)
|
time.sleep(delay_secs)
|
||||||
|
|
||||||
|
|||||||
@@ -1,98 +0,0 @@
|
|||||||
# Project structure
|
|
||||||
|
|
||||||
```
|
|
||||||
assets/ # assets (see description below)
|
|
||||||
manga_ocr/ # release code (inference only)
|
|
||||||
manga_ocr_dev/ # development code
|
|
||||||
env.py # global constants
|
|
||||||
data/ # data preprocessing
|
|
||||||
synthetic_data_generator/ # generation of synthetic image-text pairs
|
|
||||||
training/ # model training
|
|
||||||
```
|
|
||||||
|
|
||||||
## assets
|
|
||||||
|
|
||||||
### fonts.csv
|
|
||||||
csv with columns:
|
|
||||||
- font_path: path to font file, relative to `FONTS_ROOT`
|
|
||||||
- supported_chars: string of characters supported by this font
|
|
||||||
- num_chars: number of supported characters
|
|
||||||
- label: common/regular/special (used to sample regular fonts more often than special)
|
|
||||||
|
|
||||||
List of fonts with metadata used by synthetic data generator.
|
|
||||||
Provided file is just an example, you have to generate similar file for your own set of fonts,
|
|
||||||
using `manga_ocr_dev/synthetic_data_generator/scan_fonts.py` script.
|
|
||||||
Note that `label` will be filled with `regular` by default. You have to label your special fonts manually.
|
|
||||||
|
|
||||||
### lines_example.csv
|
|
||||||
csv with columns:
|
|
||||||
- source: source of text
|
|
||||||
- id: unique id of the line
|
|
||||||
- line: line from language corpus
|
|
||||||
|
|
||||||
Example of csv used for synthetic data generation.
|
|
||||||
|
|
||||||
### len_to_p.csv
|
|
||||||
csv with columns:
|
|
||||||
- len: length of text
|
|
||||||
- p: probability of text of this length occurring in manga
|
|
||||||
|
|
||||||
Used by synthetic data generator to more-or-less match the natural distribution of text lengths.
|
|
||||||
Computed based on Manga109-s dataset.
|
|
||||||
|
|
||||||
### vocab.csv
|
|
||||||
List of all characters supported by tokenizer.
|
|
||||||
|
|
||||||
# Training OCR
|
|
||||||
|
|
||||||
`env.py` contains global constants used across the repo. Set your paths to data etc. there.
|
|
||||||
|
|
||||||
1. Download [Manga109-s](http://www.manga109.org/en/download_s.html) dataset.
|
|
||||||
2. Set `MANGA109_ROOT`, so that your directory structure looks like this:
|
|
||||||
```
|
|
||||||
<MANGA109_ROOT>/
|
|
||||||
Manga109s_released_2021_02_28/
|
|
||||||
annotations/
|
|
||||||
annotations.v2018.05.31/
|
|
||||||
images/
|
|
||||||
books.txt
|
|
||||||
readme.txt
|
|
||||||
```
|
|
||||||
3. Preprocess Manga109-s with `data/process_manga109s.py`
|
|
||||||
4. Optionally generate synthetic data (see below)
|
|
||||||
5. Train with `manga_ocr_dev/training/train.py`
|
|
||||||
|
|
||||||
# Synthetic data generation
|
|
||||||
|
|
||||||
Generated data is split into packages (named `0000`, `0001` etc.) for easier management of large dataset.
|
|
||||||
Each package is assumed to have similar data distribution, so that a properly balanced dataset
|
|
||||||
can be built from any subset of packages.
|
|
||||||
|
|
||||||
Data generation pipeline assumes following directory structure:
|
|
||||||
|
|
||||||
```
|
|
||||||
<DATA_SYNTHETIC_ROOT>/
|
|
||||||
img/ # generated images (output from generation pipeline)
|
|
||||||
0000/
|
|
||||||
0001/
|
|
||||||
...
|
|
||||||
lines/ # lines from corpus (input to generation pipeline)
|
|
||||||
0000.csv
|
|
||||||
0001.csv
|
|
||||||
...
|
|
||||||
meta/ # metadata (output from generation pipeline)
|
|
||||||
0000.csv
|
|
||||||
0001.csv
|
|
||||||
...
|
|
||||||
```
|
|
||||||
|
|
||||||
To use a language corpus for data generation, `lines/*.csv` files must be provided.
|
|
||||||
For a small example of such file see `assets/lines_example.csv`.
|
|
||||||
|
|
||||||
To generate synthetic data:
|
|
||||||
1. Generate backgrounds with `data/generate_backgrounds.py`.
|
|
||||||
2. Put your fonts in `<FONTS_ROOT>`.
|
|
||||||
3. Generate fonts metadata with `synthetic_data_generator/scan_fonts.py`.
|
|
||||||
4. Optionally manually label your fonts with `common/regular/special` labels.
|
|
||||||
5. Provide `<DATA_SYNTHETIC_ROOT>/lines/*.csv`.
|
|
||||||
6. Run `synthetic_data_generator/run_generate.py` for each package.
|
|
||||||
@@ -1,85 +0,0 @@
|
|||||||
from pathlib import Path
|
|
||||||
|
|
||||||
import cv2
|
|
||||||
import numpy as np
|
|
||||||
import pandas as pd
|
|
||||||
from tqdm import tqdm
|
|
||||||
|
|
||||||
from manga_ocr_dev.env import MANGA109_ROOT, BACKGROUND_DIR
|
|
||||||
|
|
||||||
|
|
||||||
def find_rectangle(mask, y, x, aspect_ratio_range=(0.33, 3.0)):
|
|
||||||
ymin_ = ymax_ = y
|
|
||||||
xmin_ = xmax_ = x
|
|
||||||
|
|
||||||
ymin = ymax = xmin = xmax = None
|
|
||||||
|
|
||||||
while True:
|
|
||||||
if ymin is None:
|
|
||||||
ymin_ -= 1
|
|
||||||
if ymin_ == 0 or mask[ymin_, xmin_:xmax_].any():
|
|
||||||
ymin = ymin_
|
|
||||||
|
|
||||||
if ymax is None:
|
|
||||||
ymax_ += 1
|
|
||||||
if ymax_ == mask.shape[0] - 1 or mask[ymax_, xmin_:xmax_].any():
|
|
||||||
ymax = ymax_
|
|
||||||
|
|
||||||
if xmin is None:
|
|
||||||
xmin_ -= 1
|
|
||||||
if xmin_ == 0 or mask[ymin_:ymax_, xmin_].any():
|
|
||||||
xmin = xmin_
|
|
||||||
|
|
||||||
if xmax is None:
|
|
||||||
xmax_ += 1
|
|
||||||
if xmax_ == mask.shape[1] - 1 or mask[ymin_:ymax_, xmax_].any():
|
|
||||||
xmax = xmax_
|
|
||||||
|
|
||||||
h = ymax_ - ymin_
|
|
||||||
w = xmax_ - xmin_
|
|
||||||
if h > 1 and w > 1:
|
|
||||||
ratio = w / h
|
|
||||||
if ratio < aspect_ratio_range[0] or ratio > aspect_ratio_range[1]:
|
|
||||||
return ymin_, ymax_, xmin_, xmax_
|
|
||||||
|
|
||||||
if None not in (ymin, ymax, xmin, xmax):
|
|
||||||
return ymin, ymax, xmin, xmax
|
|
||||||
|
|
||||||
|
|
||||||
def generate_backgrounds(crops_per_page=5, min_size=40):
|
|
||||||
data = pd.read_csv(MANGA109_ROOT / 'data.csv')
|
|
||||||
frames_df = pd.read_csv(MANGA109_ROOT / 'frames.csv')
|
|
||||||
|
|
||||||
BACKGROUND_DIR.mkdir(parents=True, exist_ok=True)
|
|
||||||
|
|
||||||
page_paths = data.page_path.unique()
|
|
||||||
for page_path in tqdm(page_paths):
|
|
||||||
page = cv2.imread(str(MANGA109_ROOT / page_path))
|
|
||||||
mask = np.zeros((page.shape[0], page.shape[1]), dtype=bool)
|
|
||||||
for row in data[data.page_path == page_path].itertuples():
|
|
||||||
mask[row.ymin:row.ymax, row.xmin:row.xmax] = True
|
|
||||||
|
|
||||||
frames_mask = np.zeros((page.shape[0], page.shape[1]), dtype=bool)
|
|
||||||
for row in frames_df[frames_df.page_path == page_path].itertuples():
|
|
||||||
frames_mask[row.ymin:row.ymax, row.xmin:row.xmax] = True
|
|
||||||
|
|
||||||
mask = mask | ~frames_mask
|
|
||||||
|
|
||||||
if mask.all():
|
|
||||||
continue
|
|
||||||
|
|
||||||
unmasked_points = np.stack(np.where(~mask), axis=1)
|
|
||||||
for i in range(crops_per_page):
|
|
||||||
p = unmasked_points[np.random.randint(0, unmasked_points.shape[0])]
|
|
||||||
y, x = p
|
|
||||||
ymin, ymax, xmin, xmax = find_rectangle(mask, y, x)
|
|
||||||
crop = page[ymin:ymax, xmin:xmax]
|
|
||||||
|
|
||||||
if crop.shape[0] >= min_size and crop.shape[1] >= min_size:
|
|
||||||
out_filename = '_'.join(
|
|
||||||
Path(page_path).with_suffix('').parts[-2:]) + f'_{ymin}_{ymax}_{xmin}_{xmax}.png'
|
|
||||||
cv2.imwrite(str(BACKGROUND_DIR / out_filename), crop)
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':
|
|
||||||
generate_backgrounds()
|
|
||||||
@@ -1,103 +0,0 @@
|
|||||||
import xml.etree.ElementTree as ET
|
|
||||||
from pathlib import Path
|
|
||||||
|
|
||||||
import cv2
|
|
||||||
import pandas as pd
|
|
||||||
from tqdm import tqdm
|
|
||||||
|
|
||||||
from manga_ocr_dev.env import MANGA109_ROOT
|
|
||||||
|
|
||||||
|
|
||||||
def get_books():
|
|
||||||
root = MANGA109_ROOT / 'Manga109s_released_2021_02_28'
|
|
||||||
books = (root / 'books.txt').read_text().splitlines()
|
|
||||||
books = pd.DataFrame({
|
|
||||||
'book': books,
|
|
||||||
'annotations': [str(root / 'annotations' / f'{book}.xml') for book in books],
|
|
||||||
'images': [str(root / 'images' / book) for book in books],
|
|
||||||
})
|
|
||||||
|
|
||||||
return books
|
|
||||||
|
|
||||||
|
|
||||||
def export_frames():
|
|
||||||
books = get_books()
|
|
||||||
|
|
||||||
data = []
|
|
||||||
for book in tqdm(books.itertuples(), total=len(books)):
|
|
||||||
tree = ET.parse(book.annotations)
|
|
||||||
root = tree.getroot()
|
|
||||||
for page in root.findall('./pages/page'):
|
|
||||||
for frame in page.findall('./frame'):
|
|
||||||
row = {}
|
|
||||||
row['book'] = book.book
|
|
||||||
row['page_index'] = int(page.attrib['index'])
|
|
||||||
row['page_path'] = str(Path(book.images) / f'{row["page_index"]:03d}.jpg')
|
|
||||||
row['page_width'] = int(page.attrib['width'])
|
|
||||||
row['page_height'] = int(page.attrib['height'])
|
|
||||||
row['id'] = frame.attrib['id']
|
|
||||||
row['xmin'] = int(frame.attrib['xmin'])
|
|
||||||
row['ymin'] = int(frame.attrib['ymin'])
|
|
||||||
row['xmax'] = int(frame.attrib['xmax'])
|
|
||||||
row['ymax'] = int(frame.attrib['ymax'])
|
|
||||||
data.append(row)
|
|
||||||
data = pd.DataFrame(data)
|
|
||||||
|
|
||||||
data.page_path = data.page_path.apply(lambda x: '/'.join(Path(x).parts[-4:]))
|
|
||||||
data.to_csv(MANGA109_ROOT / 'frames.csv', index=False)
|
|
||||||
|
|
||||||
|
|
||||||
def export_crops():
|
|
||||||
crops_root = MANGA109_ROOT / 'crops'
|
|
||||||
crops_root.mkdir(parents=True, exist_ok=True)
|
|
||||||
margin = 10
|
|
||||||
|
|
||||||
books = get_books()
|
|
||||||
|
|
||||||
data = []
|
|
||||||
for book in tqdm(books.itertuples(), total=len(books)):
|
|
||||||
tree = ET.parse(book.annotations)
|
|
||||||
root = tree.getroot()
|
|
||||||
for page in root.findall('./pages/page'):
|
|
||||||
for text in page.findall('./text'):
|
|
||||||
row = {}
|
|
||||||
row['book'] = book.book
|
|
||||||
row['page_index'] = int(page.attrib['index'])
|
|
||||||
row['page_path'] = str(Path(book.images) / f'{row["page_index"]:03d}.jpg')
|
|
||||||
row['page_width'] = int(page.attrib['width'])
|
|
||||||
row['page_height'] = int(page.attrib['height'])
|
|
||||||
row['id'] = text.attrib['id']
|
|
||||||
row['text'] = text.text
|
|
||||||
row['xmin'] = int(text.attrib['xmin'])
|
|
||||||
row['ymin'] = int(text.attrib['ymin'])
|
|
||||||
row['xmax'] = int(text.attrib['xmax'])
|
|
||||||
row['ymax'] = int(text.attrib['ymax'])
|
|
||||||
data.append(row)
|
|
||||||
data = pd.DataFrame(data)
|
|
||||||
|
|
||||||
n_test = int(0.1 * len(data))
|
|
||||||
data['split'] = 'train'
|
|
||||||
data.loc[data.sample(len(data)).iloc[:n_test].index, 'split'] = 'test'
|
|
||||||
|
|
||||||
data['crop_path'] = str(crops_root) + '\\' + data.id + '.png'
|
|
||||||
|
|
||||||
data.page_path = data.page_path.apply(lambda x: '/'.join(Path(x).parts[-4:]))
|
|
||||||
data.crop_path = data.crop_path.apply(lambda x: '/'.join(Path(x).parts[-2:]))
|
|
||||||
data.to_csv(MANGA109_ROOT / 'data.csv', index=False)
|
|
||||||
|
|
||||||
for page_path, boxes in tqdm(data.groupby('page_path'), total=data.page_path.nunique()):
|
|
||||||
img = cv2.imread(str(MANGA109_ROOT / page_path))
|
|
||||||
|
|
||||||
for box in boxes.itertuples():
|
|
||||||
xmin = max(box.xmin - margin, 0)
|
|
||||||
xmax = min(box.xmax + margin, img.shape[1])
|
|
||||||
ymin = max(box.ymin - margin, 0)
|
|
||||||
ymax = min(box.ymax + margin, img.shape[0])
|
|
||||||
crop = img[ymin:ymax, xmin:xmax]
|
|
||||||
out_path = (crops_root / box.id).with_suffix('.png')
|
|
||||||
cv2.imwrite(str(out_path), crop)
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':
|
|
||||||
export_frames()
|
|
||||||
export_crops()
|
|
||||||
@@ -1,9 +0,0 @@
|
|||||||
from pathlib import Path
|
|
||||||
|
|
||||||
ASSETS_PATH = Path(__file__).parent.parent / 'assets'
|
|
||||||
|
|
||||||
FONTS_ROOT = Path('~/data/jp_fonts').expanduser()
|
|
||||||
DATA_SYNTHETIC_ROOT = Path('~/data/manga/synthetic').expanduser()
|
|
||||||
BACKGROUND_DIR = Path('~/data/manga/Manga109s/background').expanduser()
|
|
||||||
MANGA109_ROOT = Path('~/data/manga/Manga109s').expanduser()
|
|
||||||
TRAIN_ROOT = Path('~/data/manga/out').expanduser()
|
|
||||||
@@ -1,25 +0,0 @@
|
|||||||
datasets
|
|
||||||
jiwer
|
|
||||||
torchinfo
|
|
||||||
transformers>=4.12.5
|
|
||||||
unidic-lite
|
|
||||||
ipadic
|
|
||||||
mecab-python3
|
|
||||||
fugashi
|
|
||||||
matplotlib
|
|
||||||
numpy
|
|
||||||
opencv-python
|
|
||||||
pandas
|
|
||||||
Pillow
|
|
||||||
pytest
|
|
||||||
scikit-image
|
|
||||||
scikit-learn
|
|
||||||
scipy
|
|
||||||
torch
|
|
||||||
torchvision
|
|
||||||
tqdm
|
|
||||||
wandb
|
|
||||||
fire
|
|
||||||
budou
|
|
||||||
albumentations>=1.1
|
|
||||||
html2image
|
|
||||||
@@ -1,38 +0,0 @@
|
|||||||
# Synthetic data generator
|
|
||||||
|
|
||||||
Generation of synthetic image-text pairs imitating Japanese manga for the purpose of training OCR.
|
|
||||||
|
|
||||||
Features:
|
|
||||||
- using either text from corpus or random text
|
|
||||||
- text overlaid on background images
|
|
||||||
- drawing text bubbles
|
|
||||||
- various fonts and font styles
|
|
||||||
- variety of text layouts:
|
|
||||||
- vertical and horizontal text
|
|
||||||
- multi-line text
|
|
||||||
- [furigana](https://en.wikipedia.org/wiki/Furigana) (added randomly)
|
|
||||||
- [tate chū yoko](https://www.w3.org/International/articles/vertical-text/#tcy)
|
|
||||||
|
|
||||||
|
|
||||||
Text rendering is done with the usage of [html2image](https://github.com/vgalin/html2image),
|
|
||||||
which is a wrapper around Chrome/Chromium browser's headless mode.
|
|
||||||
It's not too elegant of a solution, and it is very slow, but it only needs to be run once,
|
|
||||||
and when parallelized, processing time is manageable (~17 min per 10000 images on a 16-thread machine).
|
|
||||||
|
|
||||||
The upside of this approach is that a quite complex problem of typesetting and text rendering
|
|
||||||
(especially when dealing with both horizontal and vertical text) is offloaded to
|
|
||||||
the browser engine, keeping the codebase relatively simple and extendable.
|
|
||||||
|
|
||||||
High-level generation pipeline is as follows:
|
|
||||||
1. Preprocess text (truncate and/or split into lines, add random furigana).
|
|
||||||
2. Render text on a transparent background, using HTML engine.
|
|
||||||
3. Select background image from backgrounds dataset.
|
|
||||||
4. Overlay the text on the background, optionally drawing a bubble around the text.
|
|
||||||
|
|
||||||
# Examples
|
|
||||||
|
|
||||||
## Images generated with text from [CC-100 Japanese corpus](https://data.statmt.org/cc-100/)
|
|
||||||

|
|
||||||
|
|
||||||
## Images generated with random text
|
|
||||||

|
|
||||||
@@ -1,198 +0,0 @@
|
|||||||
import budou
|
|
||||||
import numpy as np
|
|
||||||
import pandas as pd
|
|
||||||
|
|
||||||
from manga_ocr_dev.env import ASSETS_PATH, FONTS_ROOT
|
|
||||||
from manga_ocr_dev.synthetic_data_generator.renderer import Renderer
|
|
||||||
from manga_ocr_dev.synthetic_data_generator.utils import get_font_meta, get_charsets, is_ascii, is_kanji
|
|
||||||
|
|
||||||
|
|
||||||
class SyntheticDataGenerator:
|
|
||||||
def __init__(self):
|
|
||||||
self.vocab, self.hiragana, self.katakana = get_charsets()
|
|
||||||
self.len_to_p = pd.read_csv(ASSETS_PATH / 'len_to_p.csv')
|
|
||||||
self.parser = budou.get_parser('tinysegmenter')
|
|
||||||
self.fonts_df, self.font_map = get_font_meta()
|
|
||||||
self.font_labels, self.font_p = self.get_font_labels_prob()
|
|
||||||
self.renderer = Renderer()
|
|
||||||
|
|
||||||
def process(self, text=None, override_css_params=None):
|
|
||||||
"""
|
|
||||||
Generate image, text pair. Use source text if provided, otherwise generate random text.
|
|
||||||
"""
|
|
||||||
|
|
||||||
if override_css_params is None:
|
|
||||||
override_css_params = {}
|
|
||||||
|
|
||||||
if text is None:
|
|
||||||
# if using random text, choose font first,
|
|
||||||
# and then generate text using only characters supported by that font
|
|
||||||
if 'font_path' not in override_css_params:
|
|
||||||
font_path = self.get_random_font()
|
|
||||||
vocab = self.font_map[font_path]
|
|
||||||
override_css_params['font_path'] = font_path
|
|
||||||
else:
|
|
||||||
font_path = override_css_params['font_path']
|
|
||||||
vocab = self.font_map[font_path]
|
|
||||||
|
|
||||||
words = self.get_random_words(vocab)
|
|
||||||
|
|
||||||
else:
|
|
||||||
text = text.replace(' ', ' ')
|
|
||||||
text = text.replace('…', '...')
|
|
||||||
words = self.split_into_words(text)
|
|
||||||
|
|
||||||
lines = self.words_to_lines(words)
|
|
||||||
text_gt = '\n'.join(lines)
|
|
||||||
|
|
||||||
if 'font_path' not in override_css_params:
|
|
||||||
override_css_params['font_path'] = self.get_random_font(text_gt)
|
|
||||||
|
|
||||||
font_path = override_css_params.get('font_path')
|
|
||||||
if font_path:
|
|
||||||
vocab = self.font_map.get(font_path)
|
|
||||||
|
|
||||||
# remove unsupported characters
|
|
||||||
lines = [''.join([c for c in line if c in vocab]) for line in lines]
|
|
||||||
text_gt = '\n'.join(lines)
|
|
||||||
else:
|
|
||||||
vocab = None
|
|
||||||
|
|
||||||
if np.random.random() < 0.5:
|
|
||||||
word_prob = np.random.choice([0.33, 1.0], p=[0.3, 0.7])
|
|
||||||
|
|
||||||
lines = [self.add_random_furigana(line, word_prob, vocab) for line in lines]
|
|
||||||
|
|
||||||
img, params = self.renderer.render(lines, override_css_params)
|
|
||||||
return img, text_gt, params
|
|
||||||
|
|
||||||
def get_random_words(self, vocab):
|
|
||||||
vocab = list(vocab)
|
|
||||||
max_text_len = np.random.choice(self.len_to_p.len, p=self.len_to_p.p)
|
|
||||||
|
|
||||||
words = []
|
|
||||||
text_len = 0
|
|
||||||
while True:
|
|
||||||
word = ''.join(np.random.choice(vocab, np.random.randint(1, 4)))
|
|
||||||
words.append(word)
|
|
||||||
text_len += len(word)
|
|
||||||
if text_len + len(word) >= max_text_len:
|
|
||||||
break
|
|
||||||
|
|
||||||
return words
|
|
||||||
|
|
||||||
def split_into_words(self, text):
|
|
||||||
max_text_len = np.random.choice(self.len_to_p.len, p=self.len_to_p.p)
|
|
||||||
|
|
||||||
words = []
|
|
||||||
text_len = 0
|
|
||||||
for chunk in self.parser.parse(text)['chunks']:
|
|
||||||
words.append(chunk.word)
|
|
||||||
text_len += len(chunk.word)
|
|
||||||
if text_len + len(chunk.word) >= max_text_len:
|
|
||||||
break
|
|
||||||
|
|
||||||
return words
|
|
||||||
|
|
||||||
def words_to_lines(self, words):
|
|
||||||
text = ''.join(words)
|
|
||||||
|
|
||||||
max_num_lines = 10
|
|
||||||
min_line_len = len(text) // max_num_lines
|
|
||||||
max_line_len = 20
|
|
||||||
max_line_len = np.clip(np.random.poisson(6), min_line_len, max_line_len)
|
|
||||||
lines = []
|
|
||||||
line = ''
|
|
||||||
for word in words:
|
|
||||||
line += word
|
|
||||||
if len(line) >= max_line_len:
|
|
||||||
lines.append(line)
|
|
||||||
line = ''
|
|
||||||
if line:
|
|
||||||
lines.append(line)
|
|
||||||
|
|
||||||
return lines
|
|
||||||
|
|
||||||
def add_random_furigana(self, line, word_prob=1.0, vocab=None):
|
|
||||||
if vocab is None:
|
|
||||||
vocab = self.vocab
|
|
||||||
else:
|
|
||||||
vocab = list(vocab)
|
|
||||||
|
|
||||||
processed = ''
|
|
||||||
kanji_group = ''
|
|
||||||
ascii_group = ''
|
|
||||||
for i, c in enumerate(line):
|
|
||||||
|
|
||||||
if is_kanji(c):
|
|
||||||
c_type = 'kanji'
|
|
||||||
kanji_group += c
|
|
||||||
elif is_ascii(c):
|
|
||||||
c_type = 'ascii'
|
|
||||||
ascii_group += c
|
|
||||||
else:
|
|
||||||
c_type = 'other'
|
|
||||||
|
|
||||||
if c_type != 'kanji' or i == len(line) - 1:
|
|
||||||
if kanji_group:
|
|
||||||
if np.random.uniform() < word_prob:
|
|
||||||
furigana_len = int(np.clip(np.random.normal(1.5, 0.5), 1, 4) * len(kanji_group))
|
|
||||||
char_source = np.random.choice(['hiragana', 'katakana', 'all'], p=[0.8, 0.15, 0.05])
|
|
||||||
char_source = {
|
|
||||||
'hiragana': self.hiragana,
|
|
||||||
'katakana': self.katakana,
|
|
||||||
'all': vocab
|
|
||||||
}[char_source]
|
|
||||||
furigana = ''.join(np.random.choice(char_source, furigana_len))
|
|
||||||
processed += f'<ruby>{kanji_group}<rt>{furigana}</rt></ruby>'
|
|
||||||
else:
|
|
||||||
processed += kanji_group
|
|
||||||
kanji_group = ''
|
|
||||||
|
|
||||||
if c_type != 'ascii' or i == len(line) - 1:
|
|
||||||
if ascii_group:
|
|
||||||
if len(ascii_group) <= 3 and np.random.uniform() < 0.7:
|
|
||||||
processed += f'<span style="text-combine-upright: all">{ascii_group}</span>'
|
|
||||||
else:
|
|
||||||
processed += ascii_group
|
|
||||||
ascii_group = ''
|
|
||||||
|
|
||||||
if c_type == 'other':
|
|
||||||
processed += c
|
|
||||||
|
|
||||||
return processed
|
|
||||||
|
|
||||||
def is_font_supporting_text(self, font_path, text):
|
|
||||||
chars = self.font_map[font_path]
|
|
||||||
for c in text:
|
|
||||||
if c.isspace():
|
|
||||||
continue
|
|
||||||
if c not in chars:
|
|
||||||
return False
|
|
||||||
return True
|
|
||||||
|
|
||||||
def get_font_labels_prob(self):
|
|
||||||
labels = {
|
|
||||||
'common': 0.2,
|
|
||||||
'regular': 0.75,
|
|
||||||
'special': 0.05,
|
|
||||||
}
|
|
||||||
labels = {k: labels[k] for k in self.fonts_df.label.unique()}
|
|
||||||
p = np.array(list(labels.values()))
|
|
||||||
p = p / p.sum()
|
|
||||||
labels = list(labels.keys())
|
|
||||||
return labels, p
|
|
||||||
|
|
||||||
def get_random_font(self, text=None):
|
|
||||||
label = np.random.choice(self.font_labels, p=self.font_p)
|
|
||||||
df = self.fonts_df[self.fonts_df.label == label]
|
|
||||||
|
|
||||||
if text is None:
|
|
||||||
return df.sample(1).iloc[0].font_path
|
|
||||||
|
|
||||||
valid_mask = df.font_path.apply(lambda x: self.is_font_supporting_text(x, text))
|
|
||||||
if not valid_mask.any():
|
|
||||||
# if text contains characters not supported by any font, just pick some of the more capable fonts
|
|
||||||
valid_mask = (df.num_chars >= 4000)
|
|
||||||
|
|
||||||
return str(FONTS_ROOT / df[valid_mask].sample(1).iloc[0].font_path)
|
|
||||||
@@ -1,265 +0,0 @@
|
|||||||
import os
|
|
||||||
import uuid
|
|
||||||
|
|
||||||
import albumentations as A
|
|
||||||
import cv2
|
|
||||||
import numpy as np
|
|
||||||
from html2image import Html2Image
|
|
||||||
|
|
||||||
from manga_ocr_dev.env import BACKGROUND_DIR
|
|
||||||
from manga_ocr_dev.synthetic_data_generator.utils import get_background_df
|
|
||||||
|
|
||||||
|
|
||||||
class Renderer:
|
|
||||||
def __init__(self):
|
|
||||||
self.hti = Html2Image()
|
|
||||||
self.background_df = get_background_df(BACKGROUND_DIR)
|
|
||||||
self.max_size = 600
|
|
||||||
|
|
||||||
def render(self, lines, override_css_params=None):
|
|
||||||
img, params = self.render_text(lines, override_css_params)
|
|
||||||
img = self.render_background(img)
|
|
||||||
img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
|
|
||||||
img = A.LongestMaxSize(self.max_size)(image=img)['image']
|
|
||||||
return img, params
|
|
||||||
|
|
||||||
def render_text(self, lines, override_css_params=None):
|
|
||||||
"""Render text on transparent background and return as BGRA image."""
|
|
||||||
|
|
||||||
params = self.get_random_css_params()
|
|
||||||
if override_css_params:
|
|
||||||
params.update(override_css_params)
|
|
||||||
|
|
||||||
css = get_css(**params)
|
|
||||||
|
|
||||||
# this is just a rough estimate, image is cropped later anyway
|
|
||||||
size = (
|
|
||||||
int(max(len(line) for line in lines) * params['font_size'] * 1.5),
|
|
||||||
int(len(lines) * params['font_size'] * (3 + params['line_height'])),
|
|
||||||
)
|
|
||||||
if params['vertical']:
|
|
||||||
size = size[::-1]
|
|
||||||
html = self.lines_to_html(lines)
|
|
||||||
|
|
||||||
filename = str(uuid.uuid4()) + '.png'
|
|
||||||
self.hti.screenshot(html_str=html, css_str=css, save_as=filename, size=size)
|
|
||||||
img = cv2.imread(filename, cv2.IMREAD_UNCHANGED)
|
|
||||||
os.remove(filename)
|
|
||||||
return img, params
|
|
||||||
|
|
||||||
@staticmethod
|
|
||||||
def get_random_css_params():
|
|
||||||
params = {
|
|
||||||
'font_size': 48,
|
|
||||||
'vertical': True if np.random.rand() < 0.7 else False,
|
|
||||||
'line_height': 0.5,
|
|
||||||
'background_color': 'transparent',
|
|
||||||
'text_color': 'black',
|
|
||||||
}
|
|
||||||
|
|
||||||
if np.random.rand() < 0.7:
|
|
||||||
params['text_orientation'] = 'upright'
|
|
||||||
|
|
||||||
stroke_variant = np.random.choice(['stroke', 'shadow', 'none'], p=[0.8, 0.15, 0.05])
|
|
||||||
if stroke_variant == 'stroke':
|
|
||||||
params['stroke_size'] = np.random.choice([1, 2, 3, 4, 8])
|
|
||||||
params['stroke_color'] = 'white'
|
|
||||||
elif stroke_variant == 'shadow':
|
|
||||||
params['shadow_size'] = np.random.choice([2, 5, 10])
|
|
||||||
params['shadow_color'] = 'white' if np.random.rand() < 0.8 else 'black',
|
|
||||||
elif stroke_variant == 'none':
|
|
||||||
pass
|
|
||||||
|
|
||||||
return params
|
|
||||||
|
|
||||||
def render_background(self, img):
|
|
||||||
"""Add background and/or text bubble to a BGRA image, crop and return as BGR image."""
|
|
||||||
draw_bubble = np.random.random() < 0.7
|
|
||||||
|
|
||||||
m0 = int(min(img.shape[:2]) * 0.3)
|
|
||||||
img = crop_by_alpha(img, m0)
|
|
||||||
|
|
||||||
background_path = self.background_df.sample(1).iloc[0].path
|
|
||||||
background = cv2.imread(background_path)
|
|
||||||
|
|
||||||
t = [
|
|
||||||
A.HorizontalFlip(),
|
|
||||||
A.RandomRotate90(),
|
|
||||||
A.InvertImg(),
|
|
||||||
A.RandomBrightnessContrast((-0.2, 0.4), (-0.8, -0.3), p=0.5 if draw_bubble else 1),
|
|
||||||
A.Blur((3, 5), p=0.3),
|
|
||||||
A.Resize(img.shape[0], img.shape[1]),
|
|
||||||
]
|
|
||||||
|
|
||||||
background = A.Compose(t)(image=background)['image']
|
|
||||||
|
|
||||||
if not draw_bubble:
|
|
||||||
if np.random.rand() < 0.5:
|
|
||||||
img[:, :, :3] = 255 - img[:, :, :3]
|
|
||||||
|
|
||||||
else:
|
|
||||||
radius = np.random.uniform(0.7, 1.)
|
|
||||||
thickness = np.random.choice([1, 2, 3])
|
|
||||||
alpha = np.random.randint(60, 100)
|
|
||||||
sigma = np.random.randint(10, 15)
|
|
||||||
|
|
||||||
ymin = m0 - int(min(img.shape[:2]) * np.random.uniform(0.07, 0.12))
|
|
||||||
ymax = img.shape[0] - m0 + int(min(img.shape[:2]) * np.random.uniform(0.07, 0.12))
|
|
||||||
xmin = m0 - int(min(img.shape[:2]) * np.random.uniform(0.07, 0.12))
|
|
||||||
xmax = img.shape[1] - m0 + int(min(img.shape[:2]) * np.random.uniform(0.07, 0.12))
|
|
||||||
|
|
||||||
bubble_fill_color = (255, 255, 255, 255)
|
|
||||||
bubble_contour_color = (0, 0, 0, 255)
|
|
||||||
bubble = np.zeros((img.shape[0], img.shape[1], 4), dtype=np.uint8)
|
|
||||||
bubble = rounded_rectangle(bubble, (xmin, ymin), (xmax, ymax), radius=radius, color=bubble_fill_color,
|
|
||||||
thickness=-1)
|
|
||||||
bubble = rounded_rectangle(bubble, (xmin, ymin), (xmax, ymax), radius=radius, color=bubble_contour_color,
|
|
||||||
thickness=thickness)
|
|
||||||
|
|
||||||
t = [
|
|
||||||
A.ElasticTransform(alpha=alpha, sigma=sigma, alpha_affine=0, p=0.8),
|
|
||||||
]
|
|
||||||
bubble = A.Compose(t)(image=bubble)['image']
|
|
||||||
|
|
||||||
background = blend(bubble, background)
|
|
||||||
|
|
||||||
img = blend(img, background)
|
|
||||||
|
|
||||||
ymin = m0 - int(min(img.shape[:2]) * np.random.uniform(0.01, 0.2))
|
|
||||||
ymax = img.shape[0] - m0 + int(min(img.shape[:2]) * np.random.uniform(0.01, 0.2))
|
|
||||||
xmin = m0 - int(min(img.shape[:2]) * np.random.uniform(0.01, 0.2))
|
|
||||||
xmax = img.shape[1] - m0 + int(min(img.shape[:2]) * np.random.uniform(0.01, 0.2))
|
|
||||||
img = img[ymin:ymax, xmin:xmax]
|
|
||||||
return img
|
|
||||||
|
|
||||||
def lines_to_html(self, lines):
|
|
||||||
lines_str = '\n'.join(['<p>' + line + '</p>' for line in lines])
|
|
||||||
html = f"<html><body>\n{lines_str}\n</body></html>"
|
|
||||||
return html
|
|
||||||
|
|
||||||
|
|
||||||
def crop_by_alpha(img, margin):
|
|
||||||
y, x = np.where(img[:, :, 3] > 0)
|
|
||||||
ymin = y.min()
|
|
||||||
ymax = y.max()
|
|
||||||
xmin = x.min()
|
|
||||||
xmax = x.max()
|
|
||||||
img = img[ymin:ymax, xmin:xmax]
|
|
||||||
img = np.pad(img, ((margin, margin), (margin, margin), (0, 0)))
|
|
||||||
return img
|
|
||||||
|
|
||||||
|
|
||||||
def blend(img, background):
|
|
||||||
alpha = (img[:, :, 3] / 255)[:, :, np.newaxis]
|
|
||||||
img = img[:, :, :3]
|
|
||||||
img = (background * (1 - alpha) + img * alpha).astype(np.uint8)
|
|
||||||
return img
|
|
||||||
|
|
||||||
|
|
||||||
def rounded_rectangle(src, top_left, bottom_right, radius=1, color=255, thickness=1, line_type=cv2.LINE_AA):
|
|
||||||
"""From https://stackoverflow.com/a/60210706"""
|
|
||||||
|
|
||||||
# corners:
|
|
||||||
# p1 - p2
|
|
||||||
# | |
|
|
||||||
# p4 - p3
|
|
||||||
|
|
||||||
p1 = top_left
|
|
||||||
p2 = (bottom_right[0], top_left[1])
|
|
||||||
p3 = bottom_right
|
|
||||||
p4 = (top_left[0], bottom_right[1])
|
|
||||||
|
|
||||||
height = abs(bottom_right[1] - top_left[1])
|
|
||||||
width = abs(bottom_right[0] - top_left[0])
|
|
||||||
|
|
||||||
if radius > 1:
|
|
||||||
radius = 1
|
|
||||||
|
|
||||||
corner_radius = int(radius * (min(height, width) / 2))
|
|
||||||
|
|
||||||
if thickness < 0:
|
|
||||||
# big rect
|
|
||||||
top_left_main_rect = (int(p1[0] + corner_radius), int(p1[1]))
|
|
||||||
bottom_right_main_rect = (int(p3[0] - corner_radius), int(p3[1]))
|
|
||||||
|
|
||||||
top_left_rect_left = (p1[0], p1[1] + corner_radius)
|
|
||||||
bottom_right_rect_left = (p4[0] + corner_radius, p4[1] - corner_radius)
|
|
||||||
|
|
||||||
top_left_rect_right = (p2[0] - corner_radius, p2[1] + corner_radius)
|
|
||||||
bottom_right_rect_right = (p3[0], p3[1] - corner_radius)
|
|
||||||
|
|
||||||
all_rects = [
|
|
||||||
[top_left_main_rect, bottom_right_main_rect],
|
|
||||||
[top_left_rect_left, bottom_right_rect_left],
|
|
||||||
[top_left_rect_right, bottom_right_rect_right]]
|
|
||||||
|
|
||||||
[cv2.rectangle(src, rect[0], rect[1], color, thickness) for rect in all_rects]
|
|
||||||
|
|
||||||
# draw straight lines
|
|
||||||
cv2.line(src, (p1[0] + corner_radius, p1[1]), (p2[0] - corner_radius, p2[1]), color, abs(thickness), line_type)
|
|
||||||
cv2.line(src, (p2[0], p2[1] + corner_radius), (p3[0], p3[1] - corner_radius), color, abs(thickness), line_type)
|
|
||||||
cv2.line(src, (p3[0] - corner_radius, p4[1]), (p4[0] + corner_radius, p3[1]), color, abs(thickness), line_type)
|
|
||||||
cv2.line(src, (p4[0], p4[1] - corner_radius), (p1[0], p1[1] + corner_radius), color, abs(thickness), line_type)
|
|
||||||
|
|
||||||
# draw arcs
|
|
||||||
cv2.ellipse(src, (p1[0] + corner_radius, p1[1] + corner_radius), (corner_radius, corner_radius), 180.0, 0, 90,
|
|
||||||
color, thickness, line_type)
|
|
||||||
cv2.ellipse(src, (p2[0] - corner_radius, p2[1] + corner_radius), (corner_radius, corner_radius), 270.0, 0, 90,
|
|
||||||
color, thickness, line_type)
|
|
||||||
cv2.ellipse(src, (p3[0] - corner_radius, p3[1] - corner_radius), (corner_radius, corner_radius), 0.0, 0, 90, color,
|
|
||||||
thickness, line_type)
|
|
||||||
cv2.ellipse(src, (p4[0] + corner_radius, p4[1] - corner_radius), (corner_radius, corner_radius), 90.0, 0, 90, color,
|
|
||||||
thickness, line_type)
|
|
||||||
|
|
||||||
return src
|
|
||||||
|
|
||||||
|
|
||||||
def get_css(
|
|
||||||
font_size,
|
|
||||||
font_path,
|
|
||||||
vertical=True,
|
|
||||||
background_color='white',
|
|
||||||
text_color='black',
|
|
||||||
shadow_size=0,
|
|
||||||
shadow_color='black',
|
|
||||||
stroke_size=0,
|
|
||||||
stroke_color='black',
|
|
||||||
letter_spacing=None,
|
|
||||||
line_height=0.5,
|
|
||||||
text_orientation=None,
|
|
||||||
):
|
|
||||||
styles = [
|
|
||||||
f"background-color: {background_color};",
|
|
||||||
f"font-size: {font_size}px;",
|
|
||||||
f"color: {text_color};",
|
|
||||||
"font-family: custom;",
|
|
||||||
f"line-height: {line_height};",
|
|
||||||
"margin: 20px;",
|
|
||||||
]
|
|
||||||
|
|
||||||
if text_orientation:
|
|
||||||
styles.append(f"text-orientation: {text_orientation};")
|
|
||||||
|
|
||||||
if vertical:
|
|
||||||
styles.append("writing-mode: vertical-rl;")
|
|
||||||
|
|
||||||
if shadow_size > 0:
|
|
||||||
styles.append(f"text-shadow: 0 0 {shadow_size}px {shadow_color};")
|
|
||||||
|
|
||||||
if stroke_size > 0:
|
|
||||||
# stroke is simulated by shadow overlaid multiple times
|
|
||||||
styles.extend([
|
|
||||||
f"text-shadow: " + ','.join([f"0 0 {stroke_size}px {stroke_color}"] * 10 * stroke_size) + ";",
|
|
||||||
"-webkit-font-smoothing: antialiased;",
|
|
||||||
])
|
|
||||||
|
|
||||||
if letter_spacing:
|
|
||||||
styles.append(f"letter-spacing: {letter_spacing}em;")
|
|
||||||
|
|
||||||
font_path = font_path.replace('\\', '/')
|
|
||||||
|
|
||||||
styles_str = '\n'.join(styles)
|
|
||||||
css = ""
|
|
||||||
css += '\n@font-face {\nfont-family: custom;\nsrc: url("' + font_path + '");\n}\n'
|
|
||||||
css += "body {\n" + styles_str + "\n}"
|
|
||||||
return css
|
|
||||||
@@ -1,64 +0,0 @@
|
|||||||
import traceback
|
|
||||||
from pathlib import Path
|
|
||||||
|
|
||||||
import cv2
|
|
||||||
import fire
|
|
||||||
import pandas as pd
|
|
||||||
from tqdm.contrib.concurrent import thread_map
|
|
||||||
|
|
||||||
from manga_ocr_dev.env import FONTS_ROOT, DATA_SYNTHETIC_ROOT
|
|
||||||
from manga_ocr_dev.synthetic_data_generator.generator import SyntheticDataGenerator
|
|
||||||
|
|
||||||
generator = SyntheticDataGenerator()
|
|
||||||
|
|
||||||
|
|
||||||
def f(args):
|
|
||||||
try:
|
|
||||||
i, source, id_, text = args
|
|
||||||
filename = f'{id_}.jpg'
|
|
||||||
img, text_gt, params = generator.process(text)
|
|
||||||
|
|
||||||
cv2.imwrite(str(OUT_DIR / filename), img)
|
|
||||||
|
|
||||||
font_path = Path(params['font_path']).relative_to(FONTS_ROOT)
|
|
||||||
ret = source, id_, text_gt, params['vertical'], str(font_path)
|
|
||||||
return ret
|
|
||||||
|
|
||||||
except Exception as e:
|
|
||||||
print(traceback.format_exc())
|
|
||||||
|
|
||||||
|
|
||||||
def run(package=0, n_random=1000, n_limit=None, max_workers=16):
|
|
||||||
"""
|
|
||||||
:param package: number of data package to generate
|
|
||||||
:param n_random: how many samples with random text to generate
|
|
||||||
:param n_limit: limit number of generated samples (for debugging)
|
|
||||||
:param max_workers: max number of workers
|
|
||||||
"""
|
|
||||||
|
|
||||||
package = f'{package:04d}'
|
|
||||||
lines = pd.read_csv(DATA_SYNTHETIC_ROOT / f'lines/{package}.csv')
|
|
||||||
random_lines = pd.DataFrame({
|
|
||||||
'source': 'random',
|
|
||||||
'id': [f'random_{package}_{i}' for i in range(n_random)],
|
|
||||||
'line': None
|
|
||||||
})
|
|
||||||
lines = pd.concat([lines, random_lines], ignore_index=True)
|
|
||||||
if n_limit:
|
|
||||||
lines = lines.sample(n_limit)
|
|
||||||
args = [(i, *values) for i, values in enumerate(lines.values)]
|
|
||||||
|
|
||||||
global OUT_DIR
|
|
||||||
OUT_DIR = DATA_SYNTHETIC_ROOT / 'img' / package
|
|
||||||
OUT_DIR.mkdir(parents=True, exist_ok=True)
|
|
||||||
|
|
||||||
data = thread_map(f, args, max_workers=max_workers, desc=f'Processing package {package}')
|
|
||||||
|
|
||||||
data = pd.DataFrame(data, columns=['source', 'id', 'text', 'vertical', 'font_path'])
|
|
||||||
meta_path = DATA_SYNTHETIC_ROOT / f'meta/{package}.csv'
|
|
||||||
meta_path.parent.mkdir(parents=True, exist_ok=True)
|
|
||||||
data.to_csv(meta_path, index=False)
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':
|
|
||||||
fire.Fire(run)
|
|
||||||
@@ -1,72 +0,0 @@
|
|||||||
import PIL
|
|
||||||
import numpy as np
|
|
||||||
import pandas as pd
|
|
||||||
from PIL import ImageDraw, ImageFont
|
|
||||||
from fontTools.ttLib import TTFont
|
|
||||||
from tqdm.contrib.concurrent import process_map
|
|
||||||
|
|
||||||
from manga_ocr_dev.env import ASSETS_PATH, FONTS_ROOT
|
|
||||||
|
|
||||||
vocab = pd.read_csv(ASSETS_PATH / 'vocab.csv').char.values
|
|
||||||
|
|
||||||
|
|
||||||
def has_glyph(font, glyph):
|
|
||||||
for table in font['cmap'].tables:
|
|
||||||
if ord(glyph) in table.cmap.keys():
|
|
||||||
return True
|
|
||||||
return False
|
|
||||||
|
|
||||||
|
|
||||||
def process(font_path):
|
|
||||||
"""
|
|
||||||
Get supported characters list for a given font.
|
|
||||||
Font metadata is not always reliable, so try to render each character and see if anything shows up.
|
|
||||||
Still not perfect, because sometimes unsupported characters show up as rectangles.
|
|
||||||
"""
|
|
||||||
|
|
||||||
try:
|
|
||||||
font_path = str(font_path)
|
|
||||||
ttfont = TTFont(font_path)
|
|
||||||
pil_font = ImageFont.truetype(font_path, 24)
|
|
||||||
|
|
||||||
supported_chars = []
|
|
||||||
|
|
||||||
for char in vocab:
|
|
||||||
if not has_glyph(ttfont, char):
|
|
||||||
continue
|
|
||||||
|
|
||||||
image = PIL.Image.new('L', (40, 40), 255)
|
|
||||||
draw = ImageDraw.Draw(image)
|
|
||||||
draw.text((10, 0), char, 0, font=pil_font)
|
|
||||||
if (np.array(image) != 255).sum() == 0:
|
|
||||||
continue
|
|
||||||
|
|
||||||
supported_chars.append(char)
|
|
||||||
|
|
||||||
supported_chars = ''.join(supported_chars)
|
|
||||||
except Exception as e:
|
|
||||||
print(f'Error while processing {font_path}: {e}')
|
|
||||||
supported_chars = ''
|
|
||||||
|
|
||||||
return supported_chars
|
|
||||||
|
|
||||||
|
|
||||||
def main():
|
|
||||||
path_in = FONTS_ROOT
|
|
||||||
out_path = ASSETS_PATH / 'fonts.csv'
|
|
||||||
|
|
||||||
suffixes = {'.TTF', '.otf', '.ttc', '.ttf'}
|
|
||||||
font_paths = [path for path in path_in.glob('**/*') if
|
|
||||||
path.suffix in suffixes]
|
|
||||||
|
|
||||||
data = process_map(process, font_paths, max_workers=16)
|
|
||||||
|
|
||||||
font_paths = [str(path.relative_to(FONTS_ROOT)) for path in font_paths]
|
|
||||||
data = pd.DataFrame({'font_path': font_paths, 'supported_chars': data})
|
|
||||||
data['num_chars'] = data.supported_chars.str.len()
|
|
||||||
data['label'] = 'regular'
|
|
||||||
data.to_csv(out_path, index=False)
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':
|
|
||||||
main()
|
|
||||||
@@ -1,54 +0,0 @@
|
|||||||
import pandas as pd
|
|
||||||
import unicodedata
|
|
||||||
|
|
||||||
from manga_ocr_dev.env import ASSETS_PATH, FONTS_ROOT
|
|
||||||
|
|
||||||
|
|
||||||
def get_background_df(background_dir):
|
|
||||||
background_df = []
|
|
||||||
for path in background_dir.iterdir():
|
|
||||||
ymin, ymax, xmin, xmax = [int(v) for v in path.stem.split('_')[-4:]]
|
|
||||||
h = ymax - ymin
|
|
||||||
w = xmax - xmin
|
|
||||||
ratio = w / h
|
|
||||||
|
|
||||||
background_df.append({
|
|
||||||
'path': str(path),
|
|
||||||
'h': h,
|
|
||||||
'w': w,
|
|
||||||
'ratio': ratio,
|
|
||||||
})
|
|
||||||
background_df = pd.DataFrame(background_df)
|
|
||||||
return background_df
|
|
||||||
|
|
||||||
|
|
||||||
def is_kanji(ch):
|
|
||||||
return 'CJK UNIFIED IDEOGRAPH' in unicodedata.name(ch)
|
|
||||||
|
|
||||||
|
|
||||||
def is_hiragana(ch):
|
|
||||||
return 'HIRAGANA' in unicodedata.name(ch)
|
|
||||||
|
|
||||||
|
|
||||||
def is_katakana(ch):
|
|
||||||
return 'KATAKANA' in unicodedata.name(ch)
|
|
||||||
|
|
||||||
|
|
||||||
def is_ascii(ch):
|
|
||||||
return ord(ch) < 128
|
|
||||||
|
|
||||||
|
|
||||||
def get_charsets(vocab_path=None):
|
|
||||||
if vocab_path is None:
|
|
||||||
vocab_path = ASSETS_PATH / 'vocab.csv'
|
|
||||||
vocab = pd.read_csv(vocab_path).char.values
|
|
||||||
hiragana = vocab[[is_hiragana(c) for c in vocab]][:-6]
|
|
||||||
katakana = vocab[[is_katakana(c) for c in vocab]][3:]
|
|
||||||
return vocab, hiragana, katakana
|
|
||||||
|
|
||||||
|
|
||||||
def get_font_meta():
|
|
||||||
df = pd.read_csv(ASSETS_PATH / 'fonts.csv')
|
|
||||||
df.font_path = df.font_path.apply(lambda x: str(FONTS_ROOT / x))
|
|
||||||
font_map = {row.font_path: set(row.supported_chars) for row in df.dropna().itertuples()}
|
|
||||||
return df, font_map
|
|
||||||
@@ -1,165 +0,0 @@
|
|||||||
import albumentations as A
|
|
||||||
import cv2
|
|
||||||
import matplotlib.pyplot as plt
|
|
||||||
import numpy as np
|
|
||||||
import pandas as pd
|
|
||||||
import torch
|
|
||||||
from torch.utils.data import Dataset
|
|
||||||
|
|
||||||
from manga_ocr_dev.env import MANGA109_ROOT, DATA_SYNTHETIC_ROOT
|
|
||||||
|
|
||||||
|
|
||||||
class MangaDataset(Dataset):
    """Torch dataset mixing synthetic OCR crops with Manga109 crops.

    Each item is a dict with:
      - ``pixel_values``: preprocessed image tensor produced by ``processor``
      - ``labels``: token ids padded to ``max_target_length``, with PAD
        positions replaced by -100 so the loss ignores them
    """

    def __init__(self, processor, split, max_target_length, limit_size=None, augment=False, skip_packages=None):
        """Build the combined dataframe of (path, text) samples.

        Args:
            processor: TrOCR-style processor with ``tokenizer`` and an image
                feature extractor (called as ``processor(img, ...)``).
            split: which Manga109 split to load ('train'/'test'); synthetic
                data is always included regardless of split.
            max_target_length: tokenizer padding/truncation length.
            limit_size: optional cap on the number of rows (for debugging).
            augment: enable random albumentations transforms in __getitem__.
            skip_packages: iterable of synthetic package numbers to exclude
                (matched against zero-padded 4-digit CSV stems).
        """
        self.processor = processor
        self.max_target_length = max_target_length

        data = []

        print(f'Initializing dataset {split}...')

        if skip_packages is None:
            skip_packages = set()
        else:
            # synthetic packages are named 0000.csv, 0001.csv, ...
            skip_packages = {f'{x:04d}' for x in skip_packages}

        # synthetic data: one CSV of metadata per package, images in img/<package>/
        # assumes each CSV has at least 'id' and 'text' columns — TODO confirm
        for path in sorted((DATA_SYNTHETIC_ROOT / 'meta').glob('*.csv')):
            if path.stem in skip_packages:
                print(f'Skipping package {path}')
                continue
            if not (DATA_SYNTHETIC_ROOT / 'img' / path.stem).is_dir():
                print(f'Missing image data for package {path}, skipping')
                continue
            df = pd.read_csv(path)
            df = df.dropna()
            df['path'] = df.id.apply(lambda x: str(DATA_SYNTHETIC_ROOT / 'img' / path.stem / f'{x}.jpg'))
            df = df[['path', 'text']]
            df['synthetic'] = True
            data.append(df)

        # Manga109 crops: data.csv carries split assignment and crop paths
        df = pd.read_csv(MANGA109_ROOT / 'data.csv')
        df = df[df.split == split].reset_index(drop=True)
        df['path'] = df.crop_path.apply(lambda x: str(MANGA109_ROOT / x))
        df = df[['path', 'text']]
        df['synthetic'] = False
        data.append(df)

        data = pd.concat(data, ignore_index=True)

        if limit_size:
            data = data.iloc[:limit_size]
        self.data = data

        print(f'Dataset {split}: {len(self.data)}')

        self.augment = augment
        self.transform_medium, self.transform_heavy = self.get_transforms()

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        """Load, optionally augment, and encode one sample."""
        sample = self.data.loc[idx]
        text = sample.text

        if self.augment:
            # most samples get the medium pipeline, a small fraction the
            # heavy one, the remainder none (grayscale conversion only)
            medium_p = 0.8
            heavy_p = 0.02
            transform_variant = np.random.choice(['none', 'medium', 'heavy'],
                                                 p=[1 - medium_p - heavy_p, medium_p, heavy_p])
            transform = {
                'none': None,
                'medium': self.transform_medium,
                'heavy': self.transform_heavy,
            }[transform_variant]
        else:
            transform = None

        pixel_values = self.read_image(self.processor, sample.path, transform)
        labels = self.processor.tokenizer(text,
                                          padding="max_length",
                                          max_length=self.max_target_length,
                                          truncation=True).input_ids
        labels = np.array(labels)
        # important: make sure that PAD tokens are ignored by the loss function
        labels[labels == self.processor.tokenizer.pad_token_id] = -100

        encoding = {
            "pixel_values": pixel_values,
            "labels": torch.tensor(labels),
        }
        return encoding

    @staticmethod
    def read_image(processor, path, transform=None):
        """Read an image from disk, apply an albumentations transform,
        and run it through the processor's feature extractor.

        With no transform, images are still force-converted to grayscale so
        augmented and non-augmented samples share the same color space.
        """
        img = cv2.imread(str(path))

        if transform is None:
            transform = A.ToGray(always_apply=True)

        img = transform(image=img)['image']

        pixel_values = processor(img, return_tensors="pt").pixel_values
        # drop the batch dimension added by the processor
        return pixel_values.squeeze()

    @staticmethod
    def get_transforms():
        """Return (medium, heavy) albumentations pipelines.

        Both end with forced grayscale; the heavy pipeline uses stronger
        blur/noise/compression and brightness/contrast ranges.
        """
        t_medium = A.Compose([
            A.Rotate(5, border_mode=cv2.BORDER_REPLICATE, p=0.2),
            A.Perspective((0.01, 0.05), pad_mode=cv2.BORDER_REPLICATE, p=0.2),
            A.InvertImg(p=0.05),

            A.OneOf([
                A.Downscale(0.25, 0.5, interpolation=cv2.INTER_LINEAR),
                A.Downscale(0.25, 0.5, interpolation=cv2.INTER_NEAREST),
            ], p=0.1),
            A.Blur(p=0.2),
            A.Sharpen(p=0.2),
            A.RandomBrightnessContrast(p=0.5),
            A.GaussNoise((50, 200), p=0.3),
            A.ImageCompression(0, 30, p=0.1),
            A.ToGray(always_apply=True),
        ])

        t_heavy = A.Compose([
            A.Rotate(10, border_mode=cv2.BORDER_REPLICATE, p=0.2),
            A.Perspective((0.01, 0.05), pad_mode=cv2.BORDER_REPLICATE, p=0.2),
            A.InvertImg(p=0.05),

            A.OneOf([
                A.Downscale(0.1, 0.2, interpolation=cv2.INTER_LINEAR),
                A.Downscale(0.1, 0.2, interpolation=cv2.INTER_NEAREST),
            ], p=0.1),
            A.Blur((4, 9), p=0.5),
            A.Sharpen(p=0.5),
            A.RandomBrightnessContrast(0.8, 0.8, p=1),
            A.GaussNoise((1000, 10000), p=0.3),
            A.ImageCompression(0, 10, p=0.5),
            A.ToGray(always_apply=True),
        ])

        return t_medium, t_heavy
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':
    # Visual smoke test: repeatedly fetch the same sample with augmentation
    # enabled, decode its labels back to text, and display the image so the
    # augmentation pipelines can be inspected by eye.
    from manga_ocr_dev.training.get_model import get_processor
    from manga_ocr_dev.training.utils import tensor_to_image

    encoder_name = 'facebook/deit-tiny-patch16-224'
    decoder_name = 'cl-tohoku/bert-base-japanese-char-v2'

    max_length = 300

    processor = get_processor(encoder_name, decoder_name)
    ds = MangaDataset(processor, 'train', max_length, augment=True)

    for i in range(20):
        # index 0 on purpose: same sample, different random augmentation each time
        sample = ds[0]
        img = tensor_to_image(sample['pixel_values'])
        tokens = sample['labels']
        # undo the -100 loss-masking so the tokenizer can decode
        tokens[tokens == -100] = processor.tokenizer.pad_token_id
        text = ''.join(processor.decode(tokens, skip_special_tokens=True).split())

        print(f'{i}:\n{text}\n')
        plt.imshow(img)
        plt.show()
|
|
||||||
@@ -1,63 +0,0 @@
|
|||||||
from transformers import AutoConfig, AutoModelForCausalLM, AutoModel, TrOCRProcessor, VisionEncoderDecoderModel, \
|
|
||||||
AutoFeatureExtractor, AutoTokenizer, VisionEncoderDecoderConfig
|
|
||||||
|
|
||||||
|
|
||||||
class TrOCRProcessorCustom(TrOCRProcessor):
    """The only point of this class is to bypass type checks of base class.

    TrOCRProcessor's own __init__ validates the concrete classes of its
    components; this subclass skips that call so arbitrary feature
    extractor / tokenizer pairs (e.g. DeiT + Japanese BERT) can be combined.
    """

    def __init__(self, feature_extractor, tokenizer):
        # deliberately NOT calling super().__init__() — see class docstring
        self.feature_extractor = feature_extractor
        self.tokenizer = tokenizer
        # base class dispatches __call__ through current_processor
        self.current_processor = self.feature_extractor
|
|
||||||
|
|
||||||
|
|
||||||
def get_processor(encoder_name, decoder_name):
    """Create a TrOCR-style processor from independent encoder/decoder checkpoints.

    The feature extractor comes from the vision encoder checkpoint and the
    tokenizer from the text decoder checkpoint; they are combined via
    TrOCRProcessorCustom, which skips the base class's type checks.
    """
    return TrOCRProcessorCustom(
        AutoFeatureExtractor.from_pretrained(encoder_name),
        AutoTokenizer.from_pretrained(decoder_name),
    )
|
|
||||||
|
|
||||||
|
|
||||||
def get_model(encoder_name, decoder_name, max_length, num_decoder_layers=None):
    """Assemble a VisionEncoderDecoderModel from separate encoder/decoder configs.

    Both halves are built from config (random weights, no pretrained download).
    Optionally truncates the decoder to its last ``num_decoder_layers`` layers.

    Args:
        encoder_name: HF checkpoint name for the vision encoder config.
        decoder_name: HF checkpoint name for the causal-LM decoder config.
        max_length: generation/decoder max length.
        num_decoder_layers: if given, keep only this many final decoder layers.

    Returns:
        (model, processor) tuple.

    Raises:
        ValueError: if layer truncation is requested for an unsupported
            decoder model_type (only bert / roberta / xlm-roberta handled).
    """
    encoder_config = AutoConfig.from_pretrained(encoder_name)
    encoder_config.is_decoder = False
    encoder_config.add_cross_attention = False
    encoder = AutoModel.from_config(encoder_config)

    decoder_config = AutoConfig.from_pretrained(decoder_name)
    decoder_config.max_length = max_length
    decoder_config.is_decoder = True
    decoder_config.add_cross_attention = True
    decoder = AutoModelForCausalLM.from_config(decoder_config)

    if num_decoder_layers is not None:
        # keep the LAST n layers (closest to the output head)
        if decoder_config.model_type == 'bert':
            decoder.bert.encoder.layer = decoder.bert.encoder.layer[-num_decoder_layers:]
        elif decoder_config.model_type in ('roberta', 'xlm-roberta'):
            decoder.roberta.encoder.layer = decoder.roberta.encoder.layer[-num_decoder_layers:]
        else:
            raise ValueError(f'Unsupported model_type: {decoder_config.model_type}')

        # keep the config consistent with the truncated module list
        decoder_config.num_hidden_layers = num_decoder_layers

    config = VisionEncoderDecoderConfig.from_encoder_decoder_configs(encoder.config, decoder.config)
    config.tie_word_embeddings = False
    model = VisionEncoderDecoderModel(encoder=encoder, decoder=decoder, config=config)

    processor = get_processor(encoder_name, decoder_name)

    # set special tokens used for creating the decoder_input_ids from the labels
    model.config.decoder_start_token_id = processor.tokenizer.cls_token_id
    model.config.pad_token_id = processor.tokenizer.pad_token_id
    # make sure vocab size is set correctly
    model.config.vocab_size = model.config.decoder.vocab_size

    # set beam search parameters
    model.config.eos_token_id = processor.tokenizer.sep_token_id
    model.config.max_length = max_length
    model.config.early_stopping = True
    model.config.no_repeat_ngram_size = 3
    model.config.length_penalty = 2.0
    model.config.num_beams = 4

    return model, processor
|
|
||||||
@@ -1,32 +0,0 @@
|
|||||||
import numpy as np
|
|
||||||
from datasets import load_metric
|
|
||||||
|
|
||||||
|
|
||||||
class Metrics:
    """Evaluation metrics (CER + exact-match accuracy) for Seq2SeqTrainer.

    Texts are compared after stripping ALL whitespace, since the char-level
    tokenizer's decode inserts spaces between characters.
    """

    def __init__(self, processor):
        """
        Args:
            processor: processor providing ``batch_decode`` and ``tokenizer``
                (used for the PAD token id).
        """
        self.cer_metric = load_metric("cer")
        self.processor = processor

    def compute_metrics(self, pred):
        """Compute metrics from an EvalPrediction-like object.

        Args:
            pred: object with ``label_ids`` and ``predictions`` arrays.
                Note: ``label_ids`` is mutated in place (-100 -> PAD id).

        Returns:
            dict with 'cer' and 'accuracy'.
        """
        label_ids = pred.label_ids
        pred_ids = pred.predictions

        pred_str = self.processor.batch_decode(pred_ids, skip_special_tokens=True)
        # -100 marks loss-ignored positions; restore PAD so decoding works
        label_ids[label_ids == -100] = self.processor.tokenizer.pad_token_id
        label_str = self.processor.batch_decode(label_ids, skip_special_tokens=True)

        # remove all whitespace before comparison
        pred_str = np.array([''.join(text.split()) for text in pred_str])
        label_str = np.array([''.join(text.split()) for text in label_str])

        results = {}
        try:
            results['cer'] = self.cer_metric.compute(predictions=pred_str, references=label_str)
        except Exception as e:
            # best-effort: CER can fail on degenerate batches (e.g. empty
            # references); log the offending strings and report 0 so the
            # evaluation loop is not aborted
            print(e)
            print(pred_str)
            print(label_str)
            results['cer'] = 0
        results['accuracy'] = (pred_str == label_str).mean()

        return results
|
|
||||||
@@ -1,64 +0,0 @@
|
|||||||
import fire
|
|
||||||
import wandb
|
|
||||||
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments, default_data_collator
|
|
||||||
|
|
||||||
from manga_ocr_dev.env import TRAIN_ROOT
|
|
||||||
from manga_ocr_dev.training.dataset import MangaDataset
|
|
||||||
from manga_ocr_dev.training.get_model import get_model
|
|
||||||
from manga_ocr_dev.training.metrics import Metrics
|
|
||||||
|
|
||||||
|
|
||||||
def run(
    run_name='debug',
    encoder_name='facebook/deit-tiny-patch16-224',
    decoder_name='cl-tohoku/bert-base-japanese-char-v2',
    max_len=300,
    num_decoder_layers=2,
    batch_size=64,
    num_epochs=8,
    fp16=True,
):
    """Train the manga OCR encoder-decoder model with Seq2SeqTrainer.

    Args:
        run_name: wandb/trainer run name.
        encoder_name: vision encoder checkpoint name.
        decoder_name: text decoder checkpoint name.
        max_len: max token sequence length (targets and generation).
        num_decoder_layers: decoder truncated to its last n layers.
        batch_size: per-device train and eval batch size.
        num_epochs: number of training epochs.
        fp16: mixed-precision training and eval.
    """
    wandb.login()

    model, processor = get_model(encoder_name, decoder_name, max_len, num_decoder_layers)

    # keep package 0 for validation
    train_dataset = MangaDataset(processor, 'train', max_len, augment=True, skip_packages=[0])
    # eval uses ONLY synthetic package 0 (all others skipped), no augmentation
    eval_dataset = MangaDataset(processor, 'test', max_len, augment=False, skip_packages=range(1, 9999))

    metrics = Metrics(processor)

    training_args = Seq2SeqTrainingArguments(
        predict_with_generate=True,  # generate sequences for CER/accuracy metrics
        evaluation_strategy='steps',
        save_strategy='steps',
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        fp16=fp16,
        fp16_full_eval=fp16,
        dataloader_num_workers=16,
        output_dir=TRAIN_ROOT,
        logging_steps=10,
        save_steps=20000,
        eval_steps=20000,
        num_train_epochs=num_epochs,
        run_name=run_name
    )

    # instantiate trainer
    trainer = Seq2SeqTrainer(
        model=model,
        # feature_extractor passed as "tokenizer" so it gets saved with checkpoints
        tokenizer=processor.feature_extractor,
        args=training_args,
        compute_metrics=metrics.compute_metrics,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        data_collator=default_data_collator,
    )
    trainer.train()

    wandb.finish()
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':
    # expose run()'s keyword arguments as CLI flags via python-fire
    fire.Fire(run)
|
|
||||||
@@ -1,27 +0,0 @@
|
|||||||
import numpy as np
|
|
||||||
import torch
|
|
||||||
from torchinfo import summary
|
|
||||||
|
|
||||||
|
|
||||||
def encoder_summary(model, batch_size=4):
    """Print a torchinfo summary of the vision encoder.

    Feeds a dummy batch of square RGB images sized from the encoder config.
    """
    img_size = model.config.encoder.image_size
    return summary(model.encoder, input_size=(batch_size, 3, img_size, img_size), depth=3,
                   col_names=["output_size", "num_params", "mult_adds"], device='cpu')
|
|
||||||
|
|
||||||
|
|
||||||
def decoder_summary(model, batch_size=4):
    """Print a torchinfo summary of the text decoder.

    Builds dummy inputs for a single decoding step, with fake encoder hidden
    states for the cross-attention.
    """
    img_size = model.config.encoder.image_size
    # (img_size // 16) ** 2 + 1 — assumes 16x16 patches plus one CLS token
    # (matches *-patch16 encoders); TODO confirm for other encoders
    encoder_hidden_shape = (batch_size, (img_size // 16) ** 2 + 1, model.config.decoder.hidden_size)
    decoder_inputs = {
        'input_ids': torch.zeros(batch_size, 1, dtype=torch.int64),
        'attention_mask': torch.ones(batch_size, 1, dtype=torch.int64),
        'encoder_hidden_states': torch.rand(encoder_hidden_shape, dtype=torch.float32),
        'return_dict': False
    }
    return summary(model.decoder, input_data=decoder_inputs, depth=4,
                   col_names=["output_size", "num_params", "mult_adds"],
                   device='cpu')
|
|
||||||
|
|
||||||
|
|
||||||
def tensor_to_image(img):
    """Convert a CHW tensor with values in [-1, 1] to an HWC uint8 image array."""
    arr = img.cpu().numpy()
    # map [-1, 1] -> [0, 255]
    scaled = (arr + 1) / 2 * 255
    pixels = scaled.clip(0, 255).astype(np.uint8)
    # channels-first -> channels-last for plotting/saving
    return pixels.transpose(1, 2, 0)
|
|
||||||
@@ -12,3 +12,5 @@ google-cloud-vision
|
|||||||
azure-cognitiveservices-vision-computervision
|
azure-cognitiveservices-vision-computervision
|
||||||
pyobjc
|
pyobjc
|
||||||
pynput
|
pynput
|
||||||
|
easyocr
|
||||||
|
paddleocr
|
||||||
@@ -1,50 +0,0 @@
|
|||||||
[
|
|
||||||
{
|
|
||||||
"filename": "00.jpg",
|
|
||||||
"result": "素直にあやまるしか"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"filename": "01.jpg",
|
|
||||||
"result": "立川で見た〝穴〟の下の巨大な眼は:"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"filename": "02.jpg",
|
|
||||||
"result": "実戦剣術も一流です"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"filename": "03.jpg",
|
|
||||||
"result": "第30話重苦しい闇の奥で静かに呼吸づきながら"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"filename": "04.jpg",
|
|
||||||
"result": "きのうハンパーヶとって、ゴメン!!!"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"filename": "05.jpg",
|
|
||||||
"result": "ぎゃっ"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"filename": "06.jpg",
|
|
||||||
"result": "ピンポーーン"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"filename": "07.jpg",
|
|
||||||
"result": "LINK!私達7人の力でガノンの塔の結界をやぶります"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"filename": "08.jpg",
|
|
||||||
"result": "ファイアパンチ"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"filename": "09.jpg",
|
|
||||||
"result": "少し黙っている"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"filename": "10.jpg",
|
|
||||||
"result": "わかるかな〜?"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"filename": "11.jpg",
|
|
||||||
"result": "警察にも先生にも町中の人達に!!"
|
|
||||||
}
|
|
||||||
]
|
|
||||||
|
Before Width: | Height: | Size: 9.2 KiB |
|
Before Width: | Height: | Size: 34 KiB |
|
Before Width: | Height: | Size: 2.8 KiB |
|
Before Width: | Height: | Size: 18 KiB |
|
Before Width: | Height: | Size: 89 KiB |
|
Before Width: | Height: | Size: 3.8 KiB |
|
Before Width: | Height: | Size: 9.2 KiB |
|
Before Width: | Height: | Size: 15 KiB |
|
Before Width: | Height: | Size: 6.9 KiB |
|
Before Width: | Height: | Size: 6.2 KiB |
|
Before Width: | Height: | Size: 3.4 KiB |
|
Before Width: | Height: | Size: 15 KiB |
@@ -1,25 +0,0 @@
|
|||||||
import json
|
|
||||||
from pathlib import Path
|
|
||||||
|
|
||||||
from tqdm import tqdm
|
|
||||||
|
|
||||||
from manga_ocr import MangaOcr
|
|
||||||
|
|
||||||
TEST_DATA_ROOT = Path(__file__).parent / 'data'
|
|
||||||
|
|
||||||
|
|
||||||
def generate_expected_results():
    """Run MangaOcr on every bundled test image and save outputs as the
    regression baseline (data/expected_results.json) used by test_ocr."""
    mocr = MangaOcr()

    results = []

    # sorted() keeps the JSON order stable across runs/platforms
    for path in tqdm(sorted((TEST_DATA_ROOT / 'images').iterdir())):
        result = mocr(path)
        results.append({'filename': path.name, 'result': result})

    # ensure_ascii=False keeps Japanese text human-readable in the file
    (TEST_DATA_ROOT / 'expected_results.json').write_text(json.dumps(results, ensure_ascii=False, indent=2),
                                                          encoding='utf-8')
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':
    # regenerate the regression baseline after intentional model changes
    generate_expected_results()
|
|
||||||
@@ -1,16 +0,0 @@
|
|||||||
import json
|
|
||||||
from pathlib import Path
|
|
||||||
|
|
||||||
from manga_ocr import MangaOcr
|
|
||||||
|
|
||||||
TEST_DATA_ROOT = Path(__file__).parent / 'data'
|
|
||||||
|
|
||||||
|
|
||||||
def test_ocr():
    """Regression test: OCR output for each bundled image must exactly match
    the baseline stored in data/expected_results.json."""
    mocr = MangaOcr()

    expected_results = json.loads((TEST_DATA_ROOT / 'expected_results.json').read_text(encoding='utf-8'))

    for item in expected_results:
        result = mocr(TEST_DATA_ROOT / 'images' / item['filename'])
        assert result == item['result']
|
|
||||||