Add EasyOCR/PaddleOCR, remove unneeded stuff

AuroraWright
2023-12-07 22:43:16 +01:00
parent aae112529b
commit c0641918dd
60 changed files with 109 additions and 7196 deletions

View File

@@ -1 +0,0 @@
include assets/example.jpg

Binary files not shown: 15 image assets deleted (previews omitted; individual sizes ranged from 2.8 KiB to 405 KiB).

File diff suppressed because one or more lines are too long

View File

@@ -1,251 +0,0 @@
len,p
1,0.014734972701616804
2,0.05048222747773489
3,0.05624961536094529
4,0.05972235654062228
5,0.05244278768803355
6,0.05518581363248727
7,0.046578690556781516
8,0.04875025276280738
9,0.04442471185039959
10,0.04181356215327536
11,0.040160713186745564
12,0.041162972666449804
13,0.03785727473339019
14,0.03527250028573187
15,0.03326798132632338
16,0.0307271656277749
17,0.028151182929938547
18,0.025794993977651372
19,0.024731192249193356
20,0.021856290057410126
21,0.021135366572008825
22,0.019113264112956403
23,0.017073578154260045
24,0.015992192926158093
25,0.013952506967461734
26,0.012572202245412905
27,0.011288606771405713
28,0.009758842302383443
29,0.008993960067872309
30,0.008176327334429372
31,0.0072356101034788955
32,0.006919107109888081
33,0.005978389878937605
34,0.004712377904574347
35,0.00467721090528648
36,0.004220039914544191
37,0.003463949429855024
38,0.003358448431991419
39,0.003059528938044539
40,0.00263752494659012
41,0.0021891457056697995
42,0.002364980702109141
43,0.002013310709230458
44,0.0019341849608327545
45,0.0013099707234730928
46,0.0013363459729389942
47,0.001204469725609488
48,0.0011341357270337517
49,0.0008967584818406407
50,0.000914341981484575
51,0.000914341981484575
52,0.0007736739843331018
53,0.0006505894868255629
54,0.0006681729864694971
55,0.0005011297398521228
56,0.0005714637384278593
57,0.00044837924092032037
58,0.000395628741988518
59,0.00031650299359081436
60,0.00031650299359081436
61,0.0002813359943029461
62,0.00026375249465901196
63,0.0002725442444809791
64,0.00020221024590524252
65,0.00032529474341278143
66,0.00023737724519311078
67,0.00023737724519311078
68,0.00022858549537114374
69,0.00020221024590524252
70,0.00012308449750753894
71,0.00010550099786360479
72,8.791749821967066e-05
73,0.00012308449750753894
74,0.00011429274768557187
75,7.912574839770359e-05
76,3.516699928786826e-05
77,7.033399857573652e-05
78,8.791749821967066e-05
79,3.516699928786826e-05
80,2.6375249465901198e-05
81,6.154224875376947e-05
82,0.00011429274768557187
83,7.033399857573652e-05
84,5.2750498931802396e-05
85,4.395874910983533e-05
86,3.516699928786826e-05
87,8.791749821967066e-05
88,6.154224875376947e-05
89,1.758349964393413e-05
90,1.758349964393413e-05
91,1.758349964393413e-05
92,8.791749821967065e-06
93,3.516699928786826e-05
94,2.6375249465901198e-05
95,2.6375249465901198e-05
96,1.758349964393413e-05
97,1.758349964393413e-05
98,4.395874910983533e-05
99,4.395874910983533e-05
100,8.791749821967065e-06
101,8.791749821967065e-06
102,2.6375249465901198e-05
103,2.6375249465901198e-05
104,8.791749821967065e-06
105,8.791749821967065e-06
106,1.758349964393413e-05
107,1.758349964393413e-05
108,8.791749821967065e-06
109,8.791749821967065e-06
110,8.791749821967065e-06
111,8.791749821967065e-06
112,8.791749821967065e-06
113,8.791749821967065e-06
114,3.516699928786826e-05
115,2.6375249465901198e-05
116,2.6375249465901198e-05
117,2.6375249465901198e-05
118,8.791749821967065e-06
119,8.791749821967065e-06
120,8.791749821967065e-06
121,8.791749821967065e-06
122,1.758349964393413e-05
123,8.791749821967065e-06
124,8.791749821967065e-06
125,8.791749821967065e-06
126,1.758349964393413e-05
127,1.758349964393413e-05
128,1.758349964393413e-05
129,1.758349964393413e-05
130,1.758349964393413e-05
131,8.791749821967065e-06
132,1.758349964393413e-05
133,8.791749821967065e-06
134,8.791749821967065e-06
135,8.791749821967065e-06
136,8.791749821967065e-06
137,8.791749821967065e-06
138,8.791749821967065e-06
139,8.791749821967065e-06
140,8.791749821967065e-06
141,8.791749821967065e-06
142,8.791749821967065e-06
143,8.791749821967065e-06
144,8.791749821967065e-06
145,8.791749821967065e-06
146,8.791749821967065e-06
147,8.791749821967065e-06
148,8.791749821967065e-06
149,8.791749821967065e-06
150,8.791749821967065e-06
151,8.791749821967065e-06
152,8.791749821967065e-06
153,8.791749821967065e-06
154,8.791749821967065e-06
155,8.791749821967065e-06
156,8.791749821967065e-06
157,8.791749821967065e-06
158,8.791749821967065e-06
159,8.791749821967065e-06
160,8.791749821967065e-06
161,8.791749821967065e-06
162,8.791749821967065e-06
163,8.791749821967065e-06
164,8.791749821967065e-06
165,8.791749821967065e-06
166,8.791749821967065e-06
167,8.791749821967065e-06
168,8.791749821967065e-06
169,8.791749821967065e-06
170,8.791749821967065e-06
171,8.791749821967065e-06
172,8.791749821967065e-06
173,8.791749821967065e-06
174,8.791749821967065e-06
175,8.791749821967065e-06
176,8.791749821967065e-06
177,8.791749821967065e-06
178,8.791749821967065e-06
179,8.791749821967065e-06
180,8.791749821967065e-06
181,8.791749821967065e-06
182,8.791749821967065e-06
183,8.791749821967065e-06
184,8.791749821967065e-06
185,8.791749821967065e-06
186,8.791749821967065e-06
187,8.791749821967065e-06
188,8.791749821967065e-06
189,8.791749821967065e-06
190,8.791749821967065e-06
191,8.791749821967065e-06
192,8.791749821967065e-06
193,8.791749821967065e-06
194,8.791749821967065e-06
195,8.791749821967065e-06
196,8.791749821967065e-06
197,8.791749821967065e-06
198,8.791749821967065e-06
199,8.791749821967065e-06
200,8.791749821967065e-06
201,8.791749821967065e-06
202,8.791749821967065e-06
203,8.791749821967065e-06
204,8.791749821967065e-06
205,8.791749821967065e-06
206,8.791749821967065e-06
207,8.791749821967065e-06
208,8.791749821967065e-06
209,8.791749821967065e-06
210,8.791749821967065e-06
211,8.791749821967065e-06
212,8.791749821967065e-06
213,8.791749821967065e-06
214,8.791749821967065e-06
215,8.791749821967065e-06
216,8.791749821967065e-06
217,8.791749821967065e-06
218,8.791749821967065e-06
219,8.791749821967065e-06
220,8.791749821967065e-06
221,8.791749821967065e-06
222,8.791749821967065e-06
223,8.791749821967065e-06
224,8.791749821967065e-06
225,8.791749821967065e-06
226,8.791749821967065e-06
227,8.791749821967065e-06
228,8.791749821967065e-06
229,8.791749821967065e-06
230,8.791749821967065e-06
231,8.791749821967065e-06
232,8.791749821967065e-06
233,8.791749821967065e-06
234,8.791749821967065e-06
235,8.791749821967065e-06
236,8.791749821967065e-06
237,8.791749821967065e-06
238,8.791749821967065e-06
239,8.791749821967065e-06
240,8.791749821967065e-06
241,8.791749821967065e-06
242,8.791749821967065e-06
243,8.791749821967065e-06
244,8.791749821967065e-06
245,8.791749821967065e-06
246,8.791749821967065e-06
247,8.791749821967065e-06
248,8.791749821967065e-06
249,8.791749821967065e-06
250,8.791749821967065e-06

View File

@@ -1,6 +0,0 @@
source,id,line
cc-100,cc-100_446088,発展を遂げた貨幣経済に対して、後戻りする形の改革が、民衆に受け入れられるはずもありません。
cc-100,cc-100_446387,東京都渋谷区本町1丁目4−14 ホームヘルパー(パート:茂原)
cc-100,cc-100_446430,同時に、発表しあう場を増やしたいです。まず、自分の考えを発表するためには、しっかりと自分の考えを持っていなくてはいけません。そのために、ますますノートの必要性を感じることでしょう。また、質問や意見に答えることで、考えが深まります。友達の意見を聞くことが、より理解を深めることを実感してほしいです。
cc-100,cc-100_446493,※特典の数に限りがございますので、対象商品はお早めにお買い求めください。特典は無くなり次第終了となります。
cc-100,cc-100_446543,ハリウッドスターってもっと豪華な生活を送っているのかと思えば、キアヌ・リーブスってかなり質素なんですね。

File diff suppressed because it is too large.

View File

@@ -4,3 +4,5 @@ from manga_ocr.ocr import MangaOcr
from manga_ocr.ocr import GoogleVision
from manga_ocr.ocr import AppleVision
from manga_ocr.ocr import AzureComputerVision
from manga_ocr.ocr import EasyOCR
from manga_ocr.ocr import PaddleOCR

View File

@@ -10,6 +10,7 @@ import platform
import jaconv
import torch
import numpy as np
from PIL import Image
from loguru import logger
from transformers import ViTImageProcessor, AutoTokenizer, VisionEncoderDecoderModel
@@ -33,9 +34,19 @@ try:
except ImportError:
pass
try:
import easyocr
except ImportError:
pass
try:
from paddleocr import PaddleOCR as POCR
except ImportError:
pass
class MangaOcr:
def __init__(self, pretrained_model_name_or_path='kha-white/manga-ocr-base', force_cpu=False):
logger.info(f'Loading OCR model from {pretrained_model_name_or_path}')
logger.info(f'Loading Manga OCR model from {pretrained_model_name_or_path}')
self.processor = ViTImageProcessor.from_pretrained(pretrained_model_name_or_path)
self.tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path)
self.model = VisionEncoderDecoderModel.from_pretrained(pretrained_model_name_or_path)
@@ -222,6 +233,76 @@ class AzureComputerVision:
image_io.seek(0)
return image_io
class EasyOCR:
def __init__(self):
if 'easyocr' not in sys.modules:
logger.warning('easyocr not available, EasyOCR will not work!')
self.available = False
else:
logger.info('Loading EasyOCR model')
self.model = easyocr.Reader(['ja','en'])
self.available = True
logger.info('EasyOCR ready')
def __call__(self, img_or_path):
if not self.available:
return "Engine not available!"
if isinstance(img_or_path, str) or isinstance(img_or_path, Path):
img = Image.open(img_or_path)
elif isinstance(img_or_path, Image.Image):
img = img_or_path
else:
raise ValueError(f'img_or_path must be a path or PIL.Image, instead got: {img_or_path}')
res = ''
read_result = self.model.readtext(self._preprocess(img), detail=0)
for text in read_result:
res += text + ' '
x = post_process(res)
return x
def _preprocess(self, img):
image_bytes = io.BytesIO()
img.save(image_bytes, format=img.format)
return image_bytes.getvalue()
class PaddleOCR:
def __init__(self):
if 'paddleocr' not in sys.modules:
logger.warning('paddleocr not available, PaddleOCR will not work!')
self.available = False
else:
logger.info('Loading PaddleOCR model')
self.model = POCR(use_angle_cls=True, show_log=False, lang='japan')
self.available = True
logger.info('PaddleOCR ready')
def __call__(self, img_or_path):
if not self.available:
return "Engine not available!"
if isinstance(img_or_path, str) or isinstance(img_or_path, Path):
img = Image.open(img_or_path)
elif isinstance(img_or_path, Image.Image):
img = img_or_path
else:
raise ValueError(f'img_or_path must be a path or PIL.Image, instead got: {img_or_path}')
res = ''
read_results = self.model.ocr(self._preprocess(img), cls=True)
for read_result in read_results:
if read_result:
for text in read_result:
res += text[1][0] + ' '
x = post_process(res)
return x
def _preprocess(self, img):
return np.array(img.convert('RGB'))
def post_process(text):
text = ''.join(text.split())
@@ -229,4 +310,4 @@ def post_process(text):
text = re.sub('[・.]{2,}', lambda x: (x.end() - x.start()) * '.', text)
text = jaconv.h2z(text, ascii=True, digit=True)
return text
return text
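As a quick sanity check of the two engine classes added above, a minimal sketch (assuming `easyocr` and `paddleocr` are installed; the image path is a placeholder):

```python
# Not part of this commit: exercise the new engine classes directly.
from PIL import Image
from manga_ocr.ocr import EasyOCR, PaddleOCR

img = Image.open('crop.png')  # hypothetical sample crop
for engine in (EasyOCR(), PaddleOCR()):
    if engine.available:  # False when the backing package is missing
        print(type(engine).__name__, engine(img))
```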

View File

@@ -12,17 +12,7 @@ from PIL import UnidentifiedImageError
from loguru import logger
from pynput import keyboard
from manga_ocr import MangaOcr
from manga_ocr import GoogleVision
from manga_ocr import AppleVision
from manga_ocr import AzureComputerVision
engines = ['avision', 'gvision', 'azure', 'mangaocr']
def get_engine_name(engine):
engine_names = ['Apple Vision', 'Google Vision', 'Azure Computer Vision', 'Manga OCR']
return engine_names[engines.index(engine)]
from manga_ocr import *
def are_images_identical(img1, img2):
@@ -35,19 +25,12 @@ def are_images_identical(img1, img2):
return (img1.shape == img2.shape) and (img1 == img2).all()
def process_and_write_results(mocr, avision, gvision, azure, img_or_path, write_to, engine):
def process_and_write_results(engine_instance, engine_name, img_or_path, write_to):
t0 = time.time()
if engine == 'gvision':
text = gvision(img_or_path)
elif engine == 'avision':
text = avision(img_or_path)
elif engine == 'azure':
text = azure(img_or_path)
else:
text = mocr(img_or_path)
text = engine_instance(img_or_path)
t1 = time.time()
logger.opt(ansi=True).info(f"Text recognized in {t1 - t0:0.03f}s using <cyan>{get_engine_name(engine)}</cyan>: {text}")
logger.opt(ansi=True).info(f"Text recognized in {t1 - t0:0.03f}s using <cyan>{engine_name}</cyan>: {text}")
if write_to == 'clipboard':
pyperclip.copy(text)
@@ -81,7 +64,7 @@ def run(read_from='clipboard',
:param pretrained_model_name_or_path: Path to a trained model, either local or from Transformers' model hub.
:param force_cpu: If True, OCR will use CPU even if GPU is available.
:param delay_secs: How often to check for new images, in seconds.
:param engine: OCR engine to use. Available: "mangaocr", "gvision", "avision", "azure".
:param engine: OCR engine to use. Available: "mangaocr", "gvision", "avision", "azure", "easyocr", "paddleocr".
:param verbose: If True, unhides all warnings.
"""
@@ -93,10 +76,20 @@ def run(read_from='clipboard',
}
logger.configure(**config)
mocr = MangaOcr(pretrained_model_name_or_path, force_cpu)
avision = AppleVision()
gvision = GoogleVision()
azure = AzureComputerVision()
avision = AppleVision()
mangaocr = MangaOcr(pretrained_model_name_or_path, force_cpu)
easyocr = EasyOCR()
paddleocr = PaddleOCR()
engines = ['avision', 'gvision', 'azure', 'mangaocr', 'easyocr', 'paddleocr']
engine_names = ['Apple Vision', 'Google Vision', 'Azure Computer Vision', 'Manga OCR', 'EasyOCR', 'PaddleOCR']
engine_instances = [avision, gvision, azure, mangaocr, easyocr, paddleocr]
engine_keys = 'agvmeo'
def get_engine_name(engine):
return engine_names[engines.index(engine)]
if engine not in engines:
msg = 'Unknown OCR engine!'
@@ -203,8 +196,8 @@ def run(read_from='clipboard',
engine = engines[engines.index(engine) + 1]
logger.opt(ansi=True).info(f"Switched to <cyan>{get_engine_name(engine)}</cyan>!")
elif user_input.lower() in 'agvm':
new_engine = engines['agvm'.find(user_input.lower())]
elif user_input.lower() in engine_keys:
new_engine = engines[engine_keys.find(user_input.lower())]
if engine != new_engine:
engine = new_engine
logger.opt(ansi=True).info(f"Switched to <cyan>{get_engine_name(engine)}</cyan>!")
@@ -228,7 +221,7 @@ def run(read_from='clipboard',
logger.warning('Error while reading from clipboard ({})'.format(error))
else:
if not just_unpaused and isinstance(img, Image.Image) and not are_images_identical(img, old_img):
process_and_write_results(mocr, avision, gvision, azure, img, write_to, engine)
process_and_write_results(engine_instances[engines.index(engine)], get_engine_name(engine), img, write_to)
if just_unpaused:
just_unpaused = False
@@ -244,7 +237,7 @@ def run(read_from='clipboard',
except (UnidentifiedImageError, OSError) as e:
logger.warning(f'Error while reading file {path}: {e}')
else:
process_and_write_results(mocr, avision, gvision, azure, img, write_to, engine)
process_and_write_results(engine_instances[engines.index(engine)], get_engine_name(engine), img, write_to)
time.sleep(delay_secs)
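For reference, the single-letter engine switching introduced above is a positional mapping between `engine_keys` and `engines` (values copied from the diff):

```python
# Key-to-engine mapping implied by engine_keys/engines above.
engines = ['avision', 'gvision', 'azure', 'mangaocr', 'easyocr', 'paddleocr']
engine_keys = 'agvmeo'
key_to_engine = dict(zip(engine_keys, engines))
# {'a': 'avision', 'g': 'gvision', 'v': 'azure',
#  'm': 'mangaocr', 'e': 'easyocr', 'o': 'paddleocr'}
```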

View File

@@ -1,98 +0,0 @@
# Project structure
```
assets/ # assets (see description below)
manga_ocr/ # release code (inference only)
manga_ocr_dev/ # development code
env.py # global constants
data/ # data preprocessing
synthetic_data_generator/ # generation of synthetic image-text pairs
training/ # model training
```
## assets
### fonts.csv
csv with columns:
- font_path: path to font file, relative to `FONTS_ROOT`
- supported_chars: string of characters supported by this font
- num_chars: number of supported characters
- label: common/regular/special (used to sample regular fonts more often than special)
List of fonts with metadata used by the synthetic data generator.
The provided file is just an example; you have to generate a similar file for your own set of fonts
using the `manga_ocr_dev/synthetic_data_generator/scan_fonts.py` script.
Note that `label` will be filled with `regular` by default. You have to label your special fonts manually.
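A hedged sketch of generating `fonts.csv` with that script (module path follows the project structure above; it scans `FONTS_ROOT` and writes into `assets/`):

```python
# Scans fonts under FONTS_ROOT (see env.py) and writes assets/fonts.csv.
from manga_ocr_dev.synthetic_data_generator.scan_fonts import main

main()
```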
### lines_example.csv
csv with columns:
- source: source of text
- id: unique id of the line
- line: line from language corpus
Example of csv used for synthetic data generation.
### len_to_p.csv
csv with columns:
- len: length of text
- p: probability of text of this length occurring in manga
Used by the synthetic data generator to roughly match the natural distribution of text lengths.
Computed from the Manga109-s dataset.
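For illustration, this is how the table can be consumed, mirroring the generator code later in this diff (the relative path is an assumption):

```python
# Sample a target text length following the manga length distribution.
import numpy as np
import pandas as pd

len_to_p = pd.read_csv('assets/len_to_p.csv')  # columns: len, p
max_text_len = np.random.choice(len_to_p.len, p=len_to_p.p)
```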
### vocab.csv
List of all characters supported by tokenizer.
# Training OCR
`env.py` contains global constants used across the repo. Set your paths to data etc. there.
1. Download [Manga109-s](http://www.manga109.org/en/download_s.html) dataset.
2. Set `MANGA109_ROOT`, so that your directory structure looks like this:
```
<MANGA109_ROOT>/
Manga109s_released_2021_02_28/
annotations/
annotations.v2018.05.31/
images/
books.txt
readme.txt
```
3. Preprocess Manga109-s with `data/process_manga109s.py`
4. Optionally generate synthetic data (see below)
5. Train with `manga_ocr_dev/training/train.py`
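For step 5, a minimal sketch of invoking training programmatically (arguments mirror the defaults in `manga_ocr_dev/training/train.py` shown later in this diff; assumes the `env.py` paths and preprocessed data are in place):

```python
from manga_ocr_dev.training.train import run

run(run_name='debug', num_epochs=8, batch_size=64)
```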
# Synthetic data generation
Generated data is split into packages (named `0000`, `0001`, etc.) for easier management of a large dataset.
Each package is assumed to have a similar data distribution, so that a properly balanced dataset
can be built from any subset of packages.
The data generation pipeline assumes the following directory structure:
```
<DATA_SYNTHETIC_ROOT>/
img/ # generated images (output from generation pipeline)
0000/
0001/
...
lines/ # lines from corpus (input to generation pipeline)
0000.csv
0001.csv
...
meta/ # metadata (output from generation pipeline)
0000.csv
0001.csv
...
```
To use a language corpus for data generation, `lines/*.csv` files must be provided.
For a small example of such a file, see `assets/lines_example.csv`.
To generate synthetic data:
1. Generate backgrounds with `data/generate_backgrounds.py`.
2. Put your fonts in `<FONTS_ROOT>`.
3. Generate fonts metadata with `synthetic_data_generator/scan_fonts.py`.
4. Optionally manually label your fonts with `common/regular/special` labels.
5. Provide `<DATA_SYNTHETIC_ROOT>/lines/*.csv`.
6. Run `synthetic_data_generator/run_generate.py` for each package.
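Step 6 can also be driven from Python; a sketch using the parameters exposed by `run_generate.py` later in this diff:

```python
# Generate one package of synthetic data (paths come from manga_ocr_dev/env.py).
from manga_ocr_dev.synthetic_data_generator.run_generate import run

run(package=0, n_random=1000, max_workers=16)
```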

View File

@@ -1,85 +0,0 @@
from pathlib import Path
import cv2
import numpy as np
import pandas as pd
from tqdm import tqdm
from manga_ocr_dev.env import MANGA109_ROOT, BACKGROUND_DIR
def find_rectangle(mask, y, x, aspect_ratio_range=(0.33, 3.0)):
ymin_ = ymax_ = y
xmin_ = xmax_ = x
ymin = ymax = xmin = xmax = None
while True:
if ymin is None:
ymin_ -= 1
if ymin_ == 0 or mask[ymin_, xmin_:xmax_].any():
ymin = ymin_
if ymax is None:
ymax_ += 1
if ymax_ == mask.shape[0] - 1 or mask[ymax_, xmin_:xmax_].any():
ymax = ymax_
if xmin is None:
xmin_ -= 1
if xmin_ == 0 or mask[ymin_:ymax_, xmin_].any():
xmin = xmin_
if xmax is None:
xmax_ += 1
if xmax_ == mask.shape[1] - 1 or mask[ymin_:ymax_, xmax_].any():
xmax = xmax_
h = ymax_ - ymin_
w = xmax_ - xmin_
if h > 1 and w > 1:
ratio = w / h
if ratio < aspect_ratio_range[0] or ratio > aspect_ratio_range[1]:
return ymin_, ymax_, xmin_, xmax_
if None not in (ymin, ymax, xmin, xmax):
return ymin, ymax, xmin, xmax
def generate_backgrounds(crops_per_page=5, min_size=40):
data = pd.read_csv(MANGA109_ROOT / 'data.csv')
frames_df = pd.read_csv(MANGA109_ROOT / 'frames.csv')
BACKGROUND_DIR.mkdir(parents=True, exist_ok=True)
page_paths = data.page_path.unique()
for page_path in tqdm(page_paths):
page = cv2.imread(str(MANGA109_ROOT / page_path))
mask = np.zeros((page.shape[0], page.shape[1]), dtype=bool)
for row in data[data.page_path == page_path].itertuples():
mask[row.ymin:row.ymax, row.xmin:row.xmax] = True
frames_mask = np.zeros((page.shape[0], page.shape[1]), dtype=bool)
for row in frames_df[frames_df.page_path == page_path].itertuples():
frames_mask[row.ymin:row.ymax, row.xmin:row.xmax] = True
mask = mask | ~frames_mask
if mask.all():
continue
unmasked_points = np.stack(np.where(~mask), axis=1)
for i in range(crops_per_page):
p = unmasked_points[np.random.randint(0, unmasked_points.shape[0])]
y, x = p
ymin, ymax, xmin, xmax = find_rectangle(mask, y, x)
crop = page[ymin:ymax, xmin:xmax]
if crop.shape[0] >= min_size and crop.shape[1] >= min_size:
out_filename = '_'.join(
Path(page_path).with_suffix('').parts[-2:]) + f'_{ymin}_{ymax}_{xmin}_{xmax}.png'
cv2.imwrite(str(BACKGROUND_DIR / out_filename), crop)
if __name__ == '__main__':
generate_backgrounds()

View File

@@ -1,103 +0,0 @@
import xml.etree.ElementTree as ET
from pathlib import Path
import cv2
import pandas as pd
from tqdm import tqdm
from manga_ocr_dev.env import MANGA109_ROOT
def get_books():
root = MANGA109_ROOT / 'Manga109s_released_2021_02_28'
books = (root / 'books.txt').read_text().splitlines()
books = pd.DataFrame({
'book': books,
'annotations': [str(root / 'annotations' / f'{book}.xml') for book in books],
'images': [str(root / 'images' / book) for book in books],
})
return books
def export_frames():
books = get_books()
data = []
for book in tqdm(books.itertuples(), total=len(books)):
tree = ET.parse(book.annotations)
root = tree.getroot()
for page in root.findall('./pages/page'):
for frame in page.findall('./frame'):
row = {}
row['book'] = book.book
row['page_index'] = int(page.attrib['index'])
row['page_path'] = str(Path(book.images) / f'{row["page_index"]:03d}.jpg')
row['page_width'] = int(page.attrib['width'])
row['page_height'] = int(page.attrib['height'])
row['id'] = frame.attrib['id']
row['xmin'] = int(frame.attrib['xmin'])
row['ymin'] = int(frame.attrib['ymin'])
row['xmax'] = int(frame.attrib['xmax'])
row['ymax'] = int(frame.attrib['ymax'])
data.append(row)
data = pd.DataFrame(data)
data.page_path = data.page_path.apply(lambda x: '/'.join(Path(x).parts[-4:]))
data.to_csv(MANGA109_ROOT / 'frames.csv', index=False)
def export_crops():
crops_root = MANGA109_ROOT / 'crops'
crops_root.mkdir(parents=True, exist_ok=True)
margin = 10
books = get_books()
data = []
for book in tqdm(books.itertuples(), total=len(books)):
tree = ET.parse(book.annotations)
root = tree.getroot()
for page in root.findall('./pages/page'):
for text in page.findall('./text'):
row = {}
row['book'] = book.book
row['page_index'] = int(page.attrib['index'])
row['page_path'] = str(Path(book.images) / f'{row["page_index"]:03d}.jpg')
row['page_width'] = int(page.attrib['width'])
row['page_height'] = int(page.attrib['height'])
row['id'] = text.attrib['id']
row['text'] = text.text
row['xmin'] = int(text.attrib['xmin'])
row['ymin'] = int(text.attrib['ymin'])
row['xmax'] = int(text.attrib['xmax'])
row['ymax'] = int(text.attrib['ymax'])
data.append(row)
data = pd.DataFrame(data)
n_test = int(0.1 * len(data))
data['split'] = 'train'
data.loc[data.sample(len(data)).iloc[:n_test].index, 'split'] = 'test'
data['crop_path'] = str(crops_root) + '\\' + data.id + '.png'
data.page_path = data.page_path.apply(lambda x: '/'.join(Path(x).parts[-4:]))
data.crop_path = data.crop_path.apply(lambda x: '/'.join(Path(x).parts[-2:]))
data.to_csv(MANGA109_ROOT / 'data.csv', index=False)
for page_path, boxes in tqdm(data.groupby('page_path'), total=data.page_path.nunique()):
img = cv2.imread(str(MANGA109_ROOT / page_path))
for box in boxes.itertuples():
xmin = max(box.xmin - margin, 0)
xmax = min(box.xmax + margin, img.shape[1])
ymin = max(box.ymin - margin, 0)
ymax = min(box.ymax + margin, img.shape[0])
crop = img[ymin:ymax, xmin:xmax]
out_path = (crops_root / box.id).with_suffix('.png')
cv2.imwrite(str(out_path), crop)
if __name__ == '__main__':
export_frames()
export_crops()

View File

@@ -1,9 +0,0 @@
from pathlib import Path
ASSETS_PATH = Path(__file__).parent.parent / 'assets'
FONTS_ROOT = Path('~/data/jp_fonts').expanduser()
DATA_SYNTHETIC_ROOT = Path('~/data/manga/synthetic').expanduser()
BACKGROUND_DIR = Path('~/data/manga/Manga109s/background').expanduser()
MANGA109_ROOT = Path('~/data/manga/Manga109s').expanduser()
TRAIN_ROOT = Path('~/data/manga/out').expanduser()

View File

@@ -1,25 +0,0 @@
datasets
jiwer
torchinfo
transformers>=4.12.5
unidic-lite
ipadic
mecab-python3
fugashi
matplotlib
numpy
opencv-python
pandas
Pillow
pytest
scikit-image
scikit-learn
scipy
torch
torchvision
tqdm
wandb
fire
budou
albumentations>=1.1
html2image

View File

@@ -1,38 +0,0 @@
# Synthetic data generator
Generation of synthetic image-text pairs imitating Japanese manga for the purpose of training OCR.
Features:
- using either text from corpus or random text
- text overlaid on background images
- drawing text bubbles
- various fonts and font styles
- variety of text layouts:
- vertical and horizontal text
- multi-line text
- [furigana](https://en.wikipedia.org/wiki/Furigana) (added randomly)
- [tate chū yoko](https://www.w3.org/International/articles/vertical-text/#tcy)
Text rendering is done using [html2image](https://github.com/vgalin/html2image),
which is a wrapper around the Chrome/Chromium browser's headless mode.
It's not an elegant solution, and it is very slow, but it only needs to be run once,
and when parallelized, processing time is manageable (~17 min per 10000 images on a 16-thread machine).
The upside of this approach is that the fairly complex problem of typesetting and text rendering
(especially when dealing with both horizontal and vertical text) is offloaded to
the browser engine, keeping the codebase relatively simple and extensible.
High-level generation pipeline is as follows:
1. Preprocess text (truncate and/or split into lines, add random furigana).
2. Render text on a transparent background, using HTML engine.
3. Select background image from backgrounds dataset.
4. Overlay the text on the background, optionally drawing a bubble around the text.
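In code, this pipeline corresponds roughly to a single call to the generator defined later in this diff (the input string is just an illustrative corpus line; pass `None` for random text):

```python
from manga_ocr_dev.synthetic_data_generator.generator import SyntheticDataGenerator

generator = SyntheticDataGenerator()
img, text_gt, params = generator.process('同時に、発表しあう場を増やしたいです。')
```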
# Examples
## Images generated with text from [CC-100 Japanese corpus](https://data.statmt.org/cc-100/)
![](../../assets/examples/cc-100.jpg)
## Images generated with random text
![](../../assets/examples/random.jpg)

View File

@@ -1,198 +0,0 @@
import budou
import numpy as np
import pandas as pd
from manga_ocr_dev.env import ASSETS_PATH, FONTS_ROOT
from manga_ocr_dev.synthetic_data_generator.renderer import Renderer
from manga_ocr_dev.synthetic_data_generator.utils import get_font_meta, get_charsets, is_ascii, is_kanji
class SyntheticDataGenerator:
def __init__(self):
self.vocab, self.hiragana, self.katakana = get_charsets()
self.len_to_p = pd.read_csv(ASSETS_PATH / 'len_to_p.csv')
self.parser = budou.get_parser('tinysegmenter')
self.fonts_df, self.font_map = get_font_meta()
self.font_labels, self.font_p = self.get_font_labels_prob()
self.renderer = Renderer()
def process(self, text=None, override_css_params=None):
"""
Generate image, text pair. Use source text if provided, otherwise generate random text.
"""
if override_css_params is None:
override_css_params = {}
if text is None:
# if using random text, choose font first,
# and then generate text using only characters supported by that font
if 'font_path' not in override_css_params:
font_path = self.get_random_font()
vocab = self.font_map[font_path]
override_css_params['font_path'] = font_path
else:
font_path = override_css_params['font_path']
vocab = self.font_map[font_path]
words = self.get_random_words(vocab)
else:
text = text.replace(' ', ' ')
text = text.replace('', '...')
words = self.split_into_words(text)
lines = self.words_to_lines(words)
text_gt = '\n'.join(lines)
if 'font_path' not in override_css_params:
override_css_params['font_path'] = self.get_random_font(text_gt)
font_path = override_css_params.get('font_path')
if font_path:
vocab = self.font_map.get(font_path)
# remove unsupported characters
lines = [''.join([c for c in line if c in vocab]) for line in lines]
text_gt = '\n'.join(lines)
else:
vocab = None
if np.random.random() < 0.5:
word_prob = np.random.choice([0.33, 1.0], p=[0.3, 0.7])
lines = [self.add_random_furigana(line, word_prob, vocab) for line in lines]
img, params = self.renderer.render(lines, override_css_params)
return img, text_gt, params
def get_random_words(self, vocab):
vocab = list(vocab)
max_text_len = np.random.choice(self.len_to_p.len, p=self.len_to_p.p)
words = []
text_len = 0
while True:
word = ''.join(np.random.choice(vocab, np.random.randint(1, 4)))
words.append(word)
text_len += len(word)
if text_len + len(word) >= max_text_len:
break
return words
def split_into_words(self, text):
max_text_len = np.random.choice(self.len_to_p.len, p=self.len_to_p.p)
words = []
text_len = 0
for chunk in self.parser.parse(text)['chunks']:
words.append(chunk.word)
text_len += len(chunk.word)
if text_len + len(chunk.word) >= max_text_len:
break
return words
def words_to_lines(self, words):
text = ''.join(words)
max_num_lines = 10
min_line_len = len(text) // max_num_lines
max_line_len = 20
max_line_len = np.clip(np.random.poisson(6), min_line_len, max_line_len)
lines = []
line = ''
for word in words:
line += word
if len(line) >= max_line_len:
lines.append(line)
line = ''
if line:
lines.append(line)
return lines
def add_random_furigana(self, line, word_prob=1.0, vocab=None):
if vocab is None:
vocab = self.vocab
else:
vocab = list(vocab)
processed = ''
kanji_group = ''
ascii_group = ''
for i, c in enumerate(line):
if is_kanji(c):
c_type = 'kanji'
kanji_group += c
elif is_ascii(c):
c_type = 'ascii'
ascii_group += c
else:
c_type = 'other'
if c_type != 'kanji' or i == len(line) - 1:
if kanji_group:
if np.random.uniform() < word_prob:
furigana_len = int(np.clip(np.random.normal(1.5, 0.5), 1, 4) * len(kanji_group))
char_source = np.random.choice(['hiragana', 'katakana', 'all'], p=[0.8, 0.15, 0.05])
char_source = {
'hiragana': self.hiragana,
'katakana': self.katakana,
'all': vocab
}[char_source]
furigana = ''.join(np.random.choice(char_source, furigana_len))
processed += f'<ruby>{kanji_group}<rt>{furigana}</rt></ruby>'
else:
processed += kanji_group
kanji_group = ''
if c_type != 'ascii' or i == len(line) - 1:
if ascii_group:
if len(ascii_group) <= 3 and np.random.uniform() < 0.7:
processed += f'<span style="text-combine-upright: all">{ascii_group}</span>'
else:
processed += ascii_group
ascii_group = ''
if c_type == 'other':
processed += c
return processed
def is_font_supporting_text(self, font_path, text):
chars = self.font_map[font_path]
for c in text:
if c.isspace():
continue
if c not in chars:
return False
return True
def get_font_labels_prob(self):
labels = {
'common': 0.2,
'regular': 0.75,
'special': 0.05,
}
labels = {k: labels[k] for k in self.fonts_df.label.unique()}
p = np.array(list(labels.values()))
p = p / p.sum()
labels = list(labels.keys())
return labels, p
def get_random_font(self, text=None):
label = np.random.choice(self.font_labels, p=self.font_p)
df = self.fonts_df[self.fonts_df.label == label]
if text is None:
return df.sample(1).iloc[0].font_path
valid_mask = df.font_path.apply(lambda x: self.is_font_supporting_text(x, text))
if not valid_mask.any():
# if text contains characters not supported by any font, just pick some of the more capable fonts
valid_mask = (df.num_chars >= 4000)
return str(FONTS_ROOT / df[valid_mask].sample(1).iloc[0].font_path)

View File

@@ -1,265 +0,0 @@
import os
import uuid
import albumentations as A
import cv2
import numpy as np
from html2image import Html2Image
from manga_ocr_dev.env import BACKGROUND_DIR
from manga_ocr_dev.synthetic_data_generator.utils import get_background_df
class Renderer:
def __init__(self):
self.hti = Html2Image()
self.background_df = get_background_df(BACKGROUND_DIR)
self.max_size = 600
def render(self, lines, override_css_params=None):
img, params = self.render_text(lines, override_css_params)
img = self.render_background(img)
img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
img = A.LongestMaxSize(self.max_size)(image=img)['image']
return img, params
def render_text(self, lines, override_css_params=None):
"""Render text on transparent background and return as BGRA image."""
params = self.get_random_css_params()
if override_css_params:
params.update(override_css_params)
css = get_css(**params)
# this is just a rough estimate, image is cropped later anyway
size = (
int(max(len(line) for line in lines) * params['font_size'] * 1.5),
int(len(lines) * params['font_size'] * (3 + params['line_height'])),
)
if params['vertical']:
size = size[::-1]
html = self.lines_to_html(lines)
filename = str(uuid.uuid4()) + '.png'
self.hti.screenshot(html_str=html, css_str=css, save_as=filename, size=size)
img = cv2.imread(filename, cv2.IMREAD_UNCHANGED)
os.remove(filename)
return img, params
@staticmethod
def get_random_css_params():
params = {
'font_size': 48,
'vertical': True if np.random.rand() < 0.7 else False,
'line_height': 0.5,
'background_color': 'transparent',
'text_color': 'black',
}
if np.random.rand() < 0.7:
params['text_orientation'] = 'upright'
stroke_variant = np.random.choice(['stroke', 'shadow', 'none'], p=[0.8, 0.15, 0.05])
if stroke_variant == 'stroke':
params['stroke_size'] = np.random.choice([1, 2, 3, 4, 8])
params['stroke_color'] = 'white'
elif stroke_variant == 'shadow':
params['shadow_size'] = np.random.choice([2, 5, 10])
params['shadow_color'] = 'white' if np.random.rand() < 0.8 else 'black'
elif stroke_variant == 'none':
pass
return params
def render_background(self, img):
"""Add background and/or text bubble to a BGRA image, crop and return as BGR image."""
draw_bubble = np.random.random() < 0.7
m0 = int(min(img.shape[:2]) * 0.3)
img = crop_by_alpha(img, m0)
background_path = self.background_df.sample(1).iloc[0].path
background = cv2.imread(background_path)
t = [
A.HorizontalFlip(),
A.RandomRotate90(),
A.InvertImg(),
A.RandomBrightnessContrast((-0.2, 0.4), (-0.8, -0.3), p=0.5 if draw_bubble else 1),
A.Blur((3, 5), p=0.3),
A.Resize(img.shape[0], img.shape[1]),
]
background = A.Compose(t)(image=background)['image']
if not draw_bubble:
if np.random.rand() < 0.5:
img[:, :, :3] = 255 - img[:, :, :3]
else:
radius = np.random.uniform(0.7, 1.)
thickness = np.random.choice([1, 2, 3])
alpha = np.random.randint(60, 100)
sigma = np.random.randint(10, 15)
ymin = m0 - int(min(img.shape[:2]) * np.random.uniform(0.07, 0.12))
ymax = img.shape[0] - m0 + int(min(img.shape[:2]) * np.random.uniform(0.07, 0.12))
xmin = m0 - int(min(img.shape[:2]) * np.random.uniform(0.07, 0.12))
xmax = img.shape[1] - m0 + int(min(img.shape[:2]) * np.random.uniform(0.07, 0.12))
bubble_fill_color = (255, 255, 255, 255)
bubble_contour_color = (0, 0, 0, 255)
bubble = np.zeros((img.shape[0], img.shape[1], 4), dtype=np.uint8)
bubble = rounded_rectangle(bubble, (xmin, ymin), (xmax, ymax), radius=radius, color=bubble_fill_color,
thickness=-1)
bubble = rounded_rectangle(bubble, (xmin, ymin), (xmax, ymax), radius=radius, color=bubble_contour_color,
thickness=thickness)
t = [
A.ElasticTransform(alpha=alpha, sigma=sigma, alpha_affine=0, p=0.8),
]
bubble = A.Compose(t)(image=bubble)['image']
background = blend(bubble, background)
img = blend(img, background)
ymin = m0 - int(min(img.shape[:2]) * np.random.uniform(0.01, 0.2))
ymax = img.shape[0] - m0 + int(min(img.shape[:2]) * np.random.uniform(0.01, 0.2))
xmin = m0 - int(min(img.shape[:2]) * np.random.uniform(0.01, 0.2))
xmax = img.shape[1] - m0 + int(min(img.shape[:2]) * np.random.uniform(0.01, 0.2))
img = img[ymin:ymax, xmin:xmax]
return img
def lines_to_html(self, lines):
lines_str = '\n'.join(['<p>' + line + '</p>' for line in lines])
html = f"<html><body>\n{lines_str}\n</body></html>"
return html
def crop_by_alpha(img, margin):
y, x = np.where(img[:, :, 3] > 0)
ymin = y.min()
ymax = y.max()
xmin = x.min()
xmax = x.max()
img = img[ymin:ymax, xmin:xmax]
img = np.pad(img, ((margin, margin), (margin, margin), (0, 0)))
return img
def blend(img, background):
alpha = (img[:, :, 3] / 255)[:, :, np.newaxis]
img = img[:, :, :3]
img = (background * (1 - alpha) + img * alpha).astype(np.uint8)
return img
def rounded_rectangle(src, top_left, bottom_right, radius=1, color=255, thickness=1, line_type=cv2.LINE_AA):
"""From https://stackoverflow.com/a/60210706"""
# corners:
# p1 - p2
# | |
# p4 - p3
p1 = top_left
p2 = (bottom_right[0], top_left[1])
p3 = bottom_right
p4 = (top_left[0], bottom_right[1])
height = abs(bottom_right[1] - top_left[1])
width = abs(bottom_right[0] - top_left[0])
if radius > 1:
radius = 1
corner_radius = int(radius * (min(height, width) / 2))
if thickness < 0:
# big rect
top_left_main_rect = (int(p1[0] + corner_radius), int(p1[1]))
bottom_right_main_rect = (int(p3[0] - corner_radius), int(p3[1]))
top_left_rect_left = (p1[0], p1[1] + corner_radius)
bottom_right_rect_left = (p4[0] + corner_radius, p4[1] - corner_radius)
top_left_rect_right = (p2[0] - corner_radius, p2[1] + corner_radius)
bottom_right_rect_right = (p3[0], p3[1] - corner_radius)
all_rects = [
[top_left_main_rect, bottom_right_main_rect],
[top_left_rect_left, bottom_right_rect_left],
[top_left_rect_right, bottom_right_rect_right]]
[cv2.rectangle(src, rect[0], rect[1], color, thickness) for rect in all_rects]
# draw straight lines
cv2.line(src, (p1[0] + corner_radius, p1[1]), (p2[0] - corner_radius, p2[1]), color, abs(thickness), line_type)
cv2.line(src, (p2[0], p2[1] + corner_radius), (p3[0], p3[1] - corner_radius), color, abs(thickness), line_type)
cv2.line(src, (p3[0] - corner_radius, p4[1]), (p4[0] + corner_radius, p3[1]), color, abs(thickness), line_type)
cv2.line(src, (p4[0], p4[1] - corner_radius), (p1[0], p1[1] + corner_radius), color, abs(thickness), line_type)
# draw arcs
cv2.ellipse(src, (p1[0] + corner_radius, p1[1] + corner_radius), (corner_radius, corner_radius), 180.0, 0, 90,
color, thickness, line_type)
cv2.ellipse(src, (p2[0] - corner_radius, p2[1] + corner_radius), (corner_radius, corner_radius), 270.0, 0, 90,
color, thickness, line_type)
cv2.ellipse(src, (p3[0] - corner_radius, p3[1] - corner_radius), (corner_radius, corner_radius), 0.0, 0, 90, color,
thickness, line_type)
cv2.ellipse(src, (p4[0] + corner_radius, p4[1] - corner_radius), (corner_radius, corner_radius), 90.0, 0, 90, color,
thickness, line_type)
return src
def get_css(
font_size,
font_path,
vertical=True,
background_color='white',
text_color='black',
shadow_size=0,
shadow_color='black',
stroke_size=0,
stroke_color='black',
letter_spacing=None,
line_height=0.5,
text_orientation=None,
):
styles = [
f"background-color: {background_color};",
f"font-size: {font_size}px;",
f"color: {text_color};",
"font-family: custom;",
f"line-height: {line_height};",
"margin: 20px;",
]
if text_orientation:
styles.append(f"text-orientation: {text_orientation};")
if vertical:
styles.append("writing-mode: vertical-rl;")
if shadow_size > 0:
styles.append(f"text-shadow: 0 0 {shadow_size}px {shadow_color};")
if stroke_size > 0:
# stroke is simulated by shadow overlaid multiple times
styles.extend([
f"text-shadow: " + ','.join([f"0 0 {stroke_size}px {stroke_color}"] * 10 * stroke_size) + ";",
"-webkit-font-smoothing: antialiased;",
])
if letter_spacing:
styles.append(f"letter-spacing: {letter_spacing}em;")
font_path = font_path.replace('\\', '/')
styles_str = '\n'.join(styles)
css = ""
css += '\n@font-face {\nfont-family: custom;\nsrc: url("' + font_path + '");\n}\n'
css += "body {\n" + styles_str + "\n}"
return css

View File

@@ -1,64 +0,0 @@
import traceback
from pathlib import Path
import cv2
import fire
import pandas as pd
from tqdm.contrib.concurrent import thread_map
from manga_ocr_dev.env import FONTS_ROOT, DATA_SYNTHETIC_ROOT
from manga_ocr_dev.synthetic_data_generator.generator import SyntheticDataGenerator
generator = SyntheticDataGenerator()
def f(args):
try:
i, source, id_, text = args
filename = f'{id_}.jpg'
img, text_gt, params = generator.process(text)
cv2.imwrite(str(OUT_DIR / filename), img)
font_path = Path(params['font_path']).relative_to(FONTS_ROOT)
ret = source, id_, text_gt, params['vertical'], str(font_path)
return ret
except Exception as e:
print(traceback.format_exc())
def run(package=0, n_random=1000, n_limit=None, max_workers=16):
"""
:param package: number of data package to generate
:param n_random: how many samples with random text to generate
:param n_limit: limit number of generated samples (for debugging)
:param max_workers: max number of workers
"""
package = f'{package:04d}'
lines = pd.read_csv(DATA_SYNTHETIC_ROOT / f'lines/{package}.csv')
random_lines = pd.DataFrame({
'source': 'random',
'id': [f'random_{package}_{i}' for i in range(n_random)],
'line': None
})
lines = pd.concat([lines, random_lines], ignore_index=True)
if n_limit:
lines = lines.sample(n_limit)
args = [(i, *values) for i, values in enumerate(lines.values)]
global OUT_DIR
OUT_DIR = DATA_SYNTHETIC_ROOT / 'img' / package
OUT_DIR.mkdir(parents=True, exist_ok=True)
data = thread_map(f, args, max_workers=max_workers, desc=f'Processing package {package}')
data = pd.DataFrame(data, columns=['source', 'id', 'text', 'vertical', 'font_path'])
meta_path = DATA_SYNTHETIC_ROOT / f'meta/{package}.csv'
meta_path.parent.mkdir(parents=True, exist_ok=True)
data.to_csv(meta_path, index=False)
if __name__ == '__main__':
fire.Fire(run)

View File

@@ -1,72 +0,0 @@
import PIL
import numpy as np
import pandas as pd
from PIL import ImageDraw, ImageFont
from fontTools.ttLib import TTFont
from tqdm.contrib.concurrent import process_map
from manga_ocr_dev.env import ASSETS_PATH, FONTS_ROOT
vocab = pd.read_csv(ASSETS_PATH / 'vocab.csv').char.values
def has_glyph(font, glyph):
for table in font['cmap'].tables:
if ord(glyph) in table.cmap.keys():
return True
return False
def process(font_path):
"""
Get supported characters list for a given font.
Font metadata is not always reliable, so try to render each character and see if anything shows up.
Still not perfect, because sometimes unsupported characters show up as rectangles.
"""
try:
font_path = str(font_path)
ttfont = TTFont(font_path)
pil_font = ImageFont.truetype(font_path, 24)
supported_chars = []
for char in vocab:
if not has_glyph(ttfont, char):
continue
image = PIL.Image.new('L', (40, 40), 255)
draw = ImageDraw.Draw(image)
draw.text((10, 0), char, 0, font=pil_font)
if (np.array(image) != 255).sum() == 0:
continue
supported_chars.append(char)
supported_chars = ''.join(supported_chars)
except Exception as e:
print(f'Error while processing {font_path}: {e}')
supported_chars = ''
return supported_chars
def main():
path_in = FONTS_ROOT
out_path = ASSETS_PATH / 'fonts.csv'
suffixes = {'.TTF', '.otf', '.ttc', '.ttf'}
font_paths = [path for path in path_in.glob('**/*') if
path.suffix in suffixes]
data = process_map(process, font_paths, max_workers=16)
font_paths = [str(path.relative_to(FONTS_ROOT)) for path in font_paths]
data = pd.DataFrame({'font_path': font_paths, 'supported_chars': data})
data['num_chars'] = data.supported_chars.str.len()
data['label'] = 'regular'
data.to_csv(out_path, index=False)
if __name__ == '__main__':
main()

View File

@@ -1,54 +0,0 @@
import pandas as pd
import unicodedata
from manga_ocr_dev.env import ASSETS_PATH, FONTS_ROOT
def get_background_df(background_dir):
background_df = []
for path in background_dir.iterdir():
ymin, ymax, xmin, xmax = [int(v) for v in path.stem.split('_')[-4:]]
h = ymax - ymin
w = xmax - xmin
ratio = w / h
background_df.append({
'path': str(path),
'h': h,
'w': w,
'ratio': ratio,
})
background_df = pd.DataFrame(background_df)
return background_df
def is_kanji(ch):
return 'CJK UNIFIED IDEOGRAPH' in unicodedata.name(ch)
def is_hiragana(ch):
return 'HIRAGANA' in unicodedata.name(ch)
def is_katakana(ch):
return 'KATAKANA' in unicodedata.name(ch)
def is_ascii(ch):
return ord(ch) < 128
def get_charsets(vocab_path=None):
if vocab_path is None:
vocab_path = ASSETS_PATH / 'vocab.csv'
vocab = pd.read_csv(vocab_path).char.values
hiragana = vocab[[is_hiragana(c) for c in vocab]][:-6]
katakana = vocab[[is_katakana(c) for c in vocab]][3:]
return vocab, hiragana, katakana
def get_font_meta():
df = pd.read_csv(ASSETS_PATH / 'fonts.csv')
df.font_path = df.font_path.apply(lambda x: str(FONTS_ROOT / x))
font_map = {row.font_path: set(row.supported_chars) for row in df.dropna().itertuples()}
return df, font_map

View File

@@ -1,165 +0,0 @@
import albumentations as A
import cv2
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch
from torch.utils.data import Dataset
from manga_ocr_dev.env import MANGA109_ROOT, DATA_SYNTHETIC_ROOT
class MangaDataset(Dataset):
def __init__(self, processor, split, max_target_length, limit_size=None, augment=False, skip_packages=None):
self.processor = processor
self.max_target_length = max_target_length
data = []
print(f'Initializing dataset {split}...')
if skip_packages is None:
skip_packages = set()
else:
skip_packages = {f'{x:04d}' for x in skip_packages}
for path in sorted((DATA_SYNTHETIC_ROOT / 'meta').glob('*.csv')):
if path.stem in skip_packages:
print(f'Skipping package {path}')
continue
if not (DATA_SYNTHETIC_ROOT / 'img' / path.stem).is_dir():
print(f'Missing image data for package {path}, skipping')
continue
df = pd.read_csv(path)
df = df.dropna()
df['path'] = df.id.apply(lambda x: str(DATA_SYNTHETIC_ROOT / 'img' / path.stem / f'{x}.jpg'))
df = df[['path', 'text']]
df['synthetic'] = True
data.append(df)
df = pd.read_csv(MANGA109_ROOT / 'data.csv')
df = df[df.split == split].reset_index(drop=True)
df['path'] = df.crop_path.apply(lambda x: str(MANGA109_ROOT / x))
df = df[['path', 'text']]
df['synthetic'] = False
data.append(df)
data = pd.concat(data, ignore_index=True)
if limit_size:
data = data.iloc[:limit_size]
self.data = data
print(f'Dataset {split}: {len(self.data)}')
self.augment = augment
self.transform_medium, self.transform_heavy = self.get_transforms()
def __len__(self):
return len(self.data)
def __getitem__(self, idx):
sample = self.data.loc[idx]
text = sample.text
if self.augment:
medium_p = 0.8
heavy_p = 0.02
transform_variant = np.random.choice(['none', 'medium', 'heavy'],
p=[1 - medium_p - heavy_p, medium_p, heavy_p])
transform = {
'none': None,
'medium': self.transform_medium,
'heavy': self.transform_heavy,
}[transform_variant]
else:
transform = None
pixel_values = self.read_image(self.processor, sample.path, transform)
labels = self.processor.tokenizer(text,
padding="max_length",
max_length=self.max_target_length,
truncation=True).input_ids
labels = np.array(labels)
# important: make sure that PAD tokens are ignored by the loss function
labels[labels == self.processor.tokenizer.pad_token_id] = -100
encoding = {
"pixel_values": pixel_values,
"labels": torch.tensor(labels),
}
return encoding
@staticmethod
def read_image(processor, path, transform=None):
img = cv2.imread(str(path))
if transform is None:
transform = A.ToGray(always_apply=True)
img = transform(image=img)['image']
pixel_values = processor(img, return_tensors="pt").pixel_values
return pixel_values.squeeze()
@staticmethod
def get_transforms():
t_medium = A.Compose([
A.Rotate(5, border_mode=cv2.BORDER_REPLICATE, p=0.2),
A.Perspective((0.01, 0.05), pad_mode=cv2.BORDER_REPLICATE, p=0.2),
A.InvertImg(p=0.05),
A.OneOf([
A.Downscale(0.25, 0.5, interpolation=cv2.INTER_LINEAR),
A.Downscale(0.25, 0.5, interpolation=cv2.INTER_NEAREST),
], p=0.1),
A.Blur(p=0.2),
A.Sharpen(p=0.2),
A.RandomBrightnessContrast(p=0.5),
A.GaussNoise((50, 200), p=0.3),
A.ImageCompression(0, 30, p=0.1),
A.ToGray(always_apply=True),
])
t_heavy = A.Compose([
A.Rotate(10, border_mode=cv2.BORDER_REPLICATE, p=0.2),
A.Perspective((0.01, 0.05), pad_mode=cv2.BORDER_REPLICATE, p=0.2),
A.InvertImg(p=0.05),
A.OneOf([
A.Downscale(0.1, 0.2, interpolation=cv2.INTER_LINEAR),
A.Downscale(0.1, 0.2, interpolation=cv2.INTER_NEAREST),
], p=0.1),
A.Blur((4, 9), p=0.5),
A.Sharpen(p=0.5),
A.RandomBrightnessContrast(0.8, 0.8, p=1),
A.GaussNoise((1000, 10000), p=0.3),
A.ImageCompression(0, 10, p=0.5),
A.ToGray(always_apply=True),
])
return t_medium, t_heavy
if __name__ == '__main__':
from manga_ocr_dev.training.get_model import get_processor
from manga_ocr_dev.training.utils import tensor_to_image
encoder_name = 'facebook/deit-tiny-patch16-224'
decoder_name = 'cl-tohoku/bert-base-japanese-char-v2'
max_length = 300
processor = get_processor(encoder_name, decoder_name)
ds = MangaDataset(processor, 'train', max_length, augment=True)
for i in range(20):
sample = ds[0]
img = tensor_to_image(sample['pixel_values'])
tokens = sample['labels']
tokens[tokens == -100] = processor.tokenizer.pad_token_id
text = ''.join(processor.decode(tokens, skip_special_tokens=True).split())
print(f'{i}:\n{text}\n')
plt.imshow(img)
plt.show()

View File

@@ -1,63 +0,0 @@
from transformers import AutoConfig, AutoModelForCausalLM, AutoModel, TrOCRProcessor, VisionEncoderDecoderModel, \
    AutoFeatureExtractor, AutoTokenizer, VisionEncoderDecoderConfig


class TrOCRProcessorCustom(TrOCRProcessor):
    """The only point of this class is to bypass the type checks of the base class."""

    def __init__(self, feature_extractor, tokenizer):
        self.feature_extractor = feature_extractor
        self.tokenizer = tokenizer
        self.current_processor = self.feature_extractor


def get_processor(encoder_name, decoder_name):
    feature_extractor = AutoFeatureExtractor.from_pretrained(encoder_name)
    tokenizer = AutoTokenizer.from_pretrained(decoder_name)
    processor = TrOCRProcessorCustom(feature_extractor, tokenizer)
    return processor


def get_model(encoder_name, decoder_name, max_length, num_decoder_layers=None):
    encoder_config = AutoConfig.from_pretrained(encoder_name)
    encoder_config.is_decoder = False
    encoder_config.add_cross_attention = False
    encoder = AutoModel.from_config(encoder_config)

    decoder_config = AutoConfig.from_pretrained(decoder_name)
    decoder_config.max_length = max_length
    decoder_config.is_decoder = True
    decoder_config.add_cross_attention = True
    decoder = AutoModelForCausalLM.from_config(decoder_config)

    if num_decoder_layers is not None:
        # keep only the last num_decoder_layers transformer layers of the decoder
        if decoder_config.model_type == 'bert':
            decoder.bert.encoder.layer = decoder.bert.encoder.layer[-num_decoder_layers:]
        elif decoder_config.model_type in ('roberta', 'xlm-roberta'):
            decoder.roberta.encoder.layer = decoder.roberta.encoder.layer[-num_decoder_layers:]
        else:
            raise ValueError(f'Unsupported model_type: {decoder_config.model_type}')
        decoder_config.num_hidden_layers = num_decoder_layers

    config = VisionEncoderDecoderConfig.from_encoder_decoder_configs(encoder.config, decoder.config)
    config.tie_word_embeddings = False
    model = VisionEncoderDecoderModel(encoder=encoder, decoder=decoder, config=config)

    processor = get_processor(encoder_name, decoder_name)

    # set special tokens used for creating the decoder_input_ids from the labels
    model.config.decoder_start_token_id = processor.tokenizer.cls_token_id
    model.config.pad_token_id = processor.tokenizer.pad_token_id
    # make sure vocab size is set correctly
    model.config.vocab_size = model.config.decoder.vocab_size

    # set beam search parameters
    model.config.eos_token_id = processor.tokenizer.sep_token_id
    model.config.max_length = max_length
    model.config.early_stopping = True
    model.config.no_repeat_ngram_size = 3
    model.config.length_penalty = 2.0
    model.config.num_beams = 4

    return model, processor
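
As a hedged illustration (not part of the deleted file), the factory above could be exercised with the same defaults the training script below uses; the module path is taken from the imports appearing elsewhere in this commit:

    from manga_ocr_dev.training.get_model import get_model

    # Builds a DeiT-tiny encoder paired with a character-level Japanese BERT decoder
    # truncated to its last 2 layers; all values mirror the defaults of run() below.
    model, processor = get_model(
        encoder_name='facebook/deit-tiny-patch16-224',
        decoder_name='cl-tohoku/bert-base-japanese-char-v2',
        max_length=300,
        num_decoder_layers=2,
    )
    # processor.feature_extractor preprocesses images and processor.tokenizer handles text,
    # matching how MangaDataset uses the processor above.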

View File

@@ -1,32 +0,0 @@
import numpy as np
from datasets import load_metric


class Metrics:
    def __init__(self, processor):
        self.cer_metric = load_metric("cer")
        self.processor = processor

    def compute_metrics(self, pred):
        label_ids = pred.label_ids
        pred_ids = pred.predictions
        print(label_ids.shape, pred_ids.shape)

        pred_str = self.processor.batch_decode(pred_ids, skip_special_tokens=True)
        label_ids[label_ids == -100] = self.processor.tokenizer.pad_token_id
        label_str = self.processor.batch_decode(label_ids, skip_special_tokens=True)

        pred_str = np.array([''.join(text.split()) for text in pred_str])
        label_str = np.array([''.join(text.split()) for text in label_str])

        results = {}
        try:
            results['cer'] = self.cer_metric.compute(predictions=pred_str, references=label_str)
        except Exception as e:
            print(e)
            print(pred_str)
            print(label_str)
            results['cer'] = 0
        results['accuracy'] = (pred_str == label_str).mean()

        return results

View File

@@ -1,64 +0,0 @@
import fire
import wandb
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments, default_data_collator

from manga_ocr_dev.env import TRAIN_ROOT
from manga_ocr_dev.training.dataset import MangaDataset
from manga_ocr_dev.training.get_model import get_model
from manga_ocr_dev.training.metrics import Metrics


def run(
        run_name='debug',
        encoder_name='facebook/deit-tiny-patch16-224',
        decoder_name='cl-tohoku/bert-base-japanese-char-v2',
        max_len=300,
        num_decoder_layers=2,
        batch_size=64,
        num_epochs=8,
        fp16=True,
):
    wandb.login()

    model, processor = get_model(encoder_name, decoder_name, max_len, num_decoder_layers)

    # keep package 0 for validation
    train_dataset = MangaDataset(processor, 'train', max_len, augment=True, skip_packages=[0])
    eval_dataset = MangaDataset(processor, 'test', max_len, augment=False, skip_packages=range(1, 9999))

    metrics = Metrics(processor)

    training_args = Seq2SeqTrainingArguments(
        predict_with_generate=True,
        evaluation_strategy='steps',
        save_strategy='steps',
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        fp16=fp16,
        fp16_full_eval=fp16,
        dataloader_num_workers=16,
        output_dir=TRAIN_ROOT,
        logging_steps=10,
        save_steps=20000,
        eval_steps=20000,
        num_train_epochs=num_epochs,
        run_name=run_name
    )

    # instantiate trainer
    trainer = Seq2SeqTrainer(
        model=model,
        tokenizer=processor.feature_extractor,
        args=training_args,
        compute_metrics=metrics.compute_metrics,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        data_collator=default_data_collator,
    )
    trainer.train()
    wandb.finish()


if __name__ == '__main__':
    fire.Fire(run)
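
Because the entry point is wrapped in fire.Fire(run), each keyword argument of run() is exposed as a command-line flag, and anything omitted falls back to the defaults in the signature above. A hypothetical invocation (the module path is an assumption, not stated in this diff) might look like:

    python -m manga_ocr_dev.training.train --run_name=my_experiment --batch_size=32 --fp16=False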

View File

@@ -1,27 +0,0 @@
import numpy as np
import torch
from torchinfo import summary


def encoder_summary(model, batch_size=4):
    img_size = model.config.encoder.image_size
    return summary(model.encoder, input_size=(batch_size, 3, img_size, img_size), depth=3,
                   col_names=["output_size", "num_params", "mult_adds"], device='cpu')


def decoder_summary(model, batch_size=4):
    img_size = model.config.encoder.image_size
    encoder_hidden_shape = (batch_size, (img_size // 16) ** 2 + 1, model.config.decoder.hidden_size)
    decoder_inputs = {
        'input_ids': torch.zeros(batch_size, 1, dtype=torch.int64),
        'attention_mask': torch.ones(batch_size, 1, dtype=torch.int64),
        'encoder_hidden_states': torch.rand(encoder_hidden_shape, dtype=torch.float32),
        'return_dict': False
    }
    return summary(model.decoder, input_data=decoder_inputs, depth=4,
                   col_names=["output_size", "num_params", "mult_adds"],
                   device='cpu')


def tensor_to_image(img):
    return ((img.cpu().numpy() + 1) / 2 * 255).clip(0, 255).astype(np.uint8).transpose(1, 2, 0)

View File

@@ -11,4 +11,6 @@ unidic_lite
google-cloud-vision
azure-cognitiveservices-vision-computervision
pyobjc
pynput
easyocr
paddleocr
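
For reference, a minimal sketch of how the two newly added dependencies are typically called; the language codes and image path are illustrative, and this is not necessarily how the project wires them in:

    import easyocr
    from paddleocr import PaddleOCR

    # EasyOCR: build a Japanese reader, then run detection + recognition on an image
    reader = easyocr.Reader(['ja'])
    easyocr_lines = reader.readtext('example_page.jpg')

    # PaddleOCR: 'japan' is PaddleOCR's code for Japanese; returns boxes, text and scores
    paddle = PaddleOCR(lang='japan')
    paddle_lines = paddle.ocr('example_page.jpg')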

View File

View File

@@ -1,50 +0,0 @@
[
{
"filename": "00.jpg",
"result": "素直にあやまるしか"
},
{
"filename": "01.jpg",
"result": "立川で見た〝穴〟の下の巨大な眼は:"
},
{
"filename": "02.jpg",
"result": "実戦剣術も一流です"
},
{
"filename": "03.jpg",
"result": "第30話重苦しい闇の奥で静かに呼吸づきながら"
},
{
"filename": "04.jpg",
"result": "きのうハンパーヶとって、ゴメン!!!"
},
{
"filename": "05.jpg",
"result": "ぎゃっ"
},
{
"filename": "06.jpg",
"result": "ピンポーーン"
},
{
"filename": "07.jpg",
"result": "LINK!私達7人の力でガノンの塔の結界をやぶります"
},
{
"filename": "08.jpg",
"result": "ファイアパンチ"
},
{
"filename": "09.jpg",
"result": "少し黙っている"
},
{
"filename": "10.jpg",
"result": "わかるかな〜?"
},
{
"filename": "11.jpg",
"result": "警察にも先生にも町中の人達に!!"
}
]

[12 binary image files deleted; contents not shown]

View File

@@ -1,25 +0,0 @@
import json
from pathlib import Path

from tqdm import tqdm

from manga_ocr import MangaOcr

TEST_DATA_ROOT = Path(__file__).parent / 'data'


def generate_expected_results():
    mocr = MangaOcr()

    results = []
    for path in tqdm(sorted((TEST_DATA_ROOT / 'images').iterdir())):
        result = mocr(path)
        results.append({'filename': path.name, 'result': result})

    (TEST_DATA_ROOT / 'expected_results.json').write_text(json.dumps(results, ensure_ascii=False, indent=2),
                                                          encoding='utf-8')


if __name__ == '__main__':
    generate_expected_results()

View File

@@ -1,16 +0,0 @@
import json
from pathlib import Path

from manga_ocr import MangaOcr

TEST_DATA_ROOT = Path(__file__).parent / 'data'


def test_ocr():
    mocr = MangaOcr()

    expected_results = json.loads((TEST_DATA_ROOT / 'expected_results.json').read_text(encoding='utf-8'))

    for item in expected_results:
        result = mocr(TEST_DATA_ROOT / 'images' / item['filename'])
        assert result == item['result']
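
A usage note (assumption, not part of the diff): test_ocr follows pytest conventions, a module-level test_* function with bare asserts, so it would typically be collected and run with a command such as pytest -k test_ocr from the repository root, comparing fresh MangaOcr output against the expected_results.json fixture regenerated by the script above.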