Add EasyOCR/PaddleOCR, remove unneeded stuff

AuroraWright
2023-12-07 22:43:16 +01:00
parent aae112529b
commit c0641918dd
60 changed files with 109 additions and 7196 deletions

View File

@@ -1 +0,0 @@
include assets/example.jpg

Binary files not shown: 15 image assets deleted (previews omitted; individual sizes ranged from 2.8 KiB to 405 KiB).

File diff suppressed because one or more lines are too long

View File

@@ -1,251 +0,0 @@
len,p
1,0.014734972701616804
2,0.05048222747773489
3,0.05624961536094529
4,0.05972235654062228
5,0.05244278768803355
6,0.05518581363248727
7,0.046578690556781516
8,0.04875025276280738
9,0.04442471185039959
10,0.04181356215327536
11,0.040160713186745564
12,0.041162972666449804
13,0.03785727473339019
14,0.03527250028573187
15,0.03326798132632338
16,0.0307271656277749
17,0.028151182929938547
18,0.025794993977651372
19,0.024731192249193356
20,0.021856290057410126
21,0.021135366572008825
22,0.019113264112956403
23,0.017073578154260045
24,0.015992192926158093
25,0.013952506967461734
26,0.012572202245412905
27,0.011288606771405713
28,0.009758842302383443
29,0.008993960067872309
30,0.008176327334429372
31,0.0072356101034788955
32,0.006919107109888081
33,0.005978389878937605
34,0.004712377904574347
35,0.00467721090528648
36,0.004220039914544191
37,0.003463949429855024
38,0.003358448431991419
39,0.003059528938044539
40,0.00263752494659012
41,0.0021891457056697995
42,0.002364980702109141
43,0.002013310709230458
44,0.0019341849608327545
45,0.0013099707234730928
46,0.0013363459729389942
47,0.001204469725609488
48,0.0011341357270337517
49,0.0008967584818406407
50,0.000914341981484575
51,0.000914341981484575
52,0.0007736739843331018
53,0.0006505894868255629
54,0.0006681729864694971
55,0.0005011297398521228
56,0.0005714637384278593
57,0.00044837924092032037
58,0.000395628741988518
59,0.00031650299359081436
60,0.00031650299359081436
61,0.0002813359943029461
62,0.00026375249465901196
63,0.0002725442444809791
64,0.00020221024590524252
65,0.00032529474341278143
66,0.00023737724519311078
67,0.00023737724519311078
68,0.00022858549537114374
69,0.00020221024590524252
70,0.00012308449750753894
71,0.00010550099786360479
72,8.791749821967066e-05
73,0.00012308449750753894
74,0.00011429274768557187
75,7.912574839770359e-05
76,3.516699928786826e-05
77,7.033399857573652e-05
78,8.791749821967066e-05
79,3.516699928786826e-05
80,2.6375249465901198e-05
81,6.154224875376947e-05
82,0.00011429274768557187
83,7.033399857573652e-05
84,5.2750498931802396e-05
85,4.395874910983533e-05
86,3.516699928786826e-05
87,8.791749821967066e-05
88,6.154224875376947e-05
89,1.758349964393413e-05
90,1.758349964393413e-05
91,1.758349964393413e-05
92,8.791749821967065e-06
93,3.516699928786826e-05
94,2.6375249465901198e-05
95,2.6375249465901198e-05
96,1.758349964393413e-05
97,1.758349964393413e-05
98,4.395874910983533e-05
99,4.395874910983533e-05
100,8.791749821967065e-06
101,8.791749821967065e-06
102,2.6375249465901198e-05
103,2.6375249465901198e-05
104,8.791749821967065e-06
105,8.791749821967065e-06
106,1.758349964393413e-05
107,1.758349964393413e-05
108,8.791749821967065e-06
109,8.791749821967065e-06
110,8.791749821967065e-06
111,8.791749821967065e-06
112,8.791749821967065e-06
113,8.791749821967065e-06
114,3.516699928786826e-05
115,2.6375249465901198e-05
116,2.6375249465901198e-05
117,2.6375249465901198e-05
118,8.791749821967065e-06
119,8.791749821967065e-06
120,8.791749821967065e-06
121,8.791749821967065e-06
122,1.758349964393413e-05
123,8.791749821967065e-06
124,8.791749821967065e-06
125,8.791749821967065e-06
126,1.758349964393413e-05
127,1.758349964393413e-05
128,1.758349964393413e-05
129,1.758349964393413e-05
130,1.758349964393413e-05
131,8.791749821967065e-06
132,1.758349964393413e-05
133,8.791749821967065e-06
134,8.791749821967065e-06
135,8.791749821967065e-06
136,8.791749821967065e-06
137,8.791749821967065e-06
138,8.791749821967065e-06
139,8.791749821967065e-06
140,8.791749821967065e-06
141,8.791749821967065e-06
142,8.791749821967065e-06
143,8.791749821967065e-06
144,8.791749821967065e-06
145,8.791749821967065e-06
146,8.791749821967065e-06
147,8.791749821967065e-06
148,8.791749821967065e-06
149,8.791749821967065e-06
150,8.791749821967065e-06
151,8.791749821967065e-06
152,8.791749821967065e-06
153,8.791749821967065e-06
154,8.791749821967065e-06
155,8.791749821967065e-06
156,8.791749821967065e-06
157,8.791749821967065e-06
158,8.791749821967065e-06
159,8.791749821967065e-06
160,8.791749821967065e-06
161,8.791749821967065e-06
162,8.791749821967065e-06
163,8.791749821967065e-06
164,8.791749821967065e-06
165,8.791749821967065e-06
166,8.791749821967065e-06
167,8.791749821967065e-06
168,8.791749821967065e-06
169,8.791749821967065e-06
170,8.791749821967065e-06
171,8.791749821967065e-06
172,8.791749821967065e-06
173,8.791749821967065e-06
174,8.791749821967065e-06
175,8.791749821967065e-06
176,8.791749821967065e-06
177,8.791749821967065e-06
178,8.791749821967065e-06
179,8.791749821967065e-06
180,8.791749821967065e-06
181,8.791749821967065e-06
182,8.791749821967065e-06
183,8.791749821967065e-06
184,8.791749821967065e-06
185,8.791749821967065e-06
186,8.791749821967065e-06
187,8.791749821967065e-06
188,8.791749821967065e-06
189,8.791749821967065e-06
190,8.791749821967065e-06
191,8.791749821967065e-06
192,8.791749821967065e-06
193,8.791749821967065e-06
194,8.791749821967065e-06
195,8.791749821967065e-06
196,8.791749821967065e-06
197,8.791749821967065e-06
198,8.791749821967065e-06
199,8.791749821967065e-06
200,8.791749821967065e-06
201,8.791749821967065e-06
202,8.791749821967065e-06
203,8.791749821967065e-06
204,8.791749821967065e-06
205,8.791749821967065e-06
206,8.791749821967065e-06
207,8.791749821967065e-06
208,8.791749821967065e-06
209,8.791749821967065e-06
210,8.791749821967065e-06
211,8.791749821967065e-06
212,8.791749821967065e-06
213,8.791749821967065e-06
214,8.791749821967065e-06
215,8.791749821967065e-06
216,8.791749821967065e-06
217,8.791749821967065e-06
218,8.791749821967065e-06
219,8.791749821967065e-06
220,8.791749821967065e-06
221,8.791749821967065e-06
222,8.791749821967065e-06
223,8.791749821967065e-06
224,8.791749821967065e-06
225,8.791749821967065e-06
226,8.791749821967065e-06
227,8.791749821967065e-06
228,8.791749821967065e-06
229,8.791749821967065e-06
230,8.791749821967065e-06
231,8.791749821967065e-06
232,8.791749821967065e-06
233,8.791749821967065e-06
234,8.791749821967065e-06
235,8.791749821967065e-06
236,8.791749821967065e-06
237,8.791749821967065e-06
238,8.791749821967065e-06
239,8.791749821967065e-06
240,8.791749821967065e-06
241,8.791749821967065e-06
242,8.791749821967065e-06
243,8.791749821967065e-06
244,8.791749821967065e-06
245,8.791749821967065e-06
246,8.791749821967065e-06
247,8.791749821967065e-06
248,8.791749821967065e-06
249,8.791749821967065e-06
250,8.791749821967065e-06

View File

@@ -1,6 +0,0 @@
source,id,line
cc-100,cc-100_446088,発展を遂げた貨幣経済に対して、後戻りする形の改革が、民衆に受け入れられるはずもありません。
cc-100,cc-100_446387,東京都渋谷区本町1丁目4−14 ホームヘルパー(パート:茂原)
cc-100,cc-100_446430,同時に、発表しあう場を増やしたいです。まず、自分の考えを発表するためには、しっかりと自分の考えを持っていなくてはいけません。そのために、ますますノートの必要性を感じることでしょう。また、質問や意見に答えることで、考えが深まります。友達の意見を聞くことが、より理解を深めることを実感してほしいです。
cc-100,cc-100_446493,※特典の数に限りがございますので、対象商品はお早めにお買い求めください。特典は無くなり次第終了となります。
cc-100,cc-100_446543,ハリウッドスターってもっと豪華な生活を送っているのかと思えば、キアヌ・リーブスってかなり質素なんですね。

File diff suppressed because it is too large.

View File

@@ -4,3 +4,5 @@ from manga_ocr.ocr import MangaOcr
from manga_ocr.ocr import GoogleVision
from manga_ocr.ocr import AppleVision
from manga_ocr.ocr import AzureComputerVision
from manga_ocr.ocr import EasyOCR
from manga_ocr.ocr import PaddleOCR

View File

@@ -10,6 +10,7 @@ import platform
import jaconv
import torch
import numpy as np
from PIL import Image
from loguru import logger
from transformers import ViTImageProcessor, AutoTokenizer, VisionEncoderDecoderModel
@@ -33,9 +34,19 @@ try:
except ImportError:
pass
try:
import easyocr
except ImportError:
pass
try:
from paddleocr import PaddleOCR as POCR
except ImportError:
pass
class MangaOcr:
def __init__(self, pretrained_model_name_or_path='kha-white/manga-ocr-base', force_cpu=False):
logger.info(f'Loading OCR model from {pretrained_model_name_or_path}')
logger.info(f'Loading Manga OCR model from {pretrained_model_name_or_path}')
self.processor = ViTImageProcessor.from_pretrained(pretrained_model_name_or_path)
self.tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path)
self.model = VisionEncoderDecoderModel.from_pretrained(pretrained_model_name_or_path)
@@ -222,6 +233,76 @@ class AzureComputerVision:
image_io.seek(0)
return image_io
class EasyOCR:
def __init__(self):
if 'easyocr' not in sys.modules:
logger.warning('easyocr not available, EasyOCR will not work!')
self.available = False
else:
logger.info('Loading EasyOCR model')
self.model = easyocr.Reader(['ja','en'])
self.available = True
logger.info('EasyOCR ready')
def __call__(self, img_or_path):
if not self.available:
return "Engine not available!"
if isinstance(img_or_path, str) or isinstance(img_or_path, Path):
img = Image.open(img_or_path)
elif isinstance(img_or_path, Image.Image):
img = img_or_path
else:
raise ValueError(f'img_or_path must be a path or PIL.Image, instead got: {img_or_path}')
res = ''
read_result = self.model.readtext(self._preprocess(img), detail=0)
for text in read_result:
res += text + ' '
x = post_process(res)
return x
def _preprocess(self, img):
image_bytes = io.BytesIO()
img.save(image_bytes, format=img.format)
return image_bytes.getvalue()
class PaddleOCR:
def __init__(self):
if 'paddleocr' not in sys.modules:
logger.warning('paddleocr not available, PaddleOCR will not work!')
self.available = False
else:
logger.info('Loading PaddleOCR model')
self.model = POCR(use_angle_cls=True, show_log=False, lang='japan')
self.available = True
logger.info('PaddleOCR ready')
def __call__(self, img_or_path):
if not self.available:
return "Engine not available!"
if isinstance(img_or_path, str) or isinstance(img_or_path, Path):
img = Image.open(img_or_path)
elif isinstance(img_or_path, Image.Image):
img = img_or_path
else:
raise ValueError(f'img_or_path must be a path or PIL.Image, instead got: {img_or_path}')
res = ''
read_results = self.model.ocr(self._preprocess(img), cls=True)
for read_result in read_results:
if read_result:
for text in read_result:
res += text[1][0] + ' '
x = post_process(res)
return x
def _preprocess(self, img):
return np.array(img.convert('RGB'))
def post_process(text):
text = ''.join(text.split())
@@ -229,4 +310,4 @@ def post_process(text):
text = re.sub('[・.]{2,}', lambda x: (x.end() - x.start()) * '.', text)
text = jaconv.h2z(text, ascii=True, digit=True)
return text
return text
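As a quick sanity check of the two engine classes added above, a minimal sketch (assuming `easyocr` and `paddleocr` are installed; the image path is a placeholder):

```python
# Not part of this commit: exercise the new engine classes directly.
from PIL import Image
from manga_ocr.ocr import EasyOCR, PaddleOCR

img = Image.open('crop.png')  # hypothetical sample crop
for engine in (EasyOCR(), PaddleOCR()):
    if engine.available:  # False when the backing package is missing
        print(type(engine).__name__, engine(img))
```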

View File

@@ -12,17 +12,7 @@ from PIL import UnidentifiedImageError
from loguru import logger
from pynput import keyboard
from manga_ocr import MangaOcr
from manga_ocr import GoogleVision
from manga_ocr import AppleVision
from manga_ocr import AzureComputerVision
engines = ['avision', 'gvision', 'azure', 'mangaocr']
def get_engine_name(engine):
engine_names = ['Apple Vision', 'Google Vision', 'Azure Computer Vision', 'Manga OCR']
return engine_names[engines.index(engine)]
from manga_ocr import *
def are_images_identical(img1, img2):
@@ -35,19 +25,12 @@ def are_images_identical(img1, img2):
return (img1.shape == img2.shape) and (img1 == img2).all()
def process_and_write_results(mocr, avision, gvision, azure, img_or_path, write_to, engine):
def process_and_write_results(engine_instance, engine_name, img_or_path, write_to):
t0 = time.time()
if engine == 'gvision':
text = gvision(img_or_path)
elif engine == 'avision':
text = avision(img_or_path)
elif engine == 'azure':
text = azure(img_or_path)
else:
text = mocr(img_or_path)
text = engine_instance(img_or_path)
t1 = time.time()
logger.opt(ansi=True).info(f"Text recognized in {t1 - t0:0.03f}s using <cyan>{get_engine_name(engine)}</cyan>: {text}")
logger.opt(ansi=True).info(f"Text recognized in {t1 - t0:0.03f}s using <cyan>{engine_name}</cyan>: {text}")
if write_to == 'clipboard':
pyperclip.copy(text)
@@ -81,7 +64,7 @@ def run(read_from='clipboard',
:param pretrained_model_name_or_path: Path to a trained model, either local or from Transformers' model hub.
:param force_cpu: If True, OCR will use CPU even if GPU is available.
:param delay_secs: How often to check for new images, in seconds.
:param engine: OCR engine to use. Available: "mangaocr", "gvision", "avision", "azure".
:param engine: OCR engine to use. Available: "mangaocr", "gvision", "avision", "azure", "easyocr", "paddleocr".
:param verbose: If True, unhides all warnings.
"""
@@ -93,10 +76,20 @@ def run(read_from='clipboard',
}
logger.configure(**config)
mocr = MangaOcr(pretrained_model_name_or_path, force_cpu)
avision = AppleVision()
gvision = GoogleVision()
azure = AzureComputerVision()
avision = AppleVision()
mangaocr = MangaOcr(pretrained_model_name_or_path, force_cpu)
easyocr = EasyOCR()
paddleocr = PaddleOCR()
engines = ['avision', 'gvision', 'azure', 'mangaocr', 'easyocr', 'paddleocr']
engine_names = ['Apple Vision', 'Google Vision', 'Azure Computer Vision', 'Manga OCR', 'EasyOCR', 'PaddleOCR']
engine_instances = [avision, gvision, azure, mangaocr, easyocr, paddleocr]
engine_keys = 'agvmeo'
def get_engine_name(engine):
return engine_names[engines.index(engine)]
if engine not in engines:
msg = 'Unknown OCR engine!'
@@ -203,8 +196,8 @@ def run(read_from='clipboard',
engine = engines[engines.index(engine) + 1]
logger.opt(ansi=True).info(f"Switched to <cyan>{get_engine_name(engine)}</cyan>!")
elif user_input.lower() in 'agvm':
new_engine = engines['agvm'.find(user_input.lower())]
elif user_input.lower() in engine_keys:
new_engine = engines[engine_keys.find(user_input.lower())]
if engine != new_engine:
engine = new_engine
logger.opt(ansi=True).info(f"Switched to <cyan>{get_engine_name(engine)}</cyan>!")
@@ -228,7 +221,7 @@ def run(read_from='clipboard',
logger.warning('Error while reading from clipboard ({})'.format(error))
else:
if not just_unpaused and isinstance(img, Image.Image) and not are_images_identical(img, old_img):
process_and_write_results(mocr, avision, gvision, azure, img, write_to, engine)
process_and_write_results(engine_instances[engines.index(engine)], get_engine_name(engine), img, write_to)
if just_unpaused:
just_unpaused = False
@@ -244,7 +237,7 @@ def run(read_from='clipboard',
except (UnidentifiedImageError, OSError) as e:
logger.warning(f'Error while reading file {path}: {e}')
else:
process_and_write_results(mocr, avision, gvision, azure, img, write_to, engine)
process_and_write_results(engine_instances[engines.index(engine)], get_engine_name(engine), img, write_to)
time.sleep(delay_secs)
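For reference, the single-letter engine switching introduced above is a positional mapping between `engine_keys` and `engines` (values copied from the diff):

```python
# Key-to-engine mapping implied by engine_keys/engines above.
engines = ['avision', 'gvision', 'azure', 'mangaocr', 'easyocr', 'paddleocr']
engine_keys = 'agvmeo'
key_to_engine = dict(zip(engine_keys, engines))
# {'a': 'avision', 'g': 'gvision', 'v': 'azure',
#  'm': 'mangaocr', 'e': 'easyocr', 'o': 'paddleocr'}
```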

View File

@@ -1,98 +0,0 @@
# Project structure
```
assets/ # assets (see description below)
manga_ocr/ # release code (inference only)
manga_ocr_dev/ # development code
env.py # global constants
data/ # data preprocessing
synthetic_data_generator/ # generation of synthetic image-text pairs
training/ # model training
```
## assets
### fonts.csv
csv with columns:
- font_path: path to font file, relative to `FONTS_ROOT`
- supported_chars: string of characters supported by this font
- num_chars: number of supported characters
- label: common/regular/special (used to sample regular fonts more often than special)
List of fonts with metadata used by the synthetic data generator.
The provided file is just an example; you have to generate a similar file for your own set of fonts
using the `manga_ocr_dev/synthetic_data_generator/scan_fonts.py` script.
Note that `label` will be filled with `regular` by default. You have to label your special fonts manually.
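A hedged sketch of generating `fonts.csv` with that script (module path follows the project structure above; it scans `FONTS_ROOT` and writes into `assets/`):

```python
# Scans fonts under FONTS_ROOT (see env.py) and writes assets/fonts.csv.
from manga_ocr_dev.synthetic_data_generator.scan_fonts import main

main()
```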
### lines_example.csv
csv with columns:
- source: source of text
- id: unique id of the line
- line: line from language corpus
Example of csv used for synthetic data generation.
### len_to_p.csv
csv with columns:
- len: length of text
- p: probability of text of this length occurring in manga
Used by the synthetic data generator to roughly match the natural distribution of text lengths.
Computed from the Manga109-s dataset.
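For illustration, this is how the table can be consumed, mirroring the generator code later in this diff (the relative path is an assumption):

```python
# Sample a target text length following the manga length distribution.
import numpy as np
import pandas as pd

len_to_p = pd.read_csv('assets/len_to_p.csv')  # columns: len, p
max_text_len = np.random.choice(len_to_p.len, p=len_to_p.p)
```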
### vocab.csv
List of all characters supported by tokenizer.
# Training OCR
`env.py` contains global constants used across the repo. Set your paths to data etc. there.
1. Download [Manga109-s](http://www.manga109.org/en/download_s.html) dataset.
2. Set `MANGA109_ROOT`, so that your directory structure looks like this:
```
<MANGA109_ROOT>/
Manga109s_released_2021_02_28/
annotations/
annotations.v2018.05.31/
images/
books.txt
readme.txt
```
3. Preprocess Manga109-s with `data/process_manga109s.py`
4. Optionally generate synthetic data (see below)
5. Train with `manga_ocr_dev/training/train.py`
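For step 5, a minimal sketch of invoking training programmatically (arguments mirror the defaults in `manga_ocr_dev/training/train.py` shown later in this diff; assumes the `env.py` paths and preprocessed data are in place):

```python
from manga_ocr_dev.training.train import run

run(run_name='debug', num_epochs=8, batch_size=64)
```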
# Synthetic data generation
Generated data is split into packages (named `0000`, `0001`, etc.) for easier management of a large dataset.
Each package is assumed to have a similar data distribution, so that a properly balanced dataset
can be built from any subset of packages.
The data generation pipeline assumes the following directory structure:
```
<DATA_SYNTHETIC_ROOT>/
img/ # generated images (output from generation pipeline)
0000/
0001/
...
lines/ # lines from corpus (input to generation pipeline)
0000.csv
0001.csv
...
meta/ # metadata (output from generation pipeline)
0000.csv
0001.csv
...
```
To use a language corpus for data generation, `lines/*.csv` files must be provided.
For a small example of such a file, see `assets/lines_example.csv`.
To generate synthetic data:
1. Generate backgrounds with `data/generate_backgrounds.py`.
2. Put your fonts in `<FONTS_ROOT>`.
3. Generate fonts metadata with `synthetic_data_generator/scan_fonts.py`.
4. Optionally manually label your fonts with `common/regular/special` labels.
5. Provide `<DATA_SYNTHETIC_ROOT>/lines/*.csv`.
6. Run `synthetic_data_generator/run_generate.py` for each package.
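Step 6 can also be driven from Python; a sketch using the parameters exposed by `run_generate.py` later in this diff:

```python
# Generate one package of synthetic data (paths come from manga_ocr_dev/env.py).
from manga_ocr_dev.synthetic_data_generator.run_generate import run

run(package=0, n_random=1000, max_workers=16)
```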

View File

@@ -1,85 +0,0 @@
from pathlib import Path
import cv2
import numpy as np
import pandas as pd
from tqdm import tqdm
from manga_ocr_dev.env import MANGA109_ROOT, BACKGROUND_DIR
def find_rectangle(mask, y, x, aspect_ratio_range=(0.33, 3.0)):
ymin_ = ymax_ = y
xmin_ = xmax_ = x
ymin = ymax = xmin = xmax = None
while True:
if ymin is None:
ymin_ -= 1
if ymin_ == 0 or mask[ymin_, xmin_:xmax_].any():
ymin = ymin_
if ymax is None:
ymax_ += 1
if ymax_ == mask.shape[0] - 1 or mask[ymax_, xmin_:xmax_].any():
ymax = ymax_
if xmin is None:
xmin_ -= 1
if xmin_ == 0 or mask[ymin_:ymax_, xmin_].any():
xmin = xmin_
if xmax is None:
xmax_ += 1
if xmax_ == mask.shape[1] - 1 or mask[ymin_:ymax_, xmax_].any():
xmax = xmax_
h = ymax_ - ymin_
w = xmax_ - xmin_
if h > 1 and w > 1:
ratio = w / h
if ratio < aspect_ratio_range[0] or ratio > aspect_ratio_range[1]:
return ymin_, ymax_, xmin_, xmax_
if None not in (ymin, ymax, xmin, xmax):
return ymin, ymax, xmin, xmax
def generate_backgrounds(crops_per_page=5, min_size=40):
data = pd.read_csv(MANGA109_ROOT / 'data.csv')
frames_df = pd.read_csv(MANGA109_ROOT / 'frames.csv')
BACKGROUND_DIR.mkdir(parents=True, exist_ok=True)
page_paths = data.page_path.unique()
for page_path in tqdm(page_paths):
page = cv2.imread(str(MANGA109_ROOT / page_path))
mask = np.zeros((page.shape[0], page.shape[1]), dtype=bool)
for row in data[data.page_path == page_path].itertuples():
mask[row.ymin:row.ymax, row.xmin:row.xmax] = True
frames_mask = np.zeros((page.shape[0], page.shape[1]), dtype=bool)
for row in frames_df[frames_df.page_path == page_path].itertuples():
frames_mask[row.ymin:row.ymax, row.xmin:row.xmax] = True
mask = mask | ~frames_mask
if mask.all():
continue
unmasked_points = np.stack(np.where(~mask), axis=1)
for i in range(crops_per_page):
p = unmasked_points[np.random.randint(0, unmasked_points.shape[0])]
y, x = p
ymin, ymax, xmin, xmax = find_rectangle(mask, y, x)
crop = page[ymin:ymax, xmin:xmax]
if crop.shape[0] >= min_size and crop.shape[1] >= min_size:
out_filename = '_'.join(
Path(page_path).with_suffix('').parts[-2:]) + f'_{ymin}_{ymax}_{xmin}_{xmax}.png'
cv2.imwrite(str(BACKGROUND_DIR / out_filename), crop)
if __name__ == '__main__':
generate_backgrounds()

View File

@@ -1,103 +0,0 @@
import xml.etree.ElementTree as ET
from pathlib import Path
import cv2
import pandas as pd
from tqdm import tqdm
from manga_ocr_dev.env import MANGA109_ROOT
def get_books():
root = MANGA109_ROOT / 'Manga109s_released_2021_02_28'
books = (root / 'books.txt').read_text().splitlines()
books = pd.DataFrame({
'book': books,
'annotations': [str(root / 'annotations' / f'{book}.xml') for book in books],
'images': [str(root / 'images' / book) for book in books],
})
return books
def export_frames():
books = get_books()
data = []
for book in tqdm(books.itertuples(), total=len(books)):
tree = ET.parse(book.annotations)
root = tree.getroot()
for page in root.findall('./pages/page'):
for frame in page.findall('./frame'):
row = {}
row['book'] = book.book
row['page_index'] = int(page.attrib['index'])
row['page_path'] = str(Path(book.images) / f'{row["page_index"]:03d}.jpg')
row['page_width'] = int(page.attrib['width'])
row['page_height'] = int(page.attrib['height'])
row['id'] = frame.attrib['id']
row['xmin'] = int(frame.attrib['xmin'])
row['ymin'] = int(frame.attrib['ymin'])
row['xmax'] = int(frame.attrib['xmax'])
row['ymax'] = int(frame.attrib['ymax'])
data.append(row)
data = pd.DataFrame(data)
data.page_path = data.page_path.apply(lambda x: '/'.join(Path(x).parts[-4:]))
data.to_csv(MANGA109_ROOT / 'frames.csv', index=False)
def export_crops():
crops_root = MANGA109_ROOT / 'crops'
crops_root.mkdir(parents=True, exist_ok=True)
margin = 10
books = get_books()
data = []
for book in tqdm(books.itertuples(), total=len(books)):
tree = ET.parse(book.annotations)
root = tree.getroot()
for page in root.findall('./pages/page'):
for text in page.findall('./text'):
row = {}
row['book'] = book.book
row['page_index'] = int(page.attrib['index'])
row['page_path'] = str(Path(book.images) / f'{row["page_index"]:03d}.jpg')
row['page_width'] = int(page.attrib['width'])
row['page_height'] = int(page.attrib['height'])
row['id'] = text.attrib['id']
row['text'] = text.text
row['xmin'] = int(text.attrib['xmin'])
row['ymin'] = int(text.attrib['ymin'])
row['xmax'] = int(text.attrib['xmax'])
row['ymax'] = int(text.attrib['ymax'])
data.append(row)
data = pd.DataFrame(data)
n_test = int(0.1 * len(data))
data['split'] = 'train'
data.loc[data.sample(len(data)).iloc[:n_test].index, 'split'] = 'test'
data['crop_path'] = str(crops_root) + '\\' + data.id + '.png'
data.page_path = data.page_path.apply(lambda x: '/'.join(Path(x).parts[-4:]))
data.crop_path = data.crop_path.apply(lambda x: '/'.join(Path(x).parts[-2:]))
data.to_csv(MANGA109_ROOT / 'data.csv', index=False)
for page_path, boxes in tqdm(data.groupby('page_path'), total=data.page_path.nunique()):
img = cv2.imread(str(MANGA109_ROOT / page_path))
for box in boxes.itertuples():
xmin = max(box.xmin - margin, 0)
xmax = min(box.xmax + margin, img.shape[1])
ymin = max(box.ymin - margin, 0)
ymax = min(box.ymax + margin, img.shape[0])
crop = img[ymin:ymax, xmin:xmax]
out_path = (crops_root / box.id).with_suffix('.png')
cv2.imwrite(str(out_path), crop)
if __name__ == '__main__':
export_frames()
export_crops()

View File

@@ -1,9 +0,0 @@
from pathlib import Path
ASSETS_PATH = Path(__file__).parent.parent / 'assets'
FONTS_ROOT = Path('~/data/jp_fonts').expanduser()
DATA_SYNTHETIC_ROOT = Path('~/data/manga/synthetic').expanduser()
BACKGROUND_DIR = Path('~/data/manga/Manga109s/background').expanduser()
MANGA109_ROOT = Path('~/data/manga/Manga109s').expanduser()
TRAIN_ROOT = Path('~/data/manga/out').expanduser()

View File

@@ -1,25 +0,0 @@
datasets
jiwer
torchinfo
transformers>=4.12.5
unidic-lite
ipadic
mecab-python3
fugashi
matplotlib
numpy
opencv-python
pandas
Pillow
pytest
scikit-image
scikit-learn
scipy
torch
torchvision
tqdm
wandb
fire
budou
albumentations>=1.1
html2image

View File

@@ -1,38 +0,0 @@
# Synthetic data generator
Generation of synthetic image-text pairs imitating Japanese manga for the purpose of training OCR.
Features:
- using either text from corpus or random text
- text overlaid on background images
- drawing text bubbles
- various fonts and font styles
- variety of text layouts:
- vertical and horizontal text
- multi-line text
- [furigana](https://en.wikipedia.org/wiki/Furigana) (added randomly)
- [tate chū yoko](https://www.w3.org/International/articles/vertical-text/#tcy)
Text rendering is done using [html2image](https://github.com/vgalin/html2image),
which is a wrapper around the Chrome/Chromium browser's headless mode.
It's not an elegant solution, and it is very slow, but it only needs to be run once,
and when parallelized, processing time is manageable (~17 min per 10000 images on a 16-thread machine).
The upside of this approach is that the fairly complex problem of typesetting and text rendering
(especially when dealing with both horizontal and vertical text) is offloaded to
the browser engine, keeping the codebase relatively simple and extensible.
High-level generation pipeline is as follows:
1. Preprocess text (truncate and/or split into lines, add random furigana).
2. Render text on a transparent background, using HTML engine.
3. Select background image from backgrounds dataset.
4. Overlay the text on the background, optionally drawing a bubble around the text.
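In code, this pipeline corresponds roughly to a single call to the generator defined later in this diff (the input string is just an illustrative corpus line; pass `None` for random text):

```python
from manga_ocr_dev.synthetic_data_generator.generator import SyntheticDataGenerator

generator = SyntheticDataGenerator()
img, text_gt, params = generator.process('同時に、発表しあう場を増やしたいです。')
```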
# Examples
## Images generated with text from [CC-100 Japanese corpus](https://data.statmt.org/cc-100/)
![](../../assets/examples/cc-100.jpg)
## Images generated with random text
![](../../assets/examples/random.jpg)

View File

@@ -1,198 +0,0 @@
import budou
import numpy as np
import pandas as pd
from manga_ocr_dev.env import ASSETS_PATH, FONTS_ROOT
from manga_ocr_dev.synthetic_data_generator.renderer import Renderer
from manga_ocr_dev.synthetic_data_generator.utils import get_font_meta, get_charsets, is_ascii, is_kanji
class SyntheticDataGenerator:
def __init__(self):
self.vocab, self.hiragana, self.katakana = get_charsets()
self.len_to_p = pd.read_csv(ASSETS_PATH / 'len_to_p.csv')
self.parser = budou.get_parser('tinysegmenter')
self.fonts_df, self.font_map = get_font_meta()
self.font_labels, self.font_p = self.get_font_labels_prob()
self.renderer = Renderer()
def process(self, text=None, override_css_params=None):
"""
Generate image, text pair. Use source text if provided, otherwise generate random text.
"""
if override_css_params is None:
override_css_params = {}
if text is None:
# if using random text, choose font first,
# and then generate text using only characters supported by that font
if 'font_path' not in override_css_params:
font_path = self.get_random_font()
vocab = self.font_map[font_path]
override_css_params['font_path'] = font_path
else:
font_path = override_css_params['font_path']
vocab = self.font_map[font_path]
words = self.get_random_words(vocab)
else:
text = text.replace(' ', ' ')
text = text.replace('', '...')
words = self.split_into_words(text)
lines = self.words_to_lines(words)
text_gt = '\n'.join(lines)
if 'font_path' not in override_css_params:
override_css_params['font_path'] = self.get_random_font(text_gt)
font_path = override_css_params.get('font_path')
if font_path:
vocab = self.font_map.get(font_path)
# remove unsupported characters
lines = [''.join([c for c in line if c in vocab]) for line in lines]
text_gt = '\n'.join(lines)
else:
vocab = None
if np.random.random() < 0.5:
word_prob = np.random.choice([0.33, 1.0], p=[0.3, 0.7])
lines = [self.add_random_furigana(line, word_prob, vocab) for line in lines]
img, params = self.renderer.render(lines, override_css_params)
return img, text_gt, params
def get_random_words(self, vocab):
vocab = list(vocab)
max_text_len = np.random.choice(self.len_to_p.len, p=self.len_to_p.p)
words = []
text_len = 0
while True:
word = ''.join(np.random.choice(vocab, np.random.randint(1, 4)))
words.append(word)
text_len += len(word)
if text_len + len(word) >= max_text_len:
break
return words
def split_into_words(self, text):
max_text_len = np.random.choice(self.len_to_p.len, p=self.len_to_p.p)
words = []
text_len = 0
for chunk in self.parser.parse(text)['chunks']:
words.append(chunk.word)
text_len += len(chunk.word)
if text_len + len(chunk.word) >= max_text_len:
break
return words
def words_to_lines(self, words):
text = ''.join(words)
max_num_lines = 10
min_line_len = len(text) // max_num_lines
max_line_len = 20
max_line_len = np.clip(np.random.poisson(6), min_line_len, max_line_len)
lines = []
line = ''
for word in words:
line += word
if len(line) >= max_line_len:
lines.append(line)
line = ''
if line:
lines.append(line)
return lines
def add_random_furigana(self, line, word_prob=1.0, vocab=None):
if vocab is None:
vocab = self.vocab
else:
vocab = list(vocab)
processed = ''
kanji_group = ''
ascii_group = ''
for i, c in enumerate(line):
if is_kanji(c):
c_type = 'kanji'
kanji_group += c
elif is_ascii(c):
c_type = 'ascii'
ascii_group += c
else:
c_type = 'other'
if c_type != 'kanji' or i == len(line) - 1:
if kanji_group:
if np.random.uniform() < word_prob:
furigana_len = int(np.clip(np.random.normal(1.5, 0.5), 1, 4) * len(kanji_group))
char_source = np.random.choice(['hiragana', 'katakana', 'all'], p=[0.8, 0.15, 0.05])
char_source = {
'hiragana': self.hiragana,
'katakana': self.katakana,
'all': vocab
}[char_source]
furigana = ''.join(np.random.choice(char_source, furigana_len))
processed += f'<ruby>{kanji_group}<rt>{furigana}</rt></ruby>'
else:
processed += kanji_group
kanji_group = ''
if c_type != 'ascii' or i == len(line) - 1:
if ascii_group:
if len(ascii_group) <= 3 and np.random.uniform() < 0.7:
processed += f'<span style="text-combine-upright: all">{ascii_group}</span>'
else:
processed += ascii_group
ascii_group = ''
if c_type == 'other':
processed += c
return processed
def is_font_supporting_text(self, font_path, text):
chars = self.font_map[font_path]
for c in text:
if c.isspace():
continue
if c not in chars:
return False
return True
def get_font_labels_prob(self):
labels = {
'common': 0.2,
'regular': 0.75,
'special': 0.05,
}
labels = {k: labels[k] for k in self.fonts_df.label.unique()}
p = np.array(list(labels.values()))
p = p / p.sum()
labels = list(labels.keys())
return labels, p
def get_random_font(self, text=None):
label = np.random.choice(self.font_labels, p=self.font_p)
df = self.fonts_df[self.fonts_df.label == label]
if text is None:
return df.sample(1).iloc[0].font_path
valid_mask = df.font_path.apply(lambda x: self.is_font_supporting_text(x, text))
if not valid_mask.any():
# if text contains characters not supported by any font, just pick some of the more capable fonts
valid_mask = (df.num_chars >= 4000)
return str(FONTS_ROOT / df[valid_mask].sample(1).iloc[0].font_path)

View File

@@ -1,265 +0,0 @@
import os
import uuid
import albumentations as A
import cv2
import numpy as np
from html2image import Html2Image
from manga_ocr_dev.env import BACKGROUND_DIR
from manga_ocr_dev.synthetic_data_generator.utils import get_background_df
class Renderer:
def __init__(self):
self.hti = Html2Image()
self.background_df = get_background_df(BACKGROUND_DIR)
self.max_size = 600
def render(self, lines, override_css_params=None):
img, params = self.render_text(lines, override_css_params)
img = self.render_background(img)
img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
img = A.LongestMaxSize(self.max_size)(image=img)['image']
return img, params
def render_text(self, lines, override_css_params=None):
"""Render text on transparent background and return as BGRA image."""
params = self.get_random_css_params()
if override_css_params:
params.update(override_css_params)
css = get_css(**params)
# this is just a rough estimate, image is cropped later anyway
size = (
int(max(len(line) for line in lines) * params['font_size'] * 1.5),
int(len(lines) * params['font_size'] * (3 + params['line_height'])),
)
if params['vertical']:
size = size[::-1]
html = self.lines_to_html(lines)
filename = str(uuid.uuid4()) + '.png'
self.hti.screenshot(html_str=html, css_str=css, save_as=filename, size=size)
img = cv2.imread(filename, cv2.IMREAD_UNCHANGED)
os.remove(filename)
return img, params
@staticmethod
def get_random_css_params():
params = {
'font_size': 48,
'vertical': True if np.random.rand() < 0.7 else False,
'line_height': 0.5,
'background_color': 'transparent',
'text_color': 'black',
}
if np.random.rand() < 0.7:
params['text_orientation'] = 'upright'
stroke_variant = np.random.choice(['stroke', 'shadow', 'none'], p=[0.8, 0.15, 0.05])
if stroke_variant == 'stroke':
params['stroke_size'] = np.random.choice([1, 2, 3, 4, 8])
params['stroke_color'] = 'white'
elif stroke_variant == 'shadow':
params['shadow_size'] = np.random.choice([2, 5, 10])
params['shadow_color'] = 'white' if np.random.rand() < 0.8 else 'black'
elif stroke_variant == 'none':
pass
return params
def render_background(self, img):
"""Add background and/or text bubble to a BGRA image, crop and return as BGR image."""
draw_bubble = np.random.random() < 0.7
m0 = int(min(img.shape[:2]) * 0.3)
img = crop_by_alpha(img, m0)
background_path = self.background_df.sample(1).iloc[0].path
background = cv2.imread(background_path)
t = [
A.HorizontalFlip(),
A.RandomRotate90(),
A.InvertImg(),
A.RandomBrightnessContrast((-0.2, 0.4), (-0.8, -0.3), p=0.5 if draw_bubble else 1),
A.Blur((3, 5), p=0.3),
A.Resize(img.shape[0], img.shape[1]),
]
background = A.Compose(t)(image=background)['image']
if not draw_bubble:
if np.random.rand() < 0.5:
img[:, :, :3] = 255 - img[:, :, :3]
else:
radius = np.random.uniform(0.7, 1.)
thickness = np.random.choice([1, 2, 3])
alpha = np.random.randint(60, 100)
sigma = np.random.randint(10, 15)
ymin = m0 - int(min(img.shape[:2]) * np.random.uniform(0.07, 0.12))
ymax = img.shape[0] - m0 + int(min(img.shape[:2]) * np.random.uniform(0.07, 0.12))
xmin = m0 - int(min(img.shape[:2]) * np.random.uniform(0.07, 0.12))
xmax = img.shape[1] - m0 + int(min(img.shape[:2]) * np.random.uniform(0.07, 0.12))
bubble_fill_color = (255, 255, 255, 255)
bubble_contour_color = (0, 0, 0, 255)
bubble = np.zeros((img.shape[0], img.shape[1], 4), dtype=np.uint8)
bubble = rounded_rectangle(bubble, (xmin, ymin), (xmax, ymax), radius=radius, color=bubble_fill_color,
thickness=-1)
bubble = rounded_rectangle(bubble, (xmin, ymin), (xmax, ymax), radius=radius, color=bubble_contour_color,
thickness=thickness)
t = [
A.ElasticTransform(alpha=alpha, sigma=sigma, alpha_affine=0, p=0.8),
]
bubble = A.Compose(t)(image=bubble)['image']
background = blend(bubble, background)
img = blend(img, background)
ymin = m0 - int(min(img.shape[:2]) * np.random.uniform(0.01, 0.2))
ymax = img.shape[0] - m0 + int(min(img.shape[:2]) * np.random.uniform(0.01, 0.2))
xmin = m0 - int(min(img.shape[:2]) * np.random.uniform(0.01, 0.2))
xmax = img.shape[1] - m0 + int(min(img.shape[:2]) * np.random.uniform(0.01, 0.2))
img = img[ymin:ymax, xmin:xmax]
return img
def lines_to_html(self, lines):
lines_str = '\n'.join(['<p>' + line + '</p>' for line in lines])
html = f"<html><body>\n{lines_str}\n</body></html>"
return html
def crop_by_alpha(img, margin):
y, x = np.where(img[:, :, 3] > 0)
ymin = y.min()
ymax = y.max()
xmin = x.min()
xmax = x.max()
img = img[ymin:ymax, xmin:xmax]
img = np.pad(img, ((margin, margin), (margin, margin), (0, 0)))
return img
def blend(img, background):
alpha = (img[:, :, 3] / 255)[:, :, np.newaxis]
img = img[:, :, :3]
img = (background * (1 - alpha) + img * alpha).astype(np.uint8)
return img
def rounded_rectangle(src, top_left, bottom_right, radius=1, color=255, thickness=1, line_type=cv2.LINE_AA):
"""From https://stackoverflow.com/a/60210706"""
# corners:
# p1 - p2
# | |
# p4 - p3
p1 = top_left
p2 = (bottom_right[0], top_left[1])
p3 = bottom_right
p4 = (top_left[0], bottom_right[1])
height = abs(bottom_right[1] - top_left[1])
width = abs(bottom_right[0] - top_left[0])
if radius > 1:
radius = 1
corner_radius = int(radius * (min(height, width) / 2))
if thickness < 0:
# big rect
top_left_main_rect = (int(p1[0] + corner_radius), int(p1[1]))
bottom_right_main_rect = (int(p3[0] - corner_radius), int(p3[1]))
top_left_rect_left = (p1[0], p1[1] + corner_radius)
bottom_right_rect_left = (p4[0] + corner_radius, p4[1] - corner_radius)
top_left_rect_right = (p2[0] - corner_radius, p2[1] + corner_radius)
bottom_right_rect_right = (p3[0], p3[1] - corner_radius)
all_rects = [
[top_left_main_rect, bottom_right_main_rect],
[top_left_rect_left, bottom_right_rect_left],
[top_left_rect_right, bottom_right_rect_right]]
[cv2.rectangle(src, rect[0], rect[1], color, thickness) for rect in all_rects]
# draw straight lines
cv2.line(src, (p1[0] + corner_radius, p1[1]), (p2[0] - corner_radius, p2[1]), color, abs(thickness), line_type)
cv2.line(src, (p2[0], p2[1] + corner_radius), (p3[0], p3[1] - corner_radius), color, abs(thickness), line_type)
cv2.line(src, (p3[0] - corner_radius, p4[1]), (p4[0] + corner_radius, p3[1]), color, abs(thickness), line_type)
cv2.line(src, (p4[0], p4[1] - corner_radius), (p1[0], p1[1] + corner_radius), color, abs(thickness), line_type)
# draw arcs
cv2.ellipse(src, (p1[0] + corner_radius, p1[1] + corner_radius), (corner_radius, corner_radius), 180.0, 0, 90,
color, thickness, line_type)
cv2.ellipse(src, (p2[0] - corner_radius, p2[1] + corner_radius), (corner_radius, corner_radius), 270.0, 0, 90,
color, thickness, line_type)
cv2.ellipse(src, (p3[0] - corner_radius, p3[1] - corner_radius), (corner_radius, corner_radius), 0.0, 0, 90, color,
thickness, line_type)
cv2.ellipse(src, (p4[0] + corner_radius, p4[1] - corner_radius), (corner_radius, corner_radius), 90.0, 0, 90, color,
thickness, line_type)
return src
def get_css(
font_size,
font_path,
vertical=True,
background_color='white',
text_color='black',
shadow_size=0,
shadow_color='black',
stroke_size=0,
stroke_color='black',
letter_spacing=None,
line_height=0.5,
text_orientation=None,
):
styles = [
f"background-color: {background_color};",
f"font-size: {font_size}px;",
f"color: {text_color};",
"font-family: custom;",
f"line-height: {line_height};",
"margin: 20px;",
]
if text_orientation:
styles.append(f"text-orientation: {text_orientation};")
if vertical:
styles.append("writing-mode: vertical-rl;")
if shadow_size > 0:
styles.append(f"text-shadow: 0 0 {shadow_size}px {shadow_color};")
if stroke_size > 0:
# stroke is simulated by shadow overlaid multiple times
styles.extend([
f"text-shadow: " + ','.join([f"0 0 {stroke_size}px {stroke_color}"] * 10 * stroke_size) + ";",
"-webkit-font-smoothing: antialiased;",
])
if letter_spacing:
styles.append(f"letter-spacing: {letter_spacing}em;")
font_path = font_path.replace('\\', '/')
styles_str = '\n'.join(styles)
css = ""
css += '\n@font-face {\nfont-family: custom;\nsrc: url("' + font_path + '");\n}\n'
css += "body {\n" + styles_str + "\n}"
return css

View File

@@ -1,64 +0,0 @@
import traceback
from pathlib import Path
import cv2
import fire
import pandas as pd
from tqdm.contrib.concurrent import thread_map
from manga_ocr_dev.env import FONTS_ROOT, DATA_SYNTHETIC_ROOT
from manga_ocr_dev.synthetic_data_generator.generator import SyntheticDataGenerator
generator = SyntheticDataGenerator()
def f(args):
try:
i, source, id_, text = args
filename = f'{id_}.jpg'
img, text_gt, params = generator.process(text)
cv2.imwrite(str(OUT_DIR / filename), img)
font_path = Path(params['font_path']).relative_to(FONTS_ROOT)
ret = source, id_, text_gt, params['vertical'], str(font_path)
return ret
except Exception as e:
print(traceback.format_exc())
def run(package=0, n_random=1000, n_limit=None, max_workers=16):
"""
:param package: number of data package to generate
:param n_random: how many samples with random text to generate
:param n_limit: limit number of generated samples (for debugging)
:param max_workers: max number of workers
"""
package = f'{package:04d}'
lines = pd.read_csv(DATA_SYNTHETIC_ROOT / f'lines/{package}.csv')
random_lines = pd.DataFrame({
'source': 'random',
'id': [f'random_{package}_{i}' for i in range(n_random)],
'line': None
})
lines = pd.concat([lines, random_lines], ignore_index=True)
if n_limit:
lines = lines.sample(n_limit)
args = [(i, *values) for i, values in enumerate(lines.values)]
global OUT_DIR
OUT_DIR = DATA_SYNTHETIC_ROOT / 'img' / package
OUT_DIR.mkdir(parents=True, exist_ok=True)
data = thread_map(f, args, max_workers=max_workers, desc=f'Processing package {package}')
data = pd.DataFrame(data, columns=['source', 'id', 'text', 'vertical', 'font_path'])
meta_path = DATA_SYNTHETIC_ROOT / f'meta/{package}.csv'
meta_path.parent.mkdir(parents=True, exist_ok=True)
data.to_csv(meta_path, index=False)
if __name__ == '__main__':
fire.Fire(run)

View File

@@ -1,72 +0,0 @@
import PIL
import numpy as np
import pandas as pd
from PIL import ImageDraw, ImageFont
from fontTools.ttLib import TTFont
from tqdm.contrib.concurrent import process_map
from manga_ocr_dev.env import ASSETS_PATH, FONTS_ROOT
vocab = pd.read_csv(ASSETS_PATH / 'vocab.csv').char.values
def has_glyph(font, glyph):
for table in font['cmap'].tables:
if ord(glyph) in table.cmap.keys():
return True
return False
def process(font_path):
"""
Get supported characters list for a given font.
Font metadata is not always reliable, so try to render each character and see if anything shows up.
Still not perfect, because sometimes unsupported characters show up as rectangles.
"""
try:
font_path = str(font_path)
ttfont = TTFont(font_path)
pil_font = ImageFont.truetype(font_path, 24)
supported_chars = []
for char in vocab:
if not has_glyph(ttfont, char):
continue
image = PIL.Image.new('L', (40, 40), 255)
draw = ImageDraw.Draw(image)
draw.text((10, 0), char, 0, font=pil_font)
if (np.array(image) != 255).sum() == 0:
continue
supported_chars.append(char)
supported_chars = ''.join(supported_chars)
except Exception as e:
print(f'Error while processing {font_path}: {e}')
supported_chars = ''
return supported_chars
def main():
path_in = FONTS_ROOT
out_path = ASSETS_PATH / 'fonts.csv'
suffixes = {'.TTF', '.otf', '.ttc', '.ttf'}
font_paths = [path for path in path_in.glob('**/*') if
path.suffix in suffixes]
data = process_map(process, font_paths, max_workers=16)
font_paths = [str(path.relative_to(FONTS_ROOT)) for path in font_paths]
data = pd.DataFrame({'font_path': font_paths, 'supported_chars': data})
data['num_chars'] = data.supported_chars.str.len()
data['label'] = 'regular'
data.to_csv(out_path, index=False)
if __name__ == '__main__':
main()

View File

@@ -1,54 +0,0 @@
import pandas as pd
import unicodedata
from manga_ocr_dev.env import ASSETS_PATH, FONTS_ROOT
def get_background_df(background_dir):
background_df = []
for path in background_dir.iterdir():
ymin, ymax, xmin, xmax = [int(v) for v in path.stem.split('_')[-4:]]
h = ymax - ymin
w = xmax - xmin
ratio = w / h
background_df.append({
'path': str(path),
'h': h,
'w': w,
'ratio': ratio,
})
background_df = pd.DataFrame(background_df)
return background_df
def is_kanji(ch):
return 'CJK UNIFIED IDEOGRAPH' in unicodedata.name(ch)
def is_hiragana(ch):
return 'HIRAGANA' in unicodedata.name(ch)
def is_katakana(ch):
return 'KATAKANA' in unicodedata.name(ch)
def is_ascii(ch):
return ord(ch) < 128
def get_charsets(vocab_path=None):
if vocab_path is None:
vocab_path = ASSETS_PATH / 'vocab.csv'
vocab = pd.read_csv(vocab_path).char.values
hiragana = vocab[[is_hiragana(c) for c in vocab]][:-6]
katakana = vocab[[is_katakana(c) for c in vocab]][3:]
return vocab, hiragana, katakana
def get_font_meta():
df = pd.read_csv(ASSETS_PATH / 'fonts.csv')
df.font_path = df.font_path.apply(lambda x: str(FONTS_ROOT / x))
font_map = {row.font_path: set(row.supported_chars) for row in df.dropna().itertuples()}
return df, font_map

View File

@@ -1,165 +0,0 @@
import albumentations as A
import cv2
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch
from torch.utils.data import Dataset
from manga_ocr_dev.env import MANGA109_ROOT, DATA_SYNTHETIC_ROOT
class MangaDataset(Dataset):
def __init__(self, processor, split, max_target_length, limit_size=None, augment=False, skip_packages=None):
self.processor = processor
self.max_target_length = max_target_length
data = []
print(f'Initializing dataset {split}...')
if skip_packages is None:
skip_packages = set()
else:
skip_packages = {f'{x:04d}' for x in skip_packages}
for path in sorted((DATA_SYNTHETIC_ROOT / 'meta').glob('*.csv')):
if path.stem in skip_packages:
print(f'Skipping package {path}')
continue
if not (DATA_SYNTHETIC_ROOT / 'img' / path.stem).is_dir():
print(f'Missing image data for package {path}, skipping')
continue
df = pd.read_csv(path)
df = df.dropna()
df['path'] = df.id.apply(lambda x: str(DATA_SYNTHETIC_ROOT / 'img' / path.stem / f'{x}.jpg'))
df = df[['path', 'text']]
df['synthetic'] = True
data.append(df)
df = pd.read_csv(MANGA109_ROOT / 'data.csv')
df = df[df.split == split].reset_index(drop=True)
df['path'] = df.crop_path.apply(lambda x: str(MANGA109_ROOT / x))
df = df[['path', 'text']]
df['synthetic'] = False
data.append(df)
data = pd.concat(data, ignore_index=True)
if limit_size:
data = data.iloc[:limit_size]
self.data = data
print(f'Dataset {split}: {len(self.data)}')
self.augment = augment
self.transform_medium, self.transform_heavy = self.get_transforms()
def __len__(self):
return len(self.data)
def __getitem__(self, idx):
sample = self.data.loc[idx]
text = sample.text
if self.augment:
medium_p = 0.8
heavy_p = 0.02
transform_variant = np.random.choice(['none', 'medium', 'heavy'],
p=[1 - medium_p - heavy_p, medium_p, heavy_p])
transform = {
'none': None,
'medium': self.transform_medium,
'heavy': self.transform_heavy,
}[transform_variant]
else:
transform = None
pixel_values = self.read_image(self.processor, sample.path, transform)
labels = self.processor.tokenizer(text,
padding="max_length",
max_length=self.max_target_length,
truncation=True).input_ids
labels = np.array(labels)
# important: make sure that PAD tokens are ignored by the loss function
labels[labels == self.processor.tokenizer.pad_token_id] = -100
encoding = {
"pixel_values": pixel_values,
"labels": torch.tensor(labels),
}
return encoding
@staticmethod
def read_image(processor, path, transform=None):
img = cv2.imread(str(path))
if transform is None:
transform = A.ToGray(always_apply=True)
img = transform(image=img)['image']
pixel_values = processor(img, return_tensors="pt").pixel_values
return pixel_values.squeeze()
@staticmethod
def get_transforms():
t_medium = A.Compose([
A.Rotate(5, border_mode=cv2.BORDER_REPLICATE, p=0.2),
A.Perspective((0.01, 0.05), pad_mode=cv2.BORDER_REPLICATE, p=0.2),
A.InvertImg(p=0.05),
A.OneOf([
A.Downscale(0.25, 0.5, interpolation=cv2.INTER_LINEAR),
A.Downscale(0.25, 0.5, interpolation=cv2.INTER_NEAREST),
], p=0.1),
A.Blur(p=0.2),
A.Sharpen(p=0.2),
A.RandomBrightnessContrast(p=0.5),
A.GaussNoise((50, 200), p=0.3),
A.ImageCompression(0, 30, p=0.1),
A.ToGray(always_apply=True),
])
t_heavy = A.Compose([
A.Rotate(10, border_mode=cv2.BORDER_REPLICATE, p=0.2),
A.Perspective((0.01, 0.05), pad_mode=cv2.BORDER_REPLICATE, p=0.2),
A.InvertImg(p=0.05),
A.OneOf([
A.Downscale(0.1, 0.2, interpolation=cv2.INTER_LINEAR),
A.Downscale(0.1, 0.2, interpolation=cv2.INTER_NEAREST),
], p=0.1),
A.Blur((4, 9), p=0.5),
A.Sharpen(p=0.5),
A.RandomBrightnessContrast(0.8, 0.8, p=1),
A.GaussNoise((1000, 10000), p=0.3),
A.ImageCompression(0, 10, p=0.5),
A.ToGray(always_apply=True),
])
return t_medium, t_heavy
if __name__ == '__main__':
from manga_ocr_dev.training.get_model import get_processor
from manga_ocr_dev.training.utils import tensor_to_image
encoder_name = 'facebook/deit-tiny-patch16-224'
decoder_name = 'cl-tohoku/bert-base-japanese-char-v2'
max_length = 300
processor = get_processor(encoder_name, decoder_name)
ds = MangaDataset(processor, 'train', max_length, augment=True)
for i in range(20):
sample = ds[0]
img = tensor_to_image(sample['pixel_values'])
tokens = sample['labels']
tokens[tokens == -100] = processor.tokenizer.pad_token_id
text = ''.join(processor.decode(tokens, skip_special_tokens=True).split())
print(f'{i}:\n{text}\n')
plt.imshow(img)
plt.show()

View File

@@ -1,63 +0,0 @@
from transformers import AutoConfig, AutoModelForCausalLM, AutoModel, TrOCRProcessor, VisionEncoderDecoderModel, \
    AutoFeatureExtractor, AutoTokenizer, VisionEncoderDecoderConfig


class TrOCRProcessorCustom(TrOCRProcessor):
    """The only point of this class is to bypass the type checks of the base class."""

    def __init__(self, feature_extractor, tokenizer):
        self.feature_extractor = feature_extractor
        self.tokenizer = tokenizer
        self.current_processor = self.feature_extractor


def get_processor(encoder_name, decoder_name):
    feature_extractor = AutoFeatureExtractor.from_pretrained(encoder_name)
    tokenizer = AutoTokenizer.from_pretrained(decoder_name)
    processor = TrOCRProcessorCustom(feature_extractor, tokenizer)
    return processor


def get_model(encoder_name, decoder_name, max_length, num_decoder_layers=None):
    encoder_config = AutoConfig.from_pretrained(encoder_name)
    encoder_config.is_decoder = False
    encoder_config.add_cross_attention = False
    encoder = AutoModel.from_config(encoder_config)

    decoder_config = AutoConfig.from_pretrained(decoder_name)
    decoder_config.max_length = max_length
    decoder_config.is_decoder = True
    decoder_config.add_cross_attention = True
    decoder = AutoModelForCausalLM.from_config(decoder_config)

    if num_decoder_layers is not None:
        # keep only the last num_decoder_layers transformer layers of the decoder
        if decoder_config.model_type == 'bert':
            decoder.bert.encoder.layer = decoder.bert.encoder.layer[-num_decoder_layers:]
        elif decoder_config.model_type in ('roberta', 'xlm-roberta'):
            decoder.roberta.encoder.layer = decoder.roberta.encoder.layer[-num_decoder_layers:]
        else:
            raise ValueError(f'Unsupported model_type: {decoder_config.model_type}')
        decoder_config.num_hidden_layers = num_decoder_layers

    config = VisionEncoderDecoderConfig.from_encoder_decoder_configs(encoder.config, decoder.config)
    config.tie_word_embeddings = False
    model = VisionEncoderDecoderModel(encoder=encoder, decoder=decoder, config=config)

    processor = get_processor(encoder_name, decoder_name)

    # set special tokens used for creating the decoder_input_ids from the labels
    model.config.decoder_start_token_id = processor.tokenizer.cls_token_id
    model.config.pad_token_id = processor.tokenizer.pad_token_id
    # make sure vocab size is set correctly
    model.config.vocab_size = model.config.decoder.vocab_size

    # set beam search parameters
    model.config.eos_token_id = processor.tokenizer.sep_token_id
    model.config.max_length = max_length
    model.config.early_stopping = True
    model.config.no_repeat_ngram_size = 3
    model.config.length_penalty = 2.0
    model.config.num_beams = 4

    return model, processor
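
As a hedged illustration (not part of the deleted file), the factory above could be exercised with the same defaults the training script below uses; the module path is taken from the imports appearing elsewhere in this commit:

    from manga_ocr_dev.training.get_model import get_model

    # Builds a DeiT-tiny encoder paired with a character-level Japanese BERT decoder
    # truncated to its last 2 layers; all values mirror the defaults of run() below.
    model, processor = get_model(
        encoder_name='facebook/deit-tiny-patch16-224',
        decoder_name='cl-tohoku/bert-base-japanese-char-v2',
        max_length=300,
        num_decoder_layers=2,
    )
    # processor.feature_extractor preprocesses images and processor.tokenizer handles text,
    # matching how MangaDataset uses the processor above.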

View File

@@ -1,32 +0,0 @@
import numpy as np
from datasets import load_metric


class Metrics:
    def __init__(self, processor):
        self.cer_metric = load_metric("cer")
        self.processor = processor

    def compute_metrics(self, pred):
        label_ids = pred.label_ids
        pred_ids = pred.predictions
        print(label_ids.shape, pred_ids.shape)

        pred_str = self.processor.batch_decode(pred_ids, skip_special_tokens=True)
        label_ids[label_ids == -100] = self.processor.tokenizer.pad_token_id
        label_str = self.processor.batch_decode(label_ids, skip_special_tokens=True)

        pred_str = np.array([''.join(text.split()) for text in pred_str])
        label_str = np.array([''.join(text.split()) for text in label_str])

        results = {}
        try:
            results['cer'] = self.cer_metric.compute(predictions=pred_str, references=label_str)
        except Exception as e:
            print(e)
            print(pred_str)
            print(label_str)
            results['cer'] = 0
        results['accuracy'] = (pred_str == label_str).mean()

        return results

View File

@@ -1,64 +0,0 @@
import fire
import wandb
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments, default_data_collator

from manga_ocr_dev.env import TRAIN_ROOT
from manga_ocr_dev.training.dataset import MangaDataset
from manga_ocr_dev.training.get_model import get_model
from manga_ocr_dev.training.metrics import Metrics


def run(
        run_name='debug',
        encoder_name='facebook/deit-tiny-patch16-224',
        decoder_name='cl-tohoku/bert-base-japanese-char-v2',
        max_len=300,
        num_decoder_layers=2,
        batch_size=64,
        num_epochs=8,
        fp16=True,
):
    wandb.login()

    model, processor = get_model(encoder_name, decoder_name, max_len, num_decoder_layers)

    # keep package 0 for validation
    train_dataset = MangaDataset(processor, 'train', max_len, augment=True, skip_packages=[0])
    eval_dataset = MangaDataset(processor, 'test', max_len, augment=False, skip_packages=range(1, 9999))

    metrics = Metrics(processor)

    training_args = Seq2SeqTrainingArguments(
        predict_with_generate=True,
        evaluation_strategy='steps',
        save_strategy='steps',
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        fp16=fp16,
        fp16_full_eval=fp16,
        dataloader_num_workers=16,
        output_dir=TRAIN_ROOT,
        logging_steps=10,
        save_steps=20000,
        eval_steps=20000,
        num_train_epochs=num_epochs,
        run_name=run_name
    )

    # instantiate trainer
    trainer = Seq2SeqTrainer(
        model=model,
        tokenizer=processor.feature_extractor,
        args=training_args,
        compute_metrics=metrics.compute_metrics,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        data_collator=default_data_collator,
    )
    trainer.train()
    wandb.finish()


if __name__ == '__main__':
    fire.Fire(run)
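
Because the entry point is wrapped in fire.Fire(run), each keyword argument of run() is exposed as a command-line flag, and anything omitted falls back to the defaults in the signature above. A hypothetical invocation (the module path is an assumption, not stated in this diff) might look like:

    python -m manga_ocr_dev.training.train --run_name=my_experiment --batch_size=32 --fp16=False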

View File

@@ -1,27 +0,0 @@
import numpy as np
import torch
from torchinfo import summary


def encoder_summary(model, batch_size=4):
    img_size = model.config.encoder.image_size
    return summary(model.encoder, input_size=(batch_size, 3, img_size, img_size), depth=3,
                   col_names=["output_size", "num_params", "mult_adds"], device='cpu')


def decoder_summary(model, batch_size=4):
    img_size = model.config.encoder.image_size
    encoder_hidden_shape = (batch_size, (img_size // 16) ** 2 + 1, model.config.decoder.hidden_size)
    decoder_inputs = {
        'input_ids': torch.zeros(batch_size, 1, dtype=torch.int64),
        'attention_mask': torch.ones(batch_size, 1, dtype=torch.int64),
        'encoder_hidden_states': torch.rand(encoder_hidden_shape, dtype=torch.float32),
        'return_dict': False
    }
    return summary(model.decoder, input_data=decoder_inputs, depth=4,
                   col_names=["output_size", "num_params", "mult_adds"],
                   device='cpu')


def tensor_to_image(img):
    return ((img.cpu().numpy() + 1) / 2 * 255).clip(0, 255).astype(np.uint8).transpose(1, 2, 0)

View File

@@ -11,4 +11,6 @@ unidic_lite
google-cloud-vision
azure-cognitiveservices-vision-computervision
pyobjc
pynput
easyocr
paddleocr
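
For reference, a minimal sketch of how the two newly added dependencies are typically called; the language codes and image path are illustrative, and this is not necessarily how the project wires them in:

    import easyocr
    from paddleocr import PaddleOCR

    # EasyOCR: build a Japanese reader, then run detection + recognition on an image
    reader = easyocr.Reader(['ja'])
    easyocr_lines = reader.readtext('example_page.jpg')

    # PaddleOCR: 'japan' is PaddleOCR's code for Japanese; returns boxes, text and scores
    paddle = PaddleOCR(lang='japan')
    paddle_lines = paddle.ocr('example_page.jpg')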

View File

View File

@@ -1,50 +0,0 @@
[
{
"filename": "00.jpg",
"result": "素直にあやまるしか"
},
{
"filename": "01.jpg",
"result": "立川で見た〝穴〟の下の巨大な眼は:"
},
{
"filename": "02.jpg",
"result": "実戦剣術も一流です"
},
{
"filename": "03.jpg",
"result": "第30話重苦しい闇の奥で静かに呼吸づきながら"
},
{
"filename": "04.jpg",
"result": "きのうハンパーヶとって、ゴメン!!!"
},
{
"filename": "05.jpg",
"result": "ぎゃっ"
},
{
"filename": "06.jpg",
"result": "ピンポーーン"
},
{
"filename": "07.jpg",
"result": "LINK!私達7人の力でガノンの塔の結界をやぶります"
},
{
"filename": "08.jpg",
"result": "ファイアパンチ"
},
{
"filename": "09.jpg",
"result": "少し黙っている"
},
{
"filename": "10.jpg",
"result": "わかるかな〜?"
},
{
"filename": "11.jpg",
"result": "警察にも先生にも町中の人達に!!"
}
]

[12 binary image files deleted; contents not shown]

View File

@@ -1,25 +0,0 @@
import json
from pathlib import Path

from tqdm import tqdm

from manga_ocr import MangaOcr

TEST_DATA_ROOT = Path(__file__).parent / 'data'


def generate_expected_results():
    mocr = MangaOcr()

    results = []
    for path in tqdm(sorted((TEST_DATA_ROOT / 'images').iterdir())):
        result = mocr(path)
        results.append({'filename': path.name, 'result': result})

    (TEST_DATA_ROOT / 'expected_results.json').write_text(json.dumps(results, ensure_ascii=False, indent=2),
                                                          encoding='utf-8')


if __name__ == '__main__':
    generate_expected_results()

View File

@@ -1,16 +0,0 @@
import json
from pathlib import Path

from manga_ocr import MangaOcr

TEST_DATA_ROOT = Path(__file__).parent / 'data'


def test_ocr():
    mocr = MangaOcr()

    expected_results = json.loads((TEST_DATA_ROOT / 'expected_results.json').read_text(encoding='utf-8'))

    for item in expected_results:
        result = mocr(TEST_DATA_ROOT / 'images' / item['filename'])
        assert result == item['result']
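
A usage note (assumption, not part of the diff): test_ocr follows pytest conventions, a module-level test_* function with bare asserts, so it would typically be collected and run with a command such as pytest -k test_ocr from the repository root, comparing fresh MangaOcr output against the expected_results.json fixture regenerated by the script above.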