training and synthetic data generation code

2022-02-09 20:39:01 +01:00
parent a9085393f4
commit 975dbf4d5e
42 changed files with 7089 additions and 15 deletions
--- a/manga_ocr_dev/synthetic_data_generator/utils.py
+++ b/manga_ocr_dev/synthetic_data_generator/utils.py
@@ -0,0 +1,54 @@
+import pandas as pd
+import unicodedata
+
+from manga_ocr_dev.env import ASSETS_PATH, FONTS_ROOT
+
+
+def get_background_df(background_dir):
+    background_df = []
+    for path in background_dir.iterdir():
+        ymin, ymax, xmin, xmax = [int(v) for v in path.stem.split('_')[-4:]]
+        h = ymax - ymin
+        w = xmax - xmin
+        ratio = w / h
+
+        background_df.append({
+            'path': str(path),
+            'h': h,
+            'w': w,
+            'ratio': ratio,
+        })
+    background_df = pd.DataFrame(background_df)
+    return background_df
+
+
+def is_kanji(ch):
+    return 'CJK UNIFIED IDEOGRAPH' in unicodedata.name(ch)
+
+
+def is_hiragana(ch):
+    return 'HIRAGANA' in unicodedata.name(ch)
+
+
+def is_katakana(ch):
+    return 'KATAKANA' in unicodedata.name(ch)
+
+
+def is_ascii(ch):
+    return ord(ch) < 128
+
+
+def get_charsets(vocab_path=None):
+    if vocab_path is None:
+        vocab_path = ASSETS_PATH / 'vocab.csv'
+    vocab = pd.read_csv(vocab_path).char.values
+    hiragana = vocab[[is_hiragana(c) for c in vocab]][:-6]
+    katakana = vocab[[is_katakana(c) for c in vocab]][3:]
+    return vocab, hiragana, katakana
+
+
+def get_font_meta():
+    df = pd.read_csv(ASSETS_PATH / 'fonts.csv')
+    df.font_path = df.font_path.apply(lambda x: str(FONTS_ROOT / x))
+    font_map = {row.font_path: set(row.supported_chars) for row in df.dropna().itertuples()}
+    return df, font_map