Files
owocr/manga_ocr_dev/synthetic_data_generator/utils.py
2022-02-09 20:39:37 +01:00

55 lines
1.4 KiB
Python

import pandas as pd
import unicodedata
from manga_ocr_dev.env import ASSETS_PATH, FONTS_ROOT
def get_background_df(background_dir):
background_df = []
for path in background_dir.iterdir():
ymin, ymax, xmin, xmax = [int(v) for v in path.stem.split('_')[-4:]]
h = ymax - ymin
w = xmax - xmin
ratio = w / h
background_df.append({
'path': str(path),
'h': h,
'w': w,
'ratio': ratio,
})
background_df = pd.DataFrame(background_df)
return background_df
def is_kanji(ch):
return 'CJK UNIFIED IDEOGRAPH' in unicodedata.name(ch)
def is_hiragana(ch):
return 'HIRAGANA' in unicodedata.name(ch)
def is_katakana(ch):
return 'KATAKANA' in unicodedata.name(ch)
def is_ascii(ch):
return ord(ch) < 128
def get_charsets(vocab_path=None):
if vocab_path is None:
vocab_path = ASSETS_PATH / 'vocab.csv'
vocab = pd.read_csv(vocab_path).char.values
hiragana = vocab[[is_hiragana(c) for c in vocab]][:-6]
katakana = vocab[[is_katakana(c) for c in vocab]][3:]
return vocab, hiragana, katakana
def get_font_meta():
df = pd.read_csv(ASSETS_PATH / 'fonts.csv')
df.font_path = df.font_path.apply(lambda x: str(FONTS_ROOT / x))
font_map = {row.font_path: set(row.supported_chars) for row in df.dropna().itertuples()}
return df, font_map