55 lines
1.4 KiB
Python
55 lines
1.4 KiB
Python
import pandas as pd
|
|
import unicodedata
|
|
|
|
from manga_ocr_dev.env import ASSETS_PATH, FONTS_ROOT
|
|
|
|
|
|
def get_background_df(background_dir):
    """Build a table describing every entry found in ``background_dir``.

    The stem of each entry (file name without its suffix) must end with four
    underscore-separated integers, read in the order
    ``ymin, ymax, xmin, xmax``. Height, width and aspect ratio are derived
    from them.

    Args:
        background_dir: Directory object exposing ``iterdir()``
            (e.g. a ``pathlib.Path``).

    Returns:
        ``pandas.DataFrame`` with one row per directory entry and the columns
        ``path`` (entry path as ``str``), ``h`` (``ymax - ymin``),
        ``w`` (``xmax - xmin``) and ``ratio`` (``w / h``). The four columns
        are present even when the directory is empty.

    Raises:
        ValueError: If a stem does not end with four integer fields.
        ZeroDivisionError: If an entry has ``ymax == ymin``.
    """
    # Fixing the column list keeps the schema stable for an empty directory;
    # without it pandas would return a frame with no columns at all.
    columns = ['path', 'h', 'w', 'ratio']

    rows = []
    for path in background_dir.iterdir():
        ymin, ymax, xmin, xmax = (int(v) for v in path.stem.split('_')[-4:])
        h = ymax - ymin
        w = xmax - xmin
        rows.append({
            'path': str(path),
            'h': h,
            'w': w,
            'ratio': w / h,
        })

    return pd.DataFrame(rows, columns=columns)
|
|
|
|
|
|
def is_kanji(ch):
    """Return True if ``ch`` is named as a CJK unified ideograph.

    The check is a substring match on the character's Unicode name.

    Args:
        ch: A single-character string.

    Returns:
        ``True`` when the Unicode name of ``ch`` contains
        ``'CJK UNIFIED IDEOGRAPH'``, otherwise ``False``.
    """
    # Code points without a Unicode name (control characters such as a line
    # feed, unassigned code points) make unicodedata.name() raise ValueError
    # unless a default is supplied; they are simply "not kanji".
    return 'CJK UNIFIED IDEOGRAPH' in unicodedata.name(ch, '')
|
|
|
|
|
|
def is_hiragana(ch):
    """Return True if the Unicode name of ``ch`` mentions HIRAGANA.

    The check is a substring match on the character's Unicode name, so
    characters shared between both kana scripts (whose names contain
    ``KATAKANA-HIRAGANA``) also match.

    Args:
        ch: A single-character string.

    Returns:
        ``True`` when the Unicode name of ``ch`` contains ``'HIRAGANA'``,
        otherwise ``False``.
    """
    # Code points without a Unicode name (control characters such as a line
    # feed, unassigned code points) make unicodedata.name() raise ValueError
    # unless a default is supplied; they are simply "not hiragana".
    return 'HIRAGANA' in unicodedata.name(ch, '')
|
|
|
|
|
|
def is_katakana(ch):
    """Return True if the Unicode name of ``ch`` mentions KATAKANA.

    The check is a substring match on the character's Unicode name, so
    half-width katakana and characters shared between both kana scripts
    (whose names contain ``KATAKANA-HIRAGANA``) also match.

    Args:
        ch: A single-character string.

    Returns:
        ``True`` when the Unicode name of ``ch`` contains ``'KATAKANA'``,
        otherwise ``False``.
    """
    # Code points without a Unicode name (control characters such as a line
    # feed, unassigned code points) make unicodedata.name() raise ValueError
    # unless a default is supplied; they are simply "not katakana".
    return 'KATAKANA' in unicodedata.name(ch, '')
|
|
|
|
|
|
def is_ascii(ch):
    """Tell whether a single character lies in the 7-bit ASCII range.

    Args:
        ch: A single-character string.

    Returns:
        ``True`` when the code point of ``ch`` is at most 0x7F,
        otherwise ``False``.
    """
    code_point = ord(ch)
    return code_point <= 0x7F
|
|
|
|
|
|
def get_charsets(vocab_path=None):
    """Load the vocabulary and derive its hiragana and katakana subsets.

    Args:
        vocab_path: Location of a CSV file with a ``char`` column. When
            omitted, ``ASSETS_PATH / 'vocab.csv'`` is read.

    Returns:
        A tuple ``(vocab, hiragana, katakana)``:
        ``vocab`` holds the values of the ``char`` column, ``hiragana`` the
        entries accepted by ``is_hiragana`` minus the last six, and
        ``katakana`` the entries accepted by ``is_katakana`` minus the
        first three.
    """
    source = ASSETS_PATH / 'vocab.csv' if vocab_path is None else vocab_path
    vocab = pd.read_csv(source).char.values

    hiragana_mask = [is_hiragana(c) for c in vocab]
    katakana_mask = [is_katakana(c) for c in vocab]

    # NOTE(review): the trimmed counts (last 6 / first 3) presumably drop
    # unwanted entries of the bundled vocab file and depend on its row
    # order -- confirm before using a different vocabulary.
    hiragana = vocab[hiragana_mask][:-6]
    katakana = vocab[katakana_mask][3:]

    return vocab, hiragana, katakana
|
|
|
|
|
|
def get_font_meta():
    """Load the font metadata table and a per-font character lookup.

    Reads ``ASSETS_PATH / 'fonts.csv'`` and rewrites its ``font_path`` column
    so that every entry is the string form of ``FONTS_ROOT / <entry>``.

    Returns:
        A tuple ``(df, font_map)``: ``df`` is the full table with the
        rewritten ``font_path`` column; ``font_map`` maps each ``font_path``
        to the set of elements of that row's ``supported_chars`` value
        (presumably a string of characters -- confirm against the CSV).
        Rows containing any missing value are left out of ``font_map`` but
        kept in ``df``.
    """
    fonts = pd.read_csv(ASSETS_PATH / 'fonts.csv')
    fonts.font_path = fonts.font_path.apply(lambda rel: str(FONTS_ROOT / rel))

    font_map = {}
    for record in fonts.dropna().itertuples():
        font_map[record.font_path] = set(record.supported_chars)

    return fonts, font_map
|