73 lines
2.0 KiB
Python
73 lines
2.0 KiB
Python
import PIL
|
|
import numpy as np
|
|
import pandas as pd
|
|
from PIL import ImageDraw, ImageFont
|
|
from fontTools.ttLib import TTFont
|
|
from tqdm.contrib.concurrent import process_map
|
|
|
|
from manga_ocr_dev.env import ASSETS_PATH, FONTS_ROOT
|
|
|
|
# Candidate characters tested against every font: the `char` column of
# vocab.csv, read once when the module is imported.
vocab = pd.read_csv(ASSETS_PATH / 'vocab.csv')['char'].values
|
|
|
|
|
|
def has_glyph(font, glyph):
    """
    Tell whether any cmap subtable of `font` maps the code point of `glyph`.

    `font` is indexed with 'cmap'; the result must expose `.tables`, and each
    table a `.cmap` mapping keyed by integer code points. `glyph` is a single
    character (it is passed to `ord`).
    """
    return any(ord(glyph) in subtable.cmap for subtable in font['cmap'].tables)
|
|
|
|
|
|
def process(font_path):
    """
    Get supported characters list for a given font.

    Font metadata is not always reliable, so a character only counts as supported
    if the font's cmap lists it AND rendering it with PIL leaves at least one
    non-white pixel. Still not perfect, because sometimes unsupported characters
    show up as rectangles.

    Args:
        font_path: Path (str or path-like) of the font file to inspect.

    Returns:
        A string made of every character of the module-level `vocab` that the
        font supports, in vocab order. If the font cannot be processed, the
        error is printed (not raised) and an empty string is returned, so one
        bad font does not abort a batch run.
    """
    try:
        font_path = str(font_path)
        # fontNumber=0 picks the first face of a TrueType Collection (.ttc).
        # Without it TTFont refuses to open collections, so every .ttc ended up
        # with no supported characters. It matches PIL's default face index and
        # is ignored for single-font files.
        ttfont = TTFont(font_path, fontNumber=0)
        try:
            pil_font = ImageFont.truetype(font_path, 24)

            supported_chars = []
            for char in vocab:
                if not has_glyph(ttfont, char):
                    continue

                # Draw the character in black on a white canvas; if every pixel
                # is still white, nothing was rendered for it.
                image = PIL.Image.new('L', (40, 40), 255)
                draw = ImageDraw.Draw(image)
                draw.text((10, 0), char, 0, font=pil_font)
                if (np.array(image) != 255).any():
                    supported_chars.append(char)
        finally:
            # Release the font file held by TTFont.
            ttfont.close()

        supported_chars = ''.join(supported_chars)
    except Exception as e:
        print(f'Error while processing {font_path}: {e}')
        supported_chars = ''

    return supported_chars
|
|
|
|
|
|
def main():
    """
    Build fonts.csv: one row per font file found under FONTS_ROOT.

    Every file below FONTS_ROOT with a .ttf/.otf/.ttc extension is passed to
    `process` in a pool of worker processes. The resulting table is written to
    ASSETS_PATH / 'fonts.csv' with these columns:
        font_path        path relative to FONTS_ROOT
        supported_chars  string returned by `process` for that font
        num_chars        length of supported_chars
        label            set to 'regular' for every row
    """
    out_path = ASSETS_PATH / 'fonts.csv'

    # Compare extensions case-insensitively so files such as 'X.OTF' or 'X.TTC'
    # are not silently skipped.
    suffixes = {'.otf', '.ttc', '.ttf'}
    # Sort so that the row order of the CSV does not depend on the
    # filesystem-dependent order in which glob yields entries.
    font_paths = sorted(
        path for path in FONTS_ROOT.glob('**/*')
        if path.suffix.lower() in suffixes
    )

    supported = process_map(process, font_paths, max_workers=16)

    data = pd.DataFrame({
        'font_path': [str(path.relative_to(FONTS_ROOT)) for path in font_paths],
        'supported_chars': supported,
    })
    data['num_chars'] = data.supported_chars.str.len()
    data['label'] = 'regular'
    data.to_csv(out_path, index=False)


# Run the font scan only when executed as a script, not when imported.
if __name__ == '__main__':
    main()
|