training and synthetic data generation code
This commit is contained in:
72
manga_ocr_dev/synthetic_data_generator/scan_fonts.py
Normal file
72
manga_ocr_dev/synthetic_data_generator/scan_fonts.py
Normal file
@@ -0,0 +1,72 @@
|
||||
import PIL
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
from PIL import ImageDraw, ImageFont
|
||||
from fontTools.ttLib import TTFont
|
||||
from tqdm.contrib.concurrent import process_map
|
||||
|
||||
from manga_ocr_dev.env import ASSETS_PATH, FONTS_ROOT
|
||||
|
||||
vocab = pd.read_csv(ASSETS_PATH / 'vocab.csv').char.values
|
||||
|
||||
|
||||
def has_glyph(font, glyph):
|
||||
for table in font['cmap'].tables:
|
||||
if ord(glyph) in table.cmap.keys():
|
||||
return True
|
||||
return False
|
||||
|
||||
|
||||
def process(font_path):
|
||||
"""
|
||||
Get supported characters list for a given font.
|
||||
Font metadata is not always reliable, so try to render each character and see if anything shows up.
|
||||
Still not perfect, because sometimes unsupported characters show up as rectangles.
|
||||
"""
|
||||
|
||||
try:
|
||||
font_path = str(font_path)
|
||||
ttfont = TTFont(font_path)
|
||||
pil_font = ImageFont.truetype(font_path, 24)
|
||||
|
||||
supported_chars = []
|
||||
|
||||
for char in vocab:
|
||||
if not has_glyph(ttfont, char):
|
||||
continue
|
||||
|
||||
image = PIL.Image.new('L', (40, 40), 255)
|
||||
draw = ImageDraw.Draw(image)
|
||||
draw.text((10, 0), char, 0, font=pil_font)
|
||||
if (np.array(image) != 255).sum() == 0:
|
||||
continue
|
||||
|
||||
supported_chars.append(char)
|
||||
|
||||
supported_chars = ''.join(supported_chars)
|
||||
except Exception as e:
|
||||
print(f'Error while processing {font_path}: {e}')
|
||||
supported_chars = ''
|
||||
|
||||
return supported_chars
|
||||
|
||||
|
||||
def main():
|
||||
path_in = FONTS_ROOT
|
||||
out_path = ASSETS_PATH / 'fonts.csv'
|
||||
|
||||
suffixes = {'.TTF', '.otf', '.ttc', '.ttf'}
|
||||
font_paths = [path for path in path_in.glob('**/*') if
|
||||
path.suffix in suffixes]
|
||||
|
||||
data = process_map(process, font_paths, max_workers=16)
|
||||
|
||||
font_paths = [str(path.relative_to(FONTS_ROOT)) for path in font_paths]
|
||||
data = pd.DataFrame({'font_path': font_paths, 'supported_chars': data})
|
||||
data['num_chars'] = data.supported_chars.str.len()
|
||||
data['label'] = 'regular'
|
||||
data.to_csv(out_path, index=False)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
||||
Reference in New Issue
Block a user