training and synthetic data generation code

2022-02-09 20:39:01 +01:00
parent a9085393f4
commit 975dbf4d5e
42 changed files with 7089 additions and 15 deletions
--- a/manga_ocr_dev/synthetic_data_generator/scan_fonts.py
+++ b/manga_ocr_dev/synthetic_data_generator/scan_fonts.py
@@ -0,0 +1,72 @@
+import PIL
+import numpy as np
+import pandas as pd
+from PIL import ImageDraw, ImageFont
+from fontTools.ttLib import TTFont
+from tqdm.contrib.concurrent import process_map
+
+from manga_ocr_dev.env import ASSETS_PATH, FONTS_ROOT
+
+# Characters every font is tested against: the `char` column of vocab.csv, loaded once at import time.
+vocab = pd.read_csv(ASSETS_PATH / 'vocab.csv')['char'].values
+
+
+def has_glyph(font, glyph):
+    """
+    Tell whether any cmap subtable of the font maps the code point of the given character.
+
+    :param font: object whose ``font['cmap'].tables`` is an iterable of subtables, each with a ``cmap`` mapping
+        keyed by integer code points
+    :param glyph: single character to look up
+    :return: True if at least one subtable contains ``ord(glyph)``, False otherwise
+    """
+    return any(ord(glyph) in subtable.cmap for subtable in font['cmap'].tables)
+
+
+def process(font_path):
+    """
+    Get supported characters list for a given font.
+    Font metadata is not always reliable, so try to render each character and see if anything shows up.
+    Still not perfect, because sometimes unsupported characters show up as rectangles.
+
+    :param font_path: path to the font file (converted with ``str()`` before use)
+    :return: string of the vocab characters that have a cmap entry in the font and render at least one
+        non-white pixel; empty string if the font could not be processed at all
+    """
+
+    try:
+        font_path = str(font_path)
+        supported_chars = []
+
+        # fontNumber=0 lets fontTools open font collections (.ttc), which it otherwise refuses to load
+        # without an explicit face number; face 0 is also the face ImageFont.truetype loads by default.
+        # The context manager closes the underlying font file once scanning is done.
+        with TTFont(font_path, fontNumber=0) as ttfont:
+            pil_font = ImageFont.truetype(font_path, 24)
+
+            for char in vocab:
+                # Cheap metadata check first: skip characters with no cmap entry.
+                if not has_glyph(ttfont, char):
+                    continue
+
+                # Render black text on a white canvas and see whether any pixel changed.
+                image = PIL.Image.new('L', (40, 40), 255)
+                draw = ImageDraw.Draw(image)
+                draw.text((10, 0), char, 0, font=pil_font)
+                if not (np.array(image) != 255).any():
+                    continue
+
+                supported_chars.append(char)
+
+        supported_chars = ''.join(supported_chars)
+    except Exception as e:
+        # Deliberate best-effort: one broken font must not abort the whole scan, so report it and
+        # record the font as supporting no characters.
+        print(f'Error while processing {font_path}: {e}')
+        supported_chars = ''
+
+    return supported_chars
+
+
+def main():
+    """
+    Scan all font files under FONTS_ROOT and write the result to ``ASSETS_PATH / 'fonts.csv'``.
+
+    The CSV has one row per font file with the columns ``font_path`` (relative to FONTS_ROOT),
+    ``supported_chars`` (string returned by ``process``), ``num_chars`` (length of that string)
+    and ``label`` (always ``'regular'``).
+    """
+    path_in = FONTS_ROOT
+    out_path = ASSETS_PATH / 'fonts.csv'
+
+    # Extensions are compared case-insensitively, so '.TTF', '.OTF', '.Ttc' etc. are all picked up.
+    suffixes = {'.otf', '.ttc', '.ttf'}
+    # Sorting keeps the row order of the output independent of filesystem traversal order;
+    # is_file() drops directories that merely happen to be named like a font.
+    font_paths = sorted(path for path in path_in.glob('**/*')
+                        if path.suffix.lower() in suffixes and path.is_file())
+
+    # Results come back in the same order as font_paths, so the two lists line up row by row.
+    data = process_map(process, font_paths, max_workers=16)
+
+    font_paths = [str(path.relative_to(path_in)) for path in font_paths]
+    data = pd.DataFrame({'font_path': font_paths, 'supported_chars': data})
+    data['num_chars'] = data.supported_chars.str.len()
+    data['label'] = 'regular'
+    data.to_csv(out_path, index=False)
+
+
+# Run the font scan only when this file is executed as a script, not when it is imported.
+if __name__ == '__main__':
+    main()