training and synthetic data generation code

2022-02-09 20:39:01 +01:00
parent a9085393f4
commit 975dbf4d5e
42 changed files with 7089 additions and 15 deletions
--- a/manga_ocr_dev/data/__init__.py
+++ b/manga_ocr_dev/data/__init__.py
--- a/manga_ocr_dev/data/generate_backgrounds.py
+++ b/manga_ocr_dev/data/generate_backgrounds.py
@@ -0,0 +1,85 @@
+from pathlib import Path
+
+import cv2
+import numpy as np
+import pandas as pd
+from tqdm import tqdm
+
+from manga_ocr_dev.env import MANGA109_ROOT, BACKGROUND_DIR
+
+
+def find_rectangle(mask, y, x, aspect_ratio_range=(0.33, 3.0)):
+    """Grow a rectangle outwards from a seed point until it is blocked.
+
+    Starting from the single pixel ``(y, x)``, every side that is still free
+    is pushed outwards by one pixel per iteration. A side stops once it
+    reaches the image border, or once the row/column it has just moved onto
+    contains a truthy mask value within the rectangle's current extent.
+
+    Args:
+        mask: 2D boolean array; truthy cells block the expansion.
+        y: Row index of the seed point.
+        x: Column index of the seed point.
+        aspect_ratio_range: ``(lowest, highest)`` accepted width / height
+            ratio. Growth is aborted as soon as the ratio leaves this range.
+
+    Returns:
+        Tuple ``(ymin, ymax, xmin, xmax)`` holding the position of each side
+        when growth ended. A side that was blocked by the mask is reported at
+        the blocking row/column itself, not at the last free one.
+    """
+    last_row = mask.shape[0] - 1
+    last_col = mask.shape[1] - 1
+
+    # Names with a trailing underscore are the moving sides; the plain names
+    # stay None until the corresponding side has stopped.
+    ymin_ = ymax_ = y
+    xmin_ = xmax_ = x
+
+    ymin = ymax = xmin = xmax = None
+
+    while True:
+        # Every step is clamped to the image. Without the clamp a seed lying
+        # on the border would step outside the mask: index -1 silently wraps
+        # around to the opposite edge, and an index past the end raises
+        # IndexError.
+        if ymin is None:
+            ymin_ = max(ymin_ - 1, 0)
+            if ymin_ == 0 or mask[ymin_, xmin_:xmax_].any():
+                ymin = ymin_
+
+        if ymax is None:
+            ymax_ = min(ymax_ + 1, last_row)
+            if ymax_ == last_row or mask[ymax_, xmin_:xmax_].any():
+                ymax = ymax_
+
+        if xmin is None:
+            xmin_ = max(xmin_ - 1, 0)
+            if xmin_ == 0 or mask[ymin_:ymax_, xmin_].any():
+                xmin = xmin_
+
+        if xmax is None:
+            xmax_ = min(xmax_ + 1, last_col)
+            if xmax_ == last_col or mask[ymin_:ymax_, xmax_].any():
+                xmax = xmax_
+
+        # Stop early when the rectangle becomes too elongated. The ratio is
+        # only checked once both dimensions exceed one pixel.
+        h = ymax_ - ymin_
+        w = xmax_ - xmin_
+        if h > 1 and w > 1:
+            ratio = w / h
+            if ratio < aspect_ratio_range[0] or ratio > aspect_ratio_range[1]:
+                return ymin_, ymax_, xmin_, xmax_
+
+        # All four sides are blocked: the rectangle cannot grow any further.
+        if None not in (ymin, ymax, xmin, xmax):
+            return ymin, ymax, xmin, xmax
+
+
+def generate_backgrounds(crops_per_page=5, min_size=40):
+    """Cut text-free background crops out of the pages listed in data.csv.
+
+    Reads ``data.csv`` (text boxes) and ``frames.csv`` (frame boxes) from
+    ``MANGA109_ROOT``. For every page that appears in ``data.csv`` a mask is
+    built that covers all text boxes and everything outside the frames.
+    Random unmasked pixels are then used as seeds for ``find_rectangle``, and
+    each resulting crop that is large enough is written to ``BACKGROUND_DIR``
+    as a PNG.
+
+    Args:
+        crops_per_page: Number of random seed points tried per page.
+        min_size: Minimum height and width, in pixels, a crop needs in order
+            to be saved.
+
+    Raises:
+        OSError: If a page image cannot be read.
+    """
+    data = pd.read_csv(MANGA109_ROOT / 'data.csv')
+    frames_df = pd.read_csv(MANGA109_ROOT / 'frames.csv')
+
+    BACKGROUND_DIR.mkdir(parents=True, exist_ok=True)
+
+    # Group both tables once instead of re-filtering them for every page.
+    # sort=False keeps pages in order of first appearance in data.csv.
+    frames_by_page = {
+        frames_page: frame_rows
+        for frames_page, frame_rows in frames_df.groupby('page_path')
+    }
+    text_by_page = data.groupby('page_path', sort=False)
+
+    for page_path, text_rows in tqdm(text_by_page, total=data.page_path.nunique()):
+        page_file = MANGA109_ROOT / page_path
+        page = cv2.imread(str(page_file))
+        if page is None:
+            # cv2.imread reports failure by returning None, not by raising.
+            raise OSError(f'Could not read page image: {page_file}')
+
+        page_size = page.shape[:2]
+
+        # Text regions must never end up in a background crop.
+        mask = np.zeros(page_size, dtype=bool)
+        for row in text_rows.itertuples():
+            mask[row.ymin:row.ymax, row.xmin:row.xmax] = True
+
+        # Only the inside of frames is eligible; a page without frame rows
+        # leaves this mask empty and is skipped below.
+        frames_mask = np.zeros(page_size, dtype=bool)
+        if page_path in frames_by_page:
+            for row in frames_by_page[page_path].itertuples():
+                frames_mask[row.ymin:row.ymax, row.xmin:row.xmax] = True
+
+        mask = mask | ~frames_mask
+
+        if mask.all():
+            continue
+
+        unmasked_points = np.stack(np.where(~mask), axis=1)
+        for _ in range(crops_per_page):
+            y, x = unmasked_points[np.random.randint(0, unmasked_points.shape[0])]
+            ymin, ymax, xmin, xmax = find_rectangle(mask, y, x)
+            crop = page[ymin:ymax, xmin:xmax]
+
+            if crop.shape[0] >= min_size and crop.shape[1] >= min_size:
+                # File name: <parent dir>_<page stem>_<ymin>_<ymax>_<xmin>_<xmax>.png
+                out_filename = '_'.join(
+                    Path(page_path).with_suffix('').parts[-2:]) + f'_{ymin}_{ymax}_{xmin}_{xmax}.png'
+                cv2.imwrite(str(BACKGROUND_DIR / out_filename), crop)
+
+
+# Script entry point: generate background crops with the default settings.
+if __name__ == '__main__':
+    generate_backgrounds()
--- a/manga_ocr_dev/data/process_manga109s.py
+++ b/manga_ocr_dev/data/process_manga109s.py
@@ -0,0 +1,103 @@
+import xml.etree.ElementTree as ET
+from pathlib import Path
+
+import cv2
+import pandas as pd
+from tqdm import tqdm
+
+from manga_ocr_dev.env import MANGA109_ROOT
+
+
+def get_books():
+    """Return a table of the books listed in ``books.txt``.
+
+    The list is read from the ``Manga109s_released_2021_02_28`` directory
+    under ``MANGA109_ROOT``, one book name per line.
+
+    Returns:
+        DataFrame with one row per book and the columns ``book`` (name),
+        ``annotations`` (path of the book's XML file, as a string) and
+        ``images`` (path of the book's image directory, as a string).
+    """
+    release_dir = MANGA109_ROOT / 'Manga109s_released_2021_02_28'
+    titles = (release_dir / 'books.txt').read_text().splitlines()
+
+    annotation_files = []
+    image_dirs = []
+    for title in titles:
+        annotation_files.append(str(release_dir / 'annotations' / f'{title}.xml'))
+        image_dirs.append(str(release_dir / 'images' / title))
+
+    return pd.DataFrame({
+        'book': titles,
+        'annotations': annotation_files,
+        'images': image_dirs,
+    })
+
+
+def export_frames():
+    """Collect every frame annotation of every book into ``frames.csv``.
+
+    Each book's XML annotation file is parsed and one row is emitted per
+    ``frame`` element, carrying the book name, page index, page image path,
+    page size, frame id and the frame's bounding box. Page paths are cut
+    down to their last four components before the table is written to
+    ``MANGA109_ROOT / 'frames.csv'``.
+    """
+    catalog = get_books()
+
+    records = []
+    for entry in tqdm(catalog.itertuples(), total=len(catalog)):
+        annotation_root = ET.parse(entry.annotations).getroot()
+        for page_node in annotation_root.findall('./pages/page'):
+            for frame_node in page_node.findall('./frame'):
+                page_index = int(page_node.attrib['index'])
+                records.append({
+                    'book': entry.book,
+                    'page_index': page_index,
+                    'page_path': str(Path(entry.images) / f'{page_index:03d}.jpg'),
+                    'page_width': int(page_node.attrib['width']),
+                    'page_height': int(page_node.attrib['height']),
+                    'id': frame_node.attrib['id'],
+                    'xmin': int(frame_node.attrib['xmin']),
+                    'ymin': int(frame_node.attrib['ymin']),
+                    'xmax': int(frame_node.attrib['xmax']),
+                    'ymax': int(frame_node.attrib['ymax']),
+                })
+    frames = pd.DataFrame(records)
+
+    def last_four_parts(path):
+        # Keep only the trailing four path components, joined with '/'.
+        return '/'.join(Path(path).parts[-4:])
+
+    frames.page_path = frames.page_path.apply(last_four_parts)
+    frames.to_csv(MANGA109_ROOT / 'frames.csv', index=False)
+
+
+def export_crops(margin=10):
+    """Export all text annotations to ``data.csv`` and save their crops.
+
+    Each book's XML annotation file is parsed and one row is emitted per
+    ``text`` element (book, page, text content, bounding box). A random 10%
+    of the rows are marked as the ``test`` split, the rest as ``train``.
+    The table is written to ``MANGA109_ROOT / 'data.csv'``, then every box
+    is cut out of its page image, enlarged by ``margin`` pixels on each side
+    (clipped to the page), and saved as ``crops/<id>.png``.
+
+    Args:
+        margin: Number of pixels added around each text box before cropping.
+
+    Raises:
+        OSError: If a page image cannot be read.
+    """
+    crops_root = MANGA109_ROOT / 'crops'
+    crops_root.mkdir(parents=True, exist_ok=True)
+
+    books = get_books()
+
+    data = []
+    for book in tqdm(books.itertuples(), total=len(books)):
+        tree = ET.parse(book.annotations)
+        root = tree.getroot()
+        for page in root.findall('./pages/page'):
+            for text in page.findall('./text'):
+                row = {}
+                row['book'] = book.book
+                row['page_index'] = int(page.attrib['index'])
+                row['page_path'] = str(Path(book.images) / f'{row["page_index"]:03d}.jpg')
+                row['page_width'] = int(page.attrib['width'])
+                row['page_height'] = int(page.attrib['height'])
+                row['id'] = text.attrib['id']
+                row['text'] = text.text
+                row['xmin'] = int(text.attrib['xmin'])
+                row['ymin'] = int(text.attrib['ymin'])
+                row['xmax'] = int(text.attrib['xmax'])
+                row['ymax'] = int(text.attrib['ymax'])
+                data.append(row)
+    data = pd.DataFrame(data)
+
+    # Random split: the sampled rows become the test set.
+    n_test = int(0.1 * len(data))
+    data['split'] = 'train'
+    data.loc[data.sample(n_test).index, 'split'] = 'test'
+
+    # Stored paths always use '/' so the CSV is identical on every platform.
+    # The previous hard-coded '\\' separator only produced 'crops/<id>.png'
+    # on Windows.
+    data['crop_path'] = crops_root.name + '/' + data.id + '.png'
+
+    data.page_path = data.page_path.apply(lambda x: '/'.join(Path(x).parts[-4:]))
+    data.to_csv(MANGA109_ROOT / 'data.csv', index=False)
+
+    for page_path, boxes in tqdm(data.groupby('page_path'), total=data.page_path.nunique()):
+        page_file = MANGA109_ROOT / page_path
+        img = cv2.imread(str(page_file))
+        if img is None:
+            # cv2.imread reports failure by returning None, not by raising.
+            raise OSError(f'Could not read page image: {page_file}')
+
+        for box in boxes.itertuples():
+            # Enlarge the box by the margin, but never beyond the page.
+            xmin = max(box.xmin - margin, 0)
+            xmax = min(box.xmax + margin, img.shape[1])
+            ymin = max(box.ymin - margin, 0)
+            ymax = min(box.ymax + margin, img.shape[0])
+            crop = img[ymin:ymax, xmin:xmax]
+            out_path = (crops_root / box.id).with_suffix('.png')
+            cv2.imwrite(str(out_path), crop)
+
+
+# Script entry point: write frames.csv first, then data.csv and the text crops.
+if __name__ == '__main__':
+    export_frames()
+    export_crops()