Files
dotfiles/.agents/skills/pdf/scripts/extract_form_structure.py
2026-02-19 00:33:08 -08:00

116 lines
3.9 KiB
Python
Executable File

"""
Extract form structure from a non-fillable PDF.
This script analyzes the PDF to find:
- Text labels with their exact coordinates
- Horizontal lines (row boundaries)
- Checkboxes (small rectangles)
Output: A JSON file with the form structure that can be used to generate
accurate field coordinates for filling.
Usage: python extract_form_structure.py <input.pdf> <output.json>
"""
import json
import sys
import pdfplumber
def extract_form_structure(pdf_path):
structure = {
"pages": [],
"labels": [],
"lines": [],
"checkboxes": [],
"row_boundaries": []
}
with pdfplumber.open(pdf_path) as pdf:
for page_num, page in enumerate(pdf.pages, 1):
structure["pages"].append({
"page_number": page_num,
"width": float(page.width),
"height": float(page.height)
})
words = page.extract_words()
for word in words:
structure["labels"].append({
"page": page_num,
"text": word["text"],
"x0": round(float(word["x0"]), 1),
"top": round(float(word["top"]), 1),
"x1": round(float(word["x1"]), 1),
"bottom": round(float(word["bottom"]), 1)
})
for line in page.lines:
if abs(float(line["x1"]) - float(line["x0"])) > page.width * 0.5:
structure["lines"].append({
"page": page_num,
"y": round(float(line["top"]), 1),
"x0": round(float(line["x0"]), 1),
"x1": round(float(line["x1"]), 1)
})
for rect in page.rects:
width = float(rect["x1"]) - float(rect["x0"])
height = float(rect["bottom"]) - float(rect["top"])
if 5 <= width <= 15 and 5 <= height <= 15 and abs(width - height) < 2:
structure["checkboxes"].append({
"page": page_num,
"x0": round(float(rect["x0"]), 1),
"top": round(float(rect["top"]), 1),
"x1": round(float(rect["x1"]), 1),
"bottom": round(float(rect["bottom"]), 1),
"center_x": round((float(rect["x0"]) + float(rect["x1"])) / 2, 1),
"center_y": round((float(rect["top"]) + float(rect["bottom"])) / 2, 1)
})
lines_by_page = {}
for line in structure["lines"]:
page = line["page"]
if page not in lines_by_page:
lines_by_page[page] = []
lines_by_page[page].append(line["y"])
for page, y_coords in lines_by_page.items():
y_coords = sorted(set(y_coords))
for i in range(len(y_coords) - 1):
structure["row_boundaries"].append({
"page": page,
"row_top": y_coords[i],
"row_bottom": y_coords[i + 1],
"row_height": round(y_coords[i + 1] - y_coords[i], 1)
})
return structure
def main():
if len(sys.argv) != 3:
print("Usage: extract_form_structure.py <input.pdf> <output.json>")
sys.exit(1)
pdf_path = sys.argv[1]
output_path = sys.argv[2]
print(f"Extracting structure from {pdf_path}...")
structure = extract_form_structure(pdf_path)
with open(output_path, "w") as f:
json.dump(structure, f, indent=2)
print(f"Found:")
print(f" - {len(structure['pages'])} pages")
print(f" - {len(structure['labels'])} text labels")
print(f" - {len(structure['lines'])} horizontal lines")
print(f" - {len(structure['checkboxes'])} checkboxes")
print(f" - {len(structure['row_boundaries'])} row boundaries")
print(f"Saved to {output_path}")
if __name__ == "__main__":
main()