Files
dotfiles/.agents/skills/pdf/scripts/extract_form_field_info.py
2026-02-19 00:33:08 -08:00

123 lines
4.2 KiB
Python

import json
import sys
from pypdf import PdfReader
def get_full_annotation_field_id(annotation):
components = []
while annotation:
field_name = annotation.get('/T')
if field_name:
components.append(field_name)
annotation = annotation.get('/Parent')
return ".".join(reversed(components)) if components else None
def make_field_dict(field, field_id):
field_dict = {"field_id": field_id}
ft = field.get('/FT')
if ft == "/Tx":
field_dict["type"] = "text"
elif ft == "/Btn":
field_dict["type"] = "checkbox"
states = field.get("/_States_", [])
if len(states) == 2:
if "/Off" in states:
field_dict["checked_value"] = states[0] if states[0] != "/Off" else states[1]
field_dict["unchecked_value"] = "/Off"
else:
print(f"Unexpected state values for checkbox `${field_id}`. Its checked and unchecked values may not be correct; if you're trying to check it, visually verify the results.")
field_dict["checked_value"] = states[0]
field_dict["unchecked_value"] = states[1]
elif ft == "/Ch":
field_dict["type"] = "choice"
states = field.get("/_States_", [])
field_dict["choice_options"] = [{
"value": state[0],
"text": state[1],
} for state in states]
else:
field_dict["type"] = f"unknown ({ft})"
return field_dict
def get_field_info(reader: PdfReader):
fields = reader.get_fields()
field_info_by_id = {}
possible_radio_names = set()
for field_id, field in fields.items():
if field.get("/Kids"):
if field.get("/FT") == "/Btn":
possible_radio_names.add(field_id)
continue
field_info_by_id[field_id] = make_field_dict(field, field_id)
radio_fields_by_id = {}
for page_index, page in enumerate(reader.pages):
annotations = page.get('/Annots', [])
for ann in annotations:
field_id = get_full_annotation_field_id(ann)
if field_id in field_info_by_id:
field_info_by_id[field_id]["page"] = page_index + 1
field_info_by_id[field_id]["rect"] = ann.get('/Rect')
elif field_id in possible_radio_names:
try:
on_values = [v for v in ann["/AP"]["/N"] if v != "/Off"]
except KeyError:
continue
if len(on_values) == 1:
rect = ann.get("/Rect")
if field_id not in radio_fields_by_id:
radio_fields_by_id[field_id] = {
"field_id": field_id,
"type": "radio_group",
"page": page_index + 1,
"radio_options": [],
}
radio_fields_by_id[field_id]["radio_options"].append({
"value": on_values[0],
"rect": rect,
})
fields_with_location = []
for field_info in field_info_by_id.values():
if "page" in field_info:
fields_with_location.append(field_info)
else:
print(f"Unable to determine location for field id: {field_info.get('field_id')}, ignoring")
def sort_key(f):
if "radio_options" in f:
rect = f["radio_options"][0]["rect"] or [0, 0, 0, 0]
else:
rect = f.get("rect") or [0, 0, 0, 0]
adjusted_position = [-rect[1], rect[0]]
return [f.get("page"), adjusted_position]
sorted_fields = fields_with_location + list(radio_fields_by_id.values())
sorted_fields.sort(key=sort_key)
return sorted_fields
def write_field_info(pdf_path: str, json_output_path: str):
reader = PdfReader(pdf_path)
field_info = get_field_info(reader)
with open(json_output_path, "w") as f:
json.dump(field_info, f, indent=2)
print(f"Wrote {len(field_info)} fields to {json_output_path}")
if __name__ == "__main__":
if len(sys.argv) != 3:
print("Usage: extract_form_field_info.py [input pdf] [output json]")
sys.exit(1)
write_field_info(sys.argv[1], sys.argv[2])