Files
dotfiles/.agents/skills/ios-simulator-skill/scripts/navigator.py
2026-02-19 00:33:08 -08:00

454 lines
14 KiB
Python
Executable File

#!/usr/bin/env python3
"""
iOS Simulator Navigator - Smart Element Finder and Interactor
Finds and interacts with UI elements using accessibility data.
Prioritizes structured navigation over pixel-based interaction.
This script is the core automation tool for iOS simulator navigation. It finds
UI elements by text, type, or accessibility ID and performs actions on them
(tap, enter text). Uses semantic element finding instead of fragile pixel coordinates.
Key Features:
- Find elements by text (fuzzy or exact matching)
- Find elements by type (Button, TextField, etc.)
- Find elements by accessibility identifier
- Tap elements at their center point
- Enter text into text fields
- List all tappable elements on screen
- Automatic element caching for performance
Usage Examples:
# Find and tap a button by text
python scripts/navigator.py --find-text "Login" --tap --udid <device-id>
# Enter text into first text field
python scripts/navigator.py --find-type TextField --index 0 --enter-text "username" --udid <device-id>
# Tap element by accessibility ID
python scripts/navigator.py --find-id "submitButton" --tap --udid <device-id>
# List all interactive elements
python scripts/navigator.py --list --udid <device-id>
# Tap at specific coordinates (fallback)
python scripts/navigator.py --tap-at 200,400 --udid <device-id>
Output Format:
Tapped: Button "Login" at (320, 450)
Entered text in: TextField "Username"
Not found: text='Submit'
Navigation Priority (best to worst):
1. Find by accessibility label/text (most reliable)
2. Find by element type + index (good for forms)
3. Find by accessibility ID (precise but app-specific)
4. Tap at coordinates (last resort, fragile)
Technical Details:
- Uses IDB's accessibility tree via `idb ui describe-all --json --nested`
- Caches tree for multiple operations (call with force_refresh to update)
- Finds elements by parsing tree recursively
- Calculates tap coordinates from element frame center
- Uses `idb ui tap` for tapping, `idb ui text` for text entry
- Extracts data from AXLabel, AXValue, and AXUniqueId fields
"""
import argparse
import json
import subprocess
import sys
from dataclasses import dataclass
from common import (
flatten_tree,
get_accessibility_tree,
get_device_screen_size,
resolve_udid,
transform_screenshot_coords,
)
@dataclass
class Element:
"""Represents a UI element from accessibility tree."""
type: str
label: str | None
value: str | None
identifier: str | None
frame: dict[str, float]
traits: list[str]
enabled: bool = True
@property
def center(self) -> tuple[int, int]:
"""Calculate center point for tapping."""
x = int(self.frame["x"] + self.frame["width"] / 2)
y = int(self.frame["y"] + self.frame["height"] / 2)
return (x, y)
@property
def description(self) -> str:
"""Human-readable description."""
label = self.label or self.value or self.identifier or "Unnamed"
return f'{self.type} "{label}"'
class Navigator:
"""Navigates iOS apps using accessibility data."""
def __init__(self, udid: str | None = None):
"""Initialize navigator with optional device UDID."""
self.udid = udid
self._tree_cache = None
def get_accessibility_tree(self, force_refresh: bool = False) -> dict:
"""Get accessibility tree (cached for efficiency)."""
if self._tree_cache and not force_refresh:
return self._tree_cache
# Delegate to shared utility
self._tree_cache = get_accessibility_tree(self.udid, nested=True)
return self._tree_cache
def _flatten_tree(self, node: dict, elements: list[Element] | None = None) -> list[Element]:
"""Flatten accessibility tree into list of elements."""
if elements is None:
elements = []
# Create element from node
if node.get("type"):
element = Element(
type=node.get("type", "Unknown"),
label=node.get("AXLabel"),
value=node.get("AXValue"),
identifier=node.get("AXUniqueId"),
frame=node.get("frame", {}),
traits=node.get("traits", []),
enabled=node.get("enabled", True),
)
elements.append(element)
# Process children
for child in node.get("children", []):
self._flatten_tree(child, elements)
return elements
def find_element(
self,
text: str | None = None,
element_type: str | None = None,
identifier: str | None = None,
index: int = 0,
fuzzy: bool = True,
) -> Element | None:
"""
Find element by various criteria.
Args:
text: Text to search in label/value
element_type: Type of element (Button, TextField, etc.)
identifier: Accessibility identifier
index: Which matching element to return (0-based)
fuzzy: Use fuzzy matching for text
Returns:
Element if found, None otherwise
"""
tree = self.get_accessibility_tree()
elements = self._flatten_tree(tree)
matches = []
for elem in elements:
# Skip disabled elements
if not elem.enabled:
continue
# Check type
if element_type and elem.type != element_type:
continue
# Check identifier (exact match)
if identifier and elem.identifier != identifier:
continue
# Check text (in label or value)
if text:
elem_text = (elem.label or "") + " " + (elem.value or "")
if fuzzy:
if text.lower() not in elem_text.lower():
continue
elif text not in (elem.label, elem.value):
continue
matches.append(elem)
if matches and index < len(matches):
return matches[index]
return None
def tap(self, element: Element) -> bool:
"""Tap on an element."""
x, y = element.center
return self.tap_at(x, y)
def tap_at(self, x: int, y: int) -> bool:
"""Tap at specific coordinates."""
cmd = ["idb", "ui", "tap", str(x), str(y)]
if self.udid:
cmd.extend(["--udid", self.udid])
try:
subprocess.run(cmd, capture_output=True, check=True)
return True
except subprocess.CalledProcessError:
return False
def enter_text(self, text: str, element: Element | None = None) -> bool:
"""
Enter text into element or current focus.
Args:
text: Text to enter
element: Optional element to tap first
Returns:
Success status
"""
# Tap element if provided
if element:
if not self.tap(element):
return False
# Small delay for focus
import time
time.sleep(0.5)
# Enter text
cmd = ["idb", "ui", "text", text]
if self.udid:
cmd.extend(["--udid", self.udid])
try:
subprocess.run(cmd, capture_output=True, check=True)
return True
except subprocess.CalledProcessError:
return False
def find_and_tap(
self,
text: str | None = None,
element_type: str | None = None,
identifier: str | None = None,
index: int = 0,
) -> tuple[bool, str]:
"""
Find element and tap it.
Returns:
(success, message) tuple
"""
element = self.find_element(text, element_type, identifier, index)
if not element:
criteria = []
if text:
criteria.append(f"text='{text}'")
if element_type:
criteria.append(f"type={element_type}")
if identifier:
criteria.append(f"id={identifier}")
return (False, f"Not found: {', '.join(criteria)}")
if self.tap(element):
return (True, f"Tapped: {element.description} at {element.center}")
return (False, f"Failed to tap: {element.description}")
def find_and_enter_text(
self,
text_to_enter: str,
find_text: str | None = None,
element_type: str | None = "TextField",
identifier: str | None = None,
index: int = 0,
) -> tuple[bool, str]:
"""
Find element and enter text into it.
Returns:
(success, message) tuple
"""
element = self.find_element(find_text, element_type, identifier, index)
if not element:
return (False, "TextField not found")
if self.enter_text(text_to_enter, element):
return (True, f"Entered text in: {element.description}")
return (False, "Failed to enter text")
def _build_parser() -> argparse.ArgumentParser:
    """Build the command-line parser for the navigator CLI."""
    parser = argparse.ArgumentParser(description="Navigate iOS apps using accessibility data")

    # Finding options
    parser.add_argument("--find-text", help="Find element by text (fuzzy match)")
    parser.add_argument("--find-exact", help="Find element by exact text")
    parser.add_argument("--find-type", help="Element type (Button, TextField, etc.)")
    parser.add_argument("--find-id", help="Accessibility identifier")
    parser.add_argument("--index", type=int, default=0, help="Which match to use (0-based)")

    # Action options
    parser.add_argument("--tap", action="store_true", help="Tap the found element")
    parser.add_argument("--tap-at", help="Tap at coordinates (x,y)")
    parser.add_argument("--enter-text", help="Enter text into element")

    # Coordinate transformation
    parser.add_argument(
        "--screenshot-coords",
        action="store_true",
        help="Interpret tap coordinates as from a screenshot (requires --screenshot-width/height)",
    )
    parser.add_argument(
        "--screenshot-width",
        type=int,
        help="Screenshot width for coordinate transformation",
    )
    parser.add_argument(
        "--screenshot-height",
        type=int,
        help="Screenshot height for coordinate transformation",
    )

    # Other options
    parser.add_argument(
        "--udid",
        help="Device UDID (auto-detects booted simulator if not provided)",
    )
    parser.add_argument("--list", action="store_true", help="List all tappable elements")
    return parser


def _format_criteria(text: str | None, element_type: str | None, identifier: str | None) -> str:
    """Render the search criteria that were given as ``text='..', type=.., id=..``."""
    criteria = []
    if text:
        criteria.append(f"text='{text}'")
    if element_type:
        criteria.append(f"type={element_type}")
    if identifier:
        criteria.append(f"id={identifier}")
    return ", ".join(criteria)


def main():
    """Main entry point.

    Parses the command line and performs exactly one action, checked in this
    order: ``--list``, ``--tap-at``, ``--tap``, ``--enter-text``, otherwise a
    plain find. Prints a one-line result and exits with status 1 on failure.
    """
    args = _build_parser().parse_args()

    # Resolve UDID with auto-detection
    try:
        udid = resolve_udid(args.udid)
    except RuntimeError as e:
        print(f"Error: {e}")
        sys.exit(1)

    navigator = Navigator(udid=udid)

    # List mode
    if args.list:
        tappable_types = ("Button", "Link", "Cell", "TextField", "SecureTextField")
        elements = navigator._flatten_tree(navigator.get_accessibility_tree())
        tappable = [e for e in elements if e.enabled and e.type in tappable_types]

        print(f"Tappable elements ({len(tappable)}):")
        for elem in tappable[:10]:  # Limit output for tokens
            print(f"  {elem.type}: \"{elem.label or elem.value or 'Unnamed'}\" {elem.center}")
        if len(tappable) > 10:
            print(f"  ... and {len(tappable) - 10} more")
        sys.exit(0)

    # --find-text takes precedence over --find-exact; matching is exact only
    # when --find-text was not supplied.
    text = args.find_text or args.find_exact
    fuzzy = args.find_text is not None

    # Direct tap at coordinates
    if args.tap_at:
        coords = args.tap_at.split(",")
        if len(coords) != 2:
            print("Error: --tap-at requires x,y format")
            sys.exit(1)
        try:
            x, y = int(coords[0]), int(coords[1])
        except ValueError:
            # Non-numeric coordinates get the same format error, not a traceback.
            print("Error: --tap-at requires x,y format")
            sys.exit(1)

        # Handle coordinate transformation if requested
        if args.screenshot_coords:
            if not args.screenshot_width or not args.screenshot_height:
                print(
                    "Error: --screenshot-coords requires --screenshot-width and --screenshot-height"
                )
                sys.exit(1)

            device_w, device_h = get_device_screen_size(udid)
            x, y = transform_screenshot_coords(
                x,
                y,
                args.screenshot_width,
                args.screenshot_height,
                device_w,
                device_h,
            )
            print(
                f"Transformed screenshot coords ({coords[0]}, {coords[1]}) "
                f"to device coords ({x}, {y})"
            )

        if navigator.tap_at(x, y):
            print(f"Tapped at ({x}, {y})")
        else:
            print(f"Failed to tap at ({x}, {y})")
            sys.exit(1)

    # Find and tap
    elif args.tap:
        # Call find_element directly so the --find-exact choice
        # (fuzzy=False) is honored.
        element = navigator.find_element(
            text=text,
            element_type=args.find_type,
            identifier=args.find_id,
            index=args.index,
            fuzzy=fuzzy,
        )
        if element is None:
            print(f"Not found: {_format_criteria(text, args.find_type, args.find_id)}")
            sys.exit(1)
        if not navigator.tap(element):
            print(f"Failed to tap: {element.description}")
            sys.exit(1)
        print(f"Tapped: {element.description} at {element.center}")

    # Find and enter text
    elif args.enter_text:
        element = navigator.find_element(
            text=text,
            element_type=args.find_type or "TextField",
            identifier=args.find_id,
            index=args.index,
            fuzzy=fuzzy,
        )
        if element is None:
            print("TextField not found")
            sys.exit(1)
        if not navigator.enter_text(args.enter_text, element):
            print("Failed to enter text")
            sys.exit(1)
        print(f"Entered text in: {element.description}")

    # Just find (no action)
    else:
        element = navigator.find_element(
            text=text,
            element_type=args.find_type,
            identifier=args.find_id,
            index=args.index,
            fuzzy=fuzzy,
        )
        if element is None:
            print("Element not found")
            sys.exit(1)
        print(f"Found: {element.description} at {element.center}")
# Run the CLI only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()