mirror of
https://github.com/ksyasuda/dotfiles.git
synced 2026-02-28 00:22:41 -08:00
454 lines
14 KiB
Python
Executable File
454 lines
14 KiB
Python
Executable File
#!/usr/bin/env python3
"""
iOS Simulator Navigator - Smart Element Finder and Interactor

Finds and interacts with UI elements using accessibility data.
Prioritizes structured navigation over pixel-based interaction.

This script is the core automation tool for iOS simulator navigation. It finds
UI elements by text, type, or accessibility ID and performs actions on them
(tap, enter text). Uses semantic element finding instead of fragile pixel coordinates.

Key Features:
- Find elements by text (fuzzy or exact matching)
- Find elements by type (Button, TextField, etc.)
- Find elements by accessibility identifier
- Tap elements at their center point
- Enter text into text fields
- List all tappable elements on screen
- Automatic element caching for performance

Usage Examples:
    # Find and tap a button by text
    python scripts/navigator.py --find-text "Login" --tap --udid <device-id>

    # Enter text into first text field
    python scripts/navigator.py --find-type TextField --index 0 --enter-text "username" --udid <device-id>

    # Tap element by accessibility ID
    python scripts/navigator.py --find-id "submitButton" --tap --udid <device-id>

    # List all interactive elements
    python scripts/navigator.py --list --udid <device-id>

    # Tap at specific coordinates (fallback)
    python scripts/navigator.py --tap-at 200,400 --udid <device-id>

Output Format:
    Tapped: Button "Login" at (320, 450)
    Entered text in: TextField "Username"
    Not found: text='Submit'

Navigation Priority (best to worst):
    1. Find by accessibility label/text (most reliable)
    2. Find by element type + index (good for forms)
    3. Find by accessibility ID (precise but app-specific)
    4. Tap at coordinates (last resort, fragile)

Technical Details:
- Uses IDB's accessibility tree via `idb ui describe-all --json --nested`
- Caches tree for multiple operations (call with force_refresh to update)
- Finds elements by parsing tree recursively
- Calculates tap coordinates from element frame center
- Uses `idb ui tap` for tapping, `idb ui text` for text entry
- Extracts data from AXLabel, AXValue, and AXUniqueId fields
"""
|
|
|
|
import argparse
|
|
import json
|
|
import subprocess
|
|
import sys
|
|
from dataclasses import dataclass
|
|
|
|
from common import (
|
|
flatten_tree,
|
|
get_accessibility_tree,
|
|
get_device_screen_size,
|
|
resolve_udid,
|
|
transform_screenshot_coords,
|
|
)
|
|
|
|
|
|
@dataclass
|
|
class Element:
|
|
"""Represents a UI element from accessibility tree."""
|
|
|
|
type: str
|
|
label: str | None
|
|
value: str | None
|
|
identifier: str | None
|
|
frame: dict[str, float]
|
|
traits: list[str]
|
|
enabled: bool = True
|
|
|
|
@property
|
|
def center(self) -> tuple[int, int]:
|
|
"""Calculate center point for tapping."""
|
|
x = int(self.frame["x"] + self.frame["width"] / 2)
|
|
y = int(self.frame["y"] + self.frame["height"] / 2)
|
|
return (x, y)
|
|
|
|
@property
|
|
def description(self) -> str:
|
|
"""Human-readable description."""
|
|
label = self.label or self.value or self.identifier or "Unnamed"
|
|
return f'{self.type} "{label}"'
|
|
|
|
|
|
class Navigator:
    """Navigates iOS apps using accessibility data.

    Looks up UI elements in the accessibility tree and acts on them by
    shelling out to ``idb ui tap`` and ``idb ui text``.
    """

    def __init__(self, udid: str | None = None):
        """Initialize navigator with optional device UDID.

        Args:
            udid: Device UDID passed to ``idb`` as ``--udid``; the flag is
                omitted when this is falsy.
        """
        self.udid = udid
        self._tree_cache = None

    def get_accessibility_tree(self, force_refresh: bool = False) -> dict:
        """Get accessibility tree (cached for efficiency).

        Args:
            force_refresh: Skip the cache and fetch a new tree.

        Returns:
            The tree returned by the shared ``get_accessibility_tree`` helper.
        """
        # A falsy (e.g. empty) cached value counts as "nothing cached" and
        # triggers a new fetch.
        if self._tree_cache and not force_refresh:
            return self._tree_cache

        # Inside a method body this name resolves to the module-level helper,
        # not to this method.
        self._tree_cache = get_accessibility_tree(self.udid, nested=True)
        return self._tree_cache

    def _flatten_tree(self, node: dict, elements: list[Element] | None = None) -> list[Element]:
        """Flatten accessibility tree into list of elements.

        Args:
            node: Tree node; its "children" entry is walked recursively.
            elements: Accumulator used by the recursion; leave as None.

        Returns:
            Elements in depth-first, parent-before-children order. Nodes
            without a truthy "type" are skipped, but their children are
            still visited.
        """
        if elements is None:
            elements = []

        if node.get("type"):
            elements.append(
                Element(
                    type=node.get("type", "Unknown"),
                    label=node.get("AXLabel"),
                    value=node.get("AXValue"),
                    identifier=node.get("AXUniqueId"),
                    frame=node.get("frame", {}),
                    traits=node.get("traits", []),
                    enabled=node.get("enabled", True),
                )
            )

        for child in node.get("children", []):
            self._flatten_tree(child, elements)

        return elements

    @staticmethod
    def _text_matches(elem: Element, text: str, fuzzy: bool) -> bool:
        """Return whether ``text`` matches the element's label or value."""
        if fuzzy:
            # Case-insensitive substring search over "<label> <value>".
            haystack = (elem.label or "") + " " + (elem.value or "")
            return text.lower() in haystack.lower()
        # Exact mode: text must equal the label or the value, case-sensitively.
        return text in (elem.label, elem.value)

    def find_element(
        self,
        text: str | None = None,
        element_type: str | None = None,
        identifier: str | None = None,
        index: int = 0,
        fuzzy: bool = True,
    ) -> Element | None:
        """
        Find element by various criteria.

        All supplied criteria must hold; disabled elements never match.

        Args:
            text: Text to search in label/value
            element_type: Type of element (Button, TextField, etc.)
            identifier: Accessibility identifier (exact match)
            index: Which matching element to return (0-based; negative
                values count from the end)
            fuzzy: Case-insensitive substring match when True, exact
                equality with label or value when False

        Returns:
            Element if found, None otherwise (including when ``index`` is
            outside the range of matches)
        """
        elements = self._flatten_tree(self.get_accessibility_tree())

        matches = [
            elem
            for elem in elements
            if elem.enabled
            and (not element_type or elem.type == element_type)
            and (not identifier or elem.identifier == identifier)
            and (not text or self._text_matches(elem, text, fuzzy))
        ]

        # Guard both ends so an out-of-range negative index yields None
        # instead of raising IndexError.
        if -len(matches) <= index < len(matches):
            return matches[index]

        return None

    def tap(self, element: Element) -> bool:
        """Tap on an element at its center point.

        Returns:
            True if the tap command succeeded, False otherwise.
        """
        x, y = element.center
        return self.tap_at(x, y)

    def tap_at(self, x: int, y: int) -> bool:
        """Tap at specific coordinates via ``idb ui tap``.

        Returns:
            True if ``idb`` exited with status 0, False if it exited non-zero.
        """
        cmd = ["idb", "ui", "tap", str(x), str(y)]
        if self.udid:
            cmd.extend(["--udid", self.udid])

        try:
            subprocess.run(cmd, capture_output=True, check=True)
            return True
        except subprocess.CalledProcessError:
            return False

    def enter_text(self, text: str, element: Element | None = None) -> bool:
        """
        Enter text into element or current focus.

        Args:
            text: Text to enter
            element: Optional element to tap first

        Returns:
            Success status; False if the preliminary tap or the
            ``idb ui text`` command fails
        """
        if element:
            if not self.tap(element):
                return False
            # Small delay so the tapped field has focus before typing.
            import time

            time.sleep(0.5)

        cmd = ["idb", "ui", "text", text]
        if self.udid:
            cmd.extend(["--udid", self.udid])

        try:
            subprocess.run(cmd, capture_output=True, check=True)
            return True
        except subprocess.CalledProcessError:
            return False

    def find_and_tap(
        self,
        text: str | None = None,
        element_type: str | None = None,
        identifier: str | None = None,
        index: int = 0,
        fuzzy: bool = True,
    ) -> tuple[bool, str]:
        """
        Find element and tap it.

        Args:
            text: Text to search in label/value
            element_type: Type of element (Button, TextField, etc.)
            identifier: Accessibility identifier
            index: Which matching element to use (0-based)
            fuzzy: Forwarded to ``find_element``; pass False for exact text
                matching

        Returns:
            (success, message) tuple
        """
        element = self.find_element(
            text=text,
            element_type=element_type,
            identifier=identifier,
            index=index,
            fuzzy=fuzzy,
        )

        if not element:
            criteria = []
            if text:
                criteria.append(f"text='{text}'")
            if element_type:
                criteria.append(f"type={element_type}")
            if identifier:
                criteria.append(f"id={identifier}")
            return (False, f"Not found: {', '.join(criteria)}")

        if self.tap(element):
            return (True, f"Tapped: {element.description} at {element.center}")
        return (False, f"Failed to tap: {element.description}")

    def find_and_enter_text(
        self,
        text_to_enter: str,
        find_text: str | None = None,
        element_type: str | None = "TextField",
        identifier: str | None = None,
        index: int = 0,
        fuzzy: bool = True,
    ) -> tuple[bool, str]:
        """
        Find element and enter text into it.

        Args:
            text_to_enter: Text typed into the element
            find_text: Text to search in label/value
            element_type: Type of element to look for
            identifier: Accessibility identifier
            index: Which matching element to use (0-based)
            fuzzy: Forwarded to ``find_element``; pass False for exact text
                matching

        Returns:
            (success, message) tuple
        """
        element = self.find_element(
            text=find_text,
            element_type=element_type,
            identifier=identifier,
            index=index,
            fuzzy=fuzzy,
        )

        if not element:
            return (False, "TextField not found")

        if self.enter_text(text_to_enter, element):
            return (True, f"Entered text in: {element.description}")
        return (False, "Failed to enter text")
|
|
|
|
|
|
def main():
    """Main entry point.

    Parses the command line, resolves the target device and runs exactly one
    mode, checked in this order: ``--list``, ``--tap-at``, ``--tap``,
    ``--enter-text``, otherwise a plain find. Results are printed to stdout;
    the process exits with status 1 on any failure.
    """
    parser = argparse.ArgumentParser(description="Navigate iOS apps using accessibility data")

    # Finding options
    parser.add_argument("--find-text", help="Find element by text (fuzzy match)")
    parser.add_argument("--find-exact", help="Find element by exact text")
    parser.add_argument("--find-type", help="Element type (Button, TextField, etc.)")
    parser.add_argument("--find-id", help="Accessibility identifier")
    parser.add_argument("--index", type=int, default=0, help="Which match to use (0-based)")

    # Action options
    parser.add_argument("--tap", action="store_true", help="Tap the found element")
    parser.add_argument("--tap-at", help="Tap at coordinates (x,y)")
    parser.add_argument("--enter-text", help="Enter text into element")

    # Coordinate transformation
    parser.add_argument(
        "--screenshot-coords",
        action="store_true",
        help="Interpret tap coordinates as from a screenshot (requires --screenshot-width/height)",
    )
    parser.add_argument(
        "--screenshot-width",
        type=int,
        help="Screenshot width for coordinate transformation",
    )
    parser.add_argument(
        "--screenshot-height",
        type=int,
        help="Screenshot height for coordinate transformation",
    )

    # Other options
    parser.add_argument(
        "--udid",
        help="Device UDID (auto-detects booted simulator if not provided)",
    )
    parser.add_argument("--list", action="store_true", help="List all tappable elements")

    args = parser.parse_args()

    # Resolve UDID with auto-detection
    try:
        udid = resolve_udid(args.udid)
    except RuntimeError as e:
        print(f"Error: {e}")
        sys.exit(1)

    navigator = Navigator(udid=udid)

    # List mode
    if args.list:
        tree = navigator.get_accessibility_tree()
        elements = navigator._flatten_tree(tree)

        # Filter to tappable elements
        tappable_types = ("Button", "Link", "Cell", "TextField", "SecureTextField")
        tappable = [e for e in elements if e.enabled and e.type in tappable_types]

        print(f"Tappable elements ({len(tappable)}):")
        for elem in tappable[:10]:  # Limit output for tokens
            print(f"  {elem.type}: \"{elem.label or elem.value or 'Unnamed'}\" {elem.center}")

        if len(tappable) > 10:
            print(f"  ... and {len(tappable) - 10} more")
        sys.exit(0)

    # Search criteria shared by the tap / enter-text / find-only modes.
    # --find-text wins over --find-exact when both are given; matching is
    # fuzzy only when the text actually came from --find-text.
    text = args.find_text or args.find_exact
    fuzzy = bool(args.find_text)

    # Direct tap at coordinates
    if args.tap_at:
        coords = args.tap_at.split(",")
        if len(coords) != 2:
            print("Error: --tap-at requires x,y format")
            sys.exit(1)

        try:
            x, y = int(coords[0]), int(coords[1])
        except ValueError:
            # Non-numeric parts are a format error, not a crash.
            print("Error: --tap-at requires x,y format")
            sys.exit(1)

        # Handle coordinate transformation if requested
        if args.screenshot_coords:
            if not args.screenshot_width or not args.screenshot_height:
                print(
                    "Error: --screenshot-coords requires --screenshot-width and --screenshot-height"
                )
                sys.exit(1)

            device_w, device_h = get_device_screen_size(udid)
            x, y = transform_screenshot_coords(
                x,
                y,
                args.screenshot_width,
                args.screenshot_height,
                device_w,
                device_h,
            )
            print(
                f"Transformed screenshot coords ({coords[0]}, {coords[1]}) "
                f"to device coords ({x}, {y})"
            )

        if navigator.tap_at(x, y):
            print(f"Tapped at ({x}, {y})")
        else:
            print(f"Failed to tap at ({x}, {y})")
            sys.exit(1)

    # Find and tap
    elif args.tap:
        # Look the element up here so that --find-exact is honoured
        # (fuzzy=False) before tapping.
        element = navigator.find_element(
            text=text,
            element_type=args.find_type,
            identifier=args.find_id,
            index=args.index,
            fuzzy=fuzzy,
        )

        if element is None:
            criteria = []
            if text:
                criteria.append(f"text='{text}'")
            if args.find_type:
                criteria.append(f"type={args.find_type}")
            if args.find_id:
                criteria.append(f"id={args.find_id}")
            print(f"Not found: {', '.join(criteria)}")
            sys.exit(1)

        if not navigator.tap(element):
            print(f"Failed to tap: {element.description}")
            sys.exit(1)
        print(f"Tapped: {element.description} at {element.center}")

    # Find and enter text
    elif args.enter_text:
        # Defaults to the first matching TextField when no type is given.
        element = navigator.find_element(
            text=text,
            element_type=args.find_type or "TextField",
            identifier=args.find_id,
            index=args.index,
            fuzzy=fuzzy,
        )

        if element is None:
            print("TextField not found")
            sys.exit(1)

        if not navigator.enter_text(args.enter_text, element):
            print("Failed to enter text")
            sys.exit(1)
        print(f"Entered text in: {element.description}")

    # Just find (no action)
    else:
        element = navigator.find_element(
            text=text,
            element_type=args.find_type,
            identifier=args.find_id,
            index=args.index,
            fuzzy=fuzzy,
        )

        if element:
            print(f"Found: {element.description} at {element.center}")
        else:
            print("Element not found")
            sys.exit(1)
|
|
|
|
|
|
# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()
|