dotfiles/.agents/skills/ios-simulator-skill/scripts/navigator.py

#!/usr/bin/env python3
"""
iOS Simulator Navigator - Smart Element Finder and Interactor

Finds and interacts with UI elements using accessibility data.
Prioritizes structured navigation over pixel-based interaction.

This script is the core automation tool for iOS simulator navigation. It finds
UI elements by text, type, or accessibility ID and performs actions on them
(tap, enter text). Uses semantic element finding instead of fragile pixel coordinates.

Key Features:
- Find elements by text (fuzzy or exact matching)
- Find elements by type (Button, TextField, etc.)
- Find elements by accessibility identifier
- Tap elements at their center point
- Enter text into text fields
- List all tappable elements on screen
- Automatic element caching for performance

Usage Examples:
    # Find and tap a button by text
    python scripts/navigator.py --find-text "Login" --tap --udid <device-id>

    # Enter text into first text field
    python scripts/navigator.py --find-type TextField --index 0 --enter-text "username" --udid <device-id>

    # Tap element by accessibility ID
    python scripts/navigator.py --find-id "submitButton" --tap --udid <device-id>

    # List all interactive elements
    python scripts/navigator.py --list --udid <device-id>

    # Tap at specific coordinates (fallback)
    python scripts/navigator.py --tap-at 200,400 --udid <device-id>

Output Format:
    Tapped: Button "Login" at (320, 450)
    Entered text in: TextField "Username"
    Not found: text='Submit'

Navigation Priority (best to worst):
    1. Find by accessibility label/text (most reliable)
    2. Find by element type + index (good for forms)
    3. Find by accessibility ID (precise but app-specific)
    4. Tap at coordinates (last resort, fragile)

Technical Details:
- Uses IDB's accessibility tree via `idb ui describe-all --json --nested`
- Caches tree for multiple operations (call with force_refresh to update)
- Finds elements by parsing tree recursively
- Calculates tap coordinates from element frame center
- Uses `idb ui tap` for tapping, `idb ui text` for text entry
- Extracts data from AXLabel, AXValue, and AXUniqueId fields
"""

import argparse
import json
import subprocess
import sys
from dataclasses import dataclass

from common import (
    flatten_tree,
    get_accessibility_tree,
    get_device_screen_size,
    resolve_udid,
    transform_screenshot_coords,
)


@dataclass
class Element:
    """Represents a UI element from accessibility tree."""

    type: str
    label: str | None
    value: str | None
    identifier: str | None
    frame: dict[str, float]
    traits: list[str]
    enabled: bool = True

    @property
    def center(self) -> tuple[int, int]:
        """Calculate center point for tapping."""
        x = int(self.frame["x"] + self.frame["width"] / 2)
        y = int(self.frame["y"] + self.frame["height"] / 2)
        return (x, y)

    @property
    def description(self) -> str:
        """Human-readable description."""
        label = self.label or self.value or self.identifier or "Unnamed"
        return f'{self.type} "{label}"'


class Navigator:
    """Navigates iOS apps using accessibility data."""

    def __init__(self, udid: str | None = None):
        """Initialize navigator with optional device UDID."""
        self.udid = udid
        self._tree_cache = None

    def get_accessibility_tree(self, force_refresh: bool = False) -> dict:
        """Get accessibility tree (cached for efficiency)."""
        if self._tree_cache and not force_refresh:
            return self._tree_cache

        # Delegate to shared utility
        self._tree_cache = get_accessibility_tree(self.udid, nested=True)
        return self._tree_cache

    def _flatten_tree(self, node: dict, elements: list[Element] | None = None) -> list[Element]:
        """Flatten accessibility tree into list of elements."""
        if elements is None:
            elements = []

        # Create element from node
        if node.get("type"):
            element = Element(
                type=node.get("type", "Unknown"),
                label=node.get("AXLabel"),
                value=node.get("AXValue"),
                identifier=node.get("AXUniqueId"),
                frame=node.get("frame", {}),
                traits=node.get("traits", []),
                enabled=node.get("enabled", True),
            )
            elements.append(element)

        # Process children
        for child in node.get("children", []):
            self._flatten_tree(child, elements)

        return elements

    def find_element(
        self,
        text: str | None = None,
        element_type: str | None = None,
        identifier: str | None = None,
        index: int = 0,
        fuzzy: bool = True,
    ) -> Element | None:
        """
        Find element by various criteria.

        Args:
            text: Text to search in label/value
            element_type: Type of element (Button, TextField, etc.)
            identifier: Accessibility identifier
            index: Which matching element to return (0-based)
            fuzzy: Use fuzzy matching for text

        Returns:
            Element if found, None otherwise
        """
        tree = self.get_accessibility_tree()
        elements = self._flatten_tree(tree)

        matches = []

        for elem in elements:
            # Skip disabled elements
            if not elem.enabled:
                continue

            # Check type
            if element_type and elem.type != element_type:
                continue

            # Check identifier (exact match)
            if identifier and elem.identifier != identifier:
                continue

            # Check text (in label or value)
            if text:
                elem_text = (elem.label or "") + " " + (elem.value or "")
                if fuzzy:
                    if text.lower() not in elem_text.lower():
                        continue
                elif text not in (elem.label, elem.value):
                    continue

            matches.append(elem)

        if matches and index < len(matches):
            return matches[index]

        return None

    def tap(self, element: Element) -> bool:
        """Tap on an element."""
        x, y = element.center
        return self.tap_at(x, y)

    def tap_at(self, x: int, y: int) -> bool:
        """Tap at specific coordinates."""
        cmd = ["idb", "ui", "tap", str(x), str(y)]
        if self.udid:
            cmd.extend(["--udid", self.udid])

        try:
            subprocess.run(cmd, capture_output=True, check=True)
            return True
        except subprocess.CalledProcessError:
            return False

    def enter_text(self, text: str, element: Element | None = None) -> bool:
        """
        Enter text into element or current focus.

        Args:
            text: Text to enter
            element: Optional element to tap first

        Returns:
            Success status
        """
        # Tap element if provided
        if element:
            if not self.tap(element):
                return False
            # Small delay for focus
            import time

            time.sleep(0.5)

        # Enter text
        cmd = ["idb", "ui", "text", text]
        if self.udid:
            cmd.extend(["--udid", self.udid])

        try:
            subprocess.run(cmd, capture_output=True, check=True)
            return True
        except subprocess.CalledProcessError:
            return False

    def find_and_tap(
        self,
        text: str | None = None,
        element_type: str | None = None,
        identifier: str | None = None,
        index: int = 0,
    ) -> tuple[bool, str]:
        """
        Find element and tap it.

        Returns:
            (success, message) tuple
        """
        element = self.find_element(text, element_type, identifier, index)

        if not element:
            criteria = []
            if text:
                criteria.append(f"text='{text}'")
            if element_type:
                criteria.append(f"type={element_type}")
            if identifier:
                criteria.append(f"id={identifier}")
            return (False, f"Not found: {', '.join(criteria)}")

        if self.tap(element):
            return (True, f"Tapped: {element.description} at {element.center}")
        return (False, f"Failed to tap: {element.description}")

    def find_and_enter_text(
        self,
        text_to_enter: str,
        find_text: str | None = None,
        element_type: str | None = "TextField",
        identifier: str | None = None,
        index: int = 0,
    ) -> tuple[bool, str]:
        """
        Find element and enter text into it.

        Returns:
            (success, message) tuple
        """
        element = self.find_element(find_text, element_type, identifier, index)

        if not element:
            return (False, "TextField not found")

        if self.enter_text(text_to_enter, element):
            return (True, f"Entered text in: {element.description}")
        return (False, "Failed to enter text")


def main():
    """Main entry point."""
    parser = argparse.ArgumentParser(description="Navigate iOS apps using accessibility data")

    # Finding options
    parser.add_argument("--find-text", help="Find element by text (fuzzy match)")
    parser.add_argument("--find-exact", help="Find element by exact text")
    parser.add_argument("--find-type", help="Element type (Button, TextField, etc.)")
    parser.add_argument("--find-id", help="Accessibility identifier")
    parser.add_argument("--index", type=int, default=0, help="Which match to use (0-based)")

    # Action options
    parser.add_argument("--tap", action="store_true", help="Tap the found element")
    parser.add_argument("--tap-at", help="Tap at coordinates (x,y)")
    parser.add_argument("--enter-text", help="Enter text into element")

    # Coordinate transformation
    parser.add_argument(
        "--screenshot-coords",
        action="store_true",
        help="Interpret tap coordinates as from a screenshot (requires --screenshot-width/height)",
    )
    parser.add_argument(
        "--screenshot-width",
        type=int,
        help="Screenshot width for coordinate transformation",
    )
    parser.add_argument(
        "--screenshot-height",
        type=int,
        help="Screenshot height for coordinate transformation",
    )

    # Other options
    parser.add_argument(
        "--udid",
        help="Device UDID (auto-detects booted simulator if not provided)",
    )
    parser.add_argument("--list", action="store_true", help="List all tappable elements")

    args = parser.parse_args()

    # Resolve UDID with auto-detection
    try:
        udid = resolve_udid(args.udid)
    except RuntimeError as e:
        print(f"Error: {e}")
        sys.exit(1)

    navigator = Navigator(udid=udid)

    # List mode
    if args.list:
        tree = navigator.get_accessibility_tree()
        elements = navigator._flatten_tree(tree)

        # Filter to tappable elements
        tappable = [
            e
            for e in elements
            if e.enabled and e.type in ["Button", "Link", "Cell", "TextField", "SecureTextField"]
        ]

        print(f"Tappable elements ({len(tappable)}):")
        for elem in tappable[:10]:  # Limit output for tokens
            print(f"  {elem.type}: \"{elem.label or elem.value or 'Unnamed'}\" {elem.center}")

        if len(tappable) > 10:
            print(f"  ... and {len(tappable) - 10} more")
        sys.exit(0)

    # Direct tap at coordinates
    if args.tap_at:
        coords = args.tap_at.split(",")
        if len(coords) != 2:
            print("Error: --tap-at requires x,y format")
            sys.exit(1)

        x, y = int(coords[0]), int(coords[1])

        # Handle coordinate transformation if requested
        if args.screenshot_coords:
            if not args.screenshot_width or not args.screenshot_height:
                print(
                    "Error: --screenshot-coords requires --screenshot-width and --screenshot-height"
                )
                sys.exit(1)

            device_w, device_h = get_device_screen_size(udid)
            x, y = transform_screenshot_coords(
                x,
                y,
                args.screenshot_width,
                args.screenshot_height,
                device_w,
                device_h,
            )
            print(
                f"Transformed screenshot coords ({coords[0]}, {coords[1]}) "
                f"to device coords ({x}, {y})"
            )

        if navigator.tap_at(x, y):
            print(f"Tapped at ({x}, {y})")
        else:
            print(f"Failed to tap at ({x}, {y})")
            sys.exit(1)

    # Find and tap
    elif args.tap:
        text = args.find_text or args.find_exact
        fuzzy = args.find_text is not None

        success, message = navigator.find_and_tap(
            text=text, element_type=args.find_type, identifier=args.find_id, index=args.index
        )

        print(message)
        if not success:
            sys.exit(1)

    # Find and enter text
    elif args.enter_text:
        text = args.find_text or args.find_exact

        success, message = navigator.find_and_enter_text(
            text_to_enter=args.enter_text,
            find_text=text,
            element_type=args.find_type or "TextField",
            identifier=args.find_id,
            index=args.index,
        )

        print(message)
        if not success:
            sys.exit(1)

    # Just find (no action)
    else:
        text = args.find_text or args.find_exact
        fuzzy = args.find_text is not None

        element = navigator.find_element(
            text=text,
            element_type=args.find_type,
            identifier=args.find_id,
            index=args.index,
            fuzzy=fuzzy,
        )

        if element:
            print(f"Found: {element.description} at {element.center}")
        else:
            print("Element not found")
            sys.exit(1)


if __name__ == "__main__":
    main()