Files
dotfiles/.agents/skills/ios-simulator-skill/scripts/common/screenshot_utils.py
2026-02-19 00:33:08 -08:00

339 lines
10 KiB
Python

#!/usr/bin/env python3
"""
Screenshot utilities with dual-mode support.
Provides unified screenshot handling with:
- File-based mode: Persistent artifacts for test documentation
- Inline base64 mode: Vision-based automation for agent analysis
- Size presets: Token optimization (full/half/quarter/thumb)
- Semantic naming: {appName}_{screenName}_{state}_{timestamp}.png
Supports resize operations via PIL (optional dependency).
Used by:
- test_recorder.py - Step-based screenshot recording
- app_state_capture.py - State snapshot captures
"""
import base64
import os
import subprocess
import sys
from datetime import datetime
from pathlib import Path
from typing import Any
# Try to import PIL for resizing, but make it optional
try:
from PIL import Image
HAS_PIL = True
except ImportError:
HAS_PIL = False
def generate_screenshot_name(
app_name: str | None = None,
screen_name: str | None = None,
state: str | None = None,
timestamp: str | None = None,
extension: str = "png",
) -> str:
"""Generate semantic screenshot filename.
Format: {appName}_{screenName}_{state}_{timestamp}.{ext}
Falls back to: screenshot_{timestamp}.{ext}
Args:
app_name: Application name (e.g., 'MyApp')
screen_name: Screen name (e.g., 'Login')
state: State description (e.g., 'Empty', 'Filled', 'Error')
timestamp: ISO timestamp (uses current time if None)
extension: File extension (default: 'png')
Returns:
Semantic filename ready for safe file creation
Example:
name = generate_screenshot_name('MyApp', 'Login', 'Empty')
# Returns: 'MyApp_Login_Empty_20251028-143052.png'
name = generate_screenshot_name()
# Returns: 'screenshot_20251028-143052.png'
"""
if timestamp is None:
timestamp = datetime.now().strftime("%Y%m%d-%H%M%S")
# Build semantic name
if app_name or screen_name or state:
parts = [app_name, screen_name, state]
parts = [p for p in parts if p] # Filter None/empty
name = "_".join(parts) + f"_{timestamp}"
else:
name = f"screenshot_{timestamp}"
return f"{name}.{extension}"
def get_size_preset(size: str = "half") -> tuple[float, float]:
    """Look up the scale factors that belong to a size preset.

    Every preset scales both axes by the same factor. A preset name that is
    not recognised falls back to the 'half' factors instead of raising.

    Args:
        size: 'full', 'half', 'quarter', 'thumb'

    Returns:
        Tuple of (scale_x, scale_y) for resizing

    Example:
        scale_x, scale_y = get_size_preset('half')
        # Returns: (0.5, 0.5)
    """
    uniform_factor = {
        "full": 1.0,
        "half": 0.5,
        "quarter": 0.25,
        "thumb": 0.1,
    }
    # Unknown names share the 'half' factor.
    factor = uniform_factor.get(size, 0.5)
    return (factor, factor)
def resize_screenshot(
input_path: str,
output_path: str | None = None,
size: str = "half",
quality: int = 85,
) -> tuple[str, int, int]:
"""Resize screenshot for token optimization.
Requires PIL (Pillow). Falls back gracefully without it.
Args:
input_path: Path to original screenshot
output_path: Output path (uses input_path if None)
size: 'full', 'half', 'quarter', 'thumb'
quality: JPEG quality (1-100, default: 85)
Returns:
Tuple of (output_path, width, height) of resized image
Raises:
FileNotFoundError: If input file doesn't exist
ValueError: If PIL not installed and size != 'full'
Example:
output, w, h = resize_screenshot(
'screenshot.png',
'screenshot_half.png',
'half'
)
print(f"Resized to {w}x{h}")
"""
input_file = Path(input_path)
if not input_file.exists():
raise FileNotFoundError(f"Screenshot not found: {input_path}")
# If full size, just copy
if size == "full":
if output_path:
import shutil
shutil.copy(input_path, output_path)
output_file = Path(output_path)
else:
output_file = input_file
# Get original dimensions
if HAS_PIL:
img = Image.open(str(output_file))
return (str(output_file), img.width, img.height)
return (str(output_file), 0, 0) # Dimensions unknown without PIL
# Need PIL to resize
if not HAS_PIL:
raise ValueError(
f"Size preset '{size}' requires PIL (Pillow). " "Install with: pip3 install pillow"
)
# Open original image
img = Image.open(str(input_file))
orig_w, orig_h = img.size
# Calculate new size
scale_x, scale_y = get_size_preset(size)
new_w = int(orig_w * scale_x)
new_h = int(orig_h * scale_y)
# Resize with high-quality resampling
resized = img.resize((new_w, new_h), Image.Resampling.LANCZOS)
# Determine output path
if output_path is None:
# Insert size marker before extension
stem = input_file.stem
suffix = input_file.suffix
output_path = str(input_file.parent / f"{stem}_{size}{suffix}")
# Save resized image
resized.save(output_path, quality=quality, optimize=True)
return (output_path, new_w, new_h)
def capture_screenshot(
udid: str,
output_path: str | None = None,
size: str = "half",
inline: bool = False,
app_name: str | None = None,
screen_name: str | None = None,
state: str | None = None,
) -> dict[str, Any]:
"""Capture screenshot with flexible output modes.
Supports both file-based (persistent artifacts) and inline base64 modes
(for vision-based automation).
Args:
udid: Device UDID
output_path: File path for file mode (generates semantic name if None)
size: 'full', 'half', 'quarter', 'thumb' (default: 'half')
inline: If True, returns base64 data instead of saving to file
app_name: App name for semantic naming
screen_name: Screen name for semantic naming
state: State description for semantic naming
Returns:
Dict with mode-specific fields:
File mode:
{
'mode': 'file',
'file_path': str,
'size_bytes': int,
'width': int,
'height': int,
'size_preset': str
}
Inline mode:
{
'mode': 'inline',
'base64_data': str,
'mime_type': 'image/png',
'width': int,
'height': int,
'size_preset': str
}
Example:
# File mode
result = capture_screenshot('ABC123', app_name='MyApp')
print(f"Saved to: {result['file_path']}")
# Inline mode
result = capture_screenshot('ABC123', inline=True, size='half')
print(f"Screenshot: {result['width']}x{result['height']}")
print(f"Base64: {result['base64_data'][:50]}...")
"""
try:
# Capture raw screenshot to temp file
temp_path = "/tmp/ios_simulator_screenshot.png"
cmd = ["xcrun", "simctl", "io", udid, "screenshot", temp_path]
subprocess.run(cmd, capture_output=True, text=True, check=True)
if inline:
# Inline mode: resize and convert to base64
# Resize if needed
if size != "full" and HAS_PIL:
resized_path, width, height = resize_screenshot(temp_path, size=size)
else:
resized_path = temp_path
# Get dimensions via PIL if available
if HAS_PIL:
img = Image.open(resized_path)
width, height = img.size
else:
width, height = 390, 844 # Fallback to common device size
# Read and encode as base64
with open(resized_path, "rb") as f:
base64_data = base64.b64encode(f.read()).decode("utf-8")
# Clean up temp files
Path(temp_path).unlink(missing_ok=True)
if resized_path != temp_path:
Path(resized_path).unlink(missing_ok=True)
return {
"mode": "inline",
"base64_data": base64_data,
"mime_type": "image/png",
"width": width,
"height": height,
"size_preset": size,
}
# File mode: save to output path with semantic naming
if output_path is None:
output_path = generate_screenshot_name(app_name, screen_name, state)
# Resize if needed
if size != "full" and HAS_PIL:
final_path, width, height = resize_screenshot(temp_path, output_path, size)
else:
# Just move temp to output
import shutil
shutil.move(temp_path, output_path)
final_path = output_path
# Get dimensions via PIL if available
if HAS_PIL:
img = Image.open(final_path)
width, height = img.size
else:
width, height = 390, 844 # Fallback
# Get file size
size_bytes = Path(final_path).stat().st_size
return {
"mode": "file",
"file_path": final_path,
"size_bytes": size_bytes,
"width": width,
"height": height,
"size_preset": size,
}
except subprocess.CalledProcessError as e:
raise RuntimeError(f"Failed to capture screenshot: {e.stderr}") from e
except Exception as e:
raise RuntimeError(f"Screenshot capture error: {e!s}") from e
def format_screenshot_result(result: dict[str, Any]) -> str:
    """Render a capture result as a short multi-line summary.

    A result whose 'mode' is 'file' is summarised by path, dimensions and
    byte size; any other mode is treated as inline and summarised by
    dimensions and base64 payload length.

    Args:
        result: Result dictionary from capture_screenshot()

    Returns:
        Formatted string for printing

    Example:
        result = capture_screenshot('ABC123', inline=True)
        print(format_screenshot_result(result))
    """
    if result["mode"] != "file":
        inline_lines = (
            f"Screenshot (inline): {result['width']}x{result['height']}\n",
            f"Base64 length: {len(result['base64_data'])} chars",
        )
        return "".join(inline_lines)

    file_lines = (
        f"Screenshot: {result['file_path']}\n",
        f"Dimensions: {result['width']}x{result['height']}\n",
        f"Size: {result['size_bytes']} bytes",
    )
    return "".join(file_lines)