update skills

2026-03-17 16:53:22 -07:00
parent 0b0783ef8e
commit f9a530667e
389 changed files with 54512 additions and 1 deletion

@@ -0,0 +1,201 @@
Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
1. Definitions.
"License" shall mean the terms and conditions for use, reproduction,
and distribution as defined by Sections 1 through 9 of this document.
"Licensor" shall mean the copyright owner or entity authorized by
the copyright owner that is granting the License.
"Legal Entity" shall mean the union of the acting entity and all
other entities that control, are controlled by, or are under common
control with that entity. For the purposes of this definition,
"control" means (i) the power, direct or indirect, to cause the
direction or management of such entity, whether by contract or
otherwise, or (ii) ownership of fifty percent (50%) or more of the
outstanding shares, or (iii) beneficial ownership of such entity.
"You" (or "Your") shall mean an individual or Legal Entity
exercising permissions granted by this License.
"Source" form shall mean the preferred form for making modifications,
including but not limited to software source code, documentation
source, and configuration files.
"Object" form shall mean any form resulting from mechanical
transformation or translation of a Source form, including but
not limited to compiled object code, generated documentation,
and conversions to other media types.
"Work" shall mean the work of authorship, whether in Source or
Object form, made available under the License, as indicated by a
copyright notice that is included in or attached to the work
(an example is provided in the Appendix below).
"Derivative Works" shall mean any work, whether in Source or Object
form, that is based on (or derived from) the Work and for which the
editorial revisions, annotations, elaborations, or other modifications
represent, as a whole, an original work of authorship. For the purposes
of this License, Derivative Works shall not include works that remain
separable from, or merely link (or bind by name) to the interfaces of,
the Work and Derivative Works thereof.
"Contribution" shall mean any work of authorship, including
the original version of the Work and any modifications or additions
to that Work or Derivative Works thereof, that is intentionally
submitted to Licensor for inclusion in the Work by the copyright owner
or by an individual or Legal Entity authorized to submit on behalf of
the copyright owner. For the purposes of this definition, "submitted"
means any form of electronic, verbal, or written communication sent
to the Licensor or its representatives, including but not limited to
communication on electronic mailing lists, source code control systems,
and issue tracking systems that are managed by, or on behalf of, the
Licensor for the purpose of discussing and improving the Work, but
excluding communication that is conspicuously marked or otherwise
designated in writing by the copyright owner as "Not a Contribution."
"Contributor" shall mean Licensor and any individual or Legal Entity
on behalf of whom a Contribution has been received by Licensor and
subsequently incorporated within the Work.
2. Grant of Copyright License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
copyright license to reproduce, prepare Derivative Works of,
publicly display, publicly perform, sublicense, and distribute the
Work and such Derivative Works in Source or Object form.
3. Grant of Patent License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
(except as stated in this section) patent license to make, have made,
use, offer to sell, sell, import, and otherwise transfer the Work,
where such license applies only to those patent claims licensable
by such Contributor that are necessarily infringed by their
Contribution(s) alone or by combination of their Contribution(s)
with the Work to which such Contribution(s) was submitted. If You
institute patent litigation against any entity (including a
cross-claim or counterclaim in a lawsuit) alleging that the Work
or a Contribution incorporated within the Work constitutes direct
or contributory patent infringement, then any patent licenses
granted to You under this License for that Work shall terminate
as of the date such litigation is filed.
4. Redistribution. You may reproduce and distribute copies of the
Work or Derivative Works thereof in any medium, with or without
modifications, and in Source or Object form, provided that You
meet the following conditions:
(a) You must give any other recipients of the Work or
Derivative Works a copy of this License; and
(b) You must cause any modified files to carry prominent notices
stating that You changed the files; and
(c) You must retain, in the Source form of any Derivative Works
that You distribute, all copyright, patent, trademark, and
attribution notices from the Source form of the Work,
excluding those notices that do not pertain to any part of
the Derivative Works; and
(d) If the Work includes a "NOTICE" text file as part of its
distribution, then any Derivative Works that You distribute must
include a readable copy of the attribution notices contained
within such NOTICE file, excluding those notices that do not
pertain to any part of the Derivative Works, in at least one
of the following places: within a NOTICE text file distributed
as part of the Derivative Works; within the Source form or
documentation, if provided along with the Derivative Works; or,
within a display generated by the Derivative Works, if and
wherever such third-party notices normally appear. The contents
of the NOTICE file are for informational purposes only and
do not modify the License. You may add Your own attribution
notices within Derivative Works that You distribute, alongside
or as an addendum to the NOTICE text from the Work, provided
that such additional attribution notices cannot be construed
as modifying the License.
You may add Your own copyright statement to Your modifications and
may provide additional or different license terms and conditions
for use, reproduction, or distribution of Your modifications, or
for any such Derivative Works as a whole, provided Your use,
reproduction, and distribution of the Work otherwise complies with
the conditions stated in this License.
5. Submission of Contributions. Unless You explicitly state otherwise,
any Contribution intentionally submitted for inclusion in the Work
by You to the Licensor shall be under the terms and conditions of
this License, without any additional terms or conditions.
Notwithstanding the above, nothing herein shall supersede or modify
the terms of any separate license agreement you may have executed
with Licensor regarding such Contributions.
6. Trademarks. This License does not grant permission to use the trade
names, trademarks, service marks, or product names of the Licensor,
except as required for reasonable and customary use in describing the
origin of the Work and reproducing the content of the NOTICE file.
7. Disclaimer of Warranty. Unless required by applicable law or
agreed to in writing, Licensor provides the Work (and each
Contributor provides its Contributions) on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied, including, without limitation, any warranties or conditions
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
PARTICULAR PURPOSE. You are solely responsible for determining the
appropriateness of using or redistributing the Work and assume any
risks associated with Your exercise of permissions under this License.
8. Limitation of Liability. In no event and under no legal theory,
whether in tort (including negligence), contract, or otherwise,
unless required by applicable law (such as deliberate and grossly
negligent acts) or agreed to in writing, shall any Contributor be
liable to You for damages, including any direct, indirect, special,
incidental, or consequential damages of any character arising as a
result of this License or out of the use or inability to use the
Work (including but not limited to damages for loss of goodwill,
work stoppage, computer failure or malfunction, or any and all
other commercial damages or losses), even if such Contributor
has been advised of the possibility of such damages.
9. Accepting Warranty or Additional Liability. While redistributing
the Work or Derivative Works thereof, You may choose to offer,
and charge a fee for, acceptance of support, warranty, indemnity,
or other liability obligations and/or rights consistent with this
License. However, in accepting such obligations, You may act only
on Your own behalf and on Your sole responsibility, not on behalf of
any other Contributor, and only if You agree to indemnify,
defend, and hold each Contributor harmless for any liability
incurred by, or claims asserted against, such Contributor by reason
of your accepting any such warranty or additional liability.
END OF TERMS AND CONDITIONS
APPENDIX: How to apply the Apache License to your work.
To apply the Apache License to your work, attach the following
boilerplate notice, with the fields enclosed by brackets "[]"
replaced with your own identifying information. (Don't include
the brackets!) The text should be enclosed in the appropriate
comment syntax for the file format. We also recommend that a
file or class name and description of purpose be included on the
same "printed page" as the copyright notice for easier
identification within third-party archives.
Copyright [yyyy] [name of copyright owner]
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.

@@ -0,0 +1,144 @@
---
name: "speech"
description: "Use when the user asks for text-to-speech narration or voiceover, accessibility reads, audio prompts, or batch speech generation via the OpenAI Audio API; run the bundled CLI (`scripts/text_to_speech.py`) with built-in voices and require `OPENAI_API_KEY` for live calls. Custom voice creation is out of scope."
---
# Speech Generation Skill
Generate spoken audio for the current project (narration, product demo voiceover, IVR prompts, accessibility reads). Defaults to `gpt-4o-mini-tts-2025-12-15` and built-in voices, and prefers the bundled CLI for deterministic, reproducible runs.
## When to use
- Generate a single spoken clip from text
- Generate a batch of prompts (many lines, many files)
## Decision tree (single vs batch)
- If the user provides multiple lines/prompts or wants many outputs -> **batch**
- Else -> **single**
## Workflow
1. Decide intent: single vs batch (see decision tree above).
2. Collect inputs up front: exact text (verbatim), desired voice, delivery style, format, and any constraints.
3. If batch: write a temporary JSONL under `tmp/speech/` (one job per line), run once, then delete the JSONL.
4. Augment instructions into a short labeled spec without rewriting the input text.
5. Run the bundled CLI (`scripts/text_to_speech.py`) with sensible defaults (see references/cli.md).
6. For important clips, validate: intelligibility, pacing, pronunciation, and adherence to constraints.
7. Iterate with a single targeted change (voice, speed, or instructions), then re-check.
8. Save/return final outputs and note the final text + instructions + flags used.
## Temp and output conventions
- Use `tmp/speech/` for intermediate files (for example JSONL batches); delete when done.
- Write final artifacts under `output/speech/` when working in this repo.
- Use `--out` or `--out-dir` to control output paths; keep filenames stable and descriptive.
## Dependencies (install if missing)
Prefer `uv` for dependency management.
Python packages:
```
uv pip install openai
```
If `uv` is unavailable:
```
python3 -m pip install openai
```
## Environment
- `OPENAI_API_KEY` must be set for live API calls.
If the key is missing, give the user these steps:
1. Create an API key in the OpenAI platform UI: https://platform.openai.com/api-keys
2. Set `OPENAI_API_KEY` as an environment variable in their system.
3. Offer to guide them through setting the environment variable for their OS/shell if needed.
- Never ask the user to paste the full key in chat. Ask them to set it locally and confirm when ready.
If installation isn't possible in this environment, tell the user which dependency is missing and how to install it locally.
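The environment-variable step above can be sketched for a POSIX shell. The key value shown is a placeholder; the user should substitute their own key locally and never paste the real value into chat or commits:

```shell
# Set the key for the current shell session (placeholder value shown)
export OPENAI_API_KEY="sk-your-key-here"

# Confirm it is set without printing the value itself
if [ -n "$OPENAI_API_KEY" ]; then
  echo "OPENAI_API_KEY is set"
else
  echo "OPENAI_API_KEY is missing" >&2
fi
```

To persist it across sessions, add the `export` line to the shell profile (for example `~/.bashrc` or `~/.zshrc`); in Windows PowerShell the per-session equivalent is `$Env:OPENAI_API_KEY = "..."`.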
## Defaults & rules
- Use `gpt-4o-mini-tts-2025-12-15` unless the user requests another model.
- Default voice: `cedar`. If the user wants a brighter tone, prefer `marin`.
- Built-in voices only. Custom voices are out of scope for this skill.
- `instructions` are supported for GPT-4o mini TTS models, but not for `tts-1` or `tts-1-hd`.
- Input length must be <= 4096 characters per request. Split longer text into chunks.
- Enforce 50 requests/minute. The CLI caps `--rpm` at 50.
- Require `OPENAI_API_KEY` before any live API call.
- Provide a clear disclosure to end users that the voice is AI-generated.
- Use the OpenAI Python SDK (`openai` package) for all API calls; do not use raw HTTP.
- Prefer the bundled CLI (`scripts/text_to_speech.py`) over writing new one-off scripts.
- Never modify `scripts/text_to_speech.py`. If something is missing, ask the user before doing anything else.
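The 4096-character limit above means longer scripts must be split before any request. A minimal sketch of one way to chunk text on sentence boundaries; the helper name and the boundary heuristic are illustrative, not part of the bundled CLI:

```python
def chunk_text(text: str, max_chars: int = 4096) -> list[str]:
    """Split text into chunks of at most max_chars, preferring sentence ends."""
    chunks = []
    remaining = text.strip()
    while len(remaining) > max_chars:
        window = remaining[:max_chars]
        # Prefer to break after the last sentence-ending period in the window.
        cut = window.rfind(". ")
        if cut == -1:
            # Fall back to the last space, then to a hard cut.
            cut = window.rfind(" ")
            if cut == -1:
                cut = max_chars - 1
        chunks.append(remaining[: cut + 1].strip())
        remaining = remaining[cut + 1 :].strip()
    if remaining:
        chunks.append(remaining)
    return chunks
```

Each chunk can then be submitted as its own request (or as one JSONL job per chunk in a batch).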
## Instruction augmentation
Reformat user direction into a short, labeled spec. Only make implicit details explicit; do not invent new requirements.
Quick clarification (augmentation vs invention):
- If the user says "narration for a demo", you may add implied delivery constraints (clear, steady pacing, friendly tone).
- Do not introduce a new persona, accent, or emotional style the user did not request.
Template (include only relevant lines):
```
Voice Affect: <overall character and texture of the voice>
Tone: <attitude, formality, warmth>
Pacing: <slow, steady, brisk>
Emotion: <key emotions to convey>
Pronunciation: <words to enunciate or emphasize>
Pauses: <where to add intentional pauses>
Emphasis: <key words or phrases to stress>
Delivery: <cadence or rhythm notes>
```
Augmentation rules:
- Keep it short; add only details the user already implied or provided elsewhere.
- Do not rewrite the input text.
- If any critical detail is missing and blocks success, ask one targeted question; otherwise proceed.
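The labeled-spec template above can be assembled mechanically. A hypothetical helper (not part of the bundled CLI) that emits only the fields actually provided, in the template's order:

```python
# Field order matches the instruction template in this skill.
SPEC_FIELDS = [
    "Voice Affect", "Tone", "Pacing", "Emotion",
    "Pronunciation", "Pauses", "Emphasis", "Delivery",
]

def build_instructions(spec: dict[str, str]) -> str:
    """Render a labeled instruction spec, skipping empty or unknown fields."""
    lines = []
    for field in SPEC_FIELDS:
        value = spec.get(field, "").strip()
        if value:
            lines.append(f"{field}: {value}")
    return "\n".join(lines)
```

Because unknown keys are dropped, the helper also guards against inventing fields the template does not define.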
## Examples
### Single example (narration)
```
Input text: "Welcome to the demo. Today we'll show how it works."
Instructions:
Voice Affect: Warm and composed.
Tone: Friendly and confident.
Pacing: Steady and moderate.
Emphasis: Stress "demo" and "show".
```
### Batch example (IVR prompts)
```
{"input":"Thank you for calling. Please hold.","voice":"cedar","response_format":"mp3","out":"hold.mp3"}
{"input":"For sales, press 1. For support, press 2.","voice":"marin","instructions":"Tone: Clear and neutral. Pacing: Slow.","response_format":"wav"}
```
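Before running a batch like the one above, each JSONL line can be sanity-checked locally. A hedged sketch; the field names match the batch format shown in this document, but the validator itself is illustrative:

```python
import json

ALLOWED_VOICES = {"alloy", "ash", "ballad", "cedar", "coral", "echo", "fable",
                  "marin", "nova", "onyx", "sage", "shimmer", "verse"}
MAX_INPUT_CHARS = 4096

def validate_jobs(jsonl_text: str) -> list[str]:
    """Return human-readable problems; an empty list means the batch looks OK."""
    problems = []
    for lineno, line in enumerate(jsonl_text.splitlines(), start=1):
        if not line.strip():
            continue  # allow blank lines between jobs
        try:
            job = json.loads(line)
        except json.JSONDecodeError as exc:
            problems.append(f"line {lineno}: invalid JSON ({exc.msg})")
            continue
        text = job.get("input", "")
        if not text:
            problems.append(f"line {lineno}: missing 'input'")
        elif len(text) > MAX_INPUT_CHARS:
            problems.append(f"line {lineno}: input exceeds {MAX_INPUT_CHARS} chars")
        voice = job.get("voice")
        if voice is not None and voice not in ALLOWED_VOICES:
            problems.append(f"line {lineno}: unknown voice {voice!r}")
    return problems
```

Running this before `speak-batch` catches malformed lines cheaply, without spending any API calls.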
## Instruction-writing best practices (short list)
- Structure directions as: affect -> tone -> pacing -> emotion -> pronunciation/pauses -> emphasis.
- Keep 4 to 8 short lines; avoid conflicting guidance.
- For names/acronyms, add pronunciation hints (e.g., "enunciate A-I") or supply a phonetic spelling in the text.
- For edits/iterations, repeat invariants (e.g., "keep pacing steady") to reduce drift.
- Iterate with single-change follow-ups.
More principles: `references/prompting.md`. Copy/paste specs: `references/sample-prompts.md`.
## Guidance by use case
Use these modules when the request is for a specific delivery style. They provide targeted defaults and templates.
- Narration / explainer: `references/narration.md`
- Product demo / voiceover: `references/voiceover.md`
- IVR / phone prompts: `references/ivr.md`
- Accessibility reads: `references/accessibility.md`
## CLI + environment notes
- CLI commands + examples: `references/cli.md`
- API parameter quick reference: `references/audio-api.md`
- Instruction patterns + examples: `references/voice-directions.md`
- If network approvals / sandbox settings are getting in the way: `references/codex-network.md`
## Reference map
- **`references/cli.md`**: how to run speech generation/batches via `scripts/text_to_speech.py` (commands, flags, recipes).
- **`references/audio-api.md`**: API parameters, limits, voice list.
- **`references/voice-directions.md`**: instruction patterns and examples.
- **`references/prompting.md`**: instruction best practices (structure, constraints, iteration patterns).
- **`references/sample-prompts.md`**: copy/paste instruction recipes (examples only; no extra theory).
- **`references/narration.md`**: templates + defaults for narration and explainers.
- **`references/voiceover.md`**: templates + defaults for product demo voiceovers.
- **`references/ivr.md`**: templates + defaults for IVR/phone prompts.
- **`references/accessibility.md`**: templates + defaults for accessibility reads.
- **`references/codex-network.md`**: environment/sandbox/network-approval troubleshooting.

@@ -0,0 +1,6 @@
interface:
display_name: "Speech Generation Skill"
short_description: "Generate narrated audio from text"
icon_small: "./assets/speech-small.svg"
icon_large: "./assets/speech.png"
default_prompt: "Generate spoken audio for this text with the right voice style, pacing, and output format."

@@ -0,0 +1,3 @@
<svg xmlns="http://www.w3.org/2000/svg" width="14" height="14" fill="currentColor" viewBox="0 0 14 14">
<path d="M7.78 4.001c.245 0 .444.199.444.444v6.666a.444.444 0 0 1-.887 0V4.445c0-.245.199-.444.444-.444ZM5.836 7.89c.245 0 .443.199.443.443v1.112a.444.444 0 0 1-.886 0V8.333c0-.244.198-.443.443-.443Zm3.889-2.222c.244 0 .443.199.443.443v3.334a.444.444 0 0 1-.887 0V6.11c0-.244.199-.443.444-.443ZM11.67 6.78c.244 0 .443.198.443.443v1.11a.444.444 0 0 1-.887 0v-1.11c0-.245.198-.444.443-.444ZM6.114 1.779c.245 0 .443.198.443.443v.988a.444.444 0 0 1-.886 0v-.545H4.335v3.558h.297l.09.01a.444.444 0 0 1 0 .868l-.09.009h-1.48a.444.444 0 0 1-.001-.887h.297V2.665H2.113v.545a.444.444 0 0 1-.887 0v-.988c0-.245.199-.443.443-.443h4.445Z"/>
</svg>

Binary file not shown (1.2 KiB).

@@ -0,0 +1,32 @@
# Accessibility read defaults
## Suggested defaults
- Voice: `cedar`
- Format: `mp3` or `wav`
- Speed: `0.95` to `1.0`
## Guidance
- Keep delivery steady and neutral.
- Enunciate acronyms and numbers.
- Avoid dramatic or stylized delivery.
## Instruction template
```
Voice Affect: Neutral and clear.
Tone: Informational and steady.
Pacing: Slow and consistent.
Pronunciation: Enunciate acronyms and numbers.
Emphasis: Stress key warnings or labels.
```
## Example (short)
Input text:
"Warning: High voltage. Keep hands clear."
Instructions:
```
Voice Affect: Neutral and clear.
Tone: Informational and steady.
Pacing: Slow and consistent.
Emphasis: Stress "Warning" and "High voltage".
```

@@ -0,0 +1,31 @@
# Audio Speech API quick reference
## Endpoint
- Create speech: `POST /v1/audio/speech`
## Default model
- `gpt-4o-mini-tts-2025-12-15`
## Other speech models (if requested)
- `gpt-4o-mini-tts`
- `tts-1`
- `tts-1-hd`
## Core parameters
- `model`: speech model
- `input`: text to synthesize (max 4096 characters)
- `voice`: built-in voice name
- `instructions`: optional style directions (not supported for `tts-1` or `tts-1-hd`)
- `response_format`: `mp3`, `opus`, `aac`, `flac`, `wav`, or `pcm`
- `speed`: 0.25 to 4.0
## Built-in voices
- `alloy`, `ash`, `ballad`, `cedar`, `coral`, `echo`, `fable`, `marin`, `nova`, `onyx`, `sage`, `shimmer`, `verse`
## Output notes
- Default format is `mp3`.
- `pcm` is raw 24 kHz 16-bit little-endian samples (no header).
- `wav` includes a header (better for quick playback).
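Since `pcm` output is raw headerless samples, wrapping it in a WAV container is sometimes needed for playback. A minimal stdlib sketch, assuming the 24 kHz, 16-bit little-endian mono format noted above:

```python
import wave

def pcm_to_wav(pcm_bytes: bytes, wav_path: str,
               sample_rate: int = 24000, channels: int = 1) -> None:
    """Wrap raw 16-bit little-endian PCM samples in a WAV header."""
    with wave.open(wav_path, "wb") as wav_file:
        wav_file.setnchannels(channels)
        wav_file.setsampwidth(2)  # 16-bit samples = 2 bytes each
        wav_file.setframerate(sample_rate)
        wav_file.writeframes(pcm_bytes)
```

Reading the header back with `wave.open(path, "rb")` confirms the parameters round-trip correctly.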
## Compliance note
- Provide a clear disclosure that the voice is AI-generated.

@@ -0,0 +1,99 @@
# CLI reference (`scripts/text_to_speech.py`)
This file contains the "command catalog" for the bundled speech generation CLI. Keep `SKILL.md` as overview-first; put verbose CLI details here.
## What this CLI does
- `speak`: generate a single audio file
- `speak-batch`: run many jobs from a JSONL file (one job per line)
- `list-voices`: list supported voices
Real API calls require network access + `OPENAI_API_KEY`. `--dry-run` does not.
## Quick start (works from any repo)
Set a stable path to the skill CLI (default `CODEX_HOME` is `~/.codex`):
```
export CODEX_HOME="${CODEX_HOME:-$HOME/.codex}"
export TTS_GEN="$CODEX_HOME/skills/speech/scripts/text_to_speech.py"
```
Dry-run (no API call; no network required; does not require the `openai` package):
```
python "$TTS_GEN" speak --input "Test" --dry-run
```
Generate (requires `OPENAI_API_KEY` + network):
```
uv run --with openai python "$TTS_GEN" speak \
--input "Today is a wonderful day to build something people love!" \
--voice cedar \
--instructions "Voice Affect: Warm and composed. Tone: upbeat and encouraging." \
--response-format mp3 \
--out speech.mp3
```
No `uv` installed? Use your active Python env:
```
python "$TTS_GEN" speak --input "Hello" --voice cedar --out speech.mp3
```
## Guardrails (important)
- Use `python "$TTS_GEN" ...` (or equivalent full path) for all TTS work.
- Do **not** create one-off runners (e.g., `gen_audio.py`) unless the user explicitly asks.
- **Never modify** `scripts/text_to_speech.py`. If something is missing, ask the user before doing anything else.
## Defaults (unless overridden by flags)
- Model: `gpt-4o-mini-tts-2025-12-15`
- Voice: `cedar`
- Response format: `mp3`
- Speed: `1.0`
- Batch rpm cap: `50`
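The `--rpm` cap above translates to a minimum delay between requests. A sketch of the client-side pacing arithmetic; the bundled CLI handles this itself, so these helpers are purely illustrative:

```python
import time

MAX_RPM = 50

def min_interval(rpm: int) -> float:
    """Seconds to wait between requests for a given requests-per-minute cap."""
    rpm = min(max(rpm, 1), MAX_RPM)  # clamp to 1..50, matching the CLI cap
    return 60.0 / rpm

def paced(jobs, rpm: int = MAX_RPM):
    """Yield jobs, sleeping between them to stay under the rate limit."""
    interval = min_interval(rpm)
    for i, job in enumerate(jobs):
        if i > 0:
            time.sleep(interval)
        yield job
```

At the 50 rpm cap this works out to 1.2 seconds between requests, which is why large batches take at least `len(jobs) / 50` minutes.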
## Input limits
- Input text must be <= 4096 characters per request.
- For longer text, split into smaller chunks (manual or via batch JSONL).
## Instructions compatibility
- `instructions` are supported for GPT-4o mini TTS models.
- `tts-1` and `tts-1-hd` ignore instructions (the CLI will warn and drop them).
## Common recipes
List voices:
```
python "$TTS_GEN" list-voices
```
Generate with explicit pacing:
```
python "$TTS_GEN" speak \
--input "Welcome to the demo. We'll show how it works." \
--instructions "Tone: friendly and confident. Pacing: steady and moderate." \
--out demo.mp3
```
Batch generation (JSONL):
```
mkdir -p tmp/speech
cat > tmp/speech/jobs.jsonl << 'JSONL'
{"input":"Thank you for calling. Please hold.","voice":"cedar","response_format":"mp3","out":"hold.mp3"}
{"input":"For sales, press 1. For support, press 2.","voice":"marin","instructions":"Tone: clear and neutral. Pacing: slow.","response_format":"wav"}
JSONL
python "$TTS_GEN" speak-batch --input tmp/speech/jobs.jsonl --out-dir out --rpm 50
# Cleanup (recommended)
rm -f tmp/speech/jobs.jsonl
```
Notes:
- Use `--rpm` to control rate limiting (default `50`, max `50`).
- Per-job overrides are supported in JSONL (`model`, `voice`, `response_format`, `speed`, `instructions`, `out`).
- Treat the JSONL file as temporary: write it under `tmp/` and delete it after the run (do not commit it).
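The temporary-JSONL convention above can be scripted. A sketch that writes one job per line under `tmp/speech/`; the file layout and field names follow the batch examples in this document, but the helper itself is illustrative:

```python
import json
from pathlib import Path

def write_jobs(jobs: list[dict], path: str = "tmp/speech/jobs.jsonl") -> Path:
    """Write one JSON object per line; returns the path for the speak-batch run."""
    out = Path(path)
    out.parent.mkdir(parents=True, exist_ok=True)
    with out.open("w", encoding="utf-8") as fh:
        for job in jobs:
            fh.write(json.dumps(job, ensure_ascii=False) + "\n")
    return out
```

After `speak-batch` finishes, delete the file (for example `out.unlink()`), matching the cleanup step in the recipe.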
## See also
- API parameter quick reference: `references/audio-api.md`
- Instruction patterns and examples: `references/voice-directions.md`

@@ -0,0 +1,28 @@
# Codex network approvals / sandbox notes
This guidance is intentionally isolated from `SKILL.md` because it can vary by environment and may become stale. Prefer the defaults in your environment when in doubt.
## Why am I asked to approve every speech generation call?
Speech generation uses the OpenAI Audio API, so the CLI needs outbound network access. In many Codex setups, network access is disabled by default (especially under stricter sandbox modes), and/or the approval policy may require confirmation before networked commands run.
## How do I reduce repeated approval prompts (network)?
If you trust the repo and want fewer prompts, enable network access for the relevant sandbox mode and relax the approval policy.
Example `~/.codex/config.toml` pattern:
```
approval_policy = "never"
sandbox_mode = "workspace-write"
[sandbox_workspace_write]
network_access = true
```
Or for a single session:
```
codex --sandbox workspace-write --ask-for-approval never
```
## Safety note
Use caution: enabling network and disabling approvals reduces friction but increases risk if you run untrusted code or work in an untrusted repository.

@@ -0,0 +1,32 @@
# IVR / phone prompt defaults
## Suggested defaults
- Voice: `cedar` (clear) or `marin` (brighter)
- Format: `wav`
- Speed: `0.9` to `1.0`
## Guidance
- Prioritize clarity and slower pacing.
- Enunciate numbers and menu options.
- Keep sentences short and consistent.
## Instruction template
```
Voice Affect: Clear and neutral.
Tone: Professional and concise.
Pacing: Slow and even.
Pronunciation: Enunciate numbers and menu options.
Emphasis: Stress the option numbers.
```
## Example (short)
Input text:
"For sales, press 1. For support, press 2."
Instructions:
```
Voice Affect: Clear and neutral.
Tone: Professional and concise.
Pacing: Slow and even.
Emphasis: Stress "press 1" and "press 2".
```

@@ -0,0 +1,31 @@
# Narration / explainer defaults
## Suggested defaults
- Voice: `cedar`
- Format: `mp3`
- Speed: `1.0`
## Guidance
- Keep pacing steady and clear.
- Emphasize section headings and key transitions.
- If the script is long, chunk it into logical paragraphs.
## Instruction template
```
Voice Affect: Warm and composed.
Tone: Friendly and confident.
Pacing: Steady and moderate.
Emphasis: Stress section titles and key terms.
Pauses: Brief pause after each section.
```
## Example (short)
Input text:
"Welcome to the demo. Today we'll show how it works."
Instructions:
```
Voice Affect: Warm and composed.
Tone: Friendly and confident.
Pacing: Steady and moderate.
```

@@ -0,0 +1,38 @@
# Instruction-writing best practices (TTS)
## Contents
- Structure
- Specificity
- Avoiding conflicts
- Pronunciation and names
- Pauses and pacing
- Iterate deliberately
- Where to find copy/paste recipes
## Structure
- Use a consistent order: affect -> tone -> pacing -> emotion -> pronunciation/pauses -> emphasis -> delivery.
- For complex requests, use short labeled lines instead of a long paragraph.
## Specificity
- Name the delivery you want ("calm and steady" vs "friendly").
- If you need a specific cadence, call it out explicitly ("slow and measured", "brisk and energetic").
## Avoiding conflicts
- Do not mix opposing instructions ("fast and slow", "formal and casual").
- Keep instructions short: 4 to 8 lines are usually enough.
## Pronunciation and names
- For acronyms, write the pronunciation hint in text ("A-I" instead of "AI").
- For names or brands, add a simple phonetic guide in the input text if clarity matters.
- If a word must be emphasized, add an Emphasis line and repeat the word exactly.
## Pauses and pacing
- Use punctuation or short line breaks in the input text to create natural pauses.
- Use the Pauses line for intentional pauses ("pause after the greeting").
## Iterate deliberately
- Start with a clean base instruction set, then make one change at a time.
- Repeat critical constraints on each iteration ("keep pacing steady").
## Where to find copy/paste recipes
For copy/paste instruction templates, see `references/sample-prompts.md`. This file focuses on principles, structure, and iteration patterns.

@@ -0,0 +1,44 @@
# Sample instruction templates (copy/paste)
These are short instruction blocks. Use only the lines you need and keep them consistent with the input text.
## Friendly product demo
```
Voice Affect: Warm and composed.
Tone: Friendly and confident.
Pacing: Steady and moderate.
Emphasis: Stress key product benefits.
```
## Calm support update
```
Voice Affect: Calm and reassuring.
Tone: Sincere and empathetic.
Pacing: Slow and steady.
Emotion: Warmth and care.
Pauses: Brief pause after apologies.
```
## IVR menu
```
Voice Affect: Clear and neutral.
Tone: Professional and concise.
Pacing: Slow and even.
Emphasis: Stress menu options and numbers.
```
## Accessibility readout
```
Voice Affect: Neutral and clear.
Tone: Informational and steady.
Pacing: Slow and consistent.
Pronunciation: Enunciate acronyms and numbers.
```
## Energetic intro
```
Voice Affect: Bright and upbeat.
Tone: Enthusiastic and welcoming.
Pacing: Brisk but clear.
Emphasis: Stress the opening greeting.
```

@@ -0,0 +1,80 @@
# Voice directions
## Template
Use only the lines you need. Keep directions concise and aligned to the input text.
```
Voice Affect: <overall character and texture>
Tone: <attitude, formality, warmth>
Pacing: <slow, steady, brisk>
Emotion: <key emotions to convey>
Pronunciation: <words to enunciate or emphasize>
Pauses: <where to insert brief pauses>
Emphasis: <key phrases to stress>
Delivery: <cadence or rhythm notes>
```
## Best practices
- Keep 4 to 8 short lines. Avoid conflicting instructions.
- Prefer concrete guidance over adjectives alone.
- Do not rewrite the input text in the instructions; only guide delivery.
- If you need a language or accent, write the input text in that language.
- Repeat critical constraints (for example: "slow and steady") when iterating.
## Examples (short)
### Calm support
```
Voice Affect: Calm and composed, reassuring.
Tone: Sincere and empathetic.
Pacing: Steady and moderate.
Emotion: Warmth and genuine care.
Pronunciation: Clear, with emphasis on key reassurances.
Pauses: Brief pauses after apologies and before requests.
```
### Dramatic narrator
```
Voice Affect: Low and suspenseful.
Tone: Serious and mysterious.
Pacing: Slow and deliberate.
Emotion: Restrained intensity.
Emphasis: Highlight sensory details and cliffhanger lines.
Pauses: Add pauses after suspenseful moments.
```
### Fitness instructor
```
Voice Affect: High energy and upbeat.
Tone: Motivational and encouraging.
Pacing: Fast and dynamic.
Emotion: Enthusiasm and momentum.
Emphasis: Stress action verbs and countdowns.
```
### Serene guide
```
Voice Affect: Soft and soothing.
Tone: Calm and reassuring.
Pacing: Slow and unhurried.
Emotion: Peaceful warmth.
Pauses: Gentle pauses after breathing cues.
```
### Robot agent
```
Voice Affect: Monotone and mechanical.
Tone: Neutral and formal.
Pacing: Even and controlled.
Emotion: None; strictly informational.
Pronunciation: Precise and consistent.
```
### Old-time announcer
```
Voice Affect: Refined and theatrical.
Tone: Formal and welcoming.
Pacing: Steady with a classic cadence.
Emotion: Warm enthusiasm.
Pronunciation: Crisp enunciation with vintage flair.
```

@@ -0,0 +1,31 @@
# Product demo / voiceover defaults
## Suggested defaults
- Voice: `cedar` (neutral) or `marin` (brighter)
- Format: `wav` for video sync, `mp3` for quick review
- Speed: `1.0`
## Guidance
- Keep tone confident and helpful.
- Emphasize product benefits and call-to-action phrases.
- Avoid overly dramatic delivery unless requested.
## Instruction template
```
Voice Affect: Confident and composed.
Tone: Helpful and upbeat.
Pacing: Steady, slightly brisk.
Emphasis: Stress product benefits and the call to action.
```
## Example (short)
Input text:
"Meet the new dashboard. Find insights faster and act with confidence."
Instructions:
```
Voice Affect: Confident and composed.
Tone: Helpful and upbeat.
Pacing: Steady, slightly brisk.
Emphasis: Stress "insights" and "confidence".
```

@@ -0,0 +1,528 @@
#!/usr/bin/env python3
"""Generate speech audio with the OpenAI Audio API (TTS).
Defaults to gpt-4o-mini-tts-2025-12-15 and a built-in voice (cedar).
"""
from __future__ import annotations
import argparse
import json
import os
from pathlib import Path
import re
import sys
import time
from typing import Any, Dict, List, Optional
DEFAULT_MODEL = "gpt-4o-mini-tts-2025-12-15"
DEFAULT_VOICE = "cedar"
DEFAULT_RESPONSE_FORMAT = "mp3"
DEFAULT_SPEED = 1.0
MAX_INPUT_CHARS = 4096
MAX_RPM = 50
DEFAULT_RPM = 50
DEFAULT_ATTEMPTS = 3
ALLOWED_VOICES = {
"alloy",
"ash",
"ballad",
"cedar",
"coral",
"echo",
"fable",
"marin",
"nova",
"onyx",
"sage",
"shimmer",
"verse",
}
ALLOWED_FORMATS = {"mp3", "opus", "aac", "flac", "wav", "pcm"}
def _die(message: str, code: int = 1) -> None:
print(f"Error: {message}", file=sys.stderr)
raise SystemExit(code)
def _warn(message: str) -> None:
print(f"Warning: {message}", file=sys.stderr)
def _ensure_api_key(dry_run: bool) -> None:
if os.getenv("OPENAI_API_KEY"):
print("OPENAI_API_KEY is set.", file=sys.stderr)
return
if dry_run:
_warn("OPENAI_API_KEY is not set; dry-run only.")
return
_die("OPENAI_API_KEY is not set. Export it before running.")
def _read_text(text: Optional[str], text_file: Optional[str], label: str) -> str:
if text and text_file:
_die(f"Use --{label} or --{label}-file, not both.")
if text_file:
path = Path(text_file)
if not path.exists():
_die(f"{label} file not found: {path}")
return path.read_text(encoding="utf-8").strip()
if text:
return str(text).strip()
_die(f"Missing {label}. Use --{label} or --{label}-file.")
return "" # unreachable
def _validate_input(text: str) -> None:
if not text:
_die("Input text is empty.")
if len(text) > MAX_INPUT_CHARS:
_die(
f"Input text exceeds {MAX_INPUT_CHARS} characters. Split into smaller chunks."
)
def _normalize_voice(voice: Optional[str]) -> str:
if not voice:
return DEFAULT_VOICE
value = str(voice).strip().lower()
if value not in ALLOWED_VOICES:
_die(
"voice must be one of: " + ", ".join(sorted(ALLOWED_VOICES))
)
return value
def _normalize_format(fmt: Optional[str]) -> str:
if not fmt:
return DEFAULT_RESPONSE_FORMAT
value = str(fmt).strip().lower()
if value not in ALLOWED_FORMATS:
_die("response-format must be one of: " + ", ".join(sorted(ALLOWED_FORMATS)))
return value
def _normalize_speed(speed: Optional[float]) -> Optional[float]:
if speed is None:
return None
    try:
        value = float(speed)
    except (TypeError, ValueError):
        _die("speed must be a number")
if value < 0.25 or value > 4.0:
_die("speed must be between 0.25 and 4.0")
return value
def _normalize_output_path(out: Optional[str], response_format: str) -> Path:
if out:
path = Path(out)
if path.exists() and path.is_dir():
return path / f"speech.{response_format}"
if path.suffix == "":
return path.with_suffix("." + response_format)
if path.suffix.lstrip(".").lower() != response_format:
_warn(
f"Output extension {path.suffix} does not match response-format {response_format}."
)
return path
return Path(f"speech.{response_format}")
def _create_client():
try:
from openai import OpenAI
except ImportError:
_die("openai SDK not installed. Install with `uv pip install openai`.")
return OpenAI()
def _extract_retry_after_seconds(exc: Exception) -> Optional[float]:
for attr in ("retry_after", "retry_after_seconds"):
val = getattr(exc, attr, None)
if isinstance(val, (int, float)) and val >= 0:
return float(val)
msg = str(exc)
    m = re.search(r"retry[- ]after[:= ]+([0-9]+(?:\.[0-9]+)?)", msg, re.IGNORECASE)
if m:
try:
return float(m.group(1))
except Exception:
return None
return None
def _is_rate_limit_error(exc: Exception) -> bool:
name = exc.__class__.__name__.lower()
if "ratelimit" in name or "rate_limit" in name:
return True
msg = str(exc).lower()
return "429" in msg or "rate limit" in msg or "too many requests" in msg
def _is_transient_error(exc: Exception) -> bool:
if _is_rate_limit_error(exc):
return True
name = exc.__class__.__name__.lower()
if "timeout" in name or "timedout" in name or "tempor" in name:
return True
msg = str(exc).lower()
return "timeout" in msg or "timed out" in msg or "connection reset" in msg
def _maybe_drop_instructions(model: str, instructions: Optional[str]) -> Optional[str]:
if instructions and model in {"tts-1", "tts-1-hd"}:
_warn("instructions are not supported for tts-1 / tts-1-hd; ignoring.")
return None
return instructions
def _print_payload(payload: Dict[str, Any]) -> None:
print(json.dumps(payload, indent=2, sort_keys=True))
def _write_audio(
client: Any,
payload: Dict[str, Any],
out_path: Path,
*,
dry_run: bool,
force: bool,
attempts: int,
) -> None:
if dry_run:
_print_payload(payload)
print(f"Would write {out_path}")
return
_ensure_api_key(dry_run)
if out_path.exists() and not force:
_die(f"Output already exists: {out_path} (use --force to overwrite)")
out_path.parent.mkdir(parents=True, exist_ok=True)
last_exc: Optional[Exception] = None
for attempt in range(1, attempts + 1):
try:
with client.audio.speech.with_streaming_response.create(**payload) as response:
response.stream_to_file(out_path)
print(f"Wrote {out_path}")
return
except Exception as exc:
last_exc = exc
if not _is_transient_error(exc) or attempt >= attempts:
raise
sleep_s = _extract_retry_after_seconds(exc)
if sleep_s is None:
sleep_s = min(60.0, 2.0 ** attempt)
print(
f"Attempt {attempt}/{attempts} failed ({exc.__class__.__name__}); retrying in {sleep_s:.1f}s",
file=sys.stderr,
)
time.sleep(sleep_s)
if last_exc:
raise last_exc
def _slugify(value: str) -> str:
value = value.strip().lower()
value = re.sub(r"[^a-z0-9]+", "-", value)
value = re.sub(r"-+", "-", value).strip("-")
return value[:60] if value else "job"
def _read_jobs_jsonl(path: str) -> List[Dict[str, Any]]:
p = Path(path)
if not p.exists():
_die(f"Input file not found: {p}")
jobs: List[Dict[str, Any]] = []
for line_no, raw in enumerate(p.read_text(encoding="utf-8").splitlines(), start=1):
line = raw.strip()
if not line or line.startswith("#"):
continue
if line.startswith("{"):
try:
item = json.loads(line)
except json.JSONDecodeError as exc:
_die(f"Invalid JSON on line {line_no}: {exc}")
if not isinstance(item, dict):
_die(f"Invalid job on line {line_no}: expected object")
jobs.append(item)
else:
jobs.append({"input": line})
if not jobs:
_die("No jobs found in input file.")
return jobs
def _job_input(job: Dict[str, Any]) -> str:
for key in ("input", "text", "prompt"):
if key in job and str(job[key]).strip():
return str(job[key]).strip()
_die("Job missing input text (use 'input').")
return "" # unreachable
def _merge_non_null(base: Dict[str, Any], extra: Dict[str, Any]) -> Dict[str, Any]:
merged = dict(base)
for k, v in extra.items():
if v is not None:
merged[k] = v
return merged
def _enforce_rpm(rpm: int) -> int:
if rpm <= 0:
_die("rpm must be > 0")
if rpm > MAX_RPM:
_warn(f"rpm capped at {MAX_RPM} (requested {rpm}).")
return MAX_RPM
return rpm
def _sleep_for_rate_limit(last_ts: Optional[float], rpm: int) -> float:
min_interval = 60.0 / float(rpm)
now = time.monotonic()
if last_ts is None:
return now
elapsed = now - last_ts
if elapsed < min_interval:
time.sleep(min_interval - elapsed)
return time.monotonic()
def _list_voices() -> None:
for name in sorted(ALLOWED_VOICES):
print(name)
def _run_speak(args: argparse.Namespace) -> int:
if args.list_voices:
_list_voices()
return 0
input_text = _read_text(args.input, args.input_file, "input")
_validate_input(input_text)
instructions = None
if args.instructions or args.instructions_file:
instructions = _read_text(args.instructions, args.instructions_file, "instructions")
model = str(args.model).strip()
voice = _normalize_voice(args.voice)
response_format = _normalize_format(args.response_format)
speed = _normalize_speed(args.speed)
instructions = _maybe_drop_instructions(model, instructions)
payload: Dict[str, Any] = {
"model": model,
"voice": voice,
"input": input_text,
"response_format": response_format,
}
if instructions:
payload["instructions"] = instructions
if speed is not None:
payload["speed"] = speed
out_path = _normalize_output_path(args.out, response_format)
if args.dry_run:
_ensure_api_key(True)
_print_payload(payload)
print(f"Would write {out_path}")
return 0
client = _create_client()
_write_audio(
client,
payload,
out_path,
dry_run=args.dry_run,
force=args.force,
attempts=args.attempts,
)
return 0
def _run_speak_batch(args: argparse.Namespace) -> int:
jobs = _read_jobs_jsonl(args.input)
out_dir = Path(args.out_dir)
base_instructions = None
if args.instructions or args.instructions_file:
base_instructions = _read_text(args.instructions, args.instructions_file, "instructions")
base_payload = {
"model": str(args.model).strip(),
"voice": _normalize_voice(args.voice),
"response_format": _normalize_format(args.response_format),
"speed": _normalize_speed(args.speed),
"instructions": base_instructions,
}
rpm = _enforce_rpm(args.rpm)
last_ts: Optional[float] = None
if args.dry_run:
_ensure_api_key(True)
client = None if args.dry_run else _create_client()
for idx, job in enumerate(jobs, start=1):
input_text = _job_input(job)
_validate_input(input_text)
job_payload = dict(base_payload)
job_payload["input"] = input_text
overrides: Dict[str, Any] = {}
if "model" in job:
overrides["model"] = str(job["model"]).strip()
if "voice" in job:
overrides["voice"] = _normalize_voice(job["voice"])
if "response_format" in job or "format" in job:
overrides["response_format"] = _normalize_format(job.get("response_format") or job.get("format"))
if "speed" in job and job["speed"] is not None:
overrides["speed"] = _normalize_speed(job["speed"])
if "instructions" in job and str(job["instructions"]).strip():
overrides["instructions"] = str(job["instructions"]).strip()
job_payload = _merge_non_null(job_payload, overrides)
job_payload["instructions"] = _maybe_drop_instructions(
job_payload["model"], job_payload.get("instructions")
)
if job_payload.get("instructions") is None:
job_payload.pop("instructions", None)
response_format = job_payload["response_format"]
explicit_out = job.get("out")
if explicit_out:
out_path = _normalize_output_path(str(explicit_out), response_format)
if out_path.is_absolute():
out_path = out_dir / out_path.name
else:
out_path = out_dir / out_path
else:
slug = _slugify(input_text[:80])
out_path = out_dir / f"{idx:03d}-{slug}.{response_format}"
if args.dry_run:
_print_payload(job_payload)
print(f"Would write {out_path}")
continue
last_ts = _sleep_for_rate_limit(last_ts, rpm)
if client is None:
client = _create_client()
_write_audio(
client,
job_payload,
out_path,
dry_run=False,
force=args.force,
attempts=args.attempts,
)
return 0
def _add_common_args(parser: argparse.ArgumentParser) -> None:
parser.add_argument(
"--model",
default=DEFAULT_MODEL,
help=f"Model to use (default: {DEFAULT_MODEL})",
)
parser.add_argument(
"--voice",
default=DEFAULT_VOICE,
help=f"Voice to use (default: {DEFAULT_VOICE})",
)
parser.add_argument(
"--response-format",
default=DEFAULT_RESPONSE_FORMAT,
help=f"Output format (default: {DEFAULT_RESPONSE_FORMAT})",
)
parser.add_argument(
"--speed",
type=float,
default=DEFAULT_SPEED,
help=f"Speech speed (0.25-4.0, default: {DEFAULT_SPEED})",
)
parser.add_argument(
"--instructions",
help="Style directions for the voice",
)
parser.add_argument(
"--instructions-file",
help="Path to instructions text file",
)
parser.add_argument(
"--attempts",
type=int,
default=DEFAULT_ATTEMPTS,
help=f"Retries on transient errors (default: {DEFAULT_ATTEMPTS})",
)
parser.add_argument(
"--dry-run",
action="store_true",
help="Print payload; do not call the API",
)
parser.add_argument(
"--force",
action="store_true",
help="Overwrite output files if they exist",
)
def main() -> int:
parser = argparse.ArgumentParser(
description="Generate speech audio using the OpenAI Audio API."
)
subparsers = parser.add_subparsers(dest="command", required=True)
list_voices = subparsers.add_parser("list-voices", help="List supported voices")
list_voices.set_defaults(func=lambda _args: (_list_voices() or 0))
speak = subparsers.add_parser("speak", help="Generate a single audio file")
speak.add_argument("--input", help="Input text")
speak.add_argument("--input-file", help="Path to input text file")
speak.add_argument("--out", help="Output file path")
speak.add_argument(
"--list-voices",
action="store_true",
help="Print voices and exit",
)
_add_common_args(speak)
speak.set_defaults(func=_run_speak)
batch = subparsers.add_parser("speak-batch", help="Generate from JSONL jobs")
batch.add_argument("--input", required=True, help="Path to JSONL file")
batch.add_argument(
"--out-dir",
default="out",
help="Output directory (default: out)",
)
batch.add_argument(
"--rpm",
type=int,
default=DEFAULT_RPM,
help=f"Requests per minute cap (default: {DEFAULT_RPM}, max: {MAX_RPM})",
)
_add_common_args(batch)
batch.set_defaults(func=_run_speak_batch)
args = parser.parse_args()
return int(args.func(args))
if __name__ == "__main__":
raise SystemExit(main())