mirror of
https://github.com/ksyasuda/dotfiles.git
synced 2026-03-20 06:11:27 -07:00
update skills
This commit is contained in:
201
.agents/skills/speech/LICENSE.txt
Normal file
201
.agents/skills/speech/LICENSE.txt
Normal file
@@ -0,0 +1,201 @@
|
||||
Apache License
|
||||
Version 2.0, January 2004
|
||||
http://www.apache.org/licenses/
|
||||
|
||||
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
|
||||
|
||||
1. Definitions.
|
||||
|
||||
"License" shall mean the terms and conditions for use, reproduction,
|
||||
and distribution as defined by Sections 1 through 9 of this document.
|
||||
|
||||
"Licensor" shall mean the copyright owner or entity authorized by
|
||||
the copyright owner that is granting the License.
|
||||
|
||||
"Legal Entity" shall mean the union of the acting entity and all
|
||||
other entities that control, are controlled by, or are under common
|
||||
control with that entity. For the purposes of this definition,
|
||||
"control" means (i) the power, direct or indirect, to cause the
|
||||
direction or management of such entity, whether by contract or
|
||||
otherwise, or (ii) ownership of fifty percent (50%) or more of the
|
||||
outstanding shares, or (iii) beneficial ownership of such entity.
|
||||
|
||||
"You" (or "Your") shall mean an individual or Legal Entity
|
||||
exercising permissions granted by this License.
|
||||
|
||||
"Source" form shall mean the preferred form for making modifications,
|
||||
including but not limited to software source code, documentation
|
||||
source, and configuration files.
|
||||
|
||||
"Object" form shall mean any form resulting from mechanical
|
||||
transformation or translation of a Source form, including but
|
||||
not limited to compiled object code, generated documentation,
|
||||
and conversions to other media types.
|
||||
|
||||
"Work" shall mean the work of authorship, whether in Source or
|
||||
Object form, made available under the License, as indicated by a
|
||||
copyright notice that is included in or attached to the work
|
||||
(an example is provided in the Appendix below).
|
||||
|
||||
"Derivative Works" shall mean any work, whether in Source or Object
|
||||
form, that is based on (or derived from) the Work and for which the
|
||||
editorial revisions, annotations, elaborations, or other modifications
|
||||
represent, as a whole, an original work of authorship. For the purposes
|
||||
of this License, Derivative Works shall not include works that remain
|
||||
separable from, or merely link (or bind by name) to the interfaces of,
|
||||
the Work and Derivative Works thereof.
|
||||
|
||||
"Contribution" shall mean any work of authorship, including
|
||||
the original version of the Work and any modifications or additions
|
||||
to that Work or Derivative Works thereof, that is intentionally
|
||||
submitted to Licensor for inclusion in the Work by the copyright owner
|
||||
or by an individual or Legal Entity authorized to submit on behalf of
|
||||
the copyright owner. For the purposes of this definition, "submitted"
|
||||
means any form of electronic, verbal, or written communication sent
|
||||
to the Licensor or its representatives, including but not limited to
|
||||
communication on electronic mailing lists, source code control systems,
|
||||
and issue tracking systems that are managed by, or on behalf of, the
|
||||
Licensor for the purpose of discussing and improving the Work, but
|
||||
excluding communication that is conspicuously marked or otherwise
|
||||
designated in writing by the copyright owner as "Not a Contribution."
|
||||
|
||||
"Contributor" shall mean Licensor and any individual or Legal Entity
|
||||
on behalf of whom a Contribution has been received by Licensor and
|
||||
subsequently incorporated within the Work.
|
||||
|
||||
2. Grant of Copyright License. Subject to the terms and conditions of
|
||||
this License, each Contributor hereby grants to You a perpetual,
|
||||
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
|
||||
copyright license to reproduce, prepare Derivative Works of,
|
||||
publicly display, publicly perform, sublicense, and distribute the
|
||||
Work and such Derivative Works in Source or Object form.
|
||||
|
||||
3. Grant of Patent License. Subject to the terms and conditions of
|
||||
this License, each Contributor hereby grants to You a perpetual,
|
||||
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
|
||||
(except as stated in this section) patent license to make, have made,
|
||||
use, offer to sell, sell, import, and otherwise transfer the Work,
|
||||
where such license applies only to those patent claims licensable
|
||||
by such Contributor that are necessarily infringed by their
|
||||
Contribution(s) alone or by combination of their Contribution(s)
|
||||
with the Work to which such Contribution(s) was submitted. If You
|
||||
institute patent litigation against any entity (including a
|
||||
cross-claim or counterclaim in a lawsuit) alleging that the Work
|
||||
or a Contribution incorporated within the Work constitutes direct
|
||||
or contributory patent infringement, then any patent licenses
|
||||
granted to You under this License for that Work shall terminate
|
||||
as of the date such litigation is filed.
|
||||
|
||||
4. Redistribution. You may reproduce and distribute copies of the
|
||||
Work or Derivative Works thereof in any medium, with or without
|
||||
modifications, and in Source or Object form, provided that You
|
||||
meet the following conditions:
|
||||
|
||||
(a) You must give any other recipients of the Work or
|
||||
Derivative Works a copy of this License; and
|
||||
|
||||
(b) You must cause any modified files to carry prominent notices
|
||||
stating that You changed the files; and
|
||||
|
||||
(c) You must retain, in the Source form of any Derivative Works
|
||||
that You distribute, all copyright, patent, trademark, and
|
||||
attribution notices from the Source form of the Work,
|
||||
excluding those notices that do not pertain to any part of
|
||||
the Derivative Works; and
|
||||
|
||||
(d) If the Work includes a "NOTICE" text file as part of its
|
||||
distribution, then any Derivative Works that You distribute must
|
||||
include a readable copy of the attribution notices contained
|
||||
within such NOTICE file, excluding those notices that do not
|
||||
pertain to any part of the Derivative Works, in at least one
|
||||
of the following places: within a NOTICE text file distributed
|
||||
as part of the Derivative Works; within the Source form or
|
||||
documentation, if provided along with the Derivative Works; or,
|
||||
within a display generated by the Derivative Works, if and
|
||||
wherever such third-party notices normally appear. The contents
|
||||
of the NOTICE file are for informational purposes only and
|
||||
do not modify the License. You may add Your own attribution
|
||||
notices within Derivative Works that You distribute, alongside
|
||||
or as an addendum to the NOTICE text from the Work, provided
|
||||
that such additional attribution notices cannot be construed
|
||||
as modifying the License.
|
||||
|
||||
You may add Your own copyright statement to Your modifications and
|
||||
may provide additional or different license terms and conditions
|
||||
for use, reproduction, or distribution of Your modifications, or
|
||||
for any such Derivative Works as a whole, provided Your use,
|
||||
reproduction, and distribution of the Work otherwise complies with
|
||||
the conditions stated in this License.
|
||||
|
||||
5. Submission of Contributions. Unless You explicitly state otherwise,
|
||||
any Contribution intentionally submitted for inclusion in the Work
|
||||
by You to the Licensor shall be under the terms and conditions of
|
||||
this License, without any additional terms or conditions.
|
||||
Notwithstanding the above, nothing herein shall supersede or modify
|
||||
the terms of any separate license agreement you may have executed
|
||||
with Licensor regarding such Contributions.
|
||||
|
||||
6. Trademarks. This License does not grant permission to use the trade
|
||||
names, trademarks, service marks, or product names of the Licensor,
|
||||
except as required for reasonable and customary use in describing the
|
||||
origin of the Work and reproducing the content of the NOTICE file.
|
||||
|
||||
7. Disclaimer of Warranty. Unless required by applicable law or
|
||||
agreed to in writing, Licensor provides the Work (and each
|
||||
Contributor provides its Contributions) on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
|
||||
implied, including, without limitation, any warranties or conditions
|
||||
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
|
||||
PARTICULAR PURPOSE. You are solely responsible for determining the
|
||||
appropriateness of using or redistributing the Work and assume any
|
||||
risks associated with Your exercise of permissions under this License.
|
||||
|
||||
8. Limitation of Liability. In no event and under no legal theory,
|
||||
whether in tort (including negligence), contract, or otherwise,
|
||||
unless required by applicable law (such as deliberate and grossly
|
||||
negligent acts) or agreed to in writing, shall any Contributor be
|
||||
liable to You for damages, including any direct, indirect, special,
|
||||
incidental, or consequential damages of any character arising as a
|
||||
result of this License or out of the use or inability to use the
|
||||
Work (including but not limited to damages for loss of goodwill,
|
||||
work stoppage, computer failure or malfunction, or any and all
|
||||
other commercial damages or losses), even if such Contributor
|
||||
has been advised of the possibility of such damages.
|
||||
|
||||
9. Accepting Warranty or Additional Liability. While redistributing
|
||||
the Work or Derivative Works thereof, You may choose to offer,
|
||||
and charge a fee for, acceptance of support, warranty, indemnity,
|
||||
or other liability obligations and/or rights consistent with this
|
||||
License. However, in accepting such obligations, You may act only
|
||||
on Your own behalf and on Your sole responsibility, not on behalf of
|
||||
any other Contributor, and only if You agree to indemnify,
|
||||
defend, and hold each Contributor harmless for any liability
|
||||
incurred by, or claims asserted against, such Contributor by reason
|
||||
of your accepting any such warranty or additional liability.
|
||||
|
||||
END OF TERMS AND CONDITIONS
|
||||
|
||||
APPENDIX: How to apply the Apache License to your work.
|
||||
|
||||
To apply the Apache License to your work, attach the following
|
||||
boilerplate notice, with the fields enclosed by brackets "[]"
|
||||
replaced with your own identifying information. (Don't include
|
||||
the brackets!) The text should be enclosed in the appropriate
|
||||
comment syntax for the file format. We also recommend that a
|
||||
file or class name and description of purpose be included on the
|
||||
same "printed page" as the copyright notice for easier
|
||||
identification within third-party archives.
|
||||
|
||||
Copyright [yyyy] [name of copyright owner]
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
144
.agents/skills/speech/SKILL.md
Normal file
144
.agents/skills/speech/SKILL.md
Normal file
@@ -0,0 +1,144 @@
|
||||
---
|
||||
name: "speech"
|
||||
description: "Use when the user asks for text-to-speech narration or voiceover, accessibility reads, audio prompts, or batch speech generation via the OpenAI Audio API; run the bundled CLI (`scripts/text_to_speech.py`) with built-in voices and require `OPENAI_API_KEY` for live calls. Custom voice creation is out of scope."
|
||||
---
|
||||
|
||||
|
||||
# Speech Generation Skill
|
||||
|
||||
Generate spoken audio for the current project (narration, product demo voiceover, IVR prompts, accessibility reads). Defaults to `gpt-4o-mini-tts-2025-12-15` and built-in voices, and prefers the bundled CLI for deterministic, reproducible runs.
|
||||
|
||||
## When to use
|
||||
- Generate a single spoken clip from text
|
||||
- Generate a batch of prompts (many lines, many files)
|
||||
|
||||
## Decision tree (single vs batch)
|
||||
- If the user provides multiple lines/prompts or wants many outputs -> **batch**
|
||||
- Else -> **single**
|
||||
|
||||
## Workflow
|
||||
1. Decide intent: single vs batch (see decision tree above).
|
||||
2. Collect inputs up front: exact text (verbatim), desired voice, delivery style, format, and any constraints.
|
||||
3. If batch: write a temporary JSONL under tmp/ (one job per line), run once, then delete the JSONL.
|
||||
4. Augment instructions into a short labeled spec without rewriting the input text.
|
||||
5. Run the bundled CLI (`scripts/text_to_speech.py`) with sensible defaults (see references/cli.md).
|
||||
6. For important clips, validate: intelligibility, pacing, pronunciation, and adherence to constraints.
|
||||
7. Iterate with a single targeted change (voice, speed, or instructions), then re-check.
|
||||
8. Save/return final outputs and note the final text + instructions + flags used.
|
||||
|
||||
## Temp and output conventions
|
||||
- Use `tmp/speech/` for intermediate files (for example JSONL batches); delete when done.
|
||||
- Write final artifacts under `output/speech/` when working in this repo.
|
||||
- Use `--out` or `--out-dir` to control output paths; keep filenames stable and descriptive.
|
||||
|
||||
## Dependencies (install if missing)
|
||||
Prefer `uv` for dependency management.
|
||||
|
||||
Python packages:
|
||||
```
|
||||
uv pip install openai
|
||||
```
|
||||
If `uv` is unavailable:
|
||||
```
|
||||
python3 -m pip install openai
|
||||
```
|
||||
|
||||
## Environment
|
||||
- `OPENAI_API_KEY` must be set for live API calls.
|
||||
|
||||
If the key is missing, give the user these steps:
|
||||
1. Create an API key in the OpenAI platform UI: https://platform.openai.com/api-keys
|
||||
2. Set `OPENAI_API_KEY` as an environment variable in their system.
|
||||
3. Offer to guide them through setting the environment variable for their OS/shell if needed.
|
||||
- Never ask the user to paste the full key in chat. Ask them to set it locally and confirm when ready.
|
||||
|
||||
If installation isn't possible in this environment, tell the user which dependency is missing and how to install it locally.
|
||||
|
||||
## Defaults & rules
|
||||
- Use `gpt-4o-mini-tts-2025-12-15` unless the user requests another model.
|
||||
- Default voice: `cedar`. If the user wants a brighter tone, prefer `marin`.
|
||||
- Built-in voices only. Custom voices are out of scope for this skill.
|
||||
- `instructions` are supported for GPT-4o mini TTS models, but not for `tts-1` or `tts-1-hd`.
|
||||
- Input length must be <= 4096 characters per request. Split longer text into chunks.
|
||||
- Enforce 50 requests/minute. The CLI caps `--rpm` at 50.
|
||||
- Require `OPENAI_API_KEY` before any live API call.
|
||||
- Provide a clear disclosure to end users that the voice is AI-generated.
|
||||
- Use the OpenAI Python SDK (`openai` package) for all API calls; do not use raw HTTP.
|
||||
- Prefer the bundled CLI (`scripts/text_to_speech.py`) over writing new one-off scripts.
|
||||
- Never modify `scripts/text_to_speech.py`. If something is missing, ask the user before doing anything else.
|
||||
|
||||
## Instruction augmentation
|
||||
Reformat user direction into a short, labeled spec. Only make implicit details explicit; do not invent new requirements.
|
||||
|
||||
Quick clarification (augmentation vs invention):
|
||||
- If the user says "narration for a demo", you may add implied delivery constraints (clear, steady pacing, friendly tone).
|
||||
- Do not introduce a new persona, accent, or emotional style the user did not request.
|
||||
|
||||
Template (include only relevant lines):
|
||||
```
|
||||
Voice Affect: <overall character and texture of the voice>
|
||||
Tone: <attitude, formality, warmth>
|
||||
Pacing: <slow, steady, brisk>
|
||||
Emotion: <key emotions to convey>
|
||||
Pronunciation: <words to enunciate or emphasize>
|
||||
Pauses: <where to add intentional pauses>
|
||||
Emphasis: <key words or phrases to stress>
|
||||
Delivery: <cadence or rhythm notes>
|
||||
```
|
||||
|
||||
Augmentation rules:
|
||||
- Keep it short; add only details the user already implied or provided elsewhere.
|
||||
- Do not rewrite the input text.
|
||||
- If any critical detail is missing and blocks success, ask a question; otherwise proceed.
|
||||
|
||||
## Examples
|
||||
|
||||
### Single example (narration)
|
||||
```
|
||||
Input text: "Welcome to the demo. Today we'll show how it works."
|
||||
Instructions:
|
||||
Voice Affect: Warm and composed.
|
||||
Tone: Friendly and confident.
|
||||
Pacing: Steady and moderate.
|
||||
Emphasis: Stress "demo" and "show".
|
||||
```
|
||||
|
||||
### Batch example (IVR prompts)
|
||||
```
|
||||
{"input":"Thank you for calling. Please hold.","voice":"cedar","response_format":"mp3","out":"hold.mp3"}
|
||||
{"input":"For sales, press 1. For support, press 2.","voice":"marin","instructions":"Tone: Clear and neutral. Pacing: Slow.","response_format":"wav"}
|
||||
```
|
||||
|
||||
## Instruction best practices (short list)
|
||||
- Structure directions as: affect -> tone -> pacing -> emotion -> pronunciation/pauses -> emphasis.
|
||||
- Keep 4 to 8 short lines; avoid conflicting guidance.
|
||||
- For names/acronyms, add pronunciation hints (e.g., "enunciate A-I") or supply a phonetic spelling in the text.
|
||||
- For edits/iterations, repeat invariants (e.g., "keep pacing steady") to reduce drift.
|
||||
- Iterate with single-change follow-ups.
|
||||
|
||||
More principles: `references/prompting.md`. Copy/paste specs: `references/sample-prompts.md`.
|
||||
|
||||
## Guidance by use case
|
||||
Use these modules when the request is for a specific delivery style. They provide targeted defaults and templates.
|
||||
- Narration / explainer: `references/narration.md`
|
||||
- Product demo / voiceover: `references/voiceover.md`
|
||||
- IVR / phone prompts: `references/ivr.md`
|
||||
- Accessibility reads: `references/accessibility.md`
|
||||
|
||||
## CLI + environment notes
|
||||
- CLI commands + examples: `references/cli.md`
|
||||
- API parameter quick reference: `references/audio-api.md`
|
||||
- Instruction patterns + examples: `references/voice-directions.md`
|
||||
- If network approvals / sandbox settings are getting in the way: `references/codex-network.md`
|
||||
|
||||
## Reference map
|
||||
- **`references/cli.md`**: how to run speech generation/batches via `scripts/text_to_speech.py` (commands, flags, recipes).
|
||||
- **`references/audio-api.md`**: API parameters, limits, voice list.
|
||||
- **`references/voice-directions.md`**: instruction patterns and examples.
|
||||
- **`references/prompting.md`**: instruction best practices (structure, constraints, iteration patterns).
|
||||
- **`references/sample-prompts.md`**: copy/paste instruction recipes (examples only; no extra theory).
|
||||
- **`references/narration.md`**: templates + defaults for narration and explainers.
|
||||
- **`references/voiceover.md`**: templates + defaults for product demo voiceovers.
|
||||
- **`references/ivr.md`**: templates + defaults for IVR/phone prompts.
|
||||
- **`references/accessibility.md`**: templates + defaults for accessibility reads.
|
||||
- **`references/codex-network.md`**: environment/sandbox/network-approval troubleshooting.
|
||||
6
.agents/skills/speech/agents/openai.yaml
Normal file
6
.agents/skills/speech/agents/openai.yaml
Normal file
@@ -0,0 +1,6 @@
|
||||
interface:
|
||||
display_name: "Speech Generation Skill"
|
||||
short_description: "Generate narrated audio from text"
|
||||
icon_small: "./assets/speech-small.svg"
|
||||
icon_large: "./assets/speech.png"
|
||||
default_prompt: "Generate spoken audio for this text with the right voice style, pacing, and output format."
|
||||
3
.agents/skills/speech/assets/speech-small.svg
Normal file
3
.agents/skills/speech/assets/speech-small.svg
Normal file
@@ -0,0 +1,3 @@
|
||||
<svg xmlns="http://www.w3.org/2000/svg" width="14" height="14" fill="currentColor" viewBox="0 0 14 14">
|
||||
<path d="M7.78 4.001c.245 0 .444.199.444.444v6.666a.444.444 0 0 1-.887 0V4.445c0-.245.199-.444.444-.444ZM5.836 7.89c.245 0 .443.199.443.443v1.112a.444.444 0 0 1-.886 0V8.333c0-.244.198-.443.443-.443Zm3.889-2.222c.244 0 .443.199.443.443v3.334a.444.444 0 0 1-.887 0V6.11c0-.244.199-.443.444-.443ZM11.67 6.78c.244 0 .443.198.443.443v1.11a.444.444 0 0 1-.887 0v-1.11c0-.245.198-.444.443-.444ZM6.114 1.779c.245 0 .443.198.443.443v.988a.444.444 0 0 1-.886 0v-.545H4.335v3.558h.297l.09.01a.444.444 0 0 1 0 .868l-.09.009h-1.48a.444.444 0 0 1-.001-.887h.297V2.665H2.113v.545a.444.444 0 0 1-.887 0v-.988c0-.245.199-.443.443-.443h4.445Z"/>
|
||||
</svg>
|
||||
|
After Width: | Height: | Size: 742 B |
BIN
.agents/skills/speech/assets/speech.png
Normal file
BIN
.agents/skills/speech/assets/speech.png
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 1.2 KiB |
32
.agents/skills/speech/references/accessibility.md
Normal file
32
.agents/skills/speech/references/accessibility.md
Normal file
@@ -0,0 +1,32 @@
|
||||
# Accessibility read defaults
|
||||
|
||||
## Suggested defaults
|
||||
- Voice: `cedar`
|
||||
- Format: `mp3` or `wav`
|
||||
- Speed: `0.95` to `1.0`
|
||||
|
||||
## Guidance
|
||||
- Keep delivery steady and neutral.
|
||||
- Enunciate acronyms and numbers.
|
||||
- Avoid dramatic or stylized delivery.
|
||||
|
||||
## Instruction template
|
||||
```
|
||||
Voice Affect: Neutral and clear.
|
||||
Tone: Informational and steady.
|
||||
Pacing: Slow and consistent.
|
||||
Pronunciation: Enunciate acronyms and numbers.
|
||||
Emphasis: Stress key warnings or labels.
|
||||
```
|
||||
|
||||
## Example (short)
|
||||
Input text:
|
||||
"Warning: High voltage. Keep hands clear."
|
||||
|
||||
Instructions:
|
||||
```
|
||||
Voice Affect: Neutral and clear.
|
||||
Tone: Informational and steady.
|
||||
Pacing: Slow and consistent.
|
||||
Emphasis: Stress "Warning" and "High voltage".
|
||||
```
|
||||
31
.agents/skills/speech/references/audio-api.md
Normal file
31
.agents/skills/speech/references/audio-api.md
Normal file
@@ -0,0 +1,31 @@
|
||||
# Audio Speech API quick reference
|
||||
|
||||
## Endpoint
|
||||
- Create speech: `POST /v1/audio/speech`
|
||||
|
||||
## Default model
|
||||
- `gpt-4o-mini-tts-2025-12-15`
|
||||
|
||||
## Other speech models (if requested)
|
||||
- `gpt-4o-mini-tts`
|
||||
- `tts-1`
|
||||
- `tts-1-hd`
|
||||
|
||||
## Core parameters
|
||||
- `model`: speech model
|
||||
- `input`: text to synthesize (max 4096 characters)
|
||||
- `voice`: built-in voice name
|
||||
- `instructions`: optional style directions (not supported for `tts-1` or `tts-1-hd`)
|
||||
- `response_format`: `mp3`, `opus`, `aac`, `flac`, `wav`, or `pcm`
|
||||
- `speed`: 0.25 to 4.0
|
||||
|
||||
## Built-in voices
|
||||
- `alloy`, `ash`, `ballad`, `cedar`, `coral`, `echo`, `fable`, `marin`, `nova`, `onyx`, `sage`, `shimmer`, `verse`
|
||||
|
||||
## Output notes
|
||||
- Default format is `mp3`.
|
||||
- `pcm` is raw 24 kHz 16-bit little-endian samples (no header).
|
||||
- `wav` includes a header (better for quick playback).
|
||||
|
||||
## Compliance note
|
||||
- Provide a clear disclosure that the voice is AI-generated.
|
||||
99
.agents/skills/speech/references/cli.md
Normal file
99
.agents/skills/speech/references/cli.md
Normal file
@@ -0,0 +1,99 @@
|
||||
# CLI reference (`scripts/text_to_speech.py`)
|
||||
|
||||
This file contains the "command catalog" for the bundled speech generation CLI. Keep `SKILL.md` as overview-first; put verbose CLI details here.
|
||||
|
||||
## What this CLI does
|
||||
- `speak`: generate a single audio file
|
||||
- `speak-batch`: run many jobs from a JSONL file (one job per line)
|
||||
- `list-voices`: list supported voices
|
||||
|
||||
Real API calls require network access + `OPENAI_API_KEY`. `--dry-run` does not.
|
||||
|
||||
## Quick start (works from any repo)
|
||||
Set a stable path to the skill CLI (default `CODEX_HOME` is `~/.codex`):
|
||||
|
||||
```
|
||||
export CODEX_HOME="${CODEX_HOME:-$HOME/.codex}"
|
||||
export TTS_GEN="$CODEX_HOME/skills/speech/scripts/text_to_speech.py"
|
||||
```
|
||||
|
||||
Dry-run (no API call; no network required; does not require the `openai` package):
|
||||
|
||||
```
|
||||
python "$TTS_GEN" speak --input "Test" --dry-run
|
||||
```
|
||||
|
||||
Generate (requires `OPENAI_API_KEY` + network):
|
||||
|
||||
```
|
||||
uv run --with openai python "$TTS_GEN" speak \
|
||||
--input "Today is a wonderful day to build something people love!" \
|
||||
--voice cedar \
|
||||
--instructions "Voice Affect: Warm and composed. Tone: upbeat and encouraging." \
|
||||
--response-format mp3 \
|
||||
--out speech.mp3
|
||||
```
|
||||
|
||||
No `uv` installed? Use your active Python env:
|
||||
|
||||
```
|
||||
python "$TTS_GEN" speak --input "Hello" --voice cedar --out speech.mp3
|
||||
```
|
||||
|
||||
## Guardrails (important)
|
||||
- Use `python "$TTS_GEN" ...` (or equivalent full path) for all TTS work.
|
||||
- Do **not** create one-off runners (e.g., `gen_audio.py`) unless the user explicitly asks.
|
||||
- **Never modify** `scripts/text_to_speech.py`. If something is missing, ask the user before doing anything else.
|
||||
|
||||
## Defaults (unless overridden by flags)
|
||||
- Model: `gpt-4o-mini-tts-2025-12-15`
|
||||
- Voice: `cedar`
|
||||
- Response format: `mp3`
|
||||
- Speed: `1.0`
|
||||
- Batch rpm cap: `50`
|
||||
|
||||
## Input limits
|
||||
- Input text must be <= 4096 characters per request.
|
||||
- For longer text, split into smaller chunks (manual or via batch JSONL).
|
||||
|
||||
## Instructions compatibility
|
||||
- `instructions` are supported for GPT-4o mini TTS models.
|
||||
- `tts-1` and `tts-1-hd` ignore instructions (the CLI will warn and drop them).
|
||||
|
||||
## Common recipes
|
||||
|
||||
List voices:
|
||||
```
|
||||
python "$TTS_GEN" list-voices
|
||||
```
|
||||
|
||||
Generate with explicit pacing:
|
||||
```
|
||||
python "$TTS_GEN" speak \
|
||||
--input "Welcome to the demo. We'll show how it works." \
|
||||
--instructions "Tone: friendly and confident. Pacing: steady and moderate." \
|
||||
--out demo.mp3
|
||||
```
|
||||
|
||||
Batch generation (JSONL):
|
||||
```
|
||||
mkdir -p tmp/speech
|
||||
cat > tmp/speech/jobs.jsonl << 'JSONL'
|
||||
{"input":"Thank you for calling. Please hold.","voice":"cedar","response_format":"mp3","out":"hold.mp3"}
|
||||
{"input":"For sales, press 1. For support, press 2.","voice":"marin","instructions":"Tone: clear and neutral. Pacing: slow.","response_format":"wav"}
|
||||
JSONL
|
||||
|
||||
python "$TTS_GEN" speak-batch --input tmp/speech/jobs.jsonl --out-dir out --rpm 50
|
||||
|
||||
# Cleanup (recommended)
|
||||
rm -f tmp/speech/jobs.jsonl
|
||||
```
|
||||
|
||||
Notes:
|
||||
- Use `--rpm` to control rate limiting (default `50`, max `50`).
|
||||
- Per-job overrides are supported in JSONL (`model`, `voice`, `response_format`, `speed`, `instructions`, `out`).
|
||||
- Treat the JSONL file as temporary: write it under `tmp/` and delete it after the run (do not commit it).
|
||||
|
||||
## See also
|
||||
- API parameter quick reference: `references/audio-api.md`
|
||||
- Instruction patterns and examples: `references/voice-directions.md`
|
||||
28
.agents/skills/speech/references/codex-network.md
Normal file
28
.agents/skills/speech/references/codex-network.md
Normal file
@@ -0,0 +1,28 @@
|
||||
# Codex network approvals / sandbox notes
|
||||
|
||||
This guidance is intentionally isolated from `SKILL.md` because it can vary by environment and may become stale. Prefer the defaults in your environment when in doubt.
|
||||
|
||||
## Why am I asked to approve every speech generation call?
|
||||
Speech generation uses the OpenAI Audio API, so the CLI needs outbound network access. In many Codex setups, network access is disabled by default (especially under stricter sandbox modes), and/or the approval policy may require confirmation before networked commands run.
|
||||
|
||||
## How do I reduce repeated approval prompts (network)?
|
||||
If you trust the repo and want fewer prompts, enable network access for the relevant sandbox mode and relax the approval policy.
|
||||
|
||||
Example `~/.codex/config.toml` pattern:
|
||||
|
||||
```
|
||||
approval_policy = "never"
|
||||
sandbox_mode = "workspace-write"
|
||||
|
||||
[sandbox_workspace_write]
|
||||
network_access = true
|
||||
```
|
||||
|
||||
Or for a single session:
|
||||
|
||||
```
|
||||
codex --sandbox workspace-write --ask-for-approval never
|
||||
```
|
||||
|
||||
## Safety note
|
||||
Use caution: enabling network and disabling approvals reduces friction but increases risk if you run untrusted code or work in an untrusted repository.
|
||||
32
.agents/skills/speech/references/ivr.md
Normal file
32
.agents/skills/speech/references/ivr.md
Normal file
@@ -0,0 +1,32 @@
|
||||
# IVR / phone prompt defaults
|
||||
|
||||
## Suggested defaults
|
||||
- Voice: `cedar` (clear) or `marin` (brighter)
|
||||
- Format: `wav`
|
||||
- Speed: `0.9` to `1.0`
|
||||
|
||||
## Guidance
|
||||
- Prioritize clarity and slower pacing.
|
||||
- Enunciate numbers and menu options.
|
||||
- Keep sentences short and consistent.
|
||||
|
||||
## Instruction template
|
||||
```
|
||||
Voice Affect: Clear and neutral.
|
||||
Tone: Professional and concise.
|
||||
Pacing: Slow and even.
|
||||
Pronunciation: Enunciate numbers and menu options.
|
||||
Emphasis: Stress the option numbers.
|
||||
```
|
||||
|
||||
## Example (short)
|
||||
Input text:
|
||||
"For sales, press 1. For support, press 2."
|
||||
|
||||
Instructions:
|
||||
```
|
||||
Voice Affect: Clear and neutral.
|
||||
Tone: Professional and concise.
|
||||
Pacing: Slow and even.
|
||||
Emphasis: Stress "press 1" and "press 2".
|
||||
```
|
||||
31
.agents/skills/speech/references/narration.md
Normal file
31
.agents/skills/speech/references/narration.md
Normal file
@@ -0,0 +1,31 @@
|
||||
# Narration / explainer defaults
|
||||
|
||||
## Suggested defaults
|
||||
- Voice: `cedar`
|
||||
- Format: `mp3`
|
||||
- Speed: `1.0`
|
||||
|
||||
## Guidance
|
||||
- Keep pacing steady and clear.
|
||||
- Emphasize section headings and key transitions.
|
||||
- If the script is long, chunk it into logical paragraphs.
|
||||
|
||||
## Instruction template
|
||||
```
|
||||
Voice Affect: Warm and composed.
|
||||
Tone: Friendly and confident.
|
||||
Pacing: Steady and moderate.
|
||||
Emphasis: Stress section titles and key terms.
|
||||
Pauses: Brief pause after each section.
|
||||
```
|
||||
|
||||
## Example (short)
|
||||
Input text:
|
||||
"Welcome to the demo. Today we'll show how it works."
|
||||
|
||||
Instructions:
|
||||
```
|
||||
Voice Affect: Warm and composed.
|
||||
Tone: Friendly and confident.
|
||||
Pacing: Steady and moderate.
|
||||
```
|
||||
38
.agents/skills/speech/references/prompting.md
Normal file
38
.agents/skills/speech/references/prompting.md
Normal file
@@ -0,0 +1,38 @@
|
||||
# Instruction best practices (TTS)
|
||||
|
||||
## Contents
|
||||
- Structure
|
||||
- Specificity
|
||||
- Avoiding conflicts
|
||||
- Pronunciation and names
|
||||
- Pauses and pacing
|
||||
- Iterate deliberately
|
||||
- Where to find copy/paste recipes
|
||||
|
||||
## Structure
|
||||
- Use a consistent order: affect -> tone -> pacing -> emotion -> pronunciation/pauses -> emphasis -> delivery.
|
||||
- For complex requests, use short labeled lines instead of a long paragraph.
|
||||
|
||||
## Specificity
|
||||
- Name the delivery you want ("calm and steady" vs "friendly").
|
||||
- If you need a specific cadence, call it out explicitly ("slow and measured", "brisk and energetic").
|
||||
|
||||
## Avoiding conflicts
|
||||
- Do not mix opposing instructions ("fast and slow", "formal and casual").
|
||||
- Keep instructions short: 4 to 8 lines are usually enough.
|
||||
|
||||
## Pronunciation and names
|
||||
- For acronyms, write the pronunciation hint in text ("A-I" instead of "AI").
|
||||
- For names or brands, add a simple phonetic guide in the input text if clarity matters.
|
||||
- If a word must be emphasized, add an Emphasis line and repeat the word exactly.
|
||||
|
||||
## Pauses and pacing
|
||||
- Use punctuation or short line breaks in the input text to create natural pauses.
|
||||
- Use the Pauses line for intentional pauses ("pause after the greeting").
|
||||
|
||||
## Iterate deliberately
|
||||
- Start with a clean base instruction set, then make one change at a time.
|
||||
- Repeat critical constraints on each iteration ("keep pacing steady").
|
||||
|
||||
## Where to find copy/paste recipes
|
||||
For copy/paste instruction templates, see `references/sample-prompts.md`. This file focuses on principles, structure, and iteration patterns.
|
||||
44
.agents/skills/speech/references/sample-prompts.md
Normal file
44
.agents/skills/speech/references/sample-prompts.md
Normal file
@@ -0,0 +1,44 @@
|
||||
# Sample instruction templates (copy/paste)
|
||||
|
||||
These are short instruction blocks. Use only the lines you need and keep them consistent with the input text.
|
||||
|
||||
## Friendly product demo
|
||||
```
|
||||
Voice Affect: Warm and composed.
|
||||
Tone: Friendly and confident.
|
||||
Pacing: Steady and moderate.
|
||||
Emphasis: Stress key product benefits.
|
||||
```
|
||||
|
||||
## Calm support update
|
||||
```
|
||||
Voice Affect: Calm and reassuring.
|
||||
Tone: Sincere and empathetic.
|
||||
Pacing: Slow and steady.
|
||||
Emotion: Warmth and care.
|
||||
Pauses: Brief pause after apologies.
|
||||
```
|
||||
|
||||
## IVR menu
|
||||
```
|
||||
Voice Affect: Clear and neutral.
|
||||
Tone: Professional and concise.
|
||||
Pacing: Slow and even.
|
||||
Emphasis: Stress menu options and numbers.
|
||||
```
|
||||
|
||||
## Accessibility readout
|
||||
```
|
||||
Voice Affect: Neutral and clear.
|
||||
Tone: Informational and steady.
|
||||
Pacing: Slow and consistent.
|
||||
Pronunciation: Enunciate acronyms and numbers.
|
||||
```
|
||||
|
||||
## Energetic intro
|
||||
```
|
||||
Voice Affect: Bright and upbeat.
|
||||
Tone: Enthusiastic and welcoming.
|
||||
Pacing: Brisk but clear.
|
||||
Emphasis: Stress the opening greeting.
|
||||
```
|
||||
80
.agents/skills/speech/references/voice-directions.md
Normal file
80
.agents/skills/speech/references/voice-directions.md
Normal file
@@ -0,0 +1,80 @@
|
||||
# Voice directions
|
||||
|
||||
## Template
|
||||
Use only the lines you need. Keep directions concise and aligned to the input text.
|
||||
|
||||
```
|
||||
Voice Affect: <overall character and texture>
|
||||
Tone: <attitude, formality, warmth>
|
||||
Pacing: <slow, steady, brisk>
|
||||
Emotion: <key emotions to convey>
|
||||
Pronunciation: <words to enunciate or emphasize>
|
||||
Pauses: <where to insert brief pauses>
|
||||
Emphasis: <key phrases to stress>
|
||||
Delivery: <cadence or rhythm notes>
|
||||
```
|
||||
|
||||
## Best practices
|
||||
- Keep 4 to 8 short lines. Avoid conflicting instructions.
|
||||
- Prefer concrete guidance over adjectives alone.
|
||||
- Do not rewrite the input text in the instructions; only guide delivery.
|
||||
- If you need a language or accent, write the input text in that language.
|
||||
- Repeat critical constraints (for example: "slow and steady") when iterating.
|
||||
|
||||
## Examples (short)
|
||||
|
||||
### Calm support
|
||||
```
|
||||
Voice Affect: Calm and composed, reassuring.
|
||||
Tone: Sincere and empathetic.
|
||||
Pacing: Steady and moderate.
|
||||
Emotion: Warmth and genuine care.
|
||||
Pronunciation: Clear, with emphasis on key reassurances.
|
||||
Pauses: Brief pauses after apologies and before requests.
|
||||
```
|
||||
|
||||
### Dramatic narrator
|
||||
```
|
||||
Voice Affect: Low and suspenseful.
|
||||
Tone: Serious and mysterious.
|
||||
Pacing: Slow and deliberate.
|
||||
Emotion: Restrained intensity.
|
||||
Emphasis: Highlight sensory details and cliffhanger lines.
|
||||
Pauses: Add pauses after suspenseful moments.
|
||||
```
|
||||
|
||||
### Fitness instructor
|
||||
```
|
||||
Voice Affect: High energy and upbeat.
|
||||
Tone: Motivational and encouraging.
|
||||
Pacing: Fast and dynamic.
|
||||
Emotion: Enthusiasm and momentum.
|
||||
Emphasis: Stress action verbs and countdowns.
|
||||
```
|
||||
|
||||
### Serene guide
|
||||
```
|
||||
Voice Affect: Soft and soothing.
|
||||
Tone: Calm and reassuring.
|
||||
Pacing: Slow and unhurried.
|
||||
Emotion: Peaceful warmth.
|
||||
Pauses: Gentle pauses after breathing cues.
|
||||
```
|
||||
|
||||
### Robot agent
|
||||
```
|
||||
Voice Affect: Monotone and mechanical.
|
||||
Tone: Neutral and formal.
|
||||
Pacing: Even and controlled.
|
||||
Emotion: None; strictly informational.
|
||||
Pronunciation: Precise and consistent.
|
||||
```
|
||||
|
||||
### Old-time announcer
|
||||
```
|
||||
Voice Affect: Refined and theatrical.
|
||||
Tone: Formal and welcoming.
|
||||
Pacing: Steady with a classic cadence.
|
||||
Emotion: Warm enthusiasm.
|
||||
Pronunciation: Crisp enunciation with vintage flair.
|
||||
```
|
||||
31
.agents/skills/speech/references/voiceover.md
Normal file
31
.agents/skills/speech/references/voiceover.md
Normal file
@@ -0,0 +1,31 @@
|
||||
# Product demo / voiceover defaults
|
||||
|
||||
## Suggested defaults
|
||||
- Voice: `cedar` (neutral) or `marin` (brighter)
|
||||
- Format: `wav` for video sync, `mp3` for quick review
|
||||
- Speed: `1.0`
|
||||
|
||||
## Guidance
|
||||
- Keep tone confident and helpful.
|
||||
- Emphasize product benefits and call-to-action phrases.
|
||||
- Avoid overly dramatic delivery unless requested.
|
||||
|
||||
## Instruction template
|
||||
```
|
||||
Voice Affect: Confident and composed.
|
||||
Tone: Helpful and upbeat.
|
||||
Pacing: Steady, slightly brisk.
|
||||
Emphasis: Stress product benefits and the call to action.
|
||||
```
|
||||
|
||||
## Example (short)
|
||||
Input text:
|
||||
"Meet the new dashboard. Find insights faster and act with confidence."
|
||||
|
||||
Instructions:
|
||||
```
|
||||
Voice Affect: Confident and composed.
|
||||
Tone: Helpful and upbeat.
|
||||
Pacing: Steady, slightly brisk.
|
||||
Emphasis: Stress "insights" and "confidence".
|
||||
```
|
||||
528
.agents/skills/speech/scripts/text_to_speech.py
Normal file
528
.agents/skills/speech/scripts/text_to_speech.py
Normal file
@@ -0,0 +1,528 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Generate speech audio with the OpenAI Audio API (TTS).
|
||||
|
||||
Defaults to gpt-4o-mini-tts-2025-12-15 and a built-in voice (cedar).
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import os
|
||||
from pathlib import Path
|
||||
import re
|
||||
import sys
|
||||
import time
|
||||
from typing import Any, Dict, List, Optional
|
||||
|
||||
# Request defaults for the OpenAI Audio (TTS) endpoint.
DEFAULT_MODEL = "gpt-4o-mini-tts-2025-12-15"
DEFAULT_VOICE = "cedar"
DEFAULT_RESPONSE_FORMAT = "mp3"
DEFAULT_SPEED = 1.0

# Local guard rails applied before any API call is made.
MAX_INPUT_CHARS = 4096
MAX_RPM = 50
DEFAULT_RPM = 50
DEFAULT_ATTEMPTS = 3

# Voice names accepted by --voice (always compared lowercase).
ALLOWED_VOICES = {
    "alloy", "ash", "ballad", "cedar", "coral", "echo", "fable",
    "marin", "nova", "onyx", "sage", "shimmer", "verse",
}

# Audio container/codec values accepted by --response-format.
ALLOWED_FORMATS = {"mp3", "opus", "aac", "flac", "wav", "pcm"}
|
||||
|
||||
|
||||
def _die(message: str, code: int = 1) -> None:
|
||||
print(f"Error: {message}", file=sys.stderr)
|
||||
raise SystemExit(code)
|
||||
|
||||
|
||||
def _warn(message: str) -> None:
|
||||
print(f"Warning: {message}", file=sys.stderr)
|
||||
|
||||
|
||||
def _ensure_api_key(dry_run: bool) -> None:
|
||||
if os.getenv("OPENAI_API_KEY"):
|
||||
print("OPENAI_API_KEY is set.", file=sys.stderr)
|
||||
return
|
||||
if dry_run:
|
||||
_warn("OPENAI_API_KEY is not set; dry-run only.")
|
||||
return
|
||||
_die("OPENAI_API_KEY is not set. Export it before running.")
|
||||
|
||||
|
||||
def _read_text(text: Optional[str], text_file: Optional[str], label: str) -> str:
|
||||
if text and text_file:
|
||||
_die(f"Use --{label} or --{label}-file, not both.")
|
||||
if text_file:
|
||||
path = Path(text_file)
|
||||
if not path.exists():
|
||||
_die(f"{label} file not found: {path}")
|
||||
return path.read_text(encoding="utf-8").strip()
|
||||
if text:
|
||||
return str(text).strip()
|
||||
_die(f"Missing {label}. Use --{label} or --{label}-file.")
|
||||
return "" # unreachable
|
||||
|
||||
|
||||
def _validate_input(text: str) -> None:
    """Reject empty or oversized input before spending an API request."""
    if not text:
        _die("Input text is empty.")
    if len(text) > MAX_INPUT_CHARS:
        message = (
            f"Input text exceeds {MAX_INPUT_CHARS} characters. Split into smaller chunks."
        )
        _die(message)
|
||||
|
||||
|
||||
def _normalize_voice(voice: Optional[str]) -> str:
    """Lowercase and validate a voice name, defaulting to DEFAULT_VOICE."""
    if not voice:
        return DEFAULT_VOICE
    candidate = str(voice).strip().lower()
    if candidate in ALLOWED_VOICES:
        return candidate
    _die(
        "voice must be one of: " + ", ".join(sorted(ALLOWED_VOICES))
    )
|
||||
|
||||
|
||||
def _normalize_format(fmt: Optional[str]) -> str:
    """Lowercase and validate an audio format, defaulting when omitted."""
    if not fmt:
        return DEFAULT_RESPONSE_FORMAT
    candidate = str(fmt).strip().lower()
    if candidate in ALLOWED_FORMATS:
        return candidate
    _die("response-format must be one of: " + ", ".join(sorted(ALLOWED_FORMATS)))
|
||||
|
||||
|
||||
def _normalize_speed(speed: Optional[float]) -> Optional[float]:
|
||||
if speed is None:
|
||||
return None
|
||||
try:
|
||||
value = float(speed)
|
||||
except ValueError:
|
||||
_die("speed must be a number")
|
||||
if value < 0.25 or value > 4.0:
|
||||
_die("speed must be between 0.25 and 4.0")
|
||||
return value
|
||||
|
||||
|
||||
def _normalize_output_path(out: Optional[str], response_format: str) -> Path:
|
||||
if out:
|
||||
path = Path(out)
|
||||
if path.exists() and path.is_dir():
|
||||
return path / f"speech.{response_format}"
|
||||
if path.suffix == "":
|
||||
return path.with_suffix("." + response_format)
|
||||
if path.suffix.lstrip(".").lower() != response_format:
|
||||
_warn(
|
||||
f"Output extension {path.suffix} does not match response-format {response_format}."
|
||||
)
|
||||
return path
|
||||
return Path(f"speech.{response_format}")
|
||||
|
||||
|
||||
def _create_client():
    """Instantiate the OpenAI client, exiting cleanly if the SDK is missing."""
    try:
        from openai import OpenAI
    except ImportError:
        _die("openai SDK not installed. Install with `uv pip install openai`.")
    else:
        return OpenAI()
|
||||
|
||||
|
||||
def _extract_retry_after_seconds(exc: Exception) -> Optional[float]:
|
||||
for attr in ("retry_after", "retry_after_seconds"):
|
||||
val = getattr(exc, attr, None)
|
||||
if isinstance(val, (int, float)) and val >= 0:
|
||||
return float(val)
|
||||
msg = str(exc)
|
||||
m = re.search(r"retry[- ]after[:= ]+([0-9]+(?:\\.[0-9]+)?)", msg, re.IGNORECASE)
|
||||
if m:
|
||||
try:
|
||||
return float(m.group(1))
|
||||
except Exception:
|
||||
return None
|
||||
return None
|
||||
|
||||
|
||||
def _is_rate_limit_error(exc: Exception) -> bool:
|
||||
name = exc.__class__.__name__.lower()
|
||||
if "ratelimit" in name or "rate_limit" in name:
|
||||
return True
|
||||
msg = str(exc).lower()
|
||||
return "429" in msg or "rate limit" in msg or "too many requests" in msg
|
||||
|
||||
|
||||
def _is_transient_error(exc: Exception) -> bool:
|
||||
if _is_rate_limit_error(exc):
|
||||
return True
|
||||
name = exc.__class__.__name__.lower()
|
||||
if "timeout" in name or "timedout" in name or "tempor" in name:
|
||||
return True
|
||||
msg = str(exc).lower()
|
||||
return "timeout" in msg or "timed out" in msg or "connection reset" in msg
|
||||
|
||||
|
||||
def _maybe_drop_instructions(model: str, instructions: Optional[str]) -> Optional[str]:
|
||||
if instructions and model in {"tts-1", "tts-1-hd"}:
|
||||
_warn("instructions are not supported for tts-1 / tts-1-hd; ignoring.")
|
||||
return None
|
||||
return instructions
|
||||
|
||||
|
||||
def _print_payload(payload: Dict[str, Any]) -> None:
|
||||
print(json.dumps(payload, indent=2, sort_keys=True))
|
||||
|
||||
|
||||
def _write_audio(
    client: Any,
    payload: Dict[str, Any],
    out_path: Path,
    *,
    dry_run: bool,
    force: bool,
    attempts: int,
) -> None:
    """Stream one TTS response to *out_path*, retrying transient failures.

    Args:
        client: OpenAI client; only ``client.audio.speech`` is used.
        payload: Keyword arguments for the speech-create call.
        out_path: Destination file; parent directories are created as needed.
        dry_run: Print the payload and target path instead of calling the API.
        force: Overwrite an existing output file instead of aborting.
        attempts: Total tries (first call plus retries) for transient errors.
    """
    if dry_run:
        _print_payload(payload)
        print(f"Would write {out_path}")
        return

    _ensure_api_key(dry_run)

    if out_path.exists() and not force:
        _die(f"Output already exists: {out_path} (use --force to overwrite)")

    out_path.parent.mkdir(parents=True, exist_ok=True)

    last_exc: Optional[Exception] = None
    for attempt in range(1, attempts + 1):
        try:
            # Stream directly to disk so the whole clip is never buffered
            # in memory.
            with client.audio.speech.with_streaming_response.create(**payload) as response:
                response.stream_to_file(out_path)
            print(f"Wrote {out_path}")
            return
        except Exception as exc:
            last_exc = exc
            # Non-transient failures, and the final attempt, propagate as-is.
            if not _is_transient_error(exc) or attempt >= attempts:
                raise
            # Prefer the server's Retry-After hint; otherwise back off
            # exponentially, capped at 60 seconds.
            sleep_s = _extract_retry_after_seconds(exc)
            if sleep_s is None:
                sleep_s = min(60.0, 2.0 ** attempt)
            print(
                f"Attempt {attempt}/{attempts} failed ({exc.__class__.__name__}); retrying in {sleep_s:.1f}s",
                file=sys.stderr,
            )
            time.sleep(sleep_s)

    # Defensive: the loop above either returns or re-raises, so this is
    # normally unreachable; kept as a safety net if the loop changes.
    if last_exc:
        raise last_exc
|
||||
|
||||
|
||||
def _slugify(value: str) -> str:
|
||||
value = value.strip().lower()
|
||||
value = re.sub(r"[^a-z0-9]+", "-", value)
|
||||
value = re.sub(r"-+", "-", value).strip("-")
|
||||
return value[:60] if value else "job"
|
||||
|
||||
|
||||
def _read_jobs_jsonl(path: str) -> List[Dict[str, Any]]:
|
||||
p = Path(path)
|
||||
if not p.exists():
|
||||
_die(f"Input file not found: {p}")
|
||||
jobs: List[Dict[str, Any]] = []
|
||||
for line_no, raw in enumerate(p.read_text(encoding="utf-8").splitlines(), start=1):
|
||||
line = raw.strip()
|
||||
if not line or line.startswith("#"):
|
||||
continue
|
||||
if line.startswith("{"):
|
||||
try:
|
||||
item = json.loads(line)
|
||||
except json.JSONDecodeError as exc:
|
||||
_die(f"Invalid JSON on line {line_no}: {exc}")
|
||||
if not isinstance(item, dict):
|
||||
_die(f"Invalid job on line {line_no}: expected object")
|
||||
jobs.append(item)
|
||||
else:
|
||||
jobs.append({"input": line})
|
||||
if not jobs:
|
||||
_die("No jobs found in input file.")
|
||||
return jobs
|
||||
|
||||
|
||||
def _job_input(job: Dict[str, Any]) -> str:
|
||||
for key in ("input", "text", "prompt"):
|
||||
if key in job and str(job[key]).strip():
|
||||
return str(job[key]).strip()
|
||||
_die("Job missing input text (use 'input').")
|
||||
return "" # unreachable
|
||||
|
||||
|
||||
def _merge_non_null(base: Dict[str, Any], extra: Dict[str, Any]) -> Dict[str, Any]:
|
||||
merged = dict(base)
|
||||
for k, v in extra.items():
|
||||
if v is not None:
|
||||
merged[k] = v
|
||||
return merged
|
||||
|
||||
|
||||
def _enforce_rpm(rpm: int) -> int:
    """Validate the requests-per-minute cap, clamping it to MAX_RPM."""
    if rpm <= 0:
        _die("rpm must be > 0")
    if rpm <= MAX_RPM:
        return rpm
    _warn(f"rpm capped at {MAX_RPM} (requested {rpm}).")
    return MAX_RPM
|
||||
|
||||
|
||||
def _sleep_for_rate_limit(last_ts: Optional[float], rpm: int) -> float:
|
||||
min_interval = 60.0 / float(rpm)
|
||||
now = time.monotonic()
|
||||
if last_ts is None:
|
||||
return now
|
||||
elapsed = now - last_ts
|
||||
if elapsed < min_interval:
|
||||
time.sleep(min_interval - elapsed)
|
||||
return time.monotonic()
|
||||
|
||||
|
||||
def _list_voices() -> None:
    """Print supported voice names, one per line, alphabetically."""
    print("\n".join(sorted(ALLOWED_VOICES)))
|
||||
|
||||
|
||||
def _run_speak(args: argparse.Namespace) -> int:
    """Handle the ``speak`` subcommand: synthesize one audio file.

    Validates the input, normalizes options, builds the API payload,
    and either prints it (--dry-run) or streams the audio to disk.
    Returns 0 on success; validation failures exit via _die.
    """
    if args.list_voices:
        _list_voices()
        return 0

    input_text = _read_text(args.input, args.input_file, "input")
    _validate_input(input_text)

    instructions = None
    if args.instructions or args.instructions_file:
        instructions = _read_text(args.instructions, args.instructions_file, "instructions")

    model = str(args.model).strip()
    voice = _normalize_voice(args.voice)
    response_format = _normalize_format(args.response_format)
    speed = _normalize_speed(args.speed)

    # Legacy tts-1 models ignore instructions; drop them with a warning.
    instructions = _maybe_drop_instructions(model, instructions)

    payload: Dict[str, Any] = {
        "model": model,
        "voice": voice,
        "input": input_text,
        "response_format": response_format,
    }
    # Optional fields are added only when present so the payload stays minimal.
    if instructions:
        payload["instructions"] = instructions
    if speed is not None:
        payload["speed"] = speed

    out_path = _normalize_output_path(args.out, response_format)

    if args.dry_run:
        _ensure_api_key(True)
        _print_payload(payload)
        print(f"Would write {out_path}")
        return 0

    client = _create_client()
    _write_audio(
        client,
        payload,
        out_path,
        dry_run=args.dry_run,
        force=args.force,
        attempts=args.attempts,
    )
    return 0
|
||||
|
||||
|
||||
def _run_speak_batch(args: argparse.Namespace) -> int:
    """Handle the ``speak-batch`` subcommand: synthesize one file per job.

    Reads JSONL jobs, merges per-job overrides onto the CLI-level defaults,
    paces requests to the rpm cap, and writes each clip into --out-dir.
    Returns 0 on success; validation failures exit via _die.
    """
    jobs = _read_jobs_jsonl(args.input)
    out_dir = Path(args.out_dir)

    base_instructions = None
    if args.instructions or args.instructions_file:
        base_instructions = _read_text(args.instructions, args.instructions_file, "instructions")

    # CLI-level defaults; individual jobs may override any of these keys.
    base_payload = {
        "model": str(args.model).strip(),
        "voice": _normalize_voice(args.voice),
        "response_format": _normalize_format(args.response_format),
        "speed": _normalize_speed(args.speed),
        "instructions": base_instructions,
    }

    rpm = _enforce_rpm(args.rpm)
    last_ts: Optional[float] = None

    if args.dry_run:
        _ensure_api_key(True)

    # Defer client creation on dry runs; created lazily below if ever needed.
    client = None if args.dry_run else _create_client()

    for idx, job in enumerate(jobs, start=1):
        input_text = _job_input(job)
        _validate_input(input_text)

        job_payload = dict(base_payload)
        job_payload["input"] = input_text

        # Per-job overrides are normalized with the same validators as the CLI.
        overrides: Dict[str, Any] = {}
        if "model" in job:
            overrides["model"] = str(job["model"]).strip()
        if "voice" in job:
            overrides["voice"] = _normalize_voice(job["voice"])
        if "response_format" in job or "format" in job:
            overrides["response_format"] = _normalize_format(job.get("response_format") or job.get("format"))
        if "speed" in job and job["speed"] is not None:
            overrides["speed"] = _normalize_speed(job["speed"])
        if "instructions" in job and str(job["instructions"]).strip():
            overrides["instructions"] = str(job["instructions"]).strip()

        job_payload = _merge_non_null(job_payload, overrides)
        job_payload["instructions"] = _maybe_drop_instructions(
            job_payload["model"], job_payload.get("instructions")
        )
        # Never send a null instructions field to the API.
        if job_payload.get("instructions") is None:
            job_payload.pop("instructions", None)

        response_format = job_payload["response_format"]

        # Output path: honor an explicit per-job "out" (forced under out_dir),
        # else derive a numbered slug from the input text.
        explicit_out = job.get("out")
        if explicit_out:
            out_path = _normalize_output_path(str(explicit_out), response_format)
            if out_path.is_absolute():
                out_path = out_dir / out_path.name
            else:
                out_path = out_dir / out_path
        else:
            slug = _slugify(input_text[:80])
            out_path = out_dir / f"{idx:03d}-{slug}.{response_format}"

        if args.dry_run:
            _print_payload(job_payload)
            print(f"Would write {out_path}")
            continue

        # Pace requests so we stay under the rpm cap.
        last_ts = _sleep_for_rate_limit(last_ts, rpm)

        if client is None:
            client = _create_client()
        _write_audio(
            client,
            job_payload,
            out_path,
            dry_run=False,
            force=args.force,
            attempts=args.attempts,
        )

    return 0
|
||||
|
||||
|
||||
def _add_common_args(parser: argparse.ArgumentParser) -> None:
    """Attach the flags shared by the ``speak`` and ``speak-batch`` subcommands."""
    parser.add_argument(
        "--model",
        default=DEFAULT_MODEL,
        help=f"Model to use (default: {DEFAULT_MODEL})",
    )
    parser.add_argument(
        "--voice",
        default=DEFAULT_VOICE,
        help=f"Voice to use (default: {DEFAULT_VOICE})",
    )
    parser.add_argument(
        "--response-format",
        default=DEFAULT_RESPONSE_FORMAT,
        help=f"Output format (default: {DEFAULT_RESPONSE_FORMAT})",
    )
    parser.add_argument(
        "--speed",
        type=float,
        default=DEFAULT_SPEED,
        help=f"Speech speed (0.25-4.0, default: {DEFAULT_SPEED})",
    )
    # --instructions and --instructions-file are mutually exclusive;
    # _read_text enforces that at run time.
    parser.add_argument(
        "--instructions",
        help="Style directions for the voice",
    )
    parser.add_argument(
        "--instructions-file",
        help="Path to instructions text file",
    )
    parser.add_argument(
        "--attempts",
        type=int,
        default=DEFAULT_ATTEMPTS,
        help=f"Retries on transient errors (default: {DEFAULT_ATTEMPTS})",
    )
    parser.add_argument(
        "--dry-run",
        action="store_true",
        help="Print payload; do not call the API",
    )
    parser.add_argument(
        "--force",
        action="store_true",
        help="Overwrite output files if they exist",
    )
|
||||
|
||||
|
||||
def main() -> int:
    """CLI entry point: parse arguments and dispatch to the chosen subcommand.

    Subcommands: ``list-voices``, ``speak`` (single file), ``speak-batch``
    (JSONL job list). Returns the process exit status.
    """
    parser = argparse.ArgumentParser(
        description="Generate speech audio using the OpenAI Audio API."
    )
    subparsers = parser.add_subparsers(dest="command", required=True)

    list_voices = subparsers.add_parser("list-voices", help="List supported voices")
    # _list_voices returns None, so `or 0` yields a 0 exit status.
    list_voices.set_defaults(func=lambda _args: (_list_voices() or 0))

    speak = subparsers.add_parser("speak", help="Generate a single audio file")
    speak.add_argument("--input", help="Input text")
    speak.add_argument("--input-file", help="Path to input text file")
    speak.add_argument("--out", help="Output file path")
    speak.add_argument(
        "--list-voices",
        action="store_true",
        help="Print voices and exit",
    )
    _add_common_args(speak)
    speak.set_defaults(func=_run_speak)

    batch = subparsers.add_parser("speak-batch", help="Generate from JSONL jobs")
    batch.add_argument("--input", required=True, help="Path to JSONL file")
    batch.add_argument(
        "--out-dir",
        default="out",
        help="Output directory (default: out)",
    )
    batch.add_argument(
        "--rpm",
        type=int,
        default=DEFAULT_RPM,
        help=f"Requests per minute cap (default: {DEFAULT_RPM}, max: {MAX_RPM})",
    )
    _add_common_args(batch)
    batch.set_defaults(func=_run_speak_batch)

    args = parser.parse_args()
    # Each subcommand's handler returns its own exit status.
    return int(args.func(args))
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Propagate main()'s return value as the process exit status.
    raise SystemExit(main())
|
||||
Reference in New Issue
Block a user