This commit is contained in:
2026-02-10 22:53:41 -08:00
parent d23a385861
commit c333265231
3 changed files with 207 additions and 5 deletions

View File

@@ -6,6 +6,8 @@ font-feature = +liga
font-feature = +dlig font-feature = +dlig
theme = Catppuccin Macchiato theme = Catppuccin Macchiato
cursor-style = block cursor-style = block
background-opacity = 1.0
window-colorspace = srgb
window-padding-x = 10 window-padding-x = 10
window-padding-y = 10 window-padding-y = 10
window-decoration = false window-decoration = false
@@ -13,9 +15,10 @@ window-height = 46
window-width = 180 window-width = 180
confirm-close-surface = false confirm-close-surface = false
copy-on-select = clipboard copy-on-select = clipboard
osc-color-report-format = 16-bit
app-notifications = no-clipboard-copy app-notifications = no-clipboard-copy
shell-integration = zsh shell-integration = zsh
shell-integration-features = title,sudo shell-integration-features = title,sudo,ssh-env,ssh-terminfo
desktop-notifications = true desktop-notifications = true
term=xterm-ghostty term=xterm-ghostty
link-url = true link-url = true

View File

@@ -100,3 +100,5 @@ zle -N self-insert url-quote-magic
zle -N bracketed-paste bracketed-paste-magic zle -N bracketed-paste bracketed-paste-magic
alias claude-mem='bun "/home/sudacode/.claude/plugins/marketplaces/thedotmack/plugin/scripts/worker-service.cjs"' alias claude-mem='bun "/home/sudacode/.claude/plugins/marketplaces/thedotmack/plugin/scripts/worker-service.cjs"'
fpath=(/home/sudacode/.zsh/completions $fpath)
autoload -Uz compinit && compinit

View File

@@ -1,5 +1,5 @@
#!/usr/bin/env python3 #!/usr/bin/env python3
"""Record microphone audio and transcribe it with whisper.cpp or faster-whisper.""" """Record microphone audio and transcribe it with whisper.cpp, faster-whisper, or WhisperX."""
from __future__ import annotations from __future__ import annotations
@@ -145,6 +145,13 @@ def transcribe(
device: str, device: str,
compute_type: str, compute_type: str,
beam_size: int, beam_size: int,
task: str,
language: str | None,
whisperx_mode: str,
whisperx_vad_method: str,
whisperx_hf_token: str | None,
whisperx_min_speakers: int | None,
whisperx_max_speakers: int | None,
) -> str: ) -> str:
if backend == "whispercpp": if backend == "whispercpp":
return transcribe_whispercpp( return transcribe_whispercpp(
@@ -153,6 +160,8 @@ def transcribe(
notifier=notifier, notifier=notifier,
device=device, device=device,
beam_size=beam_size, beam_size=beam_size,
task=task,
language=language,
) )
if backend == "ctranslate2": if backend == "ctranslate2":
return transcribe_ctranslate2( return transcribe_ctranslate2(
@@ -162,6 +171,24 @@ def transcribe(
device=device, device=device,
compute_type=compute_type, compute_type=compute_type,
beam_size=beam_size, beam_size=beam_size,
task=task,
language=language,
)
if backend == "whisperx":
return transcribe_whisperx(
model_name_or_path=model_name_or_path,
wav_path=wav_path,
notifier=notifier,
device=device,
compute_type=compute_type,
beam_size=beam_size,
task=task,
language=language,
whisperx_mode=whisperx_mode,
whisperx_vad_method=whisperx_vad_method,
whisperx_hf_token=whisperx_hf_token,
whisperx_min_speakers=whisperx_min_speakers,
whisperx_max_speakers=whisperx_max_speakers,
) )
raise RuntimeError(f"Unsupported backend: {backend}") raise RuntimeError(f"Unsupported backend: {backend}")
@@ -193,6 +220,8 @@ def transcribe_whispercpp(
notifier: Notifier, notifier: Notifier,
device: str, device: str,
beam_size: int, beam_size: int,
task: str,
language: str | None,
) -> str: ) -> str:
whisper_cli = shutil.which("whisper-cli") whisper_cli = shutil.which("whisper-cli")
if not whisper_cli: if not whisper_cli:
@@ -220,6 +249,10 @@ def transcribe_whispercpp(
] ]
if device == "cpu": if device == "cpu":
cmd.append("-ng") cmd.append("-ng")
if language:
cmd.extend(["-l", language])
if task == "translate":
cmd.append("-tr")
result = subprocess.run(cmd, capture_output=True, text=True) result = subprocess.run(cmd, capture_output=True, text=True)
if result.returncode != 0: if result.returncode != 0:
@@ -241,6 +274,8 @@ def transcribe_ctranslate2(
device: str, device: str,
compute_type: str, compute_type: str,
beam_size: int, beam_size: int,
task: str,
language: str | None,
) -> str: ) -> str:
whisper_cli = shutil.which("whisper-ctranslate2") whisper_cli = shutil.which("whisper-ctranslate2")
if not whisper_cli: if not whisper_cli:
@@ -273,9 +308,13 @@ def transcribe_ctranslate2(
compute_type, compute_type,
"--beam_size", "--beam_size",
str(beam_size), str(beam_size),
"--task",
task,
"--verbose", "--verbose",
"False", "False",
] ]
if language:
cmd.extend(["--language", language])
model_dir_candidate = Path(model_name_or_path).expanduser() model_dir_candidate = Path(model_name_or_path).expanduser()
if model_dir_candidate.exists() and model_dir_candidate.is_dir(): if model_dir_candidate.exists() and model_dir_candidate.is_dir():
cmd.extend(["--model_directory", str(model_dir_candidate)]) cmd.extend(["--model_directory", str(model_dir_candidate)])
@@ -297,6 +336,101 @@ def transcribe_ctranslate2(
return output_txt.read_text(encoding="utf-8").strip() return output_txt.read_text(encoding="utf-8").strip()
def transcribe_whisperx(
    model_name_or_path: str,
    wav_path: Path,
    notifier: Notifier,
    device: str,
    compute_type: str,
    beam_size: int,
    task: str,
    language: str | None,
    whisperx_mode: str,
    whisperx_vad_method: str,
    whisperx_hf_token: str | None,
    whisperx_min_speakers: int | None,
    whisperx_max_speakers: int | None,
) -> str:
    """Run the ``whisperx`` CLI on ``wav_path`` and return the transcript text.

    The transcript is read from ``<wav stem>.txt`` next to the recording; any
    stale file with that name is removed before WhisperX runs.

    Args:
        model_name_or_path: Model name, or a path to a local model directory
            (``~`` is expanded when the path is an existing directory).
        wav_path: Recording to transcribe.
        notifier: Object whose ``send(title, body, timeout_ms=...)`` is used
            for progress notifications.
        device: Forwarded to ``--device``.
        compute_type: Forwarded to ``--compute_type``.
        beam_size: Forwarded to ``--beam_size``.
        task: ``"transcribe"`` or ``"translate"``; translation requires
            ``language``.
        language: Source language, forwarded to ``--language`` when set.
        whisperx_mode: ``"basic"`` adds ``--no_align``; ``"full"`` first runs a
            plain ``transcribe`` pass into ``<wav dir>/whisperx-native`` and
            then runs the requested task with ``--diarize``; any other value
            runs a single pass without either flag.
        whisperx_vad_method: Forwarded to ``--vad_method``.
        whisperx_hf_token: Forwarded to ``--hf_token`` when set.
        whisperx_min_speakers: Forwarded to ``--min_speakers`` when not None.
        whisperx_max_speakers: Forwarded to ``--max_speakers`` when not None.

    Returns:
        The stripped contents of the transcript file.

    Raises:
        RuntimeError: If ``whisperx`` is not on PATH, translation is requested
            without a language, a WhisperX invocation exits non-zero, or no
            transcript file is produced.
    """
    whisperx_cli = shutil.which("whisperx")
    if not whisperx_cli:
        raise RuntimeError("whisperx not found in PATH. Install with: pip install whisperx")

    # Drop any stale transcript so a leftover file is never mistaken for
    # fresh output.
    output_txt = wav_path.parent / f"{wav_path.stem}.txt"
    if output_txt.exists():
        output_txt.unlink()

    if task == "translate" and not language:
        raise RuntimeError("Translation requires --language so WhisperX can translate from the source language.")

    # WhisperX's ``--model_dir`` is the download/cache location, not a model
    # selector: passing a local model directory there (without ``--model``)
    # would silently fall back to the CLI's default model. A local directory
    # is therefore handed to ``--model`` itself.
    model_dir_candidate = Path(model_name_or_path).expanduser()
    if model_dir_candidate.is_dir():
        model_arg = str(model_dir_candidate)
    else:
        model_arg = model_name_or_path

    def _run_whisperx(out_dir: Path, job_task: str, full_mode: bool) -> tuple[int, str]:
        """Invoke WhisperX once; return ``(exit code, stderr-or-stdout text)``."""
        cmd = [
            whisperx_cli,
            str(wav_path),
            "--output_dir",
            str(out_dir),
            "--output_format",
            "txt",
            "--device",
            device,
            "--compute_type",
            compute_type,
            "--beam_size",
            str(beam_size),
            "--task",
            job_task,
            "--vad_method",
            whisperx_vad_method,
            "--print_progress",
            "False",
            "--verbose",
            "False",
            "--model",
            model_arg,
        ]
        if language:
            cmd.extend(["--language", language])
        if whisperx_mode == "basic":
            cmd.append("--no_align")
        if full_mode:
            cmd.append("--diarize")
        # NOTE(review): whether the three options below were nested under
        # ``if full_mode`` could not be recovered from the diff rendering.
        # They are passed whenever set, which WhisperX accepts - confirm.
        if whisperx_hf_token:
            cmd.extend(["--hf_token", whisperx_hf_token])
        if whisperx_min_speakers is not None:
            cmd.extend(["--min_speakers", str(whisperx_min_speakers)])
        if whisperx_max_speakers is not None:
            cmd.extend(["--max_speakers", str(whisperx_max_speakers)])

        result = subprocess.run(cmd, capture_output=True, text=True)
        details = (result.stderr or result.stdout or "").strip()
        return result.returncode, details

    notifier.send("Transcribing", "Running WhisperX...", timeout_ms=1500)

    is_full = whisperx_mode == "full"
    if is_full:
        # Full mode keeps a source-language transcript in a side directory
        # before running the requested task; this function does not read it.
        native_dir = wav_path.parent / "whisperx-native"
        native_dir.mkdir(parents=True, exist_ok=True)
        notifier.send("Transcribing", "WhisperX full mode: extracting native subtitles", timeout_ms=1500)
        rc, details = _run_whisperx(native_dir, "transcribe", full_mode=False)
        if rc != 0:
            raise RuntimeError(details or "WhisperX native transcription stage failed.")

    rc, details = _run_whisperx(wav_path.parent, task, full_mode=is_full)
    if rc != 0:
        raise RuntimeError(details or "whisperx failed.")

    if not output_txt.exists():
        raise RuntimeError(
            "whisperx completed but no transcript file was produced. "
            f"Expected: {output_txt}. {details}"
        )
    return output_txt.read_text(encoding="utf-8").strip()
def _type_with_tool(text: str) -> None: def _type_with_tool(text: str) -> None:
if shutil.which("wtype"): if shutil.which("wtype"):
subprocess.run(["wtype", text], check=True) subprocess.run(["wtype", text], check=True)
@@ -402,6 +536,13 @@ def _run_transcription_job(args: argparse.Namespace, duration: float | None) ->
device=args.device, device=args.device,
compute_type=args.compute_type, compute_type=args.compute_type,
beam_size=args.beam_size, beam_size=args.beam_size,
task=args.task,
language=args.language,
whisperx_mode=args.whisperx_mode,
whisperx_vad_method=args.whisperx_vad_method,
whisperx_hf_token=args.whisperx_hf_token,
whisperx_min_speakers=args.whisperx_min_speakers,
whisperx_max_speakers=args.whisperx_max_speakers,
) )
return text.strip() return text.strip()
@@ -493,7 +634,21 @@ def start_background(args: argparse.Namespace) -> int:
args.compute_type, args.compute_type,
"--beam-size", "--beam-size",
str(args.beam_size), str(args.beam_size),
"--task",
args.task,
"--whisperx-mode",
args.whisperx_mode,
"--whisperx-vad-method",
args.whisperx_vad_method,
] ]
if args.language:
cmd.extend(["--language", args.language])
if args.whisperx_hf_token:
cmd.extend(["--whisperx-hf-token", args.whisperx_hf_token])
if args.whisperx_min_speakers is not None:
cmd.extend(["--whisperx-min-speakers", str(args.whisperx_min_speakers)])
if args.whisperx_max_speakers is not None:
cmd.extend(["--whisperx-max-speakers", str(args.whisperx_max_speakers)])
log_path = state_dir / "worker.log" log_path = state_dir / "worker.log"
with log_path.open("a", encoding="utf-8") as log_fh: with log_path.open("a", encoding="utf-8") as log_fh:
@@ -572,7 +727,7 @@ def stop_background(args: argparse.Namespace) -> int:
def parse_args() -> argparse.Namespace: def parse_args() -> argparse.Namespace:
parser = argparse.ArgumentParser( parser = argparse.ArgumentParser(
description="Record from microphone and transcribe with whisper.cpp or faster-whisper" description="Record from microphone and transcribe with whisper.cpp, faster-whisper, or WhisperX"
) )
parser.add_argument( parser.add_argument(
"--mode", "--mode",
@@ -590,14 +745,25 @@ def parse_args() -> argparse.Namespace:
) )
parser.add_argument( parser.add_argument(
"--backend", "--backend",
choices=("whispercpp", "ctranslate2"), choices=("whispercpp", "ctranslate2", "whisperx"),
default="whispercpp", default="whispercpp",
help="Transcription backend (default: whispercpp)", help="Transcription backend (default: whispercpp)",
) )
parser.add_argument( parser.add_argument(
"--model", "--model",
default=DEFAULT_MODEL, default=DEFAULT_MODEL,
help="Model name or path. For whispercpp: ggml .bin path/name. For ctranslate2: model name or model directory.", help="Model name or path. For whispercpp: ggml .bin path/name. For ctranslate2/whisperx: model name or model directory.",
)
parser.add_argument(
"--task",
choices=("transcribe", "translate"),
default="transcribe",
help="Task to run (default: transcribe).",
)
parser.add_argument(
"--language",
default=None,
help="Source language code/name (for example: en, es, Japanese). Strongly recommended, and required for --task translate.",
) )
parser.add_argument( parser.add_argument(
"--duration", "--duration",
@@ -642,6 +808,35 @@ def parse_args() -> argparse.Namespace:
default=5, default=5,
help="Beam size for decoding (default: 5)", help="Beam size for decoding (default: 5)",
) )
parser.add_argument(
"--whisperx-mode",
choices=("basic", "align", "full"),
default="align",
help="WhisperX pipeline mode: basic (no align), align (aligned transcript), full (native transcript + translate/transcribe + align + VAD + diarization).",
)
parser.add_argument(
"--whisperx-vad-method",
choices=("silero", "pyannote"),
default="silero",
help="WhisperX VAD method (default: silero).",
)
parser.add_argument(
"--whisperx-hf-token",
default=None,
help="Optional HuggingFace token for WhisperX diarization models.",
)
parser.add_argument(
"--whisperx-min-speakers",
type=int,
default=None,
help="Optional minimum speaker count for WhisperX diarization.",
)
parser.add_argument(
"--whisperx-max-speakers",
type=int,
default=None,
help="Optional maximum speaker count for WhisperX diarization.",
)
parser.add_argument( parser.add_argument(
"--state-dir", "--state-dir",
default=str(DEFAULT_STATE_DIR), default=str(DEFAULT_STATE_DIR),
@@ -673,6 +868,8 @@ def parse_args() -> argparse.Namespace:
parser.error("Use only one of --start, --stop, or --toggle.") parser.error("Use only one of --start, --stop, or --toggle.")
if legacy_modes: if legacy_modes:
args.mode = legacy_modes[0] args.mode = legacy_modes[0]
if args.task == "translate" and not args.language:
parser.error("--task translate requires --language so the source language is explicit.")
return args return args