From c333265231b6012485ee3f047f0328b93c20d29f Mon Sep 17 00:00:00 2001 From: sudacode Date: Tue, 10 Feb 2026 22:53:41 -0800 Subject: [PATCH] Update --- .config/ghostty/config##Default | 5 +- .zsh/.zshrc##default | 2 + projects/scripts/whisper_record_transcribe.py | 205 +++++++++++++++++- 3 files changed, 207 insertions(+), 5 deletions(-) diff --git a/.config/ghostty/config##Default b/.config/ghostty/config##Default index b3dda1d..f03a07a 100644 --- a/.config/ghostty/config##Default +++ b/.config/ghostty/config##Default @@ -6,6 +6,8 @@ font-feature = +liga font-feature = +dlig theme = Catppuccin Macchiato cursor-style = block +background-opacity = 1.0 +window-colorspace = srgb window-padding-x = 10 window-padding-y = 10 window-decoration = false @@ -13,9 +15,10 @@ window-height = 46 window-width = 180 confirm-close-surface = false copy-on-select = clipboard +osc-color-report-format = 16-bit app-notifications = no-clipboard-copy shell-integration = zsh -shell-integration-features = title,sudo +shell-integration-features = title,sudo,ssh-env,ssh-terminfo desktop-notifications = true term=xterm-ghostty link-url = true diff --git a/.zsh/.zshrc##default b/.zsh/.zshrc##default index a383c7e..01d7dee 100644 --- a/.zsh/.zshrc##default +++ b/.zsh/.zshrc##default @@ -100,3 +100,5 @@ zle -N self-insert url-quote-magic zle -N bracketed-paste bracketed-paste-magic alias claude-mem='bun "/home/sudacode/.claude/plugins/marketplaces/thedotmack/plugin/scripts/worker-service.cjs"' +fpath=(/home/sudacode/.zsh/completions $fpath) +autoload -Uz compinit && compinit diff --git a/projects/scripts/whisper_record_transcribe.py b/projects/scripts/whisper_record_transcribe.py index 77b366e..79e4e7f 100755 --- a/projects/scripts/whisper_record_transcribe.py +++ b/projects/scripts/whisper_record_transcribe.py @@ -1,5 +1,5 @@ #!/usr/bin/env python3 -"""Record microphone audio and transcribe it with whisper.cpp or faster-whisper.""" +"""Record microphone audio and transcribe it with whisper.cpp, faster-whisper, or WhisperX.""" from __future__ import annotations @@ -145,6 +145,13 @@ def transcribe( device: str, compute_type: str, beam_size: int, + task: str, + language: str | None, + whisperx_mode: str, + whisperx_vad_method: str, + whisperx_hf_token: str | None, + whisperx_min_speakers: int | None, + whisperx_max_speakers: int | None, ) -> str: if backend == "whispercpp": return transcribe_whispercpp( @@ -153,6 +160,8 @@ def transcribe( notifier=notifier, device=device, beam_size=beam_size, + task=task, + language=language, ) if backend == "ctranslate2": return transcribe_ctranslate2( @@ -162,6 +171,24 @@ def transcribe( device=device, compute_type=compute_type, beam_size=beam_size, + task=task, + language=language, + ) + if backend == "whisperx": + return transcribe_whisperx( + model_name_or_path=model_name_or_path, + wav_path=wav_path, + notifier=notifier, + device=device, + compute_type=compute_type, + beam_size=beam_size, + task=task, + language=language, + whisperx_mode=whisperx_mode, + whisperx_vad_method=whisperx_vad_method, + whisperx_hf_token=whisperx_hf_token, + whisperx_min_speakers=whisperx_min_speakers, + whisperx_max_speakers=whisperx_max_speakers, ) raise RuntimeError(f"Unsupported backend: {backend}") @@ -193,6 +220,8 @@ def transcribe_whispercpp( notifier: Notifier, device: str, beam_size: int, + task: str, + language: str | None, ) -> str: whisper_cli = shutil.which("whisper-cli") if not whisper_cli: @@ -220,6 +249,10 @@ def transcribe_whispercpp( ] if device == "cpu": cmd.append("-ng") + if language: + cmd.extend(["-l", language]) + if task == "translate": + cmd.append("-tr") result = subprocess.run(cmd, capture_output=True, text=True) if result.returncode != 0: @@ -241,6 +274,8 @@ def transcribe_ctranslate2( device: str, compute_type: str, beam_size: int, + task: str, + language: str | None, ) -> str: whisper_cli = shutil.which("whisper-ctranslate2") if not whisper_cli: @@ -273,9 +308,13 @@ def transcribe_ctranslate2( compute_type, "--beam_size", str(beam_size), + "--task", + task, "--verbose", "False", ] + if language: + cmd.extend(["--language", language]) model_dir_candidate = Path(model_name_or_path).expanduser() if model_dir_candidate.exists() and model_dir_candidate.is_dir(): cmd.extend(["--model_directory", str(model_dir_candidate)]) @@ -297,6 +336,101 @@ def transcribe_ctranslate2( return output_txt.read_text(encoding="utf-8").strip() +def transcribe_whisperx( + model_name_or_path: str, + wav_path: Path, + notifier: Notifier, + device: str, + compute_type: str, + beam_size: int, + task: str, + language: str | None, + whisperx_mode: str, + whisperx_vad_method: str, + whisperx_hf_token: str | None, + whisperx_min_speakers: int | None, + whisperx_max_speakers: int | None, +) -> str: + whisperx_cli = shutil.which("whisperx") + if not whisperx_cli: + raise RuntimeError("whisperx not found in PATH. Install with: pip install whisperx") + + output_txt = wav_path.parent / f"{wav_path.stem}.txt" + if output_txt.exists(): + output_txt.unlink() + + if task == "translate" and not language: + raise RuntimeError("Translation requires --language so WhisperX can translate from the source language.") + + def _run_whisperx(out_dir: Path, job_task: str, full_mode: bool) -> tuple[int, str]: + cmd = [ + whisperx_cli, + str(wav_path), + "--output_dir", + str(out_dir), + "--output_format", + "txt", + "--device", + device, + "--compute_type", + compute_type, + "--beam_size", + str(beam_size), + "--task", + job_task, + "--vad_method", + whisperx_vad_method, + "--print_progress", + "False", + "--verbose", + "False", + ] + if language: + cmd.extend(["--language", language]) + + model_dir_candidate = Path(model_name_or_path).expanduser() + if model_dir_candidate.exists() and model_dir_candidate.is_dir(): + cmd.extend(["--model_dir", str(model_dir_candidate)]) + else: + cmd.extend(["--model", model_name_or_path]) + + if whisperx_mode == "basic": + cmd.append("--no_align") + + if full_mode: + cmd.append("--diarize") + if whisperx_hf_token: + cmd.extend(["--hf_token", whisperx_hf_token]) + if whisperx_min_speakers is not None: + cmd.extend(["--min_speakers", str(whisperx_min_speakers)]) + if whisperx_max_speakers is not None: + cmd.extend(["--max_speakers", str(whisperx_max_speakers)]) + + result = subprocess.run(cmd, capture_output=True, text=True) + details = (result.stderr or result.stdout or "").strip() + return result.returncode, details + + notifier.send("Transcribing", "Running WhisperX...", timeout_ms=1500) + + if whisperx_mode == "full": + native_dir = wav_path.parent / "whisperx-native" + native_dir.mkdir(parents=True, exist_ok=True) + notifier.send("Transcribing", "WhisperX full mode: extracting native subtitles", timeout_ms=1500) + rc, details = _run_whisperx(native_dir, "transcribe", full_mode=False) + if rc != 0: + raise RuntimeError(details or "WhisperX native transcription stage failed.") + + rc, details = _run_whisperx(wav_path.parent, task, full_mode=(whisperx_mode == "full")) + if rc != 0: + raise RuntimeError(details or "whisperx failed.") + if not output_txt.exists(): + raise RuntimeError( + "whisperx completed but no transcript file was produced. " + f"Expected: {output_txt}. {details}" + ) + return output_txt.read_text(encoding="utf-8").strip() + + def _type_with_tool(text: str) -> None: if shutil.which("wtype"): subprocess.run(["wtype", text], check=True) @@ -402,6 +536,13 @@ def _run_transcription_job(args: argparse.Namespace, duration: float | None) -> device=args.device, compute_type=args.compute_type, beam_size=args.beam_size, + task=args.task, + language=args.language, + whisperx_mode=args.whisperx_mode, + whisperx_vad_method=args.whisperx_vad_method, + whisperx_hf_token=args.whisperx_hf_token, + whisperx_min_speakers=args.whisperx_min_speakers, + whisperx_max_speakers=args.whisperx_max_speakers, ) return text.strip() @@ -493,7 +634,21 @@ def start_background(args: argparse.Namespace) -> int: args.compute_type, "--beam-size", str(args.beam_size), + "--task", + args.task, + "--whisperx-mode", + args.whisperx_mode, + "--whisperx-vad-method", + args.whisperx_vad_method, ] + if args.language: + cmd.extend(["--language", args.language]) + if args.whisperx_hf_token: + cmd.extend(["--whisperx-hf-token", args.whisperx_hf_token]) + if args.whisperx_min_speakers is not None: + cmd.extend(["--whisperx-min-speakers", str(args.whisperx_min_speakers)]) + if args.whisperx_max_speakers is not None: + cmd.extend(["--whisperx-max-speakers", str(args.whisperx_max_speakers)]) log_path = state_dir / "worker.log" with log_path.open("a", encoding="utf-8") as log_fh: @@ -572,7 +727,7 @@ def stop_background(args: argparse.Namespace) -> int: def parse_args() -> argparse.Namespace: parser = argparse.ArgumentParser( - description="Record from microphone and transcribe with whisper.cpp or faster-whisper" + description="Record from microphone and transcribe with whisper.cpp, faster-whisper, or WhisperX" ) parser.add_argument( "--mode", @@ -590,14 +745,25 @@ def parse_args() -> argparse.Namespace: ) parser.add_argument( "--backend", - choices=("whispercpp", "ctranslate2"), + choices=("whispercpp", "ctranslate2", "whisperx"), default="whispercpp", help="Transcription backend (default: whispercpp)", ) parser.add_argument( "--model", default=DEFAULT_MODEL, - help="Model name or path. For whispercpp: ggml .bin path/name. For ctranslate2: model name or model directory.", + help="Model name or path. For whispercpp: ggml .bin path/name. For ctranslate2/whisperx: model name or model directory.", + ) + parser.add_argument( + "--task", + choices=("transcribe", "translate"), + default="transcribe", + help="Task to run (default: transcribe).", + ) + parser.add_argument( + "--language", + default=None, + help="Source language code/name (for example: en, es, Japanese). Strongly recommended, and required for --task translate.", ) parser.add_argument( "--duration", @@ -642,6 +808,35 @@ def parse_args() -> argparse.Namespace: default=5, help="Beam size for decoding (default: 5)", ) + parser.add_argument( + "--whisperx-mode", + choices=("basic", "align", "full"), + default="align", + help="WhisperX pipeline mode: basic (no align), align (aligned transcript), full (native transcript + translate/transcribe + align + VAD + diarization).", + ) + parser.add_argument( + "--whisperx-vad-method", + choices=("silero", "pyannote"), + default="silero", + help="WhisperX VAD method (default: silero).", + ) + parser.add_argument( + "--whisperx-hf-token", + default=None, + help="Optional HuggingFace token for WhisperX diarization models.", + ) + parser.add_argument( + "--whisperx-min-speakers", + type=int, + default=None, + help="Optional minimum speaker count for WhisperX diarization.", + ) + parser.add_argument( + "--whisperx-max-speakers", + type=int, + default=None, + help="Optional maximum speaker count for WhisperX diarization.", + ) parser.add_argument( "--state-dir", default=str(DEFAULT_STATE_DIR), @@ -673,6 +868,8 @@ def parse_args() -> argparse.Namespace: parser.error("Use only one of --start, --stop, or --toggle.") if legacy_modes: args.mode = legacy_modes[0] + if args.task == "translate" and not args.language: + parser.error("--task translate requires --language so the source language is explicit.") return args