Update

2026-05-09 00:41:27 -07:00 · 2026-02-10 22:53:41 -08:00
parent d23a385861
commit c333265231
3 changed files with 207 additions and 5 deletions
@@ -6,6 +6,8 @@ font-feature = +liga
 font-feature = +dlig
 theme = Catppuccin Macchiato
 cursor-style = block
 background-opacity = 1.0
 window-colorspace = srgb
 window-padding-x = 10
 window-padding-y = 10
 window-decoration = false
@@ -13,9 +15,10 @@ window-height = 46
 window-width = 180
 confirm-close-surface = false
 copy-on-select = clipboard
 osc-color-report-format = 16-bit
 app-notifications = no-clipboard-copy
 shell-integration = zsh
-shell-integration-features = title,sudo
+shell-integration-features = title,sudo,ssh-env,ssh-terminfo
 desktop-notifications = true
 term=xterm-ghostty
 link-url = true
@@ -100,3 +100,5 @@ zle   -N self-insert       url-quote-magic
 zle   -N bracketed-paste   bracketed-paste-magic
 alias claude-mem='bun "/home/sudacode/.claude/plugins/marketplaces/thedotmack/plugin/scripts/worker-service.cjs"'
 fpath=(/home/sudacode/.zsh/completions $fpath)
 autoload -Uz compinit && compinit
@@ -1,5 +1,5 @@
 #!/usr/bin/env python3
-"""Record microphone audio and transcribe it with whisper.cpp or faster-whisper."""
+"""Record microphone audio and transcribe it with whisper.cpp, faster-whisper, or WhisperX."""
 from __future__ import annotations
@@ -145,6 +145,13 @@ def transcribe(
    device: str,
    compute_type: str,
    beam_size: int,
    task: str,
    language: str | None,
    whisperx_mode: str,
    whisperx_vad_method: str,
    whisperx_hf_token: str | None,
    whisperx_min_speakers: int | None,
    whisperx_max_speakers: int | None,
 ) -> str:
    if backend == "whispercpp":
        return transcribe_whispercpp(
@@ -153,6 +160,8 @@ def transcribe(
            notifier=notifier,
            device=device,
            beam_size=beam_size,
            task=task,
            language=language,
        )
    if backend == "ctranslate2":
        return transcribe_ctranslate2(
@@ -162,6 +171,24 @@ def transcribe(
            device=device,
            compute_type=compute_type,
            beam_size=beam_size,
            task=task,
            language=language,
        )
    if backend == "whisperx":
        return transcribe_whisperx(
            model_name_or_path=model_name_or_path,
            wav_path=wav_path,
            notifier=notifier,
            device=device,
            compute_type=compute_type,
            beam_size=beam_size,
            task=task,
            language=language,
            whisperx_mode=whisperx_mode,
            whisperx_vad_method=whisperx_vad_method,
            whisperx_hf_token=whisperx_hf_token,
            whisperx_min_speakers=whisperx_min_speakers,
            whisperx_max_speakers=whisperx_max_speakers,
        )
    raise RuntimeError(f"Unsupported backend: {backend}")
@@ -193,6 +220,8 @@ def transcribe_whispercpp(
    notifier: Notifier,
    device: str,
    beam_size: int,
    task: str,
    language: str | None,
 ) -> str:
    whisper_cli = shutil.which("whisper-cli")
    if not whisper_cli:
@@ -220,6 +249,10 @@ def transcribe_whispercpp(
    ]
    if device == "cpu":
        cmd.append("-ng")
    if language:
        cmd.extend(["-l", language])
    if task == "translate":
        cmd.append("-tr")
    result = subprocess.run(cmd, capture_output=True, text=True)
    if result.returncode != 0:
@@ -241,6 +274,8 @@ def transcribe_ctranslate2(
    device: str,
    compute_type: str,
    beam_size: int,
    task: str,
    language: str | None,
 ) -> str:
    whisper_cli = shutil.which("whisper-ctranslate2")
    if not whisper_cli:
@@ -273,9 +308,13 @@ def transcribe_ctranslate2(
        compute_type,
        "--beam_size",
        str(beam_size),
        "--task",
        task,
        "--verbose",
        "False",
    ]
    if language:
        cmd.extend(["--language", language])
    model_dir_candidate = Path(model_name_or_path).expanduser()
    if model_dir_candidate.exists() and model_dir_candidate.is_dir():
        cmd.extend(["--model_directory", str(model_dir_candidate)])
@@ -297,6 +336,101 @@ def transcribe_ctranslate2(
    return output_txt.read_text(encoding="utf-8").strip()
 def transcribe_whisperx(
    model_name_or_path: str,
    wav_path: Path,
    notifier: Notifier,
    device: str,
    compute_type: str,
    beam_size: int,
    task: str,
    language: str | None,
    whisperx_mode: str,
    whisperx_vad_method: str,
    whisperx_hf_token: str | None,
    whisperx_min_speakers: int | None,
    whisperx_max_speakers: int | None,
 ) -> str:
    """Transcribe ``wav_path`` by shelling out to the ``whisperx`` CLI.

    The CLI is asked for ``txt`` output in the directory that holds
    ``wav_path``; the resulting ``<stem>.txt`` is read back and returned.

    Args:
        model_name_or_path: Model name, or a path to a local model directory.
            Either form is forwarded through ``--model``.
        wav_path: Audio file to transcribe.
        notifier: Object whose ``send(title, message, timeout_ms=...)`` is
            called to report progress.
        device: Value forwarded as ``--device``.
        compute_type: Value forwarded as ``--compute_type``.
        beam_size: Value forwarded as ``--beam_size``.
        task: ``"transcribe"`` or ``"translate"``; forwarded as ``--task``.
        language: Source language forwarded as ``--language`` when set.
            Required when ``task`` is ``"translate"``.
        whisperx_mode: ``"basic"`` adds ``--no_align``; ``"full"`` first runs
            an extra plain transcription into ``whisperx-native/`` and then
            runs the requested task with ``--diarize``; any other value runs
            a single pass without either flag.
        whisperx_vad_method: Value forwarded as ``--vad_method``.
        whisperx_hf_token: Optional token forwarded as ``--hf_token`` on the
            diarization pass.
        whisperx_min_speakers: Optional ``--min_speakers`` for diarization.
        whisperx_max_speakers: Optional ``--max_speakers`` for diarization.

    Returns:
        The stripped contents of the transcript file produced by WhisperX.

    Raises:
        RuntimeError: If ``whisperx`` is not on PATH, translation is requested
            without a language, a WhisperX invocation exits non-zero, or no
            transcript file is produced.
    """
    whisperx_cli = shutil.which("whisperx")
    if not whisperx_cli:
        raise RuntimeError("whisperx not found in PATH. Install with: pip install whisperx")
    # Validate arguments before touching the filesystem.
    if task == "translate" and not language:
        raise RuntimeError("Translation requires --language so WhisperX can translate from the source language.")
    # Remove any stale transcript so a leftover file is never mistaken for new output.
    output_txt = wav_path.parent / f"{wav_path.stem}.txt"
    output_txt.unlink(missing_ok=True)
    # A local model directory must be passed via --model: per the WhisperX CLI
    # help, --model_dir is only the download/cache location, and using it alone
    # would silently fall back to the CLI's default model.
    model_dir_candidate = Path(model_name_or_path).expanduser()
    model_arg = str(model_dir_candidate) if model_dir_candidate.is_dir() else model_name_or_path
    def _run_whisperx(out_dir: Path, job_task: str, full_mode: bool) -> tuple[int, str]:
        """Run one WhisperX pass; return ``(exit code, stderr-or-stdout text)``."""
        cmd = [
            whisperx_cli,
            str(wav_path),
            "--output_dir",
            str(out_dir),
            "--output_format",
            "txt",
            "--device",
            device,
            "--compute_type",
            compute_type,
            "--beam_size",
            str(beam_size),
            "--task",
            job_task,
            "--vad_method",
            whisperx_vad_method,
            "--print_progress",
            "False",
            "--verbose",
            "False",
            "--model",
            model_arg,
        ]
        if language:
            cmd.extend(["--language", language])
        if whisperx_mode == "basic":
            cmd.append("--no_align")
        if full_mode:
            cmd.append("--diarize")
            # NOTE(review): the token is visible in the process list while WhisperX runs.
            if whisperx_hf_token:
                cmd.extend(["--hf_token", whisperx_hf_token])
            if whisperx_min_speakers is not None:
                cmd.extend(["--min_speakers", str(whisperx_min_speakers)])
            if whisperx_max_speakers is not None:
                cmd.extend(["--max_speakers", str(whisperx_max_speakers)])
        result = subprocess.run(cmd, capture_output=True, text=True)
        details = (result.stderr or result.stdout or "").strip()
        return result.returncode, details
    notifier.send("Transcribing", "Running WhisperX...", timeout_ms=1500)
    wants_full = whisperx_mode == "full"
    if wants_full:
        # Full mode keeps a plain source-language transcript in a side directory
        # before running the requested task with diarization.
        native_dir = wav_path.parent / "whisperx-native"
        native_dir.mkdir(parents=True, exist_ok=True)
        notifier.send("Transcribing", "WhisperX full mode: extracting native subtitles", timeout_ms=1500)
        rc, details = _run_whisperx(native_dir, "transcribe", full_mode=False)
        if rc != 0:
            raise RuntimeError(details or "WhisperX native transcription stage failed.")
    rc, details = _run_whisperx(wav_path.parent, task, full_mode=wants_full)
    if rc != 0:
        raise RuntimeError(details or "whisperx failed.")
    if not output_txt.exists():
        raise RuntimeError(
            "whisperx completed but no transcript file was produced. "
            f"Expected: {output_txt}. {details}"
        )
    return output_txt.read_text(encoding="utf-8").strip()
 def _type_with_tool(text: str) -> None:
    if shutil.which("wtype"):
        subprocess.run(["wtype", text], check=True)
@@ -402,6 +536,13 @@ def _run_transcription_job(args: argparse.Namespace, duration: float | None) ->
            device=args.device,
            compute_type=args.compute_type,
            beam_size=args.beam_size,
            task=args.task,
            language=args.language,
            whisperx_mode=args.whisperx_mode,
            whisperx_vad_method=args.whisperx_vad_method,
            whisperx_hf_token=args.whisperx_hf_token,
            whisperx_min_speakers=args.whisperx_min_speakers,
            whisperx_max_speakers=args.whisperx_max_speakers,
        )
    return text.strip()
@@ -493,7 +634,21 @@ def start_background(args: argparse.Namespace) -> int:
        args.compute_type,
        "--beam-size",
        str(args.beam_size),
        "--task",
        args.task,
        "--whisperx-mode",
        args.whisperx_mode,
        "--whisperx-vad-method",
        args.whisperx_vad_method,
    ]
    if args.language:
        cmd.extend(["--language", args.language])
    if args.whisperx_hf_token:
        cmd.extend(["--whisperx-hf-token", args.whisperx_hf_token])
    if args.whisperx_min_speakers is not None:
        cmd.extend(["--whisperx-min-speakers", str(args.whisperx_min_speakers)])
    if args.whisperx_max_speakers is not None:
        cmd.extend(["--whisperx-max-speakers", str(args.whisperx_max_speakers)])
    log_path = state_dir / "worker.log"
    with log_path.open("a", encoding="utf-8") as log_fh:
@@ -572,7 +727,7 @@ def stop_background(args: argparse.Namespace) -> int:
 def parse_args() -> argparse.Namespace:
    parser = argparse.ArgumentParser(
-        description="Record from microphone and transcribe with whisper.cpp or faster-whisper"
+        description="Record from microphone and transcribe with whisper.cpp, faster-whisper, or WhisperX"
    )
    parser.add_argument(
        "--mode",
@@ -590,14 +745,25 @@ def parse_args() -> argparse.Namespace:
    )
    parser.add_argument(
        "--backend",
-        choices=("whispercpp", "ctranslate2"),
+        choices=("whispercpp", "ctranslate2", "whisperx"),
        default="whispercpp",
        help="Transcription backend (default: whispercpp)",
    )
    parser.add_argument(
        "--model",
        default=DEFAULT_MODEL,
-        help="Model name or path. For whispercpp: ggml .bin path/name. For ctranslate2: model name or model directory.",
+        help="Model name or path. For whispercpp: ggml .bin path/name. For ctranslate2/whisperx: model name or model directory.",
    )
    parser.add_argument(
        "--task",
        choices=("transcribe", "translate"),
        default="transcribe",
        help="Task to run (default: transcribe).",
    )
    parser.add_argument(
        "--language",
        default=None,
        help="Source language code/name (for example: en, es, Japanese). Strongly recommended, and required for --task translate.",
    )
    parser.add_argument(
        "--duration",
@@ -642,6 +808,35 @@ def parse_args() -> argparse.Namespace:
        default=5,
        help="Beam size for decoding (default: 5)",
    )
    parser.add_argument(
        "--whisperx-mode",
        choices=("basic", "align", "full"),
        default="align",
        help="WhisperX pipeline mode: basic (no align), align (aligned transcript), full (native transcript + translate/transcribe + align + VAD + diarization).",
    )
    parser.add_argument(
        "--whisperx-vad-method",
        choices=("silero", "pyannote"),
        default="silero",
        help="WhisperX VAD method (default: silero).",
    )
    parser.add_argument(
        "--whisperx-hf-token",
        default=None,
        help="Optional HuggingFace token for WhisperX diarization models.",
    )
    parser.add_argument(
        "--whisperx-min-speakers",
        type=int,
        default=None,
        help="Optional minimum speaker count for WhisperX diarization.",
    )
    parser.add_argument(
        "--whisperx-max-speakers",
        type=int,
        default=None,
        help="Optional maximum speaker count for WhisperX diarization.",
    )
    parser.add_argument(
        "--state-dir",
        default=str(DEFAULT_STATE_DIR),
@@ -673,6 +868,8 @@ def parse_args() -> argparse.Namespace:
        parser.error("Use only one of --start, --stop, or --toggle.")
    if legacy_modes:
        args.mode = legacy_modes[0]
    if args.task == "translate" and not args.language:
        parser.error("--task translate requires --language so the source language is explicit.")
    return args