mirror of
https://github.com/ksyasuda/dotfiles.git
synced 2026-02-27 12:22:43 -08:00
Update
This commit is contained in:
@@ -6,6 +6,8 @@ font-feature = +liga
|
||||
font-feature = +dlig
|
||||
theme = Catppuccin Macchiato
|
||||
cursor-style = block
|
||||
background-opacity = 1.0
|
||||
window-colorspace = srgb
|
||||
window-padding-x = 10
|
||||
window-padding-y = 10
|
||||
window-decoration = false
|
||||
@@ -13,9 +15,10 @@ window-height = 46
|
||||
window-width = 180
|
||||
confirm-close-surface = false
|
||||
copy-on-select = clipboard
|
||||
osc-color-report-format = 16-bit
|
||||
app-notifications = no-clipboard-copy
|
||||
shell-integration = zsh
|
||||
shell-integration-features = title,sudo
|
||||
shell-integration-features = title,sudo,ssh-env,ssh-terminfo
|
||||
desktop-notifications = true
|
||||
term=xterm-ghostty
|
||||
link-url = true
|
||||
|
||||
@@ -100,3 +100,5 @@ zle -N self-insert url-quote-magic
|
||||
zle -N bracketed-paste bracketed-paste-magic
|
||||
|
||||
alias claude-mem='bun "/home/sudacode/.claude/plugins/marketplaces/thedotmack/plugin/scripts/worker-service.cjs"'
|
||||
fpath=(/home/sudacode/.zsh/completions $fpath)
|
||||
autoload -Uz compinit && compinit
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Record microphone audio and transcribe it with whisper.cpp or faster-whisper."""
|
||||
"""Record microphone audio and transcribe it with whisper.cpp, faster-whisper, or WhisperX."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
@@ -145,6 +145,13 @@ def transcribe(
|
||||
device: str,
|
||||
compute_type: str,
|
||||
beam_size: int,
|
||||
task: str,
|
||||
language: str | None,
|
||||
whisperx_mode: str,
|
||||
whisperx_vad_method: str,
|
||||
whisperx_hf_token: str | None,
|
||||
whisperx_min_speakers: int | None,
|
||||
whisperx_max_speakers: int | None,
|
||||
) -> str:
|
||||
if backend == "whispercpp":
|
||||
return transcribe_whispercpp(
|
||||
@@ -153,6 +160,8 @@ def transcribe(
|
||||
notifier=notifier,
|
||||
device=device,
|
||||
beam_size=beam_size,
|
||||
task=task,
|
||||
language=language,
|
||||
)
|
||||
if backend == "ctranslate2":
|
||||
return transcribe_ctranslate2(
|
||||
@@ -162,6 +171,24 @@ def transcribe(
|
||||
device=device,
|
||||
compute_type=compute_type,
|
||||
beam_size=beam_size,
|
||||
task=task,
|
||||
language=language,
|
||||
)
|
||||
if backend == "whisperx":
|
||||
return transcribe_whisperx(
|
||||
model_name_or_path=model_name_or_path,
|
||||
wav_path=wav_path,
|
||||
notifier=notifier,
|
||||
device=device,
|
||||
compute_type=compute_type,
|
||||
beam_size=beam_size,
|
||||
task=task,
|
||||
language=language,
|
||||
whisperx_mode=whisperx_mode,
|
||||
whisperx_vad_method=whisperx_vad_method,
|
||||
whisperx_hf_token=whisperx_hf_token,
|
||||
whisperx_min_speakers=whisperx_min_speakers,
|
||||
whisperx_max_speakers=whisperx_max_speakers,
|
||||
)
|
||||
raise RuntimeError(f"Unsupported backend: {backend}")
|
||||
|
||||
@@ -193,6 +220,8 @@ def transcribe_whispercpp(
|
||||
notifier: Notifier,
|
||||
device: str,
|
||||
beam_size: int,
|
||||
task: str,
|
||||
language: str | None,
|
||||
) -> str:
|
||||
whisper_cli = shutil.which("whisper-cli")
|
||||
if not whisper_cli:
|
||||
@@ -220,6 +249,10 @@ def transcribe_whispercpp(
|
||||
]
|
||||
if device == "cpu":
|
||||
cmd.append("-ng")
|
||||
if language:
|
||||
cmd.extend(["-l", language])
|
||||
if task == "translate":
|
||||
cmd.append("-tr")
|
||||
|
||||
result = subprocess.run(cmd, capture_output=True, text=True)
|
||||
if result.returncode != 0:
|
||||
@@ -241,6 +274,8 @@ def transcribe_ctranslate2(
|
||||
device: str,
|
||||
compute_type: str,
|
||||
beam_size: int,
|
||||
task: str,
|
||||
language: str | None,
|
||||
) -> str:
|
||||
whisper_cli = shutil.which("whisper-ctranslate2")
|
||||
if not whisper_cli:
|
||||
@@ -273,9 +308,13 @@ def transcribe_ctranslate2(
|
||||
compute_type,
|
||||
"--beam_size",
|
||||
str(beam_size),
|
||||
"--task",
|
||||
task,
|
||||
"--verbose",
|
||||
"False",
|
||||
]
|
||||
if language:
|
||||
cmd.extend(["--language", language])
|
||||
model_dir_candidate = Path(model_name_or_path).expanduser()
|
||||
if model_dir_candidate.exists() and model_dir_candidate.is_dir():
|
||||
cmd.extend(["--model_directory", str(model_dir_candidate)])
|
||||
@@ -297,6 +336,101 @@ def transcribe_ctranslate2(
|
||||
return output_txt.read_text(encoding="utf-8").strip()
|
||||
|
||||
|
||||
def transcribe_whisperx(
    model_name_or_path: str,
    wav_path: Path,
    notifier: Notifier,
    device: str,
    compute_type: str,
    beam_size: int,
    task: str,
    language: str | None,
    whisperx_mode: str,
    whisperx_vad_method: str,
    whisperx_hf_token: str | None,
    whisperx_min_speakers: int | None,
    whisperx_max_speakers: int | None,
) -> str:
    """Transcribe ``wav_path`` with the ``whisperx`` CLI and return the text.

    The CLI is asked to write a ``txt`` transcript next to the recording
    (``<wav dir>/<wav stem>.txt``); that file is read back and returned
    stripped of surrounding whitespace.

    Args:
        model_name_or_path: Model name, or a path to a local model directory.
        wav_path: Recording to transcribe.
        notifier: Object whose ``send(title, body, timeout_ms=...)`` is used
            for progress notifications.
        device: Value forwarded to ``--device``.
        compute_type: Value forwarded to ``--compute_type``.
        beam_size: Value forwarded to ``--beam_size``.
        task: Value forwarded to ``--task`` for the main run.
        language: Optional value forwarded to ``--language``.
        whisperx_mode: ``"basic"`` adds ``--no_align``; ``"full"`` runs an
            extra preliminary pass and enables ``--diarize`` on the main
            run; any other value runs a single pass with neither flag.
        whisperx_vad_method: Value forwarded to ``--vad_method``.
        whisperx_hf_token: Optional ``--hf_token`` (diarizing run only).
        whisperx_min_speakers: Optional ``--min_speakers`` (diarizing run only).
        whisperx_max_speakers: Optional ``--max_speakers`` (diarizing run only).

    Returns:
        The transcript text, stripped.

    Raises:
        RuntimeError: If ``whisperx`` is not on ``PATH``, if ``task`` is
            ``"translate"`` without a ``language``, if any whisperx run
            exits non-zero, or if no transcript file was produced.
    """
    whisperx_cli = shutil.which("whisperx")
    if not whisperx_cli:
        raise RuntimeError("whisperx not found in PATH. Install with: pip install whisperx")

    # Remove any stale transcript so a leftover file from an earlier run can
    # never be mistaken for this run's output.
    output_txt = wav_path.parent / f"{wav_path.stem}.txt"
    if output_txt.exists():
        output_txt.unlink()

    if task == "translate" and not language:
        raise RuntimeError("Translation requires --language so WhisperX can translate from the source language.")

    # Resolve the model argument once; it is identical for every run.
    # A local model directory must be passed as --model: per the WhisperX CLI
    # help, --model_dir is only the location where downloaded model files are
    # stored, so passing the directory there (with no --model) would make the
    # CLI fall back to its default model instead of the local one.
    model_dir_candidate = Path(model_name_or_path).expanduser()
    if model_dir_candidate.is_dir():
        model_arg = str(model_dir_candidate)
    else:
        model_arg = model_name_or_path

    def _run_whisperx(out_dir: Path, job_task: str, full_mode: bool) -> tuple[int, str]:
        """Run whisperx once; return ``(exit code, stderr or stdout text)``."""
        cmd = [
            whisperx_cli,
            str(wav_path),
            "--output_dir",
            str(out_dir),
            "--output_format",
            "txt",
            "--device",
            device,
            "--compute_type",
            compute_type,
            "--beam_size",
            str(beam_size),
            "--task",
            job_task,
            "--vad_method",
            whisperx_vad_method,
            "--print_progress",
            "False",
            "--verbose",
            "False",
            "--model",
            model_arg,
        ]
        if language:
            cmd.extend(["--language", language])

        if whisperx_mode == "basic":
            cmd.append("--no_align")

        if full_mode:
            cmd.append("--diarize")
            # NOTE(review): the token is passed as a command-line argument,
            # so it is visible to anything that can list process arguments.
            if whisperx_hf_token:
                cmd.extend(["--hf_token", whisperx_hf_token])
            if whisperx_min_speakers is not None:
                cmd.extend(["--min_speakers", str(whisperx_min_speakers)])
            if whisperx_max_speakers is not None:
                cmd.extend(["--max_speakers", str(whisperx_max_speakers)])

        result = subprocess.run(cmd, capture_output=True, text=True)
        # Prefer stderr for diagnostics, falling back to stdout.
        details = (result.stderr or result.stdout or "").strip()
        return result.returncode, details

    notifier.send("Transcribing", "Running WhisperX...", timeout_ms=1500)

    if whisperx_mode == "full":
        # Preliminary pass: plain transcription (no diarization) written to a
        # separate directory so it cannot collide with the main transcript.
        # Its output is not read back by this function.
        native_dir = wav_path.parent / "whisperx-native"
        native_dir.mkdir(parents=True, exist_ok=True)
        notifier.send("Transcribing", "WhisperX full mode: extracting native subtitles", timeout_ms=1500)
        rc, details = _run_whisperx(native_dir, "transcribe", full_mode=False)
        if rc != 0:
            raise RuntimeError(details or "WhisperX native transcription stage failed.")

    # Main pass: the requested task, with diarization only in full mode.
    rc, details = _run_whisperx(wav_path.parent, task, full_mode=(whisperx_mode == "full"))
    if rc != 0:
        raise RuntimeError(details or "whisperx failed.")
    if not output_txt.exists():
        raise RuntimeError(
            "whisperx completed but no transcript file was produced. "
            f"Expected: {output_txt}. {details}"
        )
    return output_txt.read_text(encoding="utf-8").strip()
|
||||
|
||||
|
||||
def _type_with_tool(text: str) -> None:
|
||||
if shutil.which("wtype"):
|
||||
subprocess.run(["wtype", text], check=True)
|
||||
@@ -402,6 +536,13 @@ def _run_transcription_job(args: argparse.Namespace, duration: float | None) ->
|
||||
device=args.device,
|
||||
compute_type=args.compute_type,
|
||||
beam_size=args.beam_size,
|
||||
task=args.task,
|
||||
language=args.language,
|
||||
whisperx_mode=args.whisperx_mode,
|
||||
whisperx_vad_method=args.whisperx_vad_method,
|
||||
whisperx_hf_token=args.whisperx_hf_token,
|
||||
whisperx_min_speakers=args.whisperx_min_speakers,
|
||||
whisperx_max_speakers=args.whisperx_max_speakers,
|
||||
)
|
||||
|
||||
return text.strip()
|
||||
@@ -493,7 +634,21 @@ def start_background(args: argparse.Namespace) -> int:
|
||||
args.compute_type,
|
||||
"--beam-size",
|
||||
str(args.beam_size),
|
||||
"--task",
|
||||
args.task,
|
||||
"--whisperx-mode",
|
||||
args.whisperx_mode,
|
||||
"--whisperx-vad-method",
|
||||
args.whisperx_vad_method,
|
||||
]
|
||||
if args.language:
|
||||
cmd.extend(["--language", args.language])
|
||||
if args.whisperx_hf_token:
|
||||
cmd.extend(["--whisperx-hf-token", args.whisperx_hf_token])
|
||||
if args.whisperx_min_speakers is not None:
|
||||
cmd.extend(["--whisperx-min-speakers", str(args.whisperx_min_speakers)])
|
||||
if args.whisperx_max_speakers is not None:
|
||||
cmd.extend(["--whisperx-max-speakers", str(args.whisperx_max_speakers)])
|
||||
|
||||
log_path = state_dir / "worker.log"
|
||||
with log_path.open("a", encoding="utf-8") as log_fh:
|
||||
@@ -572,7 +727,7 @@ def stop_background(args: argparse.Namespace) -> int:
|
||||
|
||||
def parse_args() -> argparse.Namespace:
|
||||
parser = argparse.ArgumentParser(
|
||||
description="Record from microphone and transcribe with whisper.cpp or faster-whisper"
|
||||
description="Record from microphone and transcribe with whisper.cpp, faster-whisper, or WhisperX"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--mode",
|
||||
@@ -590,14 +745,25 @@ def parse_args() -> argparse.Namespace:
|
||||
)
|
||||
parser.add_argument(
|
||||
"--backend",
|
||||
choices=("whispercpp", "ctranslate2"),
|
||||
choices=("whispercpp", "ctranslate2", "whisperx"),
|
||||
default="whispercpp",
|
||||
help="Transcription backend (default: whispercpp)",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--model",
|
||||
default=DEFAULT_MODEL,
|
||||
help="Model name or path. For whispercpp: ggml .bin path/name. For ctranslate2: model name or model directory.",
|
||||
help="Model name or path. For whispercpp: ggml .bin path/name. For ctranslate2/whisperx: model name or model directory.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--task",
|
||||
choices=("transcribe", "translate"),
|
||||
default="transcribe",
|
||||
help="Task to run (default: transcribe).",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--language",
|
||||
default=None,
|
||||
help="Source language code/name (for example: en, es, Japanese). Strongly recommended, and required for --task translate.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--duration",
|
||||
@@ -642,6 +808,35 @@ def parse_args() -> argparse.Namespace:
|
||||
default=5,
|
||||
help="Beam size for decoding (default: 5)",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--whisperx-mode",
|
||||
choices=("basic", "align", "full"),
|
||||
default="align",
|
||||
help="WhisperX pipeline mode: basic (no align), align (aligned transcript), full (native transcript + translate/transcribe + align + VAD + diarization).",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--whisperx-vad-method",
|
||||
choices=("silero", "pyannote"),
|
||||
default="silero",
|
||||
help="WhisperX VAD method (default: silero).",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--whisperx-hf-token",
|
||||
default=None,
|
||||
help="Optional HuggingFace token for WhisperX diarization models.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--whisperx-min-speakers",
|
||||
type=int,
|
||||
default=None,
|
||||
help="Optional minimum speaker count for WhisperX diarization.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--whisperx-max-speakers",
|
||||
type=int,
|
||||
default=None,
|
||||
help="Optional maximum speaker count for WhisperX diarization.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--state-dir",
|
||||
default=str(DEFAULT_STATE_DIR),
|
||||
@@ -673,6 +868,8 @@ def parse_args() -> argparse.Namespace:
|
||||
parser.error("Use only one of --start, --stop, or --toggle.")
|
||||
if legacy_modes:
|
||||
args.mode = legacy_modes[0]
|
||||
if args.task == "translate" and not args.language:
|
||||
parser.error("--task translate requires --language so the source language is explicit.")
|
||||
return args
|
||||
|
||||
|
||||
|
||||
Reference in New Issue
Block a user