mirror of
https://github.com/ksyasuda/dotfiles.git
synced 2026-02-27 12:22:43 -08:00
Update
This commit is contained in:
@@ -6,6 +6,8 @@ font-feature = +liga
|
|||||||
font-feature = +dlig
|
font-feature = +dlig
|
||||||
theme = Catppuccin Macchiato
|
theme = Catppuccin Macchiato
|
||||||
cursor-style = block
|
cursor-style = block
|
||||||
|
background-opacity = 1.0
|
||||||
|
window-colorspace = srgb
|
||||||
window-padding-x = 10
|
window-padding-x = 10
|
||||||
window-padding-y = 10
|
window-padding-y = 10
|
||||||
window-decoration = false
|
window-decoration = false
|
||||||
@@ -13,9 +15,10 @@ window-height = 46
|
|||||||
window-width = 180
|
window-width = 180
|
||||||
confirm-close-surface = false
|
confirm-close-surface = false
|
||||||
copy-on-select = clipboard
|
copy-on-select = clipboard
|
||||||
|
osc-color-report-format = 16-bit
|
||||||
app-notifications = no-clipboard-copy
|
app-notifications = no-clipboard-copy
|
||||||
shell-integration = zsh
|
shell-integration = zsh
|
||||||
shell-integration-features = title,sudo
|
shell-integration-features = title,sudo,ssh-env,ssh-terminfo
|
||||||
desktop-notifications = true
|
desktop-notifications = true
|
||||||
term=xterm-ghostty
|
term=xterm-ghostty
|
||||||
link-url = true
|
link-url = true
|
||||||
|
|||||||
@@ -100,3 +100,5 @@ zle -N self-insert url-quote-magic
|
|||||||
zle -N bracketed-paste bracketed-paste-magic
|
zle -N bracketed-paste bracketed-paste-magic
|
||||||
|
|
||||||
alias claude-mem='bun "/home/sudacode/.claude/plugins/marketplaces/thedotmack/plugin/scripts/worker-service.cjs"'
|
alias claude-mem='bun "/home/sudacode/.claude/plugins/marketplaces/thedotmack/plugin/scripts/worker-service.cjs"'
|
||||||
|
fpath=(/home/sudacode/.zsh/completions $fpath)
|
||||||
|
autoload -Uz compinit && compinit
|
||||||
|
|||||||
@@ -1,5 +1,5 @@
|
|||||||
#!/usr/bin/env python3
|
#!/usr/bin/env python3
|
||||||
"""Record microphone audio and transcribe it with whisper.cpp or faster-whisper."""
|
"""Record microphone audio and transcribe it with whisper.cpp, faster-whisper, or WhisperX."""
|
||||||
|
|
||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
|
|
||||||
@@ -145,6 +145,13 @@ def transcribe(
|
|||||||
device: str,
|
device: str,
|
||||||
compute_type: str,
|
compute_type: str,
|
||||||
beam_size: int,
|
beam_size: int,
|
||||||
|
task: str,
|
||||||
|
language: str | None,
|
||||||
|
whisperx_mode: str,
|
||||||
|
whisperx_vad_method: str,
|
||||||
|
whisperx_hf_token: str | None,
|
||||||
|
whisperx_min_speakers: int | None,
|
||||||
|
whisperx_max_speakers: int | None,
|
||||||
) -> str:
|
) -> str:
|
||||||
if backend == "whispercpp":
|
if backend == "whispercpp":
|
||||||
return transcribe_whispercpp(
|
return transcribe_whispercpp(
|
||||||
@@ -153,6 +160,8 @@ def transcribe(
|
|||||||
notifier=notifier,
|
notifier=notifier,
|
||||||
device=device,
|
device=device,
|
||||||
beam_size=beam_size,
|
beam_size=beam_size,
|
||||||
|
task=task,
|
||||||
|
language=language,
|
||||||
)
|
)
|
||||||
if backend == "ctranslate2":
|
if backend == "ctranslate2":
|
||||||
return transcribe_ctranslate2(
|
return transcribe_ctranslate2(
|
||||||
@@ -162,6 +171,24 @@ def transcribe(
|
|||||||
device=device,
|
device=device,
|
||||||
compute_type=compute_type,
|
compute_type=compute_type,
|
||||||
beam_size=beam_size,
|
beam_size=beam_size,
|
||||||
|
task=task,
|
||||||
|
language=language,
|
||||||
|
)
|
||||||
|
if backend == "whisperx":
|
||||||
|
return transcribe_whisperx(
|
||||||
|
model_name_or_path=model_name_or_path,
|
||||||
|
wav_path=wav_path,
|
||||||
|
notifier=notifier,
|
||||||
|
device=device,
|
||||||
|
compute_type=compute_type,
|
||||||
|
beam_size=beam_size,
|
||||||
|
task=task,
|
||||||
|
language=language,
|
||||||
|
whisperx_mode=whisperx_mode,
|
||||||
|
whisperx_vad_method=whisperx_vad_method,
|
||||||
|
whisperx_hf_token=whisperx_hf_token,
|
||||||
|
whisperx_min_speakers=whisperx_min_speakers,
|
||||||
|
whisperx_max_speakers=whisperx_max_speakers,
|
||||||
)
|
)
|
||||||
raise RuntimeError(f"Unsupported backend: {backend}")
|
raise RuntimeError(f"Unsupported backend: {backend}")
|
||||||
|
|
||||||
@@ -193,6 +220,8 @@ def transcribe_whispercpp(
|
|||||||
notifier: Notifier,
|
notifier: Notifier,
|
||||||
device: str,
|
device: str,
|
||||||
beam_size: int,
|
beam_size: int,
|
||||||
|
task: str,
|
||||||
|
language: str | None,
|
||||||
) -> str:
|
) -> str:
|
||||||
whisper_cli = shutil.which("whisper-cli")
|
whisper_cli = shutil.which("whisper-cli")
|
||||||
if not whisper_cli:
|
if not whisper_cli:
|
||||||
@@ -220,6 +249,10 @@ def transcribe_whispercpp(
|
|||||||
]
|
]
|
||||||
if device == "cpu":
|
if device == "cpu":
|
||||||
cmd.append("-ng")
|
cmd.append("-ng")
|
||||||
|
if language:
|
||||||
|
cmd.extend(["-l", language])
|
||||||
|
if task == "translate":
|
||||||
|
cmd.append("-tr")
|
||||||
|
|
||||||
result = subprocess.run(cmd, capture_output=True, text=True)
|
result = subprocess.run(cmd, capture_output=True, text=True)
|
||||||
if result.returncode != 0:
|
if result.returncode != 0:
|
||||||
@@ -241,6 +274,8 @@ def transcribe_ctranslate2(
|
|||||||
device: str,
|
device: str,
|
||||||
compute_type: str,
|
compute_type: str,
|
||||||
beam_size: int,
|
beam_size: int,
|
||||||
|
task: str,
|
||||||
|
language: str | None,
|
||||||
) -> str:
|
) -> str:
|
||||||
whisper_cli = shutil.which("whisper-ctranslate2")
|
whisper_cli = shutil.which("whisper-ctranslate2")
|
||||||
if not whisper_cli:
|
if not whisper_cli:
|
||||||
@@ -273,9 +308,13 @@ def transcribe_ctranslate2(
|
|||||||
compute_type,
|
compute_type,
|
||||||
"--beam_size",
|
"--beam_size",
|
||||||
str(beam_size),
|
str(beam_size),
|
||||||
|
"--task",
|
||||||
|
task,
|
||||||
"--verbose",
|
"--verbose",
|
||||||
"False",
|
"False",
|
||||||
]
|
]
|
||||||
|
if language:
|
||||||
|
cmd.extend(["--language", language])
|
||||||
model_dir_candidate = Path(model_name_or_path).expanduser()
|
model_dir_candidate = Path(model_name_or_path).expanduser()
|
||||||
if model_dir_candidate.exists() and model_dir_candidate.is_dir():
|
if model_dir_candidate.exists() and model_dir_candidate.is_dir():
|
||||||
cmd.extend(["--model_directory", str(model_dir_candidate)])
|
cmd.extend(["--model_directory", str(model_dir_candidate)])
|
||||||
@@ -297,6 +336,101 @@ def transcribe_ctranslate2(
|
|||||||
return output_txt.read_text(encoding="utf-8").strip()
|
return output_txt.read_text(encoding="utf-8").strip()
|
||||||
|
|
||||||
|
|
||||||
|
def transcribe_whisperx(
    model_name_or_path: str,
    wav_path: Path,
    notifier: Notifier,
    device: str,
    compute_type: str,
    beam_size: int,
    task: str,
    language: str | None,
    whisperx_mode: str,
    whisperx_vad_method: str,
    whisperx_hf_token: str | None,
    whisperx_min_speakers: int | None,
    whisperx_max_speakers: int | None,
) -> str:
    """Run the ``whisperx`` CLI on ``wav_path`` and return the transcript text.

    The CLI is asked to write a ``txt`` transcript next to the recording
    (``<wav stem>.txt``); that file is read back, stripped and returned.

    Args:
        model_name_or_path: Model name, or a path to a local model directory.
            Either way it is handed to ``--model`` (a directory is passed as
            its ``~``-expanded path).
        wav_path: Recording to process.
        notifier: Object whose ``send(title, body, timeout_ms=...)`` is used
            for progress notifications.
        device: Value forwarded to ``--device``.
        compute_type: Value forwarded to ``--compute_type``.
        beam_size: Value forwarded to ``--beam_size``.
        task: ``"transcribe"`` or ``"translate"``; forwarded to ``--task``.
        language: Source language forwarded to ``--language`` when set.
            Required when ``task`` is ``"translate"``.
        whisperx_mode: ``"basic"`` adds ``--no_align``; ``"full"`` first runs
            an extra plain-transcription pass into ``whisperx-native/`` and
            then runs the main pass with ``--diarize``; any other value runs
            a single pass with default alignment.
        whisperx_vad_method: Value forwarded to ``--vad_method``.
        whisperx_hf_token: Optional token forwarded to ``--hf_token`` on the
            diarization pass.
        whisperx_min_speakers: Optional ``--min_speakers`` for diarization.
        whisperx_max_speakers: Optional ``--max_speakers`` for diarization.

    Returns:
        The stripped contents of the generated transcript file.

    Raises:
        RuntimeError: If ``whisperx`` is not on ``PATH``, if translation is
            requested without a language, if either CLI pass exits non-zero,
            or if no transcript file was produced.
    """
    whisperx_cli = shutil.which("whisperx")
    if not whisperx_cli:
        raise RuntimeError("whisperx not found in PATH. Install with: pip install whisperx")

    # Validate the request before touching the filesystem, so a rejected call
    # does not delete a transcript left by an earlier successful run.
    if task == "translate" and not language:
        raise RuntimeError("Translation requires --language so WhisperX can translate from the source language.")

    # Remove any stale transcript so the existence check after the run can
    # only be satisfied by output from this invocation.
    output_txt = wav_path.parent / f"{wav_path.stem}.txt"
    if output_txt.exists():
        output_txt.unlink()

    # Resolve the model once; it is identical for every CLI pass.
    # WhisperX's ``--model_dir`` is the download/cache location, not the model
    # to load, so a local model directory must be given to ``--model`` itself;
    # otherwise the CLI falls back to its default model.
    model_dir_candidate = Path(model_name_or_path).expanduser()
    if model_dir_candidate.is_dir():
        model_arg = str(model_dir_candidate)
    else:
        model_arg = model_name_or_path

    def _run_whisperx(out_dir: Path, job_task: str, full_mode: bool) -> tuple[int, str]:
        """Run one whisperx pass; return ``(exit code, stderr-or-stdout text)``."""
        cmd = [
            whisperx_cli,
            str(wav_path),
            "--output_dir",
            str(out_dir),
            "--output_format",
            "txt",
            "--device",
            device,
            "--compute_type",
            compute_type,
            "--beam_size",
            str(beam_size),
            "--task",
            job_task,
            "--vad_method",
            whisperx_vad_method,
            "--print_progress",
            "False",
            "--verbose",
            "False",
            "--model",
            model_arg,
        ]
        if language:
            cmd.extend(["--language", language])

        if whisperx_mode == "basic":
            cmd.append("--no_align")

        if full_mode:
            cmd.append("--diarize")
            # NOTE(review): the token travels on the command line and is
            # therefore visible in the process list while whisperx runs.
            if whisperx_hf_token:
                cmd.extend(["--hf_token", whisperx_hf_token])
            if whisperx_min_speakers is not None:
                cmd.extend(["--min_speakers", str(whisperx_min_speakers)])
            if whisperx_max_speakers is not None:
                cmd.extend(["--max_speakers", str(whisperx_max_speakers)])

        result = subprocess.run(cmd, capture_output=True, text=True)
        # Prefer stderr for diagnostics; fall back to stdout when it is empty.
        details = (result.stderr or result.stdout or "").strip()
        return result.returncode, details

    notifier.send("Transcribing", "Running WhisperX...", timeout_ms=1500)

    is_full = whisperx_mode == "full"
    if is_full:
        # Extra pass: plain transcription (no diarization) written to a
        # separate directory so it cannot clobber the main transcript.
        native_dir = wav_path.parent / "whisperx-native"
        native_dir.mkdir(parents=True, exist_ok=True)
        notifier.send("Transcribing", "WhisperX full mode: extracting native subtitles", timeout_ms=1500)
        rc, details = _run_whisperx(native_dir, "transcribe", full_mode=False)
        if rc != 0:
            raise RuntimeError(details or "WhisperX native transcription stage failed.")

    # Main pass: the requested task, written next to the recording.
    rc, details = _run_whisperx(wav_path.parent, task, full_mode=is_full)
    if rc != 0:
        raise RuntimeError(details or "whisperx failed.")
    if not output_txt.exists():
        raise RuntimeError(
            "whisperx completed but no transcript file was produced. "
            f"Expected: {output_txt}. {details}"
        )
    return output_txt.read_text(encoding="utf-8").strip()
|
||||||
|
|
||||||
|
|
||||||
def _type_with_tool(text: str) -> None:
|
def _type_with_tool(text: str) -> None:
|
||||||
if shutil.which("wtype"):
|
if shutil.which("wtype"):
|
||||||
subprocess.run(["wtype", text], check=True)
|
subprocess.run(["wtype", text], check=True)
|
||||||
@@ -402,6 +536,13 @@ def _run_transcription_job(args: argparse.Namespace, duration: float | None) ->
|
|||||||
device=args.device,
|
device=args.device,
|
||||||
compute_type=args.compute_type,
|
compute_type=args.compute_type,
|
||||||
beam_size=args.beam_size,
|
beam_size=args.beam_size,
|
||||||
|
task=args.task,
|
||||||
|
language=args.language,
|
||||||
|
whisperx_mode=args.whisperx_mode,
|
||||||
|
whisperx_vad_method=args.whisperx_vad_method,
|
||||||
|
whisperx_hf_token=args.whisperx_hf_token,
|
||||||
|
whisperx_min_speakers=args.whisperx_min_speakers,
|
||||||
|
whisperx_max_speakers=args.whisperx_max_speakers,
|
||||||
)
|
)
|
||||||
|
|
||||||
return text.strip()
|
return text.strip()
|
||||||
@@ -493,7 +634,21 @@ def start_background(args: argparse.Namespace) -> int:
|
|||||||
args.compute_type,
|
args.compute_type,
|
||||||
"--beam-size",
|
"--beam-size",
|
||||||
str(args.beam_size),
|
str(args.beam_size),
|
||||||
|
"--task",
|
||||||
|
args.task,
|
||||||
|
"--whisperx-mode",
|
||||||
|
args.whisperx_mode,
|
||||||
|
"--whisperx-vad-method",
|
||||||
|
args.whisperx_vad_method,
|
||||||
]
|
]
|
||||||
|
if args.language:
|
||||||
|
cmd.extend(["--language", args.language])
|
||||||
|
if args.whisperx_hf_token:
|
||||||
|
cmd.extend(["--whisperx-hf-token", args.whisperx_hf_token])
|
||||||
|
if args.whisperx_min_speakers is not None:
|
||||||
|
cmd.extend(["--whisperx-min-speakers", str(args.whisperx_min_speakers)])
|
||||||
|
if args.whisperx_max_speakers is not None:
|
||||||
|
cmd.extend(["--whisperx-max-speakers", str(args.whisperx_max_speakers)])
|
||||||
|
|
||||||
log_path = state_dir / "worker.log"
|
log_path = state_dir / "worker.log"
|
||||||
with log_path.open("a", encoding="utf-8") as log_fh:
|
with log_path.open("a", encoding="utf-8") as log_fh:
|
||||||
@@ -572,7 +727,7 @@ def stop_background(args: argparse.Namespace) -> int:
|
|||||||
|
|
||||||
def parse_args() -> argparse.Namespace:
|
def parse_args() -> argparse.Namespace:
|
||||||
parser = argparse.ArgumentParser(
|
parser = argparse.ArgumentParser(
|
||||||
description="Record from microphone and transcribe with whisper.cpp or faster-whisper"
|
description="Record from microphone and transcribe with whisper.cpp, faster-whisper, or WhisperX"
|
||||||
)
|
)
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"--mode",
|
"--mode",
|
||||||
@@ -590,14 +745,25 @@ def parse_args() -> argparse.Namespace:
|
|||||||
)
|
)
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"--backend",
|
"--backend",
|
||||||
choices=("whispercpp", "ctranslate2"),
|
choices=("whispercpp", "ctranslate2", "whisperx"),
|
||||||
default="whispercpp",
|
default="whispercpp",
|
||||||
help="Transcription backend (default: whispercpp)",
|
help="Transcription backend (default: whispercpp)",
|
||||||
)
|
)
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"--model",
|
"--model",
|
||||||
default=DEFAULT_MODEL,
|
default=DEFAULT_MODEL,
|
||||||
help="Model name or path. For whispercpp: ggml .bin path/name. For ctranslate2: model name or model directory.",
|
help="Model name or path. For whispercpp: ggml .bin path/name. For ctranslate2/whisperx: model name or model directory.",
|
||||||
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"--task",
|
||||||
|
choices=("transcribe", "translate"),
|
||||||
|
default="transcribe",
|
||||||
|
help="Task to run (default: transcribe).",
|
||||||
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"--language",
|
||||||
|
default=None,
|
||||||
|
help="Source language code/name (for example: en, es, Japanese). Strongly recommended, and required for --task translate.",
|
||||||
)
|
)
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"--duration",
|
"--duration",
|
||||||
@@ -642,6 +808,35 @@ def parse_args() -> argparse.Namespace:
|
|||||||
default=5,
|
default=5,
|
||||||
help="Beam size for decoding (default: 5)",
|
help="Beam size for decoding (default: 5)",
|
||||||
)
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"--whisperx-mode",
|
||||||
|
choices=("basic", "align", "full"),
|
||||||
|
default="align",
|
||||||
|
help="WhisperX pipeline mode: basic (no align), align (aligned transcript), full (native transcript + translate/transcribe + align + VAD + diarization).",
|
||||||
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"--whisperx-vad-method",
|
||||||
|
choices=("silero", "pyannote"),
|
||||||
|
default="silero",
|
||||||
|
help="WhisperX VAD method (default: silero).",
|
||||||
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"--whisperx-hf-token",
|
||||||
|
default=None,
|
||||||
|
help="Optional HuggingFace token for WhisperX diarization models.",
|
||||||
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"--whisperx-min-speakers",
|
||||||
|
type=int,
|
||||||
|
default=None,
|
||||||
|
help="Optional minimum speaker count for WhisperX diarization.",
|
||||||
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"--whisperx-max-speakers",
|
||||||
|
type=int,
|
||||||
|
default=None,
|
||||||
|
help="Optional maximum speaker count for WhisperX diarization.",
|
||||||
|
)
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"--state-dir",
|
"--state-dir",
|
||||||
default=str(DEFAULT_STATE_DIR),
|
default=str(DEFAULT_STATE_DIR),
|
||||||
@@ -673,6 +868,8 @@ def parse_args() -> argparse.Namespace:
|
|||||||
parser.error("Use only one of --start, --stop, or --toggle.")
|
parser.error("Use only one of --start, --stop, or --toggle.")
|
||||||
if legacy_modes:
|
if legacy_modes:
|
||||||
args.mode = legacy_modes[0]
|
args.mode = legacy_modes[0]
|
||||||
|
if args.task == "translate" and not args.language:
|
||||||
|
parser.error("--task translate requires --language so the source language is explicit.")
|
||||||
return args
|
return args
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user