refactor(youtube): extract subtitle generation pipeline

This commit is contained in:
2026-03-08 16:10:56 -07:00
parent 9e46176519
commit 4c0575afe0
11 changed files with 876 additions and 467 deletions

View File

@@ -0,0 +1,84 @@
import fs from 'node:fs';
import path from 'node:path';
import type { Args } from '../types.js';
import { YOUTUBE_AUDIO_EXTENSIONS } from '../types.js';
import { runExternalCommand } from '../util.js';
/**
 * Locate the extracted audio file inside `tempDir`.
 *
 * Scans the directory for regular files whose extension is in
 * YOUTUBE_AUDIO_EXTENSIONS, preferring `preferredExt` (the format yt-dlp was
 * asked for) and, within that, the most recently modified file.
 *
 * Fix: candidates are now sorted by mtime (newest first) BEFORE the
 * preferred-extension pick. Previously, when several files shared the
 * preferred extension, `find` returned whichever came first in directory
 * order instead of the file just produced by the download.
 *
 * @param tempDir      Directory yt-dlp wrote into.
 * @param preferredExt Extension without the leading dot, e.g. "m4a".
 * @returns Absolute path of the best match, or null when nothing qualifies.
 */
export function findAudioFile(tempDir: string, preferredExt: string): string | null {
  const entries = fs.readdirSync(tempDir);
  const audioFiles: Array<{ path: string; ext: string; mtimeMs: number }> = [];
  for (const name of entries) {
    const fullPath = path.join(tempDir, name);
    let stat: fs.Stats;
    try {
      stat = fs.statSync(fullPath);
    } catch {
      // File vanished between readdir and stat; skip it.
      continue;
    }
    if (!stat.isFile()) continue;
    const ext = path.extname(name).toLowerCase();
    if (!YOUTUBE_AUDIO_EXTENSIONS.has(ext)) continue;
    audioFiles.push({ path: fullPath, ext, mtimeMs: stat.mtimeMs });
  }
  if (audioFiles.length === 0) return null;
  // Newest first, so ties within the preferred extension resolve to the most
  // recently written file (i.e. the one this run just downloaded).
  audioFiles.sort((a, b) => b.mtimeMs - a.mtimeMs);
  const wantedExt = `.${preferredExt.toLowerCase()}`;
  const preferred = audioFiles.find((entry) => entry.ext === wantedExt);
  return (preferred ?? audioFiles[0])?.path ?? null;
}
/**
 * Re-encode the downloaded audio into the format whisper consumes here:
 * 16 kHz, mono, 16-bit PCM WAV (per the `-ar 16000 -ac 1 -c:a pcm_s16le`
 * flags below).
 *
 * @param inputPath Source audio file.
 * @param tempDir   Working directory for the intermediate WAV.
 * @returns Path of the converted WAV file.
 * @throws Error when ffmpeg exits without producing the expected file.
 */
export async function convertAudioForWhisper(inputPath: string, tempDir: string): Promise<string> {
  const wavPath = path.join(tempDir, 'whisper-input.wav');
  const ffmpegArgs = [
    '-y',
    '-loglevel', 'error',
    '-i', inputPath,
    '-ar', '16000',
    '-ac', '1',
    '-c:a', 'pcm_s16le',
    wavPath,
  ];
  await runExternalCommand('ffmpeg', ffmpegArgs);
  if (!fs.existsSync(wavPath)) {
    throw new Error(`Failed to prepare whisper audio input: ${wavPath}`);
  }
  return wavPath;
}
/**
 * Download and extract the audio track for `target` via yt-dlp into
 * `tempDir`, then locate the produced file.
 *
 * @param target       YouTube URL or id accepted by yt-dlp.
 * @param args         Resolved CLI/config arguments.
 * @param tempDir      Working directory for yt-dlp output.
 * @param childTracker Optional set used by the caller to track spawned
 *                     child processes (e.g. for cancellation).
 * @returns Path of the extracted audio file.
 * @throws Error when yt-dlp succeeds but no audio file can be found.
 */
export async function downloadYoutubeAudio(
  target: string,
  args: Args,
  tempDir: string,
  childTracker?: Set<ReturnType<typeof import('node:child_process').spawn>>,
): Promise<string> {
  const outputTemplate = path.join(tempDir, '%(id)s.%(ext)s');
  const ytdlpArgs = [
    '-f', 'bestaudio/best',
    '--extract-audio',
    '--audio-format', args.youtubeSubgenAudioFormat,
    '--no-warnings',
    '-o', outputTemplate,
    target,
  ];
  await runExternalCommand(
    'yt-dlp',
    ytdlpArgs,
    { logLevel: args.logLevel, commandLabel: 'yt-dlp:audio', streamOutput: true },
    childTracker,
  );
  const audioPath = findAudioFile(tempDir, args.youtubeSubgenAudioFormat);
  if (!audioPath) {
    throw new Error('Audio extraction succeeded, but no audio file was found.');
  }
  return audioPath;
}

View File

@@ -0,0 +1,99 @@
import fs from 'node:fs';
import path from 'node:path';
import type { SubtitleCandidate } from '../types.js';
import { YOUTUBE_SUB_EXTENSIONS } from '../types.js';
import { escapeRegExp, runExternalCommand } from '../util.js';
/**
 * True when `filenameLower` contains `langCode` as a delimited token —
 * bounded on each side by the string edge or one of `.`, `_`, `-`
 * (e.g. "video.ja.srt" matches "ja"; "javanese.srt" does not).
 */
function filenameHasLanguageTag(filenameLower: string, langCode: string): boolean {
  const tagPattern = new RegExp(`(^|[._-])${escapeRegExp(langCode)}([._-]|$)`);
  return tagPattern.test(filenameLower);
}
/**
 * Classify a subtitle filename as a primary- or secondary-language track.
 * Returns null when the name matches neither set, or ambiguously matches
 * both.
 */
function classifyLanguage(
  filename: string,
  primaryLangCodes: string[],
  secondaryLangCodes: string[],
): 'primary' | 'secondary' | null {
  const lower = filename.toLowerCase();
  const matchesAny = (codes: string[]): boolean =>
    codes.some((code) => filenameHasLanguageTag(lower, code));
  const isPrimary = matchesAny(primaryLangCodes);
  const isSecondary = matchesAny(secondaryLangCodes);
  // Both true (ambiguous) and both false (no match) yield null.
  if (isPrimary === isSecondary) return null;
  return isPrimary ? 'primary' : 'secondary';
}
/**
 * Build a yt-dlp `--sub-langs` pattern from plain language codes,
 * e.g. ["ja", "en"] -> "ja.*,en.*" (each code matches regional variants).
 */
export function toYtdlpLangPattern(langCodes: string[]): string {
  const patterns: string[] = [];
  for (const code of langCodes) {
    patterns.push(`${code}.*`);
  }
  return patterns.join(',');
}
/**
 * Scan `tempDir` for newly appeared subtitle files and classify each by
 * language.
 *
 * Skips paths already present in `knownSet` (claimed by an earlier scan),
 * non-files, extensions outside YOUTUBE_SUB_EXTENSIONS, and names whose
 * language cannot be classified unambiguously.
 *
 * @param source Provenance label stamped onto each returned candidate.
 * @returns Candidates found in this pass (does not mutate `knownSet`).
 */
export function scanSubtitleCandidates(
  tempDir: string,
  knownSet: Set<string>,
  source: SubtitleCandidate['source'],
  primaryLangCodes: string[],
  secondaryLangCodes: string[],
): SubtitleCandidate[] {
  const found: SubtitleCandidate[] = [];
  for (const entry of fs.readdirSync(tempDir)) {
    const candidatePath = path.join(tempDir, entry);
    if (knownSet.has(candidatePath)) continue;
    let info: fs.Stats;
    try {
      info = fs.statSync(candidatePath);
    } catch {
      // Disappeared between readdir and stat.
      continue;
    }
    if (!info.isFile()) continue;
    const extension = path.extname(candidatePath).toLowerCase();
    if (!YOUTUBE_SUB_EXTENSIONS.has(extension)) continue;
    const lang = classifyLanguage(entry, primaryLangCodes, secondaryLangCodes);
    if (lang === null) continue;
    found.push({ path: candidatePath, lang, ext: extension, size: info.size, source });
  }
  return found;
}
/**
 * Choose the best subtitle candidate: `.srt` files beat other formats, and
 * within the same format class the larger file wins. Ties keep the earlier
 * candidate (same result as the original stable sort).
 */
export function pickBestCandidate(candidates: SubtitleCandidate[]): SubtitleCandidate | null {
  let best: SubtitleCandidate | null = null;
  for (const candidate of candidates) {
    if (best === null) {
      best = candidate;
      continue;
    }
    const candidateIsSrt = candidate.ext === '.srt';
    const bestIsSrt = best.ext === '.srt';
    if (candidateIsSrt !== bestIsSrt) {
      if (candidateIsSrt) best = candidate;
      continue;
    }
    if (candidate.size > best.size) best = candidate;
  }
  return best;
}
/**
 * Ask yt-dlp for the video's manually authored subtitle tracks (no media
 * download). The command runs with `allowFailure: true` — missing manual
 * subtitles are an expected outcome, not an error.
 *
 * @param langPattern  yt-dlp `--sub-langs` pattern (see toYtdlpLangPattern).
 * @param childTracker Optional set the caller uses to track spawned children.
 */
export async function downloadManualSubtitles(
  target: string,
  tempDir: string,
  langPattern: string,
  logLevel: import('../types.js').LogLevel,
  childTracker?: Set<ReturnType<typeof import('node:child_process').spawn>>,
): Promise<void> {
  const ytdlpArgs = [
    '--skip-download',
    '--no-warnings',
    '--write-subs',
    '--sub-format', 'srt/vtt/best',
    '--sub-langs', langPattern,
    '-o', path.join(tempDir, '%(id)s.%(ext)s'),
    target,
  ];
  await runExternalCommand(
    'yt-dlp',
    ytdlpArgs,
    { allowFailure: true, logLevel, commandLabel: 'yt-dlp:manual-subs', streamOutput: true },
    childTracker,
  );
}

View File

@@ -0,0 +1,58 @@
import test from 'node:test';
import assert from 'node:assert/strict';
import { planYoutubeSubtitleGeneration } from './orchestrator';
// The four fetch/publish decisions are invariant across all plans; only the
// two whisper generation flags vary by input.
const invariantPlanFields = {
  fetchManualSubtitles: true,
  fetchAutoSubtitles: false,
  publishPrimaryManualSubtitle: false,
  publishSecondaryManualSubtitle: false,
} as const;
const planCases = [
  {
    name: 'planYoutubeSubtitleGeneration prefers manual subtitles and never schedules auto-subs',
    input: { hasPrimaryManualSubtitle: true, hasSecondaryManualSubtitle: false, secondaryCanTranslate: true },
    generatePrimarySubtitle: false,
    generateSecondarySubtitle: true,
  },
  {
    name: 'planYoutubeSubtitleGeneration generates only missing tracks',
    input: { hasPrimaryManualSubtitle: false, hasSecondaryManualSubtitle: true, secondaryCanTranslate: true },
    generatePrimarySubtitle: true,
    generateSecondarySubtitle: false,
  },
  {
    name: 'planYoutubeSubtitleGeneration reuses manual tracks already present on the YouTube video',
    input: { hasPrimaryManualSubtitle: true, hasSecondaryManualSubtitle: true, secondaryCanTranslate: true },
    generatePrimarySubtitle: false,
    generateSecondarySubtitle: false,
  },
];
for (const planCase of planCases) {
  test(planCase.name, () => {
    assert.deepEqual(planYoutubeSubtitleGeneration(planCase.input), {
      ...invariantPlanFields,
      generatePrimarySubtitle: planCase.generatePrimarySubtitle,
      generateSecondarySubtitle: planCase.generateSecondarySubtitle,
    });
  });
}

View File

@@ -0,0 +1,367 @@
import fs from 'node:fs';
import os from 'node:os';
import path from 'node:path';
import type { Args, SubtitleCandidate, YoutubeSubgenOutputs } from '../types.js';
import { log } from '../log.js';
import {
commandExists,
normalizeBasename,
resolvePathMaybe,
runExternalCommand,
uniqueNormalizedLangCodes,
} from '../util.js';
import { state } from '../mpv.js';
import { downloadYoutubeAudio, convertAudioForWhisper } from './audio-extraction.js';
import {
downloadManualSubtitles,
pickBestCandidate,
scanSubtitleCandidates,
toYtdlpLangPattern,
} from './manual-subs.js';
import { runLoggedYoutubePhase } from './progress.js';
import { fixSubtitleWithAi } from './subtitle-fix-ai.js';
import { runWhisper } from './whisper.js';
/**
 * Decisions for one YouTube subtitle-generation run.
 *
 * Manual subtitles are always fetched and auto-generated YouTube subs are
 * never used, so four of the fields are fixed literal types; only the two
 * whisper generation flags vary per video.
 */
export interface YoutubeSubtitleGenerationPlan {
  fetchManualSubtitles: true;
  fetchAutoSubtitles: false;
  publishPrimaryManualSubtitle: false;
  publishSecondaryManualSubtitle: false;
  // Run whisper transcription for the primary-language track.
  generatePrimarySubtitle: boolean;
  // Run whisper in translate mode for the secondary-language track.
  generateSecondarySubtitle: boolean;
}
/**
 * Decide which subtitle tracks need whisper generation.
 *
 * A track is generated only when no manual YouTube subtitle exists for it;
 * the secondary track additionally requires whisper-translate support
 * (`secondaryCanTranslate`).
 */
export function planYoutubeSubtitleGeneration(input: {
  hasPrimaryManualSubtitle: boolean;
  hasSecondaryManualSubtitle: boolean;
  secondaryCanTranslate: boolean;
}): YoutubeSubtitleGenerationPlan {
  const needsPrimary = !input.hasPrimaryManualSubtitle;
  const needsSecondary = !input.hasSecondaryManualSubtitle && input.secondaryCanTranslate;
  return {
    fetchManualSubtitles: true,
    fetchAutoSubtitles: false,
    publishPrimaryManualSubtitle: false,
    publishSecondaryManualSubtitle: false,
    generatePrimarySubtitle: needsPrimary,
    generateSecondarySubtitle: needsSecondary,
  };
}
// First normalized language code, or `fallback` when the list normalizes
// to nothing.
function preferredLangLabel(langCodes: string[], fallback: string): string {
  const [first] = uniqueNormalizedLangCodes(langCodes);
  return first || fallback;
}
// The provenance value is currently used verbatim as the filename tag;
// this indirection is the single place to change that mapping.
function sourceTag(source: SubtitleCandidate['source']): string {
  const tag: string = source;
  return tag;
}
/**
 * Resolve the whisper executable to run: an explicitly configured path
 * wins; otherwise fall back to a `whisper-cli` found on PATH; otherwise
 * null (whisper fallback unavailable).
 */
export function resolveWhisperBinary(args: Args): string | null {
  const configured = args.whisperBin.trim();
  if (configured !== '') return resolvePathMaybe(configured);
  return commandExists('whisper-cli') ? 'whisper-cli' : null;
}
/**
 * Optionally run the AI subtitle-fix pass over `selectedPath`.
 *
 * Skipped entirely unless both `args.youtubeFixWithAi` and
 * `args.aiConfig.enabled` are set, or when the fixer returns no content.
 *
 * @param expectedLanguage Optional language hint forwarded to the fixer.
 * @returns Path of the fixed `.fixed.srt` file, or `selectedPath` unchanged
 *          when the fix was skipped or declined.
 */
async function maybeFixSubtitleWithAi(
  selectedPath: string,
  args: Args,
  expectedLanguage?: string,
): Promise<string> {
  if (!args.youtubeFixWithAi || args.aiConfig.enabled !== true) {
    return selectedPath;
  }
  const fixedContent = await runLoggedYoutubePhase(
    {
      startMessage: `Starting AI subtitle fix: ${path.basename(selectedPath)}`,
      finishMessage: `Finished AI subtitle fix: ${path.basename(selectedPath)}`,
      failureMessage: `AI subtitle fix failed: ${path.basename(selectedPath)}`,
      log: (level, message) => log(level, args.logLevel, message),
    },
    async () => {
      const originalContent = fs.readFileSync(selectedPath, 'utf8');
      return fixSubtitleWithAi(
        originalContent,
        args.aiConfig,
        (message) => {
          log('warn', args.logLevel, message);
        },
        expectedLanguage,
      );
    },
  );
  if (!fixedContent) {
    // Empty result means the fixer declined; keep the original file.
    return selectedPath;
  }
  // Fix: previously, an input without a `.srt` extension made the
  // `.replace(/\.srt$/i, ...)` a no-op, so fixedPath === selectedPath and
  // the ORIGINAL file was silently overwritten. Append the suffix instead.
  const fixedPath = /\.srt$/i.test(selectedPath)
    ? selectedPath.replace(/\.srt$/i, '.fixed.srt')
    : `${selectedPath}.fixed.srt`;
  fs.writeFileSync(fixedPath, fixedContent, 'utf8');
  return fixedPath;
}
/**
 * Generate subtitle tracks for a YouTube target.
 *
 * Pipeline: probe video metadata -> fetch manually authored YouTube
 * subtitles -> plan which tracks are still missing -> optionally fall back
 * to whisper (transcribe for primary, translate for secondary) -> publish
 * finished tracks into the configured output directory.
 *
 * @param target  YouTube URL or id accepted by yt-dlp.
 * @param args    Resolved CLI/config arguments.
 * @param onReady Optional callback invoked as each track is published, with
 *                the track role and the path of the published alias file.
 * @returns Paths/flags describing which tracks were produced or reused.
 * @throws Error when no subtitle track could be produced at all.
 */
export async function generateYoutubeSubtitles(
  target: string,
  args: Args,
  onReady?: (lang: 'primary' | 'secondary', pathToLoad: string) => Promise<void>,
): Promise<YoutubeSubgenOutputs> {
  // Published files land in outDir; all intermediate work happens in a
  // fresh mkdtemp directory removed in the finally block.
  const outDir = path.resolve(resolvePathMaybe(args.youtubeSubgenOutDir));
  fs.mkdirSync(outDir, { recursive: true });
  const primaryLangCodes = uniqueNormalizedLangCodes(args.youtubePrimarySubLangs);
  const secondaryLangCodes = uniqueNormalizedLangCodes(args.youtubeSecondarySubLangs);
  const primaryLabel = preferredLangLabel(primaryLangCodes, 'primary');
  const secondaryLabel = preferredLangLabel(secondaryLangCodes, 'secondary');
  // Whisper's translate mode is only usable when the secondary language is
  // English (the en/eng check below), so other languages get no fallback.
  const secondaryCanUseWhisperTranslate =
    secondaryLangCodes.includes('en') || secondaryLangCodes.includes('eng');
  const manualLangs = toYtdlpLangPattern([...primaryLangCodes, ...secondaryLangCodes]);
  const tempDir = fs.mkdtempSync(path.join(os.tmpdir(), 'subminer-yt-subgen-'));
  const knownFiles = new Set<string>();
  // Flipped to true on any error so the temp dir survives for debugging.
  let keepTemp = args.youtubeSubgenKeepTemp;
  // Copy a finished track into outDir under a source-tagged name plus a
  // plain `<basename>.<lang>.srt` alias, then notify the caller.
  const publishTrack = async (
    lang: 'primary' | 'secondary',
    source: SubtitleCandidate['source'],
    selectedPath: string,
    basename: string,
  ): Promise<string> => {
    const langLabel = lang === 'primary' ? primaryLabel : secondaryLabel;
    const taggedPath = path.join(outDir, `${basename}.${langLabel}.${sourceTag(source)}.srt`);
    const aliasPath = path.join(outDir, `${basename}.${langLabel}.srt`);
    fs.copyFileSync(selectedPath, taggedPath);
    fs.copyFileSync(taggedPath, aliasPath);
    log('info', args.logLevel, `Generated subtitle (${langLabel}, ${source}) -> ${aliasPath}`);
    if (onReady) await onReady(lang, aliasPath);
    return aliasPath;
  };
  try {
    // Phase 1: metadata probe — only the video id is used (for filenames).
    const meta = await runLoggedYoutubePhase(
      {
        startMessage: 'Starting YouTube metadata probe',
        finishMessage: 'Finished YouTube metadata probe',
        failureMessage: 'YouTube metadata probe failed',
        log: (level, message) => log(level, args.logLevel, message),
      },
      () =>
        runExternalCommand(
          'yt-dlp',
          ['--dump-single-json', '--no-warnings', target],
          {
            captureStdout: true,
            logLevel: args.logLevel,
            commandLabel: 'yt-dlp:meta',
          },
          state.youtubeSubgenChildren,
        ),
    );
    const metadata = JSON.parse(meta.stdout) as { id?: string };
    // Timestamp fallback keeps output filenames unique if the id is missing.
    const videoId = metadata.id || `${Date.now()}`;
    const basename = normalizeBasename(videoId, videoId);
    // Phase 2: fetch manually authored subtitle tracks (best-effort).
    await runLoggedYoutubePhase(
      {
        startMessage: `Starting manual subtitle probe (${manualLangs || 'requested langs'})`,
        finishMessage: 'Finished manual subtitle probe',
        failureMessage: 'Manual subtitle probe failed',
        log: (level, message) => log(level, args.logLevel, message),
      },
      () =>
        downloadManualSubtitles(
          target,
          tempDir,
          manualLangs,
          args.logLevel,
          state.youtubeSubgenChildren,
        ),
    );
    const manualSubs = scanSubtitleCandidates(
      tempDir,
      knownFiles,
      'manual',
      primaryLangCodes,
      secondaryLangCodes,
    );
    // Mark found files as claimed so later scans won't re-report them.
    for (const sub of manualSubs) knownFiles.add(sub.path);
    const selectedPrimary = pickBestCandidate(
      manualSubs.filter((entry) => entry.lang === 'primary'),
    );
    const selectedSecondary = pickBestCandidate(
      manualSubs.filter((entry) => entry.lang === 'secondary'),
    );
    // Phase 3: decide which tracks still need whisper generation.
    const plan = planYoutubeSubtitleGeneration({
      hasPrimaryManualSubtitle: Boolean(selectedPrimary),
      hasSecondaryManualSubtitle: Boolean(selectedSecondary),
      secondaryCanTranslate: secondaryCanUseWhisperTranslate,
    });
    // Empty string means "not generated"; manual tracks are not published
    // to outDir — mpv uses YouTube's native track instead.
    let primaryAlias = '';
    let secondaryAlias = '';
    if (selectedPrimary) {
      log(
        'info',
        args.logLevel,
        `Using native YouTube subtitle track for primary (${primaryLabel}); skipping external subtitle copy.`,
      );
    }
    if (selectedSecondary) {
      log(
        'info',
        args.logLevel,
        `Using native YouTube subtitle track for secondary (${secondaryLabel}); skipping external subtitle copy.`,
      );
    }
    if (plan.generatePrimarySubtitle || plan.generateSecondarySubtitle) {
      const whisperBin = resolveWhisperBinary(args);
      const modelPath = args.whisperModel.trim()
        ? path.resolve(resolvePathMaybe(args.whisperModel.trim()))
        : '';
      // Whisper fallback needs both a binary and an existing model file.
      const hasWhisperFallback = !!whisperBin && !!modelPath && fs.existsSync(modelPath);
      if (!hasWhisperFallback) {
        log(
          'warn',
          args.logLevel,
          'Whisper fallback is not configured; continuing with available subtitle tracks.',
        );
      } else {
        // Phase 4: download + convert the audio once, shared by both
        // whisper runs below.
        const audioPath = await runLoggedYoutubePhase(
          {
            startMessage: 'Starting fallback audio extraction for subtitle generation',
            finishMessage: 'Finished fallback audio extraction',
            failureMessage: 'Fallback audio extraction failed',
            log: (level, message) => log(level, args.logLevel, message),
          },
          () =>
            downloadYoutubeAudio(target, args, tempDir, state.youtubeSubgenChildren),
        );
        const whisperAudioPath = await runLoggedYoutubePhase(
          {
            startMessage: 'Starting ffmpeg audio prep for whisper',
            finishMessage: 'Finished ffmpeg audio prep for whisper',
            failureMessage: 'ffmpeg audio prep for whisper failed',
            log: (level, message) => log(level, args.logLevel, message),
          },
          () => convertAudioForWhisper(audioPath, tempDir),
        );
        // Phase 5a: primary track — plain transcription in the source
        // language. Failure is logged but does not abort the run.
        if (plan.generatePrimarySubtitle) {
          try {
            const primaryPrefix = path.join(tempDir, `${basename}.${primaryLabel}`);
            const primarySrt = await runLoggedYoutubePhase(
              {
                startMessage: `Starting whisper primary subtitle generation (${primaryLabel})`,
                finishMessage: `Finished whisper primary subtitle generation (${primaryLabel})`,
                failureMessage: `Whisper primary subtitle generation failed (${primaryLabel})`,
                log: (level, message) => log(level, args.logLevel, message),
              },
              () =>
                runWhisper(whisperBin!, args, {
                  modelPath,
                  audioPath: whisperAudioPath,
                  language: args.youtubeWhisperSourceLanguage,
                  translate: false,
                  outputPrefix: primaryPrefix,
                }),
            );
            const fixedPrimary = await maybeFixSubtitleWithAi(
              primarySrt,
              args,
              args.youtubeWhisperSourceLanguage,
            );
            primaryAlias = await publishTrack(
              'primary',
              // Path equality tells us whether the AI fix produced a new file.
              fixedPrimary === primarySrt ? 'whisper' : 'whisper-fixed',
              fixedPrimary,
              basename,
            );
          } catch (error) {
            log(
              'warn',
              args.logLevel,
              `Failed to generate primary subtitle via whisper fallback: ${(error as Error).message}`,
            );
          }
        }
        // Phase 5b: secondary track — whisper translate mode.
        if (plan.generateSecondarySubtitle) {
          try {
            const secondaryPrefix = path.join(tempDir, `${basename}.${secondaryLabel}`);
            const secondarySrt = await runLoggedYoutubePhase(
              {
                startMessage: `Starting whisper secondary subtitle generation (${secondaryLabel})`,
                finishMessage: `Finished whisper secondary subtitle generation (${secondaryLabel})`,
                failureMessage: `Whisper secondary subtitle generation failed (${secondaryLabel})`,
                log: (level, message) => log(level, args.logLevel, message),
              },
              () =>
                runWhisper(whisperBin!, args, {
                  modelPath,
                  audioPath: whisperAudioPath,
                  language: args.youtubeWhisperSourceLanguage,
                  translate: true,
                  outputPrefix: secondaryPrefix,
                }),
            );
            const fixedSecondary = await maybeFixSubtitleWithAi(secondarySrt, args);
            secondaryAlias = await publishTrack(
              'secondary',
              fixedSecondary === secondarySrt ? 'whisper-translate' : 'whisper-translate-fixed',
              fixedSecondary,
              basename,
            );
          } catch (error) {
            log(
              'warn',
              args.logLevel,
              `Failed to generate secondary subtitle via whisper fallback: ${(error as Error).message}`,
            );
          }
        }
      }
    }
    if (!secondaryCanUseWhisperTranslate && !selectedSecondary) {
      log(
        'warn',
        args.logLevel,
        `Secondary subtitle language (${secondaryLabel}) has no whisper translate fallback; relying on manual subtitles only.`,
      );
    }
    if (!primaryAlias && !secondaryAlias && !selectedPrimary && !selectedSecondary) {
      throw new Error('Failed to generate any subtitle tracks.');
    }
    if ((!primaryAlias && !selectedPrimary) || (!secondaryAlias && !selectedSecondary)) {
      log(
        'warn',
        args.logLevel,
        `Generated partial subtitle result: primary=${primaryAlias || selectedPrimary ? 'ok' : 'missing'}, secondary=${secondaryAlias || selectedSecondary ? 'ok' : 'missing'}`,
      );
    }
    return {
      basename,
      primaryPath: primaryAlias || undefined,
      secondaryPath: secondaryAlias || undefined,
      primaryNative: Boolean(selectedPrimary),
      secondaryNative: Boolean(selectedSecondary),
    };
  } catch (error) {
    // Preserve the work area for post-mortem inspection, then rethrow.
    keepTemp = true;
    throw error;
  } finally {
    if (keepTemp) {
      log('warn', args.logLevel, `Keeping subtitle temp dir: ${tempDir}`);
    } else {
      try {
        fs.rmSync(tempDir, { recursive: true, force: true });
      } catch {
        // ignore cleanup failures
      }
    }
  }
}

View File

@@ -0,0 +1,55 @@
import test from 'node:test';
import assert from 'node:assert/strict';
import { runLoggedYoutubePhase } from './progress';
test('runLoggedYoutubePhase logs start and finish with elapsed time', async () => {
  const captured: Array<{ level: 'info' | 'warn'; message: string }> = [];
  // Fake clock: the phase body advances it by 1.5s before resolving.
  let clock = 1_000;
  const value = await runLoggedYoutubePhase(
    {
      startMessage: 'Starting subtitle probe',
      finishMessage: 'Finished subtitle probe',
      log: (level, message) => captured.push({ level, message }),
      now: () => clock,
    },
    async () => {
      clock = 2_500;
      return 'ok';
    },
  );
  assert.equal(value, 'ok');
  assert.deepEqual(captured, [
    { level: 'info', message: 'Starting subtitle probe' },
    { level: 'info', message: 'Finished subtitle probe (1.5s)' },
  ]);
});
test('runLoggedYoutubePhase logs failure with elapsed time and rethrows', async () => {
  const captured: Array<{ level: 'info' | 'warn'; message: string }> = [];
  // Fake clock: 3.2s elapse before the phase body throws.
  let clock = 5_000;
  await assert.rejects(
    runLoggedYoutubePhase(
      {
        startMessage: 'Starting whisper primary',
        finishMessage: 'Finished whisper primary',
        failureMessage: 'Failed whisper primary',
        log: (level, message) => captured.push({ level, message }),
        now: () => clock,
      },
      async () => {
        clock = 8_200;
        throw new Error('boom');
      },
    ),
    /boom/,
  );
  assert.deepEqual(captured, [
    { level: 'info', message: 'Starting whisper primary' },
    { level: 'warn', message: 'Failed whisper primary after 3.2s: boom' },
  ]);
});

View File

@@ -0,0 +1,33 @@
// Severity levels a logged phase can emit.
type PhaseLogLevel = 'info' | 'warn';
/** Configuration for runLoggedYoutubePhase. */
export interface RunLoggedYoutubePhaseOptions {
  // Logged at 'info' before the phase body runs.
  startMessage: string;
  // Logged at 'info', with elapsed time appended, when the phase resolves.
  finishMessage: string;
  // Logged at 'warn' on failure; falls back to `finishMessage` when omitted.
  failureMessage?: string;
  // Sink receiving every log line produced by the phase wrapper.
  log: (level: PhaseLogLevel, message: string) => void;
  // Millisecond clock override for tests; defaults to Date.now.
  now?: () => number;
}
// Render a millisecond duration as seconds with one decimal ("1.5s");
// negative inputs (e.g. a non-monotonic clock) are clamped to zero.
function formatElapsedMs(elapsedMs: number): string {
  const nonNegative = Math.max(0, elapsedMs);
  return `${(nonNegative / 1000).toFixed(1)}s`;
}
/**
 * Run an async phase while logging its start, its completion (with elapsed
 * seconds), and — on failure — a warn line before rethrowing the error.
 */
export async function runLoggedYoutubePhase<T>(
  options: RunLoggedYoutubePhaseOptions,
  run: () => Promise<T>,
): Promise<T> {
  const clock = options.now ?? Date.now;
  const begin = clock();
  const elapsed = () => formatElapsedMs(clock() - begin);
  options.log('info', options.startMessage);
  try {
    const value = await run();
    options.log('info', `${options.finishMessage} (${elapsed()})`);
    return value;
  } catch (error) {
    const label = options.failureMessage ?? options.finishMessage;
    const reason = error instanceof Error ? error.message : String(error);
    options.log('warn', `${label} after ${elapsed()}: ${reason}`);
    throw error;
  }
}

View File

@@ -0,0 +1,32 @@
import test from 'node:test';
import assert from 'node:assert/strict';
import { parseSrt, stringifySrt } from './srt';
// Note: parseSrt separates cues on blank lines, so the fixture keeps an
// empty line between the two cue blocks.
const SAMPLE_SRT = `1
00:00:01,000 --> 00:00:02,000
こんにちは

2
00:00:02,500 --> 00:00:03,000
世界
`;
test('parseSrt reads cue numbering timing and text', () => {
  const cues = parseSrt(SAMPLE_SRT);
  assert.equal(cues.length, 2);
  const [first, second] = cues;
  assert.equal(first?.start, '00:00:01,000');
  assert.equal(first?.end, '00:00:02,000');
  assert.equal(first?.text, 'こんにちは');
  assert.equal(second?.text, '世界');
});
test('stringifySrt preserves parseable cue structure', () => {
  const singleCue = `1
00:00:01,000 --> 00:00:02,000
こんにちは
`;
  const roundTrip = stringifySrt(parseSrt(singleCue));
  assert.match(roundTrip, /1\n00:00:01,000 --> 00:00:02,000\nこんにちは/);
});

40
launcher/youtube/srt.ts Normal file
View File

@@ -0,0 +1,40 @@
/** One subtitle cue parsed from an SRT file. */
export interface SrtCue {
  // Cue number as written in the source file (stringifySrt renumbers from 1).
  index: number;
  // Start timestamp in SRT form, `HH:MM:SS,mmm`.
  start: string;
  // End timestamp in SRT form, `HH:MM:SS,mmm`.
  end: string;
  // Cue text; may contain embedded newlines for multi-line cues.
  text: string;
}
// Matches an SRT timing line, e.g. `00:00:01,000 --> 00:00:02,000`.
const TIMING_LINE_PATTERN =
  /^(?<start>\d{2}:\d{2}:\d{2},\d{3}) --> (?<end>\d{2}:\d{2}:\d{2},\d{3})$/;
/**
 * Parse SRT content into cues.
 *
 * Accepts `\r\n`, lone `\r`, or `\n` line endings and a leading UTF-8 BOM —
 * all common in subtitle files from the wild. (Previously a BOM made
 * `Number.parseInt` fail on the first cue index, so the whole file threw.)
 * Cues whose text is empty after trimming are dropped.
 *
 * @returns Parsed cues in file order.
 * @throws Error for a block whose first line is not an integer index or
 *         whose second line is not a valid timing line.
 */
export function parseSrt(content: string): SrtCue[] {
  const normalized = content
    .replace(/^\uFEFF/, '')
    .replace(/\r\n?/g, '\n')
    .trim();
  if (!normalized) return [];
  return normalized
    .split(/\n{2,}/)
    .map((block) => {
      const lines = block.split('\n');
      const index = Number.parseInt(lines[0] || '', 10);
      const timingLine = lines[1] || '';
      const timingMatch = TIMING_LINE_PATTERN.exec(timingLine);
      if (!Number.isInteger(index) || !timingMatch?.groups) {
        throw new Error(`Invalid SRT cue block: ${block}`);
      }
      return {
        index,
        start: timingMatch.groups.start!,
        end: timingMatch.groups.end!,
        text: lines.slice(2).join('\n').trim(),
      } satisfies SrtCue;
    })
    .filter((cue) => cue.text.length > 0);
}
/**
 * Serialize cues back to SRT text, renumbering sequentially from 1 and
 * trimming each cue's text (original `index` values are ignored).
 */
export function stringifySrt(cues: SrtCue[]): string {
  const blocks: string[] = [];
  cues.forEach((cue, position) => {
    blocks.push(`${position + 1}\n${cue.start} --> ${cue.end}\n${cue.text.trim()}\n`);
  });
  return blocks.join('\n').trimEnd();
}

View File

@@ -0,0 +1,47 @@
import test from 'node:test';
import assert from 'node:assert/strict';
import { buildWhisperArgs } from './whisper';
// Shared baseline options; individual tests override what they exercise.
const BASE_WHISPER_OPTIONS = {
  modelPath: '/models/ggml-large-v2.bin',
  audioPath: '/tmp/input.wav',
  outputPrefix: '/tmp/output',
  language: 'ja',
  translate: false,
  threads: 8,
};
test('buildWhisperArgs includes threads and optional VAD flags', () => {
  const cliArgs = buildWhisperArgs({ ...BASE_WHISPER_OPTIONS, vadModelPath: '/models/vad.bin' });
  assert.deepEqual(cliArgs, [
    '-m',
    '/models/ggml-large-v2.bin',
    '-f',
    '/tmp/input.wav',
    '--output-srt',
    '--output-file',
    '/tmp/output',
    '--language',
    'ja',
    '--threads',
    '8',
    '-vm',
    '/models/vad.bin',
    '--vad',
  ]);
});
test('buildWhisperArgs includes translate flag when requested', () => {
  const cliArgs = buildWhisperArgs({
    ...BASE_WHISPER_OPTIONS,
    modelPath: '/models/base.bin',
    translate: true,
    threads: 4,
  });
  assert.ok(cliArgs.includes('--translate'));
});

View File

@@ -0,0 +1,60 @@
import fs from 'node:fs';
import type { Args } from '../types.js';
import { runExternalCommand } from '../util.js';
/** Inputs for assembling the whisper CLI argument list. */
export interface BuildWhisperArgsOptions {
  // Model file path, passed via `-m`.
  modelPath: string;
  // Input audio file, passed via `-f`.
  audioPath: string;
  // Output path prefix passed via `--output-file`; whisper appends `.srt`
  // (see runWhisper, which expects `${outputPrefix}.srt`).
  outputPrefix: string;
  // Source language, passed via `--language`.
  language: string;
  // When true, adds `--translate` (whisper's translate mode).
  translate: boolean;
  // Worker thread count, passed via `--threads`.
  threads: number;
  // Optional VAD model path; when set, adds `-vm <path> --vad`.
  vadModelPath?: string;
}
/**
 * Assemble the whisper CLI argument list from structured options.
 * Flag order matches the previous implementation exactly.
 */
export function buildWhisperArgs(options: BuildWhisperArgsOptions): string[] {
  const cliArgs: string[] = ['-m', options.modelPath, '-f', options.audioPath];
  cliArgs.push('--output-srt', '--output-file', options.outputPrefix);
  cliArgs.push('--language', options.language);
  cliArgs.push('--threads', String(options.threads));
  if (options.translate) {
    cliArgs.push('--translate');
  }
  if (options.vadModelPath) {
    cliArgs.push('-vm', options.vadModelPath, '--vad');
  }
  return cliArgs;
}
/**
 * Run the whisper binary over a prepared audio file and return the path of
 * the generated `.srt` (whisper writes `${outputPrefix}.srt`).
 *
 * The optional VAD model is used only when the configured path exists on
 * disk. `logLevel` is now forwarded to runExternalCommand for consistency
 * with every other call site that passes options (yt-dlp phases).
 *
 * @throws Error when whisper exits without producing the expected output.
 */
export async function runWhisper(
  whisperBin: string,
  args: Args,
  options: Omit<BuildWhisperArgsOptions, 'threads' | 'vadModelPath'>,
): Promise<string> {
  // Hoist the trimmed path so it is computed once instead of three times.
  const vadModelCandidate = args.whisperVadModel.trim();
  const vadModelPath =
    vadModelCandidate && fs.existsSync(vadModelCandidate) ? vadModelCandidate : undefined;
  const whisperArgs = buildWhisperArgs({
    ...options,
    threads: args.whisperThreads,
    vadModelPath,
  });
  await runExternalCommand(whisperBin, whisperArgs, {
    logLevel: args.logLevel,
    commandLabel: 'whisper',
    streamOutput: true,
  });
  const outputPath = `${options.outputPrefix}.srt`;
  if (!fs.existsSync(outputPath)) {
    throw new Error(`whisper output not found: ${outputPath}`);
  }
  return outputPath;
}