refactor(youtube): extract subtitle generation pipeline

This commit is contained in:
2026-03-08 16:10:56 -07:00
parent 9e46176519
commit 4c0575afe0
11 changed files with 876 additions and 467 deletions

View File

@@ -0,0 +1,84 @@
import fs from 'node:fs';
import path from 'node:path';
import type { Args } from '../types.js';
import { YOUTUBE_AUDIO_EXTENSIONS } from '../types.js';
import { runExternalCommand } from '../util.js';
/**
 * Locate the extracted audio file inside `tempDir`.
 *
 * Scans the directory for regular files whose extension is in
 * YOUTUBE_AUDIO_EXTENSIONS, preferring `preferredExt` (the format yt-dlp was
 * asked for) and, within that, the most recently modified file.
 *
 * Fix: candidates are now sorted by mtime (newest first) BEFORE the
 * preferred-extension pick. Previously, when several files shared the
 * preferred extension, `find` returned whichever came first in directory
 * order instead of the file just produced by the download.
 *
 * @param tempDir      Directory yt-dlp wrote into.
 * @param preferredExt Extension without the leading dot, e.g. "m4a".
 * @returns Absolute path of the best match, or null when nothing qualifies.
 */
export function findAudioFile(tempDir: string, preferredExt: string): string | null {
  const entries = fs.readdirSync(tempDir);
  const audioFiles: Array<{ path: string; ext: string; mtimeMs: number }> = [];
  for (const name of entries) {
    const fullPath = path.join(tempDir, name);
    let stat: fs.Stats;
    try {
      stat = fs.statSync(fullPath);
    } catch {
      // File vanished between readdir and stat; skip it.
      continue;
    }
    if (!stat.isFile()) continue;
    const ext = path.extname(name).toLowerCase();
    if (!YOUTUBE_AUDIO_EXTENSIONS.has(ext)) continue;
    audioFiles.push({ path: fullPath, ext, mtimeMs: stat.mtimeMs });
  }
  if (audioFiles.length === 0) return null;
  // Newest first, so ties within the preferred extension resolve to the most
  // recently written file (i.e. the one this run just downloaded).
  audioFiles.sort((a, b) => b.mtimeMs - a.mtimeMs);
  const wantedExt = `.${preferredExt.toLowerCase()}`;
  const preferred = audioFiles.find((entry) => entry.ext === wantedExt);
  return (preferred ?? audioFiles[0])?.path ?? null;
}
/**
 * Re-encode the downloaded audio into the format whisper consumes here:
 * 16 kHz, mono, 16-bit PCM WAV (per the `-ar 16000 -ac 1 -c:a pcm_s16le`
 * flags below).
 *
 * @param inputPath Source audio file.
 * @param tempDir   Working directory for the intermediate WAV.
 * @returns Path of the converted WAV file.
 * @throws Error when ffmpeg exits without producing the expected file.
 */
export async function convertAudioForWhisper(inputPath: string, tempDir: string): Promise<string> {
  const wavPath = path.join(tempDir, 'whisper-input.wav');
  const ffmpegArgs = [
    '-y',
    '-loglevel', 'error',
    '-i', inputPath,
    '-ar', '16000',
    '-ac', '1',
    '-c:a', 'pcm_s16le',
    wavPath,
  ];
  await runExternalCommand('ffmpeg', ffmpegArgs);
  if (!fs.existsSync(wavPath)) {
    throw new Error(`Failed to prepare whisper audio input: ${wavPath}`);
  }
  return wavPath;
}
/**
 * Download and extract the audio track for `target` via yt-dlp into
 * `tempDir`, then locate the produced file.
 *
 * @param target       YouTube URL or id accepted by yt-dlp.
 * @param args         Resolved CLI/config arguments.
 * @param tempDir      Working directory for yt-dlp output.
 * @param childTracker Optional set used by the caller to track spawned
 *                     child processes (e.g. for cancellation).
 * @returns Path of the extracted audio file.
 * @throws Error when yt-dlp succeeds but no audio file can be found.
 */
export async function downloadYoutubeAudio(
  target: string,
  args: Args,
  tempDir: string,
  childTracker?: Set<ReturnType<typeof import('node:child_process').spawn>>,
): Promise<string> {
  const outputTemplate = path.join(tempDir, '%(id)s.%(ext)s');
  const ytdlpArgs = [
    '-f', 'bestaudio/best',
    '--extract-audio',
    '--audio-format', args.youtubeSubgenAudioFormat,
    '--no-warnings',
    '-o', outputTemplate,
    target,
  ];
  await runExternalCommand(
    'yt-dlp',
    ytdlpArgs,
    { logLevel: args.logLevel, commandLabel: 'yt-dlp:audio', streamOutput: true },
    childTracker,
  );
  const audioPath = findAudioFile(tempDir, args.youtubeSubgenAudioFormat);
  if (!audioPath) {
    throw new Error('Audio extraction succeeded, but no audio file was found.');
  }
  return audioPath;
}

View File

@@ -0,0 +1,99 @@
import fs from 'node:fs';
import path from 'node:path';
import type { SubtitleCandidate } from '../types.js';
import { YOUTUBE_SUB_EXTENSIONS } from '../types.js';
import { escapeRegExp, runExternalCommand } from '../util.js';
/**
 * True when `filenameLower` contains `langCode` as a delimited token —
 * bounded on each side by the string edge or one of `.`, `_`, `-`
 * (e.g. "video.ja.srt" matches "ja"; "javanese.srt" does not).
 */
function filenameHasLanguageTag(filenameLower: string, langCode: string): boolean {
  const tagPattern = new RegExp(`(^|[._-])${escapeRegExp(langCode)}([._-]|$)`);
  return tagPattern.test(filenameLower);
}
/**
 * Classify a subtitle filename as a primary- or secondary-language track.
 * Returns null when the name matches neither set, or ambiguously matches
 * both.
 */
function classifyLanguage(
  filename: string,
  primaryLangCodes: string[],
  secondaryLangCodes: string[],
): 'primary' | 'secondary' | null {
  const lower = filename.toLowerCase();
  const matchesAny = (codes: string[]): boolean =>
    codes.some((code) => filenameHasLanguageTag(lower, code));
  const isPrimary = matchesAny(primaryLangCodes);
  const isSecondary = matchesAny(secondaryLangCodes);
  // Both true (ambiguous) and both false (no match) yield null.
  if (isPrimary === isSecondary) return null;
  return isPrimary ? 'primary' : 'secondary';
}
/**
 * Build a yt-dlp `--sub-langs` pattern from plain language codes,
 * e.g. ["ja", "en"] -> "ja.*,en.*" (each code matches regional variants).
 */
export function toYtdlpLangPattern(langCodes: string[]): string {
  const patterns: string[] = [];
  for (const code of langCodes) {
    patterns.push(`${code}.*`);
  }
  return patterns.join(',');
}
/**
 * Scan `tempDir` for newly appeared subtitle files and classify each by
 * language.
 *
 * Skips paths already present in `knownSet` (claimed by an earlier scan),
 * non-files, extensions outside YOUTUBE_SUB_EXTENSIONS, and names whose
 * language cannot be classified unambiguously.
 *
 * @param source Provenance label stamped onto each returned candidate.
 * @returns Candidates found in this pass (does not mutate `knownSet`).
 */
export function scanSubtitleCandidates(
  tempDir: string,
  knownSet: Set<string>,
  source: SubtitleCandidate['source'],
  primaryLangCodes: string[],
  secondaryLangCodes: string[],
): SubtitleCandidate[] {
  const found: SubtitleCandidate[] = [];
  for (const entry of fs.readdirSync(tempDir)) {
    const candidatePath = path.join(tempDir, entry);
    if (knownSet.has(candidatePath)) continue;
    let info: fs.Stats;
    try {
      info = fs.statSync(candidatePath);
    } catch {
      // Disappeared between readdir and stat.
      continue;
    }
    if (!info.isFile()) continue;
    const extension = path.extname(candidatePath).toLowerCase();
    if (!YOUTUBE_SUB_EXTENSIONS.has(extension)) continue;
    const lang = classifyLanguage(entry, primaryLangCodes, secondaryLangCodes);
    if (lang === null) continue;
    found.push({ path: candidatePath, lang, ext: extension, size: info.size, source });
  }
  return found;
}
/**
 * Choose the best subtitle candidate: `.srt` files beat other formats, and
 * within the same format class the larger file wins. Ties keep the earlier
 * candidate (same result as the original stable sort).
 */
export function pickBestCandidate(candidates: SubtitleCandidate[]): SubtitleCandidate | null {
  let best: SubtitleCandidate | null = null;
  for (const candidate of candidates) {
    if (best === null) {
      best = candidate;
      continue;
    }
    const candidateIsSrt = candidate.ext === '.srt';
    const bestIsSrt = best.ext === '.srt';
    if (candidateIsSrt !== bestIsSrt) {
      if (candidateIsSrt) best = candidate;
      continue;
    }
    if (candidate.size > best.size) best = candidate;
  }
  return best;
}
/**
 * Ask yt-dlp for the video's manually authored subtitle tracks (no media
 * download). The command runs with `allowFailure: true` — missing manual
 * subtitles are an expected outcome, not an error.
 *
 * @param langPattern  yt-dlp `--sub-langs` pattern (see toYtdlpLangPattern).
 * @param childTracker Optional set the caller uses to track spawned children.
 */
export async function downloadManualSubtitles(
  target: string,
  tempDir: string,
  langPattern: string,
  logLevel: import('../types.js').LogLevel,
  childTracker?: Set<ReturnType<typeof import('node:child_process').spawn>>,
): Promise<void> {
  const ytdlpArgs = [
    '--skip-download',
    '--no-warnings',
    '--write-subs',
    '--sub-format', 'srt/vtt/best',
    '--sub-langs', langPattern,
    '-o', path.join(tempDir, '%(id)s.%(ext)s'),
    target,
  ];
  await runExternalCommand(
    'yt-dlp',
    ytdlpArgs,
    { allowFailure: true, logLevel, commandLabel: 'yt-dlp:manual-subs', streamOutput: true },
    childTracker,
  );
}

View File

@@ -0,0 +1,58 @@
import test from 'node:test';
import assert from 'node:assert/strict';
import { planYoutubeSubtitleGeneration } from './orchestrator';
// The four fetch/publish decisions are invariant across all plans; only the
// two whisper generation flags vary by input.
const invariantPlanFields = {
  fetchManualSubtitles: true,
  fetchAutoSubtitles: false,
  publishPrimaryManualSubtitle: false,
  publishSecondaryManualSubtitle: false,
} as const;
const planCases = [
  {
    name: 'planYoutubeSubtitleGeneration prefers manual subtitles and never schedules auto-subs',
    input: { hasPrimaryManualSubtitle: true, hasSecondaryManualSubtitle: false, secondaryCanTranslate: true },
    generatePrimarySubtitle: false,
    generateSecondarySubtitle: true,
  },
  {
    name: 'planYoutubeSubtitleGeneration generates only missing tracks',
    input: { hasPrimaryManualSubtitle: false, hasSecondaryManualSubtitle: true, secondaryCanTranslate: true },
    generatePrimarySubtitle: true,
    generateSecondarySubtitle: false,
  },
  {
    name: 'planYoutubeSubtitleGeneration reuses manual tracks already present on the YouTube video',
    input: { hasPrimaryManualSubtitle: true, hasSecondaryManualSubtitle: true, secondaryCanTranslate: true },
    generatePrimarySubtitle: false,
    generateSecondarySubtitle: false,
  },
];
for (const planCase of planCases) {
  test(planCase.name, () => {
    assert.deepEqual(planYoutubeSubtitleGeneration(planCase.input), {
      ...invariantPlanFields,
      generatePrimarySubtitle: planCase.generatePrimarySubtitle,
      generateSecondarySubtitle: planCase.generateSecondarySubtitle,
    });
  });
}

View File

@@ -0,0 +1,367 @@
import fs from 'node:fs';
import os from 'node:os';
import path from 'node:path';
import type { Args, SubtitleCandidate, YoutubeSubgenOutputs } from '../types.js';
import { log } from '../log.js';
import {
commandExists,
normalizeBasename,
resolvePathMaybe,
runExternalCommand,
uniqueNormalizedLangCodes,
} from '../util.js';
import { state } from '../mpv.js';
import { downloadYoutubeAudio, convertAudioForWhisper } from './audio-extraction.js';
import {
downloadManualSubtitles,
pickBestCandidate,
scanSubtitleCandidates,
toYtdlpLangPattern,
} from './manual-subs.js';
import { runLoggedYoutubePhase } from './progress.js';
import { fixSubtitleWithAi } from './subtitle-fix-ai.js';
import { runWhisper } from './whisper.js';
/**
 * Decisions for one YouTube subtitle-generation run.
 *
 * Manual subtitles are always fetched and auto-generated YouTube subs are
 * never used, so four of the fields are fixed literal types; only the two
 * whisper generation flags vary per video.
 */
export interface YoutubeSubtitleGenerationPlan {
  fetchManualSubtitles: true;
  fetchAutoSubtitles: false;
  publishPrimaryManualSubtitle: false;
  publishSecondaryManualSubtitle: false;
  // Run whisper transcription for the primary-language track.
  generatePrimarySubtitle: boolean;
  // Run whisper in translate mode for the secondary-language track.
  generateSecondarySubtitle: boolean;
}
/**
 * Decide which subtitle tracks need whisper generation.
 *
 * A track is generated only when no manual YouTube subtitle exists for it;
 * the secondary track additionally requires whisper-translate support
 * (`secondaryCanTranslate`).
 */
export function planYoutubeSubtitleGeneration(input: {
  hasPrimaryManualSubtitle: boolean;
  hasSecondaryManualSubtitle: boolean;
  secondaryCanTranslate: boolean;
}): YoutubeSubtitleGenerationPlan {
  const needsPrimary = !input.hasPrimaryManualSubtitle;
  const needsSecondary = !input.hasSecondaryManualSubtitle && input.secondaryCanTranslate;
  return {
    fetchManualSubtitles: true,
    fetchAutoSubtitles: false,
    publishPrimaryManualSubtitle: false,
    publishSecondaryManualSubtitle: false,
    generatePrimarySubtitle: needsPrimary,
    generateSecondarySubtitle: needsSecondary,
  };
}
// First normalized language code, or `fallback` when the list normalizes
// to nothing.
function preferredLangLabel(langCodes: string[], fallback: string): string {
  const [first] = uniqueNormalizedLangCodes(langCodes);
  return first || fallback;
}
// The provenance value is currently used verbatim as the filename tag;
// this indirection is the single place to change that mapping.
function sourceTag(source: SubtitleCandidate['source']): string {
  const tag: string = source;
  return tag;
}
/**
 * Resolve the whisper executable to run: an explicitly configured path
 * wins; otherwise fall back to a `whisper-cli` found on PATH; otherwise
 * null (whisper fallback unavailable).
 */
export function resolveWhisperBinary(args: Args): string | null {
  const configured = args.whisperBin.trim();
  if (configured !== '') return resolvePathMaybe(configured);
  return commandExists('whisper-cli') ? 'whisper-cli' : null;
}
/**
 * Optionally run the AI subtitle-fix pass over `selectedPath`.
 *
 * Skipped entirely unless both `args.youtubeFixWithAi` and
 * `args.aiConfig.enabled` are set, or when the fixer returns no content.
 *
 * @param expectedLanguage Optional language hint forwarded to the fixer.
 * @returns Path of the fixed `.fixed.srt` file, or `selectedPath` unchanged
 *          when the fix was skipped or declined.
 */
async function maybeFixSubtitleWithAi(
  selectedPath: string,
  args: Args,
  expectedLanguage?: string,
): Promise<string> {
  if (!args.youtubeFixWithAi || args.aiConfig.enabled !== true) {
    return selectedPath;
  }
  const fixedContent = await runLoggedYoutubePhase(
    {
      startMessage: `Starting AI subtitle fix: ${path.basename(selectedPath)}`,
      finishMessage: `Finished AI subtitle fix: ${path.basename(selectedPath)}`,
      failureMessage: `AI subtitle fix failed: ${path.basename(selectedPath)}`,
      log: (level, message) => log(level, args.logLevel, message),
    },
    async () => {
      const originalContent = fs.readFileSync(selectedPath, 'utf8');
      return fixSubtitleWithAi(
        originalContent,
        args.aiConfig,
        (message) => {
          log('warn', args.logLevel, message);
        },
        expectedLanguage,
      );
    },
  );
  if (!fixedContent) {
    // Empty result means the fixer declined; keep the original file.
    return selectedPath;
  }
  // Fix: previously, an input without a `.srt` extension made the
  // `.replace(/\.srt$/i, ...)` a no-op, so fixedPath === selectedPath and
  // the ORIGINAL file was silently overwritten. Append the suffix instead.
  const fixedPath = /\.srt$/i.test(selectedPath)
    ? selectedPath.replace(/\.srt$/i, '.fixed.srt')
    : `${selectedPath}.fixed.srt`;
  fs.writeFileSync(fixedPath, fixedContent, 'utf8');
  return fixedPath;
}
/**
 * Generate subtitle tracks for a YouTube target.
 *
 * Pipeline: probe video metadata -> fetch manually authored YouTube
 * subtitles -> plan which tracks are still missing -> optionally fall back
 * to whisper (transcribe for primary, translate for secondary) -> publish
 * finished tracks into the configured output directory.
 *
 * @param target  YouTube URL or id accepted by yt-dlp.
 * @param args    Resolved CLI/config arguments.
 * @param onReady Optional callback invoked as each track is published, with
 *                the track role and the path of the published alias file.
 * @returns Paths/flags describing which tracks were produced or reused.
 * @throws Error when no subtitle track could be produced at all.
 */
export async function generateYoutubeSubtitles(
  target: string,
  args: Args,
  onReady?: (lang: 'primary' | 'secondary', pathToLoad: string) => Promise<void>,
): Promise<YoutubeSubgenOutputs> {
  // Published files land in outDir; all intermediate work happens in a
  // fresh mkdtemp directory removed in the finally block.
  const outDir = path.resolve(resolvePathMaybe(args.youtubeSubgenOutDir));
  fs.mkdirSync(outDir, { recursive: true });
  const primaryLangCodes = uniqueNormalizedLangCodes(args.youtubePrimarySubLangs);
  const secondaryLangCodes = uniqueNormalizedLangCodes(args.youtubeSecondarySubLangs);
  const primaryLabel = preferredLangLabel(primaryLangCodes, 'primary');
  const secondaryLabel = preferredLangLabel(secondaryLangCodes, 'secondary');
  // Whisper's translate mode is only usable when the secondary language is
  // English (the en/eng check below), so other languages get no fallback.
  const secondaryCanUseWhisperTranslate =
    secondaryLangCodes.includes('en') || secondaryLangCodes.includes('eng');
  const manualLangs = toYtdlpLangPattern([...primaryLangCodes, ...secondaryLangCodes]);
  const tempDir = fs.mkdtempSync(path.join(os.tmpdir(), 'subminer-yt-subgen-'));
  const knownFiles = new Set<string>();
  // Flipped to true on any error so the temp dir survives for debugging.
  let keepTemp = args.youtubeSubgenKeepTemp;
  // Copy a finished track into outDir under a source-tagged name plus a
  // plain `<basename>.<lang>.srt` alias, then notify the caller.
  const publishTrack = async (
    lang: 'primary' | 'secondary',
    source: SubtitleCandidate['source'],
    selectedPath: string,
    basename: string,
  ): Promise<string> => {
    const langLabel = lang === 'primary' ? primaryLabel : secondaryLabel;
    const taggedPath = path.join(outDir, `${basename}.${langLabel}.${sourceTag(source)}.srt`);
    const aliasPath = path.join(outDir, `${basename}.${langLabel}.srt`);
    fs.copyFileSync(selectedPath, taggedPath);
    fs.copyFileSync(taggedPath, aliasPath);
    log('info', args.logLevel, `Generated subtitle (${langLabel}, ${source}) -> ${aliasPath}`);
    if (onReady) await onReady(lang, aliasPath);
    return aliasPath;
  };
  try {
    // Phase 1: metadata probe — only the video id is used (for filenames).
    const meta = await runLoggedYoutubePhase(
      {
        startMessage: 'Starting YouTube metadata probe',
        finishMessage: 'Finished YouTube metadata probe',
        failureMessage: 'YouTube metadata probe failed',
        log: (level, message) => log(level, args.logLevel, message),
      },
      () =>
        runExternalCommand(
          'yt-dlp',
          ['--dump-single-json', '--no-warnings', target],
          {
            captureStdout: true,
            logLevel: args.logLevel,
            commandLabel: 'yt-dlp:meta',
          },
          state.youtubeSubgenChildren,
        ),
    );
    const metadata = JSON.parse(meta.stdout) as { id?: string };
    // Timestamp fallback keeps output filenames unique if the id is missing.
    const videoId = metadata.id || `${Date.now()}`;
    const basename = normalizeBasename(videoId, videoId);
    // Phase 2: fetch manually authored subtitle tracks (best-effort).
    await runLoggedYoutubePhase(
      {
        startMessage: `Starting manual subtitle probe (${manualLangs || 'requested langs'})`,
        finishMessage: 'Finished manual subtitle probe',
        failureMessage: 'Manual subtitle probe failed',
        log: (level, message) => log(level, args.logLevel, message),
      },
      () =>
        downloadManualSubtitles(
          target,
          tempDir,
          manualLangs,
          args.logLevel,
          state.youtubeSubgenChildren,
        ),
    );
    const manualSubs = scanSubtitleCandidates(
      tempDir,
      knownFiles,
      'manual',
      primaryLangCodes,
      secondaryLangCodes,
    );
    // Mark found files as claimed so later scans won't re-report them.
    for (const sub of manualSubs) knownFiles.add(sub.path);
    const selectedPrimary = pickBestCandidate(
      manualSubs.filter((entry) => entry.lang === 'primary'),
    );
    const selectedSecondary = pickBestCandidate(
      manualSubs.filter((entry) => entry.lang === 'secondary'),
    );
    // Phase 3: decide which tracks still need whisper generation.
    const plan = planYoutubeSubtitleGeneration({
      hasPrimaryManualSubtitle: Boolean(selectedPrimary),
      hasSecondaryManualSubtitle: Boolean(selectedSecondary),
      secondaryCanTranslate: secondaryCanUseWhisperTranslate,
    });
    // Empty string means "not generated"; manual tracks are not published
    // to outDir — mpv uses YouTube's native track instead.
    let primaryAlias = '';
    let secondaryAlias = '';
    if (selectedPrimary) {
      log(
        'info',
        args.logLevel,
        `Using native YouTube subtitle track for primary (${primaryLabel}); skipping external subtitle copy.`,
      );
    }
    if (selectedSecondary) {
      log(
        'info',
        args.logLevel,
        `Using native YouTube subtitle track for secondary (${secondaryLabel}); skipping external subtitle copy.`,
      );
    }
    if (plan.generatePrimarySubtitle || plan.generateSecondarySubtitle) {
      const whisperBin = resolveWhisperBinary(args);
      const modelPath = args.whisperModel.trim()
        ? path.resolve(resolvePathMaybe(args.whisperModel.trim()))
        : '';
      // Whisper fallback needs both a binary and an existing model file.
      const hasWhisperFallback = !!whisperBin && !!modelPath && fs.existsSync(modelPath);
      if (!hasWhisperFallback) {
        log(
          'warn',
          args.logLevel,
          'Whisper fallback is not configured; continuing with available subtitle tracks.',
        );
      } else {
        // Phase 4: download + convert the audio once, shared by both
        // whisper runs below.
        const audioPath = await runLoggedYoutubePhase(
          {
            startMessage: 'Starting fallback audio extraction for subtitle generation',
            finishMessage: 'Finished fallback audio extraction',
            failureMessage: 'Fallback audio extraction failed',
            log: (level, message) => log(level, args.logLevel, message),
          },
          () =>
            downloadYoutubeAudio(target, args, tempDir, state.youtubeSubgenChildren),
        );
        const whisperAudioPath = await runLoggedYoutubePhase(
          {
            startMessage: 'Starting ffmpeg audio prep for whisper',
            finishMessage: 'Finished ffmpeg audio prep for whisper',
            failureMessage: 'ffmpeg audio prep for whisper failed',
            log: (level, message) => log(level, args.logLevel, message),
          },
          () => convertAudioForWhisper(audioPath, tempDir),
        );
        // Phase 5a: primary track — plain transcription in the source
        // language. Failure is logged but does not abort the run.
        if (plan.generatePrimarySubtitle) {
          try {
            const primaryPrefix = path.join(tempDir, `${basename}.${primaryLabel}`);
            const primarySrt = await runLoggedYoutubePhase(
              {
                startMessage: `Starting whisper primary subtitle generation (${primaryLabel})`,
                finishMessage: `Finished whisper primary subtitle generation (${primaryLabel})`,
                failureMessage: `Whisper primary subtitle generation failed (${primaryLabel})`,
                log: (level, message) => log(level, args.logLevel, message),
              },
              () =>
                runWhisper(whisperBin!, args, {
                  modelPath,
                  audioPath: whisperAudioPath,
                  language: args.youtubeWhisperSourceLanguage,
                  translate: false,
                  outputPrefix: primaryPrefix,
                }),
            );
            const fixedPrimary = await maybeFixSubtitleWithAi(
              primarySrt,
              args,
              args.youtubeWhisperSourceLanguage,
            );
            primaryAlias = await publishTrack(
              'primary',
              // Path equality tells us whether the AI fix produced a new file.
              fixedPrimary === primarySrt ? 'whisper' : 'whisper-fixed',
              fixedPrimary,
              basename,
            );
          } catch (error) {
            log(
              'warn',
              args.logLevel,
              `Failed to generate primary subtitle via whisper fallback: ${(error as Error).message}`,
            );
          }
        }
        // Phase 5b: secondary track — whisper translate mode.
        if (plan.generateSecondarySubtitle) {
          try {
            const secondaryPrefix = path.join(tempDir, `${basename}.${secondaryLabel}`);
            const secondarySrt = await runLoggedYoutubePhase(
              {
                startMessage: `Starting whisper secondary subtitle generation (${secondaryLabel})`,
                finishMessage: `Finished whisper secondary subtitle generation (${secondaryLabel})`,
                failureMessage: `Whisper secondary subtitle generation failed (${secondaryLabel})`,
                log: (level, message) => log(level, args.logLevel, message),
              },
              () =>
                runWhisper(whisperBin!, args, {
                  modelPath,
                  audioPath: whisperAudioPath,
                  language: args.youtubeWhisperSourceLanguage,
                  translate: true,
                  outputPrefix: secondaryPrefix,
                }),
            );
            const fixedSecondary = await maybeFixSubtitleWithAi(secondarySrt, args);
            secondaryAlias = await publishTrack(
              'secondary',
              fixedSecondary === secondarySrt ? 'whisper-translate' : 'whisper-translate-fixed',
              fixedSecondary,
              basename,
            );
          } catch (error) {
            log(
              'warn',
              args.logLevel,
              `Failed to generate secondary subtitle via whisper fallback: ${(error as Error).message}`,
            );
          }
        }
      }
    }
    if (!secondaryCanUseWhisperTranslate && !selectedSecondary) {
      log(
        'warn',
        args.logLevel,
        `Secondary subtitle language (${secondaryLabel}) has no whisper translate fallback; relying on manual subtitles only.`,
      );
    }
    if (!primaryAlias && !secondaryAlias && !selectedPrimary && !selectedSecondary) {
      throw new Error('Failed to generate any subtitle tracks.');
    }
    if ((!primaryAlias && !selectedPrimary) || (!secondaryAlias && !selectedSecondary)) {
      log(
        'warn',
        args.logLevel,
        `Generated partial subtitle result: primary=${primaryAlias || selectedPrimary ? 'ok' : 'missing'}, secondary=${secondaryAlias || selectedSecondary ? 'ok' : 'missing'}`,
      );
    }
    return {
      basename,
      primaryPath: primaryAlias || undefined,
      secondaryPath: secondaryAlias || undefined,
      primaryNative: Boolean(selectedPrimary),
      secondaryNative: Boolean(selectedSecondary),
    };
  } catch (error) {
    // Preserve the work area for post-mortem inspection, then rethrow.
    keepTemp = true;
    throw error;
  } finally {
    if (keepTemp) {
      log('warn', args.logLevel, `Keeping subtitle temp dir: ${tempDir}`);
    } else {
      try {
        fs.rmSync(tempDir, { recursive: true, force: true });
      } catch {
        // ignore cleanup failures
      }
    }
  }
}

View File

@@ -0,0 +1,55 @@
import test from 'node:test';
import assert from 'node:assert/strict';
import { runLoggedYoutubePhase } from './progress';
test('runLoggedYoutubePhase logs start and finish with elapsed time', async () => {
  const captured: Array<{ level: 'info' | 'warn'; message: string }> = [];
  // Fake clock: the phase body advances it by 1.5s before resolving.
  let clock = 1_000;
  const value = await runLoggedYoutubePhase(
    {
      startMessage: 'Starting subtitle probe',
      finishMessage: 'Finished subtitle probe',
      log: (level, message) => captured.push({ level, message }),
      now: () => clock,
    },
    async () => {
      clock = 2_500;
      return 'ok';
    },
  );
  assert.equal(value, 'ok');
  assert.deepEqual(captured, [
    { level: 'info', message: 'Starting subtitle probe' },
    { level: 'info', message: 'Finished subtitle probe (1.5s)' },
  ]);
});
test('runLoggedYoutubePhase logs failure with elapsed time and rethrows', async () => {
  const captured: Array<{ level: 'info' | 'warn'; message: string }> = [];
  // Fake clock: 3.2s elapse before the phase body throws.
  let clock = 5_000;
  await assert.rejects(
    runLoggedYoutubePhase(
      {
        startMessage: 'Starting whisper primary',
        finishMessage: 'Finished whisper primary',
        failureMessage: 'Failed whisper primary',
        log: (level, message) => captured.push({ level, message }),
        now: () => clock,
      },
      async () => {
        clock = 8_200;
        throw new Error('boom');
      },
    ),
    /boom/,
  );
  assert.deepEqual(captured, [
    { level: 'info', message: 'Starting whisper primary' },
    { level: 'warn', message: 'Failed whisper primary after 3.2s: boom' },
  ]);
});

View File

@@ -0,0 +1,33 @@
// Severity levels a logged phase can emit.
type PhaseLogLevel = 'info' | 'warn';
/** Configuration for runLoggedYoutubePhase. */
export interface RunLoggedYoutubePhaseOptions {
  // Logged at 'info' before the phase body runs.
  startMessage: string;
  // Logged at 'info', with elapsed time appended, when the phase resolves.
  finishMessage: string;
  // Logged at 'warn' on failure; falls back to `finishMessage` when omitted.
  failureMessage?: string;
  // Sink receiving every log line produced by the phase wrapper.
  log: (level: PhaseLogLevel, message: string) => void;
  // Millisecond clock override for tests; defaults to Date.now.
  now?: () => number;
}
// Render a millisecond duration as seconds with one decimal ("1.5s");
// negative inputs (e.g. a non-monotonic clock) are clamped to zero.
function formatElapsedMs(elapsedMs: number): string {
  const nonNegative = Math.max(0, elapsedMs);
  return `${(nonNegative / 1000).toFixed(1)}s`;
}
/**
 * Run an async phase while logging its start, its completion (with elapsed
 * seconds), and — on failure — a warn line before rethrowing the error.
 */
export async function runLoggedYoutubePhase<T>(
  options: RunLoggedYoutubePhaseOptions,
  run: () => Promise<T>,
): Promise<T> {
  const clock = options.now ?? Date.now;
  const begin = clock();
  const elapsed = () => formatElapsedMs(clock() - begin);
  options.log('info', options.startMessage);
  try {
    const value = await run();
    options.log('info', `${options.finishMessage} (${elapsed()})`);
    return value;
  } catch (error) {
    const label = options.failureMessage ?? options.finishMessage;
    const reason = error instanceof Error ? error.message : String(error);
    options.log('warn', `${label} after ${elapsed()}: ${reason}`);
    throw error;
  }
}

View File

@@ -0,0 +1,32 @@
import test from 'node:test';
import assert from 'node:assert/strict';
import { parseSrt, stringifySrt } from './srt';
// Note: parseSrt separates cues on blank lines, so the fixture keeps an
// empty line between the two cue blocks.
const SAMPLE_SRT = `1
00:00:01,000 --> 00:00:02,000
こんにちは

2
00:00:02,500 --> 00:00:03,000
世界
`;
test('parseSrt reads cue numbering timing and text', () => {
  const cues = parseSrt(SAMPLE_SRT);
  assert.equal(cues.length, 2);
  const [first, second] = cues;
  assert.equal(first?.start, '00:00:01,000');
  assert.equal(first?.end, '00:00:02,000');
  assert.equal(first?.text, 'こんにちは');
  assert.equal(second?.text, '世界');
});
test('stringifySrt preserves parseable cue structure', () => {
  const singleCue = `1
00:00:01,000 --> 00:00:02,000
こんにちは
`;
  const roundTrip = stringifySrt(parseSrt(singleCue));
  assert.match(roundTrip, /1\n00:00:01,000 --> 00:00:02,000\nこんにちは/);
});

40
launcher/youtube/srt.ts Normal file
View File

@@ -0,0 +1,40 @@
/** One subtitle cue parsed from an SRT file. */
export interface SrtCue {
  // Cue number as written in the source file (stringifySrt renumbers from 1).
  index: number;
  // Start timestamp in SRT form, `HH:MM:SS,mmm`.
  start: string;
  // End timestamp in SRT form, `HH:MM:SS,mmm`.
  end: string;
  // Cue text; may contain embedded newlines for multi-line cues.
  text: string;
}
// Matches an SRT timing line, e.g. `00:00:01,000 --> 00:00:02,000`.
const TIMING_LINE_PATTERN =
  /^(?<start>\d{2}:\d{2}:\d{2},\d{3}) --> (?<end>\d{2}:\d{2}:\d{2},\d{3})$/;
/**
 * Parse SRT content into cues.
 *
 * Accepts `\r\n`, lone `\r`, or `\n` line endings and a leading UTF-8 BOM —
 * all common in subtitle files from the wild. (Previously a BOM made
 * `Number.parseInt` fail on the first cue index, so the whole file threw.)
 * Cues whose text is empty after trimming are dropped.
 *
 * @returns Parsed cues in file order.
 * @throws Error for a block whose first line is not an integer index or
 *         whose second line is not a valid timing line.
 */
export function parseSrt(content: string): SrtCue[] {
  const normalized = content
    .replace(/^\uFEFF/, '')
    .replace(/\r\n?/g, '\n')
    .trim();
  if (!normalized) return [];
  return normalized
    .split(/\n{2,}/)
    .map((block) => {
      const lines = block.split('\n');
      const index = Number.parseInt(lines[0] || '', 10);
      const timingLine = lines[1] || '';
      const timingMatch = TIMING_LINE_PATTERN.exec(timingLine);
      if (!Number.isInteger(index) || !timingMatch?.groups) {
        throw new Error(`Invalid SRT cue block: ${block}`);
      }
      return {
        index,
        start: timingMatch.groups.start!,
        end: timingMatch.groups.end!,
        text: lines.slice(2).join('\n').trim(),
      } satisfies SrtCue;
    })
    .filter((cue) => cue.text.length > 0);
}
/**
 * Serialize cues back to SRT text, renumbering sequentially from 1 and
 * trimming each cue's text (original `index` values are ignored).
 */
export function stringifySrt(cues: SrtCue[]): string {
  const blocks: string[] = [];
  cues.forEach((cue, position) => {
    blocks.push(`${position + 1}\n${cue.start} --> ${cue.end}\n${cue.text.trim()}\n`);
  });
  return blocks.join('\n').trimEnd();
}

View File

@@ -0,0 +1,47 @@
import test from 'node:test';
import assert from 'node:assert/strict';
import { buildWhisperArgs } from './whisper';
// Shared baseline options; individual tests override what they exercise.
const BASE_WHISPER_OPTIONS = {
  modelPath: '/models/ggml-large-v2.bin',
  audioPath: '/tmp/input.wav',
  outputPrefix: '/tmp/output',
  language: 'ja',
  translate: false,
  threads: 8,
};
test('buildWhisperArgs includes threads and optional VAD flags', () => {
  const cliArgs = buildWhisperArgs({ ...BASE_WHISPER_OPTIONS, vadModelPath: '/models/vad.bin' });
  assert.deepEqual(cliArgs, [
    '-m',
    '/models/ggml-large-v2.bin',
    '-f',
    '/tmp/input.wav',
    '--output-srt',
    '--output-file',
    '/tmp/output',
    '--language',
    'ja',
    '--threads',
    '8',
    '-vm',
    '/models/vad.bin',
    '--vad',
  ]);
});
test('buildWhisperArgs includes translate flag when requested', () => {
  const cliArgs = buildWhisperArgs({
    ...BASE_WHISPER_OPTIONS,
    modelPath: '/models/base.bin',
    translate: true,
    threads: 4,
  });
  assert.ok(cliArgs.includes('--translate'));
});

View File

@@ -0,0 +1,60 @@
import fs from 'node:fs';
import type { Args } from '../types.js';
import { runExternalCommand } from '../util.js';
/** Inputs for assembling the whisper CLI argument list. */
export interface BuildWhisperArgsOptions {
  // Model file path, passed via `-m`.
  modelPath: string;
  // Input audio file, passed via `-f`.
  audioPath: string;
  // Output path prefix passed via `--output-file`; whisper appends `.srt`
  // (see runWhisper, which expects `${outputPrefix}.srt`).
  outputPrefix: string;
  // Source language, passed via `--language`.
  language: string;
  // When true, adds `--translate` (whisper's translate mode).
  translate: boolean;
  // Worker thread count, passed via `--threads`.
  threads: number;
  // Optional VAD model path; when set, adds `-vm <path> --vad`.
  vadModelPath?: string;
}
/**
 * Assemble the whisper CLI argument list from structured options.
 * Flag order matches the previous implementation exactly.
 */
export function buildWhisperArgs(options: BuildWhisperArgsOptions): string[] {
  const cliArgs: string[] = ['-m', options.modelPath, '-f', options.audioPath];
  cliArgs.push('--output-srt', '--output-file', options.outputPrefix);
  cliArgs.push('--language', options.language);
  cliArgs.push('--threads', String(options.threads));
  if (options.translate) {
    cliArgs.push('--translate');
  }
  if (options.vadModelPath) {
    cliArgs.push('-vm', options.vadModelPath, '--vad');
  }
  return cliArgs;
}
/**
 * Run the whisper binary over a prepared audio file and return the path of
 * the generated `.srt` (whisper writes `${outputPrefix}.srt`).
 *
 * The optional VAD model is used only when the configured path exists on
 * disk. `logLevel` is now forwarded to runExternalCommand for consistency
 * with every other call site that passes options (yt-dlp phases).
 *
 * @throws Error when whisper exits without producing the expected output.
 */
export async function runWhisper(
  whisperBin: string,
  args: Args,
  options: Omit<BuildWhisperArgsOptions, 'threads' | 'vadModelPath'>,
): Promise<string> {
  // Hoist the trimmed path so it is computed once instead of three times.
  const vadModelCandidate = args.whisperVadModel.trim();
  const vadModelPath =
    vadModelCandidate && fs.existsSync(vadModelCandidate) ? vadModelCandidate : undefined;
  const whisperArgs = buildWhisperArgs({
    ...options,
    threads: args.whisperThreads,
    vadModelPath,
  });
  await runExternalCommand(whisperBin, whisperArgs, {
    logLevel: args.logLevel,
    commandLabel: 'whisper',
    streamOutput: true,
  });
  const outputPath = `${options.outputPrefix}.srt`;
  if (!fs.existsSync(outputPath)) {
    throw new Error(`whisper output not found: ${outputPath}`);
  }
  return outputPath;
}