feat: add app-owned YouTube subtitle flow with absPlayer-style parsing (#31)

* fix: harden preload argv parsing for popup windows

* fix: align youtube playback with shared overlay startup

* fix: unwrap mpv youtube streams for anki media mining

* docs: update docs for youtube subtitle and mining flow

* refactor: unify cli and runtime wiring for startup and youtube flow

* feat: update subtitle sidebar overlay behavior

* chore: add shared log-file source for diagnostics

* fix(ci): add changelog fragment for immersion changes

* fix: address CodeRabbit review feedback

* fix: persist canonical title from youtube metadata

* style: format stats library tab

* fix: address latest review feedback

* style: format stats library files

* test: stub launcher youtube deps in CI

* test: isolate launcher youtube flow deps

* test: stub launcher youtube deps in failing case

* test: force x11 backend in launcher ci harness

* test: address latest review feedback

* fix(launcher): preserve user YouTube ytdl raw options

* docs(backlog): update task tracking notes

* fix(immersion): special-case youtube media paths in runtime and tracking

* feat(stats): improve YouTube media metadata and picker key handling

* fix(ci): format stats media library hook

* fix: address latest CodeRabbit review items

* docs: update youtube release notes and docs

* feat: auto-load youtube subtitles before manual picker

* fix: restore app-owned youtube subtitle flow

* docs: update youtube playback docs and config copy

* refactor: remove legacy youtube launcher mode plumbing

* fix: refine youtube subtitle startup binding

* docs: clarify youtube subtitle startup behavior

* fix: address PR #31 latest review follow-ups

* fix: address PR #31 follow-up review comments

* test: harden youtube picker test harness

* update backlog

* fix: add timeout to youtube metadata probe

* docs: refresh youtube and stats docs

* update backlog

* update backlog

* chore: release v0.9.0
This commit is contained in:
2026-03-24 00:01:24 -07:00
committed by GitHub
parent c17f0a4080
commit 5feed360ca
219 changed files with 12778 additions and 1052 deletions

View File

@@ -0,0 +1,18 @@
import type { YoutubeTrackOption } from './track-probe';
import { downloadYoutubeSubtitleTrack, downloadYoutubeSubtitleTracks } from './track-download';
/**
 * Acquires one YouTube subtitle track by forwarding the request unchanged to
 * `downloadYoutubeSubtitleTrack`.
 *
 * @param input - Video URL, output directory, and the track to fetch.
 * @returns The downloader's result, carrying the path of the subtitle file.
 */
export async function acquireYoutubeSubtitleTrack(input: {
  targetUrl: string;
  outputDir: string;
  track: YoutubeTrackOption;
}): Promise<{ path: string }> {
  const downloaded = await downloadYoutubeSubtitleTrack(input);
  return downloaded;
}
/**
 * Acquires several YouTube subtitle tracks by forwarding the request unchanged
 * to `downloadYoutubeSubtitleTracks`.
 *
 * @param input - Video URL, output directory, and the tracks to fetch.
 * @returns The downloader's map of string keys to string values
 *   (the tests in this repository read it as track id -> file path).
 */
export async function acquireYoutubeSubtitleTracks(input: {
  targetUrl: string;
  outputDir: string;
  tracks: YoutubeTrackOption[];
}): Promise<Map<string, string>> {
  const downloadedByTrack = await downloadYoutubeSubtitleTracks(input);
  return downloadedByTrack;
}

View File

@@ -0,0 +1 @@
/**
 * Kind tag of a YouTube subtitle track.
 * NOTE(review): presumably 'manual' = uploader-provided and 'auto' = auto-generated captions — confirm against the track probe.
 */
export type YoutubeTrackKind = 'manual' | 'auto';

View File

@@ -0,0 +1,41 @@
import type { YoutubeTrackKind } from './kinds';
export type { YoutubeTrackKind };
/**
 * Canonicalizes a language code: trims, lower-cases, turns underscores into
 * hyphens, then drops every character that is not `a-z`, `0-9` or `-`.
 *
 * @param value - Raw language code (e.g. `" EN_us "`).
 * @returns The normalized code (e.g. `"en-us"`); may be empty.
 */
export function normalizeYoutubeLangCode(value: string): string {
  const lowered = value.trim().toLowerCase();
  const hyphenated = lowered.replace(/_/g, '-');
  return hyphenated.replace(/[^a-z0-9-]+/g, '');
}
/**
 * Reports whether a language code denotes Japanese.
 *
 * The code is normalized first; `ja`, `jp`, `jpn`, `japanese` and anything
 * starting with `ja-` or `jp-` are accepted.
 */
export function isJapaneseYoutubeLang(value: string): boolean {
  const code = normalizeYoutubeLangCode(value);
  if (code.startsWith('ja-') || code.startsWith('jp-')) {
    return true;
  }
  switch (code) {
    case 'ja':
    case 'jp':
    case 'jpn':
    case 'japanese':
      return true;
    default:
      return false;
  }
}
/**
 * Reports whether a language code denotes English.
 *
 * The code is normalized first; `en`, `eng`, `english`, `enus`, `en-us` and
 * anything starting with `en-` are accepted.
 */
export function isEnglishYoutubeLang(value: string): boolean {
  const code = normalizeYoutubeLangCode(value);
  if (code.startsWith('en-')) {
    return true;
  }
  switch (code) {
    case 'en':
    case 'eng':
    case 'english':
    case 'enus':
    case 'en-us':
      return true;
    default:
      return false;
  }
}
/**
 * Builds a display label of the form `"<name> (<kind>)"`.
 *
 * The name is the trimmed title when non-empty, otherwise the trimmed
 * language, otherwise the literal `unknown`.
 */
export function formatYoutubeTrackLabel(input: {
  language: string;
  kind: YoutubeTrackKind;
  title?: string;
}): string {
  const { kind, title } = input;
  let name = title?.trim();
  if (!name) {
    name = input.language.trim() || 'unknown';
  }
  return `${name} (${kind})`;
}

View File

@@ -0,0 +1,89 @@
import assert from 'node:assert/strict';
import fs from 'node:fs';
import os from 'node:os';
import path from 'node:path';
import test from 'node:test';
import { probeYoutubeVideoMetadata } from './metadata-probe';
/**
 * Runs `fn` with a freshly created temporary directory and removes that
 * directory (recursively) afterwards, whether `fn` resolves or rejects.
 */
async function withTempDir<T>(fn: (dir: string) => Promise<T>): Promise<T> {
  const prefix = path.join(os.tmpdir(), 'subminer-youtube-metadata-probe-');
  const tempDir = fs.mkdtempSync(prefix);
  try {
    return await fn(tempDir);
  } finally {
    fs.rmSync(tempDir, { recursive: true, force: true });
  }
}
/**
 * Writes a fake `yt-dlp` executable into `dir`: a Node script that prints
 * `payload` to stdout, plus a `yt-dlp.cmd` shim that runs it through `node`.
 * The script is made executable on non-Windows platforms.
 */
function makeFakeYtDlpScript(dir: string, payload: string): void {
  const scriptPath = path.join(dir, 'yt-dlp');
  const scriptLines = [
    '#!/usr/bin/env node',
    `process.stdout.write(${JSON.stringify(payload)});`,
    '',
  ];
  fs.writeFileSync(scriptPath, scriptLines.join('\n'), 'utf8');
  const isWindows = process.platform === 'win32';
  if (!isWindows) {
    fs.chmodSync(scriptPath, 0o755);
  }
  const shim = ['@echo off', `node "${scriptPath}"`, ''].join('\r\n');
  fs.writeFileSync(`${scriptPath}.cmd`, shim, 'utf8');
}
/**
 * Writes a fake `yt-dlp` executable into `dir` that never exits on its own
 * (it only schedules a repeating timer), plus a `yt-dlp.cmd` shim.
 * The script is made executable on non-Windows platforms.
 */
function makeHangingFakeYtDlpScript(dir: string): void {
  const scriptPath = path.join(dir, 'yt-dlp');
  const scriptLines = ['#!/usr/bin/env node', 'setInterval(() => {}, 1000);', ''];
  fs.writeFileSync(scriptPath, scriptLines.join('\n'), 'utf8');
  const isWindows = process.platform === 'win32';
  if (!isWindows) {
    fs.chmodSync(scriptPath, 0o755);
  }
  const shim = ['@echo off', `node "${scriptPath}"`, ''].join('\r\n');
  fs.writeFileSync(`${scriptPath}.cmd`, shim, 'utf8');
}
/**
 * Runs `fn` while a fake `yt-dlp` that prints `payload` sits at the front of
 * `PATH`. `PATH` is put back to its previous value (or `''` if it was unset)
 * once `fn` settles.
 */
async function withFakeYtDlp<T>(payload: string, fn: () => Promise<T>): Promise<T> {
  return await withTempDir(async (root) => {
    const binDir = path.join(root, 'bin');
    fs.mkdirSync(binDir, { recursive: true });
    makeFakeYtDlpScript(binDir, payload);

    const savedPath = process.env.PATH ?? '';
    process.env.PATH = [binDir, savedPath].join(path.delimiter);
    try {
      return await fn();
    } finally {
      process.env.PATH = savedPath;
    }
  });
}
/**
 * Runs `fn` while a fake `yt-dlp` that never exits sits at the front of
 * `PATH`. `PATH` is put back to its previous value (or `''` if it was unset)
 * once `fn` settles.
 */
async function withHangingFakeYtDlp<T>(fn: () => Promise<T>): Promise<T> {
  return await withTempDir(async (root) => {
    const binDir = path.join(root, 'bin');
    fs.mkdirSync(binDir, { recursive: true });
    makeHangingFakeYtDlpScript(binDir);

    const savedPath = process.env.PATH ?? '';
    process.env.PATH = [binDir, savedPath].join(path.delimiter);
    try {
      return await fn();
    } finally {
      process.env.PATH = savedPath;
    }
  });
}
// A yt-dlp that prints non-JSON output must make the probe resolve to null.
test('probeYoutubeVideoMetadata returns null on malformed yt-dlp JSON', async () => {
  const videoUrl = 'https://www.youtube.com/watch?v=abc123';
  await withFakeYtDlp('not-json', async () => {
    const metadata = await probeYoutubeVideoMetadata(videoUrl);
    assert.equal(metadata, null);
  });
});
// A yt-dlp that never exits must make the probe reject with the timeout error.
// The test's own 20s budget exceeds the 15000ms named in the expected message.
test(
  'probeYoutubeVideoMetadata times out when yt-dlp hangs',
  { timeout: 20_000 },
  async () => {
    await withHangingFakeYtDlp(async () => {
      const pendingProbe = probeYoutubeVideoMetadata('https://www.youtube.com/watch?v=abc123');
      await assert.rejects(pendingProbe, /timed out after 15000ms/);
    });
  },
);

View File

@@ -0,0 +1,122 @@
import { spawn } from 'node:child_process';
import type { YoutubeVideoMetadata } from '../immersion-tracker/types';
// Default time budget (ms) for a yt-dlp invocation made through runCapture in this module.
const YOUTUBE_METADATA_PROBE_TIMEOUT_MS = 15_000;
/** One entry of the `thumbnails` array in yt-dlp's JSON output; every field is optional. */
type YtDlpThumbnail = {
url?: string;
width?: number;
height?: number;
};
/**
 * The subset of yt-dlp's `--dump-single-json` output that this module reads.
 * Every field is optional because the payload is parsed from external output.
 */
type YtDlpYoutubeMetadata = {
id?: string;
title?: string;
webpage_url?: string;
thumbnail?: string;
thumbnails?: YtDlpThumbnail[];
channel_id?: string;
channel?: string;
channel_url?: string;
uploader_id?: string;
uploader_url?: string;
description?: string;
};
/**
 * Spawns `command` with `args` (stdin ignored) and collects stdout/stderr as
 * UTF-8 text.
 *
 * @param timeoutMs - After this many ms the child is killed and the promise
 *   rejects with `yt-dlp timed out after <timeoutMs>ms`.
 * @returns Resolves with the captured output when the child exits with code 0.
 * @throws Rejects with the spawn error, or — on a non-zero/unknown exit — with
 *   the trimmed stderr (falling back to `yt-dlp exited with status <code>`).
 */
function runCapture(
  command: string,
  args: string[],
  timeoutMs = YOUTUBE_METADATA_PROBE_TIMEOUT_MS,
): Promise<{ stdout: string; stderr: string }> {
  return new Promise((resolve, reject) => {
    const child = spawn(command, args, { stdio: ['ignore', 'pipe', 'pipe'] });
    const outChunks: string[] = [];
    const errChunks: string[] = [];

    const killTimer = setTimeout(() => {
      child.kill();
      reject(new Error(`yt-dlp timed out after ${timeoutMs}ms`));
    }, timeoutMs);

    child.stdout.setEncoding('utf8');
    child.stderr.setEncoding('utf8');
    child.stdout.on('data', (chunk) => {
      outChunks.push(String(chunk));
    });
    child.stderr.on('data', (chunk) => {
      errChunks.push(String(chunk));
    });

    child.once('error', (error) => {
      clearTimeout(killTimer);
      reject(error);
    });
    child.once('close', (exitCode) => {
      clearTimeout(killTimer);
      const stdout = outChunks.join('');
      const stderr = errChunks.join('');
      if (exitCode !== 0) {
        reject(new Error(stderr.trim() || `yt-dlp exited with status ${exitCode ?? 'unknown'}`));
        return;
      }
      resolve({ stdout, stderr });
    });
  });
}
/**
 * Picks the first thumbnail URL that looks like a channel avatar.
 *
 * Entries without a URL, or whose URL contains `/vi/`, are skipped. An entry
 * with positive numeric width and height is accepted only when its aspect
 * ratio lies in [0.8, 1.25]; an entry without usable dimensions is accepted
 * only when its URL contains `yt3.googleusercontent.com`.
 *
 * @returns The trimmed URL of the first match, or null when nothing matches
 *   or `thumbnails` is not an array.
 */
function pickChannelThumbnail(thumbnails: YtDlpThumbnail[] | undefined): string | null {
  if (!Array.isArray(thumbnails)) {
    return null;
  }
  for (const { url, width, height } of thumbnails) {
    const trimmedUrl = url?.trim();
    if (!trimmedUrl || trimmedUrl.includes('/vi/')) {
      continue;
    }
    if (typeof width === 'number' && typeof height === 'number' && width > 0 && height > 0) {
      const aspect = width / height;
      const roughlySquare = aspect >= 0.8 && aspect <= 1.25;
      if (roughlySquare) {
        return trimmedUrl;
      }
      // Dimensions are known and not square-ish: never fall through to the host check.
      continue;
    }
    if (trimmedUrl.includes('yt3.googleusercontent.com')) {
      return trimmedUrl;
    }
  }
  return null;
}
/**
 * Probes a YouTube URL with `yt-dlp --dump-single-json` and maps the result
 * onto a `YoutubeVideoMetadata` record.
 *
 * @param targetUrl - The video URL handed to yt-dlp; also the fallback for
 *   `videoUrl` when the payload has no `webpage_url`.
 * @returns The mapped metadata, or null when yt-dlp's output is not a JSON
 *   object or lacks a usable video id / URL.
 * @throws Propagates `runCapture` rejections (spawn failure, non-zero exit,
 *   timeout).
 */
export async function probeYoutubeVideoMetadata(
  targetUrl: string,
): Promise<YoutubeVideoMetadata | null> {
  const { stdout } = await runCapture('yt-dlp', [
    '--dump-single-json',
    '--no-warnings',
    '--skip-download',
    targetUrl,
  ]);

  let parsed: unknown;
  try {
    parsed = JSON.parse(stdout);
  } catch {
    return null;
  }
  // JSON.parse also succeeds on `null`, numbers, strings and arrays; treat
  // those like malformed output instead of crashing on property access below.
  if (typeof parsed !== 'object' || parsed === null || Array.isArray(parsed)) {
    return null;
  }
  const info = parsed as YtDlpYoutubeMetadata;

  // The payload is external input: only trust fields that really are strings.
  const textOrNull = (value: unknown): string | null =>
    typeof value === 'string' ? value.trim() || null : null;

  const youtubeVideoId = textOrNull(info.id);
  const videoUrl = textOrNull(info.webpage_url) ?? targetUrl.trim();
  if (!youtubeVideoId || !videoUrl) {
    return null;
  }

  return {
    youtubeVideoId,
    videoUrl,
    videoTitle: textOrNull(info.title),
    videoThumbnailUrl: textOrNull(info.thumbnail),
    channelId: textOrNull(info.channel_id),
    channelName: textOrNull(info.channel),
    channelUrl: textOrNull(info.channel_url),
    channelThumbnailUrl: pickChannelThumbnail(info.thumbnails),
    uploaderId: textOrNull(info.uploader_id),
    uploaderUrl: textOrNull(info.uploader_url),
    description: textOrNull(info.description),
    metadataJson: JSON.stringify(info),
  };
}

View File

@@ -0,0 +1,29 @@
import test from 'node:test';
import assert from 'node:assert/strict';
import fs from 'node:fs';
import os from 'node:os';
import path from 'node:path';
import { retimeYoutubeSubtitle } from './retime';
// The retimer is expected to hand back the primary subtitle untouched.
test('retimeYoutubeSubtitle uses the downloaded subtitle path as-is', async () => {
  const root = fs.mkdtempSync(path.join(os.tmpdir(), 'subminer-youtube-retime-'));
  try {
    const primaryPath = path.join(root, 'primary.vtt');
    const referencePath = path.join(root, 'reference.vtt');
    for (const subtitlePath of [primaryPath, referencePath]) {
      fs.writeFileSync(subtitlePath, 'WEBVTT\n', 'utf8');
    }

    const outcome = await retimeYoutubeSubtitle({ primaryPath, secondaryPath: referencePath });

    assert.equal(outcome.ok, true);
    assert.equal(outcome.strategy, 'none');
    assert.equal(outcome.path, primaryPath);
    assert.equal(outcome.message, 'Using downloaded subtitle as-is (no automatic retime enabled)');
    assert.equal(fs.readFileSync(outcome.path, 'utf8'), 'WEBVTT\n');
  } finally {
    fs.rmSync(root, { recursive: true, force: true });
  }
});

View File

@@ -0,0 +1,11 @@
/**
 * Placeholder retimer: performs no retiming and reports the primary subtitle
 * path back unchanged with strategy `'none'`.
 *
 * @param input.secondaryPath - When truthy, the message gains the suffix
 *   ` (no automatic retime enabled)`.
 */
export async function retimeYoutubeSubtitle(input: {
  primaryPath: string;
  secondaryPath: string | null;
}): Promise<{ ok: boolean; path: string; strategy: 'none' | 'alass' | 'ffsubsync'; message: string }> {
  const suffix = input.secondaryPath ? ' (no automatic retime enabled)' : '';
  return {
    ok: true,
    path: input.primaryPath,
    strategy: 'none',
    message: `Using downloaded subtitle as-is${suffix}`,
  };
}

View File

@@ -0,0 +1,75 @@
import assert from 'node:assert/strict';
import test from 'node:test';
import { convertYoutubeTimedTextToVtt, normalizeYoutubeAutoVtt } from './timedtext';
// Out-of-range numeric entities must survive verbatim; valid ones are decoded.
test('convertYoutubeTimedTextToVtt leaves malformed numeric entities literal', () => {
  const xml =
    '<timedtext><body><p t="0" d="1000">&#99999999; &#x110000; &#x41;</p></body></timedtext>';
  const expectedLines = [
    'WEBVTT',
    '',
    '00:00:00.000 --> 00:00:01.000',
    '&#99999999; &#x110000; A',
    '',
  ];
  assert.equal(convertYoutubeTimedTextToVtt(xml), expectedLines.join('\n'));
});
// A zero-duration row must not update the rolling-prefix state, otherwise the
// following real row would be reduced to nothing.
test('convertYoutubeTimedTextToVtt does not swallow text after zero-length overlap rows', () => {
  const xmlParts = [
    '<timedtext><body>',
    '<p t="0" d="2000">今日は</p>',
    '<p t="1000" d="0">今日はいい天気ですね</p>',
    '<p t="1000" d="2000">今日はいい天気ですね</p>',
    '</body></timedtext>',
  ];
  const expectedLines = [
    'WEBVTT',
    '',
    '00:00:00.000 --> 00:00:00.999',
    '今日は',
    '',
    '00:00:01.000 --> 00:00:03.000',
    'いい天気ですね',
    '',
  ];
  const actual = convertYoutubeTimedTextToVtt(xmlParts.join(''));
  assert.equal(actual, expectedLines.join('\n'));
});
// Each cue that repeats the previous cue's full text should keep only the new tail.
test('normalizeYoutubeAutoVtt strips cumulative rolling-caption prefixes', () => {
  const inputLines = [
    'WEBVTT',
    '',
    '00:00:01.000 --> 00:00:02.000',
    '今日は',
    '',
    '00:00:02.000 --> 00:00:03.000',
    '今日はいい天気ですね',
    '',
    '00:00:03.000 --> 00:00:04.000',
    '今日はいい天気ですね本当に',
    '',
  ];
  const expectedLines = [
    'WEBVTT',
    '',
    '00:00:01.000 --> 00:00:02.000',
    '今日は',
    '',
    '00:00:02.000 --> 00:00:03.000',
    'いい天気ですね',
    '',
    '00:00:03.000 --> 00:00:04.000',
    '本当に',
    '',
  ];
  const actual = normalizeYoutubeAutoVtt(inputLines.join('\n'));
  assert.equal(actual, expectedLines.join('\n'));
});

View File

@@ -0,0 +1,166 @@
/** One `<p>` row extracted from YouTube timedtext XML. */
interface YoutubeTimedTextRow {
/** Start offset in milliseconds, read from the row's `t` attribute. */
startMs: number;
/** Duration in milliseconds, read from the row's `d` attribute. */
durationMs: number;
/** Row text after tag stripping, entity decoding and trimming; never empty. */
text: string;
}
// Lower-case extension names (no leading dot) that isYoutubeTimedTextExtension accepts.
const YOUTUBE_TIMEDTEXT_EXTENSIONS = new Set(['srv1', 'srv2', 'srv3', 'ytsrv3']);
/**
 * Turns a numeric character reference into its character.
 *
 * @param match - The entity exactly as it appeared in the text.
 * @param codePoint - The parsed code point.
 * @returns The character, or `match` unchanged when the code point is not an
 *   integer in 0..0x10FFFF or falls in the surrogate range 0xD800..0xDFFF.
 */
function decodeNumericEntity(match: string, codePoint: number): string {
  const inUnicodeRange = Number.isInteger(codePoint) && codePoint >= 0 && codePoint <= 0x10ffff;
  const isSurrogate = codePoint >= 0xd800 && codePoint <= 0xdfff;
  return inUnicodeRange && !isSurrogate ? String.fromCodePoint(codePoint) : match;
}
/**
 * Decodes `&amp;`, `&lt;`, `&gt;`, `&quot;`, `&#39;` and decimal/hex numeric
 * character references. Numeric references that do not name a valid code
 * point are left as written.
 *
 * The replacements run one after another in the order listed, `&amp;` first.
 * NOTE(review): because of that order, double-encoded input such as
 * `&amp;lt;` ends up as `<`. This may be intentional for YouTube payloads —
 * confirm before changing the order.
 */
function decodeHtmlEntities(value: string): string {
  const namedEntities: ReadonlyArray<readonly [RegExp, string]> = [
    [/&amp;/g, '&'],
    [/&lt;/g, '<'],
    [/&gt;/g, '>'],
    [/&quot;/g, '"'],
    [/&#39;/g, "'"],
  ];

  let decoded = value;
  for (const [pattern, replacement] of namedEntities) {
    decoded = decoded.replace(pattern, replacement);
  }
  decoded = decoded.replace(/&#(\d+);/g, (match: string, digits: string) =>
    decodeNumericEntity(match, Number(digits)),
  );
  decoded = decoded.replace(/&#x([0-9a-f]+);/gi, (match: string, hexDigits: string) =>
    decodeNumericEntity(match, Number.parseInt(hexDigits, 16)),
  );
  return decoded;
}
/**
 * Collects `name="value"` pairs from a raw tag-attribute string.
 *
 * Only double-quoted values are recognized; when a name repeats, the last
 * occurrence wins. Values are returned exactly as written (not decoded).
 */
function parseAttributeMap(raw: string): Map<string, string> {
  const attributePattern = /([a-zA-Z0-9:_-]+)="([^"]*)"/g;
  const pairs = Array.from(
    raw.matchAll(attributePattern),
    (found): [string, string] => [found[1]!, found[2]!],
  );
  return new Map(pairs);
}
/**
 * Extracts the `<p ...>...</p>` rows of a timedtext document, in document order.
 *
 * A row is skipped when its `t` or `d` attribute does not convert to a finite
 * number, or when its text is empty after processing. Text processing:
 * `<br>` becomes a newline, all other tags are removed, entities are decoded,
 * and the result is trimmed.
 */
function extractYoutubeTimedTextRows(xml: string): YoutubeTimedTextRow[] {
  const paragraphPattern = /<p\b([^>]*)>([\s\S]*?)<\/p>/g;
  const collected: YoutubeTimedTextRow[] = [];

  for (const [, rawAttrs = '', rawBody = ''] of xml.matchAll(paragraphPattern)) {
    const attrs = parseAttributeMap(rawAttrs);
    const startMs = Number(attrs.get('t'));
    const durationMs = Number(attrs.get('d'));
    const timingIsValid = Number.isFinite(startMs) && Number.isFinite(durationMs);
    if (!timingIsValid) {
      continue;
    }

    const withoutTags = rawBody.replace(/<br\s*\/?>/gi, '\n').replace(/<[^>]+>/g, '');
    const text = decodeHtmlEntities(withoutTags).trim();
    if (text) {
      collected.push({ startMs, durationMs, text });
    }
  }

  return collected;
}
/**
 * Formats a millisecond offset as a WebVTT timestamp `HH:MM:SS.mmm`.
 *
 * The input is floored to whole milliseconds and negative values clamp to
 * zero. Hours are padded to at least two digits but are not capped.
 */
function formatVttTimestamp(ms: number): string {
  const clamped = Math.max(0, Math.floor(ms));
  const pad = (part: number, width: number): string => String(part).padStart(width, '0');

  const hh = Math.floor(clamped / 3_600_000);
  const mm = Math.floor((clamped % 3_600_000) / 60_000);
  const ss = Math.floor((clamped % 60_000) / 1_000);
  const mmm = clamped % 1_000;

  return `${pad(hh, 2)}:${pad(mm, 2)}:${pad(ss, 2)}.${pad(mmm, 3)}`;
}
/**
 * Reports whether `value` names one of the known timedtext extensions.
 * The comparison ignores surrounding whitespace and letter case; an empty or
 * missing value is never a match.
 */
export function isYoutubeTimedTextExtension(value: string | undefined): boolean {
  return value ? YOUTUBE_TIMEDTEXT_EXTENSIONS.has(value.trim().toLowerCase()) : false;
}
/**
 * Converts YouTube timedtext XML into WebVTT.
 *
 * Per row:
 * - the end time is clamped to 1ms before the next row's start when the two
 *   would overlap;
 * - rows left with no positive duration are dropped without touching the
 *   rolling-caption state;
 * - when the row's text starts with the previous emitted row's full text,
 *   only the new tail is kept.
 *
 * @param xml - Raw timedtext document.
 * @returns A WebVTT document; exactly `WEBVTT\n` when no cue is produced
 *   (no rows at all, or every row filtered out).
 */
export function convertYoutubeTimedTextToVtt(xml: string): string {
  const rows = extractYoutubeTimedTextRows(xml);
  const blocks: string[] = [];
  let previousText = '';

  for (let index = 0; index < rows.length; index += 1) {
    const row = rows[index]!;
    const nextRow = rows[index + 1];
    const unclampedEnd = row.startMs + row.durationMs;
    const clampedEnd =
      nextRow && unclampedEnd > nextRow.startMs
        ? Math.max(row.startMs, nextRow.startMs - 1)
        : unclampedEnd;
    if (clampedEnd <= row.startMs) {
      // Zero-length rows are not shown, so they must not become the prefix
      // that the next row is compared against.
      continue;
    }

    const text = normalizeRollingCaptionText(row.text, previousText);
    previousText = row.text;
    if (!text) {
      continue;
    }

    blocks.push(`${formatVttTimestamp(row.startMs)} --> ${formatVttTimestamp(clampedEnd)}\n${text}`);
  }

  // Return the same empty document whether there were no rows or every row
  // was filtered out (previously the latter produced `WEBVTT\n\n\n`).
  if (blocks.length === 0) {
    return 'WEBVTT\n';
  }
  return `WEBVTT\n\n${blocks.join('\n\n')}\n`;
}
/**
 * Removes a rolling-caption prefix: when `text` begins with the non-empty
 * `previousText`, returns the remainder with leading whitespace trimmed;
 * otherwise returns `text` unchanged.
 */
function normalizeRollingCaptionText(text: string, previousText: string): string {
  const repeatsPrevious = previousText.length > 0 && text.startsWith(previousText);
  return repeatsPrevious ? text.slice(previousText.length).trimStart() : text;
}
/**
 * Strips rolling-caption prefixes from a WebVTT document.
 *
 * The content is split into blocks on blank lines. For each block with a
 * timing line (`-->`) followed by text, the text is compared with the
 * previous cue's original text and a repeated prefix is removed. Blocks
 * without a timing line, without text, or whose text would become empty or
 * stay the same are kept as they are.
 *
 * @returns The original `content` untouched when no cue changed; otherwise
 *   the blocks re-joined with blank lines (line endings normalized to `\n`)
 *   plus a trailing newline.
 */
export function normalizeYoutubeAutoVtt(content: string): string {
  const cueBlocks = content.replace(/\r\n?/g, '\n').split(/\n{2,}/);
  if (cueBlocks.length === 0) {
    return content;
  }

  const output: string[] = [];
  let lastCueText = '';
  let rewroteAny = false;

  for (const block of cueBlocks) {
    const lines = block.split('\n');
    const timingAt = lines.findIndex((line) => line.includes('-->'));
    // No timing line, or nothing after it: not a cue with text.
    if (timingAt < 0 || timingAt === lines.length - 1) {
      output.push(block);
      continue;
    }

    const cueText = lines
      .slice(timingAt + 1)
      .join('\n')
      .trim();
    if (!cueText) {
      output.push(block);
      continue;
    }

    const deduped = normalizeRollingCaptionText(cueText, lastCueText);
    // Always compare against the cue's original text, not the shortened one.
    lastCueText = cueText;
    if (!deduped || deduped === cueText) {
      output.push(block);
      continue;
    }

    rewroteAny = true;
    output.push([...lines.slice(0, timingAt + 1), deduped].join('\n'));
  }

  return rewroteAny ? `${output.join('\n\n')}\n` : content;
}

View File

@@ -0,0 +1,570 @@
import test from 'node:test';
import assert from 'node:assert/strict';
import fs from 'node:fs';
import os from 'node:os';
import path from 'node:path';
import { downloadYoutubeSubtitleTrack, downloadYoutubeSubtitleTracks } from './track-download';
/**
 * Runs `fn` with a freshly created temporary directory and removes that
 * directory (recursively) afterwards, whether `fn` resolves or rejects.
 */
async function withTempDir<T>(fn: (dir: string) => Promise<T>): Promise<T> {
  const prefix = path.join(os.tmpdir(), 'subminer-youtube-track-download-');
  const tempDir = fs.mkdtempSync(prefix);
  try {
    return await fn(tempDir);
  } finally {
    fs.rmSync(tempDir, { recursive: true, force: true });
  }
}
/**
 * Writes an executable fake `yt-dlp` Node script into `dir` and returns its path.
 *
 * The script reads `-o <template>`, `--sub-langs`, `--write-subs` and
 * `--write-auto-subs` from its arguments, strips a trailing `.%(...)s`
 * placeholder from the output template to get a file prefix, and writes
 * files according to `YTDLP_FAKE_MODE`:
 * - `multi`: `<prefix>.<lang>.vtt` for every requested language;
 * - `rolling-auto`: `<prefix>.vtt` with cumulative (rolling) cue text;
 * - `multi-primary-only-fail`: only the first language, then a 429 error on
 *   stderr and exit code 1;
 * - `both`: `<prefix>.vtt` and `<prefix>.orig.webp`;
 * - `webp-only`: `<prefix>.orig.webp`;
 * - anything else: `<prefix>.vtt`.
 *
 * The `YTDLP_EXPECT_*` variables make the script exit with codes 2/3/4 when
 * the expected flag or language list is missing.
 */
function makeFakeYtDlpScript(dir: string): string {
  const scriptPath = path.join(dir, 'yt-dlp');
  // The script body is a template literal: every backslash meant for the
  // generated script has to be doubled here. The suffix-stripping regex
  // previously used single backslashes, which the template literal consumed,
  // leaving a pattern that never matched `.%(ext)s`.
  const script = `#!/usr/bin/env node
const fs = require('node:fs');
const path = require('node:path');
const args = process.argv.slice(2);
let outputTemplate = '';
const wantsAutoSubs = args.includes('--write-auto-subs');
const wantsManualSubs = args.includes('--write-subs');
const subLangIndex = args.indexOf('--sub-langs');
const subLang = subLangIndex >= 0 ? args[subLangIndex + 1] || '' : '';
const subLangs = subLang ? subLang.split(',').filter(Boolean) : [];
for (let i = 0; i < args.length; i += 1) {
if (args[i] === '-o' && typeof args[i + 1] === 'string') {
outputTemplate = args[i + 1];
i += 1;
}
}
if (process.env.YTDLP_EXPECT_AUTO_SUBS === '1' && !wantsAutoSubs) {
process.exit(2);
}
if (process.env.YTDLP_EXPECT_MANUAL_SUBS === '1' && !wantsManualSubs) {
process.exit(3);
}
if (process.env.YTDLP_EXPECT_SUB_LANG && subLang !== process.env.YTDLP_EXPECT_SUB_LANG) {
process.exit(4);
}
const prefix = outputTemplate.replace(/\\.%\\([^)]+\\)s$/, '');
if (!prefix) {
process.exit(1);
}
fs.mkdirSync(path.dirname(prefix), { recursive: true });
if (process.env.YTDLP_FAKE_MODE === 'multi') {
for (const lang of subLangs) {
fs.writeFileSync(\`\${prefix}.\${lang}.vtt\`, 'WEBVTT\\n');
}
} else if (process.env.YTDLP_FAKE_MODE === 'rolling-auto') {
fs.writeFileSync(
\`\${prefix}.vtt\`,
[
'WEBVTT',
'',
'00:00:01.000 --> 00:00:02.000',
'今日は',
'',
'00:00:02.000 --> 00:00:03.000',
'今日はいい天気ですね',
'',
'00:00:03.000 --> 00:00:04.000',
'今日はいい天気ですね本当に',
'',
].join('\\n'),
);
} else if (process.env.YTDLP_FAKE_MODE === 'multi-primary-only-fail') {
const primaryLang = subLangs[0];
if (primaryLang) {
fs.writeFileSync(\`\${prefix}.\${primaryLang}.vtt\`, 'WEBVTT\\n');
}
process.stderr.write("ERROR: Unable to download video subtitles for 'en': HTTP Error 429: Too Many Requests\\n");
process.exit(1);
} else if (process.env.YTDLP_FAKE_MODE === 'both') {
fs.writeFileSync(\`\${prefix}.vtt\`, 'WEBVTT\\n');
fs.writeFileSync(\`\${prefix}.orig.webp\`, 'webp');
} else if (process.env.YTDLP_FAKE_MODE === 'webp-only') {
fs.writeFileSync(\`\${prefix}.orig.webp\`, 'webp');
} else {
fs.writeFileSync(\`\${prefix}.vtt\`, 'WEBVTT\\n');
}
process.exit(0);
`;
  fs.writeFileSync(scriptPath, script, 'utf8');
  fs.chmodSync(scriptPath, 0o755);
  return scriptPath;
}
/**
 * Runs `fn` while the fake `yt-dlp` sits at the front of `PATH` and
 * `YTDLP_FAKE_MODE` is set to `mode`.
 *
 * Afterwards `PATH` is put back to its previous value (or `''` if it was
 * unset) and `YTDLP_FAKE_MODE` is deleted.
 *
 * @param fn - Receives the temp root and the directory holding the fake binary.
 */
async function withFakeYtDlp<T>(
  mode: 'both' | 'webp-only' | 'multi' | 'multi-primary-only-fail' | 'rolling-auto',
  fn: (dir: string, binDir: string) => Promise<T>,
): Promise<T> {
  return await withTempDir(async (root) => {
    const binDir = path.join(root, 'bin');
    fs.mkdirSync(binDir, { recursive: true });
    makeFakeYtDlpScript(binDir);

    const savedPath = process.env.PATH ?? '';
    process.env.PATH = [binDir, savedPath].join(path.delimiter);
    process.env.YTDLP_FAKE_MODE = mode;
    try {
      return await fn(root, binDir);
    } finally {
      process.env.PATH = savedPath;
      delete process.env.YTDLP_FAKE_MODE;
    }
  });
}
/**
 * Runs `fn` with the given `YTDLP_EXPECT_*` environment variables applied,
 * then restores all three variables to what they were before (deleting the
 * ones that were unset).
 */
async function withFakeYtDlpExpectations<T>(
  expectations: Partial<Record<'YTDLP_EXPECT_AUTO_SUBS' | 'YTDLP_EXPECT_MANUAL_SUBS' | 'YTDLP_EXPECT_SUB_LANG', string>>,
  fn: () => Promise<T>,
): Promise<T> {
  const trackedKeys = [
    'YTDLP_EXPECT_AUTO_SUBS',
    'YTDLP_EXPECT_MANUAL_SUBS',
    'YTDLP_EXPECT_SUB_LANG',
  ] as const;
  const saved = new Map<string, string | undefined>();
  for (const key of trackedKeys) {
    saved.set(key, process.env[key]);
  }

  Object.assign(process.env, expectations);
  try {
    return await fn();
  } finally {
    saved.forEach((value, key) => {
      if (value === undefined) {
        delete process.env[key];
        return;
      }
      process.env[key] = value;
    });
  }
}
/**
 * Runs `fn` with `globalThis.fetch` replaced by a stub that hands the
 * requested URL (as a string) to `handler`; the real `fetch` is restored
 * afterwards.
 */
async function withStubFetch<T>(
  handler: (url: string) => Promise<Response> | Response,
  fn: () => Promise<T>,
): Promise<T> {
  const toUrlString = (input: string | URL | Request): string => {
    if (typeof input === 'string') {
      return input;
    }
    if (input instanceof URL) {
      return input.toString();
    }
    return input.url;
  };

  const realFetch = globalThis.fetch;
  globalThis.fetch = (async (input: string | URL | Request) => {
    return await handler(toUrlString(input));
  }) as typeof fetch;
  try {
    return await fn();
  } finally {
    globalThis.fetch = realFetch;
  }
}
// With both a .vtt and a .webp produced, the subtitle file must be the one returned.
test('downloadYoutubeSubtitleTrack prefers subtitle files over later webp artifacts', async () => {
  // Skipped on Windows (presumably because the fake yt-dlp is a POSIX-style script).
  if (process.platform === 'win32') return;

  await withFakeYtDlp('both', async (root) => {
    const outputDir = path.join(root, 'out');
    const { path: subtitlePath } = await downloadYoutubeSubtitleTrack({
      targetUrl: 'https://www.youtube.com/watch?v=abc123',
      outputDir,
      track: {
        id: 'auto:ja-orig',
        language: 'ja',
        sourceLanguage: 'ja-orig',
        kind: 'auto',
        label: 'Japanese (auto)',
      },
    });

    assert.equal(path.extname(subtitlePath), '.vtt');
    assert.match(path.basename(subtitlePath), /^auto-ja-orig\./);
  });
});
// A subtitle left behind by an earlier run must not be mistaken for a fresh download.
test('downloadYoutubeSubtitleTrack ignores stale subtitle files from prior runs', async () => {
  // Skipped on Windows (presumably because the fake yt-dlp is a POSIX-style script).
  if (process.platform === 'win32') return;

  await withFakeYtDlp('webp-only', async (root) => {
    const outputDir = path.join(root, 'out');
    fs.mkdirSync(outputDir, { recursive: true });
    const stalePath = path.join(outputDir, 'auto-ja.vtt');
    fs.writeFileSync(stalePath, 'stale subtitle');

    const attemptDownload = async () =>
      await downloadYoutubeSubtitleTrack({
        targetUrl: 'https://www.youtube.com/watch?v=abc123',
        outputDir,
        track: {
          id: 'auto:ja',
          language: 'ja',
          sourceLanguage: 'ja',
          kind: 'auto',
          label: 'Japanese (auto)',
        },
      });

    await assert.rejects(attemptDownload, /No subtitle file was downloaded/);
  });
});
// The fake yt-dlp exits non-zero unless it sees --write-auto-subs and the raw
// source language `ja-orig`, so a successful download proves both were passed.
test('downloadYoutubeSubtitleTrack uses auto subtitle flags and raw source language for auto tracks', async () => {
  // Skipped on Windows (presumably because the fake yt-dlp is a POSIX-style script).
  if (process.platform === 'win32') return;

  const expectations = {
    YTDLP_EXPECT_AUTO_SUBS: '1',
    YTDLP_EXPECT_SUB_LANG: 'ja-orig',
  };
  await withFakeYtDlp('both', async (root) => {
    await withFakeYtDlpExpectations(expectations, async () => {
      const { path: subtitlePath } = await downloadYoutubeSubtitleTrack({
        targetUrl: 'https://www.youtube.com/watch?v=abc123',
        outputDir: path.join(root, 'out'),
        track: {
          id: 'auto:ja-orig',
          language: 'ja',
          sourceLanguage: 'ja-orig',
          kind: 'auto',
          label: 'Japanese (auto)',
        },
      });
      assert.equal(path.extname(subtitlePath), '.vtt');
    });
  });
});
// The fake yt-dlp exits non-zero unless it sees --write-subs and language `ja`,
// so a successful download proves both were passed.
test('downloadYoutubeSubtitleTrack keeps manual subtitle flag for manual tracks', async () => {
  // Skipped on Windows (presumably because the fake yt-dlp is a POSIX-style script).
  if (process.platform === 'win32') return;

  const expectations = {
    YTDLP_EXPECT_MANUAL_SUBS: '1',
    YTDLP_EXPECT_SUB_LANG: 'ja',
  };
  await withFakeYtDlp('both', async (root) => {
    await withFakeYtDlpExpectations(expectations, async () => {
      const { path: subtitlePath } = await downloadYoutubeSubtitleTrack({
        targetUrl: 'https://www.youtube.com/watch?v=abc123',
        outputDir: path.join(root, 'out'),
        track: {
          id: 'manual:ja',
          language: 'ja',
          sourceLanguage: 'ja',
          kind: 'manual',
          label: 'Japanese (manual)',
        },
      });
      assert.equal(path.extname(subtitlePath), '.vtt');
    });
  });
});
// Rolling (cumulative) cues written by yt-dlp must be reduced to their new tails on disk.
test('downloadYoutubeSubtitleTrack normalizes rolling auto-caption vtt output from yt-dlp', async () => {
  // Skipped on Windows (presumably because the fake yt-dlp is a POSIX-style script).
  if (process.platform === 'win32') return;

  const expectedLines = [
    'WEBVTT',
    '',
    '00:00:01.000 --> 00:00:02.000',
    '今日は',
    '',
    '00:00:02.000 --> 00:00:03.000',
    'いい天気ですね',
    '',
    '00:00:03.000 --> 00:00:04.000',
    '本当に',
    '',
  ];
  await withFakeYtDlp('rolling-auto', async (root) => {
    const { path: subtitlePath } = await downloadYoutubeSubtitleTrack({
      targetUrl: 'https://www.youtube.com/watch?v=abc123',
      outputDir: path.join(root, 'out'),
      track: {
        id: 'auto:ja-orig',
        language: 'ja',
        sourceLanguage: 'ja-orig',
        kind: 'auto',
        label: 'Japanese (auto)',
      },
    });
    assert.equal(fs.readFileSync(subtitlePath, 'utf8'), expectedLines.join('\n'));
  });
});
// With a downloadUrl on the track, the file must come from fetch (no fake yt-dlp is installed here).
test('downloadYoutubeSubtitleTrack prefers direct download URL when available', async () => {
  const directUrl = 'https://example.com/subs/ja.vtt';
  await withTempDir(async (root) => {
    const respond = async (url: string) => {
      assert.equal(url, 'https://example.com/subs/ja.vtt');
      return new Response('WEBVTT\n', { status: 200 });
    };
    await withStubFetch(respond, async () => {
      const { path: subtitlePath } = await downloadYoutubeSubtitleTrack({
        targetUrl: 'https://www.youtube.com/watch?v=abc123',
        outputDir: path.join(root, 'out'),
        track: {
          id: 'auto:ja-orig',
          language: 'ja',
          sourceLanguage: 'ja-orig',
          kind: 'auto',
          label: 'Japanese (auto)',
          downloadUrl: directUrl,
          fileExtension: 'vtt',
        },
      });
      assert.equal(path.basename(subtitlePath), 'auto-ja-orig.ja-orig.vtt');
      assert.equal(fs.readFileSync(subtitlePath, 'utf8'), 'WEBVTT\n');
    });
  });
});
// Path-traversal sequences in the track id / source language must not escape the output directory.
test('downloadYoutubeSubtitleTrack sanitizes metadata source language in filenames', async () => {
  await withTempDir(async (root) => {
    const outputDir = path.join(root, 'out');
    const respond = async () => new Response('WEBVTT\n', { status: 200 });
    await withStubFetch(respond, async () => {
      const { path: subtitlePath } = await downloadYoutubeSubtitleTrack({
        targetUrl: 'https://www.youtube.com/watch?v=abc123',
        outputDir,
        track: {
          id: 'auto:../../ja-orig',
          language: 'ja',
          sourceLanguage: '../ja-orig/../../evil',
          kind: 'auto',
          label: 'Japanese (auto)',
          downloadUrl: 'https://example.com/subs/ja.vtt',
          fileExtension: 'vtt',
        },
      });
      assert.equal(path.dirname(subtitlePath), path.join(root, 'out'));
      assert.equal(path.basename(subtitlePath), 'auto-ja-orig.ja-orig-evil.vtt');
    });
  });
});
// An srv3 (timedtext XML) download must be stored as a .vtt with overlaps
// clamped and rolling prefixes removed.
test('downloadYoutubeSubtitleTrack converts srv3 auto subtitles into regular vtt', async () => {
  const srv3Body = [
    '<timedtext><body>',
    '<p t="1000" d="2500">今日は</p>',
    '<p t="2000" d="2500">今日はいい天気ですね</p>',
    '<p t="3500" d="2500">今日はいい天気ですね本当に</p>',
    '</body></timedtext>',
  ].join('');
  const expectedVtt = [
    'WEBVTT',
    '',
    '00:00:01.000 --> 00:00:01.999',
    '今日は',
    '',
    '00:00:02.000 --> 00:00:03.499',
    'いい天気ですね',
    '',
    '00:00:03.500 --> 00:00:06.000',
    '本当に',
    '',
  ].join('\n');

  await withTempDir(async (root) => {
    const respond = async (url: string) => {
      assert.equal(url, 'https://example.com/subs/ja.srv3');
      return new Response(srv3Body, { status: 200 });
    };
    await withStubFetch(respond, async () => {
      const { path: subtitlePath } = await downloadYoutubeSubtitleTrack({
        targetUrl: 'https://www.youtube.com/watch?v=abc123',
        outputDir: path.join(root, 'out'),
        track: {
          id: 'auto:ja-orig',
          language: 'ja',
          sourceLanguage: 'ja-orig',
          kind: 'auto',
          label: 'Japanese (auto)',
          downloadUrl: 'https://example.com/subs/ja.srv3',
          fileExtension: 'srv3',
        },
      });
      assert.equal(path.basename(subtitlePath), 'auto-ja-orig.ja-orig.vtt');
      assert.equal(fs.readFileSync(subtitlePath, 'utf8'), expectedVtt);
    });
  });
});
// Both requested languages must come back, each mapped to its own per-language file.
test('downloadYoutubeSubtitleTracks downloads primary and secondary in one invocation', async () => {
  // Skipped on Windows (presumably because the fake yt-dlp is a POSIX-style script).
  if (process.platform === 'win32') return;

  await withFakeYtDlp('multi', async (root) => {
    const outputDir = path.join(root, 'out');
    const downloaded = await downloadYoutubeSubtitleTracks({
      targetUrl: 'https://www.youtube.com/watch?v=abc123',
      outputDir,
      tracks: [
        {
          id: 'auto:ja-orig',
          language: 'ja',
          sourceLanguage: 'ja-orig',
          kind: 'auto',
          label: 'Japanese (auto)',
        },
        {
          id: 'auto:en',
          language: 'en',
          sourceLanguage: 'en',
          kind: 'auto',
          label: 'English (auto)',
        },
      ],
    });

    const primaryName = path.basename(downloaded.get('auto:ja-orig') ?? '');
    const secondaryName = path.basename(downloaded.get('auto:en') ?? '');
    assert.match(primaryName, /\.ja-orig\.vtt$/);
    assert.match(secondaryName, /\.en\.vtt$/);
  });
});
// When yt-dlp fails after writing only the primary file, that file must still be reported.
test('downloadYoutubeSubtitleTracks preserves successfully downloaded primary file on partial failure', async () => {
  // Skipped on Windows (presumably because the fake yt-dlp is a POSIX-style script).
  if (process.platform === 'win32') return;

  await withFakeYtDlp('multi-primary-only-fail', async (root) => {
    const outputDir = path.join(root, 'out');
    const downloaded = await downloadYoutubeSubtitleTracks({
      targetUrl: 'https://www.youtube.com/watch?v=abc123',
      outputDir,
      tracks: [
        {
          id: 'auto:ja-orig',
          language: 'ja',
          sourceLanguage: 'ja-orig',
          kind: 'auto',
          label: 'Japanese (auto)',
        },
        {
          id: 'auto:en',
          language: 'en',
          sourceLanguage: 'en',
          kind: 'auto',
          label: 'English (auto)',
        },
      ],
    });

    const primaryName = path.basename(downloaded.get('auto:ja-orig') ?? '');
    assert.match(primaryName, /\.ja-orig\.vtt$/);
    assert.equal(downloaded.has('auto:en'), false);
  });
});
// Every track with a downloadUrl must be fetched directly, in the order given.
test('downloadYoutubeSubtitleTracks prefers direct download URLs when available', async () => {
  await withTempDir(async (root) => {
    const requestedUrls: string[] = [];
    const respond = async (url: string) => {
      requestedUrls.push(url);
      return new Response(`WEBVTT\n${url}\n`, { status: 200 });
    };

    await withStubFetch(respond, async () => {
      const downloaded = await downloadYoutubeSubtitleTracks({
        targetUrl: 'https://www.youtube.com/watch?v=abc123',
        outputDir: path.join(root, 'out'),
        tracks: [
          {
            id: 'auto:ja-orig',
            language: 'ja',
            sourceLanguage: 'ja-orig',
            kind: 'auto',
            label: 'Japanese (auto)',
            downloadUrl: 'https://example.com/subs/ja.vtt',
            fileExtension: 'vtt',
          },
          {
            id: 'auto:en',
            language: 'en',
            sourceLanguage: 'en',
            kind: 'auto',
            label: 'English (auto)',
            downloadUrl: 'https://example.com/subs/en.vtt',
            fileExtension: 'vtt',
          },
        ],
      });

      assert.deepEqual(requestedUrls, [
        'https://example.com/subs/ja.vtt',
        'https://example.com/subs/en.vtt',
      ]);
      assert.match(path.basename(downloaded.get('auto:ja-orig') ?? ''), /\.ja-orig\.vtt$/);
      assert.match(path.basename(downloaded.get('auto:en') ?? ''), /\.en\.vtt$/);
    });
  });
});
// Two tracks sharing a source language (auto + manual) must not collapse onto one file path.
test('downloadYoutubeSubtitleTracks keeps duplicate source-language direct downloads distinct', async () => {
  await withTempDir(async (root) => {
    const requestedUrls: string[] = [];
    const respond = async (url: string) => {
      requestedUrls.push(url);
      return new Response(`WEBVTT\n${url}\n`, { status: 200 });
    };

    await withStubFetch(respond, async () => {
      const downloaded = await downloadYoutubeSubtitleTracks({
        targetUrl: 'https://www.youtube.com/watch?v=abc123',
        outputDir: path.join(root, 'out'),
        tracks: [
          {
            id: 'auto:ja-orig',
            language: 'ja',
            sourceLanguage: 'ja-orig',
            kind: 'auto',
            label: 'Japanese (auto)',
            downloadUrl: 'https://example.com/subs/ja-auto.vtt',
            fileExtension: 'vtt',
          },
          {
            id: 'manual:ja-orig',
            language: 'ja',
            sourceLanguage: 'ja-orig',
            kind: 'manual',
            label: 'Japanese (manual)',
            downloadUrl: 'https://example.com/subs/ja-manual.vtt',
            fileExtension: 'vtt',
          },
        ],
      });

      assert.deepEqual(requestedUrls, [
        'https://example.com/subs/ja-auto.vtt',
        'https://example.com/subs/ja-manual.vtt',
      ]);
      const autoPath = downloaded.get('auto:ja-orig');
      const manualPath = downloaded.get('manual:ja-orig');
      assert.notEqual(autoPath, manualPath);
    });
  });
});

View File

@@ -0,0 +1,315 @@
import fs from 'node:fs';
import path from 'node:path';
import { spawn } from 'node:child_process';
import type { YoutubeTrackOption } from './track-probe';
import {
convertYoutubeTimedTextToVtt,
isYoutubeTimedTextExtension,
normalizeYoutubeAutoVtt,
} from './timedtext';
// Lower-case file extensions (with leading dot) that count as subtitle files when scanning a directory.
const YOUTUBE_SUBTITLE_EXTENSIONS = new Set(['.srt', '.vtt', '.ass']);
// NOTE(review): presumably the filename prefix used for multi-track (batch) downloads — its use is outside this view; confirm.
const YOUTUBE_BATCH_PREFIX = 'youtube-batch';
// Default time budget (ms) for the yt-dlp helpers (runCapture / runCaptureDetailed) in this module.
const YOUTUBE_DOWNLOAD_TIMEOUT_MS = 15_000;
/**
 * Makes a string safe to use as part of a filename.
 *
 * Every run of characters other than ASCII letters, digits, `_` and `-`
 * becomes a single `-`, repeated hyphens collapse, and leading/trailing
 * hyphens are removed. An empty result becomes `unknown`.
 */
function sanitizeFilenameSegment(value: string): string {
  const hyphenated = value.trim().replace(/[^a-z0-9_-]+/gi, '-');
  const collapsed = hyphenated.replace(/-+/g, '-');
  const stripped = collapsed.replace(/^-+|-+$/g, '');
  return stripped === '' ? 'unknown' : stripped;
}
/**
 * Returns an `AbortSignal` that aborts after `timeoutMs`, or `undefined` when
 * the runtime does not provide `AbortSignal.timeout`.
 */
function createFetchTimeoutSignal(timeoutMs: number): AbortSignal | undefined {
  const supportsTimeout =
    typeof AbortSignal !== 'undefined' && typeof AbortSignal.timeout === 'function';
  return supportsTimeout ? AbortSignal.timeout(timeoutMs) : undefined;
}
/**
 * Spawns `command` with `args` (stdin ignored) and collects stdout/stderr as
 * UTF-8 text; succeeds only on exit code 0.
 *
 * @param timeoutMs - After this many ms the child is killed and the promise
 *   rejects with `yt-dlp timed out after <timeoutMs>ms`.
 * @throws Rejects with the spawn error, or — on a non-zero/unknown exit — with
 *   the trimmed stderr (falling back to `yt-dlp exited with status <code>`).
 */
function runCapture(
  command: string,
  args: string[],
  timeoutMs = YOUTUBE_DOWNLOAD_TIMEOUT_MS,
): Promise<{ stdout: string; stderr: string }> {
  return new Promise((resolve, reject) => {
    const child = spawn(command, args, { stdio: ['ignore', 'pipe', 'pipe'] });
    const captured = { stdout: '', stderr: '' };

    const onTimeout = (): void => {
      child.kill();
      reject(new Error(`yt-dlp timed out after ${timeoutMs}ms`));
    };
    const killTimer = setTimeout(onTimeout, timeoutMs);

    child.stdout.setEncoding('utf8');
    child.stderr.setEncoding('utf8');
    child.stdout.on('data', (chunk) => {
      captured.stdout += String(chunk);
    });
    child.stderr.on('data', (chunk) => {
      captured.stderr += String(chunk);
    });

    child.once('error', (error) => {
      clearTimeout(killTimer);
      reject(error);
    });
    child.once('close', (exitCode) => {
      clearTimeout(killTimer);
      if (exitCode !== 0) {
        const fallback = `yt-dlp exited with status ${exitCode ?? 'unknown'}`;
        reject(new Error(captured.stderr.trim() || fallback));
        return;
      }
      resolve({ stdout: captured.stdout, stderr: captured.stderr });
    });
  });
}
/**
 * Variant of the capture helper that never rejects on a non-zero exit.
 * It resolves with the exit code so the caller can decide what counts as failure.
 *
 * @param command Executable to spawn.
 * @param args Arguments passed to the executable.
 * @param timeoutMs Milliseconds after which the child is killed and the promise rejects.
 * @returns Captured `stdout`, `stderr` and the exit `code` (1 when the
 *   process closed without a numeric code).
 * @throws Rejects only on spawn errors or on timeout.
 */
function runCaptureDetailed(
  command: string,
  args: string[],
  timeoutMs = YOUTUBE_DOWNLOAD_TIMEOUT_MS,
): Promise<{ stdout: string; stderr: string; code: number }> {
  return new Promise((resolve, reject) => {
    const child = spawn(command, args, { stdio: ['ignore', 'pipe', 'pipe'] });
    let outText = '';
    let errText = '';

    // Watchdog: kill the child and fail if it runs longer than allowed.
    const watchdog = setTimeout(() => {
      child.kill();
      reject(new Error(`yt-dlp timed out after ${timeoutMs}ms`));
    }, timeoutMs);

    child.stdout.setEncoding('utf8');
    child.stderr.setEncoding('utf8');
    child.stdout.on('data', (chunk) => {
      outText += String(chunk);
    });
    child.stderr.on('data', (chunk) => {
      errText += String(chunk);
    });

    child.once('error', (error) => {
      clearTimeout(watchdog);
      reject(error);
    });
    child.once('close', (code) => {
      clearTimeout(watchdog);
      const exitCode = code ?? 1;
      resolve({ stdout: outText, stderr: errText, code: exitCode });
    });
  });
}
/**
 * Finds the most recently modified subtitle file in `dir` that belongs to `prefix`.
 *
 * A file belongs to the prefix only when its name starts with `${prefix}.`.
 * The dot boundary matters: without it, prefix `auto-ja` would also match
 * files written for `auto-ja-orig`. This is the same rule the
 * language-specific picker in this module applies.
 *
 * @param dir Directory to scan (must exist).
 * @param prefix Filename stem the download was written under.
 * @returns Absolute-or-relative path (joined onto `dir`) of the newest
 *   matching file with a known subtitle extension, or `null` when none match.
 */
function pickLatestSubtitleFile(dir: string, prefix: string): string | null {
  const ownedStart = `${prefix}.`;
  let newest: { file: string; mtimeMs: number } | null = null;

  for (const name of fs.readdirSync(dir)) {
    const ext = path.extname(name).toLowerCase();
    if (!name.startsWith(ownedStart) || !YOUTUBE_SUBTITLE_EXTENSIONS.has(ext)) {
      continue;
    }
    const file = path.join(dir, name);
    // Stat each candidate once instead of on every sort comparison.
    const mtimeMs = fs.statSync(file).mtimeMs;
    // Strict `>` keeps the first-listed file on mtime ties.
    if (newest === null || mtimeMs > newest.mtimeMs) {
      newest = { file, mtimeMs };
    }
  }

  return newest?.file ?? null;
}
/**
 * Finds the most recently modified subtitle file for one language within a
 * prefixed batch of downloads.
 *
 * A file qualifies when its name starts with `${prefix}.`, contains
 * `.${sourceLanguage}.` and ends in a known subtitle extension.
 *
 * @param dir Directory to scan (must exist).
 * @param prefix Filename stem shared by the batch.
 * @param sourceLanguage Language code expected as a dot-delimited name segment.
 * @returns Path (joined onto `dir`) of the newest match, or `null` when none match.
 */
function pickLatestSubtitleFileForLanguage(
  dir: string,
  prefix: string,
  sourceLanguage: string,
): string | null {
  const requiredStart = `${prefix}.`;
  const languageSegment = `.${sourceLanguage}.`;

  const newestFirst = fs
    .readdirSync(dir)
    .filter((name) => {
      if (!name.startsWith(requiredStart)) return false;
      if (!name.includes(languageSegment)) return false;
      return YOUTUBE_SUBTITLE_EXTENSIONS.has(path.extname(name).toLowerCase());
    })
    .map((name) => {
      const file = path.join(dir, name);
      return { file, mtimeMs: fs.statSync(file).mtimeMs };
    })
    .sort((left, right) => right.mtimeMs - left.mtimeMs);

  const winner = newestFirst[0];
  return winner ? winner.file : null;
}
/**
 * Assembles the yt-dlp command line for a subtitles-only download.
 *
 * @param input.targetUrl Video URL, emitted as the final argument.
 * @param input.outputTemplate Value passed to `-o`.
 * @param input.sourceLanguages Language codes, comma-joined for `--sub-langs`.
 * @param input.includeAutoSubs Adds `--write-auto-subs` when true.
 * @param input.includeManualSubs Adds `--write-subs` when true.
 * @returns The argument list, in the order yt-dlp is invoked with.
 */
function buildDownloadArgs(input: {
  targetUrl: string;
  outputTemplate: string;
  sourceLanguages: string[];
  includeAutoSubs: boolean;
  includeManualSubs: boolean;
}): string[] {
  const { targetUrl, outputTemplate, sourceLanguages, includeAutoSubs, includeManualSubs } = input;
  return [
    '--skip-download',
    '--no-warnings',
    ...(includeAutoSubs ? ['--write-auto-subs'] : []),
    ...(includeManualSubs ? ['--write-subs'] : []),
    '--sub-format',
    'srt/vtt/best',
    '--sub-langs',
    sourceLanguages.join(','),
    '-o',
    outputTemplate,
    targetUrl,
  ];
}
/**
 * Fetches a track's subtitle straight from its `downloadUrl` and writes it to
 * `${prefix}.${language}.${ext}` inside `outputDir`.
 *
 * Timed-text payloads are converted to VTT, and auto-generated VTT is passed
 * through the auto-caption normalizer; anything else is written as received.
 *
 * @param input.outputDir Existing directory to write into.
 * @param input.prefix Filename stem for the written file.
 * @param input.track Track to download; must carry a `downloadUrl`.
 * @returns The path of the written file.
 * @throws When the track has no `downloadUrl` or the HTTP response is not OK.
 */
async function downloadSubtitleFromUrl(input: {
  outputDir: string;
  prefix: string;
  track: YoutubeTrackOption;
}): Promise<{ path: string }> {
  const { outputDir, prefix, track } = input;
  const sourceUrl = track.downloadUrl;
  if (!sourceUrl) {
    throw new Error(`No direct subtitle URL available for ${track.sourceLanguage}`);
  }

  // Reduce the declared extension to bare lowercase alphanumerics; default to vtt.
  const declaredExt = (track.fileExtension?.trim().toLowerCase() || 'vtt').replace(
    /[^a-z0-9]+/g,
    '',
  );
  const isTimedText = isYoutubeTimedTextExtension(declaredExt);

  // Timed-text is stored as vtt after conversion; unknown extensions also fall back to vtt.
  let safeExt = 'vtt';
  if (!isTimedText && YOUTUBE_SUBTITLE_EXTENSIONS.has(`.${declaredExt}`)) {
    safeExt = declaredExt;
  }

  const fileName = `${prefix}.${sanitizeFilenameSegment(track.sourceLanguage)}.${safeExt}`;
  const targetPath = path.join(outputDir, fileName);

  const response = await fetch(sourceUrl, {
    signal: createFetchTimeoutSignal(YOUTUBE_DOWNLOAD_TIMEOUT_MS),
  });
  if (!response.ok) {
    throw new Error(`HTTP ${response.status} while downloading ${track.sourceLanguage}`);
  }

  const rawBody = await response.text();
  let contents = rawBody;
  if (isTimedText) {
    contents = convertYoutubeTimedTextToVtt(rawBody);
  } else if (track.kind === 'auto' && safeExt === 'vtt') {
    contents = normalizeYoutubeAutoVtt(rawBody);
  }

  fs.writeFileSync(targetPath, contents, 'utf8');
  return { path: targetPath };
}
/**
 * Tells whether a track can be fetched directly over HTTP instead of via yt-dlp.
 *
 * @param track Candidate track.
 * @returns True when the track has a `downloadUrl` and its extension
 *   (default `vtt`) is either a timed-text format or a known subtitle extension.
 */
function canDownloadSubtitleFromUrl(track: YoutubeTrackOption): boolean {
  if (track.downloadUrl) {
    const declared = track.fileExtension?.trim().toLowerCase() || 'vtt';
    const ext = declared.replace(/[^a-z0-9]+/g, '');
    return isYoutubeTimedTextExtension(ext) || YOUTUBE_SUBTITLE_EXTENSIONS.has(`.${ext}`);
  }
  return false;
}
/**
 * Rewrites an auto-generated `.vtt` file in place with its normalized form.
 * Files of manual tracks and files with any other extension are left untouched,
 * and nothing is written when normalization does not change the content.
 *
 * @param pathname Path of the downloaded subtitle file.
 * @param track Track the file was downloaded for.
 */
function normalizeDownloadedAutoSubtitle(pathname: string, track: YoutubeTrackOption): void {
  const isAutoVtt = track.kind === 'auto' && path.extname(pathname).toLowerCase() === '.vtt';
  if (!isAutoVtt) {
    return;
  }
  const original = fs.readFileSync(pathname, 'utf8');
  const cleaned = normalizeYoutubeAutoVtt(original);
  if (cleaned === original) {
    return;
  }
  fs.writeFileSync(pathname, cleaned, 'utf8');
}
/**
 * Downloads one subtitle track into `outputDir`.
 *
 * The track is fetched directly from its URL when that is possible; otherwise
 * yt-dlp is invoked for the track's language and the newest file it produced
 * is returned (auto-generated VTT is normalized in place).
 *
 * @param input.targetUrl Video URL handed to yt-dlp.
 * @param input.outputDir Destination directory; created if missing.
 * @param input.track Track to download.
 * @returns The path of the downloaded subtitle file.
 * @throws When the direct download or yt-dlp fails, or when yt-dlp produced
 *   no subtitle file.
 */
export async function downloadYoutubeSubtitleTrack(input: {
  targetUrl: string;
  outputDir: string;
  track: YoutubeTrackOption;
}): Promise<{ path: string }> {
  fs.mkdirSync(input.outputDir, { recursive: true });
  const prefix = input.track.id.replace(/[^a-z0-9_-]+/gi, '-');

  // Every file this function writes is named `${prefix}.<...>`, so stale files
  // are matched with the trailing dot. Matching on the bare prefix would also
  // delete files of other tracks whose id merely starts with this one
  // (e.g. `auto-ja` vs `auto-ja-orig`).
  const ownedFilePrefix = `${prefix}.`;
  for (const name of fs.readdirSync(input.outputDir)) {
    if (!name.startsWith(ownedFilePrefix)) {
      continue;
    }
    try {
      fs.rmSync(path.join(input.outputDir, name), { force: true });
    } catch {
      // Best effort: a stale file that cannot be removed is not fatal.
    }
  }

  if (canDownloadSubtitleFromUrl(input.track)) {
    return await downloadSubtitleFromUrl({
      outputDir: input.outputDir,
      prefix,
      track: input.track,
    });
  }

  // Fallback: let yt-dlp fetch the subtitle for this single language.
  const outputTemplate = path.join(input.outputDir, `${prefix}.%(ext)s`);
  const args = buildDownloadArgs({
    targetUrl: input.targetUrl,
    outputTemplate,
    sourceLanguages: [input.track.sourceLanguage],
    includeAutoSubs: input.track.kind === 'auto',
    includeManualSubs: input.track.kind === 'manual',
  });
  await runCapture('yt-dlp', args);

  const subtitlePath = pickLatestSubtitleFile(input.outputDir, prefix);
  if (!subtitlePath) {
    throw new Error(`No subtitle file was downloaded for ${input.track.sourceLanguage}`);
  }
  normalizeDownloadedAutoSubtitle(subtitlePath, input.track);
  return { path: subtitlePath };
}
/**
 * Downloads several subtitle tracks into `outputDir`.
 *
 * Tracks are fetched one by one from their direct URLs when either every
 * track supports that, or two tracks share a `sourceLanguage`. Otherwise a
 * single yt-dlp run downloads all languages and the resulting files are
 * matched back to tracks by language.
 *
 * @param input.targetUrl Video URL handed to yt-dlp.
 * @param input.outputDir Destination directory; created if missing.
 * @param input.tracks Tracks to download.
 * @returns Map from track id to downloaded file path. In the yt-dlp path,
 *   tracks without a matching file are simply absent from the map.
 * @throws When a direct download fails, or when the yt-dlp run yields no
 *   file at all (the message is yt-dlp's stderr/status on a non-zero exit).
 */
export async function downloadYoutubeSubtitleTracks(input: {
  targetUrl: string;
  outputDir: string;
  tracks: YoutubeTrackOption[];
}): Promise<Map<string, string>> {
  const { targetUrl, outputDir, tracks } = input;
  fs.mkdirSync(outputDir, { recursive: true });

  const sourceLanguages = tracks.map((track) => track.sourceLanguage);
  const hasDuplicateSourceLanguages = new Set(sourceLanguages).size !== tracks.length;

  // Clear leftovers of a previous batch run so they cannot be mistaken for fresh output.
  const batchFilePrefix = `${YOUTUBE_BATCH_PREFIX}.`;
  const staleBatchFiles = fs
    .readdirSync(outputDir)
    .filter((name) => name.startsWith(batchFilePrefix));
  for (const name of staleBatchFiles) {
    try {
      fs.rmSync(path.join(outputDir, name), { force: true });
    } catch {
      // ignore stale files
    }
  }

  const downloadedPaths = new Map<string, string>();

  if (hasDuplicateSourceLanguages || tracks.every(canDownloadSubtitleFromUrl)) {
    // Direct path: one HTTP download per track, sequentially.
    for (const track of tracks) {
      const { path: savedPath } = await downloadSubtitleFromUrl({
        outputDir,
        prefix: track.id.replace(/[^a-z0-9_-]+/gi, '-'),
        track,
      });
      downloadedPaths.set(track.id, savedPath);
    }
    return downloadedPaths;
  }

  // Batch path: a single yt-dlp run for all requested languages.
  const ytDlpRun = await runCaptureDetailed(
    'yt-dlp',
    buildDownloadArgs({
      targetUrl,
      outputTemplate: path.join(outputDir, `${YOUTUBE_BATCH_PREFIX}.%(ext)s`),
      sourceLanguages,
      includeAutoSubs: tracks.some((track) => track.kind === 'auto'),
      includeManualSubs: tracks.some((track) => track.kind === 'manual'),
    }),
  );

  for (const track of tracks) {
    const matchedFile = pickLatestSubtitleFileForLanguage(
      outputDir,
      YOUTUBE_BATCH_PREFIX,
      track.sourceLanguage,
    );
    if (!matchedFile) {
      continue;
    }
    normalizeDownloadedAutoSubtitle(matchedFile, track);
    downloadedPaths.set(track.id, matchedFile);
  }

  // A partially successful run is still a success, whatever the exit code was.
  if (downloadedPaths.size > 0) {
    return downloadedPaths;
  }

  const failureMessage =
    ytDlpRun.code !== 0
      ? ytDlpRun.stderr.trim() || `yt-dlp exited with status ${ytDlpRun.code}`
      : `No subtitle file was downloaded for ${sourceLanguages.join(',')}`;
  throw new Error(failureMessage);
}

View File

@@ -0,0 +1,99 @@
import test from 'node:test';
import assert from 'node:assert/strict';
import fs from 'node:fs';
import os from 'node:os';
import path from 'node:path';
import { probeYoutubeTracks } from './track-probe';
/**
 * Runs `fn` with a freshly created temporary directory and removes that
 * directory afterwards, whether `fn` succeeds or fails.
 *
 * @param fn Callback receiving the directory path.
 * @returns Whatever `fn` resolves to.
 */
async function withTempDir<T>(fn: (dir: string) => Promise<T>): Promise<T> {
  const scratchDir = fs.mkdtempSync(path.join(os.tmpdir(), 'subminer-youtube-track-probe-'));
  const removeScratchDir = (): void => {
    fs.rmSync(scratchDir, { recursive: true, force: true });
  };
  try {
    const outcome = await fn(scratchDir);
    return outcome;
  } finally {
    removeScratchDir();
  }
}
/**
 * Writes a stand-in `yt-dlp` executable (plus a `yt-dlp.cmd` wrapper) into `dir`.
 *
 * By default the script is a Node program that prints `payload` to stdout
 * (strings verbatim, anything else as JSON). With `rawScript` the string form
 * of `payload` is written as the script itself.
 *
 * @param dir Directory to place the fake binary in.
 * @param payload Output to emit, or the script source when `rawScript` is true.
 * @param rawScript Treat `payload` as the complete script text.
 */
function makeFakeYtDlpScript(dir: string, payload: unknown, rawScript = false): void {
  const target = path.join(dir, 'yt-dlp');
  const body = typeof payload === 'string' ? payload : JSON.stringify(payload);

  let contents = body;
  if (!rawScript) {
    contents = [
      '#!/usr/bin/env node',
      `process.stdout.write(${JSON.stringify(body)});`,
      '',
    ].join('\n');
  }

  fs.writeFileSync(target, contents, 'utf8');
  // The executable bit is only set off Windows; the .cmd wrapper is written on every platform.
  if (process.platform !== 'win32') {
    fs.chmodSync(target, 0o755);
  }
  fs.writeFileSync(`${target}.cmd`, `@echo off\r\nnode "${target}"\r\n`, 'utf8');
}
/**
 * Runs `fn` while a fake `yt-dlp` that emits `payload` is first on `PATH`.
 *
 * The fake binary lives in a temporary directory that is removed afterwards,
 * and `PATH` is put back exactly as it was, including being removed again if
 * it was unset before the call.
 *
 * @param payload Output for the fake binary (or its full source with `rawScript`).
 * @param fn Code to run against the fake binary.
 * @param options.rawScript Treat `payload` as the complete script text.
 * @returns Whatever `fn` resolves to.
 */
async function withFakeYtDlp<T>(
  payload: unknown,
  fn: () => Promise<T>,
  options: { rawScript?: boolean } = {},
): Promise<T> {
  return await withTempDir(async (root) => {
    const binDir = path.join(root, 'bin');
    fs.mkdirSync(binDir, { recursive: true });
    makeFakeYtDlpScript(binDir, payload, options.rawScript === true);

    const previousPath = process.env.PATH;
    // Avoid a trailing delimiter when PATH was unset or empty: the empty entry it
    // creates is commonly resolved as the current directory.
    process.env.PATH = previousPath ? `${binDir}${path.delimiter}${previousPath}` : binDir;
    try {
      return await fn();
    } finally {
      if (previousPath === undefined) {
        // Assigning undefined would store the string "undefined"; delete instead.
        delete process.env.PATH;
      } else {
        process.env.PATH = previousPath;
      }
    }
  });
}
// Automatic captions: when both vtt and srv3 are offered, srv3 is the one reported.
test('probeYoutubeTracks prefers srv3 over vtt for automatic captions', async () => {
  const fakeYtDlpInfo = {
    id: 'abc123',
    title: 'Example',
    automatic_captions: {
      'ja-orig': [
        { ext: 'vtt', url: 'https://example.com/ja.vtt', name: 'Japanese auto' },
        { ext: 'srv3', url: 'https://example.com/ja.srv3', name: 'Japanese auto' },
      ],
    },
  };

  await withFakeYtDlp(fakeYtDlpInfo, async () => {
    const probed = await probeYoutubeTracks('https://www.youtube.com/watch?v=abc123');
    const firstTrack = probed.tracks[0];
    assert.equal(probed.videoId, 'abc123');
    assert.equal(firstTrack?.downloadUrl, 'https://example.com/ja.srv3');
    assert.equal(firstTrack?.fileExtension, 'srv3');
  });
});
// Manual captions: srt wins over srv3 even when srv3 is listed first.
test('probeYoutubeTracks keeps preferring srt for manual captions', async () => {
  const fakeYtDlpInfo = {
    id: 'abc123',
    title: 'Example',
    subtitles: {
      ja: [
        { ext: 'srv3', url: 'https://example.com/ja.srv3', name: 'Japanese manual' },
        { ext: 'srt', url: 'https://example.com/ja.srt', name: 'Japanese manual' },
      ],
    },
  };

  await withFakeYtDlp(fakeYtDlpInfo, async () => {
    const probed = await probeYoutubeTracks('https://www.youtube.com/watch?v=abc123');
    const firstTrack = probed.tracks[0];
    assert.equal(firstTrack?.downloadUrl, 'https://example.com/ja.srt');
    assert.equal(firstTrack?.fileExtension, 'srt');
  });
});
// A fake yt-dlp that prints non-JSON must surface a parse error with a recognizable message.
test('probeYoutubeTracks reports malformed yt-dlp JSON with context', async () => {
  const expectParseFailure = async (): Promise<void> => {
    await assert.rejects(
      () => probeYoutubeTracks('https://www.youtube.com/watch?v=abc123'),
      /Failed to parse yt-dlp output as JSON/,
    );
  };
  await withFakeYtDlp('not-json', expectParseFailure);
});

View File

@@ -0,0 +1,136 @@
import { spawn } from 'node:child_process';
import type { YoutubeTrackOption } from '../../../types';
import { formatYoutubeTrackLabel, normalizeYoutubeLangCode, type YoutubeTrackKind } from './labels';
// Default timeout (milliseconds) for the yt-dlp metadata probe.
const YOUTUBE_TRACK_PROBE_TIMEOUT_MS = 15_000;
/** Result of probing a video: its id, title and the subtitle tracks found. */
export type YoutubeTrackProbeResult = {
  videoId: string;
  title: string;
  tracks: YoutubeTrackOption[];
};
/** Format variants yt-dlp lists for one language; every field may be absent. */
type YtDlpSubtitleEntry = Array<{ ext?: string; name?: string; url?: string }>;
/**
 * The subset of yt-dlp's JSON output this module reads.
 * `subtitles` and `automatic_captions` are keyed by language code.
 */
type YtDlpInfo = {
  id?: string;
  title?: string;
  subtitles?: Record<string, YtDlpSubtitleEntry>;
  automatic_captions?: Record<string, YtDlpSubtitleEntry>;
};
/**
 * Spawns `command` with `args`, buffers its stdout/stderr as UTF-8 text and
 * settles once the process closes.
 *
 * @param command Executable to spawn.
 * @param args Arguments passed to the executable.
 * @param timeoutMs Milliseconds after which the child is killed and the promise rejects.
 * @returns The captured `stdout` and `stderr` when the exit code is 0.
 * @throws Rejects on spawn errors, on timeout, and on a non-zero exit. For a
 *   non-zero exit the message is the trimmed stderr, or a generic status
 *   message when stderr is empty.
 */
function runCapture(
  command: string,
  args: string[],
  timeoutMs = YOUTUBE_TRACK_PROBE_TIMEOUT_MS,
): Promise<{ stdout: string; stderr: string }> {
  return new Promise((resolve, reject) => {
    const child = spawn(command, args, { stdio: ['ignore', 'pipe', 'pipe'] });
    const captured = { stdout: '', stderr: '' };

    // Watchdog: kill the child and fail if it runs longer than allowed.
    const watchdog = setTimeout(() => {
      child.kill();
      reject(new Error(`yt-dlp timed out after ${timeoutMs}ms`));
    }, timeoutMs);

    child.stdout.setEncoding('utf8');
    child.stderr.setEncoding('utf8');
    child.stdout.on('data', (chunk) => {
      captured.stdout += String(chunk);
    });
    child.stderr.on('data', (chunk) => {
      captured.stderr += String(chunk);
    });

    child.once('error', (error) => {
      clearTimeout(watchdog);
      reject(error);
    });
    child.once('close', (code) => {
      clearTimeout(watchdog);
      if (code !== 0) {
        reject(
          new Error(captured.stderr.trim() || `yt-dlp exited with status ${code ?? 'unknown'}`),
        );
        return;
      }
      resolve({ stdout: captured.stdout, stderr: captured.stderr });
    });
  });
}
/**
 * Picks the format to download for one language.
 *
 * Formats are ranked by extension, with a different ranking for automatic and
 * manual captions. If none of the ranked extensions is offered, the first
 * format that has a URL is used, with its own extension (default `vtt`).
 *
 * @param formats Format variants yt-dlp listed for the language.
 * @param kind Whether these are automatic or manual captions.
 * @returns The chosen extension, URL and optional trimmed title, or `null`
 *   when no format carries a URL.
 */
function choosePreferredFormat(
  formats: YtDlpSubtitleEntry,
  kind: YoutubeTrackKind,
): { ext: string; url: string; title?: string } | null {
  const rankedExtensions =
    kind === 'auto'
      ? ['srv3', 'srv2', 'srv1', 'vtt', 'srt', 'ttml', 'json3']
      : ['srt', 'vtt', 'srv3', 'srv2', 'srv1', 'ttml', 'json3'];

  // Only formats with a non-empty string URL can be downloaded.
  const hasUrl = (format: YtDlpSubtitleEntry[number]): boolean =>
    typeof format.url === 'string' && Boolean(format.url);

  for (const ext of rankedExtensions) {
    const hit = formats.find((format) => hasUrl(format) && format.ext === ext);
    if (hit?.url) {
      return { ext, url: hit.url, title: hit.name?.trim() || undefined };
    }
  }

  // No ranked extension available: take the first downloadable format as-is.
  const firstWithUrl = formats.find((format) => hasUrl(format));
  if (!firstWithUrl?.url) {
    return null;
  }
  const fallbackExt = firstWithUrl.ext?.trim() || 'vtt';
  const fallbackTitle = firstWithUrl.name?.trim() || undefined;
  return { ext: fallbackExt, url: firstWithUrl.url, title: fallbackTitle };
}
/**
 * Converts one yt-dlp subtitle map (language code to format list) into track options.
 *
 * Languages with no format list, an empty list, or no downloadable format are skipped.
 * Each track id is `${kind}:${sourceLanguage}`.
 *
 * @param entries The `subtitles` or `automatic_captions` map; may be undefined.
 * @param kind The kind assigned to every produced track.
 * @returns One track per usable language, in the map's iteration order.
 */
function toTracks(entries: Record<string, YtDlpSubtitleEntry> | undefined, kind: YoutubeTrackKind) {
  const collected: YoutubeTrackOption[] = [];

  for (const [rawLanguage, formats] of Object.entries(entries ?? {})) {
    const hasFormats = Array.isArray(formats) && formats.length > 0;
    const chosen = hasFormats ? choosePreferredFormat(formats, kind) : null;
    if (!chosen) {
      continue;
    }

    // Keep yt-dlp's own code as sourceLanguage; `language` is its normalized form.
    const sourceLanguage = rawLanguage.trim() || rawLanguage;
    const language = normalizeYoutubeLangCode(sourceLanguage) || sourceLanguage;
    const { title } = chosen;

    collected.push({
      id: `${kind}:${sourceLanguage}`,
      language,
      sourceLanguage,
      kind,
      title,
      label: formatYoutubeTrackLabel({ language, kind, title }),
      downloadUrl: chosen.url,
      fileExtension: chosen.ext,
    });
  }

  return collected;
}
export type { YoutubeTrackOption };
/**
 * Asks yt-dlp for a video's metadata and lists its subtitle tracks.
 *
 * Manual tracks come first in the result, followed by automatic captions.
 *
 * @param targetUrl Video URL to probe.
 * @returns The video id and title (empty strings when absent) and its tracks.
 * @throws When yt-dlp fails or times out, prints nothing, prints text that
 *   is not valid JSON, or prints JSON that is not an object.
 */
export async function probeYoutubeTracks(targetUrl: string): Promise<YoutubeTrackProbeResult> {
  const { stdout } = await runCapture('yt-dlp', ['--dump-single-json', '--no-warnings', targetUrl]);
  const trimmedStdout = stdout.trim();
  if (!trimmedStdout) {
    throw new Error('yt-dlp returned empty output while probing subtitle tracks');
  }

  // Short excerpt of the raw output, attached to error messages for diagnosis.
  const snippet = trimmedStdout.slice(0, 200);

  let parsed: unknown;
  try {
    parsed = JSON.parse(trimmedStdout);
  } catch (error) {
    throw new Error(
      `Failed to parse yt-dlp output as JSON: ${
        error instanceof Error ? error.message : String(error)
      }${snippet ? `; stdout=${snippet}` : ''}`,
    );
  }

  // Valid JSON is not necessarily an info object: `null` would crash on property
  // access below, and scalars/arrays would silently yield an empty result.
  if (typeof parsed !== 'object' || parsed === null || Array.isArray(parsed)) {
    throw new Error(
      `yt-dlp output was not a JSON object while probing subtitle tracks; stdout=${snippet}`,
    );
  }
  const info = parsed as YtDlpInfo;

  const tracks = [...toTracks(info.subtitles, 'manual'), ...toTracks(info.automatic_captions, 'auto')];
  return {
    videoId: info.id || '',
    title: info.title || '',
    tracks,
  };
}

View File

@@ -0,0 +1,63 @@
import { isEnglishYoutubeLang, isJapaneseYoutubeLang } from './labels';
import type { YoutubeTrackOption } from './track-probe';
/**
 * Returns the first track whose `language` satisfies `matcher`, skipping the
 * track with id `excludeId` if one is given.
 *
 * @param tracks Candidate tracks, in priority order.
 * @param matcher Predicate applied to each track's `language`.
 * @param excludeId Id of a track that must not be returned.
 * @returns The first acceptable track, or `null` when there is none.
 */
function pickTrack(
  tracks: YoutubeTrackOption[],
  matcher: (value: string) => boolean,
  excludeId?: string,
): YoutubeTrackOption | null {
  const firstMatch = tracks.find(
    (candidate) => matcher(candidate.language) && candidate.id !== excludeId,
  );
  return firstMatch ?? null;
}
/**
 * Chooses the default primary and secondary subtitle tracks.
 *
 * Primary: manual Japanese, else automatic Japanese, else the first manual
 * track, else the first track of any kind.
 * Secondary: manual English, else automatic English, never the primary track.
 *
 * @param tracks Tracks available for the video.
 * @returns The chosen track ids; either is `null` when nothing suitable exists.
 */
export function chooseDefaultYoutubeTrackIds(
  tracks: YoutubeTrackOption[],
): { primaryTrackId: string | null; secondaryTrackId: string | null } {
  const manualTracks = tracks.filter((track) => track.kind === 'manual');
  const autoTracks = tracks.filter((track) => track.kind === 'auto');

  const primary =
    pickTrack(manualTracks, isJapaneseYoutubeLang) ??
    pickTrack(autoTracks, isJapaneseYoutubeLang) ??
    manualTracks[0] ??
    tracks[0] ??
    null;

  // The secondary track must differ from whatever became primary.
  const primaryId = primary?.id;
  const secondary =
    pickTrack(manualTracks, isEnglishYoutubeLang, primaryId) ??
    pickTrack(autoTracks, isEnglishYoutubeLang, primaryId);

  return {
    primaryTrackId: primary?.id ?? null,
    secondaryTrackId: secondary?.id ?? null,
  };
}
/**
 * Ensures the same track is not selected as both primary and secondary.
 *
 * @param input The requested selection.
 * @returns A new selection with `secondaryTrackId` cleared when both ids are
 *   set and equal; otherwise the `input` object itself, unchanged.
 */
export function normalizeYoutubeTrackSelection(input: {
  primaryTrackId: string | null;
  secondaryTrackId: string | null;
}): {
  primaryTrackId: string | null;
  secondaryTrackId: string | null;
} {
  const { primaryTrackId, secondaryTrackId } = input;
  const sameTrackTwice = Boolean(primaryTrackId) && primaryTrackId === secondaryTrackId;
  if (!sameTrackTwice) {
    return input;
  }
  return { primaryTrackId, secondaryTrackId: null };
}