fix: address latest review feedback

This commit is contained in:
2026-03-22 20:09:16 -07:00
parent 809b57af44
commit d8a7ae77b0
18 changed files with 428 additions and 44 deletions

View File

@@ -0,0 +1,49 @@
import assert from 'node:assert/strict';
import fs from 'node:fs';
import os from 'node:os';
import path from 'node:path';
import test from 'node:test';
import { probeYoutubeVideoMetadata } from './metadata-probe';
async function withTempDir<T>(fn: (dir: string) => Promise<T>): Promise<T> {
const dir = fs.mkdtempSync(path.join(os.tmpdir(), 'subminer-youtube-metadata-probe-'));
try {
return await fn(dir);
} finally {
fs.rmSync(dir, { recursive: true, force: true });
}
}
function makeFakeYtDlpScript(dir: string, payload: string): void {
const scriptPath = path.join(dir, 'yt-dlp');
const script = `#!/usr/bin/env node
process.stdout.write(${JSON.stringify(payload)});
`;
fs.writeFileSync(scriptPath, script, 'utf8');
if (process.platform !== 'win32') {
fs.chmodSync(scriptPath, 0o755);
}
fs.writeFileSync(scriptPath + '.cmd', `@echo off\r\nnode "${scriptPath}"\r\n`, 'utf8');
}
async function withFakeYtDlp<T>(payload: string, fn: () => Promise<T>): Promise<T> {
return await withTempDir(async (root) => {
const binDir = path.join(root, 'bin');
fs.mkdirSync(binDir, { recursive: true });
makeFakeYtDlpScript(binDir, payload);
const originalPath = process.env.PATH ?? '';
process.env.PATH = `${binDir}${path.delimiter}${originalPath}`;
try {
return await fn();
} finally {
process.env.PATH = originalPath;
}
});
}
test('probeYoutubeVideoMetadata returns null on malformed yt-dlp JSON', async () => {
await withFakeYtDlp('not-json', async () => {
const result = await probeYoutubeVideoMetadata('https://www.youtube.com/watch?v=abc123');
assert.equal(result, null);
});
});

View File

@@ -79,7 +79,12 @@ export async function probeYoutubeVideoMetadata(
'--skip-download',
targetUrl,
]);
const info = JSON.parse(stdout) as YtDlpYoutubeMetadata;
let info: YtDlpYoutubeMetadata;
try {
info = JSON.parse(stdout) as YtDlpYoutubeMetadata;
} catch {
return null;
}
const youtubeVideoId = info.id?.trim();
const videoUrl = info.webpage_url?.trim() || targetUrl.trim();
if (!youtubeVideoId || !videoUrl) {

View File

@@ -0,0 +1,40 @@
import assert from 'node:assert/strict';
import test from 'node:test';
import { convertYoutubeTimedTextToVtt } from './timedtext';
test('convertYoutubeTimedTextToVtt leaves malformed numeric entities literal', () => {
const result = convertYoutubeTimedTextToVtt(
'<timedtext><body><p t="0" d="1000">&#99999999; &#x110000; &#x41;</p></body></timedtext>',
);
assert.equal(
result,
['WEBVTT', '', '00:00:00.000 --> 00:00:01.000', '&#99999999; &#x110000; A', ''].join('\n'),
);
});
test('convertYoutubeTimedTextToVtt does not swallow text after zero-length overlap rows', () => {
const result = convertYoutubeTimedTextToVtt(
[
'<timedtext><body>',
'<p t="0" d="2000">今日は</p>',
'<p t="1000" d="0">今日はいい天気ですね</p>',
'<p t="1000" d="2000">今日はいい天気ですね</p>',
'</body></timedtext>',
].join(''),
);
assert.equal(
result,
[
'WEBVTT',
'',
'00:00:00.000 --> 00:00:00.999',
'今日は',
'',
'00:00:01.000 --> 00:00:03.000',
'いい天気ですね',
'',
].join('\n'),
);
});

View File

@@ -6,6 +6,18 @@ interface YoutubeTimedTextRow {
const YOUTUBE_TIMEDTEXT_EXTENSIONS = new Set(['srv1', 'srv2', 'srv3', 'ytsrv3']);
function decodeNumericEntity(match: string, codePoint: number): string {
if (
!Number.isInteger(codePoint) ||
codePoint < 0 ||
codePoint > 0x10ffff ||
(codePoint >= 0xd800 && codePoint <= 0xdfff)
) {
return match;
}
return String.fromCodePoint(codePoint);
}
function decodeHtmlEntities(value: string): string {
return value
.replace(/&amp;/g, '&')
@@ -13,9 +25,11 @@ function decodeHtmlEntities(value: string): string {
.replace(/&gt;/g, '>')
.replace(/&quot;/g, '"')
.replace(/&#39;/g, "'")
.replace(/&#(\d+);/g, (_match, codePoint) => String.fromCodePoint(Number(codePoint)))
.replace(/&#x([0-9a-f]+);/gi, (_match, codePoint) =>
String.fromCodePoint(Number.parseInt(codePoint, 16)),
.replace(/&#(\d+);/g, (match, codePoint) =>
decodeNumericEntity(match, Number(codePoint)),
)
.replace(/&#x([0-9a-f]+);/gi, (match, codePoint) =>
decodeNumericEntity(match, Number.parseInt(codePoint, 16)),
);
}
@@ -85,7 +99,6 @@ export function convertYoutubeTimedTextToVtt(xml: string): string {
? Math.max(row.startMs, nextRow.startMs - 1)
: unclampedEnd;
if (clampedEnd <= row.startMs) {
previousText = row.text;
continue;
}

View File

@@ -303,6 +303,33 @@ test('downloadYoutubeSubtitleTrack prefers direct download URL when available',
});
});
test('downloadYoutubeSubtitleTrack sanitizes metadata source language in filenames', async () => {
await withTempDir(async (root) => {
await withStubFetch(
async () => new Response('WEBVTT\n', { status: 200 }),
async () => {
const result = await downloadYoutubeSubtitleTrack({
targetUrl: 'https://www.youtube.com/watch?v=abc123',
outputDir: path.join(root, 'out'),
track: {
id: 'auto:../../ja-orig',
language: 'ja',
sourceLanguage: '../ja-orig/../../evil',
kind: 'auto',
label: 'Japanese (auto)',
downloadUrl: 'https://example.com/subs/ja.vtt',
fileExtension: 'vtt',
},
mode: 'download',
});
assert.equal(path.dirname(result.path), path.join(root, 'out'));
assert.equal(path.basename(result.path), 'auto-ja-orig.ja-orig-evil.vtt');
},
);
});
});
test('downloadYoutubeSubtitleTrack converts srv3 auto subtitles into regular vtt', async () => {
await withTempDir(async (root) => {
await withStubFetch(

View File

@@ -9,6 +9,11 @@ const YOUTUBE_SUBTITLE_EXTENSIONS = new Set(['.srt', '.vtt', '.ass']);
const YOUTUBE_BATCH_PREFIX = 'youtube-batch';
const YOUTUBE_DOWNLOAD_TIMEOUT_MS = 15_000;
function sanitizeFilenameSegment(value: string): string {
const sanitized = value.trim().replace(/[^a-z0-9_-]+/gi, '-').replace(/-+/g, '-');
return sanitized.replace(/^-+|-+$/g, '') || 'unknown';
}
function createFetchTimeoutSignal(timeoutMs: number): AbortSignal | undefined {
if (typeof AbortSignal !== 'undefined' && typeof AbortSignal.timeout === 'function') {
return AbortSignal.timeout(timeoutMs);
@@ -154,9 +159,10 @@ async function downloadSubtitleFromUrl(input: {
: YOUTUBE_SUBTITLE_EXTENSIONS.has(`.${ext}`)
? ext
: 'vtt';
const safeSourceLanguage = sanitizeFilenameSegment(input.track.sourceLanguage);
const targetPath = path.join(
input.outputDir,
`${input.prefix}.${input.track.sourceLanguage}.${safeExt}`,
`${input.prefix}.${safeSourceLanguage}.${safeExt}`,
);
const response = await fetch(input.track.downloadUrl, {
signal: createFetchTimeoutSignal(YOUTUBE_DOWNLOAD_TIMEOUT_MS),

View File

@@ -14,10 +14,12 @@ async function withTempDir<T>(fn: (dir: string) => Promise<T>): Promise<T> {
}
}
function makeFakeYtDlpScript(dir: string, payload: unknown): void {
function makeFakeYtDlpScript(dir: string, payload: unknown, rawScript = false): void {
const scriptPath = path.join(dir, 'yt-dlp');
const stdoutBody = typeof payload === 'string' ? payload : JSON.stringify(payload);
const script = `#!/usr/bin/env node
const script = rawScript
? stdoutBody
: `#!/usr/bin/env node
process.stdout.write(${JSON.stringify(stdoutBody)});
`;
fs.writeFileSync(scriptPath, script, 'utf8');
@@ -27,11 +29,15 @@ process.stdout.write(${JSON.stringify(stdoutBody)});
fs.writeFileSync(scriptPath + '.cmd', `@echo off\r\nnode "${scriptPath}"\r\n`, 'utf8');
}
async function withFakeYtDlp<T>(payload: unknown, fn: () => Promise<T>): Promise<T> {
async function withFakeYtDlp<T>(
payload: unknown,
fn: () => Promise<T>,
options: { rawScript?: boolean } = {},
): Promise<T> {
return await withTempDir(async (root) => {
const binDir = path.join(root, 'bin');
fs.mkdirSync(binDir, { recursive: true });
makeFakeYtDlpScript(binDir, payload);
makeFakeYtDlpScript(binDir, payload, options.rawScript === true);
const originalPath = process.env.PATH ?? '';
process.env.PATH = `${binDir}${path.delimiter}${originalPath}`;
try {

View File

@@ -2,6 +2,8 @@ import { spawn } from 'node:child_process';
import type { YoutubeTrackOption } from '../../../types';
import { formatYoutubeTrackLabel, normalizeYoutubeLangCode, type YoutubeTrackKind } from './labels';
const YOUTUBE_TRACK_PROBE_TIMEOUT_MS = 15_000;
export type YoutubeTrackProbeResult = {
videoId: string;
title: string;
@@ -17,11 +19,19 @@ type YtDlpInfo = {
automatic_captions?: Record<string, YtDlpSubtitleEntry>;
};
function runCapture(command: string, args: string[]): Promise<{ stdout: string; stderr: string }> {
function runCapture(
command: string,
args: string[],
timeoutMs = YOUTUBE_TRACK_PROBE_TIMEOUT_MS,
): Promise<{ stdout: string; stderr: string }> {
return new Promise((resolve, reject) => {
const proc = spawn(command, args, { stdio: ['ignore', 'pipe', 'pipe'] });
let stdout = '';
let stderr = '';
const timer = setTimeout(() => {
proc.kill();
reject(new Error(`yt-dlp timed out after ${timeoutMs}ms`));
}, timeoutMs);
proc.stdout.setEncoding('utf8');
proc.stderr.setEncoding('utf8');
proc.stdout.on('data', (chunk) => {
@@ -30,8 +40,12 @@ function runCapture(command: string, args: string[]): Promise<{ stdout: string;
proc.stderr.on('data', (chunk) => {
stderr += String(chunk);
});
proc.once('error', reject);
proc.once('error', (error) => {
clearTimeout(timer);
reject(error);
});
proc.once('close', (code) => {
clearTimeout(timer);
if (code === 0) {
resolve({ stdout, stderr });
return;