mirror of
https://github.com/ksyasuda/SubMiner.git
synced 2026-03-23 00:11:28 -07:00
fix: address latest review feedback
This commit is contained in:
49
src/core/services/youtube/metadata-probe.test.ts
Normal file
49
src/core/services/youtube/metadata-probe.test.ts
Normal file
@@ -0,0 +1,49 @@
|
||||
import assert from 'node:assert/strict';
|
||||
import fs from 'node:fs';
|
||||
import os from 'node:os';
|
||||
import path from 'node:path';
|
||||
import test from 'node:test';
|
||||
import { probeYoutubeVideoMetadata } from './metadata-probe';
|
||||
|
||||
async function withTempDir<T>(fn: (dir: string) => Promise<T>): Promise<T> {
|
||||
const dir = fs.mkdtempSync(path.join(os.tmpdir(), 'subminer-youtube-metadata-probe-'));
|
||||
try {
|
||||
return await fn(dir);
|
||||
} finally {
|
||||
fs.rmSync(dir, { recursive: true, force: true });
|
||||
}
|
||||
}
|
||||
|
||||
function makeFakeYtDlpScript(dir: string, payload: string): void {
|
||||
const scriptPath = path.join(dir, 'yt-dlp');
|
||||
const script = `#!/usr/bin/env node
|
||||
process.stdout.write(${JSON.stringify(payload)});
|
||||
`;
|
||||
fs.writeFileSync(scriptPath, script, 'utf8');
|
||||
if (process.platform !== 'win32') {
|
||||
fs.chmodSync(scriptPath, 0o755);
|
||||
}
|
||||
fs.writeFileSync(scriptPath + '.cmd', `@echo off\r\nnode "${scriptPath}"\r\n`, 'utf8');
|
||||
}
|
||||
|
||||
async function withFakeYtDlp<T>(payload: string, fn: () => Promise<T>): Promise<T> {
|
||||
return await withTempDir(async (root) => {
|
||||
const binDir = path.join(root, 'bin');
|
||||
fs.mkdirSync(binDir, { recursive: true });
|
||||
makeFakeYtDlpScript(binDir, payload);
|
||||
const originalPath = process.env.PATH ?? '';
|
||||
process.env.PATH = `${binDir}${path.delimiter}${originalPath}`;
|
||||
try {
|
||||
return await fn();
|
||||
} finally {
|
||||
process.env.PATH = originalPath;
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
test('probeYoutubeVideoMetadata returns null on malformed yt-dlp JSON', async () => {
|
||||
await withFakeYtDlp('not-json', async () => {
|
||||
const result = await probeYoutubeVideoMetadata('https://www.youtube.com/watch?v=abc123');
|
||||
assert.equal(result, null);
|
||||
});
|
||||
});
|
||||
@@ -79,7 +79,12 @@ export async function probeYoutubeVideoMetadata(
|
||||
'--skip-download',
|
||||
targetUrl,
|
||||
]);
|
||||
const info = JSON.parse(stdout) as YtDlpYoutubeMetadata;
|
||||
let info: YtDlpYoutubeMetadata;
|
||||
try {
|
||||
info = JSON.parse(stdout) as YtDlpYoutubeMetadata;
|
||||
} catch {
|
||||
return null;
|
||||
}
|
||||
const youtubeVideoId = info.id?.trim();
|
||||
const videoUrl = info.webpage_url?.trim() || targetUrl.trim();
|
||||
if (!youtubeVideoId || !videoUrl) {
|
||||
|
||||
40
src/core/services/youtube/timedtext.test.ts
Normal file
40
src/core/services/youtube/timedtext.test.ts
Normal file
@@ -0,0 +1,40 @@
|
||||
import assert from 'node:assert/strict';
|
||||
import test from 'node:test';
|
||||
import { convertYoutubeTimedTextToVtt } from './timedtext';
|
||||
|
||||
test('convertYoutubeTimedTextToVtt leaves malformed numeric entities literal', () => {
|
||||
const result = convertYoutubeTimedTextToVtt(
|
||||
'<timedtext><body><p t="0" d="1000">� � A</p></body></timedtext>',
|
||||
);
|
||||
|
||||
assert.equal(
|
||||
result,
|
||||
['WEBVTT', '', '00:00:00.000 --> 00:00:01.000', '� � A', ''].join('\n'),
|
||||
);
|
||||
});
|
||||
|
||||
test('convertYoutubeTimedTextToVtt does not swallow text after zero-length overlap rows', () => {
|
||||
const result = convertYoutubeTimedTextToVtt(
|
||||
[
|
||||
'<timedtext><body>',
|
||||
'<p t="0" d="2000">今日は</p>',
|
||||
'<p t="1000" d="0">今日はいい天気ですね</p>',
|
||||
'<p t="1000" d="2000">今日はいい天気ですね</p>',
|
||||
'</body></timedtext>',
|
||||
].join(''),
|
||||
);
|
||||
|
||||
assert.equal(
|
||||
result,
|
||||
[
|
||||
'WEBVTT',
|
||||
'',
|
||||
'00:00:00.000 --> 00:00:00.999',
|
||||
'今日は',
|
||||
'',
|
||||
'00:00:01.000 --> 00:00:03.000',
|
||||
'いい天気ですね',
|
||||
'',
|
||||
].join('\n'),
|
||||
);
|
||||
});
|
||||
@@ -6,6 +6,18 @@ interface YoutubeTimedTextRow {
|
||||
|
||||
const YOUTUBE_TIMEDTEXT_EXTENSIONS = new Set(['srv1', 'srv2', 'srv3', 'ytsrv3']);
|
||||
|
||||
function decodeNumericEntity(match: string, codePoint: number): string {
|
||||
if (
|
||||
!Number.isInteger(codePoint) ||
|
||||
codePoint < 0 ||
|
||||
codePoint > 0x10ffff ||
|
||||
(codePoint >= 0xd800 && codePoint <= 0xdfff)
|
||||
) {
|
||||
return match;
|
||||
}
|
||||
return String.fromCodePoint(codePoint);
|
||||
}
|
||||
|
||||
function decodeHtmlEntities(value: string): string {
|
||||
return value
|
||||
.replace(/&/g, '&')
|
||||
@@ -13,9 +25,11 @@ function decodeHtmlEntities(value: string): string {
|
||||
.replace(/>/g, '>')
|
||||
.replace(/"/g, '"')
|
||||
.replace(/'/g, "'")
|
||||
.replace(/&#(\d+);/g, (_match, codePoint) => String.fromCodePoint(Number(codePoint)))
|
||||
.replace(/&#x([0-9a-f]+);/gi, (_match, codePoint) =>
|
||||
String.fromCodePoint(Number.parseInt(codePoint, 16)),
|
||||
.replace(/&#(\d+);/g, (match, codePoint) =>
|
||||
decodeNumericEntity(match, Number(codePoint)),
|
||||
)
|
||||
.replace(/&#x([0-9a-f]+);/gi, (match, codePoint) =>
|
||||
decodeNumericEntity(match, Number.parseInt(codePoint, 16)),
|
||||
);
|
||||
}
|
||||
|
||||
@@ -85,7 +99,6 @@ export function convertYoutubeTimedTextToVtt(xml: string): string {
|
||||
? Math.max(row.startMs, nextRow.startMs - 1)
|
||||
: unclampedEnd;
|
||||
if (clampedEnd <= row.startMs) {
|
||||
previousText = row.text;
|
||||
continue;
|
||||
}
|
||||
|
||||
|
||||
@@ -303,6 +303,33 @@ test('downloadYoutubeSubtitleTrack prefers direct download URL when available',
|
||||
});
|
||||
});
|
||||
|
||||
test('downloadYoutubeSubtitleTrack sanitizes metadata source language in filenames', async () => {
|
||||
await withTempDir(async (root) => {
|
||||
await withStubFetch(
|
||||
async () => new Response('WEBVTT\n', { status: 200 }),
|
||||
async () => {
|
||||
const result = await downloadYoutubeSubtitleTrack({
|
||||
targetUrl: 'https://www.youtube.com/watch?v=abc123',
|
||||
outputDir: path.join(root, 'out'),
|
||||
track: {
|
||||
id: 'auto:../../ja-orig',
|
||||
language: 'ja',
|
||||
sourceLanguage: '../ja-orig/../../evil',
|
||||
kind: 'auto',
|
||||
label: 'Japanese (auto)',
|
||||
downloadUrl: 'https://example.com/subs/ja.vtt',
|
||||
fileExtension: 'vtt',
|
||||
},
|
||||
mode: 'download',
|
||||
});
|
||||
|
||||
assert.equal(path.dirname(result.path), path.join(root, 'out'));
|
||||
assert.equal(path.basename(result.path), 'auto-ja-orig.ja-orig-evil.vtt');
|
||||
},
|
||||
);
|
||||
});
|
||||
});
|
||||
|
||||
test('downloadYoutubeSubtitleTrack converts srv3 auto subtitles into regular vtt', async () => {
|
||||
await withTempDir(async (root) => {
|
||||
await withStubFetch(
|
||||
|
||||
@@ -9,6 +9,11 @@ const YOUTUBE_SUBTITLE_EXTENSIONS = new Set(['.srt', '.vtt', '.ass']);
|
||||
const YOUTUBE_BATCH_PREFIX = 'youtube-batch';
|
||||
const YOUTUBE_DOWNLOAD_TIMEOUT_MS = 15_000;
|
||||
|
||||
function sanitizeFilenameSegment(value: string): string {
|
||||
const sanitized = value.trim().replace(/[^a-z0-9_-]+/gi, '-').replace(/-+/g, '-');
|
||||
return sanitized.replace(/^-+|-+$/g, '') || 'unknown';
|
||||
}
|
||||
|
||||
function createFetchTimeoutSignal(timeoutMs: number): AbortSignal | undefined {
|
||||
if (typeof AbortSignal !== 'undefined' && typeof AbortSignal.timeout === 'function') {
|
||||
return AbortSignal.timeout(timeoutMs);
|
||||
@@ -154,9 +159,10 @@ async function downloadSubtitleFromUrl(input: {
|
||||
: YOUTUBE_SUBTITLE_EXTENSIONS.has(`.${ext}`)
|
||||
? ext
|
||||
: 'vtt';
|
||||
const safeSourceLanguage = sanitizeFilenameSegment(input.track.sourceLanguage);
|
||||
const targetPath = path.join(
|
||||
input.outputDir,
|
||||
`${input.prefix}.${input.track.sourceLanguage}.${safeExt}`,
|
||||
`${input.prefix}.${safeSourceLanguage}.${safeExt}`,
|
||||
);
|
||||
const response = await fetch(input.track.downloadUrl, {
|
||||
signal: createFetchTimeoutSignal(YOUTUBE_DOWNLOAD_TIMEOUT_MS),
|
||||
|
||||
@@ -14,10 +14,12 @@ async function withTempDir<T>(fn: (dir: string) => Promise<T>): Promise<T> {
|
||||
}
|
||||
}
|
||||
|
||||
function makeFakeYtDlpScript(dir: string, payload: unknown): void {
|
||||
function makeFakeYtDlpScript(dir: string, payload: unknown, rawScript = false): void {
|
||||
const scriptPath = path.join(dir, 'yt-dlp');
|
||||
const stdoutBody = typeof payload === 'string' ? payload : JSON.stringify(payload);
|
||||
const script = `#!/usr/bin/env node
|
||||
const script = rawScript
|
||||
? stdoutBody
|
||||
: `#!/usr/bin/env node
|
||||
process.stdout.write(${JSON.stringify(stdoutBody)});
|
||||
`;
|
||||
fs.writeFileSync(scriptPath, script, 'utf8');
|
||||
@@ -27,11 +29,15 @@ process.stdout.write(${JSON.stringify(stdoutBody)});
|
||||
fs.writeFileSync(scriptPath + '.cmd', `@echo off\r\nnode "${scriptPath}"\r\n`, 'utf8');
|
||||
}
|
||||
|
||||
async function withFakeYtDlp<T>(payload: unknown, fn: () => Promise<T>): Promise<T> {
|
||||
async function withFakeYtDlp<T>(
|
||||
payload: unknown,
|
||||
fn: () => Promise<T>,
|
||||
options: { rawScript?: boolean } = {},
|
||||
): Promise<T> {
|
||||
return await withTempDir(async (root) => {
|
||||
const binDir = path.join(root, 'bin');
|
||||
fs.mkdirSync(binDir, { recursive: true });
|
||||
makeFakeYtDlpScript(binDir, payload);
|
||||
makeFakeYtDlpScript(binDir, payload, options.rawScript === true);
|
||||
const originalPath = process.env.PATH ?? '';
|
||||
process.env.PATH = `${binDir}${path.delimiter}${originalPath}`;
|
||||
try {
|
||||
|
||||
@@ -2,6 +2,8 @@ import { spawn } from 'node:child_process';
|
||||
import type { YoutubeTrackOption } from '../../../types';
|
||||
import { formatYoutubeTrackLabel, normalizeYoutubeLangCode, type YoutubeTrackKind } from './labels';
|
||||
|
||||
const YOUTUBE_TRACK_PROBE_TIMEOUT_MS = 15_000;
|
||||
|
||||
export type YoutubeTrackProbeResult = {
|
||||
videoId: string;
|
||||
title: string;
|
||||
@@ -17,11 +19,19 @@ type YtDlpInfo = {
|
||||
automatic_captions?: Record<string, YtDlpSubtitleEntry>;
|
||||
};
|
||||
|
||||
function runCapture(command: string, args: string[]): Promise<{ stdout: string; stderr: string }> {
|
||||
function runCapture(
|
||||
command: string,
|
||||
args: string[],
|
||||
timeoutMs = YOUTUBE_TRACK_PROBE_TIMEOUT_MS,
|
||||
): Promise<{ stdout: string; stderr: string }> {
|
||||
return new Promise((resolve, reject) => {
|
||||
const proc = spawn(command, args, { stdio: ['ignore', 'pipe', 'pipe'] });
|
||||
let stdout = '';
|
||||
let stderr = '';
|
||||
const timer = setTimeout(() => {
|
||||
proc.kill();
|
||||
reject(new Error(`yt-dlp timed out after ${timeoutMs}ms`));
|
||||
}, timeoutMs);
|
||||
proc.stdout.setEncoding('utf8');
|
||||
proc.stderr.setEncoding('utf8');
|
||||
proc.stdout.on('data', (chunk) => {
|
||||
@@ -30,8 +40,12 @@ function runCapture(command: string, args: string[]): Promise<{ stdout: string;
|
||||
proc.stderr.on('data', (chunk) => {
|
||||
stderr += String(chunk);
|
||||
});
|
||||
proc.once('error', reject);
|
||||
proc.once('error', (error) => {
|
||||
clearTimeout(timer);
|
||||
reject(error);
|
||||
});
|
||||
proc.once('close', (code) => {
|
||||
clearTimeout(timer);
|
||||
if (code === 0) {
|
||||
resolve({ stdout, stderr });
|
||||
return;
|
||||
|
||||
Reference in New Issue
Block a user