feat(core): add module scaffolding and provider registries

This commit is contained in:
kyasuda
2026-02-10 13:16:01 -08:00
committed by sudacode
parent 531f8027bd
commit 09e142279a
19 changed files with 822 additions and 0 deletions

46
src/subtitle/pipeline.ts Normal file
View File

@@ -0,0 +1,46 @@
import { TokenMergerProvider } from "../token-mergers";
import { TokenizerProvider } from "../tokenizers";
import { SubtitleData } from "../types";
import {
normalizeDisplayText,
normalizeTokenizerInput,
} from "./stages/normalize";
import { tokenizeStage } from "./stages/tokenize";
import { mergeStage } from "./stages/merge";
/**
 * Injected accessors for the pipeline's pluggable providers.
 *
 * Both getters may return null when no provider is configured; the
 * pipeline treats a missing provider as "skip that stage" and falls
 * back to returning text with null tokens.
 */
export interface SubtitlePipelineDeps {
  // Supplies the tokenizer used to split normalized text into tokens.
  getTokenizer: () => TokenizerProvider | null;
  // Supplies the merger applied to the tokenizer's output.
  getTokenMerger: () => TokenMergerProvider | null;
}
export class SubtitlePipeline {
private readonly deps: SubtitlePipelineDeps;
constructor(deps: SubtitlePipelineDeps) {
this.deps = deps;
}
async process(text: string): Promise<SubtitleData> {
if (!text) {
return { text, tokens: null };
}
const displayText = normalizeDisplayText(text);
if (!displayText) {
return { text, tokens: null };
}
const tokenizeText = normalizeTokenizerInput(displayText);
try {
const tokens = await tokenizeStage(this.deps.getTokenizer(), tokenizeText);
const mergedTokens = mergeStage(this.deps.getTokenMerger(), tokens);
if (!mergedTokens || mergedTokens.length === 0) {
return { text: displayText, tokens: null };
}
return { text: displayText, tokens: mergedTokens };
} catch {
return { text: displayText, tokens: null };
}
}
}

View File

@@ -0,0 +1,12 @@
import { TokenMergerProvider } from "../../token-mergers";
import { MergedToken, Token } from "../../types";
/**
 * Merge stage: applies the configured token merger to tokenizer output.
 *
 * @param mergerProvider Merger to apply, or null when none is configured.
 * @param tokens Tokens produced by the tokenize stage, possibly null.
 * @returns Merged tokens, or null when no merger is set or there is
 *          nothing to merge.
 */
export function mergeStage(
  mergerProvider: TokenMergerProvider | null,
  tokens: Token[] | null,
): MergedToken[] | null {
  if (!mergerProvider || !tokens?.length) {
    return null;
  }
  return mergerProvider.merge(tokens);
}

View File

@@ -0,0 +1,14 @@
/**
 * Normalizes raw subtitle text for on-screen display.
 *
 * Converts Windows (\r\n) and lone-\r line endings, plus ASS/SSA-style
 * literal "\N" / "\n" escape sequences, into real newlines, then trims
 * surrounding whitespace.
 *
 * Fix: the previous version only handled "\r\n", leaving stray
 * carriage returns from classic-Mac / lone-\r input in the output.
 *
 * @param text Raw cue text.
 * @returns Display-ready text with uniform "\n" line breaks.
 */
export function normalizeDisplayText(text: string): string {
  return text
    .replace(/\r\n?/g, "\n") // \r\n and lone \r
    .replace(/\\[Nn]/g, "\n") // literal "\N" / "\n" break escapes
    .trim();
}
/**
 * Flattens display text into a single line for the tokenizer.
 *
 * Every run of whitespace (including newlines) collapses to one space,
 * and leading/trailing whitespace is removed.
 *
 * @param displayText Text already normalized for display.
 * @returns Single-line, single-spaced tokenizer input.
 */
export function normalizeTokenizerInput(displayText: string): string {
  return displayText
    .split(/\s+/)
    .filter((part) => part.length > 0)
    .join(" ");
}

View File

@@ -0,0 +1,12 @@
import { TokenizerProvider } from "../../tokenizers";
import { Token } from "../../types";
/**
 * Tokenize stage: runs the configured tokenizer over normalized input.
 *
 * @param tokenizerProvider Tokenizer to use, or null when none is set.
 * @param input Single-line normalized text.
 * @returns The tokenizer's tokens, or null when no tokenizer is
 *          configured or the input is empty.
 */
export async function tokenizeStage(
  tokenizerProvider: TokenizerProvider | null,
  input: string,
): Promise<Token[] | null> {
  if (tokenizerProvider === null || !input) {
    return null;
  }
  return await tokenizerProvider.tokenize(input);
}