mirror of
https://github.com/ksyasuda/SubMiner.git
synced 2026-02-28 06:22:45 -08:00
feat(core): add module scaffolding and provider registries
This commit is contained in:
46
src/subtitle/pipeline.ts
Normal file
46
src/subtitle/pipeline.ts
Normal file
@@ -0,0 +1,46 @@
|
||||
import { TokenMergerProvider } from "../token-mergers";
|
||||
import { TokenizerProvider } from "../tokenizers";
|
||||
import { SubtitleData } from "../types";
|
||||
import {
|
||||
normalizeDisplayText,
|
||||
normalizeTokenizerInput,
|
||||
} from "./stages/normalize";
|
||||
import { tokenizeStage } from "./stages/tokenize";
|
||||
import { mergeStage } from "./stages/merge";
|
||||
|
||||
/**
 * Dependencies injected into {@link SubtitlePipeline}.
 *
 * Both members are accessor functions rather than fixed instances; the
 * pipeline calls them on every `process` invocation, and either one may
 * return `null`.
 */
export interface SubtitlePipelineDeps {
  /** Returns the tokenizer provider to use, or `null` if there is none. */
  getTokenizer: () => TokenizerProvider | null;

  /** Returns the token-merger provider to use, or `null` if there is none. */
  getTokenMerger: () => TokenMergerProvider | null;
}
|
||||
|
||||
/**
 * Turns a raw subtitle string into {@link SubtitleData} by running it through
 * the normalize, tokenize and merge stages in that order.
 */
export class SubtitlePipeline {
  /**
   * @param deps Accessors for the tokenizer and token-merger providers; they
   *   are queried on each call to {@link SubtitlePipeline.process}.
   */
  constructor(private readonly deps: SubtitlePipelineDeps) {}

  /**
   * Processes one subtitle string.
   *
   * @param text Raw subtitle text.
   * @returns
   *   - `{ text, tokens: null }` with the *original* text when the input is
   *     empty or normalizes to an empty string;
   *   - `{ text: displayText, tokens }` with the normalized display text and
   *     the merged tokens when merging produced at least one token;
   *   - `{ text: displayText, tokens: null }` when merging produced nothing or
   *     when tokenizing/merging threw.
   */
  async process(text: string): Promise<SubtitleData> {
    // An empty input is never passed to the normalizer; both "empty" cases
    // hand back the caller's original text untouched.
    const displayText = text ? normalizeDisplayText(text) : "";
    if (!displayText) {
      return { text, tokens: null };
    }

    const tokenizerInput = normalizeTokenizerInput(displayText);

    try {
      const rawTokens = await tokenizeStage(
        this.deps.getTokenizer(),
        tokenizerInput,
      );
      const merged = mergeStage(this.deps.getTokenMerger(), rawTokens);

      return merged && merged.length !== 0
        ? { text: displayText, tokens: merged }
        : { text: displayText, tokens: null };
    } catch {
      // Best effort: a failure while tokenizing or merging still yields the
      // display text, just without tokens.
      return { text: displayText, tokens: null };
    }
  }
}
|
||||
12
src/subtitle/stages/merge.ts
Normal file
12
src/subtitle/stages/merge.ts
Normal file
@@ -0,0 +1,12 @@
|
||||
import { TokenMergerProvider } from "../../token-mergers";
|
||||
import { MergedToken, Token } from "../../types";
|
||||
|
||||
/**
 * Merge stage of the subtitle pipeline.
 *
 * @param mergerProvider Provider whose `merge` method is applied to the
 *   tokens, or `null` when there is none.
 * @param tokens Tokens from the tokenize stage, or `null`.
 * @returns Whatever `mergerProvider.merge(tokens)` returns, or `null` when
 *   there is no provider or no tokens (`null` or an empty array).
 */
export function mergeStage(
  mergerProvider: TokenMergerProvider | null,
  tokens: Token[] | null,
): MergedToken[] | null {
  if (mergerProvider && tokens && tokens.length !== 0) {
    return mergerProvider.merge(tokens);
  }
  return null;
}
|
||||
14
src/subtitle/stages/normalize.ts
Normal file
14
src/subtitle/stages/normalize.ts
Normal file
@@ -0,0 +1,14 @@
|
||||
/**
 * Normalizes raw subtitle text for display.
 *
 * - CRLF pairs become a single `\n`.
 * - The literal two-character sequences backslash-`N` and backslash-`n`
 *   become a real newline (presumably subtitle-format line-break markers;
 *   confirm against the subtitle source).
 * - Leading and trailing whitespace is trimmed.
 *
 * @param text Raw subtitle text.
 * @returns The normalized text, possibly empty.
 */
export function normalizeDisplayText(text: string): string {
  const unixNewlines = text.replace(/\r\n/g, "\n");
  // Backslash followed by either case of "n"; the two cases cannot overlap,
  // so one character-class pass handles both.
  const withRealBreaks = unixNewlines.replace(/\\[Nn]/g, "\n");
  return withRealBreaks.trim();
}
|
||||
|
||||
/**
 * Flattens display text into a single line for the tokenizer.
 *
 * Every run of whitespace (newlines included) collapses to one space, and
 * leading/trailing whitespace is removed.
 *
 * @param displayText Text as produced for display.
 * @returns The single-line text, possibly empty.
 */
export function normalizeTokenizerInput(displayText: string): string {
  // Trimming first guarantees the split yields no empty edge segments, so
  // re-joining with single spaces gives the collapsed form directly.
  const words = displayText.trim().split(/\s+/);
  return words.join(" ");
}
|
||||
12
src/subtitle/stages/tokenize.ts
Normal file
12
src/subtitle/stages/tokenize.ts
Normal file
@@ -0,0 +1,12 @@
|
||||
import { TokenizerProvider } from "../../tokenizers";
|
||||
import { Token } from "../../types";
|
||||
|
||||
/**
 * Tokenize stage of the subtitle pipeline.
 *
 * @param tokenizerProvider Provider whose `tokenize` method is called, or
 *   `null` when there is none.
 * @param input Text to tokenize.
 * @returns The result of `tokenizerProvider.tokenize(input)`, or `null` when
 *   there is no provider or the input is empty.
 */
export async function tokenizeStage(
  tokenizerProvider: TokenizerProvider | null,
  input: string,
): Promise<Token[] | null> {
  if (tokenizerProvider && input) {
    return tokenizerProvider.tokenize(input);
  }
  return null;
}
|
||||
Reference in New Issue
Block a user