Files
SubMiner/src/core/services/tokenizer-service.ts

306 lines
8.0 KiB
TypeScript

import { BrowserWindow, Extension, session } from "electron";
import { MergedToken, PartOfSpeech, SubtitleData } from "../../types";
/**
 * One headword entry inside a parsed segment's `headwords` groups.
 *
 * Only `term` is read in this file (see extractYomitanHeadword). It is typed
 * `unknown` and checked with `typeof` before being used as a string.
 */
interface YomitanParseHeadword {
term?: unknown;
}
/**
 * One segment of a parsed line, as consumed by
 * mapYomitanParseResultsToMergedTokens.
 *
 * Every field is `unknown` and is validated at runtime before use.
 */
interface YomitanParseSegment {
/** Surface text; non-empty strings are appended to the token's surface. */
text?: unknown;
/** Reading for this segment; appended to the token's reading when it is a string. */
reading?: unknown;
/** Expected to be an array of groups, each an array of headword entries. */
headwords?: unknown;
}
/**
 * One top-level entry of the value returned by the parser script.
 *
 * Every field is `unknown` and is validated at runtime before use.
 */
interface YomitanParseResultItem {
/** Parser identifier; only entries equal to "scanning-parser" are used. */
source?: unknown;
/** The entry whose index is exactly 0 is preferred over the others. */
index?: unknown;
/** Expected to be an array of lines, each line an array of segments. */
content?: unknown;
}
/**
 * State accessors and fallbacks the tokenizer service needs from its host.
 *
 * The service keeps no state of its own: the parser window and the promises
 * tracking its initialization are read and written through these callbacks.
 */
export interface TokenizerServiceDeps {
/** Returns the Yomitan extension, or null; its `id` is used to build the page URL. */
getYomitanExt: () => Extension | null;
/** Returns the parser window currently stored by the host, or null. */
getYomitanParserWindow: () => BrowserWindow | null;
/** Stores the parser window (null once it is closed or failed to load). */
setYomitanParserWindow: (window: BrowserWindow | null) => void;
/** Returns the promise settled by the window's did-finish-load / did-fail-load events. */
getYomitanParserReadyPromise: () => Promise<void> | null;
/** Stores (or clears with null) the page-load promise for the parser window. */
setYomitanParserReadyPromise: (promise: Promise<void> | null) => void;
/** Returns the in-flight window initialization, or null when none is running. */
getYomitanParserInitPromise: () => Promise<boolean> | null;
/** Stores (or clears with null) the in-flight window initialization. */
setYomitanParserInitPromise: (promise: Promise<boolean> | null) => void;
/** Fallback tokenizer, called when the Yomitan parser yields no tokens. */
tokenizeWithMecab: (text: string) => Promise<MergedToken[] | null>;
}
/**
 * Reads the headword term of a parsed segment.
 *
 * Only the first entry of the first headword group is consulted; any other
 * groups or entries are ignored.
 *
 * @param segment - Segment whose `headwords` field is inspected.
 * @returns The entry's `term` when it is a string, otherwise `""` (also when
 *   `headwords` or its first group is missing, empty, or not an array).
 */
function extractYomitanHeadword(segment: YomitanParseSegment): string {
  const groups = segment.headwords;
  const leadingGroup: unknown =
    Array.isArray(groups) && groups.length > 0 ? groups[0] : undefined;
  if (!Array.isArray(leadingGroup) || leadingGroup.length === 0) {
    return "";
  }

  const candidate = leadingGroup[0] as YomitanParseHeadword | null | undefined;
  if (typeof candidate?.term === "string") {
    return candidate.term;
  }
  return "";
}
/**
 * Converts the raw value returned by the parser script into merged tokens.
 *
 * Only entries whose `source` is "scanning-parser" and whose `content` is an
 * array are considered; the entry with `index === 0` is preferred, otherwise
 * the first such entry is used. Each line of that entry's content becomes one
 * token: segment texts are concatenated into the surface, string readings
 * into the reading, and the headword comes from the first segment that yields
 * one (falling back to the surface).
 *
 * Offsets are positions within the concatenation of all emitted surfaces, so
 * skipped lines and segments do not advance them.
 *
 * @param parseResults - Untrusted value; every level is validated at runtime.
 * @returns The tokens in order, or `null` when the input is unusable or
 *   yields no tokens.
 */
function mapYomitanParseResultsToMergedTokens(
  parseResults: unknown,
): MergedToken[] | null {
  if (!Array.isArray(parseResults) || parseResults.length === 0) {
    return null;
  }

  // Collect the usable scanning-parser entries, preserving their order.
  const scannerEntries: YomitanParseResultItem[] = [];
  for (const candidate of parseResults) {
    const entry = candidate as YomitanParseResultItem;
    if (
      entry &&
      entry.source === "scanning-parser" &&
      Array.isArray(entry.content)
    ) {
      scannerEntries.push(entry);
    }
  }
  if (scannerEntries.length === 0) {
    return null;
  }

  const chosenEntry =
    scannerEntries.find((entry) => entry.index === 0) || scannerEntries[0];
  const parsedLines = chosenEntry.content;
  if (!Array.isArray(parsedLines)) {
    return null;
  }

  const merged: MergedToken[] = [];
  let cursor = 0;

  for (const parsedLine of parsedLines) {
    if (!Array.isArray(parsedLine)) {
      continue;
    }

    const surfaceParts: string[] = [];
    const readingParts: string[] = [];
    let headword = "";

    for (const rawSegment of parsedLine) {
      const segment = rawSegment as YomitanParseSegment;
      const isObject = Boolean(segment) && typeof segment === "object";
      if (!isObject) {
        continue;
      }

      const piece = segment.text;
      if (typeof piece !== "string" || piece.length === 0) {
        continue;
      }

      surfaceParts.push(piece);
      if (typeof segment.reading === "string") {
        readingParts.push(segment.reading);
      }
      // Keep asking until some segment provides a non-empty headword.
      if (!headword) {
        headword = extractYomitanHeadword(segment);
      }
    }

    const surface = surfaceParts.join("");
    if (!surface) {
      continue;
    }

    const startPos = cursor;
    cursor += surface.length;
    merged.push({
      surface,
      reading: readingParts.join(""),
      headword: headword || surface,
      startPos,
      endPos: cursor,
      partOfSpeech: PartOfSpeech.other,
      isMerged: true,
    });
  }

  return merged.length > 0 ? merged : null;
}
/**
 * Ensures the hidden Yomitan parser window exists and has loaded the
 * extension's `search.html` page.
 *
 * Concurrent callers share a single initialization through the promise stored
 * in `deps`. Once that promise settles it is cleared again, so a later call
 * can retry after a failure.
 *
 * @param deps - Accessors for the shared extension, window and promise state.
 * @returns `true` when a loaded parser window is available; `false` when the
 *   extension is missing or the window could not be created or loaded.
 */
async function ensureYomitanParserWindow(
  deps: TokenizerServiceDeps,
): Promise<boolean> {
  const yomitanExt = deps.getYomitanExt();
  if (!yomitanExt) {
    return false;
  }

  // An in-flight initialization must be checked before the window fast path:
  // the window is stored via setYomitanParserWindow() before its page has
  // loaded, so checking the window first would report "ready" to concurrent
  // callers while the load is still pending (or about to fail).
  const existingInitPromise = deps.getYomitanParserInitPromise();
  if (existingInitPromise) {
    return existingInitPromise;
  }

  const currentWindow = deps.getYomitanParserWindow();
  if (currentWindow && !currentWindow.isDestroyed()) {
    return true;
  }

  const initPromise = (async (): Promise<boolean> => {
    let parserWindow: BrowserWindow | null = null;
    try {
      // Constructed inside the try so that a synchronous failure is reported
      // as `false` instead of rejecting the shared init promise.
      const createdWindow = new BrowserWindow({
        show: false,
        width: 800,
        height: 600,
        webPreferences: {
          contextIsolation: true,
          nodeIntegration: false,
          session: session.defaultSession,
        },
      });
      parserWindow = createdWindow;
      deps.setYomitanParserWindow(createdWindow);

      const readyPromise = new Promise<void>((resolve, reject) => {
        createdWindow.webContents.once("did-finish-load", () => resolve());
        createdWindow.webContents.once(
          "did-fail-load",
          (_event, _errorCode, errorDescription) => {
            reject(new Error(errorDescription));
          },
        );
      });
      // On a failed load, loadURL() below throws before anything awaits this
      // promise. Attach a no-op handler so its rejection is not reported as
      // unhandled; anyone awaiting the stored promise still observes it.
      readyPromise.catch(() => undefined);
      deps.setYomitanParserReadyPromise(readyPromise);

      createdWindow.on("closed", () => {
        // Only clear the shared state if it still refers to this window.
        if (deps.getYomitanParserWindow() === createdWindow) {
          deps.setYomitanParserWindow(null);
          deps.setYomitanParserReadyPromise(null);
        }
      });

      await createdWindow.loadURL(
        `chrome-extension://${yomitanExt.id}/search.html`,
      );
      // Re-read from deps: the "closed" handler may have cleared it meanwhile.
      const storedReadyPromise = deps.getYomitanParserReadyPromise();
      if (storedReadyPromise) {
        await storedReadyPromise;
      }
      return true;
    } catch (err) {
      console.error(
        "Failed to initialize Yomitan parser window:",
        err instanceof Error ? err.message : String(err),
      );
      if (parserWindow !== null) {
        if (!parserWindow.isDestroyed()) {
          parserWindow.destroy();
        }
        if (deps.getYomitanParserWindow() === parserWindow) {
          deps.setYomitanParserWindow(null);
          deps.setYomitanParserReadyPromise(null);
        }
      }
      return false;
    }
  })();

  deps.setYomitanParserInitPromise(initPromise);
  // Clear through a reaction registered after the promise has been stored.
  // A `finally` inside the initializer could run synchronously (when it fails
  // before its first await) and would then be overwritten by the line above,
  // leaving a settled promise cached forever.
  const clearInitPromise = (): void => {
    deps.setYomitanParserInitPromise(null);
  };
  void initPromise.then(clearInitPromise, clearInitPromise);
  return initPromise;
}
/**
 * Tokenizes `text` by running a script inside the Yomitan parser window.
 *
 * The script asks the extension backend for the full options
 * ("optionsGetFull"), takes the current profile index and its scanning length
 * (defaulting to 40), then requests "parseText" with the internal parser
 * enabled and the MeCab parser disabled. The result is converted with
 * mapYomitanParseResultsToMergedTokens.
 *
 * @param text - Text to parse; embedded into the script via JSON.stringify.
 * @param deps - Accessors for the shared extension and parser window state.
 * @returns The merged tokens, or `null` when `text` is empty, the extension
 *   or window is unavailable, the script fails (the error is logged), or no
 *   tokens were produced.
 */
async function parseWithYomitanInternalParser(
  text: string,
  deps: TokenizerServiceDeps,
): Promise<MergedToken[] | null> {
  const extension = deps.getYomitanExt();
  if (!(text && extension)) {
    return null;
  }

  const windowReady = await ensureYomitanParserWindow(deps);
  const hostWindow = deps.getYomitanParserWindow();
  if (!(windowReady && hostWindow && !hostWindow.isDestroyed())) {
    return null;
  }

  // The template below is sent to the page verbatim; its lines are kept at
  // column 0 so the injected source text stays unchanged.
  const pageScript = `
(async () => {
const invoke = (action, params) =>
new Promise((resolve, reject) => {
chrome.runtime.sendMessage({ action, params }, (response) => {
if (chrome.runtime.lastError) {
reject(new Error(chrome.runtime.lastError.message));
return;
}
if (!response || typeof response !== "object") {
reject(new Error("Invalid response from Yomitan backend"));
return;
}
if (response.error) {
reject(new Error(response.error.message || "Yomitan backend error"));
return;
}
resolve(response.result);
});
});
const optionsFull = await invoke("optionsGetFull", undefined);
const profileIndex = optionsFull.profileCurrent;
const scanLength =
optionsFull.profiles?.[profileIndex]?.options?.scanning?.length ?? 40;
return await invoke("parseText", {
text: ${JSON.stringify(text)},
optionsContext: { index: profileIndex },
scanLength,
useInternalParser: true,
useMecabParser: false
});
})();
`;

  try {
    const rawResults = await hostWindow.webContents.executeJavaScript(
      pageScript,
      true,
    );
    return mapYomitanParseResultsToMergedTokens(rawResults);
  } catch (err) {
    console.error("Yomitan parser request failed:", (err as Error).message);
    return null;
  }
}
/**
 * Normalizes a subtitle line and tokenizes it.
 *
 * The display text is the input with CRLF and the literal two-character
 * sequences `\N` and `\n` turned into newlines, then trimmed. The tokenizer
 * receives a single-line variant in which newlines become spaces and
 * whitespace runs are collapsed. The Yomitan parser is tried first and MeCab
 * is used as the fallback; a MeCab error is logged and treated as "no tokens".
 *
 * @param text - Raw subtitle text.
 * @param deps - Tokenizer dependencies (parser window state, MeCab fallback).
 * @returns The display text with its tokens, or `tokens: null` when neither
 *   tokenizer produced any. When the display text is empty, the original
 *   `text` is returned unchanged with `tokens: null`.
 */
export async function tokenizeSubtitleService(
  text: string,
  deps: TokenizerServiceDeps,
): Promise<SubtitleData> {
  const withRealNewlines = text
    .replace(/\r\n/g, "\n")
    .replace(/\\N/g, "\n")
    .replace(/\\n/g, "\n");
  const displayText = withRealNewlines.trim();
  if (displayText.length === 0) {
    return { text, tokens: null };
  }

  const withTokens = (tokens: SubtitleData["tokens"]): SubtitleData => ({
    text: displayText,
    tokens,
  });

  const singleLine = displayText
    .replace(/\n/g, " ")
    .replace(/\s+/g, " ")
    .trim();

  const fromYomitan = await parseWithYomitanInternalParser(singleLine, deps);
  if (fromYomitan?.length) {
    return withTokens(fromYomitan);
  }

  try {
    const fromMecab = await deps.tokenizeWithMecab(singleLine);
    if (fromMecab?.length) {
      return withTokens(fromMecab);
    }
  } catch (err) {
    console.error("Tokenization error:", (err as Error).message);
  }

  return withTokens(null);
}