// File: SubMiner/scripts/test-yomitan-parser.ts
// Repository viewer metadata: 661 lines, 18 KiB, TypeScript

import fs from "node:fs";
import os from "node:os";
import path from "node:path";
import process from "node:process";
import { createTokenizerDepsRuntime, tokenizeSubtitle } from "../src/core/services/tokenizer.js";
import { MecabTokenizer } from "../src/mecab-tokenizer.js";
import type { MergedToken } from "../src/types.js";
/** Options for one run of this script, as produced by `parseCliArgs`. */
interface CliOptions {
/** Text to analyze: the positional arguments joined by spaces, or stdin when none are given. */
input: string;
/** When JSON output is emitted, selects a 2-space indent for `JSON.stringify`. */
emitPretty: boolean;
/** Emit the result payload as JSON instead of the plain-text report. */
emitJson: boolean;
/** Makes `setupYomitanRuntime` return early without loading Electron or the extension. */
forceMecabOnly: boolean;
/** Explicit extension directory, tried before `<cwd>/vendor/yomitan`. */
yomitanExtensionPath?: string;
/** Directory applied as Electron's `userData` path before the app is ready. */
yomitanUserDataPath?: string;
/** Forwarded to the `MecabTokenizer` constructor as `mecabCommand`. */
mecabCommand?: string;
/** Forwarded to the `MecabTokenizer` constructor as `dictionaryPath`. */
mecabDictionaryPath?: string;
}
/**
 * One headword entry inside a parse segment's `headwords` rows.
 * The field is `unknown` because it comes from untyped parse output;
 * `isHeadwordRows` checks at runtime that `term` is a string.
 */
interface YomitanParseHeadword {
term?: unknown;
}
/**
 * One segment of a parsed line in raw parse output.
 * All fields are `unknown` and validated at the point of use.
 */
interface YomitanParseSegment {
/** Surface text; a line is only accepted when every segment's `text` is a string. */
text?: unknown;
/** Reading; treated as an empty string when it is not a string. */
reading?: unknown;
/** Expected to be rows of headword entries; see `isHeadwordRows`. */
headwords?: unknown;
}
/**
 * One top-level item of raw parse output, before validation.
 * `mapParseResultsToCandidates` only keeps items whose `source` is a string
 * and whose `content` is an array.
 */
interface YomitanParseResultItem {
/** Name of the parser that produced the item; must be a string to be accepted. */
source?: unknown;
/** Parser-provided index; replaced by 0 when it is not an integer. */
index?: unknown;
/** Expected to be an array of lines, each an array of segments. */
content?: unknown;
}
/** A validated parse candidate produced by `mapParseResultsToCandidates`. */
interface ParsedCandidate {
/** `source` string copied from the raw parse item. */
source: string;
/** Integer `index` from the raw parse item, or 0 when it was missing or not an integer. */
index: number;
/** Flattened tokens of all accepted lines, in order. */
tokens: Array<{
surface: string;
/** Segment reading, or an empty string when none was provided. */
reading: string;
/** First extracted headword term, falling back to the surface. */
headword: string;
/** Running offset (sum of preceding surface lengths) where this token starts. */
startPos: number;
/** `startPos + surface.length`, i.e. exclusive end offset. */
endPos: number;
}>;
}
/** Mutable state describing the Yomitan/Electron runtime used by this script. */
interface YomitanRuntimeState {
/** Set to true only after the extension was loaded successfully. */
available: boolean;
/** Diagnostic text explaining why Yomitan is unavailable, or null. */
note: string | null;
/** Extension handle returned by `session.defaultSession.loadExtension`, or null. */
extension: Electron.Extension | null;
/** Parser window; read and replaced through the tokenizer dependency accessors in `main`. */
parserWindow: Electron.BrowserWindow | null;
/** Exposed to the tokenizer through get/set accessors in `main`; this script never awaits it directly. */
parserReadyPromise: Promise<void> | null;
/** Exposed to the tokenizer through get/set accessors in `main`; this script never awaits it directly. */
parserInitPromise: Promise<boolean> | null;
}
/** Default Electron `userData` directory: `<home>/.config/SubMiner`. */
const DEFAULT_YOMITAN_USER_DATA_PATH = path.join(os.homedir(), ".config", "SubMiner");
/**
 * Destroys the parser window when there is one and it is still alive.
 *
 * @param window - Parser window to tear down; `null` and already-destroyed windows are ignored.
 */
function destroyParserWindow(window: Electron.BrowserWindow | null): void {
  if (window && !window.isDestroyed()) {
    window.destroy();
  }
}
/**
 * Tears down the runtime: destroys the parser window, then asks the Electron
 * app to quit when the `electron` module can be imported and exposes `app`.
 *
 * @param yomitan - Runtime state whose parser window should be destroyed.
 */
async function shutdownYomitanRuntime(yomitan: YomitanRuntimeState): Promise<void> {
  destroyParserWindow(yomitan.parserWindow);

  // A failed import simply means there is no Electron app to quit.
  const loadElectron = async () => {
    try {
      return await import("electron");
    } catch {
      return null;
    }
  };

  const electron = await loadElectron();
  if (electron?.app) {
    electron.app.quit();
  }
}
/**
 * Extracts the value of a value-taking flag written either as
 * `--flag <value>` or as `--flag=<value>`.
 *
 * @param flag - Flag name, e.g. `--mecab-command`.
 * @param arg - The argument currently being examined.
 * @param rest - Remaining arguments; the next one is consumed for the space-separated form.
 * @returns The flag's value, or `undefined` when `arg` is not this flag.
 * @throws Error when the flag is present but its value is missing or empty.
 */
function readFlagValue(flag: string, arg: string, rest: string[]): string | undefined {
  if (arg === flag) {
    const next = rest.shift();
    if (!next) {
      throw new Error(`Missing value for ${flag}`);
    }
    return next;
  }

  const prefix = `${flag}=`;
  if (arg.startsWith(prefix)) {
    const value = arg.slice(prefix.length);
    // An empty `--flag=` is rejected exactly like a missing `--flag <value>`;
    // otherwise path flags would silently resolve to the current directory.
    if (!value) {
      throw new Error(`Missing value for ${flag}`);
    }
    return value;
  }

  return undefined;
}

/**
 * Parses the script's command-line arguments.
 *
 * Positional arguments are joined with spaces to form the input text; when no
 * positional text is given, the input is read from stdin. `--help`/`-h` prints
 * the usage text and exits the process with status 0.
 *
 * @param argv - Arguments without the runtime and script path (the input array is not mutated).
 * @returns The parsed options.
 * @throws Error on an unknown flag, a flag with a missing/empty value, or when
 *   neither positional text nor stdin provides any input.
 */
function parseCliArgs(argv: string[]): CliOptions {
  const args = [...argv];
  const inputParts: string[] = [];
  let emitPretty = true;
  let emitJson = false;
  let forceMecabOnly = false;
  let yomitanExtensionPath: string | undefined;
  let yomitanUserDataPath: string | undefined = DEFAULT_YOMITAN_USER_DATA_PATH;
  let mecabCommand: string | undefined;
  let mecabDictionaryPath: string | undefined;

  while (args.length > 0) {
    const arg = args.shift();
    // Only stop when the list is exhausted. An empty-string argument is a
    // (blank) positional value and must not end parsing early.
    if (arg === undefined) {
      break;
    }

    if (arg === "--help" || arg === "-h") {
      printUsage();
      process.exit(0);
    }
    if (arg === "--pretty") {
      emitPretty = true;
      continue;
    }
    if (arg === "--json") {
      emitJson = true;
      continue;
    }
    if (arg === "--force-mecab") {
      forceMecabOnly = true;
      continue;
    }

    const extensionValue = readFlagValue("--yomitan-extension", arg, args);
    if (extensionValue !== undefined) {
      yomitanExtensionPath = path.resolve(extensionValue);
      continue;
    }

    const userDataValue = readFlagValue("--yomitan-user-data", arg, args);
    if (userDataValue !== undefined) {
      yomitanUserDataPath = path.resolve(userDataValue);
      continue;
    }

    const mecabCommandValue = readFlagValue("--mecab-command", arg, args);
    if (mecabCommandValue !== undefined) {
      mecabCommand = mecabCommandValue;
      continue;
    }

    const mecabDictionaryValue = readFlagValue("--mecab-dictionary", arg, args);
    if (mecabDictionaryValue !== undefined) {
      mecabDictionaryPath = mecabDictionaryValue;
      continue;
    }

    if (arg.startsWith("-")) {
      throw new Error(`Unknown flag: ${arg}`);
    }
    inputParts.push(arg);
  }

  // Prefer positional text; fall back to stdin (file descriptor 0) only when none was given.
  let input = inputParts.join(" ").trim();
  if (input.length === 0) {
    input = fs.readFileSync(0, "utf8").trim();
    if (!input) {
      throw new Error(
        "Please provide input text as arguments or via stdin.",
      );
    }
  }

  return {
    input,
    emitPretty,
    emitJson,
    forceMecabOnly,
    yomitanExtensionPath,
    yomitanUserDataPath,
    mecabCommand,
    mecabDictionaryPath,
  };
}
/**
 * Writes the command-line usage text to stdout.
 *
 * The synopsis line lists every supported flag, including `--force-mecab`.
 */
function printUsage(): void {
  process.stdout.write(`Usage:
bun run test-yomitan-parser:electron -- [--pretty] [--json] [--force-mecab] [--yomitan-extension <path>] [--yomitan-user-data <path>] [--mecab-command <path>] [--mecab-dictionary <path>] <text>
--pretty Pretty-print JSON output.
--json Emit machine-readable JSON output.
--force-mecab Skip Yomitan parser setup and test MeCab fallback only.
--yomitan-extension <path> Optional path to Yomitan extension directory.
--yomitan-user-data <path> Optional Electron userData directory (default: ~/.config/SubMiner).
--mecab-command <path> Optional MeCab binary path (default: mecab).
--mecab-dictionary <path> Optional MeCab dictionary directory.
-h, --help Show usage.
`);
}
/**
 * Normalizes line breaks for display: CRLF pairs and the literal escape
 * sequences `\N` and `\n` (backslash + letter) each become a real newline,
 * then surrounding whitespace is trimmed.
 *
 * @param text - Raw text.
 * @returns Text with unified newlines and no leading/trailing whitespace.
 */
function normalizeDisplayText(text: string): string {
  // The three patterns cannot overlap and a replacement never creates a new
  // match, so one alternation pass equals replacing them one after another.
  const lineBreakPattern = /\r\n|\\N|\\n/g;
  const unified = text.replace(lineBreakPattern, "\n");
  return unified.trim();
}
/**
 * Produces single-line text: display normalization first, then every newline
 * becomes a space and whitespace runs collapse to one space.
 *
 * @param text - Raw text.
 * @returns Trimmed single-line text.
 */
function normalizeTokenizerText(text: string): string {
  const displayText = normalizeDisplayText(text);
  const singleLine = displayText.replace(/\n/g, " ");
  const collapsed = singleLine.replace(/\s+/g, " ");
  return collapsed.trim();
}
/**
 * Type guard for non-null values whose `typeof` is `"object"`.
 * Arrays pass this check; functions and primitives do not.
 */
function isObject(value: unknown): value is Record<string, unknown> {
  return typeof value === "object" && value !== null;
}
/**
 * Type guard: `value` is an array of arrays whose entries are all objects
 * carrying a string `term`.
 */
function isHeadwordRows(value: unknown): value is YomitanParseHeadword[][] {
  const isHeadwordEntry = (entry: unknown): boolean =>
    isObject(entry) && typeof entry.term === "string";
  const isHeadwordRow = (row: unknown): boolean =>
    Array.isArray(row) && row.every((entry) => isHeadwordEntry(entry));

  return Array.isArray(value) && value.every((row) => isHeadwordRow(row));
}
/**
 * Collects the distinct, trimmed, non-empty headword terms of a segment in
 * first-seen order.
 *
 * @param segment - Raw parse segment.
 * @returns Unique terms, or an empty array when `headwords` is not valid headword rows.
 */
function extractHeadwordTerms(segment: YomitanParseSegment): string[] {
  const rows = segment.headwords;
  if (!isHeadwordRows(rows)) {
    return [];
  }

  // A Set iterates in insertion order, which keeps first-seen ordering.
  const uniqueTerms = new Set<string>();
  for (const row of rows) {
    for (const entry of row) {
      const trimmed = (entry.term as string).trim();
      if (trimmed) {
        uniqueTerms.add(trimmed);
      }
    }
  }
  return Array.from(uniqueTerms);
}
/**
 * Converts raw, untyped parse output into validated candidates.
 *
 * An item is kept only when it is an object with a string `source` and an
 * array `content`. Within an item, a line is accepted only when it is an array
 * whose segments are all objects with a string `text`; other lines are
 * skipped. Malformed segments (for example `null` or a primitive) cause the
 * line to be skipped instead of throwing. Items that yield no accepted line or
 * no token are dropped.
 *
 * Token offsets are running sums of the surface lengths of the accepted
 * segments within the item; `endPos` is exclusive.
 *
 * @param parseResults - Raw value returned by the parser; anything but an array yields `[]`.
 * @returns The validated candidates in input order.
 */
function mapParseResultsToCandidates(parseResults: unknown): ParsedCandidate[] {
  if (!Array.isArray(parseResults)) {
    return [];
  }

  const candidates: ParsedCandidate[] = [];
  for (const item of parseResults) {
    if (!isObject(item)) {
      continue;
    }
    const parseItem = item as YomitanParseResultItem;
    if (!Array.isArray(parseItem.content) || typeof parseItem.source !== "string") {
      continue;
    }

    const candidateTokens: ParsedCandidate["tokens"] = [];
    let charOffset = 0;
    let validLineCount = 0;

    for (const line of parseItem.content) {
      if (!Array.isArray(line)) {
        continue;
      }

      // Validate the whole line before emitting any of its tokens. The
      // isObject check keeps a null/primitive segment from raising a
      // TypeError on the property access.
      const lineSegments: YomitanParseSegment[] = [];
      let lineIsValid = true;
      for (const segment of line as unknown[]) {
        if (!isObject(segment) || typeof segment.text !== "string") {
          lineIsValid = false;
          break;
        }
        lineSegments.push(segment as YomitanParseSegment);
      }
      if (!lineIsValid) {
        continue;
      }
      validLineCount += 1;

      for (const segment of lineSegments) {
        const surface = segment.text as string;
        if (!surface) {
          // Empty surfaces produce no token and do not advance the offset.
          continue;
        }
        const startPos = charOffset;
        const endPos = startPos + surface.length;
        charOffset = endPos;
        const headwordTerms = extractHeadwordTerms(segment);
        candidateTokens.push({
          surface,
          reading: typeof segment.reading === "string" ? segment.reading : "",
          headword: headwordTerms[0] ?? surface,
          startPos,
          endPos,
        });
      }
    }

    if (validLineCount === 0 || candidateTokens.length === 0) {
      continue;
    }

    candidates.push({
      source: parseItem.source,
      index:
        typeof parseItem.index === "number" && Number.isInteger(parseItem.index)
          ? parseItem.index
          : 0,
      tokens: candidateTokens,
    });
  }
  return candidates;
}
/**
 * Builds a comparison key for a candidate token: surface, reading, headword,
 * start and end offsets joined by the U+001F unit separator.
 */
function candidateTokenSignature(token: {
  surface: string;
  reading: string;
  headword: string;
  startPos: number;
  endPos: number;
}): string {
  const fields = [
    token.surface,
    token.reading,
    token.headword,
    token.startPos,
    token.endPos,
  ];
  return fields.map((field) => String(field)).join("\u001f");
}
/**
 * Builds a comparison key for a merged tokenizer token, using the same field
 * order and U+001F separator as the candidate-token key.
 */
function mergedTokenSignature(token: MergedToken): string {
  const { surface, reading, headword, startPos, endPos } = token;
  return [surface, reading, headword, startPos, endPos].map(String).join("\u001f");
}
/**
 * Finds which candidates match the tokenizer's final tokens exactly.
 *
 * A candidate matches when it has the same number of tokens and every token
 * key equals the key of the merged token at the same position.
 *
 * @param candidates - Parsed candidates.
 * @param mergedTokens - Final tokenizer tokens; `null` or empty yields `[]`.
 * @returns Positions (within `candidates`) of all matching candidates, ascending.
 */
function findSelectedCandidateIndexes(
  candidates: ParsedCandidate[],
  mergedTokens: MergedToken[] | null,
): number[] {
  if (!mergedTokens || mergedTokens.length === 0) {
    return [];
  }

  const expected = mergedTokens.map(mergedTokenSignature);
  const matches: number[] = [];

  for (const [position, candidate] of candidates.entries()) {
    const actual = candidate.tokens.map(candidateTokenSignature);
    const identical =
      actual.length === expected.length &&
      actual.every((signature, offset) => signature === expected[offset]);
    if (identical) {
      matches.push(position);
    }
  }

  return matches;
}
/**
 * Locates a Yomitan extension directory, identified by a `manifest.json`.
 *
 * The explicit path (when given) is tried first, then `<cwd>/vendor/yomitan`.
 *
 * @param explicitPath - Optional directory supplied by the caller.
 * @returns The first absolute directory that contains `manifest.json`, or `null`.
 */
function resolveYomitanExtensionPath(explicitPath?: string): string | null {
  const searchRoots: string[] = [];
  if (explicitPath) {
    searchRoots.push(path.resolve(explicitPath));
  }
  searchRoots.push(path.resolve(process.cwd(), "vendor", "yomitan"));

  const match = searchRoots.find((root) =>
    fs.existsSync(path.join(root, "manifest.json")),
  );
  return match ?? null;
}
/**
 * Prepares the Yomitan runtime: imports Electron, applies the userData path,
 * waits for the app to be ready and loads the Yomitan extension into the
 * default session.
 *
 * This function does not reject for the failure modes it handles; instead it
 * returns a state with `available: false` and a `note` explaining why:
 * - `--force-mecab` was requested,
 * - the `electron` import failed (the note keeps the import error message),
 * - the imported module exposes no `app`/`session`,
 * - no extension directory was found,
 * - loading the extension failed (the note keeps the loader's error message).
 *
 * @param options - Parsed CLI options.
 * @returns The runtime state; `available` is true only after the extension loaded.
 */
async function setupYomitanRuntime(
  options: CliOptions,
): Promise<YomitanRuntimeState> {
  const state: YomitanRuntimeState = {
    available: false,
    note: null,
    extension: null,
    parserWindow: null,
    parserReadyPromise: null,
    parserInitPromise: null,
  };

  if (options.forceMecabOnly) {
    state.note = "force-mecab enabled";
    return state;
  }

  const electronModule = await import("electron").catch((error: unknown) => {
    state.note = error instanceof Error ? error.message : "electron import failed";
    return null;
  });
  if (!electronModule?.app || !electronModule?.session) {
    // Keep the import failure reason recorded above; only fall back to the
    // generic note when the import itself succeeded.
    state.note = state.note ?? "electron runtime not available in this process";
    return state;
  }

  // The userData path is applied before waiting for the app to become ready.
  if (options.yomitanUserDataPath) {
    electronModule.app.setPath("userData", options.yomitanUserDataPath);
  }
  await electronModule.app.whenReady();

  const extensionPath = resolveYomitanExtensionPath(options.yomitanExtensionPath);
  if (!extensionPath) {
    state.note = "no Yomitan extension directory found";
    return state;
  }

  try {
    state.extension = await electronModule.session.defaultSession.loadExtension(
      extensionPath,
      { allowFileAccess: true },
    );
    state.available = true;
  } catch (error: unknown) {
    state.note =
      error instanceof Error
        ? error.message
        : "failed to load Yomitan extension";
    state.available = false;
  }
  return state;
}
/**
 * Runs a script inside the parser window that asks the extension backend to
 * parse `text`, and resolves with the backend's raw result.
 *
 * The injected script sends two `chrome.runtime.sendMessage` requests:
 * `optionsGetFull` to read the current profile index and its scanning length
 * (falling back to 40), then `parseText` with both `useInternalParser` and
 * `useMecabParser` set to true. A runtime error, a non-object response, or a
 * response carrying `error` rejects the promise.
 *
 * `text` is embedded into the script with `JSON.stringify`, so it is inserted
 * as a quoted string literal rather than as raw script source.
 *
 * @param parserWindow - Window whose web contents execute the script.
 * @param text - Text to parse.
 * @returns The unvalidated `result` of the `parseText` response.
 */
async function fetchRawParseResults(
parserWindow: Electron.BrowserWindow,
text: string,
): Promise<unknown> {
const script = `
(async () => {
const invoke = (action, params) =>
new Promise((resolve, reject) => {
chrome.runtime.sendMessage({ action, params }, (response) => {
if (chrome.runtime.lastError) {
reject(new Error(chrome.runtime.lastError.message));
return;
}
if (!response || typeof response !== "object") {
reject(new Error("Invalid response from Yomitan backend"));
return;
}
if (response.error) {
reject(new Error(response.error.message || "Yomitan backend error"));
return;
}
resolve(response.result);
});
});
const optionsFull = await invoke("optionsGetFull", undefined);
const profileIndex = optionsFull.profileCurrent;
const scanLength =
optionsFull.profiles?.[profileIndex]?.options?.scanning?.length ?? 40;
return await invoke("parseText", {
text: ${JSON.stringify(text)},
optionsContext: { index: profileIndex },
scanLength,
useInternalParser: true,
useMecabParser: true
});
})();
`;
// NOTE(review): the second argument is presumably Electron's `userGesture` flag — confirm against the Electron docs.
return parserWindow.webContents.executeJavaScript(script, true);
}
/**
 * Writes the plain-text report for a result payload to stdout, one write per
 * report line: summary fields, the final tokens, then every parse candidate
 * followed by its tokens.
 *
 * @param payload - Result payload; reads `input`, `tokenizerText`,
 *   `yomitanAvailable`, `yomitanNote`, `selectedCandidateIndexes`,
 *   `finalTokens` and `candidates`.
 */
function renderTextOutput(payload: Record<string, unknown>): void {
  const emit = (chunk: string): void => {
    process.stdout.write(chunk);
  };
  const describeToken = (token: Record<string, unknown>): string =>
    `${token.surface} -> ${token.headword} (${token.reading}) [${token.startPos}, ${token.endPos})`;

  emit(`Input: ${String(payload.input)}\n`);
  emit(`Tokenizer text: ${String(payload.tokenizerText)}\n`);
  emit(`Yomitan available: ${String(payload.yomitanAvailable)}\n`);
  emit(`Yomitan note: ${String(payload.yomitanNote ?? "")}\n`);
  emit(`Selected candidate indexes: ${JSON.stringify(payload.selectedCandidateIndexes)}\n`);

  emit("\nFinal selected tokens:\n");
  const finalTokens = payload.finalTokens as Array<Record<string, unknown>> | null;
  if (finalTokens && finalTokens.length !== 0) {
    for (const [position, token] of finalTokens.entries()) {
      emit(` [${position}] ${describeToken(token)}\n`);
    }
  } else {
    emit(" (none)\n");
  }

  emit("\nYomitan parse candidates:\n");
  const candidates = payload.candidates as Array<Record<string, unknown>>;
  if (!candidates || candidates.length === 0) {
    emit(" (none)\n");
    return;
  }

  for (const [position, candidate] of candidates.entries()) {
    emit(
      ` [${position}] source=${String(candidate.source)} index=${String(candidate.index)} selectedByTokenizer=${String(candidate.selectedByTokenizer)} tokenCount=${String(candidate.tokenCount)}\n`,
    );
    const tokens = candidate.tokens as Array<Record<string, unknown>> | undefined;
    if (tokens && tokens.length !== 0) {
      for (const token of tokens) {
        emit(` - ${describeToken(token)}\n`);
      }
    }
  }
}
/**
 * Script entry point: parses the CLI options, tokenizes the input through the
 * tokenizer service, collects the raw Yomitan parse candidates when a live
 * parser window exists, and prints the comparison as JSON or as a text report.
 *
 * The Yomitan runtime is shut down on the way out, including when an error is
 * thrown inside the `try` block.
 *
 * @throws Error when MeCab reports that it is not available.
 */
async function main(): Promise<void> {
  const cli = parseCliArgs(process.argv.slice(2));

  // Shared mutable state: the tokenizer dependencies below read and update it.
  const yomitan: YomitanRuntimeState = {
    available: false,
    note: null,
    extension: null,
    parserWindow: null,
    parserReadyPromise: null,
    parserInitPromise: null,
  };

  try {
    const mecab = new MecabTokenizer({
      mecabCommand: cli.mecabCommand,
      dictionaryPath: cli.mecabDictionaryPath,
    });
    if (!(await mecab.checkAvailability())) {
      throw new Error("MeCab is not available on this system.");
    }

    // Copy every field of the freshly prepared runtime into the shared state.
    Object.assign(yomitan, await setupYomitanRuntime(cli));

    const deps = createTokenizerDepsRuntime({
      getYomitanExt: () => yomitan.extension,
      getYomitanParserWindow: () => yomitan.parserWindow,
      setYomitanParserWindow: (window) => {
        yomitan.parserWindow = window;
      },
      getYomitanParserReadyPromise: () => yomitan.parserReadyPromise,
      setYomitanParserReadyPromise: (promise) => {
        yomitan.parserReadyPromise = promise;
      },
      getYomitanParserInitPromise: () => yomitan.parserInitPromise,
      setYomitanParserInitPromise: (promise) => {
        yomitan.parserInitPromise = promise;
      },
      isKnownWord: () => false,
      getKnownWordMatchMode: () => "headword",
      getJlptLevel: () => null,
      getMecabTokenizer: () => ({
        tokenize: (text: string) => mecab.tokenize(text),
      }),
    });

    const subtitle = await tokenizeSubtitle(cli.input, deps);
    const yomitanInput = normalizeTokenizerText(cli.input);

    // Raw candidates are only fetched when a live parser window exists after tokenizing.
    let rawParseResults: unknown = null;
    const parserWindow = yomitan.parserWindow;
    if (yomitan.available && parserWindow && !parserWindow.isDestroyed() && yomitanInput) {
      rawParseResults = await fetchRawParseResults(parserWindow, yomitanInput);
    }

    const parsedCandidates = mapParseResultsToCandidates(rawParseResults);
    const selectedCandidateIndexes = findSelectedCandidateIndexes(
      parsedCandidates,
      subtitle.tokens,
    );
    const selectedLookup = new Set<number>(selectedCandidateIndexes);

    const finalTokens =
      subtitle.tokens?.map((token) => ({
        surface: token.surface,
        reading: token.reading,
        headword: token.headword,
        startPos: token.startPos,
        endPos: token.endPos,
        pos1: token.pos1,
        partOfSpeech: token.partOfSpeech,
        isKnown: token.isKnown,
        isNPlusOneTarget: token.isNPlusOneTarget,
      })) ?? null;
    const candidates = parsedCandidates.map((candidate, position) => ({
      source: candidate.source,
      index: candidate.index,
      selectedByTokenizer: selectedLookup.has(position),
      tokenCount: candidate.tokens.length,
      tokens: candidate.tokens,
    }));

    // Key order is kept stable because it is visible in the JSON output.
    const payload = {
      input: cli.input,
      tokenizerText: subtitle.text,
      yomitanAvailable: yomitan.available,
      yomitanNote: yomitan.note,
      selectedCandidateIndexes,
      finalTokens,
      candidates,
    };

    if (!cli.emitJson) {
      renderTextOutput(payload);
    } else {
      const indent = cli.emitPretty ? 2 : undefined;
      process.stdout.write(`${JSON.stringify(payload, null, indent)}\n`);
    }
  } finally {
    await shutdownYomitanRuntime(yomitan);
  }
}
// Run the script and translate the outcome into a process exit code:
// 0 on success, 1 after printing the failure reason to stderr.
main()
  .then(() => {
    process.exit(0);
  })
  .catch((error: unknown) => {
    // Rejections are not guaranteed to be Error instances; avoid printing "undefined".
    const message = error instanceof Error ? error.message : String(error);
    console.error(`Error: ${message}`);
    process.exit(1);
  });