mirror of
https://github.com/ksyasuda/SubMiner.git
synced 2026-02-28 06:22:45 -08:00
Standardize core service module and export names to reduce naming ambiguity and make imports predictable across runtime, tests, scripts, and docs.
654 lines
18 KiB
TypeScript
654 lines
18 KiB
TypeScript
import fs from "node:fs";
|
|
import path from "node:path";
|
|
import process from "node:process";
|
|
|
|
import { createTokenizerDepsRuntime, tokenizeSubtitle } from "../src/core/services/tokenizer.js";
|
|
import { MecabTokenizer } from "../src/mecab-tokenizer.js";
|
|
import type { MergedToken } from "../src/types.js";
|
|
|
|
/** Options gathered from the command line for one parser test run. */
interface CliOptions {
  /** Text to tokenize, taken from positional arguments or, failing that, stdin. */
  input: string;
  /** Indent the JSON output; only consulted when `emitJson` is set. */
  emitPretty: boolean;
  /** Emit the result as JSON instead of the plain-text report. */
  emitJson: boolean;
  /** Skip loading the Yomitan extension so only the MeCab path is exercised. */
  forceMecabOnly: boolean;
  /** Directory that should contain the Yomitan extension's `manifest.json`. */
  yomitanExtensionPath?: string;
  /** Directory passed to Electron as the `userData` path. */
  yomitanUserDataPath?: string;
  /** MeCab command handed to `MecabTokenizer`. */
  mecabCommand?: string;
  /** MeCab dictionary path handed to `MecabTokenizer`. */
  mecabDictionaryPath?: string;
}
|
|
|
|
/** One headword entry of a raw parse segment; unvalidated until `isHeadwordRows` runs. */
interface YomitanParseHeadword {
  /** Dictionary form of the word; expected to be a string once validated. */
  term?: unknown;
}
|
|
|
|
/** One segment of a parsed line as returned by the parser; every field is unvalidated. */
interface YomitanParseSegment {
  /** Surface text of the segment. */
  text?: unknown;
  /** Reading of the segment, when provided. */
  reading?: unknown;
  /** Rows of headword entries; checked with `isHeadwordRows` before use. */
  headwords?: unknown;
}
|
|
|
|
/** One top-level entry of the raw parse result; every field is unvalidated. */
interface YomitanParseResultItem {
  /** Label of the parser that produced this entry. */
  source?: unknown;
  /** Index reported alongside the entry. */
  index?: unknown;
  /** Parsed lines; each line is expected to be an array of segments. */
  content?: unknown;
}
|
|
|
|
/** A validated parse candidate: one parser's complete tokenization of the input. */
interface ParsedCandidate {
  /** Label of the parser that produced the candidate. */
  source: string;
  /** Index reported by the parser (0 when it was missing or not an integer). */
  index: number;
  /** Tokens in reading order, with offsets accumulated across all lines. */
  tokens: {
    surface: string;
    reading: string;
    headword: string;
    /** Offset of the first code unit of `surface` (inclusive). */
    startPos: number;
    /** Offset just past the last code unit of `surface` (exclusive). */
    endPos: number;
  }[];
}
|
|
|
|
/** Mutable state shared between the Yomitan setup code and the tokenizer dependencies. */
interface YomitanRuntimeState {
  /** True once the Yomitan extension has been loaded successfully. */
  available: boolean;
  /** Human-readable reason why Yomitan is unavailable, if any. */
  note: string | null;
  /** The loaded extension, or null when none was loaded. */
  extension: Electron.Extension | null;
  /** Parser window, read and written through the tokenizer dependency accessors. */
  parserWindow: Electron.BrowserWindow | null;
  /** Parser-ready promise, read and written through the tokenizer dependency accessors. */
  parserReadyPromise: Promise<void> | null;
  /** Parser-init promise, read and written through the tokenizer dependency accessors. */
  parserInitPromise: Promise<boolean> | null;
}
|
|
|
|
/**
 * Destroys the parser window if there is one and it is still alive.
 *
 * @param window - Window to tear down; `null` and already-destroyed windows are ignored.
 */
function destroyParserWindow(window: Electron.BrowserWindow | null): void {
  if (window && !window.isDestroyed()) {
    window.destroy();
  }
}
|
|
|
|
/**
 * Tears down whatever the Yomitan runtime created: destroys the parser window
 * and asks the Electron app to quit.
 *
 * A failed `electron` import is ignored, so this is safe to call when Electron
 * could not be loaded.
 *
 * @param yomitan - Runtime state whose parser window should be destroyed.
 */
async function shutdownYomitanRuntime(yomitan: YomitanRuntimeState): Promise<void> {
  destroyParserWindow(yomitan.parserWindow);

  let electronModule;
  try {
    electronModule = await import("electron");
  } catch {
    // Electron cannot be imported here, so there is no app to quit.
    return;
  }

  const electronApp = electronModule?.app;
  if (electronApp) {
    electronApp.quit();
  }
}
|
|
|
|
/**
 * Parses command-line arguments into {@link CliOptions}.
 *
 * Value flags accept both `--flag value` and `--flag=value`. Every argument
 * that is not a flag is treated as input text; the pieces are joined with a
 * single space. When no input text is given, the text is read from stdin.
 *
 * `--help` / `-h` prints the usage text and exits the process with code 0.
 *
 * @param argv - Arguments after the script name (e.g. `process.argv.slice(2)`).
 * @returns The parsed options, with `input` trimmed and non-empty.
 * @throws Error on an unknown flag, on a value flag without a value, or when
 *   neither the arguments nor stdin provide any input text.
 */
function parseCliArgs(argv: string[]): CliOptions {
  const pending = [...argv];
  const inputParts: string[] = [];

  // Every key is listed explicitly so the returned object always has the same shape.
  const flags: Omit<CliOptions, "input"> = {
    emitPretty: false,
    emitJson: false,
    forceMecabOnly: false,
    yomitanExtensionPath: undefined,
    yomitanUserDataPath: undefined,
    mecabCommand: undefined,
    mecabDictionaryPath: undefined,
  };

  // Flags that take a value, paired with how the value is stored.
  const valueFlags: Array<[string, (value: string) => void]> = [
    ["--yomitan-extension", (value) => { flags.yomitanExtensionPath = path.resolve(value); }],
    ["--yomitan-user-data", (value) => { flags.yomitanUserDataPath = path.resolve(value); }],
    ["--mecab-command", (value) => { flags.mecabCommand = value; }],
    ["--mecab-dictionary", (value) => { flags.mecabDictionaryPath = value; }],
  ];

  while (pending.length > 0) {
    const arg = pending.shift();
    // Only stop when the list is exhausted. An empty-string argument must not
    // end parsing, otherwise every flag after it would be silently ignored.
    if (arg === undefined) {
      break;
    }

    if (arg === "--help" || arg === "-h") {
      printUsage();
      process.exit(0);
    }
    if (arg === "--pretty") {
      flags.emitPretty = true;
      continue;
    }
    if (arg === "--json") {
      flags.emitJson = true;
      continue;
    }
    if (arg === "--force-mecab") {
      flags.forceMecabOnly = true;
      continue;
    }

    const valueFlag = valueFlags.find(
      ([flag]) => arg === flag || arg.startsWith(`${flag}=`),
    );
    if (valueFlag) {
      const [flag, store] = valueFlag;
      if (arg === flag) {
        // `--flag value`: the value is the next argument.
        const next = pending.shift();
        if (!next) {
          throw new Error(`Missing value for ${flag}`);
        }
        store(next);
      } else {
        // `--flag=value`: the value follows the equals sign.
        store(arg.slice(flag.length + 1));
      }
      continue;
    }

    if (arg.startsWith("-")) {
      throw new Error(`Unknown flag: ${arg}`);
    }

    inputParts.push(arg);
  }

  let input = inputParts.join(" ").trim();
  if (input.length === 0) {
    // No positional text: fall back to stdin (file descriptor 0).
    input = fs.readFileSync(0, "utf8").trim();
    if (!input) {
      throw new Error(
        "Please provide input text as arguments or via stdin.",
      );
    }
  }

  return { input, ...flags };
}
|
|
|
|
/**
 * Writes the command-line usage text to stdout.
 *
 * The synopsis lists every flag that `parseCliArgs` accepts, including
 * `--force-mecab`.
 */
function printUsage(): void {
  process.stdout.write(`Usage:
  pnpm run test-yomitan-parser:electron -- [--pretty] [--json] [--force-mecab] [--yomitan-extension <path>] [--yomitan-user-data <path>] [--mecab-command <path>] [--mecab-dictionary <path>] <text>

  --pretty                    Pretty-print JSON output.
  --json                      Emit machine-readable JSON output.
  --force-mecab               Skip Yomitan parser setup and test MeCab fallback only.
  --yomitan-extension <path>  Optional path to Yomitan extension directory.
  --yomitan-user-data <path>  Optional Electron userData directory.
  --mecab-command <path>      Optional MeCab binary path (default: mecab).
  --mecab-dictionary <path>   Optional MeCab dictionary directory.
  -h, --help                  Show usage.
`);
}
|
|
|
|
/**
 * Normalizes subtitle text for display: CRLF becomes LF, the literal escape
 * sequences `\N` and `\n` (backslash plus letter) become real line breaks,
 * and surrounding whitespace is trimmed.
 *
 * @param text - Raw subtitle text.
 * @returns The text with real line breaks and no leading/trailing whitespace.
 */
function normalizeDisplayText(text: string): string {
  const unixNewlines = text.replace(/\r\n/g, "\n");
  // One pass covers both spellings of the escape.
  const withRealBreaks = unixNewlines.replace(/\\[Nn]/g, "\n");
  return withRealBreaks.trim();
}
|
|
|
|
/**
 * Normalizes subtitle text into a single line: applies the display
 * normalization, then collapses every run of whitespace (line breaks
 * included) into one space and trims the result.
 *
 * @param text - Raw subtitle text.
 * @returns A single-line, whitespace-collapsed version of the text.
 */
function normalizeTokenizerText(text: string): string {
  const displayText = normalizeDisplayText(text);
  // `\s` already matches line breaks, so one pass flattens and collapses.
  const singleLine = displayText.replace(/\s+/g, " ");
  return singleLine.trim();
}
|
|
|
|
/**
 * Type guard for non-null values whose `typeof` is `"object"` (arrays included).
 *
 * @param value - Value to test.
 * @returns True when `value` can be safely indexed by string keys.
 */
function isObject(value: unknown): value is Record<string, unknown> {
  return typeof value === "object" && value !== null;
}
|
|
|
|
/**
 * Type guard for the `headwords` field: an array of rows in which every row
 * is an array of objects carrying a string `term`.
 *
 * @param value - Candidate `headwords` value.
 * @returns True when every row and every entry has the expected shape.
 */
function isHeadwordRows(value: unknown): value is YomitanParseHeadword[][] {
  if (!Array.isArray(value)) {
    return false;
  }

  const isBadEntry = (entry: unknown): boolean =>
    !isObject(entry) || typeof entry.term !== "string";
  const isBadRow = (row: unknown): boolean =>
    !Array.isArray(row) || row.some(isBadEntry);

  return !value.some(isBadRow);
}
|
|
|
|
/**
 * Collects the distinct, trimmed headword terms of a segment in first-seen order.
 *
 * @param segment - Parse segment whose `headwords` field is inspected.
 * @returns The unique non-empty terms, or an empty array when `headwords`
 *   does not have the expected shape.
 */
function extractHeadwordTerms(segment: YomitanParseSegment): string[] {
  const rows = segment.headwords;
  if (!isHeadwordRows(rows)) {
    return [];
  }

  // A Set keeps insertion order, so it gives both de-duplication and ordering.
  const uniqueTerms = new Set<string>();
  for (const row of rows) {
    for (const { term } of row) {
      const trimmed = (term as string).trim();
      if (trimmed) {
        uniqueTerms.add(trimmed);
      }
    }
  }
  return Array.from(uniqueTerms);
}
|
|
|
|
/**
 * Converts the raw, untyped parse result into validated candidates.
 *
 * Each top-level item needs a string `source` and an array `content`; anything
 * else is skipped. Within an item, a line is used only when it is an array
 * whose segments are all objects with a string `text`. Malformed lines
 * (including lines containing `null` or primitive segments) are skipped
 * instead of throwing. Token offsets are cumulative `surface.length` values
 * across all accepted lines of the item; segments with empty text produce no
 * token. Items that yield no tokens are dropped.
 *
 * @param parseResults - Value returned by the parser; may be of any shape.
 * @returns One candidate per usable item, in input order.
 */
function mapParseResultsToCandidates(parseResults: unknown): ParsedCandidate[] {
  if (!Array.isArray(parseResults)) {
    return [];
  }

  const candidates: ParsedCandidate[] = [];
  for (const item of parseResults) {
    if (!isObject(item)) {
      continue;
    }
    const { source, index, content } = item as YomitanParseResultItem;
    if (!Array.isArray(content) || typeof source !== "string") {
      continue;
    }

    const tokens: ParsedCandidate["tokens"] = [];
    let charOffset = 0;

    for (const line of content) {
      if (!Array.isArray(line)) {
        continue;
      }
      // Check that each segment is an object before reading `.text`; the data
      // comes from outside and a null segment would otherwise throw.
      const isUsableLine = line.every(
        (segment: unknown) => isObject(segment) && typeof segment.text === "string",
      );
      if (!isUsableLine) {
        continue;
      }

      for (const segment of line as YomitanParseSegment[]) {
        const surface = segment.text as string;
        if (!surface) {
          continue;
        }
        const startPos = charOffset;
        const endPos = startPos + surface.length;
        charOffset = endPos;

        tokens.push({
          surface,
          reading: typeof segment.reading === "string" ? segment.reading : "",
          // Fall back to the surface form when no headword is available.
          headword: extractHeadwordTerms(segment)[0] ?? surface,
          startPos,
          endPos,
        });
      }
    }

    if (tokens.length === 0) {
      continue;
    }

    candidates.push({
      source,
      index: typeof index === "number" && Number.isInteger(index) ? index : 0,
      tokens,
    });
  }

  return candidates;
}
|
|
|
|
/**
 * Builds a comparison key for a candidate token by joining its surface,
 * reading, headword and offsets with the U+001F unit separator.
 *
 * @param token - Candidate token to fingerprint.
 * @returns The joined signature string.
 */
function candidateTokenSignature(token: {
  surface: string;
  reading: string;
  headword: string;
  startPos: number;
  endPos: number;
}): string {
  const { surface, reading, headword, startPos, endPos } = token;
  return [surface, reading, headword, startPos, endPos]
    .map((field) => String(field))
    .join("\u001f");
}
|
|
|
|
/**
 * Builds a comparison key for a merged tokenizer token, using the same fields
 * and the same U+001F separator as `candidateTokenSignature`, so the two kinds
 * of signature can be compared directly.
 *
 * @param token - Merged token to fingerprint.
 * @returns The joined signature string.
 */
function mergedTokenSignature(token: MergedToken): string {
  const fields = [
    token.surface,
    token.reading,
    token.headword,
    token.startPos,
    token.endPos,
  ];
  return fields.map((field) => `${field}`).join("\u001f");
}
|
|
|
|
/**
 * Finds which candidates exactly match the tokenizer's final token sequence.
 *
 * A candidate matches when it has the same number of tokens as `mergedTokens`
 * and every token signature is equal at the same position.
 *
 * @param candidates - Parsed candidates to compare.
 * @param mergedTokens - Final tokens from the tokenizer; `null` or empty means
 *   nothing can match.
 * @returns Positions (within `candidates`) of all matching candidates, ascending.
 */
function findSelectedCandidateIndexes(
  candidates: ParsedCandidate[],
  mergedTokens: MergedToken[] | null,
): number[] {
  if (!mergedTokens?.length) {
    return [];
  }

  const expected = mergedTokens.map((token) => mergedTokenSignature(token));
  const matches: number[] = [];

  for (const [position, candidate] of candidates.entries()) {
    const actual = candidate.tokens.map((token) => candidateTokenSignature(token));
    const isSameSequence =
      actual.length === expected.length &&
      actual.every((signature, at) => signature === expected[at]);
    if (isSameSequence) {
      matches.push(position);
    }
  }

  return matches;
}
|
|
|
|
/**
 * Locates a Yomitan extension directory.
 *
 * Tries the explicit path first (when given), then `vendor/yomitan` under the
 * current working directory. A directory qualifies when it contains a
 * `manifest.json`.
 *
 * @param explicitPath - Optional directory supplied by the user.
 * @returns The absolute path of the first qualifying directory, or `null`.
 */
function resolveYomitanExtensionPath(explicitPath?: string): string | null {
  const searchDirs: string[] = [];
  if (explicitPath) {
    searchDirs.push(path.resolve(explicitPath));
  }
  searchDirs.push(path.resolve(process.cwd(), "vendor", "yomitan"));

  const found = searchDirs.find((dir) =>
    fs.existsSync(path.join(dir, "manifest.json")),
  );
  return found ?? null;
}
|
|
|
|
/**
 * Prepares the Yomitan side of the test: imports Electron, waits for the app
 * to be ready and loads the Yomitan extension into the default session.
 *
 * This function never throws for expected failures. Instead it returns a state
 * with `available: false` and a `note` explaining why: force-mecab mode,
 * Electron import failure (the import error message is kept), Electron not
 * providing `app`/`session`, no extension directory found, or the extension
 * failing to load.
 *
 * @param options - Parsed CLI options (`forceMecabOnly`, `yomitanUserDataPath`
 *   and `yomitanExtensionPath` are consulted).
 * @returns A fresh runtime state; the parser window and promises are left `null`.
 */
async function setupYomitanRuntime(
  options: CliOptions,
): Promise<YomitanRuntimeState> {
  const state: YomitanRuntimeState = {
    available: false,
    note: null,
    extension: null,
    parserWindow: null,
    parserReadyPromise: null,
    parserInitPromise: null,
  };

  if (options.forceMecabOnly) {
    state.note = "force-mecab enabled";
    return state;
  }

  const electronModule = await import("electron").catch((error) => {
    state.note = error instanceof Error ? error.message : "electron import failed";
    return null;
  });
  if (!electronModule?.app || !electronModule?.session) {
    // Keep the import error when there is one; it is more specific than the
    // generic note and would otherwise be lost.
    if (state.note === null) {
      state.note = "electron runtime not available in this process";
    }
    return state;
  }

  // The userData path is set before waiting for the app to become ready.
  if (options.yomitanUserDataPath) {
    electronModule.app.setPath("userData", options.yomitanUserDataPath);
  }
  await electronModule.app.whenReady();

  const extensionPath = resolveYomitanExtensionPath(options.yomitanExtensionPath);
  if (!extensionPath) {
    state.note = "no Yomitan extension directory found";
    return state;
  }

  try {
    state.extension = await electronModule.session.defaultSession.loadExtension(
      extensionPath,
      { allowFileAccess: true },
    );
    state.available = true;
    return state;
  } catch (error) {
    state.note =
      error instanceof Error
        ? error.message
        : "failed to load Yomitan extension";
    state.available = false;
    return state;
  }
}
|
|
|
|
/**
 * Asks the Yomitan backend, through the parser window, to parse `text` and
 * returns the raw result.
 *
 * A script is executed inside the window. It reads the full options via the
 * `optionsGetFull` action, takes the current profile index and that profile's
 * scanning length (40 when absent), then calls the `parseText` action with
 * both the internal and the MeCab parser enabled. The text is embedded with
 * `JSON.stringify`, so quotes, backticks and line breaks cannot break out of
 * the script.
 *
 * @param parserWindow - Window that has access to the extension's `chrome.runtime`.
 * @param text - Text to parse.
 * @returns Whatever the backend returned; callers must validate it.
 */
async function fetchRawParseResults(
  parserWindow: Electron.BrowserWindow,
  text: string,
): Promise<unknown> {
  const script = `
    (async () => {
      const invoke = (action, params) =>
        new Promise((resolve, reject) => {
          chrome.runtime.sendMessage({ action, params }, (response) => {
            if (chrome.runtime.lastError) {
              reject(new Error(chrome.runtime.lastError.message));
              return;
            }
            if (!response || typeof response !== "object") {
              reject(new Error("Invalid response from Yomitan backend"));
              return;
            }
            if (response.error) {
              reject(new Error(response.error.message || "Yomitan backend error"));
              return;
            }
            resolve(response.result);
          });
        });

      const optionsFull = await invoke("optionsGetFull", undefined);
      const profileIndex = optionsFull.profileCurrent;
      const scanLength =
        optionsFull.profiles?.[profileIndex]?.options?.scanning?.length ?? 40;

      return await invoke("parseText", {
        text: ${JSON.stringify(text)},
        optionsContext: { index: profileIndex },
        scanLength,
        useInternalParser: true,
        useMecabParser: true
      });
    })();
  `;
  // The second argument runs the script as if triggered by a user gesture.
  return parserWindow.webContents.executeJavaScript(script, true);
}
|
|
|
|
/**
 * Writes the human-readable report for a run to stdout: a header with the
 * input and Yomitan status, the tokenizer's final tokens, and every Yomitan
 * parse candidate with its tokens.
 *
 * @param payload - Report data; reads `input`, `tokenizerText`,
 *   `yomitanAvailable`, `yomitanNote`, `selectedCandidateIndexes`,
 *   `finalTokens` and `candidates`.
 */
function renderTextOutput(payload: Record<string, unknown>): void {
  const emit = (chunk: string): void => {
    process.stdout.write(chunk);
  };
  // Shared "surface -> headword (reading) [start, end)" rendering of a token.
  const describeToken = (token: Record<string, unknown>): string =>
    `${token.surface} -> ${token.headword} (${token.reading}) [${token.startPos}, ${token.endPos})`;

  emit(`Input: ${String(payload.input)}\n`);
  emit(`Tokenizer text: ${String(payload.tokenizerText)}\n`);
  emit(`Yomitan available: ${String(payload.yomitanAvailable)}\n`);
  emit(`Yomitan note: ${String(payload.yomitanNote ?? "")}\n`);
  emit(`Selected candidate indexes: ${JSON.stringify(payload.selectedCandidateIndexes)}\n`);

  emit("\nFinal selected tokens:\n");
  const finalTokens = payload.finalTokens as Array<Record<string, unknown>> | null;
  if (finalTokens && finalTokens.length > 0) {
    for (const [position, token] of finalTokens.entries()) {
      emit(` [${position}] ${describeToken(token)}\n`);
    }
  } else {
    emit(" (none)\n");
  }

  emit("\nYomitan parse candidates:\n");
  const candidates = payload.candidates as Array<Record<string, unknown>>;
  if (!candidates || candidates.length === 0) {
    emit(" (none)\n");
    return;
  }

  for (const [position, candidate] of candidates.entries()) {
    emit(
      ` [${position}] source=${String(candidate.source)} index=${String(candidate.index)} selectedByTokenizer=${String(candidate.selectedByTokenizer)} tokenCount=${String(candidate.tokenCount)}\n`,
    );
    const tokens = candidate.tokens as Array<Record<string, unknown>> | undefined;
    for (const token of tokens ?? []) {
      emit(` - ${describeToken(token)}\n`);
    }
  }
}
|
|
|
|
/**
 * Runs one parser test end to end.
 *
 * Steps: parse the CLI arguments, verify MeCab is available, set up the
 * Yomitan runtime, tokenize the input with the project tokenizer, fetch the
 * raw Yomitan parse candidates (when a live parser window exists), work out
 * which candidates match the tokenizer's final tokens, and print the report
 * as JSON or text. The Yomitan runtime is always shut down afterwards, whether
 * or not an error occurred.
 *
 * @throws Error when MeCab is not available; other failures from the
 *   tokenizer or the parser window propagate unchanged.
 */
async function main(): Promise<void> {
  const args = parseCliArgs(process.argv.slice(2));

  // Declared outside the try block so the finally clause can always shut it down.
  const yomitan: YomitanRuntimeState = {
    available: false,
    note: null,
    extension: null,
    parserWindow: null,
    parserReadyPromise: null,
    parserInitPromise: null,
  };

  try {
    const mecabTokenizer = new MecabTokenizer({
      mecabCommand: args.mecabCommand,
      dictionaryPath: args.mecabDictionaryPath,
    });
    const mecabReady = await mecabTokenizer.checkAvailability();
    if (!mecabReady) {
      throw new Error("MeCab is not available on this system.");
    }

    // Copy into the existing object: the accessors below close over `yomitan`.
    Object.assign(yomitan, await setupYomitanRuntime(args));

    const deps = createTokenizerDepsRuntime({
      getYomitanExt: () => yomitan.extension,
      getYomitanParserWindow: () => yomitan.parserWindow,
      setYomitanParserWindow: (window) => {
        yomitan.parserWindow = window;
      },
      getYomitanParserReadyPromise: () => yomitan.parserReadyPromise,
      setYomitanParserReadyPromise: (promise) => {
        yomitan.parserReadyPromise = promise;
      },
      getYomitanParserInitPromise: () => yomitan.parserInitPromise,
      setYomitanParserInitPromise: (promise) => {
        yomitan.parserInitPromise = promise;
      },
      isKnownWord: () => false,
      getKnownWordMatchMode: () => "headword",
      getJlptLevel: () => null,
      getMecabTokenizer: () => ({
        tokenize: (text: string) => mecabTokenizer.tokenize(text),
      }),
    });

    const subtitleData = await tokenizeSubtitle(args.input, deps);
    const tokenizeText = normalizeTokenizerText(args.input);

    // Candidates can only be fetched when a live parser window exists at this point.
    let rawParseResults: unknown = null;
    const parserWindow = yomitan.parserWindow;
    if (
      yomitan.available &&
      parserWindow &&
      !parserWindow.isDestroyed() &&
      tokenizeText
    ) {
      rawParseResults = await fetchRawParseResults(parserWindow, tokenizeText);
    }

    const parsedCandidates = mapParseResultsToCandidates(rawParseResults);
    const selectedCandidateIndexes = findSelectedCandidateIndexes(
      parsedCandidates,
      subtitleData.tokens,
    );

    const finalTokens =
      subtitleData.tokens?.map(
        ({
          surface,
          reading,
          headword,
          startPos,
          endPos,
          pos1,
          partOfSpeech,
          isKnown,
          isNPlusOneTarget,
        }) => ({
          surface,
          reading,
          headword,
          startPos,
          endPos,
          pos1,
          partOfSpeech,
          isKnown,
          isNPlusOneTarget,
        }),
      ) ?? null;

    const candidates = parsedCandidates.map(({ source, index, tokens }, position) => ({
      source,
      index,
      selectedByTokenizer: selectedCandidateIndexes.includes(position),
      tokenCount: tokens.length,
      tokens,
    }));

    const payload = {
      input: args.input,
      tokenizerText: subtitleData.text,
      yomitanAvailable: yomitan.available,
      yomitanNote: yomitan.note,
      selectedCandidateIndexes,
      finalTokens,
      candidates,
    };

    if (!args.emitJson) {
      renderTextOutput(payload);
    } else {
      const indent = args.emitPretty ? 2 : undefined;
      process.stdout.write(`${JSON.stringify(payload, null, indent)}\n`);
    }
  } finally {
    await shutdownYomitanRuntime(yomitan);
  }
}
|
|
|
|
// Script entry point: run main(), then exit explicitly with 0 on success, or
// print the error message and exit with 1 on failure.
void (async () => {
  let exitCode = 0;
  try {
    await main();
  } catch (error) {
    console.error(`Error: ${(error as Error).message}`);
    exitCode = 1;
  }
  process.exit(exitCode);
})();
|