mirror of
https://github.com/ksyasuda/SubMiner.git
synced 2026-02-27 18:22:41 -08:00
feat(tokenizer): refine Yomitan grouping and parser tooling
- map segmented Yomitan lines into single logical tokens and improve candidate selection heuristics
- limit frequency lookup to selected token text with POS-based exclusions and add debug logging hook
- add standalone Yomitan parser test script, deterministic utility-script shutdown, and docs/backlog updates
This commit is contained in:
@@ -385,7 +385,7 @@ function printUsage(): void {
|
||||
pnpm run get-frequency [--pretty] [--verbose] [--dictionary <path>] [--mecab-command <path>] [--mecab-dictionary <path>] <text>
|
||||
|
||||
--pretty Pretty-print JSON output.
|
||||
--verbose Include merged-frequency diagnostics and lookup terms.
|
||||
--verbose Include merged-frequency diagnostics and lookup term details.
|
||||
--force-mecab Skip Yomitan parser initialization and force MeCab fallback.
|
||||
--yomitan-extension <path> Optional path to a Yomitan extension directory.
|
||||
--yomitan-user-data <path> Optional Electron userData directory for Yomitan state.
|
||||
@@ -413,41 +413,8 @@ type FrequencyCandidate = {
|
||||
};
|
||||
|
||||
function getFrequencyLookupTextCandidates(token: MergedToken): string[] {
|
||||
const tokenWithCandidates = token as MergedToken & {
|
||||
frequencyLookupTerms?: string[];
|
||||
};
|
||||
const lookupTextCandidates: string[] = [];
|
||||
const addLookupText = (text: string | undefined): void => {
|
||||
if (!text) {
|
||||
return;
|
||||
}
|
||||
const trimmed = text.trim();
|
||||
if (!trimmed) {
|
||||
return;
|
||||
}
|
||||
lookupTextCandidates.push(trimmed);
|
||||
};
|
||||
|
||||
if (Array.isArray(tokenWithCandidates.frequencyLookupTerms)) {
|
||||
for (const term of tokenWithCandidates.frequencyLookupTerms) {
|
||||
addLookupText(term);
|
||||
}
|
||||
}
|
||||
|
||||
addLookupText(token.headword);
|
||||
addLookupText(token.reading);
|
||||
addLookupText(token.surface);
|
||||
|
||||
const uniqueLookupTerms: string[] = [];
|
||||
const seen = new Set<string>();
|
||||
for (const term of lookupTextCandidates) {
|
||||
if (seen.has(term)) {
|
||||
continue;
|
||||
}
|
||||
seen.add(term);
|
||||
uniqueLookupTerms.push(term);
|
||||
}
|
||||
return uniqueLookupTerms;
|
||||
const lookupText = token.headword?.trim() || token.reading?.trim() || token.surface.trim();
|
||||
return lookupText ? [lookupText] : [];
|
||||
}
|
||||
|
||||
function getBestFrequencyLookupCandidate(
|
||||
@@ -488,10 +455,6 @@ function simplifyTokenWithVerbose(
|
||||
token: MergedToken,
|
||||
getFrequencyRank: FrequencyDictionaryLookup,
|
||||
): Record<string, unknown> {
|
||||
const tokenWithCandidates = token as MergedToken & {
|
||||
frequencyLookupTerms?: string[];
|
||||
};
|
||||
const frequencyLookupTerms = tokenWithCandidates.frequencyLookupTerms;
|
||||
const candidates = getFrequencyLookupTextCandidates(token).map((term) => ({
|
||||
term,
|
||||
rank: getFrequencyRank(term),
|
||||
@@ -518,10 +481,6 @@ function simplifyTokenWithVerbose(
|
||||
isNPlusOneTarget: token.isNPlusOneTarget,
|
||||
frequencyRank: token.frequencyRank,
|
||||
jlptLevel: token.jlptLevel,
|
||||
frequencyLookupTerms:
|
||||
Array.isArray(frequencyLookupTerms) && frequencyLookupTerms.length > 0
|
||||
? frequencyLookupTerms
|
||||
: undefined,
|
||||
frequencyCandidates: candidates,
|
||||
frequencyBestLookupTerm: bestCandidate?.term ?? null,
|
||||
frequencyBestLookupRank: bestCandidate?.rank ?? null,
|
||||
@@ -537,6 +496,25 @@ interface YomitanRuntimeState {
|
||||
note?: string;
|
||||
}
|
||||
|
||||
/**
 * Destroys a parser window whose static type is not known to this script.
 *
 * The value is acted on only when it is a non-null object exposing both an
 * `isDestroyed()` and a `destroy()` function; any other value is ignored.
 * `destroy()` is invoked only when `isDestroyed()` returns false.
 *
 * @param window - Candidate window object (or any other value).
 */
function destroyUnknownParserWindow(window: unknown): void {
  if (!window || typeof window !== "object") {
    return;
  }
  const candidate = window as {
    isDestroyed?: () => boolean;
    destroy?: () => void;
  };
  // Both methods must exist before either one is called.
  if (
    typeof candidate.isDestroyed !== "function" ||
    typeof candidate.destroy !== "function"
  ) {
    return;
  }
  if (!candidate.isDestroyed()) {
    candidate.destroy();
  }
}
|
||||
|
||||
async function createYomitanRuntimeState(
|
||||
userDataPath: string,
|
||||
): Promise<YomitanRuntimeState> {
|
||||
@@ -775,133 +753,141 @@ function renderColoredLine(
|
||||
}
|
||||
|
||||
async function main(): Promise<void> {
|
||||
const args = parseCliArgs(process.argv.slice(2));
|
||||
const getFrequencyRank = await getFrequencyLookup(args.dictionaryPath);
|
||||
let electronModule: (typeof import("electron")) | null = null;
|
||||
let yomitanState: YomitanRuntimeState | null = null;
|
||||
|
||||
const mecabTokenizer = new MecabTokenizer({
|
||||
mecabCommand: args.mecabCommand,
|
||||
dictionaryPath: args.mecabDictionaryPath,
|
||||
});
|
||||
const isMecabAvailable = await mecabTokenizer.checkAvailability();
|
||||
if (!isMecabAvailable) {
|
||||
throw new Error(
|
||||
"MeCab is not available on this system. Install/run environment with MeCab to tokenize input.",
|
||||
);
|
||||
}
|
||||
try {
|
||||
const args = parseCliArgs(process.argv.slice(2));
|
||||
const getFrequencyRank = await getFrequencyLookup(args.dictionaryPath);
|
||||
|
||||
const app = await import("electron").catch(() => null);
|
||||
if (app && args.yomitanUserDataPath) {
|
||||
app.app.setPath("userData", args.yomitanUserDataPath);
|
||||
}
|
||||
const yomitanState =
|
||||
!args.forceMecabOnly
|
||||
? await createYomitanRuntimeStateWithSearch(
|
||||
app?.app?.getPath ? app.app.getPath("userData") : process.cwd(),
|
||||
args.yomitanExtensionPath,
|
||||
)
|
||||
: null;
|
||||
const hasYomitan = Boolean(yomitanState?.available && yomitanState?.yomitanExt);
|
||||
const mecabTokenizer = new MecabTokenizer({
|
||||
mecabCommand: args.mecabCommand,
|
||||
dictionaryPath: args.mecabDictionaryPath,
|
||||
});
|
||||
const isMecabAvailable = await mecabTokenizer.checkAvailability();
|
||||
if (!isMecabAvailable) {
|
||||
throw new Error(
|
||||
"MeCab is not available on this system. Install/run environment with MeCab to tokenize input.",
|
||||
);
|
||||
}
|
||||
|
||||
const deps = createTokenizerDepsRuntimeService({
|
||||
getYomitanExt: () =>
|
||||
(hasYomitan ? yomitanState!.yomitanExt : null) as never,
|
||||
getYomitanParserWindow: () =>
|
||||
(hasYomitan ? yomitanState!.parserWindow : null) as never,
|
||||
setYomitanParserWindow: (window) => {
|
||||
if (!hasYomitan) {
|
||||
return;
|
||||
}
|
||||
yomitanState!.parserWindow = window;
|
||||
},
|
||||
getYomitanParserReadyPromise: () =>
|
||||
(hasYomitan ? yomitanState!.parserReadyPromise : null) as never,
|
||||
setYomitanParserReadyPromise: (promise) => {
|
||||
if (!hasYomitan) {
|
||||
return;
|
||||
}
|
||||
yomitanState!.parserReadyPromise = promise;
|
||||
},
|
||||
getYomitanParserInitPromise: () =>
|
||||
(hasYomitan ? yomitanState!.parserInitPromise : null) as never,
|
||||
setYomitanParserInitPromise: (promise) => {
|
||||
if (!hasYomitan) {
|
||||
return;
|
||||
}
|
||||
yomitanState!.parserInitPromise = promise;
|
||||
},
|
||||
isKnownWord: () => false,
|
||||
getKnownWordMatchMode: () => "headword",
|
||||
getJlptLevel: () => null,
|
||||
getFrequencyDictionaryEnabled: () => true,
|
||||
getFrequencyRank,
|
||||
getMecabTokenizer: () => ({
|
||||
tokenize: (text: string) => mecabTokenizer.tokenize(text),
|
||||
}),
|
||||
});
|
||||
electronModule = await import("electron").catch(() => null);
|
||||
if (electronModule && args.yomitanUserDataPath) {
|
||||
electronModule.app.setPath("userData", args.yomitanUserDataPath);
|
||||
}
|
||||
yomitanState =
|
||||
!args.forceMecabOnly
|
||||
? await createYomitanRuntimeStateWithSearch(
|
||||
electronModule?.app?.getPath
|
||||
? electronModule.app.getPath("userData")
|
||||
: process.cwd(),
|
||||
args.yomitanExtensionPath,
|
||||
)
|
||||
: null;
|
||||
const hasYomitan = Boolean(yomitanState?.available && yomitanState?.yomitanExt);
|
||||
|
||||
const subtitleData = await tokenizeSubtitleService(args.input, deps);
|
||||
const tokenCount = subtitleData.tokens?.length ?? 0;
|
||||
const mergedCount = subtitleData.tokens?.filter((token) => token.isMerged).length ?? 0;
|
||||
const hasYomitanCandidates = Boolean(
|
||||
subtitleData.tokens?.some((token) => {
|
||||
const frequencyLookupTerms = (
|
||||
token as MergedToken & { frequencyLookupTerms?: string[] }
|
||||
).frequencyLookupTerms;
|
||||
return Array.isArray(frequencyLookupTerms) && frequencyLookupTerms.length > 0;
|
||||
}) ?? false,
|
||||
);
|
||||
const tokens =
|
||||
subtitleData.tokens?.map((token) =>
|
||||
args.emitVerbose
|
||||
? simplifyTokenWithVerbose(token, getFrequencyRank)
|
||||
: simplifyToken(token),
|
||||
) ?? null;
|
||||
const diagnostics = {
|
||||
yomitan: {
|
||||
available: Boolean(yomitanState?.available),
|
||||
loaded: hasYomitan,
|
||||
forceMecabOnly: args.forceMecabOnly,
|
||||
note: yomitanState?.note ?? null,
|
||||
},
|
||||
mecab: {
|
||||
command: args.mecabCommand ?? "mecab",
|
||||
dictionaryPath: args.mecabDictionaryPath ?? null,
|
||||
available: isMecabAvailable,
|
||||
},
|
||||
tokenizer: {
|
||||
sourceHint:
|
||||
tokenCount === 0
|
||||
? "none"
|
||||
: hasYomitan ? "yomitan-merged" : "mecab-merge",
|
||||
mergedTokenCount: mergedCount,
|
||||
totalTokenCount: tokenCount,
|
||||
},
|
||||
};
|
||||
if (tokens === null) {
|
||||
diagnostics.mecab["status"] = "no-tokens";
|
||||
diagnostics.mecab["note"] =
|
||||
"MeCab returned no parseable tokens. This is often caused by a missing/invalid MeCab dictionary path.";
|
||||
} else {
|
||||
diagnostics.mecab["status"] = "ok";
|
||||
}
|
||||
const deps = createTokenizerDepsRuntimeService({
|
||||
getYomitanExt: () =>
|
||||
(hasYomitan ? yomitanState!.yomitanExt : null) as never,
|
||||
getYomitanParserWindow: () =>
|
||||
(hasYomitan ? yomitanState!.parserWindow : null) as never,
|
||||
setYomitanParserWindow: (window) => {
|
||||
if (!hasYomitan) {
|
||||
return;
|
||||
}
|
||||
yomitanState!.parserWindow = window;
|
||||
},
|
||||
getYomitanParserReadyPromise: () =>
|
||||
(hasYomitan ? yomitanState!.parserReadyPromise : null) as never,
|
||||
setYomitanParserReadyPromise: (promise) => {
|
||||
if (!hasYomitan) {
|
||||
return;
|
||||
}
|
||||
yomitanState!.parserReadyPromise = promise;
|
||||
},
|
||||
getYomitanParserInitPromise: () =>
|
||||
(hasYomitan ? yomitanState!.parserInitPromise : null) as never,
|
||||
setYomitanParserInitPromise: (promise) => {
|
||||
if (!hasYomitan) {
|
||||
return;
|
||||
}
|
||||
yomitanState!.parserInitPromise = promise;
|
||||
},
|
||||
isKnownWord: () => false,
|
||||
getKnownWordMatchMode: () => "headword",
|
||||
getJlptLevel: () => null,
|
||||
getFrequencyDictionaryEnabled: () => true,
|
||||
getFrequencyRank,
|
||||
getMecabTokenizer: () => ({
|
||||
tokenize: (text: string) => mecabTokenizer.tokenize(text),
|
||||
}),
|
||||
});
|
||||
|
||||
const output = {
|
||||
input: args.input,
|
||||
tokenizerText: subtitleData.text,
|
||||
tokens,
|
||||
diagnostics,
|
||||
};
|
||||
const subtitleData = await tokenizeSubtitleService(args.input, deps);
|
||||
const tokenCount = subtitleData.tokens?.length ?? 0;
|
||||
const mergedCount = subtitleData.tokens?.filter((token) => token.isMerged).length ?? 0;
|
||||
const tokens =
|
||||
subtitleData.tokens?.map((token) =>
|
||||
args.emitVerbose
|
||||
? simplifyTokenWithVerbose(token, getFrequencyRank)
|
||||
: simplifyToken(token),
|
||||
) ?? null;
|
||||
const diagnostics = {
|
||||
yomitan: {
|
||||
available: Boolean(yomitanState?.available),
|
||||
loaded: hasYomitan,
|
||||
forceMecabOnly: args.forceMecabOnly,
|
||||
note: yomitanState?.note ?? null,
|
||||
},
|
||||
mecab: {
|
||||
command: args.mecabCommand ?? "mecab",
|
||||
dictionaryPath: args.mecabDictionaryPath ?? null,
|
||||
available: isMecabAvailable,
|
||||
},
|
||||
tokenizer: {
|
||||
sourceHint:
|
||||
tokenCount === 0
|
||||
? "none"
|
||||
: hasYomitan ? "yomitan-merged" : "mecab-merge",
|
||||
mergedTokenCount: mergedCount,
|
||||
totalTokenCount: tokenCount,
|
||||
},
|
||||
};
|
||||
if (tokens === null) {
|
||||
diagnostics.mecab["status"] = "no-tokens";
|
||||
diagnostics.mecab["note"] =
|
||||
"MeCab returned no parseable tokens. This is often caused by a missing/invalid MeCab dictionary path.";
|
||||
} else {
|
||||
diagnostics.mecab["status"] = "ok";
|
||||
}
|
||||
|
||||
const json = JSON.stringify(output, null, args.emitPretty ? 2 : undefined);
|
||||
process.stdout.write(`${json}\n`);
|
||||
const output = {
|
||||
input: args.input,
|
||||
tokenizerText: subtitleData.text,
|
||||
tokens,
|
||||
diagnostics,
|
||||
};
|
||||
|
||||
if (args.emitColoredLine && subtitleData.tokens) {
|
||||
const coloredLine = renderColoredLine(subtitleData.text, subtitleData.tokens, args);
|
||||
process.stdout.write(`${coloredLine}\n`);
|
||||
const json = JSON.stringify(output, null, args.emitPretty ? 2 : undefined);
|
||||
process.stdout.write(`${json}\n`);
|
||||
|
||||
if (args.emitColoredLine && subtitleData.tokens) {
|
||||
const coloredLine = renderColoredLine(subtitleData.text, subtitleData.tokens, args);
|
||||
process.stdout.write(`${coloredLine}\n`);
|
||||
}
|
||||
} finally {
|
||||
destroyUnknownParserWindow(yomitanState?.parserWindow ?? null);
|
||||
if (electronModule?.app) {
|
||||
electronModule.app.quit();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
main().catch((error) => {
|
||||
console.error(`Error: ${(error as Error).message}`);
|
||||
process.exit(1);
|
||||
});
|
||||
main()
|
||||
.then(() => {
|
||||
process.exit(0);
|
||||
})
|
||||
.catch((error) => {
|
||||
console.error(`Error: ${(error as Error).message}`);
|
||||
process.exit(1);
|
||||
});
|
||||
|
||||
653
scripts/test-yomitan-parser.ts
Normal file
653
scripts/test-yomitan-parser.ts
Normal file
@@ -0,0 +1,653 @@
|
||||
import fs from "node:fs";
|
||||
import path from "node:path";
|
||||
import process from "node:process";
|
||||
|
||||
import { createTokenizerDepsRuntimeService, tokenizeSubtitleService } from "../src/core/services/tokenizer-service.js";
|
||||
import { MecabTokenizer } from "../src/mecab-tokenizer.js";
|
||||
import type { MergedToken } from "../src/types.js";
|
||||
|
||||
/** Options produced by `parseCliArgs` for the Yomitan parser test script. */
interface CliOptions {
  /** Text to tokenize, taken from positional arguments or stdin. */
  input: string;
  /** Set by `--pretty`: indent JSON output. */
  emitPretty: boolean;
  /** Set by `--json`: emit JSON instead of the text report. */
  emitJson: boolean;
  /** Set by `--force-mecab`: skip Yomitan setup entirely. */
  forceMecabOnly: boolean;
  /** Resolved path given via `--yomitan-extension`. */
  yomitanExtensionPath?: string;
  /** Resolved path given via `--yomitan-user-data`. */
  yomitanUserDataPath?: string;
  /** Value given via `--mecab-command`. */
  mecabCommand?: string;
  /** Value given via `--mecab-dictionary`. */
  mecabDictionaryPath?: string;
}
|
||||
|
||||
/**
 * One headword entry in raw Yomitan parse output. The field is `unknown`
 * because the payload is validated at runtime (see `isHeadwordRows`).
 */
interface YomitanParseHeadword {
  term?: unknown;
}
|
||||
|
||||
/**
 * One segment of a line in raw Yomitan parse output. All fields are
 * `unknown` and are checked at runtime before use.
 */
interface YomitanParseSegment {
  text?: unknown;
  reading?: unknown;
  headwords?: unknown;
}
|
||||
|
||||
/**
 * One top-level item of raw Yomitan parse output. All fields are `unknown`
 * and are checked at runtime by `mapParseResultsToCandidates`.
 */
interface YomitanParseResultItem {
  source?: unknown;
  index?: unknown;
  content?: unknown;
}
|
||||
|
||||
/** A validated parse candidate derived from one raw Yomitan parse result item. */
interface ParsedCandidate {
  /** The result item's `source` string. */
  source: string;
  /** The result item's integer `index`, or 0 when it was absent or not an integer. */
  index: number;
  /** Tokens in order; `startPos`/`endPos` are running offsets by surface length. */
  tokens: Array<{
    surface: string;
    reading: string;
    headword: string;
    startPos: number;
    endPos: number;
  }>;
}
|
||||
|
||||
/**
 * Mutable Yomitan runtime state shared between setup, the tokenizer
 * dependency getters/setters wired up in `main`, and shutdown.
 */
interface YomitanRuntimeState {
  /** True once the Yomitan extension has been loaded successfully. */
  available: boolean;
  /** Explanation of why Yomitan is unavailable, when known. */
  note: string | null;
  /** The loaded extension, or null when not loaded. */
  extension: Electron.Extension | null;
  /** Parser window stored through the tokenizer deps setter; destroyed on shutdown. */
  parserWindow: Electron.BrowserWindow | null;
  /** Stored through the tokenizer deps setter. */
  parserReadyPromise: Promise<void> | null;
  /** Stored through the tokenizer deps setter. */
  parserInitPromise: Promise<boolean> | null;
}
|
||||
|
||||
/**
 * Destroys the parser window unless it is null or already destroyed.
 *
 * @param window - The parser window, or null when none was created.
 */
function destroyParserWindow(window: Electron.BrowserWindow | null): void {
  if (window && !window.isDestroyed()) {
    window.destroy();
  }
}
|
||||
|
||||
/**
 * Tears down the Yomitan runtime: destroys the parser window (if any) and,
 * when the `electron` module can be imported and exposes `app`, calls
 * `app.quit()`. A failed import is ignored.
 *
 * @param yomitan - Runtime state whose `parserWindow` should be destroyed.
 */
async function shutdownYomitanRuntime(yomitan: YomitanRuntimeState): Promise<void> {
  destroyParserWindow(yomitan.parserWindow);
  const electronModule = await import("electron").catch(() => null);
  if (electronModule?.app) {
    electronModule.app.quit();
  }
}
|
||||
|
||||
/**
 * Parses command-line arguments for the Yomitan parser test script.
 *
 * Value flags accept both `--flag value` and `--flag=value`. Positional
 * arguments are joined with single spaces to form the input text; when no
 * positional text is given, the input is read from stdin (fd 0).
 *
 * `--help` / `-h` prints usage and exits the process with status 0.
 *
 * @param argv - Arguments after the script name (e.g. `process.argv.slice(2)`).
 * @returns The parsed options.
 * @throws Error when a value flag has no (or an empty) value, when an unknown
 *   flag is given, or when no input text is available from arguments or stdin.
 */
function parseCliArgs(argv: string[]): CliOptions {
  const args = [...argv];
  const inputParts: string[] = [];
  let emitPretty = false;
  let emitJson = false;
  let forceMecabOnly = false;
  let yomitanExtensionPath: string | undefined;
  let yomitanUserDataPath: string | undefined;
  let mecabCommand: string | undefined;
  let mecabDictionaryPath: string | undefined;

  // Returns the value for `flag` when `arg` is `flag` (consuming the next
  // argument) or `flag=value`; returns undefined when `arg` is another option.
  const takeValue = (arg: string, flag: string): string | undefined => {
    if (arg === flag) {
      const next = args.shift();
      if (!next) {
        throw new Error(`Missing value for ${flag}`);
      }
      return next;
    }
    const prefix = `${flag}=`;
    if (arg.startsWith(prefix)) {
      const value = arg.slice(prefix.length);
      // Treat `--flag=` like `--flag` with nothing after it.
      if (!value) {
        throw new Error(`Missing value for ${flag}`);
      }
      return value;
    }
    return undefined;
  };

  while (args.length > 0) {
    const arg = args.shift();
    if (arg === undefined) {
      break;
    }
    // An empty argument contributes nothing; skip it rather than ending
    // parsing, so later flags and text are not silently dropped.
    if (arg === "") {
      continue;
    }

    if (arg === "--help" || arg === "-h") {
      printUsage();
      process.exit(0);
    }
    if (arg === "--pretty") {
      emitPretty = true;
      continue;
    }
    if (arg === "--json") {
      emitJson = true;
      continue;
    }
    if (arg === "--force-mecab") {
      forceMecabOnly = true;
      continue;
    }

    const extensionValue = takeValue(arg, "--yomitan-extension");
    if (extensionValue !== undefined) {
      yomitanExtensionPath = path.resolve(extensionValue);
      continue;
    }
    const userDataValue = takeValue(arg, "--yomitan-user-data");
    if (userDataValue !== undefined) {
      yomitanUserDataPath = path.resolve(userDataValue);
      continue;
    }
    const mecabCommandValue = takeValue(arg, "--mecab-command");
    if (mecabCommandValue !== undefined) {
      mecabCommand = mecabCommandValue;
      continue;
    }
    const mecabDictionaryValue = takeValue(arg, "--mecab-dictionary");
    if (mecabDictionaryValue !== undefined) {
      mecabDictionaryPath = mecabDictionaryValue;
      continue;
    }

    if (arg.startsWith("-")) {
      throw new Error(`Unknown flag: ${arg}`);
    }
    inputParts.push(arg);
  }

  let input = inputParts.join(" ").trim();
  if (input.length === 0) {
    // No positional text: fall back to stdin.
    input = fs.readFileSync(0, "utf8").trim();
    if (!input) {
      throw new Error(
        "Please provide input text as arguments or via stdin.",
      );
    }
  }

  return {
    input,
    emitPretty,
    emitJson,
    forceMecabOnly,
    yomitanExtensionPath,
    yomitanUserDataPath,
    mecabCommand,
    mecabDictionaryPath,
  };
}
|
||||
|
||||
/**
 * Writes the command-line usage text to stdout. The synopsis lists every
 * flag that `parseCliArgs` accepts, including `--force-mecab`.
 */
function printUsage(): void {
  process.stdout.write(`Usage:
pnpm run test-yomitan-parser:electron -- [--pretty] [--json] [--force-mecab] [--yomitan-extension <path>] [--yomitan-user-data <path>] [--mecab-command <path>] [--mecab-dictionary <path>] <text>

--pretty Pretty-print JSON output.
--json Emit machine-readable JSON output.
--force-mecab Skip Yomitan parser setup and test MeCab fallback only.
--yomitan-extension <path> Optional path to Yomitan extension directory.
--yomitan-user-data <path> Optional Electron userData directory.
--mecab-command <path> Optional MeCab binary path (default: mecab).
--mecab-dictionary <path> Optional MeCab dictionary directory.
-h, --help Show usage.
`);
}
|
||||
|
||||
/**
 * Normalizes subtitle text for display: CRLF pairs and the literal
 * two-character escapes `\N` and `\n` become real newlines, then the result
 * is trimmed.
 *
 * @param text - Raw subtitle text.
 * @returns The normalized text.
 */
function normalizeDisplayText(text: string): string {
  // A single pass is equivalent to replacing each pattern in turn: the
  // replacement is a bare newline, which cannot form a new match.
  return text.replace(/\r\n|\\[Nn]/g, "\n").trim();
}
|
||||
|
||||
/**
 * Normalizes text for comparison with tokenizer input: applies
 * `normalizeDisplayText`, then collapses every whitespace run (newlines
 * included) to a single space and trims.
 *
 * @param text - Raw subtitle text.
 * @returns Single-line, whitespace-collapsed text.
 */
function normalizeTokenizerText(text: string): string {
  // `\s+` already matches newlines, so no separate newline replacement is needed.
  return normalizeDisplayText(text).replace(/\s+/g, " ").trim();
}
|
||||
|
||||
/**
 * Type guard for any non-null value whose `typeof` is `"object"`.
 * Arrays satisfy this check as well.
 *
 * @param value - Value to test.
 */
function isObject(value: unknown): value is Record<string, unknown> {
  return typeof value === "object" && value !== null;
}
|
||||
|
||||
/**
 * Type guard: true when `value` is an array of arrays in which every entry
 * is an object with a string `term`. Empty arrays and empty rows pass.
 *
 * @param value - The `headwords` field of a parse segment.
 */
function isHeadwordRows(value: unknown): value is YomitanParseHeadword[][] {
  if (!Array.isArray(value)) {
    return false;
  }
  return value.every(
    (row) =>
      Array.isArray(row) &&
      row.every((entry) => isObject(entry) && typeof entry.term === "string"),
  );
}
|
||||
|
||||
/**
 * Collects the distinct, trimmed, non-empty headword terms of a segment in
 * first-seen order.
 *
 * @param segment - Raw parse segment.
 * @returns The unique terms, or an empty array when `segment.headwords`
 *   does not pass `isHeadwordRows`.
 */
function extractHeadwordTerms(segment: YomitanParseSegment): string[] {
  if (!isHeadwordRows(segment.headwords)) {
    return [];
  }
  // A Set keeps insertion order, so it deduplicates and preserves ordering.
  const terms = new Set<string>();
  for (const row of segment.headwords) {
    for (const entry of row) {
      const term = (entry.term as string).trim();
      if (term) {
        terms.add(term);
      }
    }
  }
  return Array.from(terms);
}
|
||||
|
||||
/**
 * Converts raw Yomitan parse results into validated candidates.
 *
 * An item is kept only when it is an object with a string `source` and an
 * array `content`. Within an item, a line is used only when it is an array
 * whose every segment is an object with a string `text`; other lines are
 * skipped (including lines containing `null` or primitive segments, which
 * no longer raise a TypeError). Segments with empty text produce no token.
 * Token offsets run continuously across all used lines of an item.
 *
 * @param parseResults - Untrusted value returned by the Yomitan backend.
 * @returns Candidates that have at least one used line and one token.
 */
function mapParseResultsToCandidates(parseResults: unknown): ParsedCandidate[] {
  if (!Array.isArray(parseResults)) {
    return [];
  }

  const candidates: ParsedCandidate[] = [];
  for (const item of parseResults) {
    if (!isObject(item)) {
      continue;
    }
    const parseItem = item as YomitanParseResultItem;
    if (!Array.isArray(parseItem.content) || typeof parseItem.source !== "string") {
      continue;
    }

    const candidateTokens: ParsedCandidate["tokens"] = [];
    let charOffset = 0;
    let validLineCount = 0;

    for (const line of parseItem.content) {
      if (!Array.isArray(line)) {
        continue;
      }
      // Guard with isObject first so null/undefined segments skip the line
      // instead of throwing on property access.
      const everySegmentHasText = line.every(
        (segment: unknown) => isObject(segment) && typeof segment.text === "string",
      );
      if (!everySegmentHasText) {
        continue;
      }
      validLineCount += 1;

      for (const segment of line as Array<YomitanParseSegment & { text: string }>) {
        const surface = segment.text;
        if (!surface) {
          continue;
        }
        const startPos = charOffset;
        const endPos = startPos + surface.length;
        charOffset = endPos;
        const headwordTerms = extractHeadwordTerms(segment);
        candidateTokens.push({
          surface,
          reading: typeof segment.reading === "string" ? segment.reading : "",
          // Fall back to the surface form when no headword term is available.
          headword: headwordTerms[0] ?? surface,
          startPos,
          endPos,
        });
      }
    }

    if (validLineCount === 0 || candidateTokens.length === 0) {
      continue;
    }

    candidates.push({
      source: parseItem.source,
      index:
        typeof parseItem.index === "number" && Number.isInteger(parseItem.index)
          ? parseItem.index
          : 0,
      tokens: candidateTokens,
    });
  }

  return candidates;
}
|
||||
|
||||
/**
 * Builds a comparison key for a candidate token by joining surface, reading,
 * headword, startPos and endPos with the U+001F separator character.
 *
 * @param token - Candidate token.
 * @returns The signature string.
 */
function candidateTokenSignature(token: {
  surface: string;
  reading: string;
  headword: string;
  startPos: number;
  endPos: number;
}): string {
  const { surface, reading, headword, startPos, endPos } = token;
  return `${surface}\u001f${reading}\u001f${headword}\u001f${startPos}\u001f${endPos}`;
}
|
||||
|
||||
/**
 * Builds a comparison key for a tokenizer output token, using the same
 * field order and U+001F separator as `candidateTokenSignature` so the two
 * kinds of signature can be compared for equality.
 *
 * @param token - Token produced by the tokenizer service.
 * @returns The signature string.
 */
function mergedTokenSignature(token: MergedToken): string {
  const { surface, reading, headword, startPos, endPos } = token;
  return `${surface}\u001f${reading}\u001f${headword}\u001f${startPos}\u001f${endPos}`;
}
|
||||
|
||||
/**
 * Finds which parse candidates are token-for-token identical to the
 * tokenizer's final output.
 *
 * A candidate matches when it has the same number of tokens as
 * `mergedTokens` and every token signature is equal at the same position.
 *
 * @param candidates - Candidates derived from raw Yomitan parse results.
 * @param mergedTokens - Final tokenizer tokens, or null when there are none.
 * @returns Indexes into `candidates` of every matching candidate, ascending;
 *   empty when `mergedTokens` is null or empty.
 */
function findSelectedCandidateIndexes(
  candidates: ParsedCandidate[],
  mergedTokens: MergedToken[] | null,
): number[] {
  if (!mergedTokens || mergedTokens.length === 0) {
    return [];
  }

  const mergedSignatures = mergedTokens.map(mergedTokenSignature);
  const selected: number[] = [];
  candidates.forEach((candidate, candidateIndex) => {
    if (candidate.tokens.length !== mergedSignatures.length) {
      return;
    }
    const matches = candidate.tokens.every(
      (token, position) => candidateTokenSignature(token) === mergedSignatures[position],
    );
    if (matches) {
      selected.push(candidateIndex);
    }
  });
  return selected;
}
|
||||
|
||||
/**
 * Locates a Yomitan extension directory.
 *
 * Tries, in order, the explicit path (when given) and `vendor/yomitan`
 * under the current working directory, returning the first one that
 * contains a `manifest.json`.
 *
 * @param explicitPath - Optional user-supplied extension directory.
 * @returns The absolute directory path, or null when neither location qualifies.
 */
function resolveYomitanExtensionPath(explicitPath?: string): string | null {
  const searchPaths: string[] = [];
  if (explicitPath) {
    searchPaths.push(path.resolve(explicitPath));
  }
  searchPaths.push(path.resolve(process.cwd(), "vendor", "yomitan"));

  const found = searchPaths.find((directory) =>
    fs.existsSync(path.join(directory, "manifest.json")),
  );
  return found ?? null;
}
|
||||
|
||||
/**
 * Prepares the Yomitan runtime for the test script.
 *
 * Steps: return early when `forceMecabOnly` is set; dynamically import
 * `electron`; optionally override the `userData` path; wait for
 * `app.whenReady()`; locate the extension directory; load it into the
 * default session. This function never rejects for those expected failure
 * modes — it returns a state with `available: false` and an explanatory
 * `note` instead.
 *
 * @param options - Parsed CLI options.
 * @returns Runtime state; `available` is true only when the extension loaded.
 */
async function setupYomitanRuntime(
  options: CliOptions,
): Promise<YomitanRuntimeState> {
  const state: YomitanRuntimeState = {
    available: false,
    note: null,
    extension: null,
    parserWindow: null,
    parserReadyPromise: null,
    parserInitPromise: null,
  };

  if (options.forceMecabOnly) {
    state.note = "force-mecab enabled";
    return state;
  }

  const electronModule = await import("electron").catch((error: unknown) => {
    state.note = error instanceof Error ? error.message : "electron import failed";
    return null;
  });
  if (!electronModule?.app || !electronModule?.session) {
    // Keep the import error recorded above, if any; it is more specific
    // than the generic message.
    state.note = state.note ?? "electron runtime not available in this process";
    return state;
  }

  if (options.yomitanUserDataPath) {
    electronModule.app.setPath("userData", options.yomitanUserDataPath);
  }
  await electronModule.app.whenReady();

  const extensionPath = resolveYomitanExtensionPath(options.yomitanExtensionPath);
  if (!extensionPath) {
    state.note = "no Yomitan extension directory found";
    return state;
  }

  try {
    state.extension = await electronModule.session.defaultSession.loadExtension(
      extensionPath,
      { allowFileAccess: true },
    );
    state.available = true;
  } catch (error) {
    state.note =
      error instanceof Error
        ? error.message
        : "failed to load Yomitan extension";
    state.available = false;
  }
  return state;
}
|
||||
|
||||
/**
 * Asks the Yomitan backend for its raw parse of `text` by executing a script
 * inside the parser window.
 *
 * The injected script sends `optionsGetFull` through
 * `chrome.runtime.sendMessage` to read the current profile index and its
 * scan length (default 40), then sends `parseText` with both the internal
 * and the MeCab parser enabled. The text is embedded via `JSON.stringify`,
 * so it is safely escaped inside the script source.
 *
 * @param parserWindow - Window whose page can message the Yomitan backend.
 * @param text - Text to parse.
 * @returns Whatever the backend returned; callers must validate it.
 */
async function fetchRawParseResults(
  parserWindow: Electron.BrowserWindow,
  text: string,
): Promise<unknown> {
  const script = `
    (async () => {
      const invoke = (action, params) =>
        new Promise((resolve, reject) => {
          chrome.runtime.sendMessage({ action, params }, (response) => {
            if (chrome.runtime.lastError) {
              reject(new Error(chrome.runtime.lastError.message));
              return;
            }
            if (!response || typeof response !== "object") {
              reject(new Error("Invalid response from Yomitan backend"));
              return;
            }
            if (response.error) {
              reject(new Error(response.error.message || "Yomitan backend error"));
              return;
            }
            resolve(response.result);
          });
        });

      const optionsFull = await invoke("optionsGetFull", undefined);
      const profileIndex = optionsFull.profileCurrent;
      const scanLength =
        optionsFull.profiles?.[profileIndex]?.options?.scanning?.length ?? 40;

      return await invoke("parseText", {
        text: ${JSON.stringify(text)},
        optionsContext: { index: profileIndex },
        scanLength,
        useInternalParser: true,
        useMecabParser: true
      });
    })();
  `;
  // Second argument `true` is passed as the userGesture flag.
  return parserWindow.webContents.executeJavaScript(script, true);
}
|
||||
|
||||
/**
 * Writes the human-readable report to stdout: the header fields, the final
 * tokenizer tokens, and every Yomitan parse candidate with its tokens.
 * Prints ` (none)` for an empty or missing token/candidate list.
 *
 * @param payload - The report object assembled in `main`.
 */
function renderTextOutput(payload: Record<string, unknown>): void {
  // Shared "surface -> headword (reading) [start, end)" rendering.
  const describeToken = (token: Record<string, unknown>): string =>
    `${token.surface} -> ${token.headword} (${token.reading}) [${token.startPos}, ${token.endPos})`;

  process.stdout.write(`Input: ${String(payload.input)}\n`);
  process.stdout.write(`Tokenizer text: ${String(payload.tokenizerText)}\n`);
  process.stdout.write(`Yomitan available: ${String(payload.yomitanAvailable)}\n`);
  process.stdout.write(`Yomitan note: ${String(payload.yomitanNote ?? "")}\n`);
  process.stdout.write(`Selected candidate indexes: ${JSON.stringify(payload.selectedCandidateIndexes)}\n`);

  process.stdout.write("\nFinal selected tokens:\n");
  const finalTokens = payload.finalTokens as Array<Record<string, unknown>> | null;
  if (!finalTokens || finalTokens.length === 0) {
    process.stdout.write(" (none)\n");
  } else {
    finalTokens.forEach((token, i) => {
      process.stdout.write(` [${i}] ${describeToken(token)}\n`);
    });
  }

  process.stdout.write("\nYomitan parse candidates:\n");
  const candidates = payload.candidates as Array<Record<string, unknown>>;
  if (!candidates || candidates.length === 0) {
    process.stdout.write(" (none)\n");
    return;
  }

  candidates.forEach((candidate, i) => {
    process.stdout.write(
      ` [${i}] source=${String(candidate.source)} index=${String(candidate.index)} selectedByTokenizer=${String(candidate.selectedByTokenizer)} tokenCount=${String(candidate.tokenCount)}\n`,
    );
    const tokens = candidate.tokens as Array<Record<string, unknown>> | undefined;
    for (const token of tokens ?? []) {
      process.stdout.write(` - ${describeToken(token)}\n`);
    }
  });
}
|
||||
|
||||
/**
 * Script entry: parses CLI arguments, verifies MeCab is available, sets up
 * the Yomitan runtime, tokenizes the input through the tokenizer service,
 * fetches the raw Yomitan parse candidates (when a live parser window
 * exists), marks which candidates equal the tokenizer's final tokens, and
 * prints the report as JSON or text.
 *
 * The Yomitan runtime is always shut down in `finally`, including when an
 * error is thrown.
 *
 * @throws Error when MeCab is not available; other errors propagate from
 *   the helpers it calls.
 */
async function main(): Promise<void> {
  const args = parseCliArgs(process.argv.slice(2));
  // Declared outside `try` so `finally` can always shut it down.
  const yomitan: YomitanRuntimeState = {
    available: false,
    note: null,
    extension: null,
    parserWindow: null,
    parserReadyPromise: null,
    parserInitPromise: null,
  };

  try {
    const mecabTokenizer = new MecabTokenizer({
      mecabCommand: args.mecabCommand,
      dictionaryPath: args.mecabDictionaryPath,
    });
    const isMecabAvailable = await mecabTokenizer.checkAvailability();
    if (!isMecabAvailable) {
      throw new Error("MeCab is not available on this system.");
    }

    // Copy every field of the freshly built state into the shared object
    // that the dependency getters/setters below close over.
    Object.assign(yomitan, await setupYomitanRuntime(args));

    const deps = createTokenizerDepsRuntimeService({
      getYomitanExt: () => yomitan.extension,
      getYomitanParserWindow: () => yomitan.parserWindow,
      setYomitanParserWindow: (window) => {
        yomitan.parserWindow = window;
      },
      getYomitanParserReadyPromise: () => yomitan.parserReadyPromise,
      setYomitanParserReadyPromise: (promise) => {
        yomitan.parserReadyPromise = promise;
      },
      getYomitanParserInitPromise: () => yomitan.parserInitPromise,
      setYomitanParserInitPromise: (promise) => {
        yomitan.parserInitPromise = promise;
      },
      isKnownWord: () => false,
      getKnownWordMatchMode: () => "headword",
      getJlptLevel: () => null,
      getMecabTokenizer: () => ({
        tokenize: (text: string) => mecabTokenizer.tokenize(text),
      }),
    });

    const subtitleData = await tokenizeSubtitleService(args.input, deps);
    const tokenizeText = normalizeTokenizerText(args.input);

    // Raw candidates can only be fetched when a live parser window exists.
    let rawParseResults: unknown = null;
    if (
      yomitan.available &&
      yomitan.parserWindow &&
      !yomitan.parserWindow.isDestroyed() &&
      tokenizeText
    ) {
      rawParseResults = await fetchRawParseResults(yomitan.parserWindow, tokenizeText);
    }

    const parsedCandidates = mapParseResultsToCandidates(rawParseResults);
    const selectedCandidateIndexes = findSelectedCandidateIndexes(
      parsedCandidates,
      subtitleData.tokens,
    );
    const selectedIndexSet = new Set<number>(selectedCandidateIndexes);

    const payload = {
      input: args.input,
      tokenizerText: subtitleData.text,
      yomitanAvailable: yomitan.available,
      yomitanNote: yomitan.note,
      selectedCandidateIndexes,
      finalTokens:
        subtitleData.tokens?.map((token) => ({
          surface: token.surface,
          reading: token.reading,
          headword: token.headword,
          startPos: token.startPos,
          endPos: token.endPos,
          pos1: token.pos1,
          partOfSpeech: token.partOfSpeech,
          isKnown: token.isKnown,
          isNPlusOneTarget: token.isNPlusOneTarget,
        })) ?? null,
      candidates: parsedCandidates.map((candidate, idx) => ({
        source: candidate.source,
        index: candidate.index,
        selectedByTokenizer: selectedIndexSet.has(idx),
        tokenCount: candidate.tokens.length,
        tokens: candidate.tokens,
      })),
    };

    if (args.emitJson) {
      process.stdout.write(
        `${JSON.stringify(payload, null, args.emitPretty ? 2 : undefined)}\n`,
      );
    } else {
      renderTextOutput(payload);
    }
  } finally {
    await shutdownYomitanRuntime(yomitan);
  }
}
|
||||
|
||||
// Run the script and exit explicitly with a status code: 0 on success,
// 1 after printing the failure message.
main()
  .then(() => {
    process.exit(0);
  })
  .catch((error: unknown) => {
    // A rejection is not guaranteed to be an Error instance.
    const message = error instanceof Error ? error.message : String(error);
    console.error(`Error: ${message}`);
    process.exit(1);
  });
|
||||
Reference in New Issue
Block a user