feat(tokenizer): refine Yomitan grouping and parser tooling

- map segmented Yomitan lines into single logical tokens and improve candidate selection heuristics

- limit frequency lookup to selected token text with POS-based exclusions and add debug logging hook

- add standalone Yomitan parser test script, deterministic utility-script shutdown, and docs/backlog updates
This commit is contained in:
kyasuda
2026-02-16 17:41:24 -08:00
parent 0eb2868805
commit 457e6f0f10
17 changed files with 1667 additions and 293 deletions

View File

@@ -385,7 +385,7 @@ function printUsage(): void {
pnpm run get-frequency [--pretty] [--verbose] [--dictionary <path>] [--mecab-command <path>] [--mecab-dictionary <path>] <text>
--pretty Pretty-print JSON output.
--verbose Include merged-frequency diagnostics and lookup terms.
--verbose Include merged-frequency diagnostics and lookup term details.
--force-mecab Skip Yomitan parser initialization and force MeCab fallback.
--yomitan-extension <path> Optional path to a Yomitan extension directory.
--yomitan-user-data <path> Optional Electron userData directory for Yomitan state.
@@ -413,41 +413,8 @@ type FrequencyCandidate = {
};
/**
 * Returns the lookup text candidates for a token's frequency-rank lookup.
 *
 * Per this commit's intent ("limit frequency lookup to selected token text"),
 * exactly one candidate is produced: the first non-empty trimmed value among
 * headword, reading, and surface, in that priority order.
 *
 * NOTE: the previous multi-candidate body (dedup over `frequencyLookupTerms`)
 * was left in place as unreachable code after its `return`; it has been
 * removed here so the function matches the rest of the refactor, which also
 * dropped `frequencyLookupTerms` from the verbose output.
 *
 * @param token Merged token whose text fields are consulted.
 * @returns A single-element array with the chosen lookup text, or an empty
 *          array when every text field is blank.
 */
function getFrequencyLookupTextCandidates(token: MergedToken): string[] {
  const lookupText = token.headword?.trim() || token.reading?.trim() || token.surface.trim();
  return lookupText ? [lookupText] : [];
}
function getBestFrequencyLookupCandidate(
@@ -488,10 +455,6 @@ function simplifyTokenWithVerbose(
token: MergedToken,
getFrequencyRank: FrequencyDictionaryLookup,
): Record<string, unknown> {
const tokenWithCandidates = token as MergedToken & {
frequencyLookupTerms?: string[];
};
const frequencyLookupTerms = tokenWithCandidates.frequencyLookupTerms;
const candidates = getFrequencyLookupTextCandidates(token).map((term) => ({
term,
rank: getFrequencyRank(term),
@@ -518,10 +481,6 @@ function simplifyTokenWithVerbose(
isNPlusOneTarget: token.isNPlusOneTarget,
frequencyRank: token.frequencyRank,
jlptLevel: token.jlptLevel,
frequencyLookupTerms:
Array.isArray(frequencyLookupTerms) && frequencyLookupTerms.length > 0
? frequencyLookupTerms
: undefined,
frequencyCandidates: candidates,
frequencyBestLookupTerm: bestCandidate?.term ?? null,
frequencyBestLookupRank: bestCandidate?.rank ?? null,
@@ -537,6 +496,25 @@ interface YomitanRuntimeState {
note?: string;
}
/**
 * Best-effort teardown for an Electron-like parser window held as `unknown`.
 *
 * Destroys the window only when the value is an object exposing both
 * `isDestroyed` and `destroy` functions AND `isDestroyed()` reports the
 * window is still alive. Any other shape is silently ignored so shutdown
 * never throws on a missing or already-torn-down window.
 */
function destroyUnknownParserWindow(window: unknown): void {
  if (typeof window !== "object" || window === null) {
    return;
  }
  // Structural view of the duck-typed window; calls stay on the object so
  // the methods keep their `this` binding.
  const maybeWindow = window as {
    isDestroyed?: () => boolean;
    destroy?: () => void;
  };
  if (
    typeof maybeWindow.isDestroyed === "function" &&
    typeof maybeWindow.destroy === "function" &&
    !maybeWindow.isDestroyed()
  ) {
    maybeWindow.destroy();
  }
}
async function createYomitanRuntimeState(
userDataPath: string,
): Promise<YomitanRuntimeState> {
@@ -775,133 +753,141 @@ function renderColoredLine(
}
async function main(): Promise<void> {
const args = parseCliArgs(process.argv.slice(2));
const getFrequencyRank = await getFrequencyLookup(args.dictionaryPath);
let electronModule: (typeof import("electron")) | null = null;
let yomitanState: YomitanRuntimeState | null = null;
const mecabTokenizer = new MecabTokenizer({
mecabCommand: args.mecabCommand,
dictionaryPath: args.mecabDictionaryPath,
});
const isMecabAvailable = await mecabTokenizer.checkAvailability();
if (!isMecabAvailable) {
throw new Error(
"MeCab is not available on this system. Install/run environment with MeCab to tokenize input.",
);
}
try {
const args = parseCliArgs(process.argv.slice(2));
const getFrequencyRank = await getFrequencyLookup(args.dictionaryPath);
const app = await import("electron").catch(() => null);
if (app && args.yomitanUserDataPath) {
app.app.setPath("userData", args.yomitanUserDataPath);
}
const yomitanState =
!args.forceMecabOnly
? await createYomitanRuntimeStateWithSearch(
app?.app?.getPath ? app.app.getPath("userData") : process.cwd(),
args.yomitanExtensionPath,
)
: null;
const hasYomitan = Boolean(yomitanState?.available && yomitanState?.yomitanExt);
const mecabTokenizer = new MecabTokenizer({
mecabCommand: args.mecabCommand,
dictionaryPath: args.mecabDictionaryPath,
});
const isMecabAvailable = await mecabTokenizer.checkAvailability();
if (!isMecabAvailable) {
throw new Error(
"MeCab is not available on this system. Install/run environment with MeCab to tokenize input.",
);
}
const deps = createTokenizerDepsRuntimeService({
getYomitanExt: () =>
(hasYomitan ? yomitanState!.yomitanExt : null) as never,
getYomitanParserWindow: () =>
(hasYomitan ? yomitanState!.parserWindow : null) as never,
setYomitanParserWindow: (window) => {
if (!hasYomitan) {
return;
}
yomitanState!.parserWindow = window;
},
getYomitanParserReadyPromise: () =>
(hasYomitan ? yomitanState!.parserReadyPromise : null) as never,
setYomitanParserReadyPromise: (promise) => {
if (!hasYomitan) {
return;
}
yomitanState!.parserReadyPromise = promise;
},
getYomitanParserInitPromise: () =>
(hasYomitan ? yomitanState!.parserInitPromise : null) as never,
setYomitanParserInitPromise: (promise) => {
if (!hasYomitan) {
return;
}
yomitanState!.parserInitPromise = promise;
},
isKnownWord: () => false,
getKnownWordMatchMode: () => "headword",
getJlptLevel: () => null,
getFrequencyDictionaryEnabled: () => true,
getFrequencyRank,
getMecabTokenizer: () => ({
tokenize: (text: string) => mecabTokenizer.tokenize(text),
}),
});
electronModule = await import("electron").catch(() => null);
if (electronModule && args.yomitanUserDataPath) {
electronModule.app.setPath("userData", args.yomitanUserDataPath);
}
yomitanState =
!args.forceMecabOnly
? await createYomitanRuntimeStateWithSearch(
electronModule?.app?.getPath
? electronModule.app.getPath("userData")
: process.cwd(),
args.yomitanExtensionPath,
)
: null;
const hasYomitan = Boolean(yomitanState?.available && yomitanState?.yomitanExt);
const subtitleData = await tokenizeSubtitleService(args.input, deps);
const tokenCount = subtitleData.tokens?.length ?? 0;
const mergedCount = subtitleData.tokens?.filter((token) => token.isMerged).length ?? 0;
const hasYomitanCandidates = Boolean(
subtitleData.tokens?.some((token) => {
const frequencyLookupTerms = (
token as MergedToken & { frequencyLookupTerms?: string[] }
).frequencyLookupTerms;
return Array.isArray(frequencyLookupTerms) && frequencyLookupTerms.length > 0;
}) ?? false,
);
const tokens =
subtitleData.tokens?.map((token) =>
args.emitVerbose
? simplifyTokenWithVerbose(token, getFrequencyRank)
: simplifyToken(token),
) ?? null;
const diagnostics = {
yomitan: {
available: Boolean(yomitanState?.available),
loaded: hasYomitan,
forceMecabOnly: args.forceMecabOnly,
note: yomitanState?.note ?? null,
},
mecab: {
command: args.mecabCommand ?? "mecab",
dictionaryPath: args.mecabDictionaryPath ?? null,
available: isMecabAvailable,
},
tokenizer: {
sourceHint:
tokenCount === 0
? "none"
: hasYomitan ? "yomitan-merged" : "mecab-merge",
mergedTokenCount: mergedCount,
totalTokenCount: tokenCount,
},
};
if (tokens === null) {
diagnostics.mecab["status"] = "no-tokens";
diagnostics.mecab["note"] =
"MeCab returned no parseable tokens. This is often caused by a missing/invalid MeCab dictionary path.";
} else {
diagnostics.mecab["status"] = "ok";
}
const deps = createTokenizerDepsRuntimeService({
getYomitanExt: () =>
(hasYomitan ? yomitanState!.yomitanExt : null) as never,
getYomitanParserWindow: () =>
(hasYomitan ? yomitanState!.parserWindow : null) as never,
setYomitanParserWindow: (window) => {
if (!hasYomitan) {
return;
}
yomitanState!.parserWindow = window;
},
getYomitanParserReadyPromise: () =>
(hasYomitan ? yomitanState!.parserReadyPromise : null) as never,
setYomitanParserReadyPromise: (promise) => {
if (!hasYomitan) {
return;
}
yomitanState!.parserReadyPromise = promise;
},
getYomitanParserInitPromise: () =>
(hasYomitan ? yomitanState!.parserInitPromise : null) as never,
setYomitanParserInitPromise: (promise) => {
if (!hasYomitan) {
return;
}
yomitanState!.parserInitPromise = promise;
},
isKnownWord: () => false,
getKnownWordMatchMode: () => "headword",
getJlptLevel: () => null,
getFrequencyDictionaryEnabled: () => true,
getFrequencyRank,
getMecabTokenizer: () => ({
tokenize: (text: string) => mecabTokenizer.tokenize(text),
}),
});
const output = {
input: args.input,
tokenizerText: subtitleData.text,
tokens,
diagnostics,
};
const subtitleData = await tokenizeSubtitleService(args.input, deps);
const tokenCount = subtitleData.tokens?.length ?? 0;
const mergedCount = subtitleData.tokens?.filter((token) => token.isMerged).length ?? 0;
const tokens =
subtitleData.tokens?.map((token) =>
args.emitVerbose
? simplifyTokenWithVerbose(token, getFrequencyRank)
: simplifyToken(token),
) ?? null;
const diagnostics = {
yomitan: {
available: Boolean(yomitanState?.available),
loaded: hasYomitan,
forceMecabOnly: args.forceMecabOnly,
note: yomitanState?.note ?? null,
},
mecab: {
command: args.mecabCommand ?? "mecab",
dictionaryPath: args.mecabDictionaryPath ?? null,
available: isMecabAvailable,
},
tokenizer: {
sourceHint:
tokenCount === 0
? "none"
: hasYomitan ? "yomitan-merged" : "mecab-merge",
mergedTokenCount: mergedCount,
totalTokenCount: tokenCount,
},
};
if (tokens === null) {
diagnostics.mecab["status"] = "no-tokens";
diagnostics.mecab["note"] =
"MeCab returned no parseable tokens. This is often caused by a missing/invalid MeCab dictionary path.";
} else {
diagnostics.mecab["status"] = "ok";
}
const json = JSON.stringify(output, null, args.emitPretty ? 2 : undefined);
process.stdout.write(`${json}\n`);
const output = {
input: args.input,
tokenizerText: subtitleData.text,
tokens,
diagnostics,
};
if (args.emitColoredLine && subtitleData.tokens) {
const coloredLine = renderColoredLine(subtitleData.text, subtitleData.tokens, args);
process.stdout.write(`${coloredLine}\n`);
const json = JSON.stringify(output, null, args.emitPretty ? 2 : undefined);
process.stdout.write(`${json}\n`);
if (args.emitColoredLine && subtitleData.tokens) {
const coloredLine = renderColoredLine(subtitleData.text, subtitleData.tokens, args);
process.stdout.write(`${coloredLine}\n`);
}
} finally {
destroyUnknownParserWindow(yomitanState?.parserWindow ?? null);
if (electronModule?.app) {
electronModule.app.quit();
}
}
}
// Single entry-point invocation. As rendered, both the pre-refactor
// `main().catch(...)` call and the new `.then/.catch` call were present,
// which would run the whole pipeline twice; only the new form is kept.
// Explicit exit(0) on success ensures the process terminates even if
// Electron keeps the event loop alive after app.quit().
main()
  .then(() => {
    process.exit(0);
  })
  .catch((error) => {
    console.error(`Error: ${(error as Error).message}`);
    process.exit(1);
  });