Add vendor dict fallback logic

This commit is contained in:
2026-02-15 22:45:03 -08:00
parent dae1f817e0
commit 01a48f4714
21 changed files with 1194 additions and 19 deletions

View File

@@ -0,0 +1,189 @@
import * as fs from "node:fs";
import * as path from "node:path";
/**
 * Options accepted by `createFrequencyDictionaryLookupService`.
 */
export interface FrequencyDictionaryLookupOptions {
/** Candidate dictionary directories, probed in array order; the first one that yields entries is used. */
searchPaths: string[];
/** Receives human-readable diagnostics (successful loads, unreadable/unparsable files, duplicate terms). */
log: (message: string) => void;
}
/**
 * A single validated row extracted from a frequency bank file.
 */
interface FrequencyDictionaryEntry {
/** Frequency rank taken from the row's `frequency.displayValue`. */
rank: number;
/** The row's term after trimming and lower-casing. */
term: string;
}
// Despite the "GLOB" name this is a regular expression: it matches file names of
// the form `term_meta_bank_<anything>.json`.
const FREQUENCY_BANK_FILE_GLOB = /^term_meta_bank_.*\.json$/;
// Lookup that answers `null` for every term; handed out when no dictionary could be loaded.
const NOOP_LOOKUP = (): null => null;
/**
 * Canonicalises a term for use as a dictionary key: surrounding whitespace is
 * stripped and the remainder is lower-cased.
 */
function normalizeFrequencyTerm(value: string): string {
  const withoutPadding = value.trim();
  return withoutPadding.toLowerCase();
}
/**
 * Pulls a positive integer frequency rank out of a bank row's metadata.
 *
 * Only the shape `{ frequency: { displayValue } }` is read:
 * - a numeric `displayValue` is floored to an integer;
 * - a string `displayValue` is trimmed, stripped of commas (so "1,234" works)
 *   and parsed as a base-10 integer (leading digits are enough, e.g. "12abc" -> 12).
 *
 * @param meta - Untrusted metadata value taken from parsed JSON.
 * @returns The rank as a finite integer >= 1, or `null` when the metadata has
 *   a different shape or does not describe a positive rank.
 */
function extractFrequencyDisplayValue(meta: unknown): number | null {
  if (!meta || typeof meta !== "object") return null;

  const frequency = (meta as { frequency?: unknown }).frequency;
  if (!frequency || typeof frequency !== "object") return null;

  const displayValue = (frequency as { displayValue?: unknown }).displayValue;

  let rank: number;
  if (typeof displayValue === "number") {
    rank = Math.floor(displayValue);
  } else if (typeof displayValue === "string") {
    rank = Number.parseInt(displayValue.trim().replace(/,/g, ""), 10);
  } else {
    return null;
  }

  // Validate AFTER rounding: a value such as 0.5 is positive but floors to 0,
  // which must not be reported as a usable rank.
  return Number.isFinite(rank) && rank > 0 ? rank : null;
}
/**
 * Validates one raw bank row and converts it into a `FrequencyDictionaryEntry`.
 *
 * A usable row is an array of at least three elements whose first element is
 * a string term and whose third element carries the frequency metadata; the
 * second element is not inspected.
 *
 * @param entry - Untrusted value taken from parsed JSON.
 * @returns The normalized entry, or `null` when the row is malformed, has no
 *   extractable rank, or its term is empty after normalization.
 */
function asFrequencyDictionaryEntry(
  entry: unknown,
): FrequencyDictionaryEntry | null {
  if (!Array.isArray(entry)) return null;
  if (entry.length < 3) return null;

  const rawTerm: unknown = entry[0];
  const rawMeta: unknown = entry[2];
  if (typeof rawTerm !== "string") return null;

  const rank = extractFrequencyDisplayValue(rawMeta);
  if (rank === null) return null;

  const term = normalizeFrequencyTerm(rawTerm);
  return term ? { term, rank } : null;
}
/**
 * Merges the rows of one parsed bank file into the accumulated term map.
 *
 * Rows that fail validation are skipped. When a term is already present, the
 * numerically smaller rank is kept; any row that does not replace the stored
 * rank is reported through `log`.
 *
 * @param rawEntries - Parsed JSON content of a bank file; ignored unless it is an array.
 * @param terms - Map of normalized term to rank, mutated in place.
 * @param log - Diagnostic sink.
 */
function addEntriesToMap(
  rawEntries: unknown,
  terms: Map<string, number>,
  log: (message: string) => void,
): void {
  if (!Array.isArray(rawEntries)) return;

  for (let index = 0; index < rawEntries.length; index += 1) {
    const parsed = asFrequencyDictionaryEntry(rawEntries[index]);
    if (!parsed) continue;

    const { term, rank } = parsed;
    const storedRank = terms.get(term);
    const replacesStored = storedRank === undefined || rank < storedRank;

    if (replacesStored) {
      terms.set(term, rank);
    } else {
      log(
        `Frequency dictionary duplicate term ${term} with weaker rank ${rank}; keeping ${storedRank}.`,
      );
    }
  }
}
/**
 * Loads every frequency bank file found directly inside `dictionaryPath` and
 * merges their rows into a single term-to-rank map.
 *
 * Bank files are processed in sorted file-name order. A file that cannot be
 * read or parsed is logged and skipped; a file that adds no new terms is
 * logged as well. A directory that cannot be listed yields an empty map
 * without logging.
 *
 * @param dictionaryPath - Directory expected to contain the bank files.
 * @param log - Diagnostic sink.
 * @returns The merged map; empty when nothing usable was found.
 */
function collectDictionaryFromPath(
  dictionaryPath: string,
  log: (message: string) => void,
): Map<string, number> {
  const collected = new Map<string, number>();

  let directoryListing: string[];
  try {
    directoryListing = fs.readdirSync(dictionaryPath);
  } catch {
    return collected;
  }

  // Reads and parses one bank file, logging and reporting failure instead of throwing.
  const loadBank = (
    bankPath: string,
  ): { ok: true; data: unknown } | { ok: false } => {
    let contents: string;
    try {
      contents = fs.readFileSync(bankPath, "utf-8");
    } catch {
      log(`Failed to read frequency dictionary file ${bankPath}`);
      return { ok: false };
    }
    try {
      return { ok: true, data: JSON.parse(contents) as unknown };
    } catch {
      log(`Failed to parse frequency dictionary file as JSON: ${bankPath}`);
      return { ok: false };
    }
  };

  const bankFileNames = directoryListing
    .filter((fileName) => FREQUENCY_BANK_FILE_GLOB.test(fileName))
    .sort();

  bankFileNames.forEach((fileName) => {
    const bankPath = path.join(dictionaryPath, fileName);
    const loaded = loadBank(bankPath);
    if (!loaded.ok) return;

    const sizeBeforeMerge = collected.size;
    addEntriesToMap(loaded.data, collected, log);
    if (collected.size === sizeBeforeMerge) {
      log(
        `Frequency dictionary file contained no extractable entries: ${bankPath}`,
      );
    }
  });

  return collected;
}
/**
 * Builds a term-to-frequency-rank lookup from the first usable dictionary
 * directory in `options.searchPaths`.
 *
 * Candidates are probed in order. A candidate that is missing, inaccessible,
 * or not a directory is skipped. The first directory that yields at least one
 * entry wins, and later candidates are not examined. Progress and failures are
 * reported through `options.log`.
 *
 * @param options - Search paths and diagnostic sink.
 * @returns A lookup that normalizes its argument (trim + lower-case) and
 *   returns the stored rank, or `null` for unknown/empty terms. When no
 *   dictionary could be loaded, a lookup that always returns `null`.
 */
export async function createFrequencyDictionaryLookupService(
  options: FrequencyDictionaryLookupOptions,
): Promise<(term: string) => number | null> {
  const attemptedPaths: string[] = [];
  let foundDictionaryPathCount = 0;

  for (const dictionaryPath of options.searchPaths) {
    attemptedPaths.push(dictionaryPath);

    // A single guarded stat replaces the previous existsSync + statSync pair:
    // if the path vanished between the two calls, statSync threw and rejected
    // the whole factory instead of falling through to the next candidate.
    let isDirectory: boolean;
    try {
      isDirectory = fs.statSync(dictionaryPath).isDirectory();
    } catch {
      continue;
    }
    if (!isDirectory) {
      continue;
    }

    foundDictionaryPathCount += 1;
    const terms = collectDictionaryFromPath(dictionaryPath, options.log);
    if (terms.size > 0) {
      options.log(
        `Frequency dictionary loaded from ${dictionaryPath} (${terms.size} entries)`,
      );
      return (term: string): number | null => {
        const normalized = normalizeFrequencyTerm(term);
        if (!normalized) return null;
        return terms.get(normalized) ?? null;
      };
    }

    options.log(
      `Frequency dictionary directory exists but contains no readable term_meta_bank_*.json files: ${dictionaryPath}`,
    );
  }

  options.log(
    `Frequency dictionary not found. Searched ${attemptedPaths.length} candidate path(s): ${attemptedPaths.join(", ")}`,
  );
  if (foundDictionaryPathCount > 0) {
    options.log(
      "Frequency dictionary directories found, but no usable term_meta_bank_*.json files were loaded.",
    );
  }
  return NOOP_LOOKUP;
}

View File

@@ -32,6 +32,7 @@ export {
} from "./startup-service";
export { openYomitanSettingsWindow } from "./yomitan-settings-service";
export { createTokenizerDepsRuntimeService, tokenizeSubtitleService } from "./tokenizer-service";
export { createFrequencyDictionaryLookupService } from "./frequency-dictionary-service";
export { createJlptVocabularyLookupService } from "./jlpt-vocab-service";
export {
getIgnoredPos1Entries,

View File

@@ -190,6 +190,75 @@ test("tokenizeSubtitleService skips JLPT lookups when disabled", async () => {
assert.equal(lookupCalls, 0);
});
test("tokenizeSubtitleService applies frequency dictionary ranks", async () => {
  // Stubbed Mecab output for "猫です": a noun followed by a bound auxiliary.
  const buildMecabTokens = () => [
    {
      headword: "猫",
      surface: "猫",
      reading: "ネコ",
      startPos: 0,
      endPos: 1,
      partOfSpeech: PartOfSpeech.noun,
      isMerged: false,
      isKnown: false,
      isNPlusOneTarget: false,
    },
    {
      headword: "です",
      surface: "です",
      reading: "デス",
      startPos: 1,
      endPos: 2,
      partOfSpeech: PartOfSpeech.bound_auxiliary,
      isMerged: false,
      isKnown: false,
      isNPlusOneTarget: false,
    },
  ];

  // The stub lookup ranks "猫" at 23 and everything else at 1200.
  const deps = makeDeps({
    getFrequencyDictionaryEnabled: () => true,
    tokenizeWithMecab: async () => buildMecabTokens(),
    getFrequencyRank: (text) => (text === "猫" ? 23 : 1200),
  });

  const result = await tokenizeSubtitleService("猫です", deps);

  // Each token must carry the rank the lookup reported for it.
  const tokens = result.tokens;
  assert.equal(tokens?.length, 2);
  assert.equal(tokens?.[0]?.frequencyRank, 23);
  assert.equal(tokens?.[1]?.frequencyRank, 1200);
});
test("tokenizeSubtitleService skips frequency lookups when disabled", async () => {
  // Spy lookup: counts invocations so the test can prove it was never consulted.
  let frequencyCalls = 0;
  const countingLookup = () => {
    frequencyCalls += 1;
    return 10;
  };

  // Stubbed Mecab output for the single noun "猫".
  const buildMecabTokens = () => [
    {
      headword: "猫",
      surface: "猫",
      reading: "ネコ",
      startPos: 0,
      endPos: 1,
      partOfSpeech: PartOfSpeech.noun,
      isMerged: false,
      isKnown: false,
      isNPlusOneTarget: false,
    },
  ];

  const deps = makeDeps({
    getFrequencyDictionaryEnabled: () => false,
    tokenizeWithMecab: async () => buildMecabTokens(),
    getFrequencyRank: countingLookup,
  });

  const result = await tokenizeSubtitleService("猫", deps);

  // With the feature disabled, tokens carry no rank and the lookup stays untouched.
  const tokens = result.tokens;
  assert.equal(tokens?.length, 1);
  assert.equal(tokens?.[0]?.frequencyRank, undefined);
  assert.equal(frequencyCalls, 0);
});
test("tokenizeSubtitleService skips JLPT level for excluded demonstratives", async () => {
const result = await tokenizeSubtitleService(
"この",

View File

@@ -7,6 +7,7 @@ import {
PartOfSpeech,
SubtitleData,
Token,
FrequencyDictionaryLookup,
} from "../../types";
import {
shouldIgnoreJlptForMecabPos1,
@@ -35,11 +36,16 @@ const KATAKANA_TO_HIRAGANA_OFFSET = 0x60;
// U+30A1 (small katakana "ァ"): lower bound of the katakana code point range.
// NOTE(review): the code using this range is outside this view; presumably katakana-to-hiragana conversion.
const KATAKANA_CODEPOINT_START = 0x30a1;
// U+30F6 (small katakana "ヶ"): upper bound of the same range.
const KATAKANA_CODEPOINT_END = 0x30f6;
// NOTE(review): presumably the maximum number of memoized entries per JLPT lookup function; the usage is outside this view.
const JLPT_LEVEL_LOOKUP_CACHE_LIMIT = 2048;
// Maximum number of memoized entries per frequency lookup function; once
// exceeded, getCachedFrequencyRank evicts the oldest-inserted keys first.
const FREQUENCY_RANK_LOOKUP_CACHE_LIMIT = 2048;
// One memo table per JLPT lookup function, keyed weakly by the function object.
const jlptLevelLookupCaches = new WeakMap<
(text: string) => JlptLevel | null,
Map<string, JlptLevel | null>
>();
// One memo table per frequency lookup function, keyed weakly by the function
// object. Values may be `null` so that misses are memoized as well.
const frequencyRankLookupCaches = new WeakMap<
FrequencyDictionaryLookup,
Map<string, number | null>
>();
function isObject(value: unknown): value is Record<string, unknown> {
return Boolean(value && typeof value === "object");
@@ -61,6 +67,8 @@ export interface TokenizerServiceDeps {
getKnownWordMatchMode: () => NPlusOneMatchMode;
getJlptLevel: (text: string) => JlptLevel | null;
getJlptEnabled?: () => boolean;
getFrequencyDictionaryEnabled?: () => boolean;
getFrequencyRank?: FrequencyDictionaryLookup;
getMinSentenceWordsForNPlusOne?: () => number;
tokenizeWithMecab: (text: string) => Promise<MergedToken[] | null>;
}
@@ -81,6 +89,8 @@ export interface TokenizerDepsRuntimeOptions {
getKnownWordMatchMode: () => NPlusOneMatchMode;
getJlptLevel: (text: string) => JlptLevel | null;
getJlptEnabled?: () => boolean;
getFrequencyDictionaryEnabled?: () => boolean;
getFrequencyRank?: FrequencyDictionaryLookup;
getMinSentenceWordsForNPlusOne?: () => number;
getMecabTokenizer: () => MecabTokenizerLike | null;
}
@@ -122,6 +132,47 @@ function getCachedJlptLevel(
return level;
}
/**
 * Canonicalises text before it is used as a frequency cache/lookup key:
 * surrounding whitespace is removed and the remainder is lower-cased.
 */
function normalizeFrequencyLookupText(rawText: string): string {
  const trimmed = rawText.trim();
  return trimmed.toLowerCase();
}
/**
 * Memoizing wrapper around a frequency lookup.
 *
 * Results, including misses and lookups that threw (both stored as `null`),
 * are cached per lookup function under the normalized text. When a cache
 * grows beyond `FREQUENCY_RANK_LOOKUP_CACHE_LIMIT`, its oldest-inserted keys
 * are dropped.
 *
 * @param lookupText - Raw text to look up; normalized before use.
 * @param getFrequencyRank - The underlying lookup; it receives the normalized text.
 * @returns The rank, or `null` when the text is empty after normalization,
 *   unknown, or the lookup threw.
 */
function getCachedFrequencyRank(
  lookupText: string,
  getFrequencyRank: FrequencyDictionaryLookup,
): number | null {
  const cacheKey = normalizeFrequencyLookupText(lookupText);
  if (cacheKey.length === 0) {
    return null;
  }

  // Lazily create the memo table that belongs to this lookup function.
  const perLookupCache =
    frequencyRankLookupCaches.get(getFrequencyRank) ??
    new Map<string, number | null>();
  if (!frequencyRankLookupCaches.has(getFrequencyRank)) {
    frequencyRankLookupCaches.set(getFrequencyRank, perLookupCache);
  }

  if (perLookupCache.has(cacheKey)) {
    const hit = perLookupCache.get(cacheKey);
    return hit === undefined ? null : hit;
  }

  // A throwing lookup is treated as a miss rather than propagated.
  const resolvedRank = ((): number | null => {
    try {
      return getFrequencyRank(cacheKey);
    } catch {
      return null;
    }
  })();
  perLookupCache.set(cacheKey, resolvedRank);

  // Map iteration follows insertion order, so the first key is the oldest.
  for (
    let excess = perLookupCache.size - FREQUENCY_RANK_LOOKUP_CACHE_LIMIT;
    excess > 0;
    excess -= 1
  ) {
    const oldest = perLookupCache.keys().next();
    if (oldest.done) break;
    perLookupCache.delete(oldest.value);
  }

  return resolvedRank;
}
export function createTokenizerDepsRuntimeService(
options: TokenizerDepsRuntimeOptions,
): TokenizerServiceDeps {
@@ -137,6 +188,8 @@ export function createTokenizerDepsRuntimeService(
getKnownWordMatchMode: options.getKnownWordMatchMode,
getJlptLevel: options.getJlptLevel,
getJlptEnabled: options.getJlptEnabled,
getFrequencyDictionaryEnabled: options.getFrequencyDictionaryEnabled,
getFrequencyRank: options.getFrequencyRank,
getMinSentenceWordsForNPlusOne:
options.getMinSentenceWordsForNPlusOne ?? (() => 3),
tokenizeWithMecab: async (text) => {
@@ -184,6 +237,34 @@ function applyKnownWordMarking(
});
}
/**
 * Chooses the text used to look up a token's frequency rank: the headword
 * when non-empty, otherwise the reading when non-empty, otherwise the surface
 * form.
 */
function resolveFrequencyLookupText(token: MergedToken): string {
  const preferred = token.headword;
  if (preferred && preferred.length > 0) return preferred;

  const fallback = token.reading;
  return fallback && fallback.length > 0 ? fallback : token.surface;
}
/**
 * Returns copies of `tokens` annotated with `frequencyRank`.
 *
 * The input tokens are not mutated. A token with no lookup text, or whose
 * lookup yields no rank, gets `frequencyRank: undefined`.
 *
 * @param tokens - Tokens to annotate.
 * @param getFrequencyRank - Frequency lookup, consulted through the per-lookup cache.
 */
function applyFrequencyMarking(
  tokens: MergedToken[],
  getFrequencyRank: FrequencyDictionaryLookup,
): MergedToken[] {
  const rankFor = (token: MergedToken): number | undefined => {
    const lookupText = resolveFrequencyLookupText(token);
    if (!lookupText) return undefined;
    return getCachedFrequencyRank(lookupText, getFrequencyRank) ?? undefined;
  };

  return tokens.map((token) => ({ ...token, frequencyRank: rankFor(token) }));
}
function resolveJlptLookupText(token: MergedToken): string {
if (token.headword && token.headword.length > 0) {
return token.headword;
@@ -753,6 +834,8 @@ export async function tokenizeSubtitleService(
.replace(/\s+/g, " ")
.trim();
const jlptEnabled = deps.getJlptEnabled?.() !== false;
const frequencyEnabled = deps.getFrequencyDictionaryEnabled?.() !== false;
const frequencyLookup = deps.getFrequencyRank;
const yomitanTokens = await parseWithYomitanInternalParser(tokenizeText, deps);
if (yomitanTokens && yomitanTokens.length > 0) {
@@ -761,9 +844,16 @@ export async function tokenizeSubtitleService(
deps.isKnownWord,
deps.getKnownWordMatchMode(),
);
const jlptMarkedTokens = jlptEnabled
? applyJlptMarking(knownMarkedTokens, deps.getJlptLevel)
: knownMarkedTokens.map((token) => ({ ...token, jlptLevel: undefined }));
const frequencyMarkedTokens =
frequencyEnabled && frequencyLookup
? applyFrequencyMarking(knownMarkedTokens, frequencyLookup)
: knownMarkedTokens.map((token) => ({
...token,
frequencyRank: undefined,
}));
const jlptMarkedTokens = jlptEnabled
? applyJlptMarking(frequencyMarkedTokens, deps.getJlptLevel)
: frequencyMarkedTokens.map((token) => ({ ...token, jlptLevel: undefined }));
return {
text: displayText,
tokens: markNPlusOneTargets(
@@ -781,9 +871,16 @@ export async function tokenizeSubtitleService(
deps.isKnownWord,
deps.getKnownWordMatchMode(),
);
const frequencyMarkedTokens =
frequencyEnabled && frequencyLookup
? applyFrequencyMarking(knownMarkedTokens, frequencyLookup)
: knownMarkedTokens.map((token) => ({
...token,
frequencyRank: undefined,
}));
const jlptMarkedTokens = jlptEnabled
? applyJlptMarking(knownMarkedTokens, deps.getJlptLevel)
: knownMarkedTokens.map((token) => ({ ...token, jlptLevel: undefined }));
? applyJlptMarking(frequencyMarkedTokens, deps.getJlptLevel)
: frequencyMarkedTokens.map((token) => ({ ...token, jlptLevel: undefined }));
return {
text: displayText,
tokens: markNPlusOneTargets(