Add vendor dict fallback logic

2026-07-10 16:49:50 -07:00 · 2026-02-15 22:45:03 -08:00
parent dae1f817e0
commit 01a48f4714
21 changed files with 1194 additions and 19 deletions
@@ -0,0 +1,189 @@
+import * as fs from "node:fs";
+import * as path from "node:path";
+
+/**
+ * Options accepted by createFrequencyDictionaryLookupService.
+ */
+export interface FrequencyDictionaryLookupOptions {
+  /** Candidate dictionary directories, tried in array order; the first one that yields entries is used. */
+  searchPaths: string[];
+  /** Receives progress and diagnostic messages produced while loading. */
+  log: (message: string) => void;
+}
+
+/** One accepted bank row: the normalized term and the rank parsed from its metadata. */
+interface FrequencyDictionaryEntry {
+  rank: number;
+  term: string;
+}
+
+// Matches file names of the form term_meta_bank_<anything>.json (a RegExp, despite the GLOB suffix).
+const FREQUENCY_BANK_FILE_GLOB = /^term_meta_bank_.*\.json$/;
+// Lookup handed back when no dictionary could be loaded: every term resolves to null.
+const NOOP_LOOKUP = (): null => null;
+
+/**
+ * Canonical form shared by stored terms and lookup keys: surrounding
+ * whitespace removed and the text lowercased.
+ */
+function normalizeFrequencyTerm(value: string): string {
+  return value.trim().toLowerCase();
+}
+
+/**
+ * Pulls a rank out of the metadata element of a raw bank row.
+ *
+ * Only the `meta.frequency.displayValue` shape is read. A numeric value is
+ * floored; a string value has surrounding whitespace and thousands-separator
+ * commas removed and is then parsed as a base-10 integer.
+ *
+ * NOTE(review): rows that carry only `frequency.value`, or a bare number or
+ * string as metadata, are ignored — confirm that is intended for the target
+ * dictionaries.
+ *
+ * @param meta - Third element of a raw bank row; may be any JSON value.
+ * @returns The rank as an integer >= 1, or null when no usable rank exists.
+ */
+function extractFrequencyDisplayValue(meta: unknown): number | null {
+  if (!meta || typeof meta !== "object") return null;
+  const frequency = (meta as { frequency?: unknown }).frequency;
+  if (!frequency || typeof frequency !== "object") return null;
+  const displayValue = (frequency as { displayValue?: unknown }).displayValue;
+
+  let rank: number;
+  if (typeof displayValue === "number") {
+    // Floor before validating so a fraction in (0, 1) cannot become rank 0.
+    rank = Math.floor(displayValue);
+  } else if (typeof displayValue === "string") {
+    rank = Number.parseInt(displayValue.trim().replace(/,/g, ""), 10);
+  } else {
+    return null;
+  }
+
+  // Rejects NaN, +/-Infinity, zero and negatives for both input kinds.
+  return Number.isFinite(rank) && rank >= 1 ? rank : null;
+}
+
+/**
+ * Validates one raw bank row and converts it to a FrequencyDictionaryEntry.
+ *
+ * A row is accepted when it is an array of at least three elements whose first
+ * element is a string term and whose third element yields a rank via
+ * extractFrequencyDisplayValue. The second element is not inspected.
+ *
+ * @param entry - One element of a parsed bank file.
+ * @returns The normalized term with its rank, or null when the row is rejected.
+ */
+function asFrequencyDictionaryEntry(
+  entry: unknown,
+): FrequencyDictionaryEntry | null {
+  if (!Array.isArray(entry) || entry.length < 3) {
+    return null;
+  }
+
+  const [term, _id, meta] = entry as [
+    unknown,
+    unknown,
+    unknown,
+  ];
+  if (typeof term !== "string") {
+    return null;
+  }
+
+  const frequency = extractFrequencyDisplayValue(meta);
+  if (frequency === null) return null;
+
+  // A term that is empty after normalization can never match a lookup; drop it.
+  const normalizedTerm = normalizeFrequencyTerm(term);
+  if (!normalizedTerm) return null;
+
+  return {
+    term: normalizedTerm,
+    rank: frequency,
+  };
+}
+
+/**
+ * Merges the valid rows of one parsed bank file into `terms`, keeping the
+ * numerically smallest rank seen for each term.
+ *
+ * @param rawEntries - Parsed JSON content of a bank file; non-arrays are ignored.
+ * @param terms - Accumulator mapping normalized term to rank; mutated in place.
+ * @param log - Receives a message for each duplicate whose rank is strictly weaker.
+ * @returns Number of rows that passed validation, whether or not they changed `terms`.
+ */
+function addEntriesToMap(
+  rawEntries: unknown,
+  terms: Map<string, number>,
+  log: (message: string) => void,
+): number {
+  if (!Array.isArray(rawEntries)) {
+    return 0;
+  }
+
+  let validEntryCount = 0;
+  for (const rawEntry of rawEntries) {
+    const entry = asFrequencyDictionaryEntry(rawEntry);
+    if (!entry) {
+      continue;
+    }
+    validEntryCount += 1;
+
+    const currentRank = terms.get(entry.term);
+    if (currentRank === undefined || entry.rank < currentRank) {
+      terms.set(entry.term, entry.rank);
+      continue;
+    }
+
+    // An equal rank is a harmless repeat, not a weaker one; only report real conflicts.
+    if (entry.rank > currentRank) {
+      log(
+        `Frequency dictionary duplicate term ${entry.term} with weaker rank ${entry.rank}; keeping ${currentRank}.`,
+      );
+    }
+  }
+
+  return validEntryCount;
+}
+
+/**
+ * Loads every term_meta_bank_*.json file in a directory into one term -> rank map.
+ *
+ * Files are processed in sorted name order. A file that cannot be read or
+ * parsed is logged and skipped; the remaining files are still loaded.
+ *
+ * @param dictionaryPath - Directory to scan.
+ * @param log - Receives diagnostics for unreadable, unparsable or empty files.
+ * @returns The merged map; empty when the directory cannot be listed or holds no usable rows.
+ */
+function collectDictionaryFromPath(
+  dictionaryPath: string,
+  log: (message: string) => void,
+): Map<string, number> {
+  const terms = new Map<string, number>();
+
+  let fileNames: string[];
+  try {
+    fileNames = fs.readdirSync(dictionaryPath);
+  } catch {
+    log(`Failed to list frequency dictionary directory ${dictionaryPath}`);
+    return terms;
+  }
+
+  const bankFiles = fileNames
+    .filter((name) => FREQUENCY_BANK_FILE_GLOB.test(name))
+    .sort();
+
+  for (const bankFile of bankFiles) {
+    const bankPath = path.join(dictionaryPath, bankFile);
+    let rawText: string;
+    try {
+      rawText = fs.readFileSync(bankPath, "utf-8");
+    } catch {
+      log(`Failed to read frequency dictionary file ${bankPath}`);
+      continue;
+    }
+
+    let rawEntries: unknown;
+    try {
+      rawEntries = JSON.parse(rawText) as unknown;
+    } catch {
+      log(`Failed to parse frequency dictionary file as JSON: ${bankPath}`);
+      continue;
+    }
+
+    // Count validated rows rather than map growth: a file made up entirely of
+    // terms already loaded from an earlier bank is not an empty file.
+    const validEntryCount = addEntriesToMap(rawEntries, terms, log);
+    if (validEntryCount === 0) {
+      log(
+        `Frequency dictionary file contained no extractable entries: ${bankPath}`,
+      );
+    }
+  }
+
+  return terms;
+}
+
+/**
+ * Builds a term -> rank lookup from the first search path that yields entries.
+ *
+ * Paths are tried in the order given. A path that does not exist, is not a
+ * directory, or cannot be inspected is skipped so that later candidates are
+ * still tried. When no path produces entries, a lookup that always returns
+ * null is returned instead.
+ *
+ * @param options - Candidate directories and a log sink.
+ * @returns A function mapping a term to its rank, or null for unknown or empty terms.
+ */
+export async function createFrequencyDictionaryLookupService(
+  options: FrequencyDictionaryLookupOptions,
+): Promise<(term: string) => number | null> {
+  const attemptedPaths: string[] = [];
+  let foundDictionaryPathCount = 0;
+
+  for (const dictionaryPath of options.searchPaths) {
+    attemptedPaths.push(dictionaryPath);
+
+    let isDirectory: boolean;
+    try {
+      isDirectory =
+        fs.existsSync(dictionaryPath) &&
+        fs.statSync(dictionaryPath).isDirectory();
+    } catch {
+      // statSync can still throw after existsSync succeeded (path removed in
+      // between, permission error); one bad candidate must not abort the fallback.
+      options.log(
+        `Failed to inspect frequency dictionary path ${dictionaryPath}`,
+      );
+      continue;
+    }
+    if (!isDirectory) {
+      continue;
+    }
+
+    foundDictionaryPathCount += 1;
+    const terms = collectDictionaryFromPath(dictionaryPath, options.log);
+    if (terms.size > 0) {
+      options.log(
+        `Frequency dictionary loaded from ${dictionaryPath} (${terms.size} entries)`,
+      );
+      return (term: string): number | null => {
+        const normalized = normalizeFrequencyTerm(term);
+        if (!normalized) return null;
+        return terms.get(normalized) ?? null;
+      };
+    }
+
+    options.log(
+      `Frequency dictionary directory exists but contains no readable term_meta_bank_*.json files: ${dictionaryPath}`,
+    );
+  }
+
+  options.log(
+    `Frequency dictionary not found. Searched ${attemptedPaths.length} candidate path(s): ${attemptedPaths.join(", ")}`,
+  );
+  if (foundDictionaryPathCount > 0) {
+    options.log(
+      "Frequency dictionary directories found, but no usable term_meta_bank_*.json files were loaded.",
+    );
+  }
+
+  return NOOP_LOOKUP;
+}
+
@@ -32,6 +32,7 @@ export {
 } from "./startup-service";
 export { openYomitanSettingsWindow } from "./yomitan-settings-service";
 export { createTokenizerDepsRuntimeService, tokenizeSubtitleService } from "./tokenizer-service";
+export { createFrequencyDictionaryLookupService } from "./frequency-dictionary-service";
 export { createJlptVocabularyLookupService } from "./jlpt-vocab-service";
 export {
  getIgnoredPos1Entries,
@@ -190,6 +190,75 @@ test("tokenizeSubtitleService skips JLPT lookups when disabled", async () => {
  assert.equal(lookupCalls, 0);
 });

+// With the frequency dictionary enabled, each token should carry the rank the
+// stubbed getFrequencyRank returns: 23 for "猫" and 1200 for any other text.
+test("tokenizeSubtitleService applies frequency dictionary ranks", async () => {
+  const result = await tokenizeSubtitleService(
+    "猫です",
+    makeDeps({
+      getFrequencyDictionaryEnabled: () => true,
+      tokenizeWithMecab: async () => [
+        {
+          headword: "猫",
+          surface: "猫",
+          reading: "ネコ",
+          startPos: 0,
+          endPos: 1,
+          partOfSpeech: PartOfSpeech.noun,
+          isMerged: false,
+          isKnown: false,
+          isNPlusOneTarget: false,
+        },
+        {
+          headword: "です",
+          surface: "です",
+          reading: "デス",
+          startPos: 1,
+          // NOTE(review): "です" is two characters, so endPos 2 looks short if
+          // positions are character offsets — confirm against the tokenizer.
+          endPos: 2,
+          partOfSpeech: PartOfSpeech.bound_auxiliary,
+          isMerged: false,
+          isKnown: false,
+          isNPlusOneTarget: false,
+        },
+      ],
+      getFrequencyRank: (text) => (text === "猫" ? 23 : 1200),
+    }),
+  );
+
+  assert.equal(result.tokens?.length, 2);
+  assert.equal(result.tokens?.[0]?.frequencyRank, 23);
+  assert.equal(result.tokens?.[1]?.frequencyRank, 1200);
+});
+
+// With the frequency dictionary disabled, getFrequencyRank must never be
+// invoked and the resulting token must have no frequencyRank.
+test("tokenizeSubtitleService skips frequency lookups when disabled", async () => {
+  let frequencyCalls = 0;
+  const result = await tokenizeSubtitleService(
+    "猫",
+    makeDeps({
+      getFrequencyDictionaryEnabled: () => false,
+      tokenizeWithMecab: async () => [
+        {
+          headword: "猫",
+          surface: "猫",
+          reading: "ネコ",
+          startPos: 0,
+          endPos: 1,
+          partOfSpeech: PartOfSpeech.noun,
+          isMerged: false,
+          isKnown: false,
+          isNPlusOneTarget: false,
+        },
+      ],
+      // Counts invocations so the assertion below can prove the lookup was skipped.
+      getFrequencyRank: () => {
+        frequencyCalls += 1;
+        return 10;
+      },
+    }),
+  );
+
+  assert.equal(result.tokens?.length, 1);
+  assert.equal(result.tokens?.[0]?.frequencyRank, undefined);
+  assert.equal(frequencyCalls, 0);
+});
+
 test("tokenizeSubtitleService skips JLPT level for excluded demonstratives", async () => {
  const result = await tokenizeSubtitleService(
    "この",
@@ -7,6 +7,7 @@ import {
  PartOfSpeech,
  SubtitleData,
  Token,
+  FrequencyDictionaryLookup,
 } from "../../types";
 import {
  shouldIgnoreJlptForMecabPos1,
@@ -35,11 +36,16 @@ const KATAKANA_TO_HIRAGANA_OFFSET = 0x60;
 const KATAKANA_CODEPOINT_START = 0x30a1;
 const KATAKANA_CODEPOINT_END = 0x30f6;
 const JLPT_LEVEL_LOOKUP_CACHE_LIMIT = 2048;
+// Maximum number of entries retained in each per-lookup frequency rank cache.
+const FREQUENCY_RANK_LOOKUP_CACHE_LIMIT = 2048;

 const jlptLevelLookupCaches = new WeakMap<
  (text: string) => JlptLevel | null,
  Map<string, JlptLevel | null>
 >();
+// One cache per lookup function, mapping normalized text to its rank
+// (null = looked up, no rank). Keyed weakly by the lookup function itself.
+const frequencyRankLookupCaches = new WeakMap<
+  FrequencyDictionaryLookup,
+  Map<string, number | null>
+>();

 function isObject(value: unknown): value is Record<string, unknown> {
  return Boolean(value && typeof value === "object");
@@ -61,6 +67,8 @@ export interface TokenizerServiceDeps {
  getKnownWordMatchMode: () => NPlusOneMatchMode;
  getJlptLevel: (text: string) => JlptLevel | null;
  getJlptEnabled?: () => boolean;
+  getFrequencyDictionaryEnabled?: () => boolean;
+  getFrequencyRank?: FrequencyDictionaryLookup;
  getMinSentenceWordsForNPlusOne?: () => number;
  tokenizeWithMecab: (text: string) => Promise<MergedToken[] | null>;
 }
@@ -81,6 +89,8 @@ export interface TokenizerDepsRuntimeOptions {
  getKnownWordMatchMode: () => NPlusOneMatchMode;
  getJlptLevel: (text: string) => JlptLevel | null;
  getJlptEnabled?: () => boolean;
+  getFrequencyDictionaryEnabled?: () => boolean;
+  getFrequencyRank?: FrequencyDictionaryLookup;
  getMinSentenceWordsForNPlusOne?: () => number;
  getMecabTokenizer: () => MecabTokenizerLike | null;
 }
@@ -122,6 +132,47 @@ function getCachedJlptLevel(
  return level;
 }

+/** Produces the cache/lookup key for a token text: trimmed and lowercased. */
+function normalizeFrequencyLookupText(rawText: string): string {
+  return rawText.trim().toLowerCase();
+}
+
+/**
+ * Resolves the frequency rank for a text through a bounded cache that is
+ * kept separately for each lookup function.
+ *
+ * @param lookupText - Raw text to look up; normalized before use.
+ * @param getFrequencyRank - Lookup invoked on a cache miss with the normalized text.
+ * @returns The rank, or null when the text is empty after normalization, the
+ *   lookup returns null, or the lookup throws.
+ */
+function getCachedFrequencyRank(
+  lookupText: string,
+  getFrequencyRank: FrequencyDictionaryLookup,
+): number | null {
+  const normalizedText = normalizeFrequencyLookupText(lookupText);
+  if (!normalizedText) {
+    return null;
+  }
+
+  let cache = frequencyRankLookupCaches.get(getFrequencyRank);
+  if (!cache) {
+    cache = new Map<string, number | null>();
+    frequencyRankLookupCaches.set(getFrequencyRank, cache);
+  }
+
+  // has() rather than get(): a cached null (no rank) must count as a hit.
+  if (cache.has(normalizedText)) {
+    return cache.get(normalizedText) ?? null;
+  }
+
+  let rank: number | null;
+  try {
+    rank = getFrequencyRank(normalizedText);
+  } catch {
+    // A throwing lookup is treated as "no rank"; the null is cached below.
+    rank = null;
+  }
+
+  cache.set(normalizedText, rank);
+  // Map iterates in insertion order, so this drops the oldest-inserted keys
+  // until the cache is back within its limit.
+  while (cache.size > FREQUENCY_RANK_LOOKUP_CACHE_LIMIT) {
+    const firstKey = cache.keys().next().value;
+    if (firstKey !== undefined) {
+      cache.delete(firstKey);
+    }
+  }
+
+  return rank;
+}
+
 export function createTokenizerDepsRuntimeService(
  options: TokenizerDepsRuntimeOptions,
 ): TokenizerServiceDeps {
@@ -137,6 +188,8 @@ export function createTokenizerDepsRuntimeService(
    getKnownWordMatchMode: options.getKnownWordMatchMode,
    getJlptLevel: options.getJlptLevel,
    getJlptEnabled: options.getJlptEnabled,
+    getFrequencyDictionaryEnabled: options.getFrequencyDictionaryEnabled,
+    getFrequencyRank: options.getFrequencyRank,
    getMinSentenceWordsForNPlusOne:
      options.getMinSentenceWordsForNPlusOne ?? (() => 3),
    tokenizeWithMecab: async (text) => {
@@ -184,6 +237,34 @@ function applyKnownWordMarking(
  });
 }

+/**
+ * Chooses the text used for a token's frequency lookup: the headword when
+ * non-empty, otherwise the reading when non-empty, otherwise the surface.
+ */
+function resolveFrequencyLookupText(token: MergedToken): string {
+  if (token.headword && token.headword.length > 0) {
+    return token.headword;
+  }
+  if (token.reading && token.reading.length > 0) {
+    return token.reading;
+  }
+  return token.surface;
+}
+
+/**
+ * Returns copies of the tokens with `frequencyRank` set from the lookup.
+ *
+ * @param tokens - Tokens to annotate; the input objects are not modified.
+ * @param getFrequencyRank - Lookup used (through the rank cache) for each token.
+ * @returns A new array in which `frequencyRank` is the found rank, or
+ *   undefined when the token has no lookup text or the lookup yields null.
+ */
+function applyFrequencyMarking(
+  tokens: MergedToken[],
+  getFrequencyRank: FrequencyDictionaryLookup,
+): MergedToken[] {
+  return tokens.map((token) => {
+    const lookupText = resolveFrequencyLookupText(token);
+    if (!lookupText) {
+      return { ...token, frequencyRank: undefined };
+    }
+
+    const rank = getCachedFrequencyRank(lookupText, getFrequencyRank);
+    return {
+      ...token,
+      // The token field uses undefined, not null, to mean "no rank".
+      frequencyRank: rank ?? undefined,
+    };
+  });
+}
+
 function resolveJlptLookupText(token: MergedToken): string {
  if (token.headword && token.headword.length > 0) {
    return token.headword;
@@ -753,6 +834,8 @@ export async function tokenizeSubtitleService(
    .replace(/\s+/g, " ")
    .trim();
  const jlptEnabled = deps.getJlptEnabled?.() !== false;
+  const frequencyEnabled = deps.getFrequencyDictionaryEnabled?.() !== false;
+  const frequencyLookup = deps.getFrequencyRank;

  const yomitanTokens = await parseWithYomitanInternalParser(tokenizeText, deps);
  if (yomitanTokens && yomitanTokens.length > 0) {
@@ -761,9 +844,16 @@ export async function tokenizeSubtitleService(
      deps.isKnownWord,
      deps.getKnownWordMatchMode(),
    );
-      const jlptMarkedTokens = jlptEnabled
-        ? applyJlptMarking(knownMarkedTokens, deps.getJlptLevel)
-        : knownMarkedTokens.map((token) => ({ ...token, jlptLevel: undefined }));
+    const frequencyMarkedTokens =
+      frequencyEnabled && frequencyLookup
+        ? applyFrequencyMarking(knownMarkedTokens, frequencyLookup)
+        : knownMarkedTokens.map((token) => ({
+          ...token,
+          frequencyRank: undefined,
+        }));
+    const jlptMarkedTokens = jlptEnabled
+      ? applyJlptMarking(frequencyMarkedTokens, deps.getJlptLevel)
+      : frequencyMarkedTokens.map((token) => ({ ...token, jlptLevel: undefined }));
    return {
      text: displayText,
      tokens: markNPlusOneTargets(
@@ -781,9 +871,16 @@ export async function tokenizeSubtitleService(
        deps.isKnownWord,
        deps.getKnownWordMatchMode(),
      );
+      const frequencyMarkedTokens =
+        frequencyEnabled && frequencyLookup
+          ? applyFrequencyMarking(knownMarkedTokens, frequencyLookup)
+          : knownMarkedTokens.map((token) => ({
+            ...token,
+            frequencyRank: undefined,
+          }));
      const jlptMarkedTokens = jlptEnabled
-        ? applyJlptMarking(knownMarkedTokens, deps.getJlptLevel)
-        : knownMarkedTokens.map((token) => ({ ...token, jlptLevel: undefined }));
+        ? applyJlptMarking(frequencyMarkedTokens, deps.getJlptLevel)
+        : frequencyMarkedTokens.map((token) => ({ ...token, jlptLevel: undefined }));
      return {
        text: displayText,
        tokens: markNPlusOneTargets(