Update task metadata/docs and JLPT tokenizer work

2026-06-16 03:13:30 -07:00 · 2026-02-15 18:18:08 -08:00
parent 1ca9cbc20d
commit f1b5082801
11 changed files with 210 additions and 83 deletions
@@ -97,6 +97,7 @@ function collectDictionaryFromPath(
  for (const bank of JLPT_BANK_FILES) {
    const bankPath = path.join(dictionaryPath, bank.filename);
    if (!fs.existsSync(bankPath)) {
+      log(`JLPT bank file missing for ${bank.level}: ${bankPath}`);
      continue;
    }

@@ -104,6 +105,7 @@ function collectDictionaryFromPath(
    try {
      rawText = fs.readFileSync(bankPath, "utf-8");
    } catch {
+      log(`Failed to read JLPT bank file ${bankPath}`);
      continue;
    }

@@ -111,10 +113,22 @@ function collectDictionaryFromPath(
    try {
      rawEntries = JSON.parse(rawText) as unknown;
    } catch {
+      log(`Failed to parse JLPT bank file as JSON: ${bankPath}`);
      continue;
    }

+    if (!Array.isArray(rawEntries)) {
+      log(
+        `JLPT bank file has unsupported format (expected JSON array): ${bankPath}`,
+      );
+      continue;
+    }
+
+    const beforeSize = terms.size;
    addEntriesToMap(rawEntries, bank.level, terms, log);
+    if (terms.size === beforeSize) {
+      log(`JLPT bank file contained no extractable entries: ${bankPath}`);
+    }
  }

  return terms;
@@ -124,8 +138,9 @@ export async function createJlptVocabularyLookupService(
  options: JlptVocabLookupOptions,
 ): Promise<(term: string) => JlptLevel | null> {
  const attemptedPaths: string[] = [];
-  let foundDirectoryCount = 0;
+  let foundDictionaryPathCount = 0;
  let foundBankCount = 0;
+  const resolvedBanks: string[] = [];
  for (const dictionaryPath of options.searchPaths) {
    attemptedPaths.push(dictionaryPath);
    if (!fs.existsSync(dictionaryPath)) {
@@ -136,10 +151,11 @@ export async function createJlptVocabularyLookupService(
      continue;
    }

-    foundDirectoryCount += 1;
+    foundDictionaryPathCount += 1;

    const terms = collectDictionaryFromPath(dictionaryPath, options.log);
    if (terms.size > 0) {
+      resolvedBanks.push(dictionaryPath);
      foundBankCount += 1;
      options.log(
        `JLPT dictionary loaded from ${dictionaryPath} (${terms.size} entries)`,
@@ -159,10 +175,13 @@ export async function createJlptVocabularyLookupService(
  options.log(
    `JLPT dictionary not found. Searched ${attemptedPaths.length} candidate path(s): ${attemptedPaths.join(", ")}`,
  );
-  if (foundDirectoryCount > 0 && foundBankCount === 0) {
+  if (foundDictionaryPathCount > 0 && foundBankCount === 0) {
    options.log(
      "JLPT dictionary directories found, but none contained valid term_meta_bank_*.json files.",
    );
  }
+  if (resolvedBanks.length > 0 && foundBankCount > 0) {
+    options.log(`JLPT dictionary search matched path(s): ${resolvedBanks.join(", ")}`);
+  }
  return NOOP_LOOKUP;
 }
@@ -32,6 +32,12 @@ type YomitanParseLine = YomitanParseSegment[];
 const KATAKANA_TO_HIRAGANA_OFFSET = 0x60;
 const KATAKANA_CODEPOINT_START = 0x30a1;
 const KATAKANA_CODEPOINT_END = 0x30f6;
+const JLPT_LEVEL_LOOKUP_CACHE_LIMIT = 2048; // upper bound on entries kept in each per-lookup cache
+
+const jlptLevelLookupCaches = new WeakMap< // one memo table per lookup function, weakly keyed by that function
+  (text: string) => JlptLevel | null,
+  Map<string, JlptLevel | null> // trimmed lookup text -> cached result (may be null)
+>();

 function isObject(value: unknown): value is Record<string, unknown> {
  return Boolean(value && typeof value === "object");
@@ -75,6 +81,43 @@ export interface TokenizerDepsRuntimeOptions {
  getMecabTokenizer: () => MecabTokenizerLike | null;
 }

+/**
+ * Memoizing wrapper around a JLPT level lookup function.
+ *
+ * Results are stored in a cache that is private to the given `getJlptLevel`
+ * function (held in `jlptLevelLookupCaches`) and keyed by the trimmed lookup
+ * text, so repeated tokens do not invoke the lookup again. Each cache is
+ * bounded by `JLPT_LEVEL_LOOKUP_CACHE_LIMIT`; when the bound is exceeded the
+ * earliest-inserted keys are dropped first.
+ *
+ * @param lookupText - Text to look up; surrounding whitespace is ignored.
+ * @param getJlptLevel - Lookup function; it also selects which cache is used.
+ * @returns The value reported by `getJlptLevel`, or `null` when the text is
+ *   blank or the lookup throws.
+ */
+function getCachedJlptLevel(
+  lookupText: string,
+  getJlptLevel: (text: string) => JlptLevel | null,
+): JlptLevel | null {
+  const normalizedText = lookupText.trim();
+  if (!normalizedText) {
+    return null;
+  }
+
+  let cache = jlptLevelLookupCaches.get(getJlptLevel);
+  if (!cache) {
+    cache = new Map<string, JlptLevel | null>();
+    jlptLevelLookupCaches.set(getJlptLevel, cache);
+  }
+
+  // `has` rather than `get`: a cached `null` is a valid (negative) result.
+  if (cache.has(normalizedText)) {
+    return cache.get(normalizedText) ?? null;
+  }
+
+  let level: JlptLevel | null;
+  try {
+    level = getJlptLevel(normalizedText);
+  } catch {
+    // Best-effort: report "no level" for this call, but do not memoize the
+    // failure, otherwise one transient error would mask the real level for
+    // as long as the entry stays in the cache.
+    return null;
+  }
+
+  cache.set(normalizedText, level);
+  // Map iterates in insertion order, so the first key is the oldest entry.
+  while (cache.size > JLPT_LEVEL_LOOKUP_CACHE_LIMIT) {
+    const oldestKey = cache.keys().next().value;
+    if (oldestKey === undefined) {
+      // Nothing left to evict; bail out instead of spinning forever.
+      break;
+    }
+    cache.delete(oldestKey);
+  }
+
+  return level;
+}
+
 export function createTokenizerDepsRuntimeService(
  options: TokenizerDepsRuntimeOptions,
 ): TokenizerServiceDeps {
@@ -326,13 +369,17 @@ function applyJlptMarking(
      return { ...token, jlptLevel: undefined };
    }

-    const primaryLevel = getJlptLevel(resolveJlptLookupText(token));
-    const fallbackLevel = getJlptLevel(token.surface);
+    const primaryLevel = getCachedJlptLevel(
+      resolveJlptLookupText(token),
+      getJlptLevel,
+    );
+    const fallbackLevel =
+      primaryLevel === null ? getCachedJlptLevel(token.surface, getJlptLevel) : null;

-  return {
-    ...token,
-    jlptLevel: primaryLevel ?? fallbackLevel ?? token.jlptLevel,
-  };
+    return {
+      ...token,
+      jlptLevel: primaryLevel ?? fallbackLevel ?? token.jlptLevel,
+    };
  });
 }