Update task metadata/docs and JLPT tokenizer work

This commit is contained in:
2026-02-15 18:18:08 -08:00
parent 1ca9cbc20d
commit f1b5082801
11 changed files with 210 additions and 83 deletions

View File

@@ -97,6 +97,7 @@ function collectDictionaryFromPath(
for (const bank of JLPT_BANK_FILES) {
const bankPath = path.join(dictionaryPath, bank.filename);
if (!fs.existsSync(bankPath)) {
log(`JLPT bank file missing for ${bank.level}: ${bankPath}`);
continue;
}
@@ -104,6 +105,7 @@ function collectDictionaryFromPath(
try {
rawText = fs.readFileSync(bankPath, "utf-8");
} catch {
log(`Failed to read JLPT bank file ${bankPath}`);
continue;
}
@@ -111,10 +113,22 @@ function collectDictionaryFromPath(
try {
rawEntries = JSON.parse(rawText) as unknown;
} catch {
log(`Failed to parse JLPT bank file as JSON: ${bankPath}`);
continue;
}
if (!Array.isArray(rawEntries)) {
log(
`JLPT bank file has unsupported format (expected JSON array): ${bankPath}`,
);
continue;
}
const beforeSize = terms.size;
addEntriesToMap(rawEntries, bank.level, terms, log);
if (terms.size === beforeSize) {
log(`JLPT bank file contained no extractable entries: ${bankPath}`);
}
}
return terms;
@@ -124,8 +138,9 @@ export async function createJlptVocabularyLookupService(
options: JlptVocabLookupOptions,
): Promise<(term: string) => JlptLevel | null> {
const attemptedPaths: string[] = [];
let foundDirectoryCount = 0;
let foundDictionaryPathCount = 0;
let foundBankCount = 0;
const resolvedBanks: string[] = [];
for (const dictionaryPath of options.searchPaths) {
attemptedPaths.push(dictionaryPath);
if (!fs.existsSync(dictionaryPath)) {
@@ -136,10 +151,11 @@ export async function createJlptVocabularyLookupService(
continue;
}
foundDirectoryCount += 1;
foundDictionaryPathCount += 1;
const terms = collectDictionaryFromPath(dictionaryPath, options.log);
if (terms.size > 0) {
resolvedBanks.push(dictionaryPath);
foundBankCount += 1;
options.log(
`JLPT dictionary loaded from ${dictionaryPath} (${terms.size} entries)`,
@@ -159,10 +175,13 @@ export async function createJlptVocabularyLookupService(
options.log(
`JLPT dictionary not found. Searched ${attemptedPaths.length} candidate path(s): ${attemptedPaths.join(", ")}`,
);
if (foundDirectoryCount > 0 && foundBankCount === 0) {
if (foundDictionaryPathCount > 0 && foundBankCount === 0) {
options.log(
"JLPT dictionary directories found, but none contained valid term_meta_bank_*.json files.",
);
}
if (resolvedBanks.length > 0 && foundBankCount > 0) {
options.log(`JLPT dictionary search matched path(s): ${resolvedBanks.join(", ")}`);
}
return NOOP_LOOKUP;
}

View File

@@ -32,6 +32,12 @@ type YomitanParseLine = YomitanParseSegment[];
const KATAKANA_TO_HIRAGANA_OFFSET = 0x60;
const KATAKANA_CODEPOINT_START = 0x30a1;
const KATAKANA_CODEPOINT_END = 0x30f6;
const JLPT_LEVEL_LOOKUP_CACHE_LIMIT = 2048;
const jlptLevelLookupCaches = new WeakMap<
(text: string) => JlptLevel | null,
Map<string, JlptLevel | null>
>();
function isObject(value: unknown): value is Record<string, unknown> {
return Boolean(value && typeof value === "object");
@@ -75,6 +81,43 @@ export interface TokenizerDepsRuntimeOptions {
getMecabTokenizer: () => MecabTokenizerLike | null;
}
/**
 * Memoized wrapper around a JLPT level lookup function.
 *
 * Results are cached per lookup function in a bounded, insertion-ordered
 * Map held in the module-level `jlptLevelLookupCaches` WeakMap, so caches
 * are garbage-collected together with their lookup functions.
 *
 * @param lookupText - Raw text to look up; trimmed before use.
 * @param getJlptLevel - Underlying lookup; a throw is treated as "no level".
 * @returns The cached or freshly computed level, or `null` for empty input,
 *          a miss, or a lookup error.
 */
function getCachedJlptLevel(
  lookupText: string,
  getJlptLevel: (text: string) => JlptLevel | null,
): JlptLevel | null {
  const normalizedText = lookupText.trim();
  if (!normalizedText) {
    return null;
  }
  // One cache per lookup function; created lazily on first use.
  let cache = jlptLevelLookupCaches.get(getJlptLevel);
  if (!cache) {
    cache = new Map<string, JlptLevel | null>();
    jlptLevelLookupCaches.set(getJlptLevel, cache);
  }
  // Negative results (null) are cached too, so use has() rather than get().
  if (cache.has(normalizedText)) {
    return cache.get(normalizedText) ?? null;
  }
  let level: JlptLevel | null;
  try {
    level = getJlptLevel(normalizedText);
  } catch {
    // Treat lookup failures as "no level" and cache that outcome.
    level = null;
  }
  cache.set(normalizedText, level);
  // Evict oldest entries (Map preserves insertion order) once over the limit.
  // Unlike the previous version, break when no key is available instead of
  // merely skipping the delete — that guard could otherwise loop forever.
  while (cache.size > JLPT_LEVEL_LOOKUP_CACHE_LIMIT) {
    const oldestKey = cache.keys().next().value;
    if (oldestKey === undefined) {
      break;
    }
    cache.delete(oldestKey);
  }
  return level;
}
export function createTokenizerDepsRuntimeService(
options: TokenizerDepsRuntimeOptions,
): TokenizerServiceDeps {
@@ -326,13 +369,17 @@ function applyJlptMarking(
return { ...token, jlptLevel: undefined };
}
const primaryLevel = getJlptLevel(resolveJlptLookupText(token));
const fallbackLevel = getJlptLevel(token.surface);
const primaryLevel = getCachedJlptLevel(
resolveJlptLookupText(token),
getJlptLevel,
);
const fallbackLevel =
primaryLevel === null ? getCachedJlptLevel(token.surface, getJlptLevel) : null;
return {
...token,
jlptLevel: primaryLevel ?? fallbackLevel ?? token.jlptLevel,
};
return {
...token,
jlptLevel: primaryLevel ?? fallbackLevel ?? token.jlptLevel,
};
});
}