mirror of
https://github.com/ksyasuda/SubMiner.git
synced 2026-02-28 06:22:45 -08:00
Update task metadata/docs and JLPT tokenizer work
This commit is contained in:
@@ -97,6 +97,7 @@ function collectDictionaryFromPath(
|
||||
for (const bank of JLPT_BANK_FILES) {
|
||||
const bankPath = path.join(dictionaryPath, bank.filename);
|
||||
if (!fs.existsSync(bankPath)) {
|
||||
log(`JLPT bank file missing for ${bank.level}: ${bankPath}`);
|
||||
continue;
|
||||
}
|
||||
|
||||
@@ -104,6 +105,7 @@ function collectDictionaryFromPath(
|
||||
try {
|
||||
rawText = fs.readFileSync(bankPath, "utf-8");
|
||||
} catch {
|
||||
log(`Failed to read JLPT bank file ${bankPath}`);
|
||||
continue;
|
||||
}
|
||||
|
||||
@@ -111,10 +113,22 @@ function collectDictionaryFromPath(
|
||||
try {
|
||||
rawEntries = JSON.parse(rawText) as unknown;
|
||||
} catch {
|
||||
log(`Failed to parse JLPT bank file as JSON: ${bankPath}`);
|
||||
continue;
|
||||
}
|
||||
|
||||
if (!Array.isArray(rawEntries)) {
|
||||
log(
|
||||
`JLPT bank file has unsupported format (expected JSON array): ${bankPath}`,
|
||||
);
|
||||
continue;
|
||||
}
|
||||
|
||||
const beforeSize = terms.size;
|
||||
addEntriesToMap(rawEntries, bank.level, terms, log);
|
||||
if (terms.size === beforeSize) {
|
||||
log(`JLPT bank file contained no extractable entries: ${bankPath}`);
|
||||
}
|
||||
}
|
||||
|
||||
return terms;
|
||||
@@ -124,8 +138,9 @@ export async function createJlptVocabularyLookupService(
|
||||
options: JlptVocabLookupOptions,
|
||||
): Promise<(term: string) => JlptLevel | null> {
|
||||
const attemptedPaths: string[] = [];
|
||||
let foundDirectoryCount = 0;
|
||||
let foundDictionaryPathCount = 0;
|
||||
let foundBankCount = 0;
|
||||
const resolvedBanks: string[] = [];
|
||||
for (const dictionaryPath of options.searchPaths) {
|
||||
attemptedPaths.push(dictionaryPath);
|
||||
if (!fs.existsSync(dictionaryPath)) {
|
||||
@@ -136,10 +151,11 @@ export async function createJlptVocabularyLookupService(
|
||||
continue;
|
||||
}
|
||||
|
||||
foundDirectoryCount += 1;
|
||||
foundDictionaryPathCount += 1;
|
||||
|
||||
const terms = collectDictionaryFromPath(dictionaryPath, options.log);
|
||||
if (terms.size > 0) {
|
||||
resolvedBanks.push(dictionaryPath);
|
||||
foundBankCount += 1;
|
||||
options.log(
|
||||
`JLPT dictionary loaded from ${dictionaryPath} (${terms.size} entries)`,
|
||||
@@ -159,10 +175,13 @@ export async function createJlptVocabularyLookupService(
|
||||
options.log(
|
||||
`JLPT dictionary not found. Searched ${attemptedPaths.length} candidate path(s): ${attemptedPaths.join(", ")}`,
|
||||
);
|
||||
if (foundDirectoryCount > 0 && foundBankCount === 0) {
|
||||
if (foundDictionaryPathCount > 0 && foundBankCount === 0) {
|
||||
options.log(
|
||||
"JLPT dictionary directories found, but none contained valid term_meta_bank_*.json files.",
|
||||
);
|
||||
}
|
||||
if (resolvedBanks.length > 0 && foundBankCount > 0) {
|
||||
options.log(`JLPT dictionary search matched path(s): ${resolvedBanks.join(", ")}`);
|
||||
}
|
||||
return NOOP_LOOKUP;
|
||||
}
|
||||
|
||||
@@ -32,6 +32,12 @@ type YomitanParseLine = YomitanParseSegment[];
|
||||
// Code point distance between a katakana character and its hiragana counterpart
// (for example U+30A1 vs U+3041).
const KATAKANA_TO_HIRAGANA_OFFSET = 0x60;
|
||||
// U+30A1 is KATAKANA LETTER SMALL A. NOTE(review): presumably the inclusive lower
// bound of the range converted to hiragana; the code using it is outside this view.
const KATAKANA_CODEPOINT_START = 0x30a1;
|
||||
// U+30F6 is KATAKANA LETTER SMALL KE. NOTE(review): presumably the inclusive upper
// bound of the same range; confirm against the conversion helper.
const KATAKANA_CODEPOINT_END = 0x30f6;
|
||||
// Maximum number of entries kept in each per-lookup-function JLPT level cache;
// getCachedJlptLevel evicts the oldest inserted entries once this is exceeded.
const JLPT_LEVEL_LOOKUP_CACHE_LIMIT = 2048;
|
||||
|
||||
// Memoized JLPT lookup results, one Map (lookup text -> level or null) per lookup
// function. The lookup function itself is the WeakMap key, so a cache does not keep
// its lookup function alive.
const jlptLevelLookupCaches = new WeakMap<
|
||||
(text: string) => JlptLevel | null,
|
||||
Map<string, JlptLevel | null>
|
||||
>();
|
||||
|
||||
function isObject(value: unknown): value is Record<string, unknown> {
|
||||
return Boolean(value && typeof value === "object");
|
||||
@@ -75,6 +81,43 @@ export interface TokenizerDepsRuntimeOptions {
|
||||
getMecabTokenizer: () => MecabTokenizerLike | null;
|
||||
}
|
||||
|
||||
/**
 * Memoizing wrapper around a JLPT level lookup function.
 *
 * Results are stored per lookup function in `jlptLevelLookupCaches`, keyed by the
 * trimmed lookup text. Both found levels and `null` ("no level") answers are cached.
 * Each per-function cache holds at most `JLPT_LEVEL_LOOKUP_CACHE_LIMIT` entries;
 * when the limit is exceeded the oldest inserted entries are dropped first
 * (cache hits do not refresh an entry's position).
 *
 * @param lookupText - Text to look up; surrounding whitespace is trimmed before use.
 * @param getJlptLevel - Underlying lookup. It also selects which cache is used.
 * @returns The level reported by `getJlptLevel`, or `null` when the text is blank,
 *   no level is known, or the lookup throws.
 */
function getCachedJlptLevel(
  lookupText: string,
  getJlptLevel: (text: string) => JlptLevel | null,
): JlptLevel | null {
  const normalizedText = lookupText.trim();
  if (!normalizedText) {
    return null;
  }

  let cache = jlptLevelLookupCaches.get(getJlptLevel);
  if (!cache) {
    cache = new Map<string, JlptLevel | null>();
    jlptLevelLookupCaches.set(getJlptLevel, cache);
  }

  // `has` is checked separately because a cached `null` is a valid answer.
  if (cache.has(normalizedText)) {
    return cache.get(normalizedText) ?? null;
  }

  let level: JlptLevel | null;
  try {
    level = getJlptLevel(normalizedText);
  } catch {
    // A failing lookup is reported as "no level" but deliberately not cached,
    // so a transient failure cannot mask the real answer on later calls.
    return null;
  }

  cache.set(normalizedText, level);

  // Map iterates in insertion order, so the first key is the oldest entry.
  while (cache.size > JLPT_LEVEL_LOOKUP_CACHE_LIMIT) {
    const oldest = cache.keys().next();
    if (oldest.done) {
      // Nothing left to evict; never spin on an empty iterator.
      break;
    }
    cache.delete(oldest.value);
  }

  return level;
}
|
||||
|
||||
export function createTokenizerDepsRuntimeService(
|
||||
options: TokenizerDepsRuntimeOptions,
|
||||
): TokenizerServiceDeps {
|
||||
@@ -326,13 +369,17 @@ function applyJlptMarking(
|
||||
return { ...token, jlptLevel: undefined };
|
||||
}
|
||||
|
||||
const primaryLevel = getJlptLevel(resolveJlptLookupText(token));
|
||||
const fallbackLevel = getJlptLevel(token.surface);
|
||||
const primaryLevel = getCachedJlptLevel(
|
||||
resolveJlptLookupText(token),
|
||||
getJlptLevel,
|
||||
);
|
||||
const fallbackLevel =
|
||||
primaryLevel === null ? getCachedJlptLevel(token.surface, getJlptLevel) : null;
|
||||
|
||||
return {
|
||||
...token,
|
||||
jlptLevel: primaryLevel ?? fallbackLevel ?? token.jlptLevel,
|
||||
};
|
||||
return {
|
||||
...token,
|
||||
jlptLevel: primaryLevel ?? fallbackLevel ?? token.jlptLevel,
|
||||
};
|
||||
});
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user