From 50b45cac0b983d9daef27f102311e60b531d9443 Mon Sep 17 00:00:00 2001 From: sudacode Date: Mon, 16 Mar 2026 01:08:40 -0700 Subject: [PATCH] Summarize JLPT duplicate term conflicts into a single startup log (#26) --- changes/jlpt-duplicate-log-summary.md | 4 ++ src/core/services/jlpt-vocab.test.ts | 62 +++++++++++++++++++++++++-- src/core/services/jlpt-vocab.ts | 52 ++++++++++++++++++---- 3 files changed, 106 insertions(+), 12 deletions(-) create mode 100644 changes/jlpt-duplicate-log-summary.md diff --git a/changes/jlpt-duplicate-log-summary.md b/changes/jlpt-duplicate-log-summary.md new file mode 100644 index 0000000..9c88e15 --- /dev/null +++ b/changes/jlpt-duplicate-log-summary.md @@ -0,0 +1,4 @@ +type: fixed +area: jlpt + +- Reduced JLPT dictionary startup log noise by summarizing duplicate surface-form collisions instead of logging one line per duplicate entry. diff --git a/src/core/services/jlpt-vocab.test.ts b/src/core/services/jlpt-vocab.test.ts index 05e1437..3938641 100644 --- a/src/core/services/jlpt-vocab.test.ts +++ b/src/core/services/jlpt-vocab.test.ts @@ -6,9 +6,16 @@ import test from 'node:test'; import { createJlptVocabularyLookup } from './jlpt-vocab'; -test('createJlptVocabularyLookup loads JLPT bank entries and resolves known levels', async () => { +function createTempDir(): string { + return fs.mkdtempSync(path.join(os.tmpdir(), 'subminer-jlpt-dict-')); +} + +test('createJlptVocabularyLookup loads JLPT bank entries and resolves known levels', async (t) => { const logs: string[] = []; - const tempDir = fs.mkdtempSync(path.join(os.tmpdir(), 'subminer-jlpt-dict-')); + const tempDir = createTempDir(); + t.after(() => { + fs.rmSync(tempDir, { recursive: true, force: true }); + }); fs.writeFileSync( path.join(tempDir, 'term_meta_bank_5.json'), JSON.stringify([ @@ -37,8 +44,11 @@ test('createJlptVocabularyLookup loads JLPT bank entries and resolves known leve ); }); -test('createJlptVocabularyLookup does not require synchronous fs APIs', async () => { - const tempDir = fs.mkdtempSync(path.join(os.tmpdir(), 'subminer-jlpt-dict-')); +test('createJlptVocabularyLookup does not require synchronous fs APIs', async (t) => { + const tempDir = createTempDir(); + t.after(() => { + fs.rmSync(tempDir, { recursive: true, force: true }); + }); fs.writeFileSync( path.join(tempDir, 'term_meta_bank_4.json'), JSON.stringify([['見る', 1, { frequency: { displayValue: 3 } }]]), @@ -73,3 +83,47 @@ test('createJlptVocabularyLookup does not require synchronous fs APIs', async () (fs as unknown as Record).existsSync = existsSync; } }); + +test('createJlptVocabularyLookup summarizes duplicate JLPT terms without per-entry log spam', async (t) => { + const logs: string[] = []; + const tempDir = createTempDir(); + t.after(() => { + fs.rmSync(tempDir, { recursive: true, force: true }); + }); + fs.writeFileSync( + path.join(tempDir, 'term_meta_bank_1.json'), + JSON.stringify([ + ['余り', 1, { frequency: { displayValue: 'N1' }, reading: 'あんまり' }], + ['私', 2, { frequency: { displayValue: 'N1' }, reading: 'あたし' }], + ]), + ); + fs.writeFileSync(path.join(tempDir, 'term_meta_bank_2.json'), JSON.stringify([])); + fs.writeFileSync(path.join(tempDir, 'term_meta_bank_3.json'), JSON.stringify([])); + fs.writeFileSync(path.join(tempDir, 'term_meta_bank_4.json'), JSON.stringify([])); + fs.writeFileSync( + path.join(tempDir, 'term_meta_bank_5.json'), + JSON.stringify([ + ['余り', 3, { frequency: { displayValue: 'N5' }, reading: 'あまり' }], + ['私', 4, { frequency: { displayValue: 'N5' }, reading: 'わたし' }], + ['私', 5, { frequency: { displayValue: 'N5' }, reading: 'わたくし' }], + ]), + ); + + const lookup = await createJlptVocabularyLookup({ + searchPaths: [tempDir], + log: (message) => { + logs.push(message); + }, + }); + + assert.equal(lookup('余り'), 'N1'); + assert.equal(lookup('私'), 'N1'); + assert.equal( + logs.some((entry) => entry.includes('keeping') && entry.includes('instead')), + false, + ); + assert.equal( + logs.some((entry) => entry.includes('collapsed') && entry.includes('duplicate')), + true, + ); +}); diff --git a/src/core/services/jlpt-vocab.ts b/src/core/services/jlpt-vocab.ts index c133979..58e1a74 100644 --- a/src/core/services/jlpt-vocab.ts +++ b/src/core/services/jlpt-vocab.ts @@ -22,10 +22,17 @@ const JLPT_LEVEL_PRECEDENCE: Record = { N4: 2, N5: 1, }; +const JLPT_DUPLICATE_LOG_EXAMPLE_LIMIT = 5; const NOOP_LOOKUP = (): null => null; const ENTRY_YIELD_INTERVAL = 5000; +interface JlptDuplicateStats { + duplicateEntryCount: number; + duplicateTerms: Set; + exampleTerms: string[]; +} + function isErrorCode(error: unknown, code: string): boolean { return Boolean(error && typeof error === 'object' && (error as { code?: unknown }).code === code); } @@ -47,11 +54,30 @@ function hasFrequencyDisplayValue(meta: unknown): boolean { return Object.prototype.hasOwnProperty.call(frequency as Record, 'displayValue'); } +function createJlptDuplicateStats(): JlptDuplicateStats { + return { + duplicateEntryCount: 0, + duplicateTerms: new Set(), + exampleTerms: [], + }; +} + +function recordJlptDuplicate(stats: JlptDuplicateStats, term: string): void { + stats.duplicateEntryCount += 1; + stats.duplicateTerms.add(term); + if ( + stats.exampleTerms.length < JLPT_DUPLICATE_LOG_EXAMPLE_LIMIT && + !stats.exampleTerms.includes(term) + ) { + stats.exampleTerms.push(term); + } +} + async function addEntriesToMap( rawEntries: unknown, level: JlptLevel, terms: Map, - log: (message: string) => void, + duplicateStats: JlptDuplicateStats, ): Promise { const shouldUpdateLevel = ( existingLevel: JlptLevel | undefined, @@ -90,14 +116,13 @@ async function addEntriesToMap( } const existingLevel = terms.get(normalizedTerm); - if (shouldUpdateLevel(existingLevel, level)) { - terms.set(normalizedTerm, level); - continue; + if (existingLevel !== undefined) { + recordJlptDuplicate(duplicateStats, normalizedTerm); } - log( - `JLPT dictionary already has ${normalizedTerm} as ${existingLevel}; keeping that level instead of ${level}`, - ); + if (shouldUpdateLevel(existingLevel, level)) { + terms.set(normalizedTerm, level); + } } } @@ -106,6 +131,7 @@ async function collectDictionaryFromPath( log: (message: string) => void, ): Promise> { const terms = new Map(); + const duplicateStats = createJlptDuplicateStats(); for (const bank of JLPT_BANK_FILES) { const bankPath = path.join(dictionaryPath, bank.filename); @@ -146,12 +172,22 @@ async function collectDictionaryFromPath( } const beforeSize = terms.size; - await addEntriesToMap(rawEntries, bank.level, terms, log); + await addEntriesToMap(rawEntries, bank.level, terms, duplicateStats); if (terms.size === beforeSize) { log(`JLPT bank file contained no extractable entries: ${bankPath}`); } } + if (duplicateStats.duplicateEntryCount > 0) { + const examples = + duplicateStats.exampleTerms.length > 0 + ? `; examples: ${duplicateStats.exampleTerms.join(', ')}` + : ''; + log( + `JLPT dictionary collapsed ${duplicateStats.duplicateEntryCount} duplicate JLPT entries across ${duplicateStats.duplicateTerms.size} terms; keeping highest-precedence level per surface form${examples}`, + ); + } + return terms; }