From 50b45cac0b983d9daef27f102311e60b531d9443 Mon Sep 17 00:00:00 2001
From: sudacode <suda@sudacode.com>
Date: Mon, 16 Mar 2026 01:08:40 -0700
Subject: [PATCH] Summarize JLPT duplicate term conflicts into a single startup
 log (#26)

---
 changes/jlpt-duplicate-log-summary.md |  4 ++
 src/core/services/jlpt-vocab.test.ts  | 62 +++++++++++++++++++++++++--
 src/core/services/jlpt-vocab.ts       | 52 ++++++++++++++++++----
 3 files changed, 106 insertions(+), 12 deletions(-)
 create mode 100644 changes/jlpt-duplicate-log-summary.md

diff --git a/changes/jlpt-duplicate-log-summary.md b/changes/jlpt-duplicate-log-summary.md
new file mode 100644
index 0000000..9c88e15
--- /dev/null
+++ b/changes/jlpt-duplicate-log-summary.md
@@ -0,0 +1,4 @@
+type: fixed
+area: jlpt
+
+- Reduced JLPT dictionary startup log noise by summarizing duplicate surface-form collisions instead of logging one line per duplicate entry.
diff --git a/src/core/services/jlpt-vocab.test.ts b/src/core/services/jlpt-vocab.test.ts
index 05e1437..3938641 100644
--- a/src/core/services/jlpt-vocab.test.ts
+++ b/src/core/services/jlpt-vocab.test.ts
@@ -6,9 +6,16 @@ import test from 'node:test';
 
 import { createJlptVocabularyLookup } from './jlpt-vocab';
 
-test('createJlptVocabularyLookup loads JLPT bank entries and resolves known levels', async () => {
+function createTempDir(): string { // Test helper: creates a uniquely named scratch directory under the OS temp dir and returns its path.
+  return fs.mkdtempSync(path.join(os.tmpdir(), 'subminer-jlpt-dict-'));
+}
+
+test('createJlptVocabularyLookup loads JLPT bank entries and resolves known levels', async (t) => {
   const logs: string[] = [];
-  const tempDir = fs.mkdtempSync(path.join(os.tmpdir(), 'subminer-jlpt-dict-'));
+  const tempDir = createTempDir();
+  t.after(() => {
+    fs.rmSync(tempDir, { recursive: true, force: true });
+  });
   fs.writeFileSync(
     path.join(tempDir, 'term_meta_bank_5.json'),
     JSON.stringify([
@@ -37,8 +44,11 @@ test('createJlptVocabularyLookup loads JLPT bank entries and resolves known leve
   );
 });
 
-test('createJlptVocabularyLookup does not require synchronous fs APIs', async () => {
-  const tempDir = fs.mkdtempSync(path.join(os.tmpdir(), 'subminer-jlpt-dict-'));
+test('createJlptVocabularyLookup does not require synchronous fs APIs', async (t) => {
+  const tempDir = createTempDir();
+  t.after(() => {
+    fs.rmSync(tempDir, { recursive: true, force: true });
+  });
   fs.writeFileSync(
     path.join(tempDir, 'term_meta_bank_4.json'),
     JSON.stringify([['見る', 1, { frequency: { displayValue: 3 } }]]),
@@ -73,3 +83,47 @@ test('createJlptVocabularyLookup does not require synchronous fs APIs', async ()
     (fs as unknown as Record<string, unknown>).existsSync = existsSync;
   }
 });
+
+test('createJlptVocabularyLookup summarizes duplicate JLPT terms without per-entry log spam', async (t) => {
+  const logs: string[] = []; // Captures every message passed to the lookup's log callback.
+  const tempDir = createTempDir();
+  t.after(() => { // Remove the scratch directory once the test finishes.
+    fs.rmSync(tempDir, { recursive: true, force: true });
+  });
+  fs.writeFileSync( // First bank file: two terms tagged N1.
+    path.join(tempDir, 'term_meta_bank_1.json'),
+    JSON.stringify([
+      ['余り', 1, { frequency: { displayValue: 'N1' }, reading: 'あんまり' }],
+      ['私', 2, { frequency: { displayValue: 'N1' }, reading: 'あたし' }],
+    ]),
+  );
+  fs.writeFileSync(path.join(tempDir, 'term_meta_bank_2.json'), JSON.stringify([])); // Banks 2-4 are present but empty.
+  fs.writeFileSync(path.join(tempDir, 'term_meta_bank_3.json'), JSON.stringify([]));
+  fs.writeFileSync(path.join(tempDir, 'term_meta_bank_4.json'), JSON.stringify([]));
+  fs.writeFileSync( // Fifth bank file repeats the same surface forms with different readings ('私' appears twice).
+    path.join(tempDir, 'term_meta_bank_5.json'),
+    JSON.stringify([
+      ['余り', 3, { frequency: { displayValue: 'N5' }, reading: 'あまり' }],
+      ['私', 4, { frequency: { displayValue: 'N5' }, reading: 'わたし' }],
+      ['私', 5, { frequency: { displayValue: 'N5' }, reading: 'わたくし' }],
+    ]),
+  );
+
+  const lookup = await createJlptVocabularyLookup({
+    searchPaths: [tempDir],
+    log: (message) => {
+      logs.push(message);
+    },
+  });
+  // NOTE(review): whichever bank loads second adds no new keys, so the "contained no extractable entries" log presumably fires too — confirm that is intended.
+  assert.equal(lookup('余り'), 'N1'); // The N1 level is expected to win over N5 for a shared surface form.
+  assert.equal(lookup('私'), 'N1');
+  assert.equal( // The old per-entry "keeping ... instead of ..." message must no longer appear.
+    logs.some((entry) => entry.includes('keeping') && entry.includes('instead')),
+    false,
+  );
+  assert.equal( // Only the presence of a summary line is checked; its counts and example list are not pinned.
+    logs.some((entry) => entry.includes('collapsed') && entry.includes('duplicate')),
+    true,
+  );
+});
diff --git a/src/core/services/jlpt-vocab.ts b/src/core/services/jlpt-vocab.ts
index c133979..58e1a74 100644
--- a/src/core/services/jlpt-vocab.ts
+++ b/src/core/services/jlpt-vocab.ts
@@ -22,10 +22,17 @@ const JLPT_LEVEL_PRECEDENCE: Record<JlptLevel, number> = {
   N4: 2,
   N5: 1,
 };
+const JLPT_DUPLICATE_LOG_EXAMPLE_LIMIT = 5; // Maximum number of distinct example terms kept for the duplicate summary log.
 
 const NOOP_LOOKUP = (): null => null;
 const ENTRY_YIELD_INTERVAL = 5000;
 
+interface JlptDuplicateStats { // Running tally of terms encountered more than once while loading JLPT bank files.
+  duplicateEntryCount: number; // Total duplicate occurrences recorded (incremented once per repeated entry).
+  duplicateTerms: Set<string>; // Distinct terms that had at least one duplicate occurrence.
+  exampleTerms: string[]; // Distinct duplicated terms in first-seen order, capped at JLPT_DUPLICATE_LOG_EXAMPLE_LIMIT.
+}
+
 function isErrorCode(error: unknown, code: string): boolean {
   return Boolean(error && typeof error === 'object' && (error as { code?: unknown }).code === code);
 }
@@ -47,11 +54,30 @@ function hasFrequencyDisplayValue(meta: unknown): boolean {
   return Object.prototype.hasOwnProperty.call(frequency as Record<string, unknown>, 'displayValue');
 }
 
+function createJlptDuplicateStats(): JlptDuplicateStats { // Returns a fresh, empty duplicate tally.
+  return {
+    duplicateEntryCount: 0,
+    duplicateTerms: new Set<string>(),
+    exampleTerms: [],
+  };
+}
+
+function recordJlptDuplicate(stats: JlptDuplicateStats, term: string): void { // Records one duplicate occurrence of `term`, mutating `stats` in place.
+  stats.duplicateEntryCount += 1; // Counts every occurrence, including repeats of an already-recorded term.
+  stats.duplicateTerms.add(term); // The Set keeps the distinct-term count.
+  if (
+    stats.exampleTerms.length < JLPT_DUPLICATE_LOG_EXAMPLE_LIMIT && // Stop collecting examples once the cap is reached...
+    !stats.exampleTerms.includes(term) // ...and never list the same term twice.
+  ) {
+    stats.exampleTerms.push(term);
+  }
+}
+
 async function addEntriesToMap(
   rawEntries: unknown,
   level: JlptLevel,
   terms: Map<string, JlptLevel>,
-  log: (message: string) => void,
+  duplicateStats: JlptDuplicateStats,
 ): Promise<void> {
   const shouldUpdateLevel = (
     existingLevel: JlptLevel | undefined,
@@ -90,14 +116,13 @@ async function addEntriesToMap(
     }
 
     const existingLevel = terms.get(normalizedTerm);
-    if (shouldUpdateLevel(existingLevel, level)) {
-      terms.set(normalizedTerm, level);
-      continue;
+    if (existingLevel !== undefined) {
+      recordJlptDuplicate(duplicateStats, normalizedTerm);
     }
 
-    log(
-      `JLPT dictionary already has ${normalizedTerm} as ${existingLevel}; keeping that level instead of ${level}`,
-    );
+    if (shouldUpdateLevel(existingLevel, level)) {
+      terms.set(normalizedTerm, level);
+    }
   }
 }
 
@@ -106,6 +131,7 @@ async function collectDictionaryFromPath(
   log: (message: string) => void,
 ): Promise<Map<string, JlptLevel>> {
   const terms = new Map<string, JlptLevel>();
+  const duplicateStats = createJlptDuplicateStats();
 
   for (const bank of JLPT_BANK_FILES) {
     const bankPath = path.join(dictionaryPath, bank.filename);
@@ -146,12 +172,22 @@ async function collectDictionaryFromPath(
     }
 
     const beforeSize = terms.size;
-    await addEntriesToMap(rawEntries, bank.level, terms, log);
+    await addEntriesToMap(rawEntries, bank.level, terms, duplicateStats);
     if (terms.size === beforeSize) {
       log(`JLPT bank file contained no extractable entries: ${bankPath}`);
     }
   }
 
+  if (duplicateStats.duplicateEntryCount > 0) {
+    const examples =
+      duplicateStats.exampleTerms.length > 0
+        ? `; examples: ${duplicateStats.exampleTerms.join(', ')}`
+        : '';
+    log(
+      `JLPT dictionary collapsed ${duplicateStats.duplicateEntryCount} duplicate JLPT entries across ${duplicateStats.duplicateTerms.size} terms; keeping highest-precedence level per surface form${examples}`,
+    );
+  }
+
   return terms;
 }