mirror of
https://github.com/ksyasuda/SubMiner.git
synced 2026-03-20 12:11:28 -07:00
Summarize JLPT duplicate term conflicts into a single startup log (#26)
This commit is contained in:
4
changes/jlpt-duplicate-log-summary.md
Normal file
4
changes/jlpt-duplicate-log-summary.md
Normal file
@@ -0,0 +1,4 @@
|
|||||||
|
type: fixed
|
||||||
|
area: jlpt
|
||||||
|
|
||||||
|
- Reduced JLPT dictionary startup log noise by summarizing duplicate surface-form collisions instead of logging one line per duplicate entry.
|
||||||
@@ -6,9 +6,16 @@ import test from 'node:test';
|
|||||||
|
|
||||||
import { createJlptVocabularyLookup } from './jlpt-vocab';
|
import { createJlptVocabularyLookup } from './jlpt-vocab';
|
||||||
|
|
||||||
test('createJlptVocabularyLookup loads JLPT bank entries and resolves known levels', async () => {
|
function createTempDir(): string {
|
||||||
|
return fs.mkdtempSync(path.join(os.tmpdir(), 'subminer-jlpt-dict-'));
|
||||||
|
}
|
||||||
|
|
||||||
|
test('createJlptVocabularyLookup loads JLPT bank entries and resolves known levels', async (t) => {
|
||||||
const logs: string[] = [];
|
const logs: string[] = [];
|
||||||
const tempDir = fs.mkdtempSync(path.join(os.tmpdir(), 'subminer-jlpt-dict-'));
|
const tempDir = createTempDir();
|
||||||
|
t.after(() => {
|
||||||
|
fs.rmSync(tempDir, { recursive: true, force: true });
|
||||||
|
});
|
||||||
fs.writeFileSync(
|
fs.writeFileSync(
|
||||||
path.join(tempDir, 'term_meta_bank_5.json'),
|
path.join(tempDir, 'term_meta_bank_5.json'),
|
||||||
JSON.stringify([
|
JSON.stringify([
|
||||||
@@ -37,8 +44,11 @@ test('createJlptVocabularyLookup loads JLPT bank entries and resolves known leve
|
|||||||
);
|
);
|
||||||
});
|
});
|
||||||
|
|
||||||
test('createJlptVocabularyLookup does not require synchronous fs APIs', async () => {
|
test('createJlptVocabularyLookup does not require synchronous fs APIs', async (t) => {
|
||||||
const tempDir = fs.mkdtempSync(path.join(os.tmpdir(), 'subminer-jlpt-dict-'));
|
const tempDir = createTempDir();
|
||||||
|
t.after(() => {
|
||||||
|
fs.rmSync(tempDir, { recursive: true, force: true });
|
||||||
|
});
|
||||||
fs.writeFileSync(
|
fs.writeFileSync(
|
||||||
path.join(tempDir, 'term_meta_bank_4.json'),
|
path.join(tempDir, 'term_meta_bank_4.json'),
|
||||||
JSON.stringify([['見る', 1, { frequency: { displayValue: 3 } }]]),
|
JSON.stringify([['見る', 1, { frequency: { displayValue: 3 } }]]),
|
||||||
@@ -73,3 +83,47 @@ test('createJlptVocabularyLookup does not require synchronous fs APIs', async ()
|
|||||||
(fs as unknown as Record<string, unknown>).existsSync = existsSync;
|
(fs as unknown as Record<string, unknown>).existsSync = existsSync;
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
|
|
||||||
|
test('createJlptVocabularyLookup summarizes duplicate JLPT terms without per-entry log spam', async (t) => {
|
||||||
|
const logs: string[] = [];
|
||||||
|
const tempDir = createTempDir();
|
||||||
|
t.after(() => {
|
||||||
|
fs.rmSync(tempDir, { recursive: true, force: true });
|
||||||
|
});
|
||||||
|
fs.writeFileSync(
|
||||||
|
path.join(tempDir, 'term_meta_bank_1.json'),
|
||||||
|
JSON.stringify([
|
||||||
|
['余り', 1, { frequency: { displayValue: 'N1' }, reading: 'あんまり' }],
|
||||||
|
['私', 2, { frequency: { displayValue: 'N1' }, reading: 'あたし' }],
|
||||||
|
]),
|
||||||
|
);
|
||||||
|
fs.writeFileSync(path.join(tempDir, 'term_meta_bank_2.json'), JSON.stringify([]));
|
||||||
|
fs.writeFileSync(path.join(tempDir, 'term_meta_bank_3.json'), JSON.stringify([]));
|
||||||
|
fs.writeFileSync(path.join(tempDir, 'term_meta_bank_4.json'), JSON.stringify([]));
|
||||||
|
fs.writeFileSync(
|
||||||
|
path.join(tempDir, 'term_meta_bank_5.json'),
|
||||||
|
JSON.stringify([
|
||||||
|
['余り', 3, { frequency: { displayValue: 'N5' }, reading: 'あまり' }],
|
||||||
|
['私', 4, { frequency: { displayValue: 'N5' }, reading: 'わたし' }],
|
||||||
|
['私', 5, { frequency: { displayValue: 'N5' }, reading: 'わたくし' }],
|
||||||
|
]),
|
||||||
|
);
|
||||||
|
|
||||||
|
const lookup = await createJlptVocabularyLookup({
|
||||||
|
searchPaths: [tempDir],
|
||||||
|
log: (message) => {
|
||||||
|
logs.push(message);
|
||||||
|
},
|
||||||
|
});
|
||||||
|
|
||||||
|
assert.equal(lookup('余り'), 'N1');
|
||||||
|
assert.equal(lookup('私'), 'N1');
|
||||||
|
assert.equal(
|
||||||
|
logs.some((entry) => entry.includes('keeping') && entry.includes('instead')),
|
||||||
|
false,
|
||||||
|
);
|
||||||
|
assert.equal(
|
||||||
|
logs.some((entry) => entry.includes('collapsed') && entry.includes('duplicate')),
|
||||||
|
true,
|
||||||
|
);
|
||||||
|
});
|
||||||
|
|||||||
@@ -22,10 +22,17 @@ const JLPT_LEVEL_PRECEDENCE: Record<JlptLevel, number> = {
|
|||||||
N4: 2,
|
N4: 2,
|
||||||
N5: 1,
|
N5: 1,
|
||||||
};
|
};
|
||||||
|
const JLPT_DUPLICATE_LOG_EXAMPLE_LIMIT = 5;
|
||||||
|
|
||||||
const NOOP_LOOKUP = (): null => null;
|
const NOOP_LOOKUP = (): null => null;
|
||||||
const ENTRY_YIELD_INTERVAL = 5000;
|
const ENTRY_YIELD_INTERVAL = 5000;
|
||||||
|
|
||||||
|
interface JlptDuplicateStats {
|
||||||
|
duplicateEntryCount: number;
|
||||||
|
duplicateTerms: Set<string>;
|
||||||
|
exampleTerms: string[];
|
||||||
|
}
|
||||||
|
|
||||||
function isErrorCode(error: unknown, code: string): boolean {
|
function isErrorCode(error: unknown, code: string): boolean {
|
||||||
return Boolean(error && typeof error === 'object' && (error as { code?: unknown }).code === code);
|
return Boolean(error && typeof error === 'object' && (error as { code?: unknown }).code === code);
|
||||||
}
|
}
|
||||||
@@ -47,11 +54,30 @@ function hasFrequencyDisplayValue(meta: unknown): boolean {
|
|||||||
return Object.prototype.hasOwnProperty.call(frequency as Record<string, unknown>, 'displayValue');
|
return Object.prototype.hasOwnProperty.call(frequency as Record<string, unknown>, 'displayValue');
|
||||||
}
|
}
|
||||||
|
|
||||||
|
function createJlptDuplicateStats(): JlptDuplicateStats {
|
||||||
|
return {
|
||||||
|
duplicateEntryCount: 0,
|
||||||
|
duplicateTerms: new Set<string>(),
|
||||||
|
exampleTerms: [],
|
||||||
|
};
|
||||||
|
}
|
||||||
|
|
||||||
|
function recordJlptDuplicate(stats: JlptDuplicateStats, term: string): void {
|
||||||
|
stats.duplicateEntryCount += 1;
|
||||||
|
stats.duplicateTerms.add(term);
|
||||||
|
if (
|
||||||
|
stats.exampleTerms.length < JLPT_DUPLICATE_LOG_EXAMPLE_LIMIT &&
|
||||||
|
!stats.exampleTerms.includes(term)
|
||||||
|
) {
|
||||||
|
stats.exampleTerms.push(term);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
async function addEntriesToMap(
|
async function addEntriesToMap(
|
||||||
rawEntries: unknown,
|
rawEntries: unknown,
|
||||||
level: JlptLevel,
|
level: JlptLevel,
|
||||||
terms: Map<string, JlptLevel>,
|
terms: Map<string, JlptLevel>,
|
||||||
log: (message: string) => void,
|
duplicateStats: JlptDuplicateStats,
|
||||||
): Promise<void> {
|
): Promise<void> {
|
||||||
const shouldUpdateLevel = (
|
const shouldUpdateLevel = (
|
||||||
existingLevel: JlptLevel | undefined,
|
existingLevel: JlptLevel | undefined,
|
||||||
@@ -90,14 +116,13 @@ async function addEntriesToMap(
|
|||||||
}
|
}
|
||||||
|
|
||||||
const existingLevel = terms.get(normalizedTerm);
|
const existingLevel = terms.get(normalizedTerm);
|
||||||
if (shouldUpdateLevel(existingLevel, level)) {
|
if (existingLevel !== undefined) {
|
||||||
terms.set(normalizedTerm, level);
|
recordJlptDuplicate(duplicateStats, normalizedTerm);
|
||||||
continue;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
log(
|
if (shouldUpdateLevel(existingLevel, level)) {
|
||||||
`JLPT dictionary already has ${normalizedTerm} as ${existingLevel}; keeping that level instead of ${level}`,
|
terms.set(normalizedTerm, level);
|
||||||
);
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -106,6 +131,7 @@ async function collectDictionaryFromPath(
|
|||||||
log: (message: string) => void,
|
log: (message: string) => void,
|
||||||
): Promise<Map<string, JlptLevel>> {
|
): Promise<Map<string, JlptLevel>> {
|
||||||
const terms = new Map<string, JlptLevel>();
|
const terms = new Map<string, JlptLevel>();
|
||||||
|
const duplicateStats = createJlptDuplicateStats();
|
||||||
|
|
||||||
for (const bank of JLPT_BANK_FILES) {
|
for (const bank of JLPT_BANK_FILES) {
|
||||||
const bankPath = path.join(dictionaryPath, bank.filename);
|
const bankPath = path.join(dictionaryPath, bank.filename);
|
||||||
@@ -146,12 +172,22 @@ async function collectDictionaryFromPath(
|
|||||||
}
|
}
|
||||||
|
|
||||||
const beforeSize = terms.size;
|
const beforeSize = terms.size;
|
||||||
await addEntriesToMap(rawEntries, bank.level, terms, log);
|
await addEntriesToMap(rawEntries, bank.level, terms, duplicateStats);
|
||||||
if (terms.size === beforeSize) {
|
if (terms.size === beforeSize) {
|
||||||
log(`JLPT bank file contained no extractable entries: ${bankPath}`);
|
log(`JLPT bank file contained no extractable entries: ${bankPath}`);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (duplicateStats.duplicateEntryCount > 0) {
|
||||||
|
const examples =
|
||||||
|
duplicateStats.exampleTerms.length > 0
|
||||||
|
? `; examples: ${duplicateStats.exampleTerms.join(', ')}`
|
||||||
|
: '';
|
||||||
|
log(
|
||||||
|
`JLPT dictionary collapsed ${duplicateStats.duplicateEntryCount} duplicate JLPT entries across ${duplicateStats.duplicateTerms.size} terms; keeping highest-precedence level per surface form${examples}`,
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
return terms;
|
return terms;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user