From e4038127cb6cc9d41e581b74dcb4264e34edce2a Mon Sep 17 00:00:00 2001 From: sudacode Date: Sat, 28 Feb 2026 21:12:34 -0800 Subject: [PATCH] fix(tokenizer): disambiguate Yomitan frequency lookup by reading --- src/core/services/tokenizer.test.ts | 66 ++++++++++++++++++- src/core/services/tokenizer.ts | 3 +- .../tokenizer/yomitan-parser-runtime.test.ts | 22 +++++++ .../tokenizer/yomitan-parser-runtime.ts | 37 ++++++++--- 4 files changed, 115 insertions(+), 13 deletions(-) diff --git a/src/core/services/tokenizer.test.ts b/src/core/services/tokenizer.test.ts index 95fd4d4..afd6079 100644 --- a/src/core/services/tokenizer.test.ts +++ b/src/core/services/tokenizer.test.ts @@ -297,7 +297,7 @@ test('tokenizeSubtitle starts Yomitan frequency lookup and MeCab enrichment in p assert.equal(result.tokens?.[0]?.frequencyRank, 77); }); -test('tokenizeSubtitle queries headword frequencies without forcing surface reading', async () => { +test('tokenizeSubtitle queries headword frequencies with token reading for disambiguation', async () => { const result = await tokenizeSubtitle( '鍛えた', makeDeps({ @@ -309,7 +309,7 @@ test('tokenizeSubtitle queries headword frequencies without forcing surface read webContents: { executeJavaScript: async (script: string) => { if (script.includes('getTermFrequencies')) { - if (!script.includes('"term":"鍛える","reading":null')) { + if (!script.includes('"term":"鍛える","reading":"きた"')) { return []; } return [ @@ -351,6 +351,68 @@ test('tokenizeSubtitle queries headword frequencies without forcing surface read assert.equal(result.tokens?.[0]?.frequencyRank, 2847); }); +test('tokenizeSubtitle avoids headword term-only fallback rank when reading-specific frequency exists', async () => { + const result = await tokenizeSubtitle( + '無人', + makeDeps({ + getFrequencyDictionaryEnabled: () => true, + getYomitanExt: () => ({ id: 'dummy-ext' }) as any, + getYomitanParserWindow: () => + ({ + isDestroyed: () => false, + webContents: { + executeJavaScript: async (script: string) => { + if (script.includes('getTermFrequencies')) { + if (!script.includes('"term":"無人","reading":"むじん"')) { + return []; + } + return [ + { + term: '無人', + reading: null, + dictionary: 'CC100', + dictionaryPriority: 0, + frequency: 157632, + displayValue: null, + displayValueParsed: false, + }, + { + term: '無人', + reading: 'むじん', + dictionary: 'CC100', + dictionaryPriority: 0, + frequency: 7141, + displayValue: null, + displayValueParsed: false, + }, + ]; + } + + return [ + { + source: 'scanning-parser', + index: 0, + content: [ + [ + { + text: '無人', + reading: 'むじん', + headwords: [[{ term: '無人' }]], + }, + ], + ], + }, + ]; + }, + }, + }) as unknown as Electron.BrowserWindow, + }), + ); + + assert.equal(result.tokens?.length, 1); + assert.equal(result.tokens?.[0]?.frequencyRank, 7141); +}); + test('tokenizeSubtitle prefers Yomitan frequency from highest-priority dictionary', async () => { const result = await tokenizeSubtitle( '猫', diff --git a/src/core/services/tokenizer.ts b/src/core/services/tokenizer.ts index c07dd52..f9a515f 100644 --- a/src/core/services/tokenizer.ts +++ b/src/core/services/tokenizer.ts @@ -284,8 +284,7 @@ function buildYomitanFrequencyTermReadingList( } const readingRaw = token.reading && token.reading.trim().length > 0 ? token.reading.trim() : null; - const reading = matchMode === 'headword' ? null : readingRaw; - return { term, reading }; + return { term, reading: readingRaw }; }) .filter((pair): pair is { term: string; reading: string | null } => pair !== null); } diff --git a/src/core/services/tokenizer/yomitan-parser-runtime.test.ts b/src/core/services/tokenizer/yomitan-parser-runtime.test.ts index 7ebf3f1..702f084 100644 --- a/src/core/services/tokenizer/yomitan-parser-runtime.test.ts +++ b/src/core/services/tokenizer/yomitan-parser-runtime.test.ts @@ -130,6 +130,28 @@ test('requestYomitanTermFrequencies returns normalized frequency entries', async assert.match(scriptValue, /optionsGetFull/); }); +test('requestYomitanTermFrequencies prefers primary rank from displayValue array pair', async () => { + const deps = createDeps(async () => [ + { + term: '無人', + reading: 'むじん', + dictionary: 'freq-dict', + dictionaryPriority: 0, + frequency: 157632, + displayValue: [7141, 157632], + displayValueParsed: true, + }, + ]); + + const result = await requestYomitanTermFrequencies([{ term: '無人', reading: 'むじん' }], deps, { + error: () => undefined, + }); + + assert.equal(result.length, 1); + assert.equal(result[0]?.term, '無人'); + assert.equal(result[0]?.frequency, 7141); +}); + test('requestYomitanTermFrequencies caches profile metadata between calls', async () => { const scripts: string[] = []; const deps = createDeps(async (script) => { diff --git a/src/core/services/tokenizer/yomitan-parser-runtime.ts b/src/core/services/tokenizer/yomitan-parser-runtime.ts index 21b44d8..9079931 100644 --- a/src/core/services/tokenizer/yomitan-parser-runtime.ts +++ b/src/core/services/tokenizer/yomitan-parser-runtime.ts @@ -96,6 +96,28 @@ function parsePositiveFrequencyString(value: string): number | null { return parsed; } +function parsePositiveFrequencyValue(value: unknown): number | null { + const numeric = asPositiveInteger(value); + if (numeric !== null) { + return numeric; + } + + if (typeof value === 'string') { + return parsePositiveFrequencyString(value); + } + + if (Array.isArray(value)) { + for (const item of value) { + const parsed = parsePositiveFrequencyValue(item); + if (parsed !== null) { + return parsed; + } + } + } + + return null; +} + function toYomitanTermFrequency(value: unknown): YomitanTermFrequency | null { if (!isObject(value)) { return null; @@ -103,15 +125,12 @@ function toYomitanTermFrequency(value: unknown): YomitanTermFrequency | null { const term = typeof value.term === 'string' ? value.term.trim() : ''; const dictionary = typeof value.dictionary === 'string' ? value.dictionary.trim() : ''; - const rawFrequency = asPositiveInteger(value.frequency); - const displayValueRaw = - value.displayValue === null - ? null - : typeof value.displayValue === 'string' - ? value.displayValue - : null; + const rawFrequency = parsePositiveFrequencyValue(value.frequency); + const displayValueRaw = value.displayValue; const parsedDisplayFrequency = - displayValueRaw !== null ? parsePositiveFrequencyString(displayValueRaw) : null; + displayValueRaw !== null && displayValueRaw !== undefined + ? parsePositiveFrequencyValue(displayValueRaw) + : null; const frequency = parsedDisplayFrequency ?? rawFrequency; if (!term || !dictionary || frequency === null) { return null; @@ -128,7 +147,7 @@ function toYomitanTermFrequency(value: unknown): YomitanTermFrequency | null { : typeof value.reading === 'string' ? value.reading : null; - const displayValue = displayValueRaw; + const displayValue = typeof displayValueRaw === 'string' ? displayValueRaw : null; const displayValueParsed = value.displayValueParsed === true; return {