From 737101fe9ed1a4a4ed8b4febd372268ad157404f Mon Sep 17 00:00:00 2001 From: sudacode Date: Mon, 2 Mar 2026 01:45:37 -0800 Subject: [PATCH] fix(tokenizer): lazy yomitan term-only frequency fallback --- ...readings-and-known-token-color-priority.md | 56 +++++ src/core/services/tokenizer.test.ts | 5 + src/core/services/tokenizer.ts | 5 - .../tokenizer/yomitan-parser-runtime.test.ts | 96 ++++++++ .../tokenizer/yomitan-parser-runtime.ts | 215 ++++++++++++------ 5 files changed, 298 insertions(+), 79 deletions(-) create mode 100644 backlog/tasks/task-82 - Subtitle-frequency-highlighting-fixes-for-noisy-readings-and-known-token-color-priority.md diff --git a/backlog/tasks/task-82 - Subtitle-frequency-highlighting-fixes-for-noisy-readings-and-known-token-color-priority.md b/backlog/tasks/task-82 - Subtitle-frequency-highlighting-fixes-for-noisy-readings-and-known-token-color-priority.md new file mode 100644 index 0000000..a91ef20 --- /dev/null +++ b/backlog/tasks/task-82 - Subtitle-frequency-highlighting-fixes-for-noisy-readings-and-known-token-color-priority.md @@ -0,0 +1,56 @@ +--- +id: TASK-82 +title: 'Subtitle frequency highlighting: fix noisy Yomitan readings and restore known/N+1 color priority' +status: Done +assignee: [] +created_date: '2026-03-02 20:10' +updated_date: '2026-03-02 01:44' +labels: [] +dependencies: [] +priority: high +ordinal: 9002 +--- + +## Description + + + +Address frequency-highlighting regressions: +- tokens like `断じて` missed rank assignment when Yomitan merged-token reading was truncated/noisy; +- known/N+1 tokens were incorrectly colored by frequency color instead of known/N+1 color. + +Expected behavior: +- known/N+1 color always wins; +- if token is frequent and within `topX`, frequency rank label can still appear on hover/metadata. + + + +## Acceptance Criteria + + + +- [x] #1 Frequency lookup succeeds for noisy/truncated merged-token readings via robust fallback behavior. +- [x] #2 Merged-token reading normalization restores missing kana suffixes where safe (`headword === surface` path). +- [x] #3 Known/N+1 tokens keep known/N+1 color classes; frequency color class does not override them. +- [x] #4 Frequency rank hover label remains available for in-range frequent tokens, including known/N+1. +- [x] #5 Regression tests added for tokenizer and renderer behavior. + + + +## Final Summary + + + +Implemented and validated: +- tokenizer now normalizes selected Yomitan merged-token readings by appending missing trailing kana suffixes when safe (`headword === surface`); +- frequency lookup now does lazy fallback: requests `{term, reading}` first, and only requests `{term, reading: null}` for misses; +- this removes eager `(term, null)` payload inflation on medium-frequency lines and reduces extension RPC payload/load; +- renderer restored known/N+1 color priority over frequency class coloring; +- frequency rank label display remains available for frequent known/N+1 tokens; +- added regression tests covering noisy-reading fallback, lazy fallback-query behavior, and renderer class/label precedence. + +Related commits: +- `17a417e` (`fix(subtitle): improve frequency highlight reliability`) +- `79f37f3` (`fix(subtitle): prioritize known and n+1 colors over frequency`) + + diff --git a/src/core/services/tokenizer.test.ts b/src/core/services/tokenizer.test.ts index 707e4f6..4dd8450 100644 --- a/src/core/services/tokenizer.test.ts +++ b/src/core/services/tokenizer.test.ts @@ -346,6 +346,11 @@ test('tokenizeSubtitle queries headword frequencies with token reading for disam webContents: { executeJavaScript: async (script: string) => { if (script.includes('getTermFrequencies')) { + assert.equal( + script.includes('"term":"鍛える","reading":null'), + false, + 'should not eagerly include term-only fallback pair when reading lookup is present', + ); if (!script.includes('"term":"鍛える","reading":"きた"')) { return []; } diff --git a/src/core/services/tokenizer.ts b/src/core/services/tokenizer.ts index c808642..d04df15 100644 --- a/src/core/services/tokenizer.ts +++ b/src/core/services/tokenizer.ts @@ -329,11 +329,6 @@ function buildYomitanFrequencyTermReadingList( const readingRaw = token.reading && token.reading.trim().length > 0 ? token.reading.trim() : null; termReadingList.push({ term, reading: readingRaw }); - - // Yomitan parse readings can be noisy/truncated on merged tokens; include term-only fallback. - if (readingRaw !== null) { - termReadingList.push({ term, reading: null }); - } } return termReadingList; diff --git a/src/core/services/tokenizer/yomitan-parser-runtime.test.ts b/src/core/services/tokenizer/yomitan-parser-runtime.test.ts index 142c4ed..9f9586f 100644 --- a/src/core/services/tokenizer/yomitan-parser-runtime.test.ts +++ b/src/core/services/tokenizer/yomitan-parser-runtime.test.ts @@ -157,6 +157,102 @@ test('requestYomitanTermFrequencies prefers primary rank from displayValue array assert.equal(result[0]?.frequency, 7141); }); +test('requestYomitanTermFrequencies requests term-only fallback only after reading miss', async () => { + const frequencyScripts: string[] = []; + const deps = createDeps(async (script) => { + if (script.includes('optionsGetFull')) { + return { + profileCurrent: 0, + profiles: [ + { + options: { + scanning: { length: 40 }, + dictionaries: [{ name: 'freq-dict', enabled: true, id: 0 }], + }, + }, + ], + }; + } + + if (!script.includes('getTermFrequencies')) { + return []; + } + + frequencyScripts.push(script); + if (script.includes('"term":"断じて","reading":"だん"')) { + return []; + } + if (script.includes('"term":"断じて","reading":null')) { + return [ + { + term: '断じて', + reading: null, + dictionary: 'freq-dict', + frequency: 7082, + displayValue: '7082', + displayValueParsed: true, + }, + ]; + } + return []; + }); + + const result = await requestYomitanTermFrequencies([{ term: '断じて', reading: 'だん' }], deps, { + error: () => undefined, + }); + + assert.equal(result.length, 1); + assert.equal(result[0]?.frequency, 7082); + assert.equal(frequencyScripts.length, 2); + assert.match(frequencyScripts[0] ?? '', /"term":"断じて","reading":"だん"/); + assert.doesNotMatch(frequencyScripts[0] ?? '', /"term":"断じて","reading":null/); + assert.match(frequencyScripts[1] ?? '', /"term":"断じて","reading":null/); +}); + +test('requestYomitanTermFrequencies avoids term-only fallback request when reading lookup succeeds', async () => { + const frequencyScripts: string[] = []; + const deps = createDeps(async (script) => { + if (script.includes('optionsGetFull')) { + return { + profileCurrent: 0, + profiles: [ + { + options: { + scanning: { length: 40 }, + dictionaries: [{ name: 'freq-dict', enabled: true, id: 0 }], + }, + }, + ], + }; + } + + if (!script.includes('getTermFrequencies')) { + return []; + } + + frequencyScripts.push(script); + return [ + { + term: '鍛える', + reading: 'きたえる', + dictionary: 'freq-dict', + frequency: 2847, + displayValue: '2847', + displayValueParsed: true, + }, + ]; + }); + + const result = await requestYomitanTermFrequencies([{ term: '鍛える', reading: 'きた' }], deps, { + error: () => undefined, + }); + + assert.equal(result.length, 1); + assert.equal(frequencyScripts.length, 1); + assert.match(frequencyScripts[0] ?? '', /"term":"鍛える","reading":"きた"/); + assert.doesNotMatch(frequencyScripts[0] ?? '', /"term":"鍛える","reading":null/); +}); + test('requestYomitanTermFrequencies caches profile metadata between calls', async () => { const scripts: string[] = []; const deps = createDeps(async (script) => { diff --git a/src/core/services/tokenizer/yomitan-parser-runtime.ts b/src/core/services/tokenizer/yomitan-parser-runtime.ts index 705f50d..d893caa 100644 --- a/src/core/services/tokenizer/yomitan-parser-runtime.ts +++ b/src/core/services/tokenizer/yomitan-parser-runtime.ts @@ -578,50 +578,12 @@ export async function requestYomitanParseResults( } } -export async function requestYomitanTermFrequencies( +async function fetchYomitanTermFrequencies( + parserWindow: BrowserWindow, termReadingList: YomitanTermReadingPair[], - deps: YomitanParserRuntimeDeps, + metadata: YomitanProfileMetadata | null, logger: LoggerLike, -): Promise { - const normalizedTermReadingList = normalizeTermReadingList(termReadingList); - const yomitanExt = deps.getYomitanExt(); - if (normalizedTermReadingList.length === 0 || !yomitanExt) { - return []; - } - - const isReady = await ensureYomitanParserWindow(deps, logger); - const parserWindow = deps.getYomitanParserWindow(); - if (!isReady || !parserWindow || parserWindow.isDestroyed()) { - return []; - } - - const metadata = await requestYomitanProfileMetadata(parserWindow, logger); - const frequencyCache = getWindowFrequencyCache(parserWindow); - const missingTermReadingList: YomitanTermReadingPair[] = []; - - const buildCachedResult = (): YomitanTermFrequency[] => { - const result: YomitanTermFrequency[] = []; - for (const pair of normalizedTermReadingList) { - const key = makeTermReadingCacheKey(pair.term, pair.reading); - const cached = frequencyCache.get(key); - if (cached && cached.length > 0) { - result.push(...cached); - } - } - return result; - }; - - for (const pair of normalizedTermReadingList) { - const key = makeTermReadingCacheKey(pair.term, pair.reading); - if (!frequencyCache.has(key)) { - missingTermReadingList.push(pair); - } - } - - if (missingTermReadingList.length === 0) { - return buildCachedResult(); - } - +): Promise { if (metadata && metadata.dictionaries.length > 0) { const script = ` (async () => { @@ -645,7 +607,7 @@ export async function requestYomitanTermFrequencies( }); return await invoke("getTermFrequencies", { - termReadingList: ${JSON.stringify(missingTermReadingList)}, + termReadingList: ${JSON.stringify(termReadingList)}, dictionaries: ${JSON.stringify(metadata.dictionaries)} }); })(); @@ -653,28 +615,13 @@ export async function requestYomitanTermFrequencies( try { const rawResult = await parserWindow.webContents.executeJavaScript(script, true); - const fetchedEntries = Array.isArray(rawResult) + return Array.isArray(rawResult) ? normalizeFrequencyEntriesWithPriority(rawResult, metadata.dictionaryPriorityByName) : []; - const groupedByPair = groupFrequencyEntriesByPair(fetchedEntries); - const groupedByTerm = groupFrequencyEntriesByTerm(fetchedEntries); - const missingTerms = new Set(missingTermReadingList.map((pair) => pair.term)); - - for (const pair of missingTermReadingList) { - const key = makeTermReadingCacheKey(pair.term, pair.reading); - const exactEntries = groupedByPair.get(key); - const termEntries = groupedByTerm.get(pair.term) ?? []; - frequencyCache.set(key, exactEntries ?? termEntries); - } - - const cachedResult = buildCachedResult(); - const unmatchedEntries = fetchedEntries.filter((entry) => !missingTerms.has(entry.term.trim())); - return [...cachedResult, ...unmatchedEntries]; } catch (err) { logger.error('Yomitan term frequency request failed:', (err as Error).message); + return null; } - - return buildCachedResult(); } const script = ` @@ -721,7 +668,7 @@ export async function requestYomitanTermFrequencies( } const rawFrequencies = await invoke("getTermFrequencies", { - termReadingList: ${JSON.stringify(missingTermReadingList)}, + termReadingList: ${JSON.stringify(termReadingList)}, dictionaries }); @@ -743,27 +690,147 @@ export async function requestYomitanTermFrequencies( try { const rawResult = await parserWindow.webContents.executeJavaScript(script, true); - const fetchedEntries = Array.isArray(rawResult) + return Array.isArray(rawResult) ? rawResult .map((entry) => toYomitanTermFrequency(entry)) .filter((entry): entry is YomitanTermFrequency => entry !== null) : []; - const groupedByPair = groupFrequencyEntriesByPair(fetchedEntries); - const groupedByTerm = groupFrequencyEntriesByTerm(fetchedEntries); - const missingTerms = new Set(missingTermReadingList.map((pair) => pair.term)); - for (const pair of missingTermReadingList) { - const key = makeTermReadingCacheKey(pair.term, pair.reading); - const exactEntries = groupedByPair.get(key); - const termEntries = groupedByTerm.get(pair.term) ?? []; - frequencyCache.set(key, exactEntries ?? termEntries); - } - const cachedResult = buildCachedResult(); - const unmatchedEntries = fetchedEntries.filter((entry) => !missingTerms.has(entry.term.trim())); - return [...cachedResult, ...unmatchedEntries]; } catch (err) { logger.error('Yomitan term frequency request failed:', (err as Error).message); + return null; + } +} + +function cacheFrequencyEntriesForPairs( + frequencyCache: Map, + termReadingList: YomitanTermReadingPair[], + fetchedEntries: YomitanTermFrequency[], +): void { + const groupedByPair = groupFrequencyEntriesByPair(fetchedEntries); + const groupedByTerm = groupFrequencyEntriesByTerm(fetchedEntries); + for (const pair of termReadingList) { + const key = makeTermReadingCacheKey(pair.term, pair.reading); + const exactEntries = groupedByPair.get(key); + const termEntries = groupedByTerm.get(pair.term) ?? []; + frequencyCache.set(key, exactEntries ?? termEntries); + } +} + +export async function requestYomitanTermFrequencies( + termReadingList: YomitanTermReadingPair[], + deps: YomitanParserRuntimeDeps, + logger: LoggerLike, +): Promise { + const normalizedTermReadingList = normalizeTermReadingList(termReadingList); + const yomitanExt = deps.getYomitanExt(); + if (normalizedTermReadingList.length === 0 || !yomitanExt) { + return []; + } + + const isReady = await ensureYomitanParserWindow(deps, logger); + const parserWindow = deps.getYomitanParserWindow(); + if (!isReady || !parserWindow || parserWindow.isDestroyed()) { + return []; + } + + const metadata = await requestYomitanProfileMetadata(parserWindow, logger); + const frequencyCache = getWindowFrequencyCache(parserWindow); + const missingTermReadingList: YomitanTermReadingPair[] = []; + + const buildCachedResult = (): YomitanTermFrequency[] => { + const result: YomitanTermFrequency[] = []; + for (const pair of normalizedTermReadingList) { + const key = makeTermReadingCacheKey(pair.term, pair.reading); + const cached = frequencyCache.get(key); + if (cached && cached.length > 0) { + result.push(...cached); + } + } + return result; + }; + + for (const pair of normalizedTermReadingList) { + const key = makeTermReadingCacheKey(pair.term, pair.reading); + if (!frequencyCache.has(key)) { + missingTermReadingList.push(pair); + } + } + + if (missingTermReadingList.length === 0) { return buildCachedResult(); } + + const fetchedEntries = await fetchYomitanTermFrequencies( + parserWindow, + missingTermReadingList, + metadata, + logger, + ); + if (fetchedEntries === null) { + return buildCachedResult(); + } + + cacheFrequencyEntriesForPairs(frequencyCache, missingTermReadingList, fetchedEntries); + + const fallbackTermReadingList = normalizeTermReadingList( + missingTermReadingList + .filter((pair) => pair.reading !== null) + .map((pair) => { + const key = makeTermReadingCacheKey(pair.term, pair.reading); + const cachedEntries = frequencyCache.get(key); + if (cachedEntries && cachedEntries.length > 0) { + return null; + } + + const fallbackKey = makeTermReadingCacheKey(pair.term, null); + const cachedFallback = frequencyCache.get(fallbackKey); + if (cachedFallback && cachedFallback.length > 0) { + frequencyCache.set(key, cachedFallback); + return null; + } + + return { term: pair.term, reading: null }; + }) + .filter((pair): pair is YomitanTermReadingPair => pair !== null), + ).filter((pair) => !frequencyCache.has(makeTermReadingCacheKey(pair.term, pair.reading))); + + let fallbackFetchedEntries: YomitanTermFrequency[] = []; + + if (fallbackTermReadingList.length > 0) { + const fallbackFetchResult = await fetchYomitanTermFrequencies( + parserWindow, + fallbackTermReadingList, + metadata, + logger, + ); + if (fallbackFetchResult !== null) { + fallbackFetchedEntries = fallbackFetchResult; + cacheFrequencyEntriesForPairs(frequencyCache, fallbackTermReadingList, fallbackFetchedEntries); + } + + for (const pair of missingTermReadingList) { + if (pair.reading === null) { + continue; + } + const key = makeTermReadingCacheKey(pair.term, pair.reading); + const cachedEntries = frequencyCache.get(key); + if (cachedEntries && cachedEntries.length > 0) { + continue; + } + const fallbackEntries = frequencyCache.get(makeTermReadingCacheKey(pair.term, null)); + if (fallbackEntries && fallbackEntries.length > 0) { + frequencyCache.set(key, fallbackEntries); + } + } + } + + const allFetchedEntries = [...fetchedEntries, ...fallbackFetchedEntries]; + const queriedTerms = new Set( + [...missingTermReadingList, ...fallbackTermReadingList].map((pair) => pair.term), + ); + const cachedResult = buildCachedResult(); + const unmatchedEntries = allFetchedEntries.filter((entry) => !queriedTerms.has(entry.term.trim())); + return [...cachedResult, ...unmatchedEntries]; } export async function syncYomitanDefaultAnkiServer(