From 17a417e639b9a22bab165461ce42e74571a54302 Mon Sep 17 00:00:00 2001 From: sudacode Date: Sun, 1 Mar 2026 20:12:42 -0800 Subject: [PATCH] fix(subtitle): improve frequency highlight reliability --- src/core/services/tokenizer.test.ts | 89 ++++++++++++++++++++++++++++ src/core/services/tokenizer.ts | 86 +++++++++++++++++++++------ src/renderer/subtitle-render.test.ts | 4 +- src/renderer/subtitle-render.ts | 2 +- 4 files changed, 161 insertions(+), 20 deletions(-) diff --git a/src/core/services/tokenizer.test.ts b/src/core/services/tokenizer.test.ts index afd6079..c69d277 100644 --- a/src/core/services/tokenizer.test.ts +++ b/src/core/services/tokenizer.test.ts @@ -297,6 +297,43 @@ test('tokenizeSubtitle starts Yomitan frequency lookup and MeCab enrichment in p assert.equal(result.tokens?.[0]?.frequencyRank, 77); }); +test('tokenizeSubtitle appends trailing kana to merged Yomitan readings when headword equals surface', async () => { + const result = await tokenizeSubtitle( + '断じて見ていない', + makeDeps({ + getYomitanExt: () => ({ id: 'dummy-ext' }) as any, + getYomitanParserWindow: () => + ({ + isDestroyed: () => false, + webContents: { + executeJavaScript: async () => [ + { + source: 'scanning-parser', + index: 0, + content: [ + [ + { text: '断', reading: 'だん', headwords: [[{ term: '断じて' }]] }, + { text: 'じて', reading: '', headwords: [[{ term: 'じて' }]] }, + ], + [ + { text: '見', reading: 'み', headwords: [[{ term: '見る' }]] }, + { text: 'ていない', reading: '', headwords: [[{ term: 'ていない' }]] }, + ], + ], + }, + ], + }, + }) as unknown as Electron.BrowserWindow, + }), + ); + + assert.equal(result.tokens?.length, 2); + assert.equal(result.tokens?.[0]?.surface, '断じて'); + assert.equal(result.tokens?.[0]?.reading, 'だんじて'); + assert.equal(result.tokens?.[1]?.surface, '見ていない'); + assert.equal(result.tokens?.[1]?.reading, 'み'); +}); + test('tokenizeSubtitle queries headword frequencies with token reading for disambiguation', async () => { const result = await tokenizeSubtitle( '鍛えた', @@ -351,6 +388,58 @@ test('tokenizeSubtitle queries headword frequencies with token reading for disam assert.equal(result.tokens?.[0]?.frequencyRank, 2847); }); +test('tokenizeSubtitle falls back to term-only Yomitan frequency lookup when reading is noisy', async () => { + const result = await tokenizeSubtitle( + '断じて', + makeDeps({ + getFrequencyDictionaryEnabled: () => true, + getYomitanExt: () => ({ id: 'dummy-ext' }) as any, + getYomitanParserWindow: () => + ({ + isDestroyed: () => false, + webContents: { + executeJavaScript: async (script: string) => { + if (script.includes('getTermFrequencies')) { + if (!script.includes('"term":"断じて","reading":null')) { + return []; + } + return [ + { + term: '断じて', + reading: null, + dictionary: 'freq-dict', + frequency: 7082, + displayValue: '7082', + displayValueParsed: true, + }, + ]; + } + + return [ + { + source: 'scanning-parser', + index: 0, + content: [ + [ + { + text: '断じて', + reading: 'だん', + headwords: [[{ term: '断じて' }]], + }, + ], + ], + }, + ]; + }, + }, + }) as unknown as Electron.BrowserWindow, + }), + ); + + assert.equal(result.tokens?.length, 1); + assert.equal(result.tokens?.[0]?.frequencyRank, 7082); +}); + test('tokenizeSubtitle avoids headword term-only fallback rank when reading-specific frequency exists', async () => { const result = await tokenizeSubtitle( '無人', diff --git a/src/core/services/tokenizer.ts b/src/core/services/tokenizer.ts index f9a515f..210e8f7 100644 --- a/src/core/services/tokenizer.ts +++ b/src/core/services/tokenizer.ts @@ -249,6 +249,50 @@ function normalizeFrequencyLookupText(rawText: string): string { return rawText.trim().toLowerCase(); } +function isKanaChar(char: string): boolean { + const code = char.codePointAt(0); + if (code === undefined) { + return false; + } + return ( + (code >= 0x3041 && code <= 0x3096) || + (code >= 0x309b && code <= 0x309f) || + (code >= 0x30a0 && code <= 0x30fa) || + (code >= 0x30fd && code <= 0x30ff) + ); +} + +function getTrailingKanaSuffix(surface: string): string { + const chars = Array.from(surface); + let splitIndex = chars.length; + while (splitIndex > 0 && isKanaChar(chars[splitIndex - 1]!)) { + splitIndex -= 1; + } + if (splitIndex <= 0 || splitIndex >= chars.length) { + return ''; + } + return chars.slice(splitIndex).join(''); +} + +function normalizeYomitanMergedReading(token: MergedToken): string { + const reading = token.reading ?? ''; + if (!reading || token.headword !== token.surface) { + return reading; + } + const trailingKanaSuffix = getTrailingKanaSuffix(token.surface); + if (!trailingKanaSuffix || reading.endsWith(trailingKanaSuffix)) { + return reading; + } + return `${reading}${trailingKanaSuffix}`; +} + +function normalizeSelectedYomitanTokens(tokens: MergedToken[]): MergedToken[] { + return tokens.map((token) => ({ + ...token, + reading: normalizeYomitanMergedReading(token), + })); +} + function resolveFrequencyLookupText( token: MergedToken, matchMode: FrequencyDictionaryMatchMode, @@ -276,17 +320,24 @@ function buildYomitanFrequencyTermReadingList( tokens: MergedToken[], matchMode: FrequencyDictionaryMatchMode, ): Array<{ term: string; reading: string | null }> { - return tokens - .map((token) => { - const term = resolveFrequencyLookupText(token, matchMode).trim(); - if (!term) { - return null; - } - const readingRaw = - token.reading && token.reading.trim().length > 0 ? token.reading.trim() : null; - return { term, reading: readingRaw }; - }) - .filter((pair): pair is { term: string; reading: string | null } => pair !== null); + const termReadingList: Array<{ term: string; reading: string | null }> = []; + for (const token of tokens) { + const term = resolveFrequencyLookupText(token, matchMode).trim(); + if (!term) { + continue; + } + + const readingRaw = + token.reading && token.reading.trim().length > 0 ? token.reading.trim() : null; + termReadingList.push({ term, reading: readingRaw }); + + // Yomitan parse readings can be noisy/truncated on merged tokens; include term-only fallback. + if (readingRaw !== null) { + termReadingList.push({ term, reading: null }); + } + } + + return termReadingList; } function buildYomitanFrequencyRankMap( @@ -427,16 +478,17 @@ async function parseWithYomitanInternalParser( if (!selectedTokens || selectedTokens.length === 0) { return null; } + const normalizedSelectedTokens = normalizeSelectedYomitanTokens(selectedTokens); if (deps.getYomitanGroupDebugEnabled?.() === true) { - logSelectedYomitanGroups(text, selectedTokens); + logSelectedYomitanGroups(text, normalizedSelectedTokens); } const frequencyRankPromise: Promise> = options.frequencyEnabled ? (async () => { const frequencyMatchMode = options.frequencyMatchMode; const termReadingList = buildYomitanFrequencyTermReadingList( - selectedTokens, + normalizedSelectedTokens, frequencyMatchMode, ); const yomitanFrequencies = await requestYomitanTermFrequencies(termReadingList, deps, logger); @@ -449,19 +501,19 @@ async function parseWithYomitanInternalParser( try { const mecabTokens = await deps.tokenizeWithMecab(text); const enrichTokensWithMecab = deps.enrichTokensWithMecab ?? enrichTokensWithMecabAsync; - return await enrichTokensWithMecab(selectedTokens, mecabTokens); + return await enrichTokensWithMecab(normalizedSelectedTokens, mecabTokens); } catch (err) { const error = err as Error; logger.warn( 'Failed to enrich Yomitan tokens with MeCab POS:', error.message, - `tokenCount=${selectedTokens.length}`, + `tokenCount=${normalizedSelectedTokens.length}`, `textLength=${text.length}`, ); - return selectedTokens; + return normalizedSelectedTokens; } })() - : Promise.resolve(selectedTokens); + : Promise.resolve(normalizedSelectedTokens); const [yomitanRankByTerm, enrichedTokens] = await Promise.all([ frequencyRankPromise, diff --git a/src/renderer/subtitle-render.test.ts b/src/renderer/subtitle-render.test.ts index 345e23e..6581ce8 100644 --- a/src/renderer/subtitle-render.test.ts +++ b/src/renderer/subtitle-render.test.ts @@ -79,7 +79,7 @@ test('computeWordClass preserves known and n+1 classes while adding JLPT classes assert.equal(computeWordClass(nPlusOneJlpt), 'word word-n-plus-one word-jlpt-n2'); }); -test('computeWordClass keeps known/N+1 color classes exclusive over frequency classes', () => { +test('computeWordClass composes known class with frequency class while keeping N+1 exclusive', () => { const known = createToken({ isKnown: true, frequencyRank: 10, @@ -103,7 +103,7 @@ test('computeWordClass keeps known/N+1 color classes exclusive over frequency cl singleColor: '#000000', bandedColors: ['#000000', '#000000', '#000000', '#000000', '#000000'] as const, }), - 'word word-known', + 'word word-known word-frequency-single', ); assert.equal( computeWordClass(nPlusOne, { diff --git a/src/renderer/subtitle-render.ts b/src/renderer/subtitle-render.ts index 48c149a..331f13a 100644 --- a/src/renderer/subtitle-render.ts +++ b/src/renderer/subtitle-render.ts @@ -429,7 +429,7 @@ export function computeWordClass( classes.push(`word-jlpt-${token.jlptLevel.toLowerCase()}`); } - if (!token.isKnown && !token.isNPlusOneTarget) { + if (!token.isNPlusOneTarget) { const frequencyClass = getFrequencyDictionaryClass(token, resolvedFrequencySettings); if (frequencyClass) { classes.push(frequencyClass);