From 1d76e05cd38cec1b7f8557a9f73494ad759a3f60 Mon Sep 17 00:00:00 2001 From: sudacode Date: Sat, 7 Mar 2026 01:28:37 -0800 Subject: [PATCH] fix(subtitle): tighten frequency token filtering --- ...mitan-scan-token-fallback-fragmentation.md | 42 +++++ ...kana-tokens-from-frequency-highlighting.md | 43 +++++ src/core/services/tokenizer.test.ts | 27 ++- .../tokenizer/annotation-stage.test.ts | 52 +++++- .../services/tokenizer/annotation-stage.ts | 18 ++ .../tokenizer/yomitan-parser-runtime.test.ts | 169 ++++++++++++++++++ .../tokenizer/yomitan-parser-runtime.ts | 9 +- 7 files changed, 343 insertions(+), 17 deletions(-) create mode 100644 backlog/tasks/task-107 - Fix-Yomitan-scan-token-fallback-fragmentation.md create mode 100644 backlog/tasks/task-108 - Exclude-single-kana-tokens-from-frequency-highlighting.md diff --git a/backlog/tasks/task-107 - Fix-Yomitan-scan-token-fallback-fragmentation.md b/backlog/tasks/task-107 - Fix-Yomitan-scan-token-fallback-fragmentation.md new file mode 100644 index 0000000..749b3b0 --- /dev/null +++ b/backlog/tasks/task-107 - Fix-Yomitan-scan-token-fallback-fragmentation.md @@ -0,0 +1,42 @@ +--- +id: TASK-107 +title: 'Fix Yomitan scan-token fallback fragmentation on exact-source misses' +status: Done +assignee: [] +created_date: '2026-03-07 01:10' +updated_date: '2026-03-07 01:12' +labels: [] +dependencies: [] +priority: high +ordinal: 9007 +--- + +## Description + + + +Left-to-right Yomitan scanning can emit bogus fallback tokens when `termsFind` returns entries but none of their headwords carries an exact primary source for the consumed substring. Repro: `だが それでも届かぬ高みがあった` currently yields trailing fragments like `があ` / `た`, which blocks the real `あった` token from receiving frequency highlighting. + + + +## Acceptance Criteria + + + +- [x] #1 Scanner skips `termsFind` fallback entries that are not backed by an exact primary source for the consumed substring. +- [x] #2 Repro line no longer yields bogus trailing fragments such as `があ`. +- [x] #3 Regression coverage added for the scan-token path. + + + +## Final Summary + + + +Removed the scan-token helper fallback that previously emitted a token from the first returned headword even when Yomitan did not report an exact primary source for the consumed substring. Added a focused regression test covering `だが それでも届かぬ高みがあった`, ensuring bogus `があ` fragmentation is skipped so the later `あった` exact match can still be tokenized and highlighted. + +Verification: + +- `bun test src/core/services/tokenizer/yomitan-parser-runtime.test.ts src/core/services/tokenizer.test.ts --timeout 20000` + + diff --git a/backlog/tasks/task-108 - Exclude-single-kana-tokens-from-frequency-highlighting.md b/backlog/tasks/task-108 - Exclude-single-kana-tokens-from-frequency-highlighting.md new file mode 100644 index 0000000..0706582 --- /dev/null +++ b/backlog/tasks/task-108 - Exclude-single-kana-tokens-from-frequency-highlighting.md @@ -0,0 +1,43 @@ +--- +id: TASK-108 +title: 'Exclude single kana tokens from frequency highlighting' +status: Done +assignee: [] +created_date: '2026-03-07 01:18' +updated_date: '2026-03-07 01:22' +labels: [] +dependencies: [] +priority: medium +ordinal: 9008 +--- + +## Description + + + +Suppress frequency highlighting for single-character hiragana or katakana tokens. Scope is frequency-only: known/N+1/JLPT behavior stays unchanged. + + + +## Acceptance Criteria + + + +- [x] #1 Single-character hiragana tokens do not retain `frequencyRank`. +- [x] #2 Single-character katakana tokens do not retain `frequencyRank`. +- [x] #3 Regression coverage exists at annotation-stage and tokenizer levels. + + + +## Final Summary + + + +Added a frequency-only suppression rule for single-character kana tokens based on token `surface`, so bogus merged fragments like `た` and standalone one-character kana no longer keep `frequencyRank`. Regression coverage now exists both in the annotation stage and in the tokenizer path, while multi-character tokens and N+1/JLPT behavior remain unchanged. + +Verification: + +- `bun test src/core/services/tokenizer/annotation-stage.test.ts --timeout 20000` +- `bun test src/core/services/tokenizer.test.ts --timeout 20000` + + diff --git a/src/core/services/tokenizer.test.ts b/src/core/services/tokenizer.test.ts index 864b9a6..d0d295e 100644 --- a/src/core/services/tokenizer.test.ts +++ b/src/core/services/tokenizer.test.ts @@ -1861,9 +1861,9 @@ test('tokenizeSubtitle keeps parsing explicit by scanning-parser source only', a assert.equal(result.tokens?.[4]?.frequencyRank, 1500); }); -test('tokenizeSubtitle still assigns frequency to non-known Yomitan tokens', async () => { +test('tokenizeSubtitle still assigns frequency to non-known multi-character Yomitan tokens', async () => { const result = await tokenizeSubtitle( - '小園に', + '小園友達', makeDeps({ getYomitanExt: () => ({ id: 'dummy-ext' }) as any, getYomitanParserWindow: () => @@ -1884,9 +1884,9 @@ test('tokenizeSubtitle still assigns frequency to non-known Yomitan tokens', asy ], [ { - text: 'に', - reading: 'に', - headwords: [[{ term: 'に' }]], + text: '友達', + reading: 'ともだち', + headwords: [[{ term: '友達' }]], }, ], ], @@ -1895,7 +1895,7 @@ test('tokenizeSubtitle still assigns frequency to non-known Yomitan tokens', asy }, }) as unknown as Electron.BrowserWindow, getFrequencyDictionaryEnabled: () => true, - getFrequencyRank: (text) => (text === '小園' ? 75 : text === 'に' ? 3000 : null), + getFrequencyRank: (text) => (text === '小園' ? 75 : text === '友達' ? 3000 : null), isKnownWord: (text) => text === '小園', }), ); @@ -2635,6 +2635,21 @@ test('tokenizeSubtitle excludes default non-independent pos2 from N+1 and freque assert.equal(result.tokens?.[0]?.isNPlusOneTarget, false); }); +test('tokenizeSubtitle excludes single-kana merged tokens from frequency highlighting', async () => { + const result = await tokenizeSubtitle( + 'た', + makeDepsFromYomitanTokens([{ surface: 'た', reading: 'た', headword: 'た' }], { + getFrequencyDictionaryEnabled: () => true, + getFrequencyRank: (text) => (text === 'た' ? 17 : null), + getMinSentenceWordsForNPlusOne: () => 1, + tokenizeWithMecab: async () => null, + }), + ); + + assert.equal(result.tokens?.length, 1); + assert.equal(result.tokens?.[0]?.frequencyRank, undefined); +}); + test('tokenizeSubtitle excludes merged function/content token from frequency highlighting but keeps N+1', async () => { const result = await tokenizeSubtitle( 'になれば', diff --git a/src/core/services/tokenizer/annotation-stage.test.ts b/src/core/services/tokenizer/annotation-stage.test.ts index 1123a5d..c6f4cfd 100644 --- a/src/core/services/tokenizer/annotation-stage.test.ts +++ b/src/core/services/tokenizer/annotation-stage.test.ts @@ -252,12 +252,12 @@ test('annotateTokens applies configured pos1 exclusions to both frequency and N+ test('annotateTokens allows previously default-excluded pos1 when removed from effective set', () => { const tokens = [ makeToken({ - surface: 'は', - headword: 'は', + surface: 'まで', + headword: 'まで', partOfSpeech: PartOfSpeech.other, pos1: '助詞', startPos: 0, - endPos: 1, + endPos: 2, frequencyRank: 8, }), ]; @@ -314,6 +314,52 @@ test('annotateTokens excludes likely kana SFX tokens from frequency when POS tag assert.equal(result[0]?.frequencyRank, undefined); }); +test('annotateTokens excludes single hiragana and katakana tokens from frequency when POS tags are missing', () => { + const tokens = [ + makeToken({ + surface: 'た', + reading: 'た', + headword: 'た', + pos1: '', + pos2: '', + partOfSpeech: PartOfSpeech.other, + frequencyRank: 21, + startPos: 0, + endPos: 1, + }), + makeToken({ + surface: 'ア', + reading: 'ア', + headword: 'ア', + pos1: '', + pos2: '', + partOfSpeech: PartOfSpeech.other, + frequencyRank: 22, + startPos: 1, + endPos: 2, + }), + makeToken({ + surface: '山', + reading: 'やま', + headword: '山', + pos1: '', + pos2: '', + partOfSpeech: PartOfSpeech.other, + frequencyRank: 23, + startPos: 2, + endPos: 3, + }), + ]; + + const result = annotateTokens(tokens, makeDeps(), { + minSentenceWordsForNPlusOne: 1, + }); + + assert.equal(result[0]?.frequencyRank, undefined); + assert.equal(result[1]?.frequencyRank, undefined); + assert.equal(result[2]?.frequencyRank, 23); +}); + test('annotateTokens keeps frequency when mecab tags classify token as content-bearing', () => { const tokens = [ makeToken({ diff --git a/src/core/services/tokenizer/annotation-stage.ts b/src/core/services/tokenizer/annotation-stage.ts index b62264a..c263757 100644 --- a/src/core/services/tokenizer/annotation-stage.ts +++ b/src/core/services/tokenizer/annotation-stage.ts @@ -103,6 +103,10 @@ function isFrequencyExcludedByPos( pos1Exclusions: ReadonlySet, pos2Exclusions: ReadonlySet, ): boolean { + if (isSingleKanaFrequencyNoiseToken(token.surface)) { + return true; + } + const normalizedPos1 = normalizePos1Tag(token.pos1); const hasPos1 = normalizedPos1.length > 0; if (isExcludedByTagSet(normalizedPos1, pos1Exclusions)) { @@ -363,6 +367,20 @@ function isLikelyFrequencyNoiseToken(token: MergedToken): boolean { return false; } +function isSingleKanaFrequencyNoiseToken(text: string | undefined): boolean { + if (typeof text !== 'string') { + return false; + } + + const normalized = text.trim(); + if (!normalized) { + return false; + } + + const chars = [...normalized]; + return chars.length === 1 && isKanaChar(chars[0]!); +} + function isJlptEligibleToken(token: MergedToken): boolean { if (token.pos1 && shouldIgnoreJlptForMecabPos1(token.pos1)) { return false; diff --git a/src/core/services/tokenizer/yomitan-parser-runtime.test.ts b/src/core/services/tokenizer/yomitan-parser-runtime.test.ts index b332313..d6aa519 100644 --- a/src/core/services/tokenizer/yomitan-parser-runtime.test.ts +++ b/src/core/services/tokenizer/yomitan-parser-runtime.test.ts @@ -643,6 +643,175 @@ test('requestYomitanScanTokens marks grouped entries when SubMiner dictionary al assert.equal((result as Array<{ isNameMatch?: boolean }>)[0]?.isNameMatch, true); }); +test('requestYomitanScanTokens skips fallback fragments without exact primary source matches', async () => { + const deps = createDeps(async (script) => { + if (script.includes('optionsGetFull')) { + return { + profileCurrent: 0, + profiles: [ + { + options: { + scanning: { length: 40 }, + }, + }, + ], + }; + } + + return await runInjectedYomitanScript(script, (action, params) => { + if (action !== 'termsFind') { + throw new Error(`unexpected action: ${action}`); + } + + const text = (params as { text?: string } | undefined)?.text ?? ''; + if (text.startsWith('だが ')) { + return { + originalTextLength: 2, + dictionaryEntries: [ + { + headwords: [ + { + term: 'だが', + reading: 'だが', + sources: [{ originalText: 'だが', isPrimary: true, matchType: 'exact' }], + }, + ], + }, + ], + }; + } + if (text.startsWith('それでも')) { + return { + originalTextLength: 4, + dictionaryEntries: [ + { + headwords: [ + { + term: 'それでも', + reading: 'それでも', + sources: [{ originalText: 'それでも', isPrimary: true, matchType: 'exact' }], + }, + ], + }, + ], + }; + } + if (text.startsWith('届かぬ')) { + return { + originalTextLength: 3, + dictionaryEntries: [ + { + headwords: [ + { + term: '届く', + reading: 'とどく', + sources: [{ originalText: '届かぬ', isPrimary: true, matchType: 'exact' }], + }, + ], + }, + ], + }; + } + if (text.startsWith('高み')) { + return { + originalTextLength: 2, + dictionaryEntries: [ + { + headwords: [ + { + term: '高み', + reading: 'たかみ', + sources: [{ originalText: '高み', isPrimary: true, matchType: 'exact' }], + }, + ], + }, + ], + }; + } + if (text.startsWith('があった')) { + return { + originalTextLength: 2, + dictionaryEntries: [ + { + headwords: [ + { + term: 'があ', + reading: '', + sources: [{ originalText: 'が', isPrimary: true, matchType: 'exact' }], + }, + ], + }, + ], + }; + } + if (text.startsWith('あった')) { + return { + originalTextLength: 3, + dictionaryEntries: [ + { + headwords: [ + { + term: 'ある', + reading: 'ある', + sources: [{ originalText: 'あった', isPrimary: true, matchType: 'exact' }], + }, + ], + }, + ], + }; + } + return { originalTextLength: 0, dictionaryEntries: [] }; + }); + }); + + const result = await requestYomitanScanTokens( + 'だが それでも届かぬ高みがあった', + deps, + { error: () => undefined }, + ); + + assert.deepEqual( + result?.map((token) => ({ + surface: token.surface, + headword: token.headword, + startPos: token.startPos, + endPos: token.endPos, + })), + [ + { + surface: 'だが', + headword: 'だが', + startPos: 0, + endPos: 2, + }, + { + surface: 'それでも', + headword: 'それでも', + startPos: 3, + endPos: 7, + }, + { + surface: '届かぬ', + headword: '届く', + startPos: 7, + endPos: 10, + }, + { + surface: '高み', + headword: '高み', + startPos: 10, + endPos: 12, + }, + { + surface: 'あった', + headword: 'ある', + startPos: 13, + endPos: 16, + }, + ], + ); +}); + test('getYomitanDictionaryInfo requests dictionary info via backend action', async () => { let scriptValue = ''; const deps = createDeps(async (script) => { diff --git a/src/core/services/tokenizer/yomitan-parser-runtime.ts b/src/core/services/tokenizer/yomitan-parser-runtime.ts index dad4930..fbf574b 100644 --- a/src/core/services/tokenizer/yomitan-parser-runtime.ts +++ b/src/core/services/tokenizer/yomitan-parser-runtime.ts @@ -843,14 +843,7 @@ const YOMITAN_SCANNING_HELPERS = String.raw` }; } } - const fallback = dictionaryEntries?.[0]?.headwords?.[0]; - return fallback - ? { - term: fallback.term, - reading: fallback.reading, - isNameMatch: matchedNameDictionary || isNameDictionaryEntry(dictionaryEntries?.[0]) - } - : null; + return null; } `;