From 402b58385d4cf7e2e7c8a8d5c55be74c34e5f17d Mon Sep 17 00:00:00 2001
From: sudacode
Date: Sun, 3 May 2026 22:59:59 -0700
Subject: [PATCH] fix: suppress known highlights for subtitle particles

---
 ...hlight-on-standalone-subtitle-particles.md | 72 ++++++++++++++
 changes/338-known-word-particle-highlights.md |  4 +
 src/core/services/tokenizer.test.ts           | 28 +++---
 .../tokenizer/annotation-stage.test.ts        | 94 ++++++++++++++-----
 .../services/tokenizer/annotation-stage.ts    |  4 +-
 5 files changed, 159 insertions(+), 43 deletions(-)
 create mode 100644 backlog/tasks/task-338 - Fix-known-word-highlight-on-standalone-subtitle-particles.md
 create mode 100644 changes/338-known-word-particle-highlights.md

diff --git a/backlog/tasks/task-338 - Fix-known-word-highlight-on-standalone-subtitle-particles.md b/backlog/tasks/task-338 - Fix-known-word-highlight-on-standalone-subtitle-particles.md
new file mode 100644
index 00000000..e8432e9c
--- /dev/null
+++ b/backlog/tasks/task-338 - Fix-known-word-highlight-on-standalone-subtitle-particles.md
@@ -0,0 +1,72 @@
+---
+id: TASK-338
+title: Fix known-word highlight on standalone subtitle particles
+status: Done
+assignee:
+  - codex
+created_date: '2026-05-04 05:52'
+updated_date: '2026-05-04 05:57'
+labels:
+  - bug
+  - subtitle
+  - tokenizer
+dependencies: []
+references:
+  - src/core/services/tokenizer/annotation-stage.ts
+  - src/core/services/tokenizer/subtitle-annotation-filter.ts
+  - src/renderer/subtitle-render.ts
+priority: medium
+---
+
+## Description
+
+Standalone grammar particles such as に should not render with the known-word green highlight when they appear in the known-word cache as readings for other words. Keep known-word coloring for lexical tokens, but prevent grammar-excluded subtitle tokens from receiving it.
+
+## Acceptance Criteria
+
+- [x] #1 Standalone grammar particles like に do not retain isKnown after subtitle annotation filtering.
+- [x] #2 Lexical known-word tokens still render as known when not grammar-excluded.
+- [x] #3 Focused regression test covers the particle false-positive path.
+
+## Implementation Plan
+
+1. Add a focused regression test in `src/core/services/tokenizer/annotation-stage.test.ts` showing that the standalone particle `に` is grammar-excluded and does not retain `isKnown` even when `isKnownWord('に')` returns true.
+2. Run the focused tokenizer annotation test and confirm the new test fails against the current behavior.
+3. Patch `src/core/services/tokenizer/annotation-stage.ts` so grammar-excluded tokens clear known status while still stripping N+1/frequency/JLPT/name metadata.
+4. Run the focused test file, then inspect the diff and update the task acceptance criteria.
+
+## Implementation Notes
+
+Implemented tokenizer annotation filtering so grammar-excluded subtitle tokens clear known-word status instead of retaining the green known coloring. Added a focused regression test for the known-word-cache particle false positive and updated existing expectations for unified annotation clearing.
+
+Verification: `bun test src/core/services/tokenizer/annotation-stage.test.ts --test-name-pattern "clears known status from standalone particles"` failed before the production patch; after the patch, `bun test src/core/services/tokenizer/annotation-stage.test.ts`, `bun test src/core/services/tokenizer.test.ts`, the combined tokenizer tests, `bun run typecheck`, `bun run changelog:lint`, and `bun run test:fast` all passed.
+
+Full handoff gate follow-up: `bun run test:env` and `bun run build` passed.
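+
+As a reference for the next reader, the annotation-stage gate now behaves like this minimal sketch (simplified token shape and hypothetical helper names; the real logic lives in `annotateTokens`):
+
+```ts
+interface Token {
+  surface: string;
+  isKnown: boolean;
+  frequencyRank?: number;
+  jlptLevel?: string;
+}
+
+// Hypothetical stand-in for the real grammar-exclusion check.
+function isGrammarExcluded(token: Token): boolean {
+  return token.surface === 'に'; // e.g. a standalone particle
+}
+
+function annotate(token: Token, isKnownWord: (text: string) => boolean): Token {
+  if (isGrammarExcluded(token)) {
+    // Excluded helper/grammar spans now lose every subtitle annotation,
+    // including known-word status (isKnown used to be preserved here).
+    return { ...token, isKnown: false, frequencyRank: undefined, jlptLevel: undefined };
+  }
+  // Lexical tokens keep the normal known-word computation.
+  return { ...token, isKnown: isKnownWord(token.surface) };
+}
+```
+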
+`bun run test:smoke:dist` failed outside this tokenizer change in `dist/core/services/overlay-manager.test.js` because pre-existing uncommitted overlay-window code calls `window.getTitle()` on a test mock that does not provide it.
+
+## Final Summary
+
+Summary:
+- Cleared `isKnown` for grammar-excluded subtitle tokens in the tokenizer annotation stage, preventing standalone particles such as `に` from rendering as known just because a known-word deck contains a matching reading.
+- Added a focused regression test for the known-word-cache false positive and updated tokenizer expectations so helper/grammar spans consistently clear all subtitle annotations.
+- Added changelog fragment `changes/338-known-word-particle-highlights.md`.
+
+Verification:
+- `bun test src/core/services/tokenizer/annotation-stage.test.ts --test-name-pattern "clears known status from standalone particles"` failed before the production patch.
+- `bun test src/core/services/tokenizer/annotation-stage.test.ts`
+- `bun test src/core/services/tokenizer.test.ts`
+- `bun test src/core/services/tokenizer/annotation-stage.test.ts src/core/services/tokenizer.test.ts`
+- `bun run typecheck`
+- `bun run changelog:lint`
+- `bun run test:fast`
+- `bun run test:env`
+- `bun run build`
+
+Blocked/External:
+- `bun run test:smoke:dist` currently fails outside this tokenizer change in `dist/core/services/overlay-manager.test.js`: pre-existing uncommitted overlay-window code calls `window.getTitle()` on a test mock without that method.

diff --git a/changes/338-known-word-particle-highlights.md b/changes/338-known-word-particle-highlights.md
new file mode 100644
index 00000000..c759e8e7
--- /dev/null
+++ b/changes/338-known-word-particle-highlights.md
@@ -0,0 +1,4 @@
+type: fixed
+area: tokenizer
+
+- Prevented standalone grammar and helper tokens such as `に` from being colored as known words when readings from known-word decks match them.
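The false positive that fragment describes comes from reading-keyed known-word lookups. A minimal sketch, assuming a deck model in which both a word and its reading mark text as known (the names here are hypothetical, not the app's real API):

```ts
interface DeckEntry {
  word: string;
  reading: string;
}

// Index both words and readings, as a reading-keyed known-word cache might.
// The reading に of the noun 荷 then also marks the bare particle に as known.
function buildKnownSet(entries: DeckEntry[]): Set<string> {
  const known = new Set<string>();
  for (const entry of entries) {
    known.add(entry.word);
    known.add(entry.reading);
  }
  return known;
}

const known = buildKnownSet([{ word: '荷', reading: 'に' }]);
console.log(known.has('に')); // true — a standalone particle に would be colored known

// After this patch, grammar-excluded tokens never consult this lookup,
// so the particle stays uncolored regardless of deck contents.
```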
diff --git a/src/core/services/tokenizer.test.ts b/src/core/services/tokenizer.test.ts
index ae49a85e..885f5c8d 100644
--- a/src/core/services/tokenizer.test.ts
+++ b/src/core/services/tokenizer.test.ts
@@ -129,7 +129,7 @@ test('tokenizeSubtitle splits same-line grammar endings before applying annotati
   assert.equal(result.tokens?.[0]?.jlptLevel, 'N5');
   assert.equal(result.tokens?.[0]?.frequencyRank, 40);
   assert.equal(result.tokens?.[1]?.surface, 'です');
-  assert.equal(result.tokens?.[1]?.isKnown, true);
+  assert.equal(result.tokens?.[1]?.isKnown, false);
   assert.equal(result.tokens?.[1]?.isNPlusOneTarget, false);
   assert.equal(result.tokens?.[1]?.frequencyRank, undefined);
   assert.equal(result.tokens?.[1]?.jlptLevel, undefined);
@@ -3230,7 +3230,7 @@ test('tokenizeSubtitle excludes default non-independent pos2 from N+1 and freque
   assert.equal(result.tokens?.[0]?.isNPlusOneTarget, false);
 });
 
-test('tokenizeSubtitle preserves known-word highlight for exact non-independent kanji noun tokens', async () => {
+test('tokenizeSubtitle clears known-word highlight for exact non-independent kanji noun tokens', async () => {
   const result = await tokenizeSubtitle(
     'その点',
     makeDepsFromYomitanTokens(
@@ -3278,7 +3278,7 @@ test('tokenizeSubtitle preserves known-word highlight for exact non-independent
   assert.equal(result.tokens?.length, 2);
   assert.equal(result.tokens?.[0]?.isKnown, false);
   assert.equal(result.tokens?.[1]?.surface, '点');
-  assert.equal(result.tokens?.[1]?.isKnown, true);
+  assert.equal(result.tokens?.[1]?.isKnown, false);
   assert.equal(result.tokens?.[1]?.isNPlusOneTarget, false);
   assert.equal(result.tokens?.[1]?.frequencyRank, undefined);
   assert.equal(result.tokens?.[1]?.jlptLevel, undefined);
@@ -3335,7 +3335,7 @@ test('tokenizeSubtitle keeps mecab-tagged interjections tokenized while clearing
   );
 });
 
-test('tokenizeSubtitle keeps excluded interjections hoverable while clearing only their annotation metadata', async () => {
+test('tokenizeSubtitle keeps excluded interjections hoverable while clearing annotation metadata', async () => {
   const result = await tokenizeSubtitle(
     'ぐはっ 猫',
     makeDeps({
@@ -3409,7 +3409,7 @@ test('tokenizeSubtitle keeps excluded interjections hoverable while clearing onl
   );
 });
 
-test('tokenizeSubtitle keeps explanatory ending variants hoverable while clearing only their annotation metadata', async () => {
+test('tokenizeSubtitle keeps explanatory ending variants hoverable while clearing annotation metadata', async () => {
   const result = await tokenizeSubtitle(
     '猫んです',
     makeDepsFromYomitanTokens(
@@ -3480,7 +3480,7 @@ test('tokenizeSubtitle keeps explanatory ending variants hoverable while clearin
   );
 });
 
-test('tokenizeSubtitle keeps standalone grammar-only tokens hoverable while clearing only their annotation metadata', async () => {
+test('tokenizeSubtitle keeps standalone grammar-only tokens hoverable while clearing annotation metadata', async () => {
   const result = await tokenizeSubtitle(
     '私はこの猫です',
     makeDeps({
@@ -3599,7 +3599,7 @@ test('tokenizeSubtitle keeps standalone grammar-only tokens hoverable while clea
   );
 });
 
-test('tokenizeSubtitle keeps trailing quote-particle merged tokens hoverable while clearing only their annotation metadata', async () => {
+test('tokenizeSubtitle keeps trailing quote-particle merged tokens hoverable while clearing annotation metadata', async () => {
   const result = await tokenizeSubtitle(
     'どうしてもって',
     makeDepsFromYomitanTokens(
@@ -3893,7 +3893,7 @@ test('tokenizeSubtitle clears all annotations for kana-only demonstrative helper
     {
       surface: 'これで',
       headword: 'これ',
-      isKnown: true,
+      isKnown: false,
       isNPlusOneTarget: false,
       frequencyRank: undefined,
       jlptLevel: undefined,
@@ -4008,7 +4008,7 @@ test('tokenizeSubtitle clears all annotations for explanatory pondering endings'
     {
       surface: 'のかな',
       headword: 'の',
-      isKnown: true,
+      isKnown: false,
       isNPlusOneTarget: false,
       frequencyRank: undefined,
       jlptLevel: undefined,
@@ -4447,7 +4447,7 @@ test('tokenizeSubtitle clears annotations for ja-nai explanatory endings and aru
     {
       surface: 'ある',
       headword: 'ある',
-      isKnown: true,
+      isKnown: false,
       isNPlusOneTarget: false,
       frequencyRank: undefined,
       jlptLevel: undefined,
@@ -4492,7 +4492,7 @@ test('tokenizeSubtitle clears annotations for standalone polite copula endings w
     {
       surface: 'ですよ',
       headword: 'です',
-      isKnown: true,
+      isKnown: false,
       isNPlusOneTarget: false,
       frequencyRank: undefined,
       jlptLevel: undefined,
@@ -4819,7 +4819,7 @@ test('tokenizeSubtitle clears annotations for auxiliary inflection fragments whi
     {
       surface: 'れた',
       headword: 'れる',
-      isKnown: true,
+      isKnown: false,
       isNPlusOneTarget: false,
       frequencyRank: undefined,
       jlptLevel: undefined,
@@ -4956,7 +4956,7 @@ test('tokenizeSubtitle clears annotations for te-kureru auxiliary helper spans',
     {
       surface: 'てく',
       headword: 'てく',
-      isKnown: true,
+      isKnown: false,
       isNPlusOneTarget: false,
       frequencyRank: undefined,
       jlptLevel: undefined,
@@ -4967,7 +4967,7 @@ test('tokenizeSubtitle clears annotations for te-kureru auxiliary helper spans',
     {
       surface: 'れた',
       headword: 'れる',
-      isKnown: true,
+      isKnown: false,
       isNPlusOneTarget: false,
       frequencyRank: undefined,
       jlptLevel: undefined,
diff --git a/src/core/services/tokenizer/annotation-stage.test.ts b/src/core/services/tokenizer/annotation-stage.test.ts
index e4ac9d69..d64a50d7 100644
--- a/src/core/services/tokenizer/annotation-stage.test.ts
+++ b/src/core/services/tokenizer/annotation-stage.test.ts
@@ -898,8 +898,8 @@ test('annotateTokens N+1 minimum sentence words counts only eligible word tokens
   );
 
   assert.equal(result[0]?.isKnown, false);
-  assert.equal(result[1]?.isKnown, true);
-  assert.equal(result[2]?.isKnown, true);
+  assert.equal(result[1]?.isKnown, false);
+  assert.equal(result[2]?.isKnown, false);
   assert.equal(result[0]?.isNPlusOneTarget, false);
 });
 
@@ -1113,7 +1113,7 @@ test('annotateTokens excludes default non-independent pos2 from frequency and N+
   assert.equal(result[0]?.isNPlusOneTarget, false);
 });
 
-test('annotateTokens preserves exact known-word status for non-independent kanji noun tokens', () => {
+test('annotateTokens clears known-word status for non-independent kanji noun tokens', () => {
   const tokens = [
     makeToken({
       surface: '点',
@@ -1138,7 +1138,7 @@ test('annotateTokens preserves exact known-word status for non-independent kanji
     { minSentenceWordsForNPlusOne: 1 },
   );
 
-  assert.equal(result[0]?.isKnown, true);
+  assert.equal(result[0]?.isKnown, false);
   assert.equal(result[0]?.isNPlusOneTarget, false);
   assert.equal(result[0]?.frequencyRank, undefined);
   assert.equal(result[0]?.jlptLevel, undefined);
@@ -1352,13 +1352,13 @@ test('annotateTokens applies one shared exclusion gate across known N+1 frequenc
     { minSentenceWordsForNPlusOne: 1 },
   );
 
-  assert.equal(result[0]?.isKnown, true);
+  assert.equal(result[0]?.isKnown, false);
   assert.equal(result[0]?.isNPlusOneTarget, false);
   assert.equal(result[0]?.frequencyRank, undefined);
   assert.equal(result[0]?.jlptLevel, undefined);
 });
 
-test('annotateTokens keeps known status while clearing other annotations for kana-only non-independent noun helper merges', () => {
+test('annotateTokens clears known status and other annotations for kana-only non-independent noun helper merges', () => {
   const tokens = [
     makeToken({
       surface: 'ことに',
@@ -1382,13 +1382,13 @@ test('annotateTokens keeps known status while clearing other annotations for kan
     { minSentenceWordsForNPlusOne: 1 },
   );
 
-  assert.equal(result[0]?.isKnown, true);
+  assert.equal(result[0]?.isKnown, false);
   assert.equal(result[0]?.isNPlusOneTarget, false);
   assert.equal(result[0]?.frequencyRank, undefined);
   assert.equal(result[0]?.jlptLevel, undefined);
 });
 
-test('annotateTokens keeps known status while clearing other annotations for standalone auxiliary inflection fragments', () => {
+test('annotateTokens clears known status and other annotations for standalone auxiliary inflection fragments', () => {
   const tokens = [
     makeToken({
       surface: 'れる',
@@ -1424,14 +1424,14 @@ test('annotateTokens keeps known status while clearing other annotations for sta
   );
 
   for (const token of result) {
-    assert.equal(token.isKnown, true, token.surface);
+    assert.equal(token.isKnown, false, token.surface);
     assert.equal(token.isNPlusOneTarget, false, token.surface);
     assert.equal(token.frequencyRank, undefined, token.surface);
     assert.equal(token.jlptLevel, undefined, token.surface);
   }
 });
 
-test('annotateTokens keeps known status while clearing other annotations for auxiliary-only te-kureru helper spans', () => {
+test('annotateTokens clears known status and other annotations for auxiliary-only te-kureru helper spans', () => {
   const tokens = [
     makeToken({
       surface: 'てく',
@@ -1467,7 +1467,7 @@ test('annotateTokens keeps known status while clearing other annotations for aux
   );
 
   for (const token of result) {
-    assert.equal(token.isKnown, true, token.surface);
+    assert.equal(token.isKnown, false, token.surface);
     assert.equal(token.isNPlusOneTarget, false, token.surface);
     assert.equal(token.frequencyRank, undefined, token.surface);
     assert.equal(token.jlptLevel, undefined, token.surface);
@@ -1503,7 +1503,7 @@ test('annotateTokens keeps lexical くれる forms eligible for annotation', ()
   assert.equal(result[0]?.jlptLevel, 'N4');
 });
 
-test('annotateTokens keeps known status while clearing other annotations for standalone して helper fragments', () => {
+test('annotateTokens clears known status and other annotations for standalone して helper fragments', () => {
   const tokens = [
     makeToken({
       surface: 'してる',
@@ -1527,13 +1527,13 @@ test('annotateTokens keeps known status while clearing other annotations for sta
     { minSentenceWordsForNPlusOne: 1 },
   );
 
-  assert.equal(result[0]?.isKnown, true);
+  assert.equal(result[0]?.isKnown, false);
   assert.equal(result[0]?.isNPlusOneTarget, false);
   assert.equal(result[0]?.frequencyRank, undefined);
   assert.equal(result[0]?.jlptLevel, undefined);
 });
 
-test('annotateTokens keeps known status while clearing other annotations for standalone particle fragments without POS tags', () => {
+test('annotateTokens clears known status and other annotations for standalone particle fragments without POS tags', () => {
   const tokens = [
     makeToken({
       surface: 'と',
@@ -1557,12 +1557,54 @@ test('annotateTokens keeps known status while clearing other annotations for sta
     { minSentenceWordsForNPlusOne: 1 },
   );
 
-  assert.equal(result[0]?.isKnown, true);
+  assert.equal(result[0]?.isKnown, false);
   assert.equal(result[0]?.isNPlusOneTarget, false);
   assert.equal(result[0]?.frequencyRank, undefined);
   assert.equal(result[0]?.jlptLevel, undefined);
 });
 
+test('annotateTokens clears known status from standalone particles even when the known-word cache contains them', () => {
+  const tokens = [
+    makeToken({
+      surface: 'に',
+      headword: 'に',
+      reading: 'ニ',
+      partOfSpeech: PartOfSpeech.particle,
+      pos1: '助詞',
+      pos2: '格助詞',
+      startPos: 0,
+      endPos: 1,
+      frequencyRank: 2,
+    }),
+    makeToken({
+      surface: '泉',
+      headword: '泉',
+      reading: 'イズミ',
+      partOfSpeech: PartOfSpeech.noun,
+      pos1: '名詞',
+      pos2: '一般',
+      startPos: 1,
+      endPos: 2,
+      frequencyRank: 50,
+    }),
+  ];
+
+  const result = annotateTokens(
+    tokens,
+    makeDeps({
+      isKnownWord: (text) => text === 'に' || text === '泉',
+      getJlptLevel: (text) => (text === 'に' ? 'N5' : null),
+    }),
+    { minSentenceWordsForNPlusOne: 1 },
+  );
+
+  assert.equal(result[0]?.isKnown, false);
+  assert.equal(result[0]?.isNPlusOneTarget, false);
+  assert.equal(result[0]?.frequencyRank, undefined);
+  assert.equal(result[0]?.jlptLevel, undefined);
+  assert.equal(result[1]?.isKnown, true);
+});
+
 test('annotateTokens does not mark standalone connective particles as N+1', () => {
   const tokens = [
     makeToken({
@@ -1613,7 +1655,7 @@ test('annotateTokens does not mark standalone connective particles as N+1', () =
   assert.equal(result[1]?.jlptLevel, undefined);
 });
 
-test('annotateTokens keeps known status while clearing other annotations for rhetorical もんか grammar particle phrases', () => {
+test('annotateTokens clears known status and other annotations for rhetorical もんか grammar particle phrases', () => {
   const tokens = [
     makeToken({
       surface: 'もんか',
@@ -1637,13 +1679,13 @@ test('annotateTokens keeps known status while clearing other annotations for rhe
     { minSentenceWordsForNPlusOne: 1 },
   );
 
-  assert.equal(result[0]?.isKnown, true);
+  assert.equal(result[0]?.isKnown, false);
   assert.equal(result[0]?.isNPlusOneTarget, false);
   assert.equal(result[0]?.frequencyRank, undefined);
   assert.equal(result[0]?.jlptLevel, undefined);
 });
 
-test('annotateTokens keeps known status while clearing other annotations for bare くれ auxiliary fragments', () => {
+test('annotateTokens clears known status and other annotations for bare くれ auxiliary fragments', () => {
   const tokens = [
     makeToken({
       surface: 'くれ',
@@ -1667,13 +1709,13 @@ test('annotateTokens keeps known status while clearing other annotations for bar
     { minSentenceWordsForNPlusOne: 1 },
  );
 
-  assert.equal(result[0]?.isKnown, true);
+  assert.equal(result[0]?.isKnown, false);
   assert.equal(result[0]?.isNPlusOneTarget, false);
   assert.equal(result[0]?.frequencyRank, undefined);
   assert.equal(result[0]?.jlptLevel, undefined);
 });
 
-test('annotateTokens keeps known status while clearing other annotations for aru existence verbs', () => {
+test('annotateTokens clears known status and other annotations for aru existence verbs', () => {
   const tokens = [
     makeToken({
       surface: '有る',
@@ -1703,14 +1745,14 @@ test('annotateTokens keeps known status while clearing other annotations for aru
 
   assert.equal(result[0]?.surface, '有る');
   assert.equal(result[0]?.headword, '有る');
-  assert.equal(result[0]?.isKnown, true);
+  assert.equal(result[0]?.isKnown, false);
   assert.equal(result[0]?.isNPlusOneTarget, false);
   assert.equal(result[0]?.isNameMatch, false);
   assert.equal(result[0]?.frequencyRank, undefined);
   assert.equal(result[0]?.jlptLevel, undefined);
 });
 
-test('annotateTokens keeps known status while clearing other annotations for standalone quote particle and auxiliary grammar terms', () => {
+test('annotateTokens clears known status and other annotations for standalone quote particle and auxiliary grammar terms', () => {
   const tokens = [
     makeToken({
       surface: 'って',
@@ -1746,14 +1788,14 @@ test('annotateTokens keeps known status while clearing other annotations for sta
   );
 
   for (const token of result) {
-    assert.equal(token.isKnown, true, token.surface);
+    assert.equal(token.isKnown, false, token.surface);
     assert.equal(token.isNPlusOneTarget, false, token.surface);
     assert.equal(token.frequencyRank, undefined, token.surface);
     assert.equal(token.jlptLevel, undefined, token.surface);
   }
 });
 
-test('annotateTokens keeps known status while clearing other annotations from standalone あ interjections without POS tags', () => {
+test('annotateTokens clears known status and other annotations from standalone あ interjections without POS tags', () => {
   const tokens = [
     makeToken({
       surface: 'あ',
@@ -1783,7 +1825,7 @@ test('annotateTokens keeps known status while clearing other annotations from st
   assert.equal(result[0]?.surface, 'あ');
   assert.equal(result[0]?.headword, 'あ');
   assert.equal(result[0]?.reading, 'あ');
-  assert.equal(result[0]?.isKnown, true);
+  assert.equal(result[0]?.isKnown, false);
   assert.equal(result[0]?.isNPlusOneTarget, false);
   assert.equal(result[0]?.frequencyRank, undefined);
   assert.equal(result[0]?.jlptLevel, undefined);
@@ -1845,7 +1887,7 @@ test('annotateTokens clears all annotations from expressive subtitle interjectio
   );
 
   for (const token of result.slice(0, 2)) {
-    assert.equal(token.isKnown, true, token.surface);
+    assert.equal(token.isKnown, false, token.surface);
     assert.equal(token.isNPlusOneTarget, false, token.surface);
     assert.equal(token.frequencyRank, undefined, token.surface);
     assert.equal(token.jlptLevel, undefined, token.surface);
diff --git a/src/core/services/tokenizer/annotation-stage.ts b/src/core/services/tokenizer/annotation-stage.ts
index ec529cc6..e95435f1 100644
--- a/src/core/services/tokenizer/annotation-stage.ts
+++ b/src/core/services/tokenizer/annotation-stage.ts
@@ -627,9 +627,7 @@ export function annotateTokens(
     });
 
     return {
       ...strippedToken,
-      isKnown: nPlusOneEnabled
-        ? computeTokenKnownStatus(token, deps.isKnownWord, deps.knownWordMatchMode)
-        : false,
+      isKnown: false,
     };
   }