From 9bcea2fc5f6a4f8f6fa19d860857f2616aaa934c Mon Sep 17 00:00:00 2001
From: sudacode
Date: Sun, 3 May 2026 22:03:42 -0700
Subject: [PATCH] fix: preserve known highlighting for filtered tokens

---
 ...333 - Suppress-aru-subtitle-annotations.md |  53 +++++++++
 changes/333-aru-annotation-filter.md          |   4 +
 src/core/services/tokenizer.test.ts           |  24 ++--
 .../tokenizer/annotation-stage.test.ts        | 107 ++++++++++++++----
 .../services/tokenizer/annotation-stage.ts    |  34 +-----
 .../tokenizer/subtitle-annotation-filter.ts   |   4 +-
 6 files changed, 158 insertions(+), 68 deletions(-)
 create mode 100644 backlog/tasks/task-333 - Suppress-aru-subtitle-annotations.md
 create mode 100644 changes/333-aru-annotation-filter.md

diff --git a/backlog/tasks/task-333 - Suppress-aru-subtitle-annotations.md b/backlog/tasks/task-333 - Suppress-aru-subtitle-annotations.md
new file mode 100644
index 00000000..293fc8f1
--- /dev/null
+++ b/backlog/tasks/task-333 - Suppress-aru-subtitle-annotations.md
@@ -0,0 +1,53 @@
+---
+id: TASK-333
+title: Suppress aru subtitle annotations
+status: Done
+assignee: []
+created_date: '2026-05-04 04:39'
+updated_date: '2026-05-04 05:02'
+labels:
+  - tokenizer
+  - annotations
+  - bug
+dependencies: []
+priority: medium
+---
+
+## Description
+
+
+Add `ある` / `有る` to the subtitle annotation suppression path so `aru` tokens remain hoverable and never receive N+1, JLPT, frequency, or name-match annotation metadata. Known-word highlighting is the one exception: if a filtered `aru` token is known and known-word highlighting is enabled, it should still render as known.
+
+
+## Acceptance Criteria
+
+- [x] #1 `ある` and kanji headword/surface variants such as `有る` are excluded by the subtitle annotation filter.
+- [x] #2 Annotation stripping clears N+1, JLPT, frequency, and name metadata for `aru` tokens while preserving token hover data.
+- [x] #3 Known-word highlighting still applies to filtered tokens, including `aru`, when known-word lookup marks them known.
+- [x] #4 Regression coverage fails before the fix and passes after.
+
+
+## Implementation Plan
+
+
+1. Add `ある`/`有る`/`在る` to the shared subtitle annotation hard-exclusion terms.
+2. Preserve/recompute known-word status for filtered tokens while stripping N+1, JLPT, frequency, and name metadata.
+3. Add RED/GREEN unit and tokenizer regression coverage, plus a changelog fragment.
+4. Run targeted tests and the full handoff gate.
+
+
+## Implementation Notes
+
+
+TDD path: added failing annotation-stage coverage first. The initial implementation made the targeted tests pass; broader tokenizer coverage then revealed an older fixture that expected `ある` to remain lexical, so that integration expectation was updated to match the newly requested behavior. Follow-up correction: known-word highlighting is the lone annotation exception for filtered tokens, so the strip path now preserves known state and `annotateTokens` recomputes known status for filtered tokens while still clearing N+1/JLPT/frequency/name metadata.
+
+
+## Final Summary
+
+
+Suppressed non-known subtitle annotations for `aru` existence verbs by adding `ある`, `有る`, and `在る` to the shared hard-exclusion list. Corrected the filtered-token path so known-word highlighting still applies whenever known highlighting is enabled; filtered tokens now retain (or newly acquire) `isKnown` but still lose N+1, JLPT, frequency, and name metadata.
+
+Added and updated annotation-stage and tokenizer regression coverage for `aru`, particles, helper fragments, interjections, and other filtered known tokens. Added `changes/333-aru-annotation-filter.md`.
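+
+A minimal sketch of the corrected branch (simplified and illustrative: `annotateFilteredToken` and `FilterDeps` are hypothetical names for this note, while `MergedToken`, `stripSubtitleAnnotationMetadata`, and `computeTokenKnownStatus` are the real helpers in `annotation-stage.ts`):
+
+```ts
+// Filtered tokens lose every annotation except known-word status, which is
+// recomputed with the same helper used for ordinary tokens.
+function annotateFilteredToken(
+  token: MergedToken,
+  deps: FilterDeps,
+  nPlusOneEnabled: boolean,
+): MergedToken {
+  // Clears N+1, name-match, JLPT, and frequency metadata, keeps hover data.
+  const stripped = stripSubtitleAnnotationMetadata(token);
+  return {
+    ...stripped,
+    isKnown: nPlusOneEnabled
+      ? computeTokenKnownStatus(token, deps.isKnownWord, deps.knownWordMatchMode)
+      : false,
+  };
+}
+```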
+
+Validation passed: RED failures observed before implementation/correction; `bun test src/core/services/tokenizer/annotation-stage.test.ts`; `bun test src/core/services/tokenizer.test.ts`; `bun run typecheck`; `bun run format:check:src`; `bun run changelog:lint`; `bun run test:fast`; `bun run test:env`; `bun run build`; `bun run test:smoke:dist`.
+
diff --git a/changes/333-aru-annotation-filter.md b/changes/333-aru-annotation-filter.md
new file mode 100644
index 00000000..7f3425bc
--- /dev/null
+++ b/changes/333-aru-annotation-filter.md
@@ -0,0 +1,4 @@
+type: fixed
+area: tokenizer
+
+- Suppressed N+1, JLPT, frequency, and name styling for `ある` / `有る` existence verbs while still allowing known-word highlighting.
diff --git a/src/core/services/tokenizer.test.ts b/src/core/services/tokenizer.test.ts
index b025eb1b..ae49a85e 100644
--- a/src/core/services/tokenizer.test.ts
+++ b/src/core/services/tokenizer.test.ts
@@ -129,7 +129,7 @@ test('tokenizeSubtitle splits same-line grammar endings before applying annotati
   assert.equal(result.tokens?.[0]?.jlptLevel, 'N5');
   assert.equal(result.tokens?.[0]?.frequencyRank, 40);
   assert.equal(result.tokens?.[1]?.surface, 'です');
-  assert.equal(result.tokens?.[1]?.isKnown, false);
+  assert.equal(result.tokens?.[1]?.isKnown, true);
   assert.equal(result.tokens?.[1]?.isNPlusOneTarget, false);
   assert.equal(result.tokens?.[1]?.frequencyRank, undefined);
   assert.equal(result.tokens?.[1]?.jlptLevel, undefined);
@@ -3893,7 +3893,7 @@ test('tokenizeSubtitle clears all annotations for kana-only demonstrative helper
     {
       surface: 'これで',
       headword: 'これ',
-      isKnown: false,
+      isKnown: true,
       isNPlusOneTarget: false,
       frequencyRank: undefined,
       jlptLevel: undefined,
@@ -4008,7 +4008,7 @@ test('tokenizeSubtitle clears all annotations for explanatory pondering endings'
     {
       surface: 'のかな',
       headword: 'の',
-      isKnown: false,
+      isKnown: true,
       isNPlusOneTarget: false,
       frequencyRank: undefined,
       jlptLevel: undefined,
@@ -4306,7 +4306,7 @@ test('tokenizeSubtitle clears all annotations for explanatory contrast endings',
   );
 });
 
-test('tokenizeSubtitle clears annotations for ja-nai explanatory endings while preserving lexical content', async () => {
+test('tokenizeSubtitle clears annotations for ja-nai explanatory endings and aru verbs', async () => {
   const result = await tokenizeSubtitle(
     'みたいなのあるじゃないですか',
     makeDepsFromYomitanTokens(
@@ -4322,7 +4322,7 @@
         text === 'みたい' ? 320 : text === 'ある' ? 240 : text === 'じゃない' ? 80 : null,
       getJlptLevel: (text) =>
         text === 'みたい' ? 'N4' : text === 'ある' ? 'N5' : text === 'じゃない' ? 'N5' : null,
-      isKnownWord: (text) => text === 'みたい' || text === 'の',
+      isKnownWord: (text) => text === 'みたい' || text === 'の' || text === 'ある',
       getMinSentenceWordsForNPlusOne: () => 1,
       tokenizeWithMecab: async () => [
         {
@@ -4447,10 +4447,10 @@ test('tokenizeSubtitle clears annotations for ja-nai explanatory endings while p
     {
       surface: 'ある',
       headword: 'ある',
-      isKnown: false,
+      isKnown: true,
       isNPlusOneTarget: false,
-      frequencyRank: 240,
-      jlptLevel: 'N5',
+      frequencyRank: undefined,
+      jlptLevel: undefined,
     },
   );
 });
@@ -4492,7 +4492,7 @@ test('tokenizeSubtitle clears annotations for standalone polite copula endings w
     {
       surface: 'ですよ',
       headword: 'です',
-      isKnown: false,
+      isKnown: true,
       isNPlusOneTarget: false,
       frequencyRank: undefined,
       jlptLevel: undefined,
@@ -4819,7 +4819,7 @@ test('tokenizeSubtitle clears annotations for auxiliary inflection fragments whi
     {
       surface: 'れた',
       headword: 'れる',
-      isKnown: false,
+      isKnown: true,
       isNPlusOneTarget: false,
       frequencyRank: undefined,
       jlptLevel: undefined,
@@ -4956,7 +4956,7 @@ test('tokenizeSubtitle clears annotations for te-kureru auxiliary helper spans',
     {
       surface: 'てく',
       headword: 'てく',
-      isKnown: false,
+      isKnown: true,
       isNPlusOneTarget: false,
       frequencyRank: undefined,
       jlptLevel: undefined,
@@ -4967,7 +4967,7 @@
     {
       surface: 'れた',
       headword: 'れる',
-      isKnown: false,
+      isKnown: true,
       isNPlusOneTarget: false,
       frequencyRank: undefined,
       jlptLevel: undefined,
diff --git a/src/core/services/tokenizer/annotation-stage.test.ts b/src/core/services/tokenizer/annotation-stage.test.ts
index 68d8c996..e4ac9d69 100644
--- a/src/core/services/tokenizer/annotation-stage.test.ts
+++ b/src/core/services/tokenizer/annotation-stage.test.ts
@@ -608,6 +608,29 @@ test('shouldExcludeTokenFromSubtitleAnnotations excludes bare くれ auxiliary f
   assert.equal(shouldExcludeTokenFromSubtitleAnnotations(token), true);
 });
 
+test('shouldExcludeTokenFromSubtitleAnnotations excludes aru existence verbs', () => {
+  for (const token of [
+    makeToken({
+      surface: 'ある',
+      headword: 'ある',
+      reading: 'アル',
+      partOfSpeech: PartOfSpeech.verb,
+      pos1: '動詞',
+      pos2: '自立',
+    }),
+    makeToken({
+      surface: '有る',
+      headword: '有る',
+      reading: 'アル',
+      partOfSpeech: PartOfSpeech.verb,
+      pos1: '動詞',
+      pos2: '自立',
+    }),
+  ]) {
+    assert.equal(shouldExcludeTokenFromSubtitleAnnotations(token), true, token.surface);
+  }
+});
+
 test('shouldExcludeTokenFromSubtitleAnnotations excludes standalone quote particle and auxiliary grammar terms', () => {
   for (const token of [
     makeToken({
@@ -654,7 +677,7 @@ test('shouldExcludeTokenFromSubtitleAnnotations excludes single-kana surface fra
   }
 });
 
-test('stripSubtitleAnnotationMetadata keeps token hover data while clearing annotation fields', () => {
+test('stripSubtitleAnnotationMetadata keeps known hover data while clearing non-known annotation fields', () => {
   const token = makeToken({
     surface: 'は',
     headword: 'は',
@@ -670,7 +693,6 @@
 
   assert.deepEqual(stripSubtitleAnnotationMetadata(token), {
     ...token,
-    isKnown: false,
     isNPlusOneTarget: false,
     isNameMatch: false,
     jlptLevel: undefined,
@@ -876,8 +898,8 @@ test('annotateTokens N+1 minimum sentence words counts only eligible word tokens
   );
 
   assert.equal(result[0]?.isKnown, false);
-  assert.equal(result[1]?.isKnown, false);
-  assert.equal(result[2]?.isKnown, false);
+  assert.equal(result[1]?.isKnown, true);
+  assert.equal(result[2]?.isKnown, true);
 
   assert.equal(result[0]?.isNPlusOneTarget, false);
 });
@@ -1330,13 +1352,13 @@ test('annotateTokens applies one shared exclusion gate across known N+1 frequenc
     { minSentenceWordsForNPlusOne: 1 },
   );
 
-  assert.equal(result[0]?.isKnown, false);
+  assert.equal(result[0]?.isKnown, true);
   assert.equal(result[0]?.isNPlusOneTarget, false);
   assert.equal(result[0]?.frequencyRank, undefined);
   assert.equal(result[0]?.jlptLevel, undefined);
 });
 
-test('annotateTokens clears all annotations for kana-only non-independent noun helper merges', () => {
+test('annotateTokens keeps known status while clearing other annotations for kana-only non-independent noun helper merges', () => {
   const tokens = [
     makeToken({
       surface: 'ことに',
@@ -1360,13 +1382,13 @@
     { minSentenceWordsForNPlusOne: 1 },
   );
 
-  assert.equal(result[0]?.isKnown, false);
+  assert.equal(result[0]?.isKnown, true);
   assert.equal(result[0]?.isNPlusOneTarget, false);
   assert.equal(result[0]?.frequencyRank, undefined);
   assert.equal(result[0]?.jlptLevel, undefined);
 });
 
-test('annotateTokens clears all annotations for standalone auxiliary inflection fragments', () => {
+test('annotateTokens keeps known status while clearing other annotations for standalone auxiliary inflection fragments', () => {
   const tokens = [
     makeToken({
       surface: 'れる',
@@ -1402,14 +1424,14 @@
   );
 
   for (const token of result) {
-    assert.equal(token.isKnown, false, token.surface);
+    assert.equal(token.isKnown, true, token.surface);
     assert.equal(token.isNPlusOneTarget, false, token.surface);
     assert.equal(token.frequencyRank, undefined, token.surface);
     assert.equal(token.jlptLevel, undefined, token.surface);
   }
 });
 
-test('annotateTokens clears all annotations for auxiliary-only te-kureru helper spans', () => {
+test('annotateTokens keeps known status while clearing other annotations for auxiliary-only te-kureru helper spans', () => {
   const tokens = [
     makeToken({
       surface: 'てく',
@@ -1445,7 +1467,7 @@
   );
 
   for (const token of result) {
-    assert.equal(token.isKnown, false, token.surface);
+    assert.equal(token.isKnown, true, token.surface);
     assert.equal(token.isNPlusOneTarget, false, token.surface);
     assert.equal(token.frequencyRank, undefined, token.surface);
     assert.equal(token.jlptLevel, undefined, token.surface);
@@ -1481,7 +1503,7 @@ test('annotateTokens keeps lexical くれる forms eligible for annotation', ()
   assert.equal(result[0]?.jlptLevel, 'N4');
 });
 
-test('annotateTokens clears all annotations for standalone して helper fragments', () => {
+test('annotateTokens keeps known status while clearing other annotations for standalone して helper fragments', () => {
   const tokens = [
     makeToken({
       surface: 'してる',
@@ -1505,13 +1527,13 @@
     { minSentenceWordsForNPlusOne: 1 },
   );
 
-  assert.equal(result[0]?.isKnown, false);
+  assert.equal(result[0]?.isKnown, true);
   assert.equal(result[0]?.isNPlusOneTarget, false);
   assert.equal(result[0]?.frequencyRank, undefined);
   assert.equal(result[0]?.jlptLevel, undefined);
 });
 
-test('annotateTokens clears all annotations for standalone particle fragments without POS tags', () => {
+test('annotateTokens keeps known status while clearing other annotations for standalone particle fragments without POS tags', () => {
   const tokens = [
     makeToken({
       surface: 'と',
@@ -1535,7 +1557,7 @@ test('annotateTokens clears all annotations for standalone particle fragments wi
     { minSentenceWordsForNPlusOne: 1 },
   );
 
-  assert.equal(result[0]?.isKnown, false);
+  assert.equal(result[0]?.isKnown, true);
   assert.equal(result[0]?.isNPlusOneTarget, false);
   assert.equal(result[0]?.frequencyRank, undefined);
   assert.equal(result[0]?.jlptLevel, undefined);
@@ -1591,7 +1613,7 @@ test('annotateTokens does not mark standalone connective particles as N+1', () =
   assert.equal(result[1]?.jlptLevel, undefined);
 });
 
-test('annotateTokens clears all annotations for rhetorical もんか grammar particle phrases', () => {
+test('annotateTokens keeps known status while clearing other annotations for rhetorical もんか grammar particle phrases', () => {
   const tokens = [
     makeToken({
       surface: 'もんか',
@@ -1615,13 +1637,13 @@
     { minSentenceWordsForNPlusOne: 1 },
   );
 
-  assert.equal(result[0]?.isKnown, false);
+  assert.equal(result[0]?.isKnown, true);
   assert.equal(result[0]?.isNPlusOneTarget, false);
   assert.equal(result[0]?.frequencyRank, undefined);
   assert.equal(result[0]?.jlptLevel, undefined);
 });
 
-test('annotateTokens clears all annotations for bare くれ auxiliary fragments', () => {
+test('annotateTokens keeps known status while clearing other annotations for bare くれ auxiliary fragments', () => {
   const tokens = [
     makeToken({
       surface: 'くれ',
@@ -1645,13 +1667,50 @@
     { minSentenceWordsForNPlusOne: 1 },
   );
 
-  assert.equal(result[0]?.isKnown, false);
+  assert.equal(result[0]?.isKnown, true);
   assert.equal(result[0]?.isNPlusOneTarget, false);
   assert.equal(result[0]?.frequencyRank, undefined);
   assert.equal(result[0]?.jlptLevel, undefined);
 });
 
-test('annotateTokens clears all annotations for standalone quote particle and auxiliary grammar terms', () => {
+test('annotateTokens keeps known status while clearing other annotations for aru existence verbs', () => {
+  const tokens = [
+    makeToken({
+      surface: '有る',
+      headword: '有る',
+      reading: 'アル',
+      partOfSpeech: PartOfSpeech.verb,
+      pos1: '動詞',
+      pos2: '自立',
+      startPos: 0,
+      endPos: 2,
+      frequencyRank: 8447,
+      isKnown: true,
+      isNPlusOneTarget: true,
+      isNameMatch: true,
+      jlptLevel: 'N5',
+    }),
+  ];
+
+  const result = annotateTokens(
+    tokens,
+    makeDeps({
+      isKnownWord: (text) => text === '有る' || text === 'ある',
+      getJlptLevel: (text) => (text === '有る' || text === 'ある' ? 'N5' : null),
+    }),
+    { minSentenceWordsForNPlusOne: 1 },
+  );
+
+  assert.equal(result[0]?.surface, '有る');
+  assert.equal(result[0]?.headword, '有る');
+  assert.equal(result[0]?.isKnown, true);
+  assert.equal(result[0]?.isNPlusOneTarget, false);
+  assert.equal(result[0]?.isNameMatch, false);
+  assert.equal(result[0]?.frequencyRank, undefined);
+  assert.equal(result[0]?.jlptLevel, undefined);
+});
+
+test('annotateTokens keeps known status while clearing other annotations for standalone quote particle and auxiliary grammar terms', () => {
   const tokens = [
     makeToken({
       surface: 'って',
@@ -1687,14 +1746,14 @@
   );
 
   for (const token of result) {
-    assert.equal(token.isKnown, false, token.surface);
+    assert.equal(token.isKnown, true, token.surface);
     assert.equal(token.isNPlusOneTarget, false, token.surface);
     assert.equal(token.frequencyRank, undefined, token.surface);
     assert.equal(token.jlptLevel, undefined, token.surface);
   }
 });
 
-test('annotateTokens clears all annotations from standalone あ interjections without POS tags', () => {
+test('annotateTokens keeps known status while clearing other annotations from standalone あ interjections without POS tags', () => {
   const tokens = [
     makeToken({
       surface: 'あ',
@@ -1724,7 +1783,7 @@
   assert.equal(result[0]?.surface, 'あ');
   assert.equal(result[0]?.headword, 'あ');
   assert.equal(result[0]?.reading, 'あ');
-  assert.equal(result[0]?.isKnown, false);
+  assert.equal(result[0]?.isKnown, true);
   assert.equal(result[0]?.isNPlusOneTarget, false);
   assert.equal(result[0]?.frequencyRank, undefined);
   assert.equal(result[0]?.jlptLevel, undefined);
@@ -1786,7 +1845,7 @@ test('annotateTokens clears all annotations from expressive subtitle interjectio
   );
 
   for (const token of result.slice(0, 2)) {
-    assert.equal(token.isKnown, false, token.surface);
+    assert.equal(token.isKnown, true, token.surface);
     assert.equal(token.isNPlusOneTarget, false, token.surface);
     assert.equal(token.frequencyRank, undefined, token.surface);
     assert.equal(token.jlptLevel, undefined, token.surface);
diff --git a/src/core/services/tokenizer/annotation-stage.ts b/src/core/services/tokenizer/annotation-stage.ts
index 9e3fa9af..ec529cc6 100644
--- a/src/core/services/tokenizer/annotation-stage.ts
+++ b/src/core/services/tokenizer/annotation-stage.ts
@@ -559,36 +559,6 @@ function computeTokenKnownStatus(
   return normalizedReading !== matchText.trim() && isKnownWord(normalizedReading);
 }
 
-function computeExcludedTokenKnownStatus(
-  token: MergedToken,
-  isKnownWord: (text: string) => boolean,
-): boolean {
-  const normalizedSurface = token.surface.trim();
-  if (!hasKanjiChar(normalizedSurface)) {
-    return false;
-  }
-
-  if (normalizedSurface && isKnownWord(normalizedSurface)) {
-    return true;
-  }
-
-  const normalizedReading = token.reading.trim();
-  if (
-    normalizedReading &&
-    normalizedReading !== normalizedSurface &&
-    isKnownWord(normalizedReading)
-  ) {
-    return true;
-  }
-
-  const normalizedHeadword = token.headword.trim();
-  return (
-    normalizedHeadword.length > 0 &&
-    normalizedHeadword === normalizedSurface &&
-    isKnownWord(normalizedHeadword)
-  );
-}
-
 function filterTokenFrequencyRank(
   token: MergedToken,
   pos1Exclusions: ReadonlySet<string>,
@@ -657,7 +627,9 @@ export function annotateTokens(
       });
       return {
         ...strippedToken,
-        isKnown: nPlusOneEnabled && computeExcludedTokenKnownStatus(token, deps.isKnownWord),
+        isKnown: nPlusOneEnabled
+          ? computeTokenKnownStatus(token, deps.isKnownWord, deps.knownWordMatchMode)
+          : false,
       };
     }
 
diff --git a/src/core/services/tokenizer/subtitle-annotation-filter.ts b/src/core/services/tokenizer/subtitle-annotation-filter.ts
index 72cdd64b..927a0364 100644
--- a/src/core/services/tokenizer/subtitle-annotation-filter.ts
+++ b/src/core/services/tokenizer/subtitle-annotation-filter.ts
@@ -22,6 +22,7 @@ const STANDALONE_GRAMMAR_PARTICLE_PHRASES_SET: ReadonlySet<string> = new Set(
 export const SUBTITLE_ANNOTATION_EXCLUDED_TERMS = new Set([
   'あ',
   'ああ',
+  'ある',
   'あなた',
   'あんた',
   'ええ',
@@ -51,6 +52,8 @@ export const SUBTITLE_ANNOTATION_EXCLUDED_TERMS = new Set([
   '何だ',
   '何も',
   '如何した',
+  '有る',
+  '在る',
   '様',
   '確かに',
   '誰も',
@@ -507,7 +510,6 @@ export function stripSubtitleAnnotationMetadata(
 
   return {
     ...token,
-    isKnown: false,
     isNPlusOneTarget: false,
     isNameMatch: false,
     jlptLevel: undefined,