From 9bcea2fc5f6a4f8f6fa19d860857f2616aaa934c Mon Sep 17 00:00:00 2001
From: sudacode
Date: Sun, 3 May 2026 22:03:42 -0700
Subject: [PATCH] fix: preserve known highlighting for filtered tokens

---
 ...333 - Suppress-aru-subtitle-annotations.md |  53 +++++++++
 changes/333-aru-annotation-filter.md          |   4 +
 src/core/services/tokenizer.test.ts           |  24 ++--
 .../tokenizer/annotation-stage.test.ts        | 107 ++++++++++++++----
 .../services/tokenizer/annotation-stage.ts    |  34 +-----
 .../tokenizer/subtitle-annotation-filter.ts   |   4 +-
 6 files changed, 158 insertions(+), 68 deletions(-)
 create mode 100644 backlog/tasks/task-333 - Suppress-aru-subtitle-annotations.md
 create mode 100644 changes/333-aru-annotation-filter.md

diff --git a/backlog/tasks/task-333 - Suppress-aru-subtitle-annotations.md b/backlog/tasks/task-333 - Suppress-aru-subtitle-annotations.md
new file mode 100644
index 00000000..293fc8f1
--- /dev/null
+++ b/backlog/tasks/task-333 - Suppress-aru-subtitle-annotations.md
@@ -0,0 +1,53 @@
+---
+id: TASK-333
+title: Suppress aru subtitle annotations
+status: Done
+assignee: []
+created_date: '2026-05-04 04:39'
+updated_date: '2026-05-04 05:02'
+labels:
+  - tokenizer
+  - annotations
+  - bug
+dependencies: []
+priority: medium
+---
+
+## Description
+
+
+Add `ある` / `有る` to the subtitle annotation suppression path so `aru` tokens remain hoverable and never receive N+1, JLPT, frequency, or name-match annotation metadata. Known-word highlighting is the one exception: if a filtered `aru` token is known and known-word highlighting is enabled, it should still render as known.
+
+
+## Acceptance Criteria
+
+- [x] #1 `ある` and kanji headword/surface variants such as `有る` are excluded by the subtitle annotation filter.
+- [x] #2 Annotation stripping clears N+1, JLPT, frequency, and name metadata for `aru` tokens while preserving token hover data.
+- [x] #3 Known-word highlighting still applies to filtered tokens, including `aru`, when known-word lookup marks them known.
+- [x] #4 Regression coverage fails before the fix and passes after.
+
+
+## Implementation Plan
+
+
+1. Add `ある`/`有る`/`在る` to the shared subtitle annotation hard-exclusion terms.
+2. Preserve/recompute known-word status for filtered tokens while stripping N+1, JLPT, frequency, and name metadata.
+3. Add RED/GREEN unit and tokenizer regression coverage, plus a changelog fragment.
+4. Run targeted tests and the full handoff gate.
+
+
+## Implementation Notes
+
+
+TDD path: added failing annotation-stage coverage first. The initial implementation made the targeted tests pass; broader tokenizer coverage then revealed an older fixture that expected `ある` to remain lexical, so that integration expectation was updated to match the newly requested behavior. Follow-up correction: known-word highlighting is the lone annotation exception for filtered tokens, so the strip path now preserves known state and `annotateTokens` recomputes known status for filtered tokens while still clearing N+1/JLPT/frequency/name metadata.
+
+
+## Final Summary
+
+
+Suppressed non-known subtitle annotations for `aru` existence verbs by adding `ある`, `有る`, and `在る` to the shared hard-exclusion list. Corrected the filtered-token path so known-word highlighting still applies whenever known highlighting is enabled; filtered tokens now retain (or newly acquire) `isKnown` but still lose N+1, JLPT, frequency, and name metadata.
+
+Added and updated annotation-stage and tokenizer regression coverage for `aru`, particles, helper fragments, interjections, and other filtered known tokens. Added `changes/333-aru-annotation-filter.md`.
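+
+A minimal sketch of the corrected branch (simplified and illustrative: `annotateFilteredToken` and `FilterDeps` are hypothetical names for this note, while `MergedToken`, `stripSubtitleAnnotationMetadata`, and `computeTokenKnownStatus` are the real helpers in `annotation-stage.ts`):
+
+```ts
+// Filtered tokens lose every annotation except known-word status, which is
+// recomputed with the same helper used for ordinary tokens.
+function annotateFilteredToken(
+  token: MergedToken,
+  deps: FilterDeps,
+  nPlusOneEnabled: boolean,
+): MergedToken {
+  // Clears N+1, name-match, JLPT, and frequency metadata, keeps hover data.
+  const stripped = stripSubtitleAnnotationMetadata(token);
+  return {
+    ...stripped,
+    isKnown: nPlusOneEnabled
+      ? computeTokenKnownStatus(token, deps.isKnownWord, deps.knownWordMatchMode)
+      : false,
+  };
+}
+```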
+
+Validation passed: RED failures observed before implementation/correction; `bun test src/core/services/tokenizer/annotation-stage.test.ts`; `bun test src/core/services/tokenizer.test.ts`; `bun run typecheck`; `bun run format:check:src`; `bun run changelog:lint`; `bun run test:fast`; `bun run test:env`; `bun run build`; `bun run test:smoke:dist`.
+
diff --git a/changes/333-aru-annotation-filter.md b/changes/333-aru-annotation-filter.md
new file mode 100644
index 00000000..7f3425bc
--- /dev/null
+++ b/changes/333-aru-annotation-filter.md
@@ -0,0 +1,4 @@
+type: fixed
+area: tokenizer
+
+- Suppressed N+1, JLPT, frequency, and name styling for `ある` / `有る` existence verbs while still allowing known-word highlighting.
diff --git a/src/core/services/tokenizer.test.ts b/src/core/services/tokenizer.test.ts
index b025eb1b..ae49a85e 100644
--- a/src/core/services/tokenizer.test.ts
+++ b/src/core/services/tokenizer.test.ts
@@ -129,7 +129,7 @@ test('tokenizeSubtitle splits same-line grammar endings before applying annotati
   assert.equal(result.tokens?.[0]?.jlptLevel, 'N5');
   assert.equal(result.tokens?.[0]?.frequencyRank, 40);
   assert.equal(result.tokens?.[1]?.surface, 'です');
-  assert.equal(result.tokens?.[1]?.isKnown, false);
+  assert.equal(result.tokens?.[1]?.isKnown, true);
   assert.equal(result.tokens?.[1]?.isNPlusOneTarget, false);
   assert.equal(result.tokens?.[1]?.frequencyRank, undefined);
   assert.equal(result.tokens?.[1]?.jlptLevel, undefined);
@@ -3893,7 +3893,7 @@ test('tokenizeSubtitle clears all annotations for kana-only demonstrative helper
     {
       surface: 'これで',
       headword: 'これ',
-      isKnown: false,
+      isKnown: true,
       isNPlusOneTarget: false,
       frequencyRank: undefined,
       jlptLevel: undefined,
@@ -4008,7 +4008,7 @@ test('tokenizeSubtitle clears all annotations for explanatory pondering endings'
     {
       surface: 'のかな',
       headword: 'の',
-      isKnown: false,
+      isKnown: true,
       isNPlusOneTarget: false,
       frequencyRank: undefined,
       jlptLevel: undefined,
@@ -4306,7 +4306,7 @@ test('tokenizeSubtitle clears all annotations for explanatory contrast endings',
   );
 });
 
-test('tokenizeSubtitle clears annotations for ja-nai explanatory endings while preserving lexical content', async () => {
+test('tokenizeSubtitle clears annotations for ja-nai explanatory endings and aru verbs', async () => {
   const result = await tokenizeSubtitle(
     'みたいなのあるじゃないですか',
     makeDepsFromYomitanTokens(
@@ -4322,7 +4322,7 @@
         text === 'みたい' ? 320 : text === 'ある' ? 240 : text === 'じゃない' ? 80 : null,
       getJlptLevel: (text) =>
         text === 'みたい' ? 'N4' : text === 'ある' ? 'N5' : text === 'じゃない' ? 'N5' : null,
-      isKnownWord: (text) => text === 'みたい' || text === 'の',
+      isKnownWord: (text) => text === 'みたい' || text === 'の' || text === 'ある',
       getMinSentenceWordsForNPlusOne: () => 1,
       tokenizeWithMecab: async () => [
         {
@@ -4447,10 +4447,10 @@ test('tokenizeSubtitle clears annotations for ja-nai explanatory endings while p
     {
       surface: 'ある',
       headword: 'ある',
-      isKnown: false,
+      isKnown: true,
       isNPlusOneTarget: false,
-      frequencyRank: 240,
-      jlptLevel: 'N5',
+      frequencyRank: undefined,
+      jlptLevel: undefined,
     },
   );
 });
@@ -4492,7 +4492,7 @@ test('tokenizeSubtitle clears annotations for standalone polite copula endings w
     {
       surface: 'ですよ',
       headword: 'です',
-      isKnown: false,
+      isKnown: true,
       isNPlusOneTarget: false,
       frequencyRank: undefined,
       jlptLevel: undefined,
@@ -4819,7 +4819,7 @@ test('tokenizeSubtitle clears annotations for auxiliary inflection fragments whi
     {
       surface: 'れた',
       headword: 'れる',
-      isKnown: false,
+      isKnown: true,
       isNPlusOneTarget: false,
       frequencyRank: undefined,
       jlptLevel: undefined,
@@ -4956,7 +4956,7 @@ test('tokenizeSubtitle clears annotations for te-kureru auxiliary helper spans',
     {
       surface: 'てく',
       headword: 'てく',
-      isKnown: false,
+      isKnown: true,
       isNPlusOneTarget: false,
       frequencyRank: undefined,
       jlptLevel: undefined,
@@ -4967,7 +4967,7 @@
     {
       surface: 'れた',
       headword: 'れる',
-      isKnown: false,
+      isKnown: true,
       isNPlusOneTarget: false,
       frequencyRank: undefined,
       jlptLevel: undefined,
diff --git a/src/core/services/tokenizer/annotation-stage.test.ts b/src/core/services/tokenizer/annotation-stage.test.ts
index 68d8c996..e4ac9d69 100644
--- a/src/core/services/tokenizer/annotation-stage.test.ts
+++ b/src/core/services/tokenizer/annotation-stage.test.ts
@@ -608,6 +608,29 @@ test('shouldExcludeTokenFromSubtitleAnnotations excludes bare くれ auxiliary f
   assert.equal(shouldExcludeTokenFromSubtitleAnnotations(token), true);
 });
 
+test('shouldExcludeTokenFromSubtitleAnnotations excludes aru existence verbs', () => {
+  for (const token of [
+    makeToken({
+      surface: 'ある',
+      headword: 'ある',
+      reading: 'アル',
+      partOfSpeech: PartOfSpeech.verb,
+      pos1: '動詞',
+      pos2: '自立',
+    }),
+    makeToken({
+      surface: '有る',
+      headword: '有る',
+      reading: 'アル',
+      partOfSpeech: PartOfSpeech.verb,
+      pos1: '動詞',
+      pos2: '自立',
+    }),
+  ]) {
+    assert.equal(shouldExcludeTokenFromSubtitleAnnotations(token), true, token.surface);
+  }
+});
+
 test('shouldExcludeTokenFromSubtitleAnnotations excludes standalone quote particle and auxiliary grammar terms', () => {
   for (const token of [
     makeToken({
@@ -654,7 +677,7 @@ test('shouldExcludeTokenFromSubtitleAnnotations excludes single-kana surface fra
   }
 });
 
-test('stripSubtitleAnnotationMetadata keeps token hover data while clearing annotation fields', () => {
+test('stripSubtitleAnnotationMetadata keeps known hover data while clearing non-known annotation fields', () => {
   const token = makeToken({
     surface: 'は',
     headword: 'は',
@@ -670,7 +693,6 @@
 
   assert.deepEqual(stripSubtitleAnnotationMetadata(token), {
     ...token,
-    isKnown: false,
     isNPlusOneTarget: false,
     isNameMatch: false,
     jlptLevel: undefined,
@@ -876,8 +898,8 @@ test('annotateTokens N+1 minimum sentence words counts only eligible word tokens
   );
 
   assert.equal(result[0]?.isKnown, false);
-  assert.equal(result[1]?.isKnown, false);
-  assert.equal(result[2]?.isKnown, false);
+  assert.equal(result[1]?.isKnown, true);
+  assert.equal(result[2]?.isKnown, true);
 
   assert.equal(result[0]?.isNPlusOneTarget, false);
 });
@@ -1330,13 +1352,13 @@ test('annotateTokens applies one shared exclusion gate across known N+1 frequenc
     { minSentenceWordsForNPlusOne: 1 },
   );
 
-  assert.equal(result[0]?.isKnown, false);
+  assert.equal(result[0]?.isKnown, true);
   assert.equal(result[0]?.isNPlusOneTarget, false);
   assert.equal(result[0]?.frequencyRank, undefined);
   assert.equal(result[0]?.jlptLevel, undefined);
 });
 
-test('annotateTokens clears all annotations for kana-only non-independent noun helper merges', () => {
+test('annotateTokens keeps known status while clearing other annotations for kana-only non-independent noun helper merges', () => {
   const tokens = [
     makeToken({
       surface: 'ことに',
@@ -1360,13 +1382,13 @@
     { minSentenceWordsForNPlusOne: 1 },
   );
 
-  assert.equal(result[0]?.isKnown, false);
+  assert.equal(result[0]?.isKnown, true);
   assert.equal(result[0]?.isNPlusOneTarget, false);
   assert.equal(result[0]?.frequencyRank, undefined);
   assert.equal(result[0]?.jlptLevel, undefined);
 });
 
-test('annotateTokens clears all annotations for standalone auxiliary inflection fragments', () => {
+test('annotateTokens keeps known status while clearing other annotations for standalone auxiliary inflection fragments', () => {
   const tokens = [
     makeToken({
       surface: 'れる',
@@ -1402,14 +1424,14 @@
   );
 
   for (const token of result) {
-    assert.equal(token.isKnown, false, token.surface);
+    assert.equal(token.isKnown, true, token.surface);
     assert.equal(token.isNPlusOneTarget, false, token.surface);
     assert.equal(token.frequencyRank, undefined, token.surface);
     assert.equal(token.jlptLevel, undefined, token.surface);
   }
 });
 
-test('annotateTokens clears all annotations for auxiliary-only te-kureru helper spans', () => {
+test('annotateTokens keeps known status while clearing other annotations for auxiliary-only te-kureru helper spans', () => {
   const tokens = [
     makeToken({
       surface: 'てく',
@@ -1445,7 +1467,7 @@
   );
 
   for (const token of result) {
-    assert.equal(token.isKnown, false, token.surface);
+    assert.equal(token.isKnown, true, token.surface);
     assert.equal(token.isNPlusOneTarget, false, token.surface);
     assert.equal(token.frequencyRank, undefined, token.surface);
     assert.equal(token.jlptLevel, undefined, token.surface);
@@ -1481,7 +1503,7 @@ test('annotateTokens keeps lexical くれる forms eligible for annotation', ()
   assert.equal(result[0]?.jlptLevel, 'N4');
 });
 
-test('annotateTokens clears all annotations for standalone して helper fragments', () => {
+test('annotateTokens keeps known status while clearing other annotations for standalone して helper fragments', () => {
   const tokens = [
     makeToken({
       surface: 'してる',
@@ -1505,13 +1527,13 @@
     { minSentenceWordsForNPlusOne: 1 },
   );
 
-  assert.equal(result[0]?.isKnown, false);
+  assert.equal(result[0]?.isKnown, true);
   assert.equal(result[0]?.isNPlusOneTarget, false);
   assert.equal(result[0]?.frequencyRank, undefined);
   assert.equal(result[0]?.jlptLevel, undefined);
 });
 
-test('annotateTokens clears all annotations for standalone particle fragments without POS tags', () => {
+test('annotateTokens keeps known status while clearing other annotations for standalone particle fragments without POS tags', () => {
   const tokens = [
     makeToken({
       surface: 'と',
@@ -1535,7 +1557,7 @@ test('annotateTokens clears all annotations for standalone particle fragments wi
     { minSentenceWordsForNPlusOne: 1 },
   );
 
-  assert.equal(result[0]?.isKnown, false);
+  assert.equal(result[0]?.isKnown, true);
   assert.equal(result[0]?.isNPlusOneTarget, false);
   assert.equal(result[0]?.frequencyRank, undefined);
   assert.equal(result[0]?.jlptLevel, undefined);
@@ -1591,7 +1613,7 @@ test('annotateTokens does not mark standalone connective particles as N+1', () =
   assert.equal(result[1]?.jlptLevel, undefined);
 });
 
-test('annotateTokens clears all annotations for rhetorical もんか grammar particle phrases', () => {
+test('annotateTokens keeps known status while clearing other annotations for rhetorical もんか grammar particle phrases', () => {
   const tokens = [
     makeToken({
       surface: 'もんか',
@@ -1615,13 +1637,13 @@
     { minSentenceWordsForNPlusOne: 1 },
   );
 
-  assert.equal(result[0]?.isKnown, false);
+  assert.equal(result[0]?.isKnown, true);
   assert.equal(result[0]?.isNPlusOneTarget, false);
   assert.equal(result[0]?.frequencyRank, undefined);
   assert.equal(result[0]?.jlptLevel, undefined);
 });
 
-test('annotateTokens clears all annotations for bare くれ auxiliary fragments', () => {
+test('annotateTokens keeps known status while clearing other annotations for bare くれ auxiliary fragments', () => {
   const tokens = [
     makeToken({
       surface: 'くれ',
@@ -1645,13 +1667,50 @@
     { minSentenceWordsForNPlusOne: 1 },
   );
 
-  assert.equal(result[0]?.isKnown, false);
+  assert.equal(result[0]?.isKnown, true);
   assert.equal(result[0]?.isNPlusOneTarget, false);
   assert.equal(result[0]?.frequencyRank, undefined);
   assert.equal(result[0]?.jlptLevel, undefined);
 });
 
-test('annotateTokens clears all annotations for standalone quote particle and auxiliary grammar terms', () => {
+test('annotateTokens keeps known status while clearing other annotations for aru existence verbs', () => {
+  const tokens = [
+    makeToken({
+      surface: '有る',
+      headword: '有る',
+      reading: 'アル',
+      partOfSpeech: PartOfSpeech.verb,
+      pos1: '動詞',
+      pos2: '自立',
+      startPos: 0,
+      endPos: 2,
+      frequencyRank: 8447,
+      isKnown: true,
+      isNPlusOneTarget: true,
+      isNameMatch: true,
+      jlptLevel: 'N5',
+    }),
+  ];
+
+  const result = annotateTokens(
+    tokens,
+    makeDeps({
+      isKnownWord: (text) => text === '有る' || text === 'ある',
+      getJlptLevel: (text) => (text === '有る' || text === 'ある' ? 'N5' : null),
+    }),
+    { minSentenceWordsForNPlusOne: 1 },
+  );
+
+  assert.equal(result[0]?.surface, '有る');
+  assert.equal(result[0]?.headword, '有る');
+  assert.equal(result[0]?.isKnown, true);
+  assert.equal(result[0]?.isNPlusOneTarget, false);
+  assert.equal(result[0]?.isNameMatch, false);
+  assert.equal(result[0]?.frequencyRank, undefined);
+  assert.equal(result[0]?.jlptLevel, undefined);
+});
+
+test('annotateTokens keeps known status while clearing other annotations for standalone quote particle and auxiliary grammar terms', () => {
   const tokens = [
     makeToken({
       surface: 'って',
@@ -1687,14 +1746,14 @@
   );
 
   for (const token of result) {
-    assert.equal(token.isKnown, false, token.surface);
+    assert.equal(token.isKnown, true, token.surface);
     assert.equal(token.isNPlusOneTarget, false, token.surface);
     assert.equal(token.frequencyRank, undefined, token.surface);
     assert.equal(token.jlptLevel, undefined, token.surface);
   }
 });
 
-test('annotateTokens clears all annotations from standalone あ interjections without POS tags', () => {
+test('annotateTokens keeps known status while clearing other annotations from standalone あ interjections without POS tags', () => {
   const tokens = [
     makeToken({
       surface: 'あ',
@@ -1724,7 +1783,7 @@
   assert.equal(result[0]?.surface, 'あ');
   assert.equal(result[0]?.headword, 'あ');
   assert.equal(result[0]?.reading, 'あ');
-  assert.equal(result[0]?.isKnown, false);
+  assert.equal(result[0]?.isKnown, true);
   assert.equal(result[0]?.isNPlusOneTarget, false);
   assert.equal(result[0]?.frequencyRank, undefined);
   assert.equal(result[0]?.jlptLevel, undefined);
@@ -1786,7 +1845,7 @@ test('annotateTokens clears all annotations from expressive subtitle interjectio
   );
 
   for (const token of result.slice(0, 2)) {
-    assert.equal(token.isKnown, false, token.surface);
+    assert.equal(token.isKnown, true, token.surface);
     assert.equal(token.isNPlusOneTarget, false, token.surface);
     assert.equal(token.frequencyRank, undefined, token.surface);
     assert.equal(token.jlptLevel, undefined, token.surface);
diff --git a/src/core/services/tokenizer/annotation-stage.ts b/src/core/services/tokenizer/annotation-stage.ts
index 9e3fa9af..ec529cc6 100644
--- a/src/core/services/tokenizer/annotation-stage.ts
+++ b/src/core/services/tokenizer/annotation-stage.ts
@@ -559,36 +559,6 @@ function computeTokenKnownStatus(
   return normalizedReading !== matchText.trim() && isKnownWord(normalizedReading);
 }
 
-function computeExcludedTokenKnownStatus(
-  token: MergedToken,
-  isKnownWord: (text: string) => boolean,
-): boolean {
-  const normalizedSurface = token.surface.trim();
-  if (!hasKanjiChar(normalizedSurface)) {
-    return false;
-  }
-
-  if (normalizedSurface && isKnownWord(normalizedSurface)) {
-    return true;
-  }
-
-  const normalizedReading = token.reading.trim();
-  if (
-    normalizedReading &&
-    normalizedReading !== normalizedSurface &&
-    isKnownWord(normalizedReading)
-  ) {
-    return true;
-  }
-
-  const normalizedHeadword = token.headword.trim();
-  return (
-    normalizedHeadword.length > 0 &&
-    normalizedHeadword === normalizedSurface &&
-    isKnownWord(normalizedHeadword)
-  );
-}
-
 function filterTokenFrequencyRank(
   token: MergedToken,
   pos1Exclusions: ReadonlySet<string>,
@@ -657,7 +627,9 @@ export function annotateTokens(
       });
       return {
         ...strippedToken,
-        isKnown: nPlusOneEnabled && computeExcludedTokenKnownStatus(token, deps.isKnownWord),
+        isKnown: nPlusOneEnabled
+          ? computeTokenKnownStatus(token, deps.isKnownWord, deps.knownWordMatchMode)
+          : false,
       };
     }
 
diff --git a/src/core/services/tokenizer/subtitle-annotation-filter.ts b/src/core/services/tokenizer/subtitle-annotation-filter.ts
index 72cdd64b..927a0364 100644
--- a/src/core/services/tokenizer/subtitle-annotation-filter.ts
+++ b/src/core/services/tokenizer/subtitle-annotation-filter.ts
@@ -22,6 +22,7 @@ const STANDALONE_GRAMMAR_PARTICLE_PHRASES_SET: ReadonlySet<string> = new Set(
 export const SUBTITLE_ANNOTATION_EXCLUDED_TERMS = new Set([
   'あ',
   'ああ',
+  'ある',
   'あなた',
   'あんた',
   'ええ',
@@ -51,6 +52,8 @@ export const SUBTITLE_ANNOTATION_EXCLUDED_TERMS = new Set([
   '何だ',
   '何も',
   '如何した',
+  '有る',
+  '在る',
   '様',
   '確かに',
   '誰も',
@@ -507,7 +510,6 @@ export function stripSubtitleAnnotationMetadata(
 
   return {
     ...token,
-    isKnown: false,
     isNPlusOneTarget: false,
     isNameMatch: false,
     jlptLevel: undefined,