fix: preserve known highlighting for filtered tokens

2026-05-04 00:41:33 -07:00 · 2026-05-03 22:03:42 -07:00
parent 00a94d6bd1
commit 9bcea2fc5f
6 changed files with 158 additions and 68 deletions
@@ -0,0 +1,53 @@
 ---
 id: TASK-333
 title: Suppress aru subtitle annotations
 status: Done
 assignee: []
 created_date: '2026-05-04 04:39'
 updated_date: '2026-05-04 05:02'
 labels:
  - tokenizer
  - annotations
  - bug
 dependencies: []
 priority: medium
 ---
 ## Description
 <!-- SECTION:DESCRIPTION:BEGIN -->
 Add `ある` / `有る` / `在る` to the subtitle annotation suppression path so `aru` tokens remain hoverable and never receive N+1, JLPT, frequency, or name-match annotation metadata. Known-word highlighting is the one exception: if a filtered `aru` token is known and known highlighting is enabled, it should still render as known.
 <!-- SECTION:DESCRIPTION:END -->
 ## Acceptance Criteria
 <!-- AC:BEGIN -->
 - [x] #1 `ある` and kanji headword/surface variants such as `有る` and `在る` are excluded by the subtitle annotation filter.
 - [x] #2 Annotation stripping clears N+1, JLPT, frequency, and name metadata for `aru` tokens while preserving token hover data.
 - [x] #3 Known-word highlighting still applies to filtered tokens, including `aru`, when known-word lookup marks them known.
 - [x] #4 Regression coverage fails before the fix and passes after.
 <!-- AC:END -->
 ## Implementation Plan
 <!-- SECTION:PLAN:BEGIN -->
 1. Add `ある`/`有る`/`在る` to the shared subtitle annotation hard-exclusion terms.
 2. Preserve/recompute known-word status for filtered tokens while stripping N+1, JLPT, frequency, and name metadata.
 3. Add RED/GREEN unit and tokenizer regression coverage, plus a changelog fragment.
 4. Run targeted tests and full handoff gate.
 <!-- SECTION:PLAN:END -->
 ## Implementation Notes
 <!-- SECTION:NOTES:BEGIN -->
 TDD path: added failing annotation-stage coverage first. Initial implementation made targeted tests pass, then broader tokenizer coverage revealed an older fixture expecting `ある` to remain lexical; updated that integration expectation to the new requested behavior. Follow-up correction: known-word highlighting is the lone annotation exception for filtered tokens, so the strip path now preserves known state and `annotateTokens` recomputes known status for filtered tokens while still clearing N+1/JLPT/frequency/name metadata.
 <!-- SECTION:NOTES:END -->
 ## Final Summary
 <!-- SECTION:FINAL_SUMMARY:BEGIN -->
 Suppressed non-known subtitle annotations for `aru` existence verbs by adding `ある`, `有る`, and `在る` to the shared hard-exclusion list. Corrected the filtered-token path so known-word highlighting still applies whenever known highlighting is enabled; filtered tokens now keep/gain `isKnown` but still lose N+1, JLPT, frequency, and name metadata.
 Added and updated annotation-stage and tokenizer regression coverage for `aru`, particles, helper fragments, interjections, and other filtered known tokens. Added `changes/333-aru-annotation-filter.md`.
 Validation passed: RED failures observed before implementation/correction; `bun test src/core/services/tokenizer/annotation-stage.test.ts`; `bun test src/core/services/tokenizer.test.ts`; `bun run typecheck`; `bun run format:check:src`; `bun run changelog:lint`; `bun run test:fast`; `bun run test:env`; `bun run build`; `bun run test:smoke:dist`.
 <!-- SECTION:FINAL_SUMMARY:END -->
@@ -0,0 +1,4 @@
 type: fixed
 area: tokenizer
 - Suppressed N+1, JLPT, frequency, and name styling for `ある` / `有る` / `在る` existence verbs. Known-word highlighting now still applies to annotation-filtered tokens, including these verbs, when the token is known.
@@ -129,7 +129,7 @@ test('tokenizeSubtitle splits same-line grammar endings before applying annotati
  assert.equal(result.tokens?.[0]?.jlptLevel, 'N5');
  assert.equal(result.tokens?.[0]?.frequencyRank, 40);
  assert.equal(result.tokens?.[1]?.surface, 'です');
-  assert.equal(result.tokens?.[1]?.isKnown, false);
+  assert.equal(result.tokens?.[1]?.isKnown, true);
  assert.equal(result.tokens?.[1]?.isNPlusOneTarget, false);
  assert.equal(result.tokens?.[1]?.frequencyRank, undefined);
  assert.equal(result.tokens?.[1]?.jlptLevel, undefined);
@@ -3893,7 +3893,7 @@ test('tokenizeSubtitle clears all annotations for kana-only demonstrative helper
      {
        surface: 'これで',
        headword: 'これ',
-        isKnown: false,
+        isKnown: true,
        isNPlusOneTarget: false,
        frequencyRank: undefined,
        jlptLevel: undefined,
@@ -4008,7 +4008,7 @@ test('tokenizeSubtitle clears all annotations for explanatory pondering endings'
      {
        surface: 'のかな',
        headword: 'の',
-        isKnown: false,
+        isKnown: true,
        isNPlusOneTarget: false,
        frequencyRank: undefined,
        jlptLevel: undefined,
@@ -4306,7 +4306,7 @@ test('tokenizeSubtitle clears all annotations for explanatory contrast endings',
  );
 });
-test('tokenizeSubtitle clears annotations for ja-nai explanatory endings while preserving lexical content', async () => {
+test('tokenizeSubtitle clears annotations for ja-nai explanatory endings and aru verbs', async () => {
  const result = await tokenizeSubtitle(
    'みたいなのあるじゃないですか',
    makeDepsFromYomitanTokens(
@@ -4322,7 +4322,7 @@ test('tokenizeSubtitle clears annotations for ja-nai explanatory endings while p
          text === 'みたい' ? 320 : text === 'ある' ? 240 : text === 'じゃない' ? 80 : null,
        getJlptLevel: (text) =>
          text === 'みたい' ? 'N4' : text === 'ある' ? 'N5' : text === 'じゃない' ? 'N5' : null,
-        isKnownWord: (text) => text === 'みたい' || text === 'の',
+        isKnownWord: (text) => text === 'みたい' || text === 'の' || text === 'ある',
        getMinSentenceWordsForNPlusOne: () => 1,
        tokenizeWithMecab: async () => [
          {
@@ -4447,10 +4447,10 @@ test('tokenizeSubtitle clears annotations for ja-nai explanatory endings while p
    {
      surface: 'ある',
      headword: 'ある',
-      isKnown: false,
+      isKnown: true,
      isNPlusOneTarget: false,
-      frequencyRank: 240,
+      frequencyRank: undefined,
-      jlptLevel: 'N5',
+      jlptLevel: undefined,
    },
  );
 });
@@ -4492,7 +4492,7 @@ test('tokenizeSubtitle clears annotations for standalone polite copula endings w
    {
      surface: 'ですよ',
      headword: 'です',
-      isKnown: false,
+      isKnown: true,
      isNPlusOneTarget: false,
      frequencyRank: undefined,
      jlptLevel: undefined,
@@ -4819,7 +4819,7 @@ test('tokenizeSubtitle clears annotations for auxiliary inflection fragments whi
    {
      surface: 'れた',
      headword: 'れる',
-      isKnown: false,
+      isKnown: true,
      isNPlusOneTarget: false,
      frequencyRank: undefined,
      jlptLevel: undefined,
@@ -4956,7 +4956,7 @@ test('tokenizeSubtitle clears annotations for te-kureru auxiliary helper spans',
    {
      surface: 'てく',
      headword: 'てく',
-      isKnown: false,
+      isKnown: true,
      isNPlusOneTarget: false,
      frequencyRank: undefined,
      jlptLevel: undefined,
@@ -4967,7 +4967,7 @@ test('tokenizeSubtitle clears annotations for te-kureru auxiliary helper spans',
    {
      surface: 'れた',
      headword: 'れる',
-      isKnown: false,
+      isKnown: true,
      isNPlusOneTarget: false,
      frequencyRank: undefined,
      jlptLevel: undefined,
@@ -608,6 +608,29 @@ test('shouldExcludeTokenFromSubtitleAnnotations excludes bare くれ auxiliary f
  assert.equal(shouldExcludeTokenFromSubtitleAnnotations(token), true);
 });
 test('shouldExcludeTokenFromSubtitleAnnotations excludes aru existence verbs', () => {
  for (const token of [
    makeToken({
      surface: 'ある',
      headword: 'ある',
      reading: 'アル',
      partOfSpeech: PartOfSpeech.verb,
      pos1: '動詞',
      pos2: '自立',
    }),
    makeToken({
      surface: '有る',
      headword: '有る',
      reading: 'アル',
      partOfSpeech: PartOfSpeech.verb,
      pos1: '動詞',
      pos2: '自立',
    }),
  ]) {
    assert.equal(shouldExcludeTokenFromSubtitleAnnotations(token), true, token.surface);
  }
 });
 test('shouldExcludeTokenFromSubtitleAnnotations excludes standalone quote particle and auxiliary grammar terms', () => {
  for (const token of [
    makeToken({
@@ -654,7 +677,7 @@ test('shouldExcludeTokenFromSubtitleAnnotations excludes single-kana surface fra
  }
 });
-test('stripSubtitleAnnotationMetadata keeps token hover data while clearing annotation fields', () => {
+test('stripSubtitleAnnotationMetadata keeps known hover data while clearing non-known annotation fields', () => {
  const token = makeToken({
    surface: 'は',
    headword: 'は',
@@ -670,7 +693,6 @@ test('stripSubtitleAnnotationMetadata keeps token hover data while clearing anno
  assert.deepEqual(stripSubtitleAnnotationMetadata(token), {
    ...token,
    isKnown: false,
    isNPlusOneTarget: false,
    isNameMatch: false,
    jlptLevel: undefined,
@@ -876,8 +898,8 @@ test('annotateTokens N+1 minimum sentence words counts only eligible word tokens
  );
  assert.equal(result[0]?.isKnown, false);
-  assert.equal(result[1]?.isKnown, false);
+  assert.equal(result[1]?.isKnown, true);
-  assert.equal(result[2]?.isKnown, false);
+  assert.equal(result[2]?.isKnown, true);
  assert.equal(result[0]?.isNPlusOneTarget, false);
 });
@@ -1330,13 +1352,13 @@ test('annotateTokens applies one shared exclusion gate across known N+1 frequenc
    { minSentenceWordsForNPlusOne: 1 },
  );
-  assert.equal(result[0]?.isKnown, false);
+  assert.equal(result[0]?.isKnown, true);
  assert.equal(result[0]?.isNPlusOneTarget, false);
  assert.equal(result[0]?.frequencyRank, undefined);
  assert.equal(result[0]?.jlptLevel, undefined);
 });
-test('annotateTokens clears all annotations for kana-only non-independent noun helper merges', () => {
+test('annotateTokens keeps known status while clearing other annotations for kana-only non-independent noun helper merges', () => {
  const tokens = [
    makeToken({
      surface: 'ことに',
@@ -1360,13 +1382,13 @@ test('annotateTokens clears all annotations for kana-only non-independent noun h
    { minSentenceWordsForNPlusOne: 1 },
  );
-  assert.equal(result[0]?.isKnown, false);
+  assert.equal(result[0]?.isKnown, true);
  assert.equal(result[0]?.isNPlusOneTarget, false);
  assert.equal(result[0]?.frequencyRank, undefined);
  assert.equal(result[0]?.jlptLevel, undefined);
 });
-test('annotateTokens clears all annotations for standalone auxiliary inflection fragments', () => {
+test('annotateTokens keeps known status while clearing other annotations for standalone auxiliary inflection fragments', () => {
  const tokens = [
    makeToken({
      surface: 'れる',
@@ -1402,14 +1424,14 @@ test('annotateTokens clears all annotations for standalone auxiliary inflection
  );
  for (const token of result) {
-    assert.equal(token.isKnown, false, token.surface);
+    assert.equal(token.isKnown, true, token.surface);
    assert.equal(token.isNPlusOneTarget, false, token.surface);
    assert.equal(token.frequencyRank, undefined, token.surface);
    assert.equal(token.jlptLevel, undefined, token.surface);
  }
 });
-test('annotateTokens clears all annotations for auxiliary-only te-kureru helper spans', () => {
+test('annotateTokens keeps known status while clearing other annotations for auxiliary-only te-kureru helper spans', () => {
  const tokens = [
    makeToken({
      surface: 'てく',
@@ -1445,7 +1467,7 @@ test('annotateTokens clears all annotations for auxiliary-only te-kureru helper
  );
  for (const token of result) {
-    assert.equal(token.isKnown, false, token.surface);
+    assert.equal(token.isKnown, true, token.surface);
    assert.equal(token.isNPlusOneTarget, false, token.surface);
    assert.equal(token.frequencyRank, undefined, token.surface);
    assert.equal(token.jlptLevel, undefined, token.surface);
@@ -1481,7 +1503,7 @@ test('annotateTokens keeps lexical くれる forms eligible for annotation', ()
  assert.equal(result[0]?.jlptLevel, 'N4');
 });
-test('annotateTokens clears all annotations for standalone して helper fragments', () => {
+test('annotateTokens keeps known status while clearing other annotations for standalone して helper fragments', () => {
  const tokens = [
    makeToken({
      surface: 'してる',
@@ -1505,13 +1527,13 @@ test('annotateTokens clears all annotations for standalone して helper fragmen
    { minSentenceWordsForNPlusOne: 1 },
  );
-  assert.equal(result[0]?.isKnown, false);
+  assert.equal(result[0]?.isKnown, true);
  assert.equal(result[0]?.isNPlusOneTarget, false);
  assert.equal(result[0]?.frequencyRank, undefined);
  assert.equal(result[0]?.jlptLevel, undefined);
 });
-test('annotateTokens clears all annotations for standalone particle fragments without POS tags', () => {
+test('annotateTokens keeps known status while clearing other annotations for standalone particle fragments without POS tags', () => {
  const tokens = [
    makeToken({
      surface: 'と',
@@ -1535,7 +1557,7 @@ test('annotateTokens clears all annotations for standalone particle fragments wi
    { minSentenceWordsForNPlusOne: 1 },
  );
-  assert.equal(result[0]?.isKnown, false);
+  assert.equal(result[0]?.isKnown, true);
  assert.equal(result[0]?.isNPlusOneTarget, false);
  assert.equal(result[0]?.frequencyRank, undefined);
  assert.equal(result[0]?.jlptLevel, undefined);
@@ -1591,7 +1613,7 @@ test('annotateTokens does not mark standalone connective particles as N+1', () =
  assert.equal(result[1]?.jlptLevel, undefined);
 });
-test('annotateTokens clears all annotations for rhetorical もんか grammar particle phrases', () => {
+test('annotateTokens keeps known status while clearing other annotations for rhetorical もんか grammar particle phrases', () => {
  const tokens = [
    makeToken({
      surface: 'もんか',
@@ -1615,13 +1637,13 @@ test('annotateTokens clears all annotations for rhetorical もんか grammar par
    { minSentenceWordsForNPlusOne: 1 },
  );
-  assert.equal(result[0]?.isKnown, false);
+  assert.equal(result[0]?.isKnown, true);
  assert.equal(result[0]?.isNPlusOneTarget, false);
  assert.equal(result[0]?.frequencyRank, undefined);
  assert.equal(result[0]?.jlptLevel, undefined);
 });
-test('annotateTokens clears all annotations for bare くれ auxiliary fragments', () => {
+test('annotateTokens keeps known status while clearing other annotations for bare くれ auxiliary fragments', () => {
  const tokens = [
    makeToken({
      surface: 'くれ',
@@ -1645,13 +1667,50 @@ test('annotateTokens clears all annotations for bare くれ auxiliary fragments'
    { minSentenceWordsForNPlusOne: 1 },
  );
-  assert.equal(result[0]?.isKnown, false);
+  assert.equal(result[0]?.isKnown, true);
  assert.equal(result[0]?.isNPlusOneTarget, false);
  assert.equal(result[0]?.frequencyRank, undefined);
  assert.equal(result[0]?.jlptLevel, undefined);
 });
-test('annotateTokens clears all annotations for standalone quote particle and auxiliary grammar terms', () => {
+test('annotateTokens keeps known status while clearing other annotations for aru existence verbs', () => {
  const tokens = [
    makeToken({
      surface: '有る',
      headword: '有る',
      reading: 'アル',
      partOfSpeech: PartOfSpeech.verb,
      pos1: '動詞',
      pos2: '自立',
      startPos: 0,
      endPos: 2,
      frequencyRank: 8447,
      isKnown: true,
      isNPlusOneTarget: true,
      isNameMatch: true,
      jlptLevel: 'N5',
    }),
  ];
  const result = annotateTokens(
    tokens,
    makeDeps({
      isKnownWord: (text) => text === '有る' || text === 'ある',
      getJlptLevel: (text) => (text === '有る' || text === 'ある' ? 'N5' : null),
    }),
    { minSentenceWordsForNPlusOne: 1 },
  );
  assert.equal(result[0]?.surface, '有る');
  assert.equal(result[0]?.headword, '有る');
  assert.equal(result[0]?.isKnown, true);
  assert.equal(result[0]?.isNPlusOneTarget, false);
  assert.equal(result[0]?.isNameMatch, false);
  assert.equal(result[0]?.frequencyRank, undefined);
  assert.equal(result[0]?.jlptLevel, undefined);
 });
 test('annotateTokens keeps known status while clearing other annotations for standalone quote particle and auxiliary grammar terms', () => {
  const tokens = [
    makeToken({
      surface: 'って',
@@ -1687,14 +1746,14 @@ test('annotateTokens clears all annotations for standalone quote particle and au
  );
  for (const token of result) {
-    assert.equal(token.isKnown, false, token.surface);
+    assert.equal(token.isKnown, true, token.surface);
    assert.equal(token.isNPlusOneTarget, false, token.surface);
    assert.equal(token.frequencyRank, undefined, token.surface);
    assert.equal(token.jlptLevel, undefined, token.surface);
  }
 });
-test('annotateTokens clears all annotations from standalone あ interjections without POS tags', () => {
+test('annotateTokens keeps known status while clearing other annotations from standalone あ interjections without POS tags', () => {
  const tokens = [
    makeToken({
      surface: 'あ',
@@ -1724,7 +1783,7 @@ test('annotateTokens clears all annotations from standalone あ interjections wi
  assert.equal(result[0]?.surface, 'あ');
  assert.equal(result[0]?.headword, 'あ');
  assert.equal(result[0]?.reading, 'あ');
-  assert.equal(result[0]?.isKnown, false);
+  assert.equal(result[0]?.isKnown, true);
  assert.equal(result[0]?.isNPlusOneTarget, false);
  assert.equal(result[0]?.frequencyRank, undefined);
  assert.equal(result[0]?.jlptLevel, undefined);
@@ -1786,7 +1845,7 @@ test('annotateTokens clears all annotations from expressive subtitle interjectio
  );
  for (const token of result.slice(0, 2)) {
-    assert.equal(token.isKnown, false, token.surface);
+    assert.equal(token.isKnown, true, token.surface);
    assert.equal(token.isNPlusOneTarget, false, token.surface);
    assert.equal(token.frequencyRank, undefined, token.surface);
    assert.equal(token.jlptLevel, undefined, token.surface);
@@ -559,36 +559,6 @@ function computeTokenKnownStatus(
  return normalizedReading !== matchText.trim() && isKnownWord(normalizedReading);
 }
 function computeExcludedTokenKnownStatus(
  token: MergedToken,
  isKnownWord: (text: string) => boolean,
 ): boolean {
  const normalizedSurface = token.surface.trim();
  if (!hasKanjiChar(normalizedSurface)) {
    return false;
  }
  if (normalizedSurface && isKnownWord(normalizedSurface)) {
    return true;
  }
  const normalizedReading = token.reading.trim();
  if (
    normalizedReading &&
    normalizedReading !== normalizedSurface &&
    isKnownWord(normalizedReading)
  ) {
    return true;
  }
  const normalizedHeadword = token.headword.trim();
  return (
    normalizedHeadword.length > 0 &&
    normalizedHeadword === normalizedSurface &&
    isKnownWord(normalizedHeadword)
  );
 }
 function filterTokenFrequencyRank(
  token: MergedToken,
  pos1Exclusions: ReadonlySet<string>,
@@ -657,7 +627,9 @@ export function annotateTokens(
      });
      return {
        ...strippedToken,
-        isKnown: nPlusOneEnabled && computeExcludedTokenKnownStatus(token, deps.isKnownWord),
+        isKnown: nPlusOneEnabled
          ? computeTokenKnownStatus(token, deps.isKnownWord, deps.knownWordMatchMode)
          : false,
      };
    }
@@ -22,6 +22,7 @@ const STANDALONE_GRAMMAR_PARTICLE_PHRASES_SET: ReadonlySet<string> = new Set(
 export const SUBTITLE_ANNOTATION_EXCLUDED_TERMS = new Set([
  'あ',
  'ああ',
  'ある',
  'あなた',
  'あんた',
  'ええ',
@@ -51,6 +52,8 @@ export const SUBTITLE_ANNOTATION_EXCLUDED_TERMS = new Set([
  '何だ',
  '何も',
  '如何した',
  '有る',
  '在る',
  '様',
  '確かに',
  '誰も',
@@ -507,7 +510,6 @@ export function stripSubtitleAnnotationMetadata(
  return {
    ...token,
    isKnown: false,
    isNPlusOneTarget: false,
    isNameMatch: false,
    jlptLevel: undefined,