mirror of
https://github.com/ksyasuda/SubMiner.git
synced 2026-05-04 00:41:33 -07:00
fix: suppress sigh interjection annotations
This commit is contained in:
+58
@@ -0,0 +1,58 @@
---
id: TASK-319
title: Suppress annotations for expressive interjection subtitles
status: Done
assignee:
  - Codex
created_date: '2026-05-03 03:18'
updated_date: '2026-05-03 03:20'
labels:
  - bug
  - subtitle-annotations
dependencies: []
references:
  - src/core/services/tokenizer/subtitle-annotation-filter.ts
  - src/core/services/tokenizer/annotation-stage.test.ts
priority: medium
---
## Description

<!-- SECTION:DESCRIPTION:BEGIN -->
Interjection-only subtitle tokens such as ハァ and はっ should remain hoverable as tokens but must not receive known, N+1, frequency, or JLPT annotation styling. Current behavior can still annotate these forms when dictionary/POS metadata does not trip the existing exclusion gate.
<!-- SECTION:DESCRIPTION:END -->

## Acceptance Criteria

<!-- AC:BEGIN -->
- [x] #1 Standalone ハァ/はっ-style interjection tokens have annotation metadata cleared even when dictionary metadata exists.
- [x] #2 Filtering remains scoped so content-bearing non-interjection tokens still receive annotations.
- [x] #3 Regression coverage exercises the reported subtitle pattern: ハァ… / (ガーフィール)はっ!
<!-- AC:END -->

## Implementation Plan

<!-- SECTION:PLAN:BEGIN -->
1. Add failing regression coverage around annotation filtering for the reported interjection forms, including katakana ハァ and small-tsu はっ with surrounding subtitle punctuation/name text.
2. Tighten the shared subtitle annotation exclusion gate so expressive kana interjections clear annotation metadata without relying only on MeCab pos1=感動詞.
3. Run the focused tokenizer/annotation tests, then update acceptance criteria and notes.
<!-- SECTION:PLAN:END -->

## Implementation Notes

<!-- SECTION:NOTES:BEGIN -->
Implemented via shared subtitle annotation exclusion term normalization: added はぁ so katakana ハァ normalizes into the existing term gate. Existing small-tsu kana SFX logic already covers はっ. Regression confirms both reported forms clear known/N+1/frequency/JLPT metadata while a normal noun keeps frequency annotation.
<!-- SECTION:NOTES:END -->

## Final Summary

<!-- SECTION:FINAL_SUMMARY:BEGIN -->
Summary:

- Added a regression for the reported subtitle pattern ハァ… / (ガーフィール)はっ!, with annotation metadata present on both interjection tokens.
- Extended the shared subtitle annotation exclusion term set so ハァ normalizes to はぁ and is stripped of annotation styling. Existing はっ handling remains covered by small-tsu kana SFX filtering.
- Added a change fragment for the user-visible bug fix.

Verification:

- bun test src/core/services/tokenizer/annotation-stage.test.ts
- bun test src/core/services/tokenizer/annotation-stage.test.ts src/core/services/tokenizer.test.ts src/renderer/subtitle-render.test.ts
- bun run typecheck
<!-- SECTION:FINAL_SUMMARY:END -->
|
||||
@@ -0,0 +1 @@
fix: suppress annotations for ハァ-style interjection subtitles
@@ -1691,3 +1691,67 @@ test('annotateTokens clears all annotations from standalone あ interjections wi
|
||||
assert.equal(result[0]?.frequencyRank, undefined);
|
||||
assert.equal(result[0]?.jlptLevel, undefined);
|
||||
});
|
||||
|
||||
// Regression for TASK-319: expressive kana interjection subtitles (ハァ / はっ)
// must have all annotation metadata cleared — known, N+1, frequency, and JLPT —
// even when the tokenizer supplies no POS tags (pos1/pos2 empty) and the
// dictionary reports them as known words, while ordinary content tokens in the
// same subtitle keep their annotations.
test('annotateTokens clears all annotations from expressive subtitle interjections without POS tags', () => {
  const tokens = [
    // Katakana sigh ハァ, deliberately pre-populated with every annotation so
    // the test proves the exclusion gate clears them rather than never set them.
    makeToken({
      surface: 'ハァ',
      headword: 'ハァ',
      reading: 'ハァ',
      partOfSpeech: PartOfSpeech.other,
      pos1: '',
      pos2: '',
      startPos: 0,
      endPos: 2,
      isKnown: true,
      isNPlusOneTarget: true,
      frequencyRank: 3007,
      jlptLevel: 'N5',
    }),
    // Small-tsu interjection はっ, likewise fully annotated up front.
    makeToken({
      surface: 'はっ',
      headword: 'はっ',
      reading: 'ハッ',
      partOfSpeech: PartOfSpeech.other,
      pos1: '',
      pos2: '',
      startPos: 10,
      endPos: 12,
      isKnown: true,
      isNPlusOneTarget: true,
      frequencyRank: 3007,
      jlptLevel: 'N5',
    }),
    // Control token: a content-bearing noun that must NOT be stripped.
    makeToken({
      surface: '猫',
      headword: '猫',
      reading: 'ネコ',
      partOfSpeech: PartOfSpeech.noun,
      pos1: '名詞',
      pos2: '一般',
      startPos: 13,
      endPos: 14,
      frequencyRank: 11,
    }),
  ];

  const result = annotateTokens(
    tokens,
    // Dictionary deps intentionally report both interjections as known N5 words
    // so the filter must override dictionary metadata, not merely inherit gaps.
    makeDeps({
      isKnownWord: (text) => text === 'ハァ' || text === 'はっ',
      getJlptLevel: (text) => (text === 'ハァ' || text === 'はっ' ? 'N5' : null),
    }),
    {
      minSentenceWordsForNPlusOne: 1,
      // The exact subtitle pattern from the bug report: ハァ… / (ガーフィール)はっ!
      sourceText: 'ハァ…\n(ガーフィール)はっ! 猫',
    },
  );

  // Both interjection tokens (indices 0 and 1) come back with every
  // annotation cleared; surface is passed as the assertion message so a
  // failure identifies which form regressed.
  for (const token of result.slice(0, 2)) {
    assert.equal(token.isKnown, false, token.surface);
    assert.equal(token.isNPlusOneTarget, false, token.surface);
    assert.equal(token.frequencyRank, undefined, token.surface);
    assert.equal(token.jlptLevel, undefined, token.surface);
  }
  // The ordinary noun keeps its frequency annotation — filtering stays scoped.
  assert.equal(result[2]?.frequencyRank, 11);
});
|
||||
|
||||
@@ -40,6 +40,7 @@ export const SUBTITLE_ANNOTATION_EXCLUDED_TERMS = new Set([
|
||||
'べき',
|
||||
'って',
|
||||
'はあ',
|
||||
'はぁ',
|
||||
'はは',
|
||||
'へえ',
|
||||
'ふう',
|
||||
|
||||
Reference in New Issue
Block a user