diff --git a/backlog/tasks/task-319 - Suppress-annotations-for-expressive-interjection-subtitles.md b/backlog/tasks/task-319 - Suppress-annotations-for-expressive-interjection-subtitles.md new file mode 100644 index 00000000..420bf4d7 --- /dev/null +++ b/backlog/tasks/task-319 - Suppress-annotations-for-expressive-interjection-subtitles.md @@ -0,0 +1,58 @@ +--- +id: TASK-319 +title: Suppress annotations for expressive interjection subtitles +status: Done +assignee: + - Codex +created_date: '2026-05-03 03:18' +updated_date: '2026-05-03 03:20' +labels: + - bug + - subtitle-annotations +dependencies: [] +references: + - src/core/services/tokenizer/subtitle-annotation-filter.ts + - src/core/services/tokenizer/annotation-stage.test.ts +priority: medium +--- + +## Description + + +Interjection-only subtitle tokens such as ハァ and はっ should remain hoverable as tokens but must not receive known, N+1, frequency, or JLPT annotation styling. Current behavior can still annotate these forms when dictionary/POS metadata does not trip the existing exclusion gate. + + +## Acceptance Criteria + +- [x] #1 Standalone ハァ/はっ-style interjection tokens have annotation metadata cleared even when dictionary metadata exists. +- [x] #2 Filtering remains scoped so content-bearing non-interjection tokens still receive annotations. +- [x] #3 Regression coverage exercises the reported subtitle pattern: ハァ… / (ガーフィール)はっ! + + +## Implementation Plan + + +1. Add failing regression coverage around annotation filtering for the reported interjection forms, including katakana ハァ and small-tsu はっ with surrounding subtitle punctuation/name text. +2. Tighten the shared subtitle annotation exclusion gate so expressive kana interjections clear annotation metadata without relying only on MeCab pos1=感動詞. +3. Run the focused tokenizer/annotation tests, then update acceptance criteria and notes. + + +## Implementation Notes + + +Implemented via shared subtitle annotation exclusion term normalization: added はぁ so katakana ハァ normalizes into the existing term gate. Existing small-tsu kana SFX logic already covers はっ. Regression confirms both reported forms clear known/N+1/frequency/JLPT metadata while a normal noun keeps frequency annotation. + + +## Final Summary + + +Summary: +- Added a regression for the reported subtitle pattern ハァ… / (ガーフィール)はっ!, with annotation metadata present on both interjection tokens. +- Extended the shared subtitle annotation exclusion term set so ハァ normalizes to はぁ and is stripped of annotation styling. Existing はっ handling remains covered by small-tsu kana SFX filtering. +- Added a change fragment for the user-visible bug fix. + +Verification: +- bun test src/core/services/tokenizer/annotation-stage.test.ts +- bun test src/core/services/tokenizer/annotation-stage.test.ts src/core/services/tokenizer.test.ts src/renderer/subtitle-render.test.ts +- bun run typecheck + diff --git a/changes/319-interjection-annotation-filter.md b/changes/319-interjection-annotation-filter.md new file mode 100644 index 00000000..78fd13d8 --- /dev/null +++ b/changes/319-interjection-annotation-filter.md @@ -0,0 +1 @@ +fix: suppress annotations for ハァ-style interjection subtitles diff --git a/src/core/services/tokenizer/annotation-stage.test.ts b/src/core/services/tokenizer/annotation-stage.test.ts index 7f2aafe4..d0f30737 100644 --- a/src/core/services/tokenizer/annotation-stage.test.ts +++ b/src/core/services/tokenizer/annotation-stage.test.ts @@ -1691,3 +1691,67 @@ test('annotateTokens clears all annotations from standalone あ interjections wi assert.equal(result[0]?.frequencyRank, undefined); assert.equal(result[0]?.jlptLevel, undefined); }); + +test('annotateTokens clears all annotations from expressive subtitle interjections without POS tags', () => { + const tokens = [ + makeToken({ + surface: 'ハァ', + headword: 'ハァ', + reading: 'ハァ', + partOfSpeech: PartOfSpeech.other, + pos1: '', + pos2: '', + startPos: 0, + endPos: 2, + isKnown: true, + isNPlusOneTarget: true, + frequencyRank: 3007, + jlptLevel: 'N5', + }), + makeToken({ + surface: 'はっ', + headword: 'はっ', + reading: 'ハッ', + partOfSpeech: PartOfSpeech.other, + pos1: '', + pos2: '', + startPos: 10, + endPos: 12, + isKnown: true, + isNPlusOneTarget: true, + frequencyRank: 3007, + jlptLevel: 'N5', + }), + makeToken({ + surface: '猫', + headword: '猫', + reading: 'ネコ', + partOfSpeech: PartOfSpeech.noun, + pos1: '名詞', + pos2: '一般', + startPos: 13, + endPos: 14, + frequencyRank: 11, + }), + ]; + + const result = annotateTokens( + tokens, + makeDeps({ + isKnownWord: (text) => text === 'ハァ' || text === 'はっ', + getJlptLevel: (text) => (text === 'ハァ' || text === 'はっ' ? 'N5' : null), + }), + { + minSentenceWordsForNPlusOne: 1, + sourceText: 'ハァ…\n(ガーフィール)はっ! 猫', + }, + ); + + for (const token of result.slice(0, 2)) { + assert.equal(token.isKnown, false, token.surface); + assert.equal(token.isNPlusOneTarget, false, token.surface); + assert.equal(token.frequencyRank, undefined, token.surface); + assert.equal(token.jlptLevel, undefined, token.surface); + } + assert.equal(result[2]?.frequencyRank, 11); +}); diff --git a/src/core/services/tokenizer/subtitle-annotation-filter.ts b/src/core/services/tokenizer/subtitle-annotation-filter.ts index dc77dc4d..a1446901 100644 --- a/src/core/services/tokenizer/subtitle-annotation-filter.ts +++ b/src/core/services/tokenizer/subtitle-annotation-filter.ts @@ -40,6 +40,7 @@ export const SUBTITLE_ANNOTATION_EXCLUDED_TERMS = new Set([ 'べき', 'って', 'はあ', + 'はぁ', 'はは', 'へえ', 'ふう',