diff --git a/backlog/tasks/task-293 - Fix-interjection-tokens-receiving-subtitle-annotations.md b/backlog/tasks/task-293 - Fix-interjection-tokens-receiving-subtitle-annotations.md new file mode 100644 index 00000000..874b37df --- /dev/null +++ b/backlog/tasks/task-293 - Fix-interjection-tokens-receiving-subtitle-annotations.md @@ -0,0 +1,25 @@ +--- +id: TASK-293 +title: Fix interjection tokens receiving subtitle annotations +status: In Progress +assignee: [] +created_date: '2026-04-25 22:50' +labels: + - tokenizer + - bug +dependencies: [] +priority: medium +--- + +## Description + + +Standalone interjections such as あ should remain hoverable dictionary tokens but must not receive N+1, frequency, JLPT, or known-word subtitle annotation metadata. + + +## Acceptance Criteria + +- [ ] #1 A MeCab 感動詞 token like あ is excluded by the shared subtitle annotation gate. +- [ ] #2 annotateTokens strips N+1/frequency/JLPT/known metadata from the interjection while preserving token lookup fields. +- [ ] #3 Focused tokenizer regression passes. + diff --git a/changes/293-interjection-annotation-filter.md b/changes/293-interjection-annotation-filter.md new file mode 100644 index 00000000..36e14b67 --- /dev/null +++ b/changes/293-interjection-annotation-filter.md @@ -0,0 +1,4 @@ +type: fixed +area: tokenizer + +- Stopped standalone `あ` interjections from receiving subtitle annotation metadata such as N+1, JLPT, and frequency highlighting when POS tags are unavailable. diff --git a/src/core/services/tokenizer/annotation-stage.test.ts b/src/core/services/tokenizer/annotation-stage.test.ts index 7683ea66..2e0a7bb3 100644 --- a/src/core/services/tokenizer/annotation-stage.test.ts +++ b/src/core/services/tokenizer/annotation-stage.test.ts @@ -812,3 +812,39 @@ test('annotateTokens applies one shared exclusion gate across known N+1 frequenc assert.equal(result[0]?.frequencyRank, undefined); assert.equal(result[0]?.jlptLevel, undefined); }); + +test('annotateTokens clears all annotations from standalone あ interjections without POS tags', () => { + const tokens = [ + makeToken({ + surface: 'あ', + headword: 'あ', + reading: 'あ', + partOfSpeech: PartOfSpeech.other, + pos1: '', + pos2: '', + startPos: 0, + endPos: 1, + isKnown: true, + isNPlusOneTarget: true, + frequencyRank: 522, + jlptLevel: 'N5', + }), + ]; + + const result = annotateTokens( + tokens, + makeDeps({ + isKnownWord: (text) => text === 'あ', + getJlptLevel: (text) => (text === 'あ' ? 'N5' : null), + }), + { minSentenceWordsForNPlusOne: 1 }, + ); + + assert.equal(result[0]?.surface, 'あ'); + assert.equal(result[0]?.headword, 'あ'); + assert.equal(result[0]?.reading, 'あ'); + assert.equal(result[0]?.isKnown, false); + assert.equal(result[0]?.isNPlusOneTarget, false); + assert.equal(result[0]?.frequencyRank, undefined); + assert.equal(result[0]?.jlptLevel, undefined); +}); diff --git a/src/core/services/tokenizer/subtitle-annotation-filter.ts b/src/core/services/tokenizer/subtitle-annotation-filter.ts index 8b2a3d49..95700613 100644 --- a/src/core/services/tokenizer/subtitle-annotation-filter.ts +++ b/src/core/services/tokenizer/subtitle-annotation-filter.ts @@ -14,6 +14,7 @@ const KATAKANA_CODEPOINT_START = 0x30a1; const KATAKANA_CODEPOINT_END = 0x30f6; const SUBTITLE_ANNOTATION_EXCLUDED_TERMS = new Set([ + 'あ', 'ああ', 'ええ', 'うう',