fix: exclude standalone interjection annotations

2026-08-02 07:21:33 -07:00 · 2026-04-25 15:52:31 -07:00
parent 5b326978e9
commit 60435fee10
4 changed files with 66 additions and 0 deletions
@@ -0,0 +1,25 @@
+---
+id: TASK-293
+title: Fix interjection tokens receiving subtitle annotations
+status: In Progress
+assignee: []
+created_date: '2026-04-25 22:50'
+labels:
+  - tokenizer
+  - bug
+dependencies: []
+priority: medium
+---
+
+## Description
+
+<!-- SECTION:DESCRIPTION:BEGIN -->
+Standalone interjections such as あ should remain hoverable dictionary tokens but must not receive N+1, frequency, JLPT, or known-word subtitle annotation metadata.
+<!-- SECTION:DESCRIPTION:END -->
+
+## Acceptance Criteria
+<!-- AC:BEGIN -->
+- [ ] #1 A standalone interjection like あ (MeCab 感動詞) is excluded by the shared subtitle annotation gate, including when POS tags are unavailable.
+- [ ] #2 annotateTokens strips N+1/frequency/JLPT/known metadata from the interjection while preserving token lookup fields.
+- [ ] #3 Focused tokenizer regression passes.
+<!-- AC:END -->
@@ -0,0 +1,4 @@
+type: fixed
+area: tokenizer
+
+- Stopped standalone `あ` interjections from receiving subtitle annotation metadata such as known-word status, N+1, JLPT, and frequency highlighting when POS tags are unavailable.
@@ -812,3 +812,39 @@ test('annotateTokens applies one shared exclusion gate across known N+1 frequenc
  assert.equal(result[0]?.frequencyRank, undefined);
  assert.equal(result[0]?.jlptLevel, undefined);
 });
+
+test('annotateTokens clears all annotations from standalone あ interjections without POS tags', () => {
+  const tokens = [
+    makeToken({
+      surface: 'あ',
+      headword: 'あ',
+      reading: 'あ',
+      partOfSpeech: PartOfSpeech.other,
+      pos1: '',
+      pos2: '',
+      startPos: 0,
+      endPos: 1,
+      isKnown: true,
+      isNPlusOneTarget: true,
+      frequencyRank: 522,
+      jlptLevel: 'N5',
+    }),
+  ];
+
+  const result = annotateTokens(
+    tokens,
+    makeDeps({
+      isKnownWord: (text) => text === 'あ',
+      getJlptLevel: (text) => (text === 'あ' ? 'N5' : null),
+    }),
+    { minSentenceWordsForNPlusOne: 1 },
+  );
+
+  assert.equal(result[0]?.surface, 'あ');
+  assert.equal(result[0]?.headword, 'あ');
+  assert.equal(result[0]?.reading, 'あ');
+  assert.equal(result[0]?.isKnown, false);
+  assert.equal(result[0]?.isNPlusOneTarget, false);
+  assert.equal(result[0]?.frequencyRank, undefined);
+  assert.equal(result[0]?.jlptLevel, undefined);
+});
@@ -14,6 +14,7 @@ const KATAKANA_CODEPOINT_START = 0x30a1;
 const KATAKANA_CODEPOINT_END = 0x30f6;

 const SUBTITLE_ANNOTATION_EXCLUDED_TERMS = new Set([
+  'あ',
  'ああ',
  'ええ',
  'うう',