mirror of
https://github.com/ksyasuda/SubMiner.git
synced 2026-04-26 04:19:27 -07:00
fix: exclude standalone interjection annotations
This commit is contained in:
@@ -0,0 +1,25 @@
|
|||||||
|
---
|
||||||
|
id: TASK-293
title: Fix interjection tokens receiving subtitle annotations
status: In Progress
assignee: []
created_date: '2026-04-25 22:50'
labels:
  - tokenizer
  - bug
dependencies: []
priority: medium
---
|
||||||
|
|
||||||
|
## Description

<!-- SECTION:DESCRIPTION:BEGIN -->
Standalone interjections such as あ should remain hoverable dictionary tokens, but they must not receive N+1, frequency, JLPT, or known-word subtitle annotation metadata.
<!-- SECTION:DESCRIPTION:END -->
|
||||||
|
|
||||||
|
## Acceptance Criteria

<!-- AC:BEGIN -->
- [ ] #1 A MeCab 感動詞 (interjection) token such as あ is excluded by the shared subtitle annotation gate.
- [ ] #2 annotateTokens strips the N+1/frequency/JLPT/known metadata from the interjection while preserving its token lookup fields.
- [ ] #3 The focused tokenizer regression test passes.
<!-- AC:END -->
|
||||||
4
changes/293-interjection-annotation-filter.md
Normal file
4
changes/293-interjection-annotation-filter.md
Normal file
@@ -0,0 +1,4 @@
|
|||||||
|
type: fixed
area: tokenizer

- Stopped standalone `あ` interjections from receiving subtitle annotation metadata such as N+1, JLPT, and frequency highlighting when POS tags are unavailable.
|
||||||
@@ -812,3 +812,39 @@ test('annotateTokens applies one shared exclusion gate across known N+1 frequenc
|
|||||||
assert.equal(result[0]?.frequencyRank, undefined);
|
assert.equal(result[0]?.frequencyRank, undefined);
|
||||||
assert.equal(result[0]?.jlptLevel, undefined);
|
assert.equal(result[0]?.jlptLevel, undefined);
|
||||||
});
|
});
|
||||||
|
|
||||||
|
// Regression for TASK-293: a standalone interjection token (あ) with empty
// POS tags must keep its dictionary-lookup fields (surface/headword/reading)
// while every subtitle annotation — known, N+1, frequency, JLPT — is cleared
// by the shared exclusion gate.
test('annotateTokens clears all annotations from standalone あ interjections without POS tags', () => {
  // A single token whose annotation fields are pre-populated so the test can
  // observe annotateTokens actively stripping them, not merely skipping them.
  const interjection = makeToken({
    surface: 'あ',
    headword: 'あ',
    reading: 'あ',
    partOfSpeech: PartOfSpeech.other,
    pos1: '',
    pos2: '',
    startPos: 0,
    endPos: 1,
    isKnown: true,
    isNPlusOneTarget: true,
    frequencyRank: 522,
    jlptLevel: 'N5',
  });

  // Dependencies are wired to positively identify あ as known/N5, proving the
  // cleared output comes from the exclusion gate rather than from lookups
  // simply returning nothing.
  const deps = makeDeps({
    isKnownWord: (text) => text === 'あ',
    getJlptLevel: (text) => (text === 'あ' ? 'N5' : null),
  });

  const result = annotateTokens([interjection], deps, {
    minSentenceWordsForNPlusOne: 1,
  });

  const annotated = result[0];
  // Lookup fields must survive so the token stays hoverable…
  assert.equal(annotated?.surface, 'あ');
  assert.equal(annotated?.headword, 'あ');
  assert.equal(annotated?.reading, 'あ');
  // …while every annotation is removed.
  assert.equal(annotated?.isKnown, false);
  assert.equal(annotated?.isNPlusOneTarget, false);
  assert.equal(annotated?.frequencyRank, undefined);
  assert.equal(annotated?.jlptLevel, undefined);
});
||||||
|
|||||||
@@ -14,6 +14,7 @@ const KATAKANA_CODEPOINT_START = 0x30a1;
|
|||||||
const KATAKANA_CODEPOINT_END = 0x30f6;
|
const KATAKANA_CODEPOINT_END = 0x30f6;
|
||||||
|
|
||||||
const SUBTITLE_ANNOTATION_EXCLUDED_TERMS = new Set([
|
const SUBTITLE_ANNOTATION_EXCLUDED_TERMS = new Set([
|
||||||
|
'あ',
|
||||||
'ああ',
|
'ああ',
|
||||||
'ええ',
|
'ええ',
|
||||||
'うう',
|
'うう',
|
||||||
|
|||||||
Reference in New Issue
Block a user