fix: exclude standalone interjection annotations

This commit is contained in:
2026-04-25 15:52:31 -07:00
parent 5b326978e9
commit 60435fee10
4 changed files with 66 additions and 0 deletions

View File

@@ -0,0 +1,25 @@
---
id: TASK-293
title: Fix interjection tokens receiving subtitle annotations
status: In Progress
assignee: []
created_date: '2026-04-25 22:50'
labels:
- tokenizer
- bug
dependencies: []
priority: medium
---
## Description
<!-- SECTION:DESCRIPTION:BEGIN -->
Standalone interjections such as あ should remain hoverable dictionary tokens but must not receive N+1, frequency, JLPT, or known-word subtitle annotation metadata.
<!-- SECTION:DESCRIPTION:END -->
## Acceptance Criteria
<!-- AC:BEGIN -->
- [ ] #1 A MeCab 感動詞 token like あ is excluded by the shared subtitle annotation gate.
- [ ] #2 annotateTokens strips N+1/frequency/JLPT/known metadata from the interjection while preserving token lookup fields.
- [ ] #3 Focused tokenizer regression passes.
<!-- AC:END -->

View File

@@ -0,0 +1,4 @@
type: fixed
area: tokenizer
- Stopped standalone `あ` interjections from receiving subtitle annotation metadata such as N+1, JLPT, and frequency highlighting when POS tags are unavailable.

View File

@@ -812,3 +812,39 @@ test('annotateTokens applies one shared exclusion gate across known N+1 frequenc
assert.equal(result[0]?.frequencyRank, undefined);
assert.equal(result[0]?.jlptLevel, undefined);
});
// Regression for standalone あ interjections: the token has empty pos1/pos2 and arrives
// pre-populated with known / N+1 / frequency / JLPT metadata. annotateTokens is expected
// to clear all of that while leaving surface, headword and reading intact.
test('annotateTokens clears all annotations from standalone あ interjections without POS tags', () => {
  const interjection = makeToken({
    surface: 'あ',
    headword: 'あ',
    reading: 'あ',
    partOfSpeech: PartOfSpeech.other,
    pos1: '',
    pos2: '',
    startPos: 0,
    endPos: 1,
    isKnown: true,
    isNPlusOneTarget: true,
    frequencyRank: 522,
    jlptLevel: 'N5',
  });
  // The deps also report あ as known and N5, so a passing test shows the metadata
  // is cleared even though the lookups would otherwise supply it.
  const deps = makeDeps({
    isKnownWord: (text) => text === 'あ',
    getJlptLevel: (text) => (text === 'あ' ? 'N5' : null),
  });

  const result = annotateTokens([interjection], deps, { minSentenceWordsForNPlusOne: 1 });
  const annotated = result[0];

  // Lookup fields are preserved.
  assert.equal(annotated?.surface, 'あ');
  assert.equal(annotated?.headword, 'あ');
  assert.equal(annotated?.reading, 'あ');

  // Every subtitle annotation is cleared.
  assert.equal(annotated?.isKnown, false);
  assert.equal(annotated?.isNPlusOneTarget, false);
  assert.equal(annotated?.frequencyRank, undefined);
  assert.equal(annotated?.jlptLevel, undefined);
});

View File

@@ -14,6 +14,7 @@ const KATAKANA_CODEPOINT_START = 0x30a1;
// U+30F6 is ヶ (KATAKANA LETTER SMALL KE).
// NOTE(review): presumably the inclusive upper bound of the katakana range check — confirm against usage.
const KATAKANA_CODEPOINT_END = 0x30f6;
const SUBTITLE_ANNOTATION_EXCLUDED_TERMS = new Set([
'あ',
'ああ',
'ええ',
'うう',