fix: exclude standalone interjection annotations

This commit is contained in:
2026-04-25 15:52:31 -07:00
parent 5b326978e9
commit 60435fee10
4 changed files with 66 additions and 0 deletions

View File

@@ -812,3 +812,39 @@ test('annotateTokens applies one shared exclusion gate across known N+1 frequenc
assert.equal(result[0]?.frequencyRank, undefined);
assert.equal(result[0]?.jlptLevel, undefined);
});
// Regression test: a lone あ token with empty pos1/pos2 must come back from
// annotateTokens with its text fields intact and every annotation cleared,
// even though both the incoming token and the injected deps claim it is a
// known N5 word with a frequency rank.
test('annotateTokens clears all annotations from standalone あ interjections without POS tags', () => {
  const interjection = 'あ';

  // Fixture arrives pre-annotated so the test proves the values are actively cleared.
  const preAnnotatedToken = makeToken({
    surface: interjection,
    headword: interjection,
    reading: interjection,
    partOfSpeech: PartOfSpeech.other,
    pos1: '',
    pos2: '',
    startPos: 0,
    endPos: 1,
    isKnown: true,
    isNPlusOneTarget: true,
    frequencyRank: 522,
    jlptLevel: 'N5',
  });

  // Lookups that would re-annotate the token if it were not excluded.
  const deps = makeDeps({
    isKnownWord: (text) => text === interjection,
    getJlptLevel: (text) => (text === interjection ? 'N5' : null),
  });
  const options = { minSentenceWordsForNPlusOne: 1 };

  const annotated = annotateTokens([preAnnotatedToken], deps, options);
  const first = annotated[0];

  // Text fields pass through untouched.
  assert.equal(first?.surface, 'あ');
  assert.equal(first?.headword, 'あ');
  assert.equal(first?.reading, 'あ');

  // Every annotation is reset.
  assert.equal(first?.isKnown, false);
  assert.equal(first?.isNPlusOneTarget, false);
  assert.equal(first?.frequencyRank, undefined);
  assert.equal(first?.jlptLevel, undefined);
});

View File

@@ -14,6 +14,7 @@ const KATAKANA_CODEPOINT_START = 0x30a1;
// Upper katakana code point bound: U+30F6 is ヶ (KATAKANA LETTER SMALL KE).
// NOTE(review): presumably an inclusive end paired with KATAKANA_CODEPOINT_START — confirm at the use site.
const KATAKANA_CODEPOINT_END = 0x30f6;
const SUBTITLE_ANNOTATION_EXCLUDED_TERMS = new Set([
'あ',
'ああ',
'ええ',
'うう',