mirror of
https://github.com/ksyasuda/SubMiner.git
synced 2026-04-26 04:19:27 -07:00
fix: exclude standalone interjection annotations
This commit is contained in:
@@ -0,0 +1,25 @@
|
||||
---
|
||||
id: TASK-293
|
||||
title: Fix interjection tokens receiving subtitle annotations
|
||||
status: In Progress
|
||||
assignee: []
|
||||
created_date: '2026-04-25 22:50'
|
||||
labels:
|
||||
- tokenizer
|
||||
- bug
|
||||
dependencies: []
|
||||
priority: medium
|
||||
---
|
||||
|
||||
## Description
|
||||
|
||||
<!-- SECTION:DESCRIPTION:BEGIN -->
|
||||
Standalone interjections such as あ should remain hoverable dictionary tokens but must not receive N+1, frequency, JLPT, or known-word subtitle annotation metadata.
|
||||
<!-- SECTION:DESCRIPTION:END -->
|
||||
|
||||
## Acceptance Criteria
|
||||
<!-- AC:BEGIN -->
|
||||
- [ ] #1 A MeCab 感動詞 token like あ is excluded by the shared subtitle annotation gate.
|
||||
- [ ] #2 annotateTokens strips N+1/frequency/JLPT/known metadata from the interjection while preserving token lookup fields.
|
||||
- [ ] #3 Focused tokenizer regression passes.
|
||||
<!-- AC:END -->
|
||||
4
changes/293-interjection-annotation-filter.md
Normal file
4
changes/293-interjection-annotation-filter.md
Normal file
@@ -0,0 +1,4 @@
|
||||
type: fixed
|
||||
area: tokenizer
|
||||
|
||||
- Stopped standalone `あ` interjections from receiving subtitle annotation metadata such as N+1, JLPT, frequency, and known-word highlighting when POS tags are unavailable.
|
||||
@@ -812,3 +812,39 @@ test('annotateTokens applies one shared exclusion gate across known N+1 frequenc
|
||||
assert.equal(result[0]?.frequencyRank, undefined);
|
||||
assert.equal(result[0]?.jlptLevel, undefined);
|
||||
});
|
||||
|
||||
// Regression: a bare あ token whose pos1/pos2 tags are empty must keep its
// lookup fields (surface, headword, reading) while every subtitle annotation
// (known, N+1 target, frequency rank, JLPT level) is cleared.
test('annotateTokens clears all annotations from standalone あ interjections without POS tags', () => {
  // The token is seeded with every annotation already set, so the assertions
  // below only pass if annotateTokens actively removes them.
  const interjection = makeToken({
    surface: 'あ',
    headword: 'あ',
    reading: 'あ',
    partOfSpeech: PartOfSpeech.other,
    pos1: '',
    pos2: '',
    startPos: 0,
    endPos: 1,
    isKnown: true,
    isNPlusOneTarget: true,
    frequencyRank: 522,
    jlptLevel: 'N5',
  });

  // The dependency stubs also report あ as known and N5, so neither lookup
  // can be the reason the annotations end up cleared.
  const deps = makeDeps({
    isKnownWord: (text) => text === 'あ',
    getJlptLevel: (text) => (text === 'あ' ? 'N5' : null),
  });

  const annotated = annotateTokens([interjection], deps, { minSentenceWordsForNPlusOne: 1 });
  const first = annotated[0];

  // Lookup fields are preserved.
  assert.equal(first?.surface, 'あ');
  assert.equal(first?.headword, 'あ');
  assert.equal(first?.reading, 'あ');

  // Annotation metadata is stripped.
  assert.equal(first?.isKnown, false);
  assert.equal(first?.isNPlusOneTarget, false);
  assert.equal(first?.frequencyRank, undefined);
  assert.equal(first?.jlptLevel, undefined);
});
|
||||
|
||||
@@ -14,6 +14,7 @@ const KATAKANA_CODEPOINT_START = 0x30a1;
|
||||
const KATAKANA_CODEPOINT_END = 0x30f6;
|
||||
|
||||
const SUBTITLE_ANNOTATION_EXCLUDED_TERMS = new Set([
|
||||
'あ',
|
||||
'ああ',
|
||||
'ええ',
|
||||
'うう',
|
||||
|
||||
Reference in New Issue
Block a user