mirror of
https://github.com/ksyasuda/SubMiner.git
synced 2026-04-26 04:19:27 -07:00
fix: exclude standalone interjection annotations
This commit is contained in:
@@ -0,0 +1,25 @@
|
|||||||
|
---
|
||||||
|
id: TASK-293
title: Fix interjection tokens receiving subtitle annotations
status: In Progress
assignee: []
created_date: '2026-04-25 22:50'
labels:
  - tokenizer
  - bug
dependencies: []
priority: medium
---
|
||||||
|
|
||||||
|
## Description

<!-- SECTION:DESCRIPTION:BEGIN -->
Standalone interjections such as あ should remain hoverable dictionary tokens, but they must not receive N+1, frequency, JLPT, or known-word subtitle annotation metadata.
<!-- SECTION:DESCRIPTION:END -->
|
||||||
|
|
||||||
|
## Acceptance Criteria

<!-- AC:BEGIN -->
- [ ] #1 A MeCab 感動詞 (interjection) token such as あ is excluded by the shared subtitle annotation gate.
- [ ] #2 annotateTokens strips the N+1/frequency/JLPT/known metadata from the interjection while preserving its token lookup fields.
- [ ] #3 The focused tokenizer regression test passes.
<!-- AC:END -->
|
||||||
4
changes/293-interjection-annotation-filter.md
Normal file
4
changes/293-interjection-annotation-filter.md
Normal file
@@ -0,0 +1,4 @@
|
|||||||
|
type: fixed
area: tokenizer

- Stopped standalone `あ` interjections from receiving subtitle annotation metadata such as N+1, JLPT, and frequency highlighting when POS tags are unavailable.
|
||||||
@@ -812,3 +812,39 @@ test('annotateTokens applies one shared exclusion gate across known N+1 frequenc
|
|||||||
assert.equal(result[0]?.frequencyRank, undefined);
|
assert.equal(result[0]?.frequencyRank, undefined);
|
||||||
assert.equal(result[0]?.jlptLevel, undefined);
|
assert.equal(result[0]?.jlptLevel, undefined);
|
||||||
});
|
});
|
||||||
|
|
||||||
|
// Regression for TASK-293: a standalone interjection token (あ) with empty
// POS tags must keep its dictionary-lookup fields (surface/headword/reading)
// while every subtitle annotation — known, N+1, frequency, JLPT — is cleared
// by the shared exclusion gate.
test('annotateTokens clears all annotations from standalone あ interjections without POS tags', () => {
  // A single token whose annotation fields are pre-populated so the test can
  // observe annotateTokens actively stripping them, not merely skipping them.
  const interjection = makeToken({
    surface: 'あ',
    headword: 'あ',
    reading: 'あ',
    partOfSpeech: PartOfSpeech.other,
    pos1: '',
    pos2: '',
    startPos: 0,
    endPos: 1,
    isKnown: true,
    isNPlusOneTarget: true,
    frequencyRank: 522,
    jlptLevel: 'N5',
  });

  // Dependencies are wired to positively identify あ as known/N5, proving the
  // cleared output comes from the exclusion gate rather than from lookups
  // simply returning nothing.
  const deps = makeDeps({
    isKnownWord: (text) => text === 'あ',
    getJlptLevel: (text) => (text === 'あ' ? 'N5' : null),
  });

  const result = annotateTokens([interjection], deps, {
    minSentenceWordsForNPlusOne: 1,
  });

  const annotated = result[0];
  // Lookup fields must survive so the token stays hoverable…
  assert.equal(annotated?.surface, 'あ');
  assert.equal(annotated?.headword, 'あ');
  assert.equal(annotated?.reading, 'あ');
  // …while every annotation is removed.
  assert.equal(annotated?.isKnown, false);
  assert.equal(annotated?.isNPlusOneTarget, false);
  assert.equal(annotated?.frequencyRank, undefined);
  assert.equal(annotated?.jlptLevel, undefined);
});
||||||
|
|||||||
@@ -14,6 +14,7 @@ const KATAKANA_CODEPOINT_START = 0x30a1;
|
|||||||
const KATAKANA_CODEPOINT_END = 0x30f6;
|
const KATAKANA_CODEPOINT_END = 0x30f6;
|
||||||
|
|
||||||
const SUBTITLE_ANNOTATION_EXCLUDED_TERMS = new Set([
|
const SUBTITLE_ANNOTATION_EXCLUDED_TERMS = new Set([
|
||||||
|
'あ',
|
||||||
'ああ',
|
'ああ',
|
||||||
'ええ',
|
'ええ',
|
||||||
'うう',
|
'うう',
|
||||||
|
|||||||
Reference in New Issue
Block a user