mirror of
https://github.com/ksyasuda/SubMiner.git
synced 2026-05-04 00:41:33 -07:00
fix: suppress sigh interjection annotations
This commit is contained in:
+58
@@ -0,0 +1,58 @@
|
|||||||
|
---
|
||||||
|
id: TASK-319
|
||||||
|
title: Suppress annotations for expressive interjection subtitles
|
||||||
|
status: Done
|
||||||
|
assignee:
|
||||||
|
- Codex
|
||||||
|
created_date: '2026-05-03 03:18'
|
||||||
|
updated_date: '2026-05-03 03:20'
|
||||||
|
labels:
|
||||||
|
- bug
|
||||||
|
- subtitle-annotations
|
||||||
|
dependencies: []
|
||||||
|
references:
|
||||||
|
- src/core/services/tokenizer/subtitle-annotation-filter.ts
|
||||||
|
- src/core/services/tokenizer/annotation-stage.test.ts
|
||||||
|
priority: medium
|
||||||
|
---
|
||||||
|
|
||||||
|
## Description
|
||||||
|
|
||||||
|
<!-- SECTION:DESCRIPTION:BEGIN -->
|
||||||
|
Interjection-only subtitle tokens such as ハァ and はっ should remain hoverable as tokens but must not receive known, N+1, frequency, or JLPT annotation styling. Current behavior can still annotate these forms when dictionary/POS metadata does not trip the existing exclusion gate.
|
||||||
|
<!-- SECTION:DESCRIPTION:END -->
|
||||||
|
|
||||||
|
## Acceptance Criteria
|
||||||
|
<!-- AC:BEGIN -->
|
||||||
|
- [x] #1 Standalone ハァ/はっ-style interjection tokens have annotation metadata cleared even when dictionary metadata exists.
|
||||||
|
- [x] #2 Filtering remains scoped so content-bearing non-interjection tokens still receive annotations.
|
||||||
|
- [x] #3 Regression coverage exercises the reported subtitle pattern: ハァ… / (ガーフィール)はっ!
|
||||||
|
<!-- AC:END -->
|
||||||
|
|
||||||
|
## Implementation Plan
|
||||||
|
|
||||||
|
<!-- SECTION:PLAN:BEGIN -->
|
||||||
|
1. Add failing regression coverage around annotation filtering for the reported interjection forms, including katakana ハァ and small-tsu はっ with surrounding subtitle punctuation/name text.
|
||||||
|
2. Tighten the shared subtitle annotation exclusion gate so expressive kana interjections clear annotation metadata without relying only on MeCab pos1=感動詞.
|
||||||
|
3. Run the focused tokenizer/annotation tests, then update acceptance criteria and notes.
|
||||||
|
<!-- SECTION:PLAN:END -->
|
||||||
|
|
||||||
|
## Implementation Notes
|
||||||
|
|
||||||
|
<!-- SECTION:NOTES:BEGIN -->
|
||||||
|
Implemented via shared subtitle annotation exclusion term normalization: added はぁ so katakana ハァ normalizes into the existing term gate. Existing small-tsu kana SFX logic already covers はっ. Regression confirms both reported forms clear known/N+1/frequency/JLPT metadata while a normal noun keeps frequency annotation.
|
||||||
|
<!-- SECTION:NOTES:END -->
|
||||||
|
|
||||||
|
## Final Summary
|
||||||
|
|
||||||
|
<!-- SECTION:FINAL_SUMMARY:BEGIN -->
|
||||||
|
Summary:
|
||||||
|
- Added a regression for the reported subtitle pattern ハァ… / (ガーフィール)はっ!, asserting that annotation metadata is cleared on both interjection tokens even when it is initially present.
|
||||||
|
- Extended the shared subtitle annotation exclusion term set so ハァ normalizes to はぁ and is stripped of annotation styling. Existing はっ handling remains covered by small-tsu kana SFX filtering.
|
||||||
|
- Added a change fragment for the user-visible bug fix.
|
||||||
|
|
||||||
|
Verification:
|
||||||
|
- bun test src/core/services/tokenizer/annotation-stage.test.ts
|
||||||
|
- bun test src/core/services/tokenizer/annotation-stage.test.ts src/core/services/tokenizer.test.ts src/renderer/subtitle-render.test.ts
|
||||||
|
- bun run typecheck
|
||||||
|
<!-- SECTION:FINAL_SUMMARY:END -->
|
||||||
@@ -0,0 +1 @@
|
|||||||
|
fix: suppress annotations for ハァ-style interjection subtitles
|
||||||
@@ -1691,3 +1691,67 @@ test('annotateTokens clears all annotations from standalone あ interjections wi
|
|||||||
assert.equal(result[0]?.frequencyRank, undefined);
|
assert.equal(result[0]?.frequencyRank, undefined);
|
||||||
assert.equal(result[0]?.jlptLevel, undefined);
|
assert.equal(result[0]?.jlptLevel, undefined);
|
||||||
});
|
});
|
||||||
|
|
||||||
|
test('annotateTokens clears all annotations from expressive subtitle interjections without POS tags', () => {
  // Regression for TASK-319: expressive kana interjections (ハァ, はっ) must
  // lose known/N+1/frequency/JLPT metadata even when MeCab yields no POS tags
  // and dictionary metadata claims the word is known.
  const subtitleTokens = [
    makeToken({
      surface: 'ハァ',
      headword: 'ハァ',
      reading: 'ハァ',
      partOfSpeech: PartOfSpeech.other,
      pos1: '',
      pos2: '',
      startPos: 0,
      endPos: 2,
      // Pre-populated annotation metadata that the filter must clear.
      isKnown: true,
      isNPlusOneTarget: true,
      frequencyRank: 3007,
      jlptLevel: 'N5',
    }),
    makeToken({
      surface: 'はっ',
      headword: 'はっ',
      reading: 'ハッ',
      partOfSpeech: PartOfSpeech.other,
      pos1: '',
      pos2: '',
      startPos: 10,
      endPos: 12,
      isKnown: true,
      isNPlusOneTarget: true,
      frequencyRank: 3007,
      jlptLevel: 'N5',
    }),
    // Control token: an ordinary noun that must KEEP its annotations.
    makeToken({
      surface: '猫',
      headword: '猫',
      reading: 'ネコ',
      partOfSpeech: PartOfSpeech.noun,
      pos1: '名詞',
      pos2: '一般',
      startPos: 13,
      endPos: 14,
      frequencyRank: 11,
    }),
  ];

  // Dictionary deps deliberately report the interjections as known N5 words,
  // so only the exclusion gate can be responsible for stripping metadata.
  const interjectionSurfaces = new Set(['ハァ', 'はっ']);
  const deps = makeDeps({
    isKnownWord: (text) => interjectionSurfaces.has(text),
    getJlptLevel: (text) => (interjectionSurfaces.has(text) ? 'N5' : null),
  });

  const result = annotateTokens(subtitleTokens, deps, {
    minSentenceWordsForNPlusOne: 1,
    sourceText: 'ハァ…\n(ガーフィール)はっ! 猫',
  });

  // Both interjection tokens come out fully stripped of annotation metadata.
  for (const tok of result.slice(0, 2)) {
    assert.equal(tok.isKnown, false, tok.surface);
    assert.equal(tok.isNPlusOneTarget, false, tok.surface);
    assert.equal(tok.frequencyRank, undefined, tok.surface);
    assert.equal(tok.jlptLevel, undefined, tok.surface);
  }
  // The scoped filter must not touch the content-bearing noun.
  assert.equal(result[2]?.frequencyRank, 11);
});
|
||||||
|
|||||||
@@ -40,6 +40,7 @@ export const SUBTITLE_ANNOTATION_EXCLUDED_TERMS = new Set([
|
|||||||
'べき',
|
'べき',
|
||||||
'って',
|
'って',
|
||||||
'はあ',
|
'はあ',
|
||||||
|
'はぁ',
|
||||||
'はは',
|
'はは',
|
||||||
'へえ',
|
'へえ',
|
||||||
'ふう',
|
'ふう',
|
||||||
|
|||||||
Reference in New Issue
Block a user