fix: exclude auxiliary grammar tails from subtitle annotations

This commit is contained in:
2026-03-19 21:40:20 -07:00
parent ff95934f07
commit 59fa3b427d
4 changed files with 160 additions and 0 deletions

View File

@@ -0,0 +1,59 @@
---
id: TASK-209
title: Exclude grammar-tail そうだ from subtitle annotations
status: Done
assignee:
- codex
created_date: '2026-03-20 04:06'
updated_date: '2026-03-20 04:33'
labels:
- bug
- tokenizer
dependencies: []
references:
- >-
/Users/sudacode/projects/japanese/SubMiner/src/core/services/tokenizer/annotation-stage.ts
- >-
/Users/sudacode/projects/japanese/SubMiner/src/core/services/tokenizer/annotation-stage.test.ts
- >-
/Users/sudacode/projects/japanese/SubMiner/src/core/services/tokenizer.test.ts
priority: high
---
## Description
<!-- SECTION:DESCRIPTION:BEGIN -->
Sentence-final grammar-tail `そうだ` tokens can still receive subtitle annotation styling, including frequency highlighting, when Yomitan returns a standalone `そうだ` token and MeCab enriches it as an auxiliary-stem/copula pattern (`名詞|助動詞`, `助動詞語幹`). Keep the subtitle text visible, but treat this grammar tail like other grammar-only endings so it renders without annotation metadata.
<!-- SECTION:DESCRIPTION:END -->
## Acceptance Criteria
<!-- AC:BEGIN -->
- [x] #1 Sentence-final grammar-tail `そうだ` tokens enriched as auxiliary-stem/copula patterns do not receive frequency highlighting or other subtitle annotation metadata.
- [x] #2 The preceding lexical token in cases like `与えるそうだ` keeps its existing annotation behavior.
- [x] #3 Regression tests cover the annotation-stage exclusion and end-to-end subtitle tokenization for the `そうだ` grammar-tail case.
<!-- AC:END -->
## Implementation Plan
<!-- SECTION:PLAN:BEGIN -->
1. Add focused regression coverage for the reported `与えるそうだ` case at both annotation-stage and tokenizeSubtitle levels.
2. Reproduce failure by modeling the MeCab-enriched grammar-tail shape (`名詞|助動詞`, `特殊`, `助動詞語幹`) that currently keeps frequency metadata.
3. Update subtitle-annotation exclusion logic to recognize auxiliary-stem/copula grammar tails via POS metadata plus normalized tail text, not a raw sentence-specific string match.
4. Re-run targeted tokenizer and annotation-stage tests, then record the verification commands and outcome in the task notes.
<!-- SECTION:PLAN:END -->
## Implementation Notes
<!-- SECTION:NOTES:BEGIN -->
Investigated reported `与えるそうだ` case. MeCab tags `そう` as `名詞,特殊,助動詞語幹` and `だ` as `助動詞`; after overlap enrichment the Yomitan token becomes `pos1=名詞|助動詞`, `pos2=特殊`, `pos3=助動詞語幹`, which currently escapes subtitle-annotation exclusion and can keep a frequency rank.
Implemented a POS-shape subtitle-annotation exclusion for MeCab-enriched auxiliary-stem grammar tails. The new predicate keys off merged tokens whose POS tags stay within `名詞/助動詞/助詞` and whose POS3 includes `助動詞語幹`, which clears annotation metadata for `そうだ`-style tails without hard-coding the full subtitle text.
Verification: `bun test src/core/services/tokenizer/annotation-stage.test.ts`, `bun test src/core/services/tokenizer.test.ts --test-name-pattern 'explanatory ending|interjection|single-kana merged tokens from frequency highlighting|auxiliary-stem そうだ grammar tails|composite function/content token from frequency highlighting|keeps frequency for content-led merged token with trailing colloquial suffixes'`
<!-- SECTION:NOTES:END -->
## Final Summary
<!-- SECTION:FINAL_SUMMARY:BEGIN -->
Added regression coverage for `与えるそうだ` and updated subtitle annotation exclusion logic to drop annotation metadata for MeCab-enriched auxiliary-stem grammar tails. The fix is POS-driven rather than sentence-specific, so `そうだ`-style grammar endings stay visible/hoverable as plain text while neighboring lexical tokens keep their existing frequency/JLPT behavior.
<!-- SECTION:FINAL_SUMMARY:END -->

View File

@@ -3483,6 +3483,79 @@ test('tokenizeSubtitle keeps trailing quote-particle merged tokens hoverable whi
); );
}); });
test('tokenizeSubtitle keeps auxiliary-stem そうだ grammar tails hoverable while clearing annotation metadata', async () => {
  // MeCab view of 与えるそうだ: lexical verb + auxiliary stem (そう) + copula (だ).
  const mecabTokens = [
    {
      headword: '与える',
      surface: '与える',
      reading: 'アタエル',
      startPos: 0,
      endPos: 3,
      partOfSpeech: PartOfSpeech.verb,
      pos1: '動詞',
      pos2: '自立',
      isMerged: false,
      isKnown: false,
      isNPlusOneTarget: false,
    },
    {
      headword: 'そう',
      surface: 'そう',
      reading: 'ソウ',
      startPos: 3,
      endPos: 5,
      partOfSpeech: PartOfSpeech.noun,
      pos1: '名詞',
      pos2: '特殊',
      pos3: '助動詞語幹',
      isMerged: false,
      isKnown: false,
      isNPlusOneTarget: false,
    },
    {
      headword: 'だ',
      surface: 'だ',
      reading: 'ダ',
      startPos: 5,
      endPos: 6,
      partOfSpeech: PartOfSpeech.bound_auxiliary,
      pos1: '助動詞',
      isMerged: false,
      isKnown: false,
      isNPlusOneTarget: false,
    },
  ];
  const deps = makeDepsFromYomitanTokens(
    [
      { surface: '与える', reading: 'あたえる', headword: '与える' },
      { surface: 'そうだ', reading: 'そうだ', headword: 'そうだ' },
    ],
    {
      getFrequencyDictionaryEnabled: () => true,
      getFrequencyRank: (text) => {
        if (text === '与える') return 100;
        return text === 'そうだ' ? 12 : null;
      },
      getJlptLevel: (text) => {
        if (text === '与える') return 'N3';
        return text === 'そうだ' ? 'N5' : null;
      },
      tokenizeWithMecab: async () => mecabTokens,
      getMinSentenceWordsForNPlusOne: () => 1,
    },
  );
  const result = await tokenizeSubtitle('与えるそうだ', deps);
  assert.equal(result.text, '与えるそうだ');
  // The grammar tail stays visible/hoverable but must carry no annotation metadata.
  const summary = result.tokens?.map(({ surface, headword, frequencyRank, jlptLevel }) => ({
    surface,
    headword,
    frequencyRank,
    jlptLevel,
  }));
  assert.deepEqual(summary, [
    { surface: '与える', headword: '与える', frequencyRank: 100, jlptLevel: 'N3' },
    { surface: 'そうだ', headword: 'そうだ', frequencyRank: undefined, jlptLevel: undefined },
  ]);
});
test('tokenizeSubtitle excludes single-kana merged tokens from frequency highlighting', async () => { test('tokenizeSubtitle excludes single-kana merged tokens from frequency highlighting', async () => {
const result = await tokenizeSubtitle( const result = await tokenizeSubtitle(
'た', 'た',

View File

@@ -234,6 +234,19 @@ test('shouldExcludeTokenFromSubtitleAnnotations excludes explanatory ending vari
} }
}); });
test('shouldExcludeTokenFromSubtitleAnnotations excludes auxiliary-stem そうだ grammar tails', () => {
  // POS shape produced by overlap enrichment of a standalone そうだ token:
  // merged pos1 from そう (名詞) and だ (助動詞), with the auxiliary-stem marker in pos3.
  const enrichedPos = {
    pos1: '名詞|助動詞',
    pos2: '特殊',
    pos3: '助動詞語幹',
  };
  const grammarTail = makeToken({
    surface: 'そうだ',
    headword: 'そうだ',
    reading: 'ソウダ',
    ...enrichedPos,
  });
  assert.equal(shouldExcludeTokenFromSubtitleAnnotations(grammarTail), true);
});
test('shouldExcludeTokenFromSubtitleAnnotations keeps lexical tokens outside explanatory ending family', () => { test('shouldExcludeTokenFromSubtitleAnnotations keeps lexical tokens outside explanatory ending family', () => {
const token = makeToken({ const token = makeToken({
surface: '問題', surface: '問題',

View File

@@ -100,6 +100,7 @@ function normalizePos1Tag(pos1: string | undefined): string {
const SUBTITLE_ANNOTATION_EXCLUDED_POS1 = new Set(['感動詞']); const SUBTITLE_ANNOTATION_EXCLUDED_POS1 = new Set(['感動詞']);
const SUBTITLE_ANNOTATION_GRAMMAR_ONLY_POS1 = new Set(['助詞', '助動詞', '連体詞']); const SUBTITLE_ANNOTATION_GRAMMAR_ONLY_POS1 = new Set(['助詞', '助動詞', '連体詞']);
// POS1 tags that may appear (possibly pipe-merged) on a MeCab-enriched
// auxiliary-stem grammar tail such as そうだ: noun stem (名詞), auxiliary
// verb (助動詞), and particle (助詞). Used by isAuxiliaryStemGrammarTailToken.
const AUXILIARY_STEM_GRAMMAR_TAIL_POS1 = new Set(['名詞', '助動詞', '助詞']);
function splitNormalizedTagParts(normalizedTag: string): string[] { function splitNormalizedTagParts(normalizedTag: string): string[] {
if (!normalizedTag) { if (!normalizedTag) {
@@ -156,6 +157,16 @@ function isExcludedTrailingParticleMergedToken(token: MergedToken): boolean {
return trailingPos1.length > 0 && trailingPos1.every((part) => part === '助詞'); return trailingPos1.length > 0 && trailingPos1.every((part) => part === '助詞');
} }
/**
 * Detects MeCab-enriched auxiliary-stem grammar tails (e.g. そうだ) so they can
 * be excluded from subtitle annotations. A token qualifies when every merged
 * POS1 part stays within the grammar-tail set (名詞/助動詞/助詞) and any merged
 * POS3 part carries the auxiliary-stem marker 助動詞語幹.
 */
function isAuxiliaryStemGrammarTailToken(token: MergedToken): boolean {
  const pos1Tags = splitNormalizedTagParts(normalizePos1Tag(token.pos1));
  const pos1IsGrammarTailShape =
    pos1Tags.length > 0 && pos1Tags.every((tag) => AUXILIARY_STEM_GRAMMAR_TAIL_POS1.has(tag));
  if (!pos1IsGrammarTailShape) {
    return false;
  }
  // NOTE(review): pos3 is normalized via normalizePos2Tag — presumably the
  // normalizer is generic over secondary POS fields; confirm it is not
  // pos2-specific.
  const pos3Tags = splitNormalizedTagParts(normalizePos2Tag(token.pos3));
  return pos3Tags.includes('助動詞語幹');
}
function resolvePos1Exclusions(options: AnnotationStageOptions): ReadonlySet<string> { function resolvePos1Exclusions(options: AnnotationStageOptions): ReadonlySet<string> {
if (options.pos1Exclusions) { if (options.pos1Exclusions) {
return options.pos1Exclusions; return options.pos1Exclusions;
@@ -626,6 +637,10 @@ export function shouldExcludeTokenFromSubtitleAnnotations(token: MergedToken): b
return true; return true;
} }
if (isAuxiliaryStemGrammarTailToken(token)) {
return true;
}
if (isExcludedTrailingParticleMergedToken(token)) { if (isExcludedTrailingParticleMergedToken(token)) {
return true; return true;
} }