From 59fa3b427db4519de1f8fdbd2fea66f2c1f2f696 Mon Sep 17 00:00:00 2001 From: sudacode Date: Thu, 19 Mar 2026 21:40:20 -0700 Subject: [PATCH] fix: exclude auxiliary grammar tails from subtitle annotations --- ...r-tail-そうだ-from-subtitle-annotations.md | 59 +++++++++++++++ src/core/services/tokenizer.test.ts | 73 +++++++++++++++++++ .../tokenizer/annotation-stage.test.ts | 13 ++++ .../services/tokenizer/annotation-stage.ts | 15 ++++ 4 files changed, 160 insertions(+) create mode 100644 backlog/tasks/task-209 - Exclude-grammar-tail-そうだ-from-subtitle-annotations.md diff --git a/backlog/tasks/task-209 - Exclude-grammar-tail-そうだ-from-subtitle-annotations.md b/backlog/tasks/task-209 - Exclude-grammar-tail-そうだ-from-subtitle-annotations.md new file mode 100644 index 0000000..6660f56 --- /dev/null +++ b/backlog/tasks/task-209 - Exclude-grammar-tail-そうだ-from-subtitle-annotations.md @@ -0,0 +1,59 @@ +--- +id: TASK-209 +title: Exclude grammar-tail そうだ from subtitle annotations +status: Done +assignee: + - codex +created_date: '2026-03-20 04:06' +updated_date: '2026-03-20 04:33' +labels: + - bug + - tokenizer +dependencies: [] +references: + - >- + /Users/sudacode/projects/japanese/SubMiner/src/core/services/tokenizer/annotation-stage.ts + - >- + /Users/sudacode/projects/japanese/SubMiner/src/core/services/tokenizer/annotation-stage.test.ts + - >- + /Users/sudacode/projects/japanese/SubMiner/src/core/services/tokenizer.test.ts +priority: high +--- + +## Description + + +Sentence-final grammar-tail `そうだ` tokens can still receive subtitle annotation styling, including frequency highlighting, when Yomitan returns a standalone `そうだ` token and MeCab enriches it as an auxiliary-stem/copula pattern (`名詞|助動詞`, `助動詞語幹`). Keep the subtitle text visible, but treat this grammar tail like other grammar-only endings so it renders without annotation metadata. 
+ + +## Acceptance Criteria + +- [x] #1 Sentence-final grammar-tail `そうだ` tokens enriched as auxiliary-stem/copula patterns do not receive frequency highlighting or other subtitle annotation metadata. +- [x] #2 The preceding lexical token in cases like `与えるそうだ` keeps its existing annotation behavior. +- [x] #3 Regression tests cover the annotation-stage exclusion and end-to-end subtitle tokenization for the `そうだ` grammar-tail case. + + +## Implementation Plan + + +1. Add focused regression coverage for the reported `与えるそうだ` case at both annotation-stage and tokenizeSubtitle levels. +2. Reproduce failure by modeling the MeCab-enriched grammar-tail shape (`名詞|助動詞`, `特殊`, `助動詞語幹`) that currently keeps frequency metadata. +3. Update subtitle-annotation exclusion logic to recognize auxiliary-stem/copula grammar tails via POS metadata plus normalized tail text, not a raw sentence-specific string match. +4. Re-run targeted tokenizer and annotation-stage tests, then record the verification commands and outcome in the task notes. + + +## Implementation Notes + + +Investigated reported `与えるそうだ` case. MeCab tags `そう` as `名詞,特殊,助動詞語幹` and `だ` as `助動詞`; after overlap enrichment the Yomitan token becomes `pos1=名詞|助動詞`, `pos2=特殊`, `pos3=助動詞語幹`, which currently escapes subtitle-annotation exclusion and can keep a frequency rank. + +Implemented a POS-shape subtitle-annotation exclusion for MeCab-enriched auxiliary-stem grammar tails. The new predicate keys off merged tokens whose POS tags stay within `名詞/助動詞/助詞` and whose POS3 includes `助動詞語幹`, which clears annotation metadata for `そうだ`-style tails without hard-coding the full subtitle text. 
+ +Verification: `bun test src/core/services/tokenizer/annotation-stage.test.ts`, `bun test src/core/services/tokenizer.test.ts --test-name-pattern 'explanatory ending|interjection|single-kana merged tokens from frequency highlighting|auxiliary-stem そうだ grammar tails|composite function/content token from frequency highlighting|keeps frequency for content-led merged token with trailing colloquial suffixes'` + + +## Final Summary + + +Added regression coverage for `与えるそうだ` and updated subtitle annotation exclusion logic to drop annotation metadata for MeCab-enriched auxiliary-stem grammar tails. The fix is POS-driven rather than sentence-specific, so `そうだ`-style grammar endings stay visible/hoverable as plain text while neighboring lexical tokens keep their existing frequency/JLPT behavior. + diff --git a/src/core/services/tokenizer.test.ts b/src/core/services/tokenizer.test.ts index c228bb0..ee933a9 100644 --- a/src/core/services/tokenizer.test.ts +++ b/src/core/services/tokenizer.test.ts @@ -3483,6 +3483,79 @@ test('tokenizeSubtitle keeps trailing quote-particle merged tokens hoverable whi ); }); +test('tokenizeSubtitle keeps auxiliary-stem そうだ grammar tails hoverable while clearing annotation metadata', async () => { + const result = await tokenizeSubtitle( + '与えるそうだ', + makeDepsFromYomitanTokens( + [ + { surface: '与える', reading: 'あたえる', headword: '与える' }, + { surface: 'そうだ', reading: 'そうだ', headword: 'そうだ' }, + ], + { + getFrequencyDictionaryEnabled: () => true, + getFrequencyRank: (text) => (text === '与える' ? 100 : text === 'そうだ' ? 12 : null), + getJlptLevel: (text) => (text === '与える' ? 'N3' : text === 'そうだ' ? 
'N5' : null), + tokenizeWithMecab: async () => [ + { + headword: '与える', + surface: '与える', + reading: 'アタエル', + startPos: 0, + endPos: 3, + partOfSpeech: PartOfSpeech.verb, + pos1: '動詞', + pos2: '自立', + isMerged: false, + isKnown: false, + isNPlusOneTarget: false, + }, + { + headword: 'そう', + surface: 'そう', + reading: 'ソウ', + startPos: 3, + endPos: 5, + partOfSpeech: PartOfSpeech.noun, + pos1: '名詞', + pos2: '特殊', + pos3: '助動詞語幹', + isMerged: false, + isKnown: false, + isNPlusOneTarget: false, + }, + { + headword: 'だ', + surface: 'だ', + reading: 'ダ', + startPos: 5, + endPos: 6, + partOfSpeech: PartOfSpeech.bound_auxiliary, + pos1: '助動詞', + isMerged: false, + isKnown: false, + isNPlusOneTarget: false, + }, + ], + getMinSentenceWordsForNPlusOne: () => 1, + }, + ), + ); + + assert.equal(result.text, '与えるそうだ'); + assert.deepEqual( + result.tokens?.map((token) => ({ + surface: token.surface, + headword: token.headword, + frequencyRank: token.frequencyRank, + jlptLevel: token.jlptLevel, + })), + [ + { surface: '与える', headword: '与える', frequencyRank: 100, jlptLevel: 'N3' }, + { surface: 'そうだ', headword: 'そうだ', frequencyRank: undefined, jlptLevel: undefined }, + ], + ); +}); + test('tokenizeSubtitle excludes single-kana merged tokens from frequency highlighting', async () => { const result = await tokenizeSubtitle( 'た', diff --git a/src/core/services/tokenizer/annotation-stage.test.ts b/src/core/services/tokenizer/annotation-stage.test.ts index 4137418..1ea49f1 100644 --- a/src/core/services/tokenizer/annotation-stage.test.ts +++ b/src/core/services/tokenizer/annotation-stage.test.ts @@ -234,6 +234,19 @@ test('shouldExcludeTokenFromSubtitleAnnotations excludes explanatory ending vari } }); +test('shouldExcludeTokenFromSubtitleAnnotations excludes auxiliary-stem そうだ grammar tails', () => { + const token = makeToken({ + surface: 'そうだ', + headword: 'そうだ', + reading: 'ソウダ', + pos1: '名詞|助動詞', + pos2: '特殊', + pos3: '助動詞語幹', + }); + + 
assert.equal(shouldExcludeTokenFromSubtitleAnnotations(token), true); +}); + test('shouldExcludeTokenFromSubtitleAnnotations keeps lexical tokens outside explanatory ending family', () => { const token = makeToken({ surface: '問題', diff --git a/src/core/services/tokenizer/annotation-stage.ts b/src/core/services/tokenizer/annotation-stage.ts index 7b6c1b4..e9f9dda 100644 --- a/src/core/services/tokenizer/annotation-stage.ts +++ b/src/core/services/tokenizer/annotation-stage.ts @@ -100,6 +100,7 @@ function normalizePos1Tag(pos1: string | undefined): string { const SUBTITLE_ANNOTATION_EXCLUDED_POS1 = new Set(['感動詞']); const SUBTITLE_ANNOTATION_GRAMMAR_ONLY_POS1 = new Set(['助詞', '助動詞', '連体詞']); +const AUXILIARY_STEM_GRAMMAR_TAIL_POS1 = new Set(['名詞', '助動詞', '助詞']); function splitNormalizedTagParts(normalizedTag: string): string[] { if (!normalizedTag) { @@ -156,6 +157,16 @@ function isExcludedTrailingParticleMergedToken(token: MergedToken): boolean { return trailingPos1.length > 0 && trailingPos1.every((part) => part === '助詞'); } +function isAuxiliaryStemGrammarTailToken(token: MergedToken): boolean { + const pos1Parts = splitNormalizedTagParts(normalizePos1Tag(token.pos1)); + if (pos1Parts.length === 0 || !pos1Parts.every((part) => AUXILIARY_STEM_GRAMMAR_TAIL_POS1.has(part))) { + return false; + } + + const pos3Parts = splitNormalizedTagParts(normalizePos2Tag(token.pos3)); + return pos3Parts.includes('助動詞語幹'); +} + function resolvePos1Exclusions(options: AnnotationStageOptions): ReadonlySet { if (options.pos1Exclusions) { return options.pos1Exclusions; @@ -626,6 +637,10 @@ export function shouldExcludeTokenFromSubtitleAnnotations(token: MergedToken): b return true; } + if (isAuxiliaryStemGrammarTailToken(token)) { + return true; + } + if (isExcludedTrailingParticleMergedToken(token)) { return true; }