From 2c01baafc9498ad3a0169ba3b6b925cef6658e75 Mon Sep 17 00:00:00 2001 From: sudacode Date: Sat, 25 Apr 2026 19:41:36 -0700 Subject: [PATCH] fix: exclude kana grammar helper annotations --- ...s-like-ことに-from-subtitle-annotations.md | 54 +++++ .../298-kana-grammar-helper-annotations.md | 4 + src/core/services/tokenizer.test.ts | 220 ++++++++++++++++++ .../tokenizer/annotation-stage.test.ts | 43 ++++ .../tokenizer/subtitle-annotation-filter.ts | 30 +++ 5 files changed, 351 insertions(+) create mode 100644 backlog/tasks/task-298 - Exclude-kana-grammar-helper-merges-like-ことに-from-subtitle-annotations.md create mode 100644 changes/298-kana-grammar-helper-annotations.md diff --git a/backlog/tasks/task-298 - Exclude-kana-grammar-helper-merges-like-ことに-from-subtitle-annotations.md b/backlog/tasks/task-298 - Exclude-kana-grammar-helper-merges-like-ことに-from-subtitle-annotations.md new file mode 100644 index 00000000..329d063b --- /dev/null +++ b/backlog/tasks/task-298 - Exclude-kana-grammar-helper-merges-like-ことに-from-subtitle-annotations.md @@ -0,0 +1,54 @@ +--- +id: TASK-298 +title: Exclude kana grammar-helper merges like ことに from subtitle annotations +status: Done +assignee: + - codex +created_date: '2026-04-26 00:08' +updated_date: '2026-04-26 00:15' +labels: + - tokenizer + - annotations + - bug +dependencies: [] +priority: medium +--- + +## Description + + +Investigate and fix subtitle tokenizer annotation behavior where all-hiragana grammar-helper merged tokens such as `ことに` can be marked as N+1. Current likely path: Yomitan emits `ことに` with headword `こと`; MeCab enrichment supplies content-led POS (`名詞|助詞`, likely `非自立|格助詞`); shared subtitle annotation filter does not exclude this family unless it matches narrower rules such as `これで` or explanatory endings. + + +## Acceptance Criteria + +- [x] #1 `ことに`-style kana grammar-helper merges are not marked known, N+1, JLPT, or frequency-highlighted when their MeCab metadata indicates a non-independent noun plus helper particle. +- [x] #2 Regression coverage demonstrates the reported subtitle phrase does not mark `ことに` as N+1 while preserving annotation for real lexical content tokens. +- [x] #3 Existing tokenizer annotation tests pass. + + +## Implementation Plan + + +Approved approach (user: "let's do it"): +1. Add a regression test for the reported `ことに` case using Yomitan token `ことに` -> headword `こと` and MeCab metadata `名詞|助詞` / `非自立|格助詞`; assert all annotation fields are stripped while nearby lexical content can still be N+1. +2. Verify the new test fails before production changes. +3. Update the shared subtitle annotation filter to exclude conservative kana-only grammar-helper merges: merged surface differs from headword, surface is kana-only, first POS component is `名詞`, first POS2 component is `非自立`, and remaining POS components are grammar helpers (`助詞`/`助動詞`). +4. Run targeted tokenizer/annotation tests and update the task acceptance criteria/final notes. + + +## Implementation Notes + + +Red test initially passed with headword `こと` because `こと` is already in `JLPT_EXCLUDED_TERMS` and the shared subtitle annotation filter checks that set. Updated regression to the live-risk shape `surface=ことに`, `headword=事`, with MeCab POS `名詞|助詞` / `非自立|格助詞`; this failed before the filter change and passed after. + + +## Final Summary + + +Implemented a conservative shared subtitle annotation filter for kana-only non-independent noun helper merges. Tokens such as `ことに` with a kanji dictionary headword like `事` are now stripped of known-word, N+1, JLPT, and frequency metadata when MeCab shows the first component as `名詞/非自立` and trailing components as grammar helpers. + +Added unit coverage in `src/core/services/tokenizer/annotation-stage.test.ts` and an integration-style tokenizer regression for the reported phrase shape in `src/core/services/tokenizer.test.ts`, verifying `ことに` stays plain while a real lexical token can still become the N+1 target. + +Validation: `bun test src/core/services/tokenizer/annotation-stage.test.ts`; `bun test src/core/services/tokenizer.test.ts`; `bun run test:fast`; `bun run changelog:lint`. + diff --git a/changes/298-kana-grammar-helper-annotations.md b/changes/298-kana-grammar-helper-annotations.md new file mode 100644 index 00000000..09155993 --- /dev/null +++ b/changes/298-kana-grammar-helper-annotations.md @@ -0,0 +1,4 @@ +type: fixed +area: tokenizer + +- Stopped kana-only grammar-helper merges such as `ことに` from receiving subtitle annotation metadata like N+1, JLPT, known-word, or frequency highlighting. diff --git a/src/core/services/tokenizer.test.ts b/src/core/services/tokenizer.test.ts index 670f0055..74b52020 100644 --- a/src/core/services/tokenizer.test.ts +++ b/src/core/services/tokenizer.test.ts @@ -4069,6 +4069,226 @@ test('tokenizeSubtitle clears all annotations for explanatory contrast endings', ); }); +test('tokenizeSubtitle clears annotations for ことに while preserving lexical N+1 target', async () => { + const result = await tokenizeSubtitle( + 'さっきの俺と違うことに気付かないのかい?', + makeDepsFromYomitanTokens( + [ + { surface: 'さっき', reading: 'さっき', headword: 'さっき' }, + { surface: 'の', reading: 'の', headword: 'の' }, + { surface: '俺', reading: 'おれ', headword: '俺' }, + { surface: 'と', reading: 'と', headword: 'と' }, + { surface: '違う', reading: 'ちがう', headword: '違う' }, + { surface: 'ことに', reading: 'ことに', headword: '事' }, + { surface: '気付かない', reading: 'きづかない', headword: '気付く' }, + { surface: 'の', reading: 'の', headword: 'の' }, + { surface: 'かい', reading: 'かい', headword: 'かい' }, + { surface: '?', reading: '', headword: '?' }, + ], + { + getFrequencyDictionaryEnabled: () => true, + getFrequencyRank: (text) => + text === '違う' ? 900 : text === '事' ? 81 : text === '気付く' ? 1500 : null, + getJlptLevel: (text) => + text === '違う' ? 'N4' : text === '事' ? 'N4' : text === '気付く' ? 'N3' : null, + isKnownWord: (text) => + ['さっき', 'の', '俺', 'と', '気付く', 'かい', '?'].includes(text), + getMinSentenceWordsForNPlusOne: () => 1, + tokenizeWithMecab: async () => [ + { + headword: 'さっき', + surface: 'さっき', + reading: 'サッキ', + startPos: 0, + endPos: 3, + partOfSpeech: PartOfSpeech.noun, + pos1: '名詞', + pos2: '副詞可能', + isMerged: false, + isKnown: false, + isNPlusOneTarget: false, + }, + { + headword: 'の', + surface: 'の', + reading: 'ノ', + startPos: 3, + endPos: 4, + partOfSpeech: PartOfSpeech.particle, + pos1: '助詞', + pos2: '連体化', + isMerged: false, + isKnown: false, + isNPlusOneTarget: false, + }, + { + headword: '俺', + surface: '俺', + reading: 'オレ', + startPos: 4, + endPos: 5, + partOfSpeech: PartOfSpeech.noun, + pos1: '名詞', + pos2: '代名詞', + isMerged: false, + isKnown: false, + isNPlusOneTarget: false, + }, + { + headword: 'と', + surface: 'と', + reading: 'ト', + startPos: 5, + endPos: 6, + partOfSpeech: PartOfSpeech.particle, + pos1: '助詞', + pos2: '格助詞', + isMerged: false, + isKnown: false, + isNPlusOneTarget: false, + }, + { + headword: '違う', + surface: '違う', + reading: 'チガウ', + startPos: 6, + endPos: 8, + partOfSpeech: PartOfSpeech.verb, + pos1: '動詞', + pos2: '自立', + isMerged: false, + isKnown: false, + isNPlusOneTarget: false, + }, + { + headword: '事', + surface: 'こと', + reading: 'コト', + startPos: 8, + endPos: 10, + partOfSpeech: PartOfSpeech.noun, + pos1: '名詞', + pos2: '非自立', + isMerged: false, + isKnown: false, + isNPlusOneTarget: false, + }, + { + headword: 'に', + surface: 'に', + reading: 'ニ', + startPos: 10, + endPos: 11, + partOfSpeech: PartOfSpeech.particle, + pos1: '助詞', + pos2: '格助詞', + isMerged: false, + isKnown: false, + isNPlusOneTarget: false, + }, + { + headword: '気付く', + surface: '気付か', + reading: 'キヅカ', + startPos: 11, + endPos: 14, + partOfSpeech: PartOfSpeech.verb, + pos1: '動詞', + pos2: '自立', + isMerged: false, + isKnown: false, + isNPlusOneTarget: false, + }, + { + headword: 'ない', + surface: 'ない', + reading: 'ナイ', + startPos: 14, + endPos: 16, + partOfSpeech: PartOfSpeech.bound_auxiliary, + pos1: '助動詞', + pos2: '*', + isMerged: false, + isKnown: false, + isNPlusOneTarget: false, + }, + { + headword: 'の', + surface: 'の', + reading: 'ノ', + startPos: 16, + endPos: 17, + partOfSpeech: PartOfSpeech.particle, + pos1: '助詞', + pos2: '終助詞', + isMerged: false, + isKnown: false, + isNPlusOneTarget: false, + }, + { + headword: 'かい', + surface: 'かい', + reading: 'カイ', + startPos: 17, + endPos: 19, + partOfSpeech: PartOfSpeech.particle, + pos1: '助詞', + pos2: '終助詞', + isMerged: false, + isKnown: false, + isNPlusOneTarget: false, + }, + { + headword: '?', + surface: '?', + reading: '', + startPos: 19, + endPos: 20, + partOfSpeech: PartOfSpeech.symbol, + pos1: '記号', + pos2: '一般', + isMerged: false, + isKnown: false, + isNPlusOneTarget: false, + }, + ], + }, + ), + ); + + const tokenSummary = result.tokens?.map((token) => ({ + surface: token.surface, + headword: token.headword, + isKnown: token.isKnown, + isNPlusOneTarget: token.isNPlusOneTarget, + frequencyRank: token.frequencyRank, + jlptLevel: token.jlptLevel, + })); + + assert.deepEqual( + tokenSummary?.find((token) => token.surface === 'ことに'), + { + surface: 'ことに', + headword: '事', + isKnown: false, + isNPlusOneTarget: false, + frequencyRank: undefined, + jlptLevel: undefined, + }, + ); + assert.deepEqual( + tokenSummary?.find((token) => token.surface === '違う'), + { + surface: '違う', + headword: '違う', + isKnown: false, + isNPlusOneTarget: true, + frequencyRank: 900, + jlptLevel: 'N4', + }, + ); +}); + test('tokenizeSubtitle excludes default non-independent pos2 from N+1 when JLPT/frequency are disabled', async () => { let mecabCalls = 0; const result = await tokenizeSubtitle( diff --git a/src/core/services/tokenizer/annotation-stage.test.ts b/src/core/services/tokenizer/annotation-stage.test.ts index 2e0a7bb3..cb78c244 100644 --- a/src/core/services/tokenizer/annotation-stage.test.ts +++ b/src/core/services/tokenizer/annotation-stage.test.ts @@ -353,6 +353,19 @@ test('shouldExcludeTokenFromSubtitleAnnotations excludes kana-only demonstrative assert.equal(shouldExcludeTokenFromSubtitleAnnotations(token), true); }); +test('shouldExcludeTokenFromSubtitleAnnotations excludes kana-only non-independent noun helper merges', () => { + const token = makeToken({ + surface: 'ことに', + headword: '事', + reading: 'コトニ', + partOfSpeech: PartOfSpeech.noun, + pos1: '名詞|助詞', + pos2: '非自立|格助詞', + }); + + assert.equal(shouldExcludeTokenFromSubtitleAnnotations(token), true); +}); + test('stripSubtitleAnnotationMetadata keeps token hover data while clearing annotation fields', () => { const token = makeToken({ surface: 'は', @@ -813,6 +826,36 @@ test('annotateTokens applies one shared exclusion gate across known N+1 frequenc assert.equal(result[0]?.jlptLevel, undefined); }); +test('annotateTokens clears all annotations for kana-only non-independent noun helper merges', () => { + const tokens = [ + makeToken({ + surface: 'ことに', + headword: '事', + reading: 'コトニ', + partOfSpeech: PartOfSpeech.noun, + pos1: '名詞|助詞', + pos2: '非自立|格助詞', + startPos: 0, + endPos: 3, + frequencyRank: 81, + }), + ]; + + const result = annotateTokens( + tokens, + makeDeps({ + isKnownWord: (text) => text === '事', + getJlptLevel: (text) => (text === '事' ? 'N4' : null), + }), + { minSentenceWordsForNPlusOne: 1 }, + ); + + assert.equal(result[0]?.isKnown, false); + assert.equal(result[0]?.isNPlusOneTarget, false); + assert.equal(result[0]?.frequencyRank, undefined); + assert.equal(result[0]?.jlptLevel, undefined); +}); + test('annotateTokens clears all annotations from standalone あ interjections without POS tags', () => { const tokens = [ makeToken({ diff --git a/src/core/services/tokenizer/subtitle-annotation-filter.ts b/src/core/services/tokenizer/subtitle-annotation-filter.ts index 95700613..4537a962 100644 --- a/src/core/services/tokenizer/subtitle-annotation-filter.ts +++ b/src/core/services/tokenizer/subtitle-annotation-filter.ts @@ -71,6 +71,7 @@ const SUBTITLE_ANNOTATION_EXCLUDED_TRAILING_PARTICLE_SUFFIXES = new Set([ 'ってば', ]); const AUXILIARY_STEM_GRAMMAR_TAIL_POS1 = new Set(['名詞', '助動詞', '助詞']); +const NON_INDEPENDENT_NOUN_HELPER_TAIL_POS1 = new Set(['助詞', '助動詞']); export interface SubtitleAnnotationFilterOptions { pos1Exclusions?: ReadonlySet; @@ -252,6 +253,31 @@ function isAuxiliaryStemGrammarTailToken(token: MergedToken): boolean { return pos3Parts.includes('助動詞語幹'); } +function isKanaOnlyNonIndependentNounHelperMerge(token: MergedToken): boolean { + const normalizedSurface = normalizeKana(token.surface); + const normalizedHeadword = normalizeKana(token.headword); + if ( + !normalizedSurface || + !normalizedHeadword || + normalizedSurface === normalizedHeadword || + ![...normalizedSurface].every(isKanaChar) + ) { + return false; + } + + const pos1Parts = splitNormalizedTagParts(normalizePosTag(token.pos1)); + if (pos1Parts.length < 2 || pos1Parts[0] !== '名詞') { + return false; + } + + const pos2Parts = splitNormalizedTagParts(normalizePosTag(token.pos2)); + if (pos2Parts[0] !== '非自立') { + return false; + } + + return pos1Parts.slice(1).every((part) => NON_INDEPENDENT_NOUN_HELPER_TAIL_POS1.has(part)); +} + function isExcludedByTerm(token: MergedToken): boolean { const candidates = [token.surface, token.reading, token.headword].filter( (candidate): candidate is string => typeof candidate === 'string' && candidate.length > 0, @@ -335,6 +361,10 @@ export function shouldExcludeTokenFromSubtitleAnnotations( return true; } + if (isKanaOnlyNonIndependentNounHelperMerge(token)) { + return true; + } + if (isExcludedTrailingParticleMergedToken(token)) { return true; }