From 9e4ad907fe694a169d11739881890d41214f907d Mon Sep 17 00:00:00 2001
From: sudacode
Date: Sun, 26 Apr 2026 19:21:59 -0700
Subject: [PATCH] fix: exclude kana-only n+1 targets

---
 ...ana-only-words-from-N1-subtitle-targets.md | 58 +++++++++++++
 changes/307-kana-nplusone-targets.md          |  4 +
 src/core/services/tokenizer.test.ts           | 87 ++++++++++++-------
 .../tokenizer/annotation-stage.test.ts        | 26 +++---
 src/token-merger.ts                           | 23 +++++
 5 files changed, 154 insertions(+), 44 deletions(-)
 create mode 100644 backlog/tasks/task-307 - Exclude-kana-only-words-from-N1-subtitle-targets.md
 create mode 100644 changes/307-kana-nplusone-targets.md

diff --git a/backlog/tasks/task-307 - Exclude-kana-only-words-from-N1-subtitle-targets.md b/backlog/tasks/task-307 - Exclude-kana-only-words-from-N1-subtitle-targets.md
new file mode 100644
index 00000000..78e4de50
--- /dev/null
+++ b/backlog/tasks/task-307 - Exclude-kana-only-words-from-N1-subtitle-targets.md
@@ -0,0 +1,58 @@
+---
+id: TASK-307
+title: Exclude kana-only words from N+1 subtitle targets
+status: Done
+assignee:
+  - codex
+created_date: '2026-04-27 01:52'
+updated_date: '2026-04-27 01:57'
+labels:
+  - tokenizer
+  - annotations
+dependencies: []
+priority: medium
+---
+
+## Description
+
+
+Subtitle N+1 annotation is over-targeting kana-only or hiragana/katakana tokens that collapse to dictionary words. Adjust targeting so kana-only tokens are not selected as N+1 candidates, while preserving tokenization/hover behavior and other annotation metadata where existing filters allow it.
+
+
+## Acceptance Criteria
+
+- [x] #1 Kana-only subtitle tokens are not marked as N+1 targets.
+- [x] #2 Kanji or mixed lexical tokens can still be marked as N+1 targets when they are the single unknown candidate in a sentence.
+- [x] #3 Regression coverage demonstrates the kana-only N+1 exclusion.
+
+
+## Implementation Plan
+
+
+1. Add a failing regression in `src/core/services/tokenizer.test.ts` showing a kana-only Yomitan token is not selected as the single N+1 target, while a mixed lexical token in the same style still can be targeted.
+2. Implement the smallest filter in `src/token-merger.ts`: N+1 candidate selection rejects tokens whose surface is entirely kana; word-count behavior remains governed by existing annotation/POS filters.
+3. Run the focused tokenizer tests, then update task acceptance criteria/final summary.
+
+
+## Implementation Notes
+
+
+Implemented a surface-level kana-only guard in N+1 candidate selection. Kept existing word-count/POS filtering behavior intact; updated tokenizer and annotation-stage expectations where old tests intentionally allowed kana-only N+1 targets.
+
+
+## Final Summary
+
+
+Summary:
+- Added kana-only surface detection to `isNPlusOneCandidateToken` so hiragana/katakana-only subtitle tokens are not selected as N+1 targets.
+- Added/updated tokenizer and annotation-stage regressions for kana-only targets while preserving non-kana N+1 behavior.
+- Added changelog fragment `changes/307-kana-nplusone-targets.md`.
+
+Verification:
+- `bun test src/core/services/tokenizer.test.ts --test-name-pattern "kana-only N\+1"` failed before the fix with `true !== false`.
+- `bun test src/core/services/tokenizer/annotation-stage.test.ts src/core/services/tokenizer.test.ts` passed.
+- `bun run typecheck` passed.
+- `bun run test:fast` passed.
+- `bun run changelog:lint` passed.
+- `bunx prettier --check src/core/services/tokenizer.test.ts src/core/services/tokenizer/annotation-stage.test.ts src/token-merger.ts changes/307-kana-nplusone-targets.md` passed.
+
diff --git a/changes/307-kana-nplusone-targets.md b/changes/307-kana-nplusone-targets.md
new file mode 100644
index 00000000..7ea569f9
--- /dev/null
+++ b/changes/307-kana-nplusone-targets.md
@@ -0,0 +1,4 @@
+type: fixed
+area: tokenizer
+
+- Stopped kana-only subtitle tokens from being selected as N+1 targets.
diff --git a/src/core/services/tokenizer.test.ts b/src/core/services/tokenizer.test.ts
index 170f4853..50f524c7 100644
--- a/src/core/services/tokenizer.test.ts
+++ b/src/core/services/tokenizer.test.ts
@@ -2306,6 +2306,29 @@ test('tokenizeSubtitle selects one N+1 target token', async () => {
   assert.equal(targets[0]?.surface, '犬');
 });
 
+test('tokenizeSubtitle does not select kana-only N+1 target tokens', async () => {
+  const result = await tokenizeSubtitle(
+    '私のばあい',
+    makeDepsFromYomitanTokens(
+      [
+        { surface: '私', reading: 'わたし', headword: '私' },
+        { surface: 'の', reading: 'の', headword: 'の' },
+        { surface: 'ばあい', reading: 'ばあい', headword: '場合' },
+      ],
+      {
+        getMinSentenceWordsForNPlusOne: () => 2,
+        isKnownWord: (text) => text === '私',
+      },
+    ),
+  );
+
+  assert.equal(result.tokens?.length, 3);
+  assert.equal(
+    result.tokens?.some((token) => token.isNPlusOneTarget),
+    false,
+  );
+});
+
 test('tokenizeSubtitle does not mark target when sentence has multiple candidates', async () => {
   const result = await tokenizeSubtitle(
     '猫犬',
@@ -3040,15 +3063,18 @@ test('tokenizeSubtitle uses Yomitan word classes to classify standalone particle
   let mecabCalls = 0;
   const result = await tokenizeSubtitle(
     'は',
-    makeDepsFromYomitanTokens([{ surface: 'は', reading: 'は', headword: 'は', wordClasses: ['prt'] }], {
-      getFrequencyDictionaryEnabled: () => true,
-      getFrequencyRank: (text) => (text === 'は' ? 10 : null),
-      getJlptLevel: (text) => (text === 'は' ? 'N5' : null),
-      tokenizeWithMecab: async () => {
-        mecabCalls += 1;
-        return null;
+    makeDepsFromYomitanTokens(
+      [{ surface: 'は', reading: 'は', headword: 'は', wordClasses: ['prt'] }],
+      {
+        getFrequencyDictionaryEnabled: () => true,
+        getFrequencyRank: (text) => (text === 'は' ? 10 : null),
+        getJlptLevel: (text) => (text === 'は' ? 'N5' : null),
+        tokenizeWithMecab: async () => {
+          mecabCalls += 1;
+          return null;
+        },
       },
-    }),
+    ),
   );
 
   assert.equal(mecabCalls, 1);
@@ -3063,24 +3089,27 @@ test('tokenizeSubtitle uses Yomitan word classes to classify standalone particle
 test('tokenizeSubtitle fills detailed MeCab POS when Yomitan word class supplies coarse POS', async () => {
   const result = await tokenizeSubtitle(
     'は',
-    makeDepsFromYomitanTokens([{ surface: 'は', reading: 'は', headword: 'は', wordClasses: ['prt'] }], {
-      tokenizeWithMecab: async () => [
-        {
-          headword: 'は',
-          surface: 'は',
-          reading: 'ハ',
-          startPos: 0,
-          endPos: 1,
-          partOfSpeech: PartOfSpeech.particle,
-          pos1: '助詞',
-          pos2: '係助詞',
-          pos3: '*',
-          isMerged: false,
-          isKnown: false,
-          isNPlusOneTarget: false,
-        },
-      ],
-    }),
+    makeDepsFromYomitanTokens(
+      [{ surface: 'は', reading: 'は', headword: 'は', wordClasses: ['prt'] }],
+      {
+        tokenizeWithMecab: async () => [
+          {
+            headword: 'は',
+            surface: 'は',
+            reading: 'ハ',
+            startPos: 0,
+            endPos: 1,
+            partOfSpeech: PartOfSpeech.particle,
+            pos1: '助詞',
+            pos2: '係助詞',
+            pos3: '*',
+            isMerged: false,
+            isKnown: false,
+            isNPlusOneTarget: false,
+          },
+        ],
+      },
+    ),
   );
 
   assert.equal(result.tokens?.[0]?.partOfSpeech, PartOfSpeech.particle);
@@ -3682,7 +3711,7 @@ test('tokenizeSubtitle excludes single-kana merged tokens from frequency highlig
   assert.equal(result.tokens?.[0]?.frequencyRank, undefined);
 });
 
-test('tokenizeSubtitle excludes merged function/content token from frequency highlighting but keeps N+1', async () => {
+test('tokenizeSubtitle excludes merged kana-only function/content token from frequency and N+1', async () => {
   const result = await tokenizeSubtitle(
     'になれば',
     makeDepsFromYomitanTokens([{ surface: 'になれば', reading: 'になれば', headword: 'なる' }], {
@@ -3736,7 +3765,7 @@ test('tokenizeSubtitle excludes merged function/content token from frequency hig
   assert.equal(result.tokens?.length, 1);
   assert.equal(result.tokens?.[0]?.pos1, '助詞|動詞');
   assert.equal(result.tokens?.[0]?.frequencyRank, undefined);
-  assert.equal(result.tokens?.[0]?.isNPlusOneTarget, true);
+  assert.equal(result.tokens?.[0]?.isNPlusOneTarget, false);
 });
 
 test('tokenizeSubtitle clears all annotations for kana-only demonstrative helper merges', async () => {
@@ -3935,7 +3964,7 @@ test('tokenizeSubtitle clears all annotations for explanatory pondering endings'
       surface: 'どうかしちゃった',
       headword: 'どうかしちゃう',
       isKnown: false,
-      isNPlusOneTarget: true,
+      isNPlusOneTarget: false,
       frequencyRank: 3200,
       jlptLevel: 'N3',
     },
diff --git a/src/core/services/tokenizer/annotation-stage.test.ts b/src/core/services/tokenizer/annotation-stage.test.ts
index 70ec7ba0..9df047e8 100644
--- a/src/core/services/tokenizer/annotation-stage.test.ts
+++ b/src/core/services/tokenizer/annotation-stage.test.ts
@@ -570,13 +570,13 @@ test('annotateTokens keeps other annotations for name matches when name highligh
   let jlptLookupCalls = 0;
   const tokens = [
     makeToken({
-      surface: 'オリヴィア',
-      reading: 'オリヴィア',
-      headword: 'オリヴィア',
+      surface: '山田',
+      reading: 'ヤマダ',
+      headword: '山田',
       isNameMatch: true,
       frequencyRank: 42,
       startPos: 0,
-      endPos: 5,
+      endPos: 2,
     }),
   ];
 
@@ -770,7 +770,7 @@ test('annotateTokens allows previously default-excluded pos1 when removed from e
   });
 
   assert.equal(result[0]?.frequencyRank, 8);
-  assert.equal(result[0]?.isNPlusOneTarget, true);
+  assert.equal(result[0]?.isNPlusOneTarget, false);
 });
 
 test('annotateTokens excludes default non-independent pos2 from frequency and N+1', () => {
@@ -787,13 +787,9 @@ test('annotateTokens excludes default non-independent pos2 from frequency and N+
     }),
   ];
 
-  const result = annotateTokens(
-    tokens,
-    makeDeps(),
-    {
-      minSentenceWordsForNPlusOne: 1,
-    },
-  );
+  const result = annotateTokens(tokens, makeDeps(), {
+    minSentenceWordsForNPlusOne: 1,
+  });
 
   assert.equal(result[0]?.frequencyRank, undefined);
   assert.equal(result[0]?.isNPlusOneTarget, false);
@@ -969,10 +965,10 @@ test('annotateTokens allows previously default-excluded pos2 when removed from e
   });
 
   assert.equal(result[0]?.frequencyRank, 9);
-  assert.equal(result[0]?.isNPlusOneTarget, true);
+  assert.equal(result[0]?.isNPlusOneTarget, false);
 });
 
-test('annotateTokens excludes composite function/content tokens from frequency but keeps N+1 eligible', () => {
+test('annotateTokens excludes kana-only composite function/content tokens from frequency and N+1', () => {
   const tokens = [
     makeToken({
       surface: 'になれば',
@@ -990,7 +986,7 @@ test('annotateTokens excludes composite function/content tokens from frequency b
   });
 
   assert.equal(result[0]?.frequencyRank, undefined);
-  assert.equal(result[0]?.isNPlusOneTarget, true);
+  assert.equal(result[0]?.isNPlusOneTarget, false);
 });
 
 test('annotateTokens excludes composite tokens when all component pos tags are excluded', () => {
diff --git a/src/token-merger.ts b/src/token-merger.ts
index 23d54efa..948eb322 100644
--- a/src/token-merger.ts
+++ b/src/token-merger.ts
@@ -282,6 +282,26 @@ function isExcludedByTagSet(normalizedTag: string, exclusions: ReadonlySet
   return normalizedTag.split('|').some((part) => exclusions.has(part));
 }
 
+function isKanaChar(char: string): boolean {
+  const code = char.codePointAt(0);
+  if (code === undefined) {
+    return false;
+  }
+
+  return (
+    (code >= 0x3041 && code <= 0x3096) ||
+    (code >= 0x309b && code <= 0x309f) ||
+    code === 0x30fc ||
+    (code >= 0x30a0 && code <= 0x30fa) ||
+    (code >= 0x30fd && code <= 0x30ff)
+  );
+}
+
+function isKanaOnlyText(text: string): boolean {
+  const normalized = text.trim();
+  return normalized.length > 0 && Array.from(normalized).every((char) => isKanaChar(char));
+}
+
 export function isNPlusOneCandidateToken(
   token: MergedToken,
   pos1Exclusions: ReadonlySet<string> = N_PLUS_ONE_IGNORED_POS1,
@@ -290,6 +310,9 @@
   if (token.isKnown) {
     return false;
   }
+  if (isKanaOnlyText(token.surface)) {
+    return false;
+  }
   return isNPlusOneWordCountToken(token, pos1Exclusions, pos2Exclusions);
 }
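
Note (outside the diff, for review): the new guard treats a surface as kana-only when every code point of its trimmed text falls in the hiragana block (U+3041-U+3096), the kana voicing/iteration marks (U+309B-U+309F), the long-vowel mark (U+30FC), or the katakana blocks (U+30A0-U+30FA, U+30FD-U+30FF). Below is a minimal standalone sketch of that behavior; the local names isKana and isKanaOnly are illustrative stand-ins for the patch's isKanaChar and isKanaOnlyText, which are not exported from src/token-merger.ts.

    // Sketch only: mirrors the code-point ranges added in src/token-merger.ts.
    const isKana = (cp: number): boolean =>
      (cp >= 0x3041 && cp <= 0x3096) ||
      (cp >= 0x309b && cp <= 0x309f) ||
      cp === 0x30fc ||
      (cp >= 0x30a0 && cp <= 0x30fa) ||
      (cp >= 0x30fd && cp <= 0x30ff);

    const isKanaOnly = (text: string): boolean =>
      text.trim().length > 0 &&
      Array.from(text.trim()).every((char) => isKana(char.codePointAt(0) ?? -1));

    console.log(isKanaOnly('ばあい')); // true  -> hiragana only, excluded from N+1 targeting
    console.log(isKanaOnly('スキー')); // true  -> katakana plus the U+30FC long-vowel mark
    console.log(isKanaOnly('場合'));   // false -> kanji surface, still N+1 eligible
    console.log(isKanaOnly('犬'));     // false -> matches the existing single-target test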