From 7442e4266cf403e729cb463873627a75e3804a77 Mon Sep 17 00:00:00 2001 From: sudacode Date: Tue, 28 Apr 2026 00:09:02 -0700 Subject: [PATCH] fix: suppress N+1 for kana-only candidates and fix minSentenceWords count MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Treat kana-only tokens with surrounding subtitle punctuation (…, ―, etc.) as kana-only so they are not promoted to N+1 targets - Exclude unknown tokens filtered from N+1 targeting from the minSentenceWords count so filtered kana-only unknowns cannot satisfy sentence length threshold - Add regression tests for kana-only candidate suppression and filtered-unknown padding cases --- ...light-for-kana-only-candidate-sentences.md | 58 +++++++++++++ .../tokenizer/annotation-stage.test.ts | 83 +++++++++++++++++++ src/token-merger.ts | 35 +++++++- 3 files changed, 174 insertions(+), 2 deletions(-) create mode 100644 backlog/tasks/task-310 - Suppress-N1-highlight-for-kana-only-candidate-sentences.md diff --git a/backlog/tasks/task-310 - Suppress-N1-highlight-for-kana-only-candidate-sentences.md b/backlog/tasks/task-310 - Suppress-N1-highlight-for-kana-only-candidate-sentences.md new file mode 100644 index 00000000..46093703 --- /dev/null +++ b/backlog/tasks/task-310 - Suppress-N1-highlight-for-kana-only-candidate-sentences.md @@ -0,0 +1,58 @@ +--- +id: TASK-310 +title: Suppress N+1 highlight for kana-only candidate sentences +status: Done +assignee: + - Codex +created_date: '2026-04-28 06:55' +updated_date: '2026-04-28 07:04' +labels: + - tokenizer + - n+1 +dependencies: [] +priority: medium +--- + +## Description + + +Reduce noisy N+1 subtitle annotations when the only unknown candidates in a sentence are kana-only hiragana or katakana words, such as mostly-kana subtitle lines where highlighting a particle/helper-like token is low value. 
+ +## Acceptance Criteria + +- [x] #1 N+1 annotation does not mark a kana-only unknown target when all N+1 candidates in the sentence are kana-only. +- [x] #2 N+1 annotation continues to mark kanji or mixed-script unknown targets in otherwise eligible sentences. +- [x] #3 A focused regression test covers the kana-only candidate case. +- [x] #4 N+1 minimum sentence word count excludes tokens stripped by the subtitle annotation filter, so filtered grammar/noise tokens cannot satisfy minSentenceWords. + + +## Implementation Plan + + +1. Keep the existing N+1 target eligibility guard: kana-only subtitle surfaces do not become N+1 targets. +2. Add a focused regression in src/core/services/tokenizer/annotation-stage.test.ts proving annotation-filtered tokens do not count toward ankiConnect.nPlusOne.minSentenceWords. +3. Verify the new regression fails before code changes. +4. Patch src/token-merger.ts so the N+1 minimum sentence word count uses the same subtitle-annotation eligibility filter as annotation rendering, excluding filtered particles/auxiliaries/noise from the count. +5. Re-run focused tokenizer tests, then update TASK-310 acceptance criteria and final notes. + + +## Implementation Notes + + +Initial context: current token-merger has an existing surface-level kana-only guard in isNPlusOneCandidateToken, added in commit 9e4ad907. Need to decide whether to broaden behavior to lookup/headword forms or verify current behavior only. + +Implemented by treating kana-only N+1 candidates as kana-only even when their token surface includes surrounding subtitle punctuation such as ellipsis or dashes. Focused regression was red before the token-merger change: スイッチ… was marked true, then passed after the guard update. test:env initially hit an unrelated immersion-tracker active_days timing/order failure and Bun follow-on loader error; the failing test passed in isolation and the full test:env rerun passed. 
+ +Reopened for follow-up scope: minSentenceWords must count annotation-eligible tokens only, not tokens stripped from annotation metadata. + +Implemented follow-up minSentenceWords behavior: unknown tokens filtered from N+1 targeting no longer contribute to sentence length; known eligible tokens and true N+1 candidates still count. + + +## Final Summary + + +Changed N+1 sentence-length counting so minSentenceWords only counts known eligible words and actual N+1 target candidates. Unknown tokens filtered from N+1 targeting, including kana-only unknowns, no longer pad a sentence into eligibility. Existing annotation-filtered particles/auxiliaries remain excluded. Added regression coverage for the filtered unknown padding case while preserving kanji/mixed-script target behavior. + +Verification: new regression failed before implementation; `bun test src/core/services/tokenizer/annotation-stage.test.ts -t "N\\+1"` pass; full `bun test src/core/services/tokenizer/annotation-stage.test.ts` pass; `bun test src/core/services/tokenizer.test.ts -t "N\\+1"` pass; `bun run typecheck` pass. 
+ diff --git a/src/core/services/tokenizer/annotation-stage.test.ts b/src/core/services/tokenizer/annotation-stage.test.ts index c098e646..32e6ee25 100644 --- a/src/core/services/tokenizer/annotation-stage.test.ts +++ b/src/core/services/tokenizer/annotation-stage.test.ts @@ -627,6 +627,63 @@ test('annotateTokens N+1 handoff marks expected target when threshold is satisfi assert.equal(result[2]?.isNPlusOneTarget, false); }); +test('annotateTokens does not mark kana-only unknown target with subtitle punctuation as N+1', () => { + const tokens = [ + makeToken({ + surface: '何やら', + headword: '何やら', + reading: 'ナニヤラ', + pos1: '副詞', + startPos: 0, + endPos: 3, + }), + makeToken({ + surface: 'ボタン', + headword: 'ボタン', + reading: 'ボタン', + pos1: '名詞', + startPos: 3, + endPos: 6, + }), + makeToken({ + surface: 'スイッチ…', + headword: 'スイッチ', + reading: 'スイッチ', + pos1: '名詞', + startPos: 6, + endPos: 11, + }), + ]; + + const result = annotateTokens( + tokens, + makeDeps({ + isKnownWord: (text) => text === '何やら' || text === 'ボタン', + }), + { minSentenceWordsForNPlusOne: 3 }, + ); + + assert.equal(result[2]?.isNPlusOneTarget, false); +}); + +test('annotateTokens still marks kanji unknown target in otherwise eligible sentence as N+1', () => { + const tokens = [ + makeToken({ surface: '私', headword: '私', pos1: '名詞', startPos: 0, endPos: 1 }), + makeToken({ surface: '猫', headword: '猫', pos1: '名詞', startPos: 1, endPos: 2 }), + makeToken({ surface: '装置…', headword: '装置', pos1: '名詞', startPos: 2, endPos: 5 }), + ]; + + const result = annotateTokens( + tokens, + makeDeps({ + isKnownWord: (text) => text === '私' || text === '猫', + }), + { minSentenceWordsForNPlusOne: 3 }, + ); + + assert.equal(result[2]?.isNPlusOneTarget, true); +}); + test('annotateTokens N+1 minimum sentence words counts only eligible word tokens', () => { const tokens = [ makeToken({ surface: '猫', headword: '猫', startPos: 0, endPos: 1 }), @@ -662,6 +719,32 @@ test('annotateTokens N+1 minimum sentence words counts only 
eligible word tokens assert.equal(result[0]?.isNPlusOneTarget, false); }); +test('annotateTokens N+1 minimum sentence words excludes unknown tokens filtered from N+1 targeting', () => { + const tokens = [ + makeToken({ surface: '私', headword: '私', pos1: '名詞', startPos: 0, endPos: 1 }), + makeToken({ surface: '猫', headword: '猫', pos1: '名詞', startPos: 1, endPos: 2 }), + makeToken({ + surface: 'スイッチ', + headword: 'スイッチ', + reading: 'スイッチ', + pos1: '名詞', + startPos: 2, + endPos: 6, + }), + makeToken({ surface: '装置', headword: '装置', pos1: '名詞', startPos: 6, endPos: 8 }), + ]; + + const result = annotateTokens( + tokens, + makeDeps({ + isKnownWord: (text) => text === '私' || text === '猫', + }), + { minSentenceWordsForNPlusOne: 4 }, + ); + + assert.equal(result[3]?.isNPlusOneTarget, false); +}); + test('annotateTokens N+1 sentence word count respects source punctuation gaps omitted by Yomitan', () => { const tokens = [ makeToken({ diff --git a/src/token-merger.ts b/src/token-merger.ts index f0a65e5b..dd69866c 100644 --- a/src/token-merger.ts +++ b/src/token-merger.ts @@ -298,9 +298,28 @@ function isKanaChar(char: string): boolean { ); } +function isKanaCandidateIgnorableChar(char: string): boolean { + return /^[\s.,!?;:()[\]{}"'`、。!?…‥・「」『』()[]{}〈〉《》【】―-]$/u.test(char); +} + function isKanaOnlyText(text: string): boolean { const normalized = text.trim(); - return normalized.length > 0 && Array.from(normalized).every((char) => isKanaChar(char)); + if (normalized.length === 0) { + return false; + } + + let hasKana = false; + for (const char of normalized) { + if (isKanaChar(char)) { + hasKana = true; + continue; + } + if (!isKanaCandidateIgnorableChar(char)) { + return false; + } + } + + return hasKana; } function normalizeSourceTextForTokenOffsets(sourceText: string | undefined): string | undefined { @@ -367,6 +386,18 @@ function isNPlusOneWordCountToken( return true; } +function isNPlusOneSentenceLengthToken( + token: MergedToken, + pos1Exclusions: ReadonlySet = 
N_PLUS_ONE_IGNORED_POS1, + pos2Exclusions: ReadonlySet = N_PLUS_ONE_IGNORED_POS2, +): boolean { + if (!isNPlusOneWordCountToken(token, pos1Exclusions, pos2Exclusions)) { + return false; + } + + return token.isKnown || isNPlusOneCandidateToken(token, pos1Exclusions, pos2Exclusions); +} + function isSentenceBoundaryToken(token: MergedToken): boolean { if (token.partOfSpeech !== PartOfSpeech.symbol) { return false; @@ -418,7 +449,7 @@ export function markNPlusOneTargets( for (let i = start; i < endExclusive; i++) { const token = markedTokens[i]; if (!token) continue; - if (isNPlusOneWordCountToken(token, pos1Exclusions, pos2Exclusions)) { + if (isNPlusOneSentenceLengthToken(token, pos1Exclusions, pos2Exclusions)) { sentenceWordCount += 1; }