diff --git a/backlog/tasks/task-341 - Fix-frequency-highlight-for-honorific-prefix-noun-tokens.md b/backlog/tasks/task-341 - Fix-frequency-highlight-for-honorific-prefix-noun-tokens.md new file mode 100644 index 00000000..e6362ade --- /dev/null +++ b/backlog/tasks/task-341 - Fix-frequency-highlight-for-honorific-prefix-noun-tokens.md @@ -0,0 +1,66 @@ +--- +id: TASK-341 +title: Fix frequency highlight for honorific prefix noun tokens +status: Done +assignee: + - codex +created_date: '2026-05-05 02:08' +updated_date: '2026-05-05 02:10' +labels: + - bug + - tokenizer + - frequency +dependencies: [] +documentation: + - docs/architecture/2026-03-15-renderer-performance-design.md +priority: high +--- + +## Description + + +User reported subtitle token `ご機嫌` in `(フランク)ご機嫌が良くないようだな アンドリュー` shows Yomitan/JPDB rank 5484 in the popup but is not highlighted as frequent. Frequency annotation currently excludes merged tokens containing default-excluded POS parts such as `接頭詞`; ordinal prefix-noun tokens already have an exception. Desired outcome: honorific prefix + noun lexical tokens like `ご機嫌` keep their valid frequency rank so the renderer can apply frequent-token styling, while standalone prefixes and noisy merged grammar fragments remain excluded. + + +## Acceptance Criteria + +- [x] #1 `ご機嫌`-style honorific prefix + noun tokens retain a finite frequency rank after annotation/tokenization when frequency highlighting is enabled. +- [x] #2 Standalone prefix/noise tokens remain excluded from frequency annotation. +- [x] #3 Regression test covers the reported `ご機嫌` rank 5484 behavior. +- [x] #4 Relevant tokenizer/annotation tests pass. + + +## Implementation Plan + + +1. Add a failing regression around honorific prefix + noun token frequency retention, using `ご機嫌` with rank 5484 and POS `接頭詞|名詞` / `名詞接続|一般`. +2. Implement a narrow annotation-stage exception for lexical honorific prefix-noun tokens, adjacent to the existing ordinal prefix-noun allowance. +3. 
Verify standalone prefix/noise exclusion behavior remains covered. +4. Run targeted tokenizer/annotation tests and update acceptance criteria/final notes. + + +## Implementation Notes + + +TDD red verified: `bun test src/core/services/tokenizer.test.ts -t "honorific prefix-noun"` failed with `actual: undefined`, `expected: 5484` before implementation. + +Implemented a narrow honorific prefix-noun frequency allowance for merged `お`/`ご`/`御` + noun tokens with POS `接頭詞|名詞` and prefix POS2 `名詞接続`. Existing standalone prefix/noise exclusion tests still pass. + +Verification: `bun test src/core/services/tokenizer.test.ts src/core/services/tokenizer/annotation-stage.test.ts` passed (164 tests); `bun run typecheck` passed; `bunx prettier --check src/core/services/tokenizer/annotation-stage.ts src/core/services/tokenizer.test.ts` passed. Repo-wide `bun run format:check:src` still fails on pre-existing `src/core/services/stats-window.ts` formatting. + + +## Final Summary + + +Fixed frequency annotation for lexical honorific prefix-noun tokens such as `ご機嫌`. The annotation filter now allows merged `お`/`ご`/`御` prefix + noun tokens with MeCab POS `接頭詞|名詞` / `名詞接続|...` to retain a valid frequency rank, while standalone prefixes and existing noise filters remain excluded. + +Added a tokenizer regression for the reported `ご機嫌` case asserting rank `5484` is preserved after MeCab enrichment and annotation. + +Verification: +- `bun test src/core/services/tokenizer.test.ts -t "honorific prefix-noun"` failed before the fix with `undefined` vs `5484`, then passed after the fix. +- `bun test src/core/services/tokenizer.test.ts src/core/services/tokenizer/annotation-stage.test.ts` passed (164 tests). +- `bun run typecheck` passed. +- `bunx prettier --check src/core/services/tokenizer/annotation-stage.ts src/core/services/tokenizer.test.ts` passed. + +Note: repo-wide `bun run format:check:src` currently fails on unrelated existing formatting in `src/core/services/stats-window.ts`. 
+ diff --git a/src/core/services/tokenizer.test.ts b/src/core/services/tokenizer.test.ts index 885f5c8d..d26fb152 100644 --- a/src/core/services/tokenizer.test.ts +++ b/src/core/services/tokenizer.test.ts @@ -4140,6 +4140,96 @@ test('tokenizeSubtitle keeps frequency for ordinal prefix-noun tokens', async () assert.equal(result.tokens?.[0]?.frequencyRank, 1820); }); +test('tokenizeSubtitle keeps frequency for honorific prefix-noun tokens', async () => { + const result = await tokenizeSubtitle( + 'ご機嫌が良くない', + makeDepsFromYomitanTokens( + [ + { surface: 'ご機嫌', reading: 'ごきげん', headword: 'ご機嫌' }, + { surface: 'が', reading: 'が', headword: 'が' }, + { surface: '良くない', reading: 'よくない', headword: '良い' }, + ], + { + getFrequencyDictionaryEnabled: () => true, + getFrequencyRank: (text) => (text === 'ご機嫌' ? 5484 : null), + tokenizeWithMecab: async () => [ + { + headword: 'ご', + surface: 'ご', + reading: 'ゴ', + startPos: 0, + endPos: 1, + partOfSpeech: PartOfSpeech.other, + pos1: '接頭詞', + pos2: '名詞接続', + isMerged: false, + isKnown: false, + isNPlusOneTarget: false, + }, + { + headword: '機嫌', + surface: '機嫌', + reading: 'キゲン', + startPos: 1, + endPos: 3, + partOfSpeech: PartOfSpeech.noun, + pos1: '名詞', + pos2: '一般', + isMerged: false, + isKnown: false, + isNPlusOneTarget: false, + }, + { + headword: 'が', + surface: 'が', + reading: 'ガ', + startPos: 3, + endPos: 4, + partOfSpeech: PartOfSpeech.particle, + pos1: '助詞', + pos2: '格助詞', + isMerged: false, + isKnown: false, + isNPlusOneTarget: false, + }, + { + headword: '良い', + surface: '良く', + reading: 'ヨク', + startPos: 4, + endPos: 6, + partOfSpeech: PartOfSpeech.i_adjective, + pos1: '形容詞', + pos2: '自立', + isMerged: false, + isKnown: false, + isNPlusOneTarget: false, + }, + { + headword: 'ない', + surface: 'ない', + reading: 'ナイ', + startPos: 6, + endPos: 8, + partOfSpeech: PartOfSpeech.bound_auxiliary, + pos1: '助動詞', + pos2: '*', + isMerged: false, + isKnown: false, + isNPlusOneTarget: false, + }, + ], + getMinSentenceWordsForNPlusOne: 
() => 1, + }, + ), + ); + + assert.equal(result.tokens?.[0]?.surface, 'ご機嫌'); + assert.equal(result.tokens?.[0]?.pos1, '接頭詞|名詞'); + assert.equal(result.tokens?.[0]?.pos2, '名詞接続|一般'); + assert.equal(result.tokens?.[0]?.frequencyRank, 5484); +}); + test('tokenizeSubtitle clears all annotations for explanatory contrast endings', async () => { const result = await tokenizeSubtitle( '最近辛いものが続いとるんですけど', diff --git a/src/core/services/tokenizer/annotation-stage.ts b/src/core/services/tokenizer/annotation-stage.ts index e95435f1..f1cd6bb6 100644 --- a/src/core/services/tokenizer/annotation-stage.ts +++ b/src/core/services/tokenizer/annotation-stage.ts @@ -167,6 +167,27 @@ function shouldAllowOrdinalPrefixNounFrequency(token: MergedToken): boolean { ); } +function shouldAllowHonorificPrefixNounFrequency(token: MergedToken): boolean { + const normalizedSurface = token.surface.trim(); + const normalizedHeadword = token.headword.trim(); + if ( + !['お', 'ご', '御'].some( + (prefix) => normalizedSurface.startsWith(prefix) || normalizedHeadword.startsWith(prefix), + ) + ) { + return false; + } + + const pos1Parts = splitNormalizedTagParts(normalizePos1Tag(token.pos1)); + const pos2Parts = splitNormalizedTagParts(normalizePos2Tag(token.pos2)); + return ( + pos1Parts.length >= 2 && + pos1Parts[0] === '接頭詞' && + pos1Parts.slice(1).some((part) => part === '名詞') && + pos2Parts[0] === '名詞接続' + ); +} + function isFrequencyExcludedByPos( token: MergedToken, pos1Exclusions: ReadonlySet, @@ -187,11 +208,13 @@ function isFrequencyExcludedByPos( pos2Exclusions, ); const allowOrdinalPrefixNounToken = shouldAllowOrdinalPrefixNounFrequency(token); + const allowHonorificPrefixNounToken = shouldAllowHonorificPrefixNounFrequency(token); if ( isExcludedByTagSet(normalizedPos1, pos1Exclusions) && !allowContentLedMergedToken && - !allowOrdinalPrefixNounToken + !allowOrdinalPrefixNounToken && + !allowHonorificPrefixNounToken ) { return true; } @@ -199,7 +222,8 @@ function isFrequencyExcludedByPos( if ( 
isExcludedByTagSet(normalizedPos2, pos2Exclusions) && !allowContentLedMergedToken && - !allowOrdinalPrefixNounToken + !allowOrdinalPrefixNounToken && + !allowHonorificPrefixNounToken ) { return true; }