mirror of
https://github.com/ksyasuda/SubMiner.git
synced 2026-05-05 00:41:30 -07:00
fix: retain frequency rank for honorific prefix-noun tokens
- Add `shouldAllowHonorificPrefixNounFrequency` to exempt お/ご/御 + noun merged tokens from frequency exclusion
- Add regression test for `ご機嫌` asserting rank 5484 is preserved after MeCab enrichment and annotation
- Close TASK-341
This commit is contained in:
@@ -0,0 +1,66 @@
|
|||||||
|
---
|
||||||
|
id: TASK-341
|
||||||
|
title: Fix frequency highlight for honorific prefix noun tokens
|
||||||
|
status: Done
|
||||||
|
assignee:
|
||||||
|
- codex
|
||||||
|
created_date: '2026-05-05 02:08'
|
||||||
|
updated_date: '2026-05-05 02:10'
|
||||||
|
labels:
|
||||||
|
- bug
|
||||||
|
- tokenizer
|
||||||
|
- frequency
|
||||||
|
dependencies: []
|
||||||
|
documentation:
|
||||||
|
- docs/architecture/2026-03-15-renderer-performance-design.md
|
||||||
|
priority: high
|
||||||
|
---
|
||||||
|
|
||||||
|
## Description
|
||||||
|
|
||||||
|
<!-- SECTION:DESCRIPTION:BEGIN -->
|
||||||
|
User reported subtitle token `ご機嫌` in `(フランク)ご機嫌が良くないようだな アンドリュー` shows Yomitan/JPDB rank 5484 in popup but is not highlighted as frequent. Frequency annotation currently excludes merged tokens containing default-excluded POS parts such as `接頭詞`; ordinal prefix-noun tokens already have an exception. Desired outcome: honorific prefix + noun lexical tokens like `ご機嫌` keep their valid frequency rank so renderer can apply frequent-token styling, while standalone prefixes and noisy merged grammar fragments remain excluded.
|
||||||
|
<!-- SECTION:DESCRIPTION:END -->
|
||||||
|
|
||||||
|
## Acceptance Criteria
|
||||||
|
<!-- AC:BEGIN -->
|
||||||
|
- [x] #1 `ご機嫌`-style honorific prefix + noun tokens retain a finite frequency rank after annotation/tokenization when frequency highlighting is enabled.
|
||||||
|
- [x] #2 Standalone prefix/noise tokens remain excluded from frequency annotation.
|
||||||
|
- [x] #3 Regression test covers the reported `ご機嫌` rank 5484 behavior.
|
||||||
|
- [x] #4 Relevant tokenizer/annotation tests pass.
|
||||||
|
<!-- AC:END -->
|
||||||
|
|
||||||
|
## Implementation Plan
|
||||||
|
|
||||||
|
<!-- SECTION:PLAN:BEGIN -->
|
||||||
|
1. Add a failing regression around honorific prefix + noun token frequency retention, using `ご機嫌` with rank 5484 and POS `接頭詞|名詞` / `名詞接続|一般`.
|
||||||
|
2. Implement a narrow annotation-stage exception for lexical honorific prefix-noun tokens, adjacent to the existing ordinal prefix-noun allowance.
|
||||||
|
3. Verify standalone prefix/noise exclusion behavior remains covered.
|
||||||
|
4. Run targeted tokenizer/annotation tests and update acceptance criteria/final notes.
|
||||||
|
<!-- SECTION:PLAN:END -->
|
||||||
|
|
||||||
|
## Implementation Notes
|
||||||
|
|
||||||
|
<!-- SECTION:NOTES:BEGIN -->
|
||||||
|
TDD red verified: `bun test src/core/services/tokenizer.test.ts -t "honorific prefix-noun"` failed with `actual: undefined`, `expected: 5484` before implementation.
|
||||||
|
|
||||||
|
Implemented a narrow honorific prefix-noun frequency allowance for merged `お`/`ご`/`御` + noun tokens with POS `接頭詞|名詞` and prefix POS2 `名詞接続`. Existing standalone prefix/noise exclusion tests still pass.
|
||||||
|
|
||||||
|
Verification: `bun test src/core/services/tokenizer.test.ts src/core/services/tokenizer/annotation-stage.test.ts` passed (164 tests); `bun run typecheck` passed; `bunx prettier --check src/core/services/tokenizer/annotation-stage.ts src/core/services/tokenizer.test.ts` passed. Repo-wide `bun run format:check:src` still fails on pre-existing `src/core/services/stats-window.ts` formatting.
|
||||||
|
<!-- SECTION:NOTES:END -->
|
||||||
|
|
||||||
|
## Final Summary
|
||||||
|
|
||||||
|
<!-- SECTION:FINAL_SUMMARY:BEGIN -->
|
||||||
|
Fixed frequency annotation for lexical honorific prefix-noun tokens such as `ご機嫌`. The annotation filter now allows merged `お`/`ご`/`御` prefix + noun tokens with MeCab POS `接頭詞|名詞` / `名詞接続|...` to retain a valid frequency rank, while standalone prefixes and existing noise filters remain excluded.
|
||||||
|
|
||||||
|
Added a tokenizer regression for the reported `ご機嫌` case asserting rank `5484` is preserved after MeCab enrichment and annotation.
|
||||||
|
|
||||||
|
Verification:
|
||||||
|
- `bun test src/core/services/tokenizer.test.ts -t "honorific prefix-noun"` failed before the fix with `undefined` vs `5484`, then passed after the fix.
|
||||||
|
- `bun test src/core/services/tokenizer.test.ts src/core/services/tokenizer/annotation-stage.test.ts` passed (164 tests).
|
||||||
|
- `bun run typecheck` passed.
|
||||||
|
- `bunx prettier --check src/core/services/tokenizer/annotation-stage.ts src/core/services/tokenizer.test.ts` passed.
|
||||||
|
|
||||||
|
Note: repo-wide `bun run format:check:src` currently fails on unrelated existing formatting in `src/core/services/stats-window.ts`.
|
||||||
|
<!-- SECTION:FINAL_SUMMARY:END -->
|
||||||
@@ -4140,6 +4140,96 @@ test('tokenizeSubtitle keeps frequency for ordinal prefix-noun tokens', async ()
|
|||||||
assert.equal(result.tokens?.[0]?.frequencyRank, 1820);
|
assert.equal(result.tokens?.[0]?.frequencyRank, 1820);
|
||||||
});
|
});
|
||||||
|
|
||||||
|
test('tokenizeSubtitle keeps frequency for honorific prefix-noun tokens', async () => {
|
||||||
|
const result = await tokenizeSubtitle(
|
||||||
|
'ご機嫌が良くない',
|
||||||
|
makeDepsFromYomitanTokens(
|
||||||
|
[
|
||||||
|
{ surface: 'ご機嫌', reading: 'ごきげん', headword: 'ご機嫌' },
|
||||||
|
{ surface: 'が', reading: 'が', headword: 'が' },
|
||||||
|
{ surface: '良くない', reading: 'よくない', headword: '良い' },
|
||||||
|
],
|
||||||
|
{
|
||||||
|
getFrequencyDictionaryEnabled: () => true,
|
||||||
|
getFrequencyRank: (text) => (text === 'ご機嫌' ? 5484 : null),
|
||||||
|
tokenizeWithMecab: async () => [
|
||||||
|
{
|
||||||
|
headword: 'ご',
|
||||||
|
surface: 'ご',
|
||||||
|
reading: 'ゴ',
|
||||||
|
startPos: 0,
|
||||||
|
endPos: 1,
|
||||||
|
partOfSpeech: PartOfSpeech.other,
|
||||||
|
pos1: '接頭詞',
|
||||||
|
pos2: '名詞接続',
|
||||||
|
isMerged: false,
|
||||||
|
isKnown: false,
|
||||||
|
isNPlusOneTarget: false,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
headword: '機嫌',
|
||||||
|
surface: '機嫌',
|
||||||
|
reading: 'キゲン',
|
||||||
|
startPos: 1,
|
||||||
|
endPos: 3,
|
||||||
|
partOfSpeech: PartOfSpeech.noun,
|
||||||
|
pos1: '名詞',
|
||||||
|
pos2: '一般',
|
||||||
|
isMerged: false,
|
||||||
|
isKnown: false,
|
||||||
|
isNPlusOneTarget: false,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
headword: 'が',
|
||||||
|
surface: 'が',
|
||||||
|
reading: 'ガ',
|
||||||
|
startPos: 3,
|
||||||
|
endPos: 4,
|
||||||
|
partOfSpeech: PartOfSpeech.particle,
|
||||||
|
pos1: '助詞',
|
||||||
|
pos2: '格助詞',
|
||||||
|
isMerged: false,
|
||||||
|
isKnown: false,
|
||||||
|
isNPlusOneTarget: false,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
headword: '良い',
|
||||||
|
surface: '良く',
|
||||||
|
reading: 'ヨク',
|
||||||
|
startPos: 4,
|
||||||
|
endPos: 6,
|
||||||
|
partOfSpeech: PartOfSpeech.i_adjective,
|
||||||
|
pos1: '形容詞',
|
||||||
|
pos2: '自立',
|
||||||
|
isMerged: false,
|
||||||
|
isKnown: false,
|
||||||
|
isNPlusOneTarget: false,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
headword: 'ない',
|
||||||
|
surface: 'ない',
|
||||||
|
reading: 'ナイ',
|
||||||
|
startPos: 6,
|
||||||
|
endPos: 8,
|
||||||
|
partOfSpeech: PartOfSpeech.bound_auxiliary,
|
||||||
|
pos1: '助動詞',
|
||||||
|
pos2: '*',
|
||||||
|
isMerged: false,
|
||||||
|
isKnown: false,
|
||||||
|
isNPlusOneTarget: false,
|
||||||
|
},
|
||||||
|
],
|
||||||
|
getMinSentenceWordsForNPlusOne: () => 1,
|
||||||
|
},
|
||||||
|
),
|
||||||
|
);
|
||||||
|
|
||||||
|
assert.equal(result.tokens?.[0]?.surface, 'ご機嫌');
|
||||||
|
assert.equal(result.tokens?.[0]?.pos1, '接頭詞|名詞');
|
||||||
|
assert.equal(result.tokens?.[0]?.pos2, '名詞接続|一般');
|
||||||
|
assert.equal(result.tokens?.[0]?.frequencyRank, 5484);
|
||||||
|
});
|
||||||
|
|
||||||
test('tokenizeSubtitle clears all annotations for explanatory contrast endings', async () => {
|
test('tokenizeSubtitle clears all annotations for explanatory contrast endings', async () => {
|
||||||
const result = await tokenizeSubtitle(
|
const result = await tokenizeSubtitle(
|
||||||
'最近辛いものが続いとるんですけど',
|
'最近辛いものが続いとるんですけど',
|
||||||
|
|||||||
@@ -167,6 +167,27 @@ function shouldAllowOrdinalPrefixNounFrequency(token: MergedToken): boolean {
|
|||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
function shouldAllowHonorificPrefixNounFrequency(token: MergedToken): boolean {
|
||||||
|
const normalizedSurface = token.surface.trim();
|
||||||
|
const normalizedHeadword = token.headword.trim();
|
||||||
|
if (
|
||||||
|
!['お', 'ご', '御'].some(
|
||||||
|
(prefix) => normalizedSurface.startsWith(prefix) || normalizedHeadword.startsWith(prefix),
|
||||||
|
)
|
||||||
|
) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
const pos1Parts = splitNormalizedTagParts(normalizePos1Tag(token.pos1));
|
||||||
|
const pos2Parts = splitNormalizedTagParts(normalizePos2Tag(token.pos2));
|
||||||
|
return (
|
||||||
|
pos1Parts.length >= 2 &&
|
||||||
|
pos1Parts[0] === '接頭詞' &&
|
||||||
|
pos1Parts.slice(1).some((part) => part === '名詞') &&
|
||||||
|
pos2Parts[0] === '名詞接続'
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
function isFrequencyExcludedByPos(
|
function isFrequencyExcludedByPos(
|
||||||
token: MergedToken,
|
token: MergedToken,
|
||||||
pos1Exclusions: ReadonlySet<string>,
|
pos1Exclusions: ReadonlySet<string>,
|
||||||
@@ -187,11 +208,13 @@ function isFrequencyExcludedByPos(
|
|||||||
pos2Exclusions,
|
pos2Exclusions,
|
||||||
);
|
);
|
||||||
const allowOrdinalPrefixNounToken = shouldAllowOrdinalPrefixNounFrequency(token);
|
const allowOrdinalPrefixNounToken = shouldAllowOrdinalPrefixNounFrequency(token);
|
||||||
|
const allowHonorificPrefixNounToken = shouldAllowHonorificPrefixNounFrequency(token);
|
||||||
|
|
||||||
if (
|
if (
|
||||||
isExcludedByTagSet(normalizedPos1, pos1Exclusions) &&
|
isExcludedByTagSet(normalizedPos1, pos1Exclusions) &&
|
||||||
!allowContentLedMergedToken &&
|
!allowContentLedMergedToken &&
|
||||||
!allowOrdinalPrefixNounToken
|
!allowOrdinalPrefixNounToken &&
|
||||||
|
!allowHonorificPrefixNounToken
|
||||||
) {
|
) {
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
@@ -199,7 +222,8 @@ function isFrequencyExcludedByPos(
|
|||||||
if (
|
if (
|
||||||
isExcludedByTagSet(normalizedPos2, pos2Exclusions) &&
|
isExcludedByTagSet(normalizedPos2, pos2Exclusions) &&
|
||||||
!allowContentLedMergedToken &&
|
!allowContentLedMergedToken &&
|
||||||
!allowOrdinalPrefixNounToken
|
!allowOrdinalPrefixNounToken &&
|
||||||
|
!allowHonorificPrefixNounToken
|
||||||
) {
|
) {
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|||||||
Reference in New Issue
Block a user