fix: retain frequency rank for honorific prefix-noun tokens

- Add `shouldAllowHonorificPrefixNounFrequency` to exempt お/ご/御 + noun merged tokens from frequency exclusion
- Add regression test for `ご機嫌` asserting rank 5484 is preserved after MeCab enrichment and annotation
- Close TASK-341
2026-05-13 20:12:54 -07:00 · 2026-05-04 19:12:22 -07:00
parent 2b60c20711
commit 0b72fa108f
3 changed files with 182 additions and 2 deletions
@@ -4140,6 +4140,96 @@ test('tokenizeSubtitle keeps frequency for ordinal prefix-noun tokens', async ()
  assert.equal(result.tokens?.[0]?.frequencyRank, 1820);
 });

+test('tokenizeSubtitle keeps frequency for honorific prefix-noun tokens', async () => { // Regression: the merged token ご機嫌 must still report its frequency rank after tokenization.
+  const result = await tokenizeSubtitle(
+    'ご機嫌が良くない',
+    makeDepsFromYomitanTokens(
+      [
+        { surface: 'ご機嫌', reading: 'ごきげん', headword: 'ご機嫌' },
+        { surface: 'が', reading: 'が', headword: 'が' },
+        { surface: '良くない', reading: 'よくない', headword: '良い' },
+      ],
+      {
+        getFrequencyDictionaryEnabled: () => true,
+        getFrequencyRank: (text) => (text === 'ご機嫌' ? 5484 : null), // only the full form ご機嫌 has a rank; every other lookup returns null
+        tokenizeWithMecab: async () => [ // MeCab stub: ご機嫌 arrives as two entries, ご (接頭詞) followed by 機嫌 (名詞)
+          {
+            headword: 'ご',
+            surface: 'ご',
+            reading: 'ゴ',
+            startPos: 0,
+            endPos: 1,
+            partOfSpeech: PartOfSpeech.other,
+            pos1: '接頭詞',
+            pos2: '名詞接続',
+            isMerged: false,
+            isKnown: false,
+            isNPlusOneTarget: false,
+          },
+          {
+            headword: '機嫌',
+            surface: '機嫌',
+            reading: 'キゲン',
+            startPos: 1,
+            endPos: 3,
+            partOfSpeech: PartOfSpeech.noun,
+            pos1: '名詞',
+            pos2: '一般',
+            isMerged: false,
+            isKnown: false,
+            isNPlusOneTarget: false,
+          },
+          {
+            headword: 'が',
+            surface: 'が',
+            reading: 'ガ',
+            startPos: 3,
+            endPos: 4,
+            partOfSpeech: PartOfSpeech.particle,
+            pos1: '助詞',
+            pos2: '格助詞',
+            isMerged: false,
+            isKnown: false,
+            isNPlusOneTarget: false,
+          },
+          {
+            headword: '良い',
+            surface: '良く',
+            reading: 'ヨク',
+            startPos: 4,
+            endPos: 6,
+            partOfSpeech: PartOfSpeech.i_adjective,
+            pos1: '形容詞',
+            pos2: '自立',
+            isMerged: false,
+            isKnown: false,
+            isNPlusOneTarget: false,
+          },
+          {
+            headword: 'ない',
+            surface: 'ない',
+            reading: 'ナイ',
+            startPos: 6,
+            endPos: 8,
+            partOfSpeech: PartOfSpeech.bound_auxiliary,
+            pos1: '助動詞',
+            pos2: '*',
+            isMerged: false,
+            isKnown: false,
+            isNPlusOneTarget: false,
+          },
+        ],
+        getMinSentenceWordsForNPlusOne: () => 1,
+      },
+    ),
+  );
+
+  assert.equal(result.tokens?.[0]?.surface, 'ご機嫌');
+  assert.equal(result.tokens?.[0]?.pos1, '接頭詞|名詞'); // expected pos1 is the two MeCab pos1 tags joined with '|'
+  assert.equal(result.tokens?.[0]?.pos2, '名詞接続|一般'); // expected pos2 is joined the same way
+  assert.equal(result.tokens?.[0]?.frequencyRank, 5484); // the rank supplied by getFrequencyRank must be kept on the token
+});
+
 test('tokenizeSubtitle clears all annotations for explanatory contrast endings', async () => {
  const result = await tokenizeSubtitle(
    '最近辛いものが続いとるんですけど',
@@ -167,6 +167,27 @@ function shouldAllowOrdinalPrefixNounFrequency(token: MergedToken): boolean {
  );
 }

+function shouldAllowHonorificPrefixNounFrequency(token: MergedToken): boolean { // True when the token starts with お/ご/御 and its POS tags read 接頭詞 followed by 名詞.
+  const normalizedSurface = token.surface.trim();
+  const normalizedHeadword = token.headword.trim();
+  if (
+    !['お', 'ご', '御'].some(
+      (prefix) => normalizedSurface.startsWith(prefix) || normalizedHeadword.startsWith(prefix), // the prefix may appear on either the trimmed surface or the trimmed headword
+    )
+  ) {
+    return false;
+  }
+
+  const pos1Parts = splitNormalizedTagParts(normalizePos1Tag(token.pos1));
+  const pos2Parts = splitNormalizedTagParts(normalizePos2Tag(token.pos2));
+  return (
+    pos1Parts.length >= 2 && // needs the prefix part plus at least one more pos1 part
+    pos1Parts[0] === '接頭詞' && // the leading pos1 part must be the prefix tag
+    pos1Parts.slice(1).some((part) => part === '名詞') && // at least one later pos1 part must be the noun tag
+    pos2Parts[0] === '名詞接続' // the leading pos2 part must be the noun-connecting subtype
+  );
+}
+
 function isFrequencyExcludedByPos(
  token: MergedToken,
  pos1Exclusions: ReadonlySet<string>,
@@ -187,11 +208,13 @@ function isFrequencyExcludedByPos(
    pos2Exclusions,
  );
  const allowOrdinalPrefixNounToken = shouldAllowOrdinalPrefixNounFrequency(token);
+  const allowHonorificPrefixNounToken = shouldAllowHonorificPrefixNounFrequency(token);

  if (
    isExcludedByTagSet(normalizedPos1, pos1Exclusions) &&
    !allowContentLedMergedToken &&
-    !allowOrdinalPrefixNounToken
+    !allowOrdinalPrefixNounToken &&
+    !allowHonorificPrefixNounToken
  ) {
    return true;
  }
@@ -199,7 +222,8 @@ function isFrequencyExcludedByPos(
  if (
    isExcludedByTagSet(normalizedPos2, pos2Exclusions) &&
    !allowContentLedMergedToken &&
-    !allowOrdinalPrefixNounToken
+    !allowOrdinalPrefixNounToken &&
+    !allowHonorificPrefixNounToken
  ) {
    return true;
  }