fix: preserve ordinal frequency annotations

2026-08-02 19:21:34 -07:00 · 2026-05-03 21:07:46 -07:00
parent 4bd8fc3db4
commit 3284c40ab5
6 changed files with 258 additions and 2 deletions
@@ -4077,6 +4077,69 @@ test('tokenizeSubtitle keeps frequency for content-led merged token with trailin
  assert.equal(result.tokens?.[0]?.frequencyRank, 5468);
 });

+test('tokenizeSubtitle keeps frequency for ordinal prefix-noun tokens', async () => { // regression: the 第二 token must keep its frequency rank
+  const result = await tokenizeSubtitle(
+    '第二走者',
+    makeDepsFromYomitanTokens(
+      [ // Yomitan-side tokens: 第二 followed by 走者
+        { surface: '第二', reading: 'だいに', headword: '第二' },
+        { surface: '走者', reading: 'そうしゃ', headword: '走者' },
+      ],
+      {
+        getFrequencyDictionaryEnabled: () => true,
+        getFrequencyRank: (text) => (text === '第二' ? 1820 : text === '走者' ? 41555 : null),
+        tokenizeWithMecab: async () => [ // MeCab mock splits 第二 into 第 (pos1 接頭詞) + 二 (pos1 名詞)
+          {
+            headword: '第',
+            surface: '第',
+            reading: 'ダイ',
+            startPos: 0,
+            endPos: 1,
+            partOfSpeech: PartOfSpeech.other,
+            pos1: '接頭詞',
+            pos2: '数接続',
+            isMerged: false,
+            isKnown: false,
+            isNPlusOneTarget: false,
+          },
+          {
+            headword: '二',
+            surface: '二',
+            reading: 'ニ',
+            startPos: 1,
+            endPos: 2,
+            partOfSpeech: PartOfSpeech.noun,
+            pos1: '名詞',
+            pos2: '数',
+            isMerged: false,
+            isKnown: false,
+            isNPlusOneTarget: false,
+          },
+          {
+            headword: '走者',
+            surface: '走者',
+            reading: 'ソウシャ',
+            startPos: 2,
+            endPos: 4,
+            partOfSpeech: PartOfSpeech.noun,
+            pos1: '名詞',
+            pos2: '一般',
+            isMerged: false,
+            isKnown: false,
+            isNPlusOneTarget: false,
+          },
+        ],
+        getMinSentenceWordsForNPlusOne: () => 1,
+      },
+    ),
+  );
+
+  assert.equal(result.tokens?.[0]?.surface, '第二');
+  assert.equal(result.tokens?.[0]?.pos1, '接頭詞|名詞'); // pos1 of the two MeCab parts, joined with '|'
+  assert.equal(result.tokens?.[0]?.pos2, '数接続|数'); // pos2 of the two MeCab parts, joined with '|'
+  assert.equal(result.tokens?.[0]?.frequencyRank, 1820); // the value the getFrequencyRank mock returns for '第二'
+});
+
 test('tokenizeSubtitle clears all annotations for explanatory contrast endings', async () => {
  const result = await tokenizeSubtitle(
    '最近辛いものが続いとるんですけど',
@@ -149,6 +149,24 @@ function shouldAllowContentLedMergedTokenFrequency(
  return true;
 }

+function shouldAllowOrdinalPrefixNounFrequency(token: MergedToken): boolean { // true only for 第-led tokens whose POS chain is prefix -> numeral noun
+  const normalizedSurface = token.surface.trim();
+  const normalizedHeadword = token.headword.trim();
+  if (!normalizedSurface.startsWith('第') && !normalizedHeadword.startsWith('第')) { // neither surface nor headword begins with 第
+    return false;
+  }
+
+  const pos1Parts = splitNormalizedTagParts(normalizePos1Tag(token.pos1));
+  const pos2Parts = splitNormalizedTagParts(normalizePos2Tag(token.pos2));
+  return (
+    pos1Parts.length >= 2 && // needs at least two pos1 parts
+    pos1Parts[0] === '接頭詞' && // first pos1 part is the prefix tag
+    pos1Parts.slice(1).some((part) => part === '名詞') && // some later pos1 part is the noun tag
+    pos2Parts[0] === '数接続' && // first pos2 part is the numeral-connecting tag
+    pos2Parts.slice(1).some((part) => part === '数') // some later pos2 part is the numeral tag
+  );
+}
+
 function isFrequencyExcludedByPos(
  token: MergedToken,
  pos1Exclusions: ReadonlySet<string>,
@@ -168,12 +186,21 @@ function isFrequencyExcludedByPos(
    pos1Exclusions,
    pos2Exclusions,
  );
+  const allowOrdinalPrefixNounToken = shouldAllowOrdinalPrefixNounFrequency(token);

-  if (isExcludedByTagSet(normalizedPos1, pos1Exclusions) && !allowContentLedMergedToken) {
+  if (
+    isExcludedByTagSet(normalizedPos1, pos1Exclusions) &&
+    !allowContentLedMergedToken &&
+    !allowOrdinalPrefixNounToken
+  ) {
    return true;
  }

-  if (isExcludedByTagSet(normalizedPos2, pos2Exclusions) && !allowContentLedMergedToken) {
+  if (
+    isExcludedByTagSet(normalizedPos2, pos2Exclusions) &&
+    !allowContentLedMergedToken &&
+    !allowOrdinalPrefixNounToken
+  ) {
    return true;
  }

@@ -891,6 +891,105 @@ test('requestYomitanScanTokens can use frequency from later exact secondary-matc
  ]);
 });

+test('requestYomitanScanTokens uses exact frequency entry when selected reading differs', async () => {
+  let scannerScript = ''; // filled with the script that mentions termsFind so it can be replayed below
+  const deps = createDeps(async (script) => {
+    if (script.includes('termsFind')) {
+      scannerScript = script;
+      return [];
+    }
+    if (script.includes('optionsGetFull')) {
+      return {
+        profileCurrent: 0,
+        profileIndex: 0,
+        scanLength: 40,
+        dictionaries: ['JPDBv2㋕', 'Jiten', 'CC100'],
+        dictionaryPriorityByName: {
+          'JPDBv2㋕': 0,
+          Jiten: 1,
+          CC100: 2,
+        },
+        dictionaryFrequencyModeByName: {
+          'JPDBv2㋕': 'rank-based',
+          Jiten: 'rank-based',
+          CC100: 'rank-based',
+        },
+        profiles: [
+          {
+            options: {
+              scanning: { length: 40 },
+              dictionaries: [
+                { name: 'JPDBv2㋕', enabled: true, id: 0 },
+                { name: 'Jiten', enabled: true, id: 1 },
+                { name: 'CC100', enabled: true, id: 2 },
+              ],
+            },
+          },
+        ],
+      };
+    }
+    return null;
+  });
+
+  await requestYomitanScanTokens('第二走者', deps, { // result ignored; this call only populates scannerScript
+    error: () => undefined,
+  });
+
+  const result = (await runInjectedYomitanScript(scannerScript, (action, params) => { // replay the captured script against a stub handler
+    if (action !== 'termsFind') {
+      throw new Error(`unexpected action: ${action}`);
+    }
+
+    const text = (params as { text?: string } | undefined)?.text ?? '';
+    if (!text.startsWith('第二')) { // stub only answers lookups whose text starts with 第二
+      return { originalTextLength: 0, dictionaryEntries: [] };
+    }
+
+    return {
+      originalTextLength: 2,
+      dictionaryEntries: [
+        {
+          headwords: [
+            {
+              term: '第二',
+              reading: 'だいに',
+              sources: [{ originalText: '第二', isPrimary: true, matchType: 'exact' }],
+            },
+          ],
+          frequencies: [], // primary entry (reading だいに) carries no frequency data
+        },
+        {
+          headwords: [
+            {
+              term: '第二',
+              reading: '', // non-primary exact entry has an empty reading
+              sources: [{ originalText: '第二', isPrimary: false, matchType: 'exact' }],
+            },
+          ],
+          frequencies: [
+            {
+              headwordIndex: 0,
+              dictionary: 'JPDBv2㋕',
+              frequency: 189513,
+              displayValue: '1820,189513句', // the rank asserted below (1820) matches the leading number here, not `frequency`
+            },
+          ],
+        },
+      ],
+    };
+  })) as Array<Record<string, unknown>>;
+
+  assert.deepEqual(result?.[0], {
+    surface: '第二',
+    reading: 'だいに', // reading comes from the primary entry
+    headword: '第二',
+    startPos: 0,
+    endPos: 2,
+    isNameMatch: false,
+    frequencyRank: 1820, // frequency comes from the empty-reading entry
+  });
+});
+
 test('requestYomitanScanTokens marks tokens backed by SubMiner character dictionary entries', async () => {
  const deps = createDeps(async (script) => {
    if (script.includes('optionsGetFull')) {
@@ -960,6 +960,9 @@ const YOMITAN_SCANNING_HELPERS = String.raw`
        const matchReading = typeof match.headword?.reading === 'string' ? match.headword.reading : '';
        const preferredReading =
          typeof preferredMatch.headword?.reading === 'string' ? preferredMatch.headword.reading : '';
+        if (!matchReading || !preferredReading) {
+          return true;
+        }
        return matchReading === preferredReading;
      }
      function getBestFrequencyRankForMatches(matches, dictionaryPriorityByName, dictionaryFrequencyModeByName) {