fix: preserve ordinal frequency annotations

2026-05-04 00:41:33 -07:00 · 2026-05-03 21:07:46 -07:00
parent 69d5cc7557
commit 00a94d6bd1
6 changed files with 258 additions and 2 deletions
@@ -0,0 +1,60 @@
 ---
 id: TASK-332
 title: Fix subtitle frequency annotation missing ranks shown in Yomitan popup
 status: Done
 assignee:
  - Codex
 created_date: '2026-05-04 03:29'
 updated_date: '2026-05-04 03:41'
 labels:
  - bug
  - tokenizer
 dependencies: []
 priority: medium
 ---
 ## Description
 <!-- SECTION:DESCRIPTION:BEGIN -->
 Subtitle frequency highlighting can miss a token even when the Yomitan popup shows a rank within the configured threshold. Reproduced with `第二走者とアンカーは\n中継地点に速やかに移動！`: the Yomitan popup shows `第二` with JPDB rank 1820, but the SubMiner tokenizer output has no `frequencyRank` for `第二`, so the renderer cannot annotate it.
 <!-- SECTION:DESCRIPTION:END -->
 ## Acceptance Criteria
 <!-- AC:BEGIN -->
 - [x] #1 `第二` in `第二走者とアンカーは\n中継地点に速やかに移動！` receives the Yomitan rank shown by the popup when frequency highlighting is enabled.
 - [x] #2 Regression test covers the Yomitan scan/frequency ingestion path for exact popup-derived ranks.
 - [x] #3 Existing tokenizer frequency tests continue to pass.
 <!-- AC:END -->
 ## Implementation Plan
 <!-- SECTION:PLAN:BEGIN -->
 1. Reproduce and inspect the missing `第二` rank path with tokenizer probes and focused tests.
 2. Preserve exact Yomitan scan frequency ranks when the matching frequency entry omits reading metadata but has the same exact term.
 3. Allow ranked ordinal prefix-noun tokens (`第` + numeric noun, e.g. `第二`) through annotation POS filtering while keeping standalone prefixes excluded.
 4. Verify with focused tokenizer/runtime/annotation tests, typecheck, changelog lint, and a live-style Yomitan profile probe.
 <!-- SECTION:PLAN:END -->
 ## Implementation Notes
 <!-- SECTION:NOTES:BEGIN -->
 Root-cause probe against a temp copy of the Yomitan profile: the tokenizer returns no `frequencyRank` for `第二`; the renderer config `topX` is 10000, so the render threshold is not the blocker.
 User approved implementation plan on 2026-05-04.
 Verification: `bun test src/core/services/tokenizer.test.ts src/core/services/tokenizer/yomitan-parser-runtime.test.ts src/core/services/tokenizer/annotation-stage.test.ts` passed (192 tests).
 Verification: `bun run typecheck` passed.
 Verification: `bun run changelog:lint` passed.
 Verification: `bun run get-frequency:electron -- --yomitan-user-data /tmp/subminer-yomitan-probe-909423 "第二走者とアンカーは\\n中継地点に速やかに移動！"` produced `第二` with `frequencyRank: 1820`.
 Finalization check: implementation plan updated to reflect the discovered POS-filter root cause and completed solution.
 <!-- SECTION:NOTES:END -->
 ## Final Summary
 <!-- SECTION:FINAL_SUMMARY:BEGIN -->
 Fixed subtitle frequency annotation for `第二` by allowing ranked ordinal prefix-noun compounds through annotation POS filtering. Also made scan rank matching tolerate exact frequency entries where one side omits reading metadata. Verified with tokenizer/runtime/annotation tests, typecheck, changelog lint, and a live-style Yomitan profile probe showing `第二` now receives frequencyRank 1820.
 <!-- SECTION:FINAL_SUMMARY:END -->
@@ -0,0 +1,4 @@
 type: fixed
 area: overlay
 - Overlay: Fixed frequency highlighting for ordinal prefix-noun tokens like `第二` so popup ranks such as JPDB 1820 are preserved in subtitle annotations.
@@ -4077,6 +4077,69 @@ test('tokenizeSubtitle keeps frequency for content-led merged token with trailin
  assert.equal(result.tokens?.[0]?.frequencyRank, 5468);
 });
 // Regression test: the Yomitan token `第二` lines up with two MeCab tokens
 // (`第` tagged 接頭詞/数接続 and `二` tagged 名詞/数). The assertions below check
 // that the resulting token carries the joined tags `接頭詞|名詞` / `数接続|数`
 // and still keeps the frequency rank returned for `第二`.
 test('tokenizeSubtitle keeps frequency for ordinal prefix-noun tokens', async () => {
  const result = await tokenizeSubtitle(
    '第二走者',
    makeDepsFromYomitanTokens(
      [
        { surface: '第二', reading: 'だいに', headword: '第二' },
        { surface: '走者', reading: 'そうしゃ', headword: '走者' },
      ],
      {
        getFrequencyDictionaryEnabled: () => true,
        // Only the two Yomitan-level terms have a rank in this stub; anything else is unranked.
        getFrequencyRank: (text) => (text === '第二' ? 1820 : text === '走者' ? 41555 : null),
        // MeCab output is finer-grained than the Yomitan tokens: `第` and `二` are separate entries.
        tokenizeWithMecab: async () => [
          {
            headword: '第',
            surface: '第',
            reading: 'ダイ',
            startPos: 0,
            endPos: 1,
            partOfSpeech: PartOfSpeech.other,
            pos1: '接頭詞',
            pos2: '数接続',
            isMerged: false,
            isKnown: false,
            isNPlusOneTarget: false,
          },
          {
            headword: '二',
            surface: '二',
            reading: 'ニ',
            startPos: 1,
            endPos: 2,
            partOfSpeech: PartOfSpeech.noun,
            pos1: '名詞',
            pos2: '数',
            isMerged: false,
            isKnown: false,
            isNPlusOneTarget: false,
          },
          {
            headword: '走者',
            surface: '走者',
            reading: 'ソウシャ',
            startPos: 2,
            endPos: 4,
            partOfSpeech: PartOfSpeech.noun,
            pos1: '名詞',
            pos2: '一般',
            isMerged: false,
            isKnown: false,
            isNPlusOneTarget: false,
          },
        ],
        getMinSentenceWordsForNPlusOne: () => 1,
      },
    ),
  );
  // The first token must be the ordinal compound, with both MeCab tags joined by `|`.
  assert.equal(result.tokens?.[0]?.surface, '第二');
  assert.equal(result.tokens?.[0]?.pos1, '接頭詞|名詞');
  assert.equal(result.tokens?.[0]?.pos2, '数接続|数');
  // The rank must not be dropped even though the leading tag is a prefix (接頭詞).
  assert.equal(result.tokens?.[0]?.frequencyRank, 1820);
 });
 test('tokenizeSubtitle clears all annotations for explanatory contrast endings', async () => {
  const result = await tokenizeSubtitle(
    '最近辛いものが続いとるんですけど',
@@ -149,6 +149,24 @@ function shouldAllowContentLedMergedTokenFrequency(
  return true;
 }
 /**
  * Decides whether a token looks like an ordinal prefix-noun compound such as
  * `第二` and should therefore be exempt from POS-based frequency exclusion.
  *
  * A token qualifies only when all of the following hold:
  * - its trimmed surface or trimmed headword starts with `第`;
  * - its normalized pos1 tag has at least two parts, the first being `接頭詞`
  *   and some later part being `名詞`;
  * - its normalized pos2 tag starts with `数接続` and has `数` as a later part.
  *
  * A single-part pos1 tag never qualifies, because at least two parts are required.
  *
  * @param token Token whose surface, headword, pos1 and pos2 are inspected.
  * @returns `true` when the token matches the ordinal prefix-noun shape.
  */
 function shouldAllowOrdinalPrefixNounFrequency(token: MergedToken): boolean {
  const surface = token.surface.trim();
  const headword = token.headword.trim();
  const hasOrdinalMarker = [surface, headword].some((text) => text.startsWith('第'));
  if (!hasOrdinalMarker) {
    return false;
  }
  const pos1Tags = splitNormalizedTagParts(normalizePos1Tag(token.pos1));
  const pos2Tags = splitNormalizedTagParts(normalizePos2Tag(token.pos2));
  // The leading parts must describe a numeric-connecting prefix.
  if (pos1Tags.length < 2 || pos1Tags[0] !== '接頭詞' || pos2Tags[0] !== '数接続') {
    return false;
  }
  // Search from index 1 so the leading prefix part itself is never counted as the noun/number part.
  return pos1Tags.includes('名詞', 1) && pos2Tags.includes('数', 1);
 }
 function isFrequencyExcludedByPos(
  token: MergedToken,
  pos1Exclusions: ReadonlySet<string>,
@@ -168,12 +186,21 @@ function isFrequencyExcludedByPos(
    pos1Exclusions,
    pos2Exclusions,
  );
  const allowOrdinalPrefixNounToken = shouldAllowOrdinalPrefixNounFrequency(token);
-  if (isExcludedByTagSet(normalizedPos1, pos1Exclusions) && !allowContentLedMergedToken) {
+  if (
    isExcludedByTagSet(normalizedPos1, pos1Exclusions) &&
    !allowContentLedMergedToken &&
    !allowOrdinalPrefixNounToken
  ) {
    return true;
  }
-  if (isExcludedByTagSet(normalizedPos2, pos2Exclusions) && !allowContentLedMergedToken) {
+  if (
    isExcludedByTagSet(normalizedPos2, pos2Exclusions) &&
    !allowContentLedMergedToken &&
    !allowOrdinalPrefixNounToken
  ) {
    return true;
  }
@@ -891,6 +891,105 @@ test('requestYomitanScanTokens can use frequency from later exact secondary-matc
  ]);
 });
 // Regression test for scan-time frequency matching. It runs in two phases:
 //   1. call requestYomitanScanTokens with stubbed deps, only to capture the
 //      injected scanner script (the script that contains `termsFind`);
 //   2. replay that script against a fake `termsFind` backend and inspect the
 //      token it produces.
 // In this fixture the primary entry for `第二` has reading `だいに` and no
 // frequencies, while the secondary exact entry has an empty reading and carries
 // the JPDB frequency. The "differing" reading here is therefore an omitted one.
 test('requestYomitanScanTokens uses exact frequency entry when selected reading differs', async () => {
  let scannerScript = '';
  const deps = createDeps(async (script) => {
    if (script.includes('termsFind')) {
      // Capture the scanner script for replay below; its result in this phase is irrelevant.
      scannerScript = script;
      return [];
    }
    if (script.includes('optionsGetFull')) {
      return {
        profileCurrent: 0,
        profileIndex: 0,
        scanLength: 40,
        dictionaries: ['JPDBv2㋕', 'Jiten', 'CC100'],
        dictionaryPriorityByName: {
          'JPDBv2㋕': 0,
          Jiten: 1,
          CC100: 2,
        },
        dictionaryFrequencyModeByName: {
          'JPDBv2㋕': 'rank-based',
          Jiten: 'rank-based',
          CC100: 'rank-based',
        },
        profiles: [
          {
            options: {
              scanning: { length: 40 },
              dictionaries: [
                { name: 'JPDBv2㋕', enabled: true, id: 0 },
                { name: 'Jiten', enabled: true, id: 1 },
                { name: 'CC100', enabled: true, id: 2 },
              ],
            },
          },
        ],
      };
    }
    return null;
  });
  await requestYomitanScanTokens('第二走者', deps, {
    error: () => undefined,
  });
  const result = (await runInjectedYomitanScript(scannerScript, (action, params) => {
    if (action !== 'termsFind') {
      throw new Error(`unexpected action: ${action}`);
    }
    const text = (params as { text?: string } | undefined)?.text ?? '';
    // Only lookups that begin with `第二` return entries; every other lookup finds nothing.
    if (!text.startsWith('第二')) {
      return { originalTextLength: 0, dictionaryEntries: [] };
    }
    return {
      originalTextLength: 2,
      dictionaryEntries: [
        // Primary entry: has the reading, but no frequency data.
        {
          headwords: [
            {
              term: '第二',
              reading: 'だいに',
              sources: [{ originalText: '第二', isPrimary: true, matchType: 'exact' }],
            },
          ],
          frequencies: [],
        },
        // Secondary exact entry: same term, empty reading, carries the JPDB frequency.
        {
          headwords: [
            {
              term: '第二',
              reading: '',
              sources: [{ originalText: '第二', isPrimary: false, matchType: 'exact' }],
            },
          ],
          frequencies: [
            {
              headwordIndex: 0,
              dictionary: 'JPDBv2㋕',
              frequency: 189513,
              displayValue: '1820,189513句',
            },
          ],
        },
      ],
    };
  })) as Array<Record<string, unknown>>;
  // The token keeps the primary entry's reading; the expected rank is 1820, not the raw `frequency` of 189513.
  assert.deepEqual(result?.[0], {
    surface: '第二',
    reading: 'だいに',
    headword: '第二',
    startPos: 0,
    endPos: 2,
    isNameMatch: false,
    frequencyRank: 1820,
  });
 });
 test('requestYomitanScanTokens marks tokens backed by SubMiner character dictionary entries', async () => {
  const deps = createDeps(async (script) => {
    if (script.includes('optionsGetFull')) {
@@ -960,6 +960,9 @@ const YOMITAN_SCANNING_HELPERS = String.raw`
        const matchReading = typeof match.headword?.reading === 'string' ? match.headword.reading : '';
        const preferredReading =
          typeof preferredMatch.headword?.reading === 'string' ? preferredMatch.headword.reading : '';
        if (!matchReading || !preferredReading) {
          return true;
        }
        return matchReading === preferredReading;
      }
      function getBestFrequencyRankForMatches(matches, dictionaryPriorityByName, dictionaryFrequencyModeByName) {