fix: preserve ordinal frequency annotations

2026-05-04 00:41:33 -07:00 · 2026-05-03 21:07:46 -07:00
parent 69d5cc7557
commit 00a94d6bd1
6 changed files with 258 additions and 2 deletions
@@ -0,0 +1,60 @@
+---
+id: TASK-332
+title: Fix subtitle frequency annotation missing ranks shown in Yomitan popup
+status: Done
+assignee:
+  - Codex
+created_date: '2026-05-04 03:29'
+updated_date: '2026-05-04 03:41'
+labels:
+  - bug
+  - tokenizer
+dependencies: []
+priority: medium
+---
+
+## Description
+
+<!-- SECTION:DESCRIPTION:BEGIN -->
+Subtitle frequency highlighting can miss a token even when the Yomitan popup shows a rank within the configured threshold. Reproduced with `第二走者とアンカーは\n中継地点に速やかに移動！`: the Yomitan popup shows `第二` with JPDB rank 1820, but the SubMiner tokenizer output has no `frequencyRank` for `第二`, so the renderer cannot annotate it.
+<!-- SECTION:DESCRIPTION:END -->
+
+## Acceptance Criteria
+<!-- AC:BEGIN -->
+- [x] #1 `第二` in `第二走者とアンカーは\n中継地点に速やかに移動！` receives the Yomitan rank shown by the popup when frequency highlighting is enabled.
+- [x] #2 Regression test covers the Yomitan scan/frequency ingestion path for exact popup-derived ranks.
+- [x] #3 Existing tokenizer frequency tests continue to pass.
+<!-- AC:END -->
+
+## Implementation Plan
+
+<!-- SECTION:PLAN:BEGIN -->
+1. Reproduce and inspect the missing `第二` rank path with tokenizer probes and focused tests.
+2. Preserve exact Yomitan scan frequency ranks when the matching frequency entry has the same exact term but either it or the selected entry omits reading metadata.
+3. Allow ranked ordinal prefix-noun tokens (`第` + numeric noun, e.g. `第二`) through annotation POS filtering while keeping standalone prefixes excluded.
+4. Verify with focused tokenizer/runtime/annotation tests, typecheck, changelog lint, and a live-style Yomitan profile probe.
+<!-- SECTION:PLAN:END -->
+
+## Implementation Notes
+
+<!-- SECTION:NOTES:BEGIN -->
+Root-cause probe against a temporary copy of the Yomitan profile: the tokenizer returns no `frequencyRank` for `第二`; the renderer config `topX` is 10000, so the render threshold is not the blocker.
+
+User approved implementation plan on 2026-05-04.
+
+Verification: `bun test src/core/services/tokenizer.test.ts src/core/services/tokenizer/yomitan-parser-runtime.test.ts src/core/services/tokenizer/annotation-stage.test.ts` passed (192 tests).
+
+Verification: `bun run typecheck` passed.
+
+Verification: `bun run changelog:lint` passed.
+
+Verification: `bun run get-frequency:electron -- --yomitan-user-data /tmp/subminer-yomitan-probe-909423 "第二走者とアンカーは\\n中継地点に速やかに移動！"` produced `第二` with `frequencyRank: 1820`.
+
+Finalization check: implementation plan updated to reflect the discovered POS-filter root cause and completed solution.
+<!-- SECTION:NOTES:END -->
+
+## Final Summary
+
+<!-- SECTION:FINAL_SUMMARY:BEGIN -->
+Fixed subtitle frequency annotation for `第二` by allowing ranked ordinal prefix-noun compounds through annotation POS filtering. Also made scan rank matching tolerate exact frequency entries where one side omits reading metadata. Verified with tokenizer/runtime/annotation tests, typecheck, changelog lint, and a live-style Yomitan profile probe showing that `第二` now receives `frequencyRank` 1820.
+<!-- SECTION:FINAL_SUMMARY:END -->
@@ -0,0 +1,4 @@
+type: fixed
+area: overlay
+
+- Overlay: Fixed frequency highlighting for ordinal prefix-noun tokens like `第二` so popup ranks such as JPDB 1820 are preserved in subtitle annotations.
@@ -4077,6 +4077,69 @@ test('tokenizeSubtitle keeps frequency for content-led merged token with trailin
  assert.equal(result.tokens?.[0]?.frequencyRank, 5468);
 });

+test('tokenizeSubtitle keeps frequency for ordinal prefix-noun tokens', async () => {
+  const result = await tokenizeSubtitle(
+    '第二走者',
+    makeDepsFromYomitanTokens(
+      [
+        { surface: '第二', reading: 'だいに', headword: '第二' },
+        { surface: '走者', reading: 'そうしゃ', headword: '走者' },
+      ],
+      {
+        getFrequencyDictionaryEnabled: () => true,
+        getFrequencyRank: (text) => (text === '第二' ? 1820 : text === '走者' ? 41555 : null),
+        tokenizeWithMecab: async () => [
+          {
+            headword: '第',
+            surface: '第',
+            reading: 'ダイ',
+            startPos: 0,
+            endPos: 1,
+            partOfSpeech: PartOfSpeech.other,
+            pos1: '接頭詞',
+            pos2: '数接続',
+            isMerged: false,
+            isKnown: false,
+            isNPlusOneTarget: false,
+          },
+          {
+            headword: '二',
+            surface: '二',
+            reading: 'ニ',
+            startPos: 1,
+            endPos: 2,
+            partOfSpeech: PartOfSpeech.noun,
+            pos1: '名詞',
+            pos2: '数',
+            isMerged: false,
+            isKnown: false,
+            isNPlusOneTarget: false,
+          },
+          {
+            headword: '走者',
+            surface: '走者',
+            reading: 'ソウシャ',
+            startPos: 2,
+            endPos: 4,
+            partOfSpeech: PartOfSpeech.noun,
+            pos1: '名詞',
+            pos2: '一般',
+            isMerged: false,
+            isKnown: false,
+            isNPlusOneTarget: false,
+          },
+        ],
+        getMinSentenceWordsForNPlusOne: () => 1,
+      },
+    ),
+  );
+
+  assert.equal(result.tokens?.[0]?.surface, '第二');
+  assert.equal(result.tokens?.[0]?.pos1, '接頭詞|名詞');
+  assert.equal(result.tokens?.[0]?.pos2, '数接続|数');
+  assert.equal(result.tokens?.[0]?.frequencyRank, 1820);
+});
+
 test('tokenizeSubtitle clears all annotations for explanatory contrast endings', async () => {
  const result = await tokenizeSubtitle(
    '最近辛いものが続いとるんですけど',
@@ -149,6 +149,24 @@ function shouldAllowContentLedMergedTokenFrequency(
  return true;
 }

+function shouldAllowOrdinalPrefixNounFrequency(token: MergedToken): boolean {
+  const normalizedSurface = token.surface.trim();
+  const normalizedHeadword = token.headword.trim();
+  if (!normalizedSurface.startsWith('第') && !normalizedHeadword.startsWith('第')) {
+    return false;
+  }
+
+  const pos1Parts = splitNormalizedTagParts(normalizePos1Tag(token.pos1));
+  const pos2Parts = splitNormalizedTagParts(normalizePos2Tag(token.pos2));
+  return (
+    pos1Parts.length >= 2 &&
+    pos1Parts[0] === '接頭詞' &&
+    pos1Parts.slice(1).some((part) => part === '名詞') &&
+    pos2Parts[0] === '数接続' &&
+    pos2Parts.slice(1).some((part) => part === '数')
+  );
+}
+
 function isFrequencyExcludedByPos(
  token: MergedToken,
  pos1Exclusions: ReadonlySet<string>,
@@ -168,12 +186,21 @@ function isFrequencyExcludedByPos(
    pos1Exclusions,
    pos2Exclusions,
  );
+  const allowOrdinalPrefixNounToken = shouldAllowOrdinalPrefixNounFrequency(token);

-  if (isExcludedByTagSet(normalizedPos1, pos1Exclusions) && !allowContentLedMergedToken) {
+  if (
+    isExcludedByTagSet(normalizedPos1, pos1Exclusions) &&
+    !allowContentLedMergedToken &&
+    !allowOrdinalPrefixNounToken
+  ) {
    return true;
  }

-  if (isExcludedByTagSet(normalizedPos2, pos2Exclusions) && !allowContentLedMergedToken) {
+  if (
+    isExcludedByTagSet(normalizedPos2, pos2Exclusions) &&
+    !allowContentLedMergedToken &&
+    !allowOrdinalPrefixNounToken
+  ) {
    return true;
  }

@@ -891,6 +891,105 @@ test('requestYomitanScanTokens can use frequency from later exact secondary-matc
  ]);
 });

+test('requestYomitanScanTokens uses exact frequency entry when selected reading differs', async () => {
+  let scannerScript = '';
+  const deps = createDeps(async (script) => {
+    if (script.includes('termsFind')) {
+      scannerScript = script;
+      return [];
+    }
+    if (script.includes('optionsGetFull')) {
+      return {
+        profileCurrent: 0,
+        profileIndex: 0,
+        scanLength: 40,
+        dictionaries: ['JPDBv2㋕', 'Jiten', 'CC100'],
+        dictionaryPriorityByName: {
+          'JPDBv2㋕': 0,
+          Jiten: 1,
+          CC100: 2,
+        },
+        dictionaryFrequencyModeByName: {
+          'JPDBv2㋕': 'rank-based',
+          Jiten: 'rank-based',
+          CC100: 'rank-based',
+        },
+        profiles: [
+          {
+            options: {
+              scanning: { length: 40 },
+              dictionaries: [
+                { name: 'JPDBv2㋕', enabled: true, id: 0 },
+                { name: 'Jiten', enabled: true, id: 1 },
+                { name: 'CC100', enabled: true, id: 2 },
+              ],
+            },
+          },
+        ],
+      };
+    }
+    return null;
+  });
+
+  await requestYomitanScanTokens('第二走者', deps, {
+    error: () => undefined,
+  });
+
+  const result = (await runInjectedYomitanScript(scannerScript, (action, params) => {
+    if (action !== 'termsFind') {
+      throw new Error(`unexpected action: ${action}`);
+    }
+
+    const text = (params as { text?: string } | undefined)?.text ?? '';
+    if (!text.startsWith('第二')) {
+      return { originalTextLength: 0, dictionaryEntries: [] };
+    }
+
+    return {
+      originalTextLength: 2,
+      dictionaryEntries: [
+        {
+          headwords: [
+            {
+              term: '第二',
+              reading: 'だいに',
+              sources: [{ originalText: '第二', isPrimary: true, matchType: 'exact' }],
+            },
+          ],
+          frequencies: [],
+        },
+        {
+          headwords: [
+            {
+              term: '第二',
+              reading: '',
+              sources: [{ originalText: '第二', isPrimary: false, matchType: 'exact' }],
+            },
+          ],
+          frequencies: [
+            {
+              headwordIndex: 0,
+              dictionary: 'JPDBv2㋕',
+              frequency: 189513,
+              displayValue: '1820,189513句',
+            },
+          ],
+        },
+      ],
+    };
+  })) as Array<Record<string, unknown>>;
+
+  assert.deepEqual(result?.[0], {
+    surface: '第二',
+    reading: 'だいに',
+    headword: '第二',
+    startPos: 0,
+    endPos: 2,
+    isNameMatch: false,
+    frequencyRank: 1820,
+  });
+});
+
 test('requestYomitanScanTokens marks tokens backed by SubMiner character dictionary entries', async () => {
  const deps = createDeps(async (script) => {
    if (script.includes('optionsGetFull')) {
@@ -960,6 +960,9 @@ const YOMITAN_SCANNING_HELPERS = String.raw`
        const matchReading = typeof match.headword?.reading === 'string' ? match.headword.reading : '';
        const preferredReading =
          typeof preferredMatch.headword?.reading === 'string' ? preferredMatch.headword.reading : '';
+        if (!matchReading || !preferredReading) {
+          return true;
+        }
        return matchReading === preferredReading;
      }
      function getBestFrequencyRankForMatches(matches, dictionaryPriorityByName, dictionaryFrequencyModeByName) {