From 1d76e05cd38cec1b7f8557a9f73494ad759a3f60 Mon Sep 17 00:00:00 2001
From: sudacode <suda@sudacode.com>
Date: Sat, 7 Mar 2026 01:28:37 -0800
Subject: [PATCH] fix(subtitle): tighten frequency token filtering

---
 ...mitan-scan-token-fallback-fragmentation.md |  42 +++++
 ...kana-tokens-from-frequency-highlighting.md |  43 +++++
 src/core/services/tokenizer.test.ts           |  27 ++-
 .../tokenizer/annotation-stage.test.ts        |  52 +++++-
 .../services/tokenizer/annotation-stage.ts    |  18 ++
 .../tokenizer/yomitan-parser-runtime.test.ts  | 169 ++++++++++++++++++
 .../tokenizer/yomitan-parser-runtime.ts       |   9 +-
 7 files changed, 343 insertions(+), 17 deletions(-)
 create mode 100644 backlog/tasks/task-107 - Fix-Yomitan-scan-token-fallback-fragmentation.md
 create mode 100644 backlog/tasks/task-108 - Exclude-single-kana-tokens-from-frequency-highlighting.md

diff --git a/backlog/tasks/task-107 - Fix-Yomitan-scan-token-fallback-fragmentation.md b/backlog/tasks/task-107 - Fix-Yomitan-scan-token-fallback-fragmentation.md
new file mode 100644
index 0000000..749b3b0
--- /dev/null
+++ b/backlog/tasks/task-107 - Fix-Yomitan-scan-token-fallback-fragmentation.md	
@@ -0,0 +1,42 @@
+---
+id: TASK-107
+title: 'Fix Yomitan scan-token fallback fragmentation on exact-source misses'
+status: Done
+assignee: []
+created_date: '2026-03-07 01:10'
+updated_date: '2026-03-07 01:12'
+labels: []
+dependencies: []
+priority: high
+ordinal: 9007
+---
+
+## Description
+
+<!-- SECTION:DESCRIPTION:BEGIN -->
+
+Left-to-right Yomitan scanning can emit bogus fallback tokens when `termsFind` returns entries but none of their headwords carries an exact primary source for the consumed substring. Repro: `だが それでも届かぬ高みがあった` currently yields trailing fragments like `があ` / `た`, which blocks the real `あった` token from receiving frequency highlighting.
+
+<!-- SECTION:DESCRIPTION:END -->
+
+## Acceptance Criteria
+
+<!-- AC:BEGIN -->
+
+- [x] #1 Scanner skips `termsFind` fallback entries that are not backed by an exact primary source for the consumed substring.
+- [x] #2 Repro line no longer yields bogus trailing fragments such as `があ`.
+- [x] #3 Regression coverage added for the scan-token path.
+
+<!-- AC:END -->
+
+## Final Summary
+
+<!-- SECTION:FINAL_SUMMARY:BEGIN -->
+
+Removed the scan-token helper fallback that previously emitted a token from the first returned headword even when Yomitan did not report an exact primary source for the consumed substring. Added a focused regression test covering `だが それでも届かぬ高みがあった`, ensuring bogus `があ` fragmentation is skipped so the later `あった` exact match can still be tokenized and highlighted.
+
+Verification:
+
+- `bun test src/core/services/tokenizer/yomitan-parser-runtime.test.ts src/core/services/tokenizer.test.ts --timeout 20000`
+
+<!-- SECTION:FINAL_SUMMARY:END -->
diff --git a/backlog/tasks/task-108 - Exclude-single-kana-tokens-from-frequency-highlighting.md b/backlog/tasks/task-108 - Exclude-single-kana-tokens-from-frequency-highlighting.md
new file mode 100644
index 0000000..0706582
--- /dev/null
+++ b/backlog/tasks/task-108 - Exclude-single-kana-tokens-from-frequency-highlighting.md	
@@ -0,0 +1,43 @@
+---
+id: TASK-108
+title: 'Exclude single kana tokens from frequency highlighting'
+status: Done
+assignee: []
+created_date: '2026-03-07 01:18'
+updated_date: '2026-03-07 01:22'
+labels: []
+dependencies: []
+priority: medium
+ordinal: 9008
+---
+
+## Description
+
+<!-- SECTION:DESCRIPTION:BEGIN -->
+
+Suppress frequency highlighting for single-character hiragana or katakana tokens. Scope is frequency-only: known/N+1/JLPT behavior stays unchanged.
+
+<!-- SECTION:DESCRIPTION:END -->
+
+## Acceptance Criteria
+
+<!-- AC:BEGIN -->
+
+- [x] #1 Single-character hiragana tokens do not retain `frequencyRank`.
+- [x] #2 Single-character katakana tokens do not retain `frequencyRank`.
+- [x] #3 Regression coverage exists at annotation-stage and tokenizer levels.
+
+<!-- AC:END -->
+
+## Final Summary
+
+<!-- SECTION:FINAL_SUMMARY:BEGIN -->
+
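+Added a frequency-only suppression rule for single-character kana tokens based on token `surface`, so bogus merged fragments like `た` and standalone one-character kana no longer keep `frequencyRank`. The check runs before the POS-based exclusion checks, so it also applies when a token's `pos1` has been removed from the effective exclusion set (the existing "previously default-excluded pos1" test now uses `まで` instead of `は` for that reason). Regression coverage now exists both in the annotation stage and in the tokenizer path, while multi-character tokens and known/N+1/JLPT behavior remain unchanged.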
+
+Verification:
+
+- `bun test src/core/services/tokenizer/annotation-stage.test.ts --timeout 20000`
+- `bun test src/core/services/tokenizer.test.ts --timeout 20000`
+
+<!-- SECTION:FINAL_SUMMARY:END -->
diff --git a/src/core/services/tokenizer.test.ts b/src/core/services/tokenizer.test.ts
index 864b9a6..d0d295e 100644
--- a/src/core/services/tokenizer.test.ts
+++ b/src/core/services/tokenizer.test.ts
@@ -1861,9 +1861,9 @@ test('tokenizeSubtitle keeps parsing explicit by scanning-parser source only', a
   assert.equal(result.tokens?.[4]?.frequencyRank, 1500);
 });
 
-test('tokenizeSubtitle still assigns frequency to non-known Yomitan tokens', async () => {
+test('tokenizeSubtitle still assigns frequency to non-known multi-character Yomitan tokens', async () => {
   const result = await tokenizeSubtitle(
-    '小園に',
+    '小園友達',
     makeDeps({
       getYomitanExt: () => ({ id: 'dummy-ext' }) as any,
       getYomitanParserWindow: () =>
@@ -1884,9 +1884,9 @@ test('tokenizeSubtitle still assigns frequency to non-known Yomitan tokens', asy
                   ],
                   [
                     {
-                      text: 'に',
-                      reading: 'に',
-                      headwords: [[{ term: 'に' }]],
+                      text: '友達',
+                      reading: 'ともだち',
+                      headwords: [[{ term: '友達' }]],
                     },
                   ],
                 ],
@@ -1895,7 +1895,7 @@ test('tokenizeSubtitle still assigns frequency to non-known Yomitan tokens', asy
           },
         }) as unknown as Electron.BrowserWindow,
       getFrequencyDictionaryEnabled: () => true,
-      getFrequencyRank: (text) => (text === '小園' ? 75 : text === 'に' ? 3000 : null),
+      getFrequencyRank: (text) => (text === '小園' ? 75 : text === '友達' ? 3000 : null),
       isKnownWord: (text) => text === '小園',
     }),
   );
@@ -2635,6 +2635,21 @@ test('tokenizeSubtitle excludes default non-independent pos2 from N+1 and freque
   assert.equal(result.tokens?.[0]?.isNPlusOneTarget, false);
 });
 
+test('tokenizeSubtitle excludes single-kana merged tokens from frequency highlighting', async () => {
+  const result = await tokenizeSubtitle(
+    'た',
+    makeDepsFromYomitanTokens([{ surface: 'た', reading: 'た', headword: 'た' }], {
+      getFrequencyDictionaryEnabled: () => true,
+      getFrequencyRank: (text) => (text === 'た' ? 17 : null),
+      getMinSentenceWordsForNPlusOne: () => 1,
+      tokenizeWithMecab: async () => null,
+    }),
+  );
+
+  assert.equal(result.tokens?.length, 1);
+  assert.equal(result.tokens?.[0]?.frequencyRank, undefined);
+});
+
 test('tokenizeSubtitle excludes merged function/content token from frequency highlighting but keeps N+1', async () => {
   const result = await tokenizeSubtitle(
     'になれば',
diff --git a/src/core/services/tokenizer/annotation-stage.test.ts b/src/core/services/tokenizer/annotation-stage.test.ts
index 1123a5d..c6f4cfd 100644
--- a/src/core/services/tokenizer/annotation-stage.test.ts
+++ b/src/core/services/tokenizer/annotation-stage.test.ts
@@ -252,12 +252,12 @@ test('annotateTokens applies configured pos1 exclusions to both frequency and N+
 test('annotateTokens allows previously default-excluded pos1 when removed from effective set', () => {
   const tokens = [
     makeToken({
-      surface: 'は',
-      headword: 'は',
+      surface: 'まで',
+      headword: 'まで',
       partOfSpeech: PartOfSpeech.other,
       pos1: '助詞',
       startPos: 0,
-      endPos: 1,
+      endPos: 2,
       frequencyRank: 8,
     }),
   ];
@@ -314,6 +314,52 @@ test('annotateTokens excludes likely kana SFX tokens from frequency when POS tag
   assert.equal(result[0]?.frequencyRank, undefined);
 });
 
+test('annotateTokens excludes single hiragana and katakana tokens from frequency when POS tags are missing', () => {
+  const tokens = [
+    makeToken({
+      surface: 'た',
+      reading: 'た',
+      headword: 'た',
+      pos1: '',
+      pos2: '',
+      partOfSpeech: PartOfSpeech.other,
+      frequencyRank: 21,
+      startPos: 0,
+      endPos: 1,
+    }),
+    makeToken({
+      surface: 'ア',
+      reading: 'ア',
+      headword: 'ア',
+      pos1: '',
+      pos2: '',
+      partOfSpeech: PartOfSpeech.other,
+      frequencyRank: 22,
+      startPos: 1,
+      endPos: 2,
+    }),
+    makeToken({
+      surface: '山',
+      reading: 'やま',
+      headword: '山',
+      pos1: '',
+      pos2: '',
+      partOfSpeech: PartOfSpeech.other,
+      frequencyRank: 23,
+      startPos: 2,
+      endPos: 3,
+    }),
+  ];
+
+  const result = annotateTokens(tokens, makeDeps(), {
+    minSentenceWordsForNPlusOne: 1,
+  });
+
+  assert.equal(result[0]?.frequencyRank, undefined);
+  assert.equal(result[1]?.frequencyRank, undefined);
+  assert.equal(result[2]?.frequencyRank, 23);
+});
+
 test('annotateTokens keeps frequency when mecab tags classify token as content-bearing', () => {
   const tokens = [
     makeToken({
diff --git a/src/core/services/tokenizer/annotation-stage.ts b/src/core/services/tokenizer/annotation-stage.ts
index b62264a..c263757 100644
--- a/src/core/services/tokenizer/annotation-stage.ts
+++ b/src/core/services/tokenizer/annotation-stage.ts
@@ -103,6 +103,10 @@ function isFrequencyExcludedByPos(
   pos1Exclusions: ReadonlySet<string>,
   pos2Exclusions: ReadonlySet<string>,
 ): boolean {
+  if (isSingleKanaFrequencyNoiseToken(token.surface)) {
+    return true;
+  }
+
   const normalizedPos1 = normalizePos1Tag(token.pos1);
   const hasPos1 = normalizedPos1.length > 0;
   if (isExcludedByTagSet(normalizedPos1, pos1Exclusions)) {
@@ -363,6 +367,20 @@ function isLikelyFrequencyNoiseToken(token: MergedToken): boolean {
   return false;
 }
 
+function isSingleKanaFrequencyNoiseToken(text: string | undefined): boolean {
+  if (typeof text !== 'string') {
+    return false;
+  }
+
+  const normalized = text.trim();
+  if (!normalized) {
+    return false;
+  }
+
+  const chars = [...normalized];
+  return chars.length === 1 && isKanaChar(chars[0]!);
+}
+
 function isJlptEligibleToken(token: MergedToken): boolean {
   if (token.pos1 && shouldIgnoreJlptForMecabPos1(token.pos1)) {
     return false;
diff --git a/src/core/services/tokenizer/yomitan-parser-runtime.test.ts b/src/core/services/tokenizer/yomitan-parser-runtime.test.ts
index b332313..d6aa519 100644
--- a/src/core/services/tokenizer/yomitan-parser-runtime.test.ts
+++ b/src/core/services/tokenizer/yomitan-parser-runtime.test.ts
@@ -643,6 +643,175 @@ test('requestYomitanScanTokens marks grouped entries when SubMiner dictionary al
   assert.equal((result as Array<{ isNameMatch?: boolean }>)[0]?.isNameMatch, true);
 });
 
+test('requestYomitanScanTokens skips fallback fragments without exact primary source matches', async () => {
+  const deps = createDeps(async (script) => {
+    if (script.includes('optionsGetFull')) {
+      return {
+        profileCurrent: 0,
+        profiles: [
+          {
+            options: {
+              scanning: { length: 40 },
+            },
+          },
+        ],
+      };
+    }
+
+    return await runInjectedYomitanScript(script, (action, params) => {
+      if (action !== 'termsFind') {
+        throw new Error(`unexpected action: ${action}`);
+      }
+
+      const text = (params as { text?: string } | undefined)?.text ?? '';
+      if (text.startsWith('だが ')) {
+        return {
+          originalTextLength: 2,
+          dictionaryEntries: [
+            {
+              headwords: [
+                {
+                  term: 'だが',
+                  reading: 'だが',
+                  sources: [{ originalText: 'だが', isPrimary: true, matchType: 'exact' }],
+                },
+              ],
+            },
+          ],
+        };
+      }
+      if (text.startsWith('それでも')) {
+        return {
+          originalTextLength: 4,
+          dictionaryEntries: [
+            {
+              headwords: [
+                {
+                  term: 'それでも',
+                  reading: 'それでも',
+                  sources: [{ originalText: 'それでも', isPrimary: true, matchType: 'exact' }],
+                },
+              ],
+            },
+          ],
+        };
+      }
+      if (text.startsWith('届かぬ')) {
+        return {
+          originalTextLength: 3,
+          dictionaryEntries: [
+            {
+              headwords: [
+                {
+                  term: '届く',
+                  reading: 'とどく',
+                  sources: [{ originalText: '届かぬ', isPrimary: true, matchType: 'exact' }],
+                },
+              ],
+            },
+          ],
+        };
+      }
+      if (text.startsWith('高み')) {
+        return {
+          originalTextLength: 2,
+          dictionaryEntries: [
+            {
+              headwords: [
+                {
+                  term: '高み',
+                  reading: 'たかみ',
+                  sources: [{ originalText: '高み', isPrimary: true, matchType: 'exact' }],
+                },
+              ],
+            },
+          ],
+        };
+      }
+      if (text.startsWith('があった')) {
+        return {
+          originalTextLength: 2,
+          dictionaryEntries: [
+            {
+              headwords: [
+                {
+                  term: 'があ',
+                  reading: '',
+                  sources: [{ originalText: 'が', isPrimary: true, matchType: 'exact' }],
+                },
+              ],
+            },
+          ],
+        };
+      }
+      if (text.startsWith('あった')) {
+        return {
+          originalTextLength: 3,
+          dictionaryEntries: [
+            {
+              headwords: [
+                {
+                  term: 'ある',
+                  reading: 'ある',
+                  sources: [{ originalText: 'あった', isPrimary: true, matchType: 'exact' }],
+                },
+              ],
+            },
+          ],
+        };
+      }
+      return { originalTextLength: 0, dictionaryEntries: [] };
+    });
+  });
+
+  const result = await requestYomitanScanTokens(
+    'だが それでも届かぬ高みがあった',
+    deps,
+    { error: () => undefined },
+  );
+
+  assert.deepEqual(
+    result?.map((token) => ({
+      surface: token.surface,
+      headword: token.headword,
+      startPos: token.startPos,
+      endPos: token.endPos,
+    })),
+    [
+      {
+        surface: 'だが',
+        headword: 'だが',
+        startPos: 0,
+        endPos: 2,
+      },
+      {
+        surface: 'それでも',
+        headword: 'それでも',
+        startPos: 3,
+        endPos: 7,
+      },
+      {
+        surface: '届かぬ',
+        headword: '届く',
+        startPos: 7,
+        endPos: 10,
+      },
+      {
+        surface: '高み',
+        headword: '高み',
+        startPos: 10,
+        endPos: 12,
+      },
+      {
+        surface: 'あった',
+        headword: 'ある',
+        startPos: 13,
+        endPos: 16,
+      },
+    ],
+  );
+});
+
 test('getYomitanDictionaryInfo requests dictionary info via backend action', async () => {
   let scriptValue = '';
   const deps = createDeps(async (script) => {
diff --git a/src/core/services/tokenizer/yomitan-parser-runtime.ts b/src/core/services/tokenizer/yomitan-parser-runtime.ts
index dad4930..fbf574b 100644
--- a/src/core/services/tokenizer/yomitan-parser-runtime.ts
+++ b/src/core/services/tokenizer/yomitan-parser-runtime.ts
@@ -843,14 +843,7 @@ const YOMITAN_SCANNING_HELPERS = String.raw`
             };
           }
         }
-        const fallback = dictionaryEntries?.[0]?.headwords?.[0];
-        return fallback
-          ? {
-              term: fallback.term,
-              reading: fallback.reading,
-              isNameMatch: matchedNameDictionary || isNameDictionaryEntry(dictionaryEntries?.[0])
-            }
-          : null;
+        return null;
       }
 `;