feat(tokenizer): refine Yomitan grouping and parser tooling

- map segmented Yomitan lines into single logical tokens and improve candidate selection heuristics

- limit frequency lookup to selected token text with POS-based exclusions and add debug logging hook

- add standalone Yomitan parser test script, deterministic utility-script shutdown, and docs/backlog updates
This commit is contained in:
kyasuda
2026-02-16 17:41:24 -08:00
parent 0eb2868805
commit 457e6f0f10
17 changed files with 1667 additions and 293 deletions

View File

@@ -213,7 +213,7 @@ test("tokenizeSubtitleService applies frequency dictionary ranks", async () => {
reading: "デス",
startPos: 1,
endPos: 2,
partOfSpeech: PartOfSpeech.bound_auxiliary,
partOfSpeech: PartOfSpeech.other,
isMerged: false,
isKnown: false,
isNPlusOneTarget: false,
@@ -228,7 +228,7 @@ test("tokenizeSubtitleService applies frequency dictionary ranks", async () => {
assert.equal(result.tokens?.[1]?.frequencyRank, 1200);
});
test("tokenizeSubtitleService uses all Yomitan headword candidates for frequency lookup", async () => {
test("tokenizeSubtitleService uses only selected Yomitan headword for frequency lookup", async () => {
const result = await tokenizeSubtitleService(
"猫です",
makeDeps({
@@ -262,7 +262,66 @@ test("tokenizeSubtitleService uses all Yomitan headword candidates for frequency
);
assert.equal(result.tokens?.length, 1);
assert.equal(result.tokens?.[0]?.frequencyRank, 40);
assert.equal(result.tokens?.[0]?.frequencyRank, 1200);
});
test("tokenizeSubtitleService keeps furigana-split Yomitan segments as one token", async () => {
  // The scanning parser delivers 友達 as two furigana segments (友/とも, 達/だち)
  // inside one logical line; the service must stitch them into a single token.
  const parseResults = [
    {
      source: "scanning-parser",
      index: 0,
      content: [
        [
          { text: "友", reading: "とも", headwords: [[{ term: "友達" }]] },
          { text: "達", reading: "だち" },
        ],
        [{ text: "と", reading: "と", headwords: [[{ term: "と" }]] }],
        [{ text: "話した", reading: "はなした", headwords: [[{ term: "話す" }]] }],
      ],
    },
  ];
  const result = await tokenizeSubtitleService(
    "友達と話した",
    makeDeps({
      getFrequencyDictionaryEnabled: () => true,
      getYomitanExt: () => ({ id: "dummy-ext" } as any),
      getYomitanParserWindow: () =>
        ({
          isDestroyed: () => false,
          webContents: {
            executeJavaScript: async () => parseResults,
          },
        } as unknown as Electron.BrowserWindow),
      // Only the selected headwords carry ranks; everything else misses.
      getFrequencyRank: (text) => (text === "友達" ? 22 : text === "話す" ? 90 : null),
    }),
  );
  const tokens = result.tokens;
  assert.equal(tokens?.length, 3);
  assert.equal(tokens?.[0]?.surface, "友達");
  assert.equal(tokens?.[0]?.reading, "ともだち");
  assert.equal(tokens?.[0]?.headword, "友達");
  assert.equal(tokens?.[0]?.frequencyRank, 22);
  assert.equal(tokens?.[1]?.surface, "と");
  assert.equal(tokens?.[1]?.frequencyRank, undefined);
  assert.equal(tokens?.[2]?.surface, "話した");
  assert.equal(tokens?.[2]?.frequencyRank, 90);
});
test("tokenizeSubtitleService prefers exact headword frequency over surface/reading when available", async () => {
@@ -299,7 +358,7 @@ test("tokenizeSubtitleService prefers exact headword frequency over surface/read
assert.equal(result.tokens?.[0]?.frequencyRank, 8);
});
test("tokenizeSubtitleService keeps no frequency when only reading matches and headword candidates miss", async () => {
test("tokenizeSubtitleService keeps no frequency when only reading matches and headword misses", async () => {
const result = await tokenizeSubtitleService(
"猫です",
makeDeps({
@@ -333,7 +392,7 @@ test("tokenizeSubtitleService keeps no frequency when only reading matches and h
assert.equal(result.tokens?.[0]?.frequencyRank, undefined);
});
test("tokenizeSubtitleService ignores invalid frequency ranks and takes best valid headword candidate", async () => {
test("tokenizeSubtitleService ignores invalid frequency rank on selected headword", async () => {
const result = await tokenizeSubtitleService(
"猫です",
makeDeps({
@@ -367,7 +426,7 @@ test("tokenizeSubtitleService ignores invalid frequency ranks and takes best val
);
assert.equal(result.tokens?.length, 1);
assert.equal(result.tokens?.[0]?.frequencyRank, 500);
assert.equal(result.tokens?.[0]?.frequencyRank, undefined);
});
test("tokenizeSubtitleService handles real-word frequency candidates and prefers most frequent term", async () => {
@@ -472,6 +531,55 @@ test("tokenizeSubtitleService ignores frequency lookup failures", async () => {
assert.equal(result.tokens?.[0]?.frequencyRank, undefined);
});
test("tokenizeSubtitleService skips frequency rank when Yomitan token is enriched as particle by mecab pos1", async () => {
  // Yomitan finds a dictionary hit for は, but mecab enrichment tags it as
  // 助詞 — the POS-based exclusion should then suppress the frequency rank.
  const result = await tokenizeSubtitleService(
    "は",
    makeDeps({
      getFrequencyDictionaryEnabled: () => true,
      getYomitanExt: () => ({ id: "dummy-ext" } as any),
      getYomitanParserWindow: () =>
        ({
          isDestroyed: () => false,
          webContents: {
            executeJavaScript: async () => [
              {
                source: "scanning-parser",
                index: 0,
                content: [
                  [{ text: "は", reading: "は", headwords: [[{ term: "は" }]] }],
                ],
              },
            ],
          },
        } as unknown as Electron.BrowserWindow),
      tokenizeWithMecab: async () => [
        {
          headword: "は",
          surface: "は",
          reading: "ハ",
          startPos: 0,
          endPos: 1,
          partOfSpeech: PartOfSpeech.particle,
          pos1: "助詞",
          isMerged: false,
          isKnown: false,
          isNPlusOneTarget: false,
        },
      ],
      // A rank exists for は, proving exclusion (not a lookup miss) is at work.
      getFrequencyRank: (text) => (text === "は" ? 10 : null),
    }),
  );
  const tokens = result.tokens;
  assert.equal(tokens?.length, 1);
  assert.equal(tokens?.[0]?.pos1, "助詞");
  assert.equal(tokens?.[0]?.frequencyRank, undefined);
});
test("tokenizeSubtitleService ignores invalid frequency ranks", async () => {
const result = await tokenizeSubtitleService(
"猫",
@@ -753,6 +861,8 @@ test("tokenizeSubtitleService uses Yomitan parser result when available", async
reading: "ねこ",
headwords: [[{ term: "猫" }]],
},
],
[
{
text: "です",
reading: "です",
@@ -783,6 +893,155 @@ test("tokenizeSubtitleService uses Yomitan parser result when available", async
assert.equal(result.tokens?.[1]?.isKnown, false);
});
test("tokenizeSubtitleService logs selected Yomitan groups when debug toggle is enabled", async () => {
  // Capture console.info while the service runs so we can check for the
  // debug line; restore the original sink even if the call throws.
  const captured: string[] = [];
  const restoreInfo = console.info;
  console.info = (...args: unknown[]) => {
    captured.push(args.map((value) => String(value)).join(" "));
  };
  try {
    await tokenizeSubtitleService(
      "友達と話した",
      makeDeps({
        getYomitanExt: () => ({ id: "dummy-ext" } as any),
        getYomitanParserWindow: () =>
          ({
            isDestroyed: () => false,
            webContents: {
              executeJavaScript: async () => [
                {
                  source: "scanning-parser",
                  index: 0,
                  content: [
                    [
                      { text: "友", reading: "とも", headwords: [[{ term: "友達" }]] },
                      { text: "達", reading: "だち" },
                    ],
                    [{ text: "と", reading: "と", headwords: [[{ term: "と" }]] }],
                  ],
                },
              ],
            },
          } as unknown as Electron.BrowserWindow),
        tokenizeWithMecab: async () => null,
        getYomitanGroupDebugEnabled: () => true,
      }),
    );
  } finally {
    console.info = restoreInfo;
  }
  const sawGroupLog = captured.some((line) =>
    line.includes("Selected Yomitan token groups"),
  );
  assert.ok(sawGroupLog);
});
test("tokenizeSubtitleService does not log Yomitan groups when debug toggle is disabled", async () => {
  // Record everything sent to console.info during the call; the debug line
  // must NOT appear when the toggle reports false.
  const captured: string[] = [];
  const restoreInfo = console.info;
  console.info = (...args: unknown[]) => {
    captured.push(args.map((value) => String(value)).join(" "));
  };
  try {
    await tokenizeSubtitleService(
      "友達と話した",
      makeDeps({
        getYomitanExt: () => ({ id: "dummy-ext" } as any),
        getYomitanParserWindow: () =>
          ({
            isDestroyed: () => false,
            webContents: {
              executeJavaScript: async () => [
                {
                  source: "scanning-parser",
                  index: 0,
                  content: [
                    [
                      { text: "友", reading: "とも", headwords: [[{ term: "友達" }]] },
                      { text: "達", reading: "だち" },
                    ],
                  ],
                },
              ],
            },
          } as unknown as Electron.BrowserWindow),
        tokenizeWithMecab: async () => null,
        getYomitanGroupDebugEnabled: () => false,
      }),
    );
  } finally {
    console.info = restoreInfo;
  }
  const sawGroupLog = captured.some((line) =>
    line.includes("Selected Yomitan token groups"),
  );
  assert.equal(sawGroupLog, false);
});
test("tokenizeSubtitleService preserves segmented Yomitan line as one token", async () => {
  // One logical line split into two furigana segments (猫 + です) must come
  // back as a single merged token covering the whole surface.
  const segmentedLine = [
    { text: "猫", reading: "ねこ", headwords: [[{ term: "猫です" }]] },
    { text: "です", reading: "です" },
  ];
  const result = await tokenizeSubtitleService(
    "猫です",
    makeDeps({
      getYomitanExt: () => ({ id: "dummy-ext" } as any),
      getYomitanParserWindow: () =>
        ({
          isDestroyed: () => false,
          webContents: {
            executeJavaScript: async () => [
              {
                source: "scanning-parser",
                index: 0,
                content: [segmentedLine],
              },
            ],
          },
        } as unknown as Electron.BrowserWindow),
      tokenizeWithMecab: async () => null,
    }),
  );
  assert.equal(result.text, "猫です");
  assert.equal(result.tokens?.length, 1);
  const token = result.tokens?.[0];
  assert.equal(token?.surface, "猫です");
  assert.equal(token?.reading, "ねこです");
  assert.equal(token?.headword, "猫です");
  assert.equal(token?.isKnown, false);
});
test("tokenizeSubtitleService prefers mecab parser tokens when scanning parser returns one token", async () => {
const result = await tokenizeSubtitleService(
"俺は小園にいきたい",
@@ -880,6 +1139,59 @@ test("tokenizeSubtitleService keeps scanning parser tokens when they are already
assert.equal(result.tokens?.[2]?.frequencyRank, undefined);
});
test("tokenizeSubtitleService prefers parse candidates with fewer fragment-only kana tokens when source priority is equal", async () => {
  // Candidate A over-fragments 「にいきたい」 into reading-less kana chunks;
  // candidate B segments cleanly. The scorer should choose B.
  const fragmentedCandidate = {
    source: "mecab-fragmented",
    index: 0,
    content: [
      [{ text: "俺", reading: "おれ", headwords: [[{ term: "俺" }]] }],
      [{ text: "は", reading: "", headwords: [[{ term: "は" }]] }],
      [{ text: "公園", reading: "こうえん", headwords: [[{ term: "公園" }]] }],
      [{ text: "にい", reading: "", headwords: [[{ term: "兄" }], [{ term: "二位" }]] }],
      [{ text: "きたい", reading: "", headwords: [[{ term: "期待" }], [{ term: "来る" }]] }],
    ],
  };
  const cleanCandidate = {
    source: "mecab",
    index: 0,
    content: [
      [{ text: "俺", reading: "おれ", headwords: [[{ term: "俺" }]] }],
      [{ text: "は", reading: "は", headwords: [[{ term: "は" }]] }],
      [{ text: "公園", reading: "こうえん", headwords: [[{ term: "公園" }]] }],
      [{ text: "に", reading: "に", headwords: [[{ term: "に" }]] }],
      [{ text: "行きたい", reading: "いきたい", headwords: [[{ term: "行きたい" }]] }],
    ],
  };
  const rankTable = new Map<string, number>([
    ["俺", 51],
    ["公園", 2304],
    ["行きたい", 1500],
  ]);
  const result = await tokenizeSubtitleService(
    "俺は公園にいきたい",
    makeDeps({
      getYomitanExt: () => ({ id: "dummy-ext" } as any),
      getYomitanParserWindow: () =>
        ({
          isDestroyed: () => false,
          webContents: {
            executeJavaScript: async () => [fragmentedCandidate, cleanCandidate],
          },
        } as unknown as Electron.BrowserWindow),
      getFrequencyDictionaryEnabled: () => true,
      getFrequencyRank: (text) => rankTable.get(text) ?? null,
      tokenizeWithMecab: async () => null,
    }),
  );
  const surfaces = result.tokens?.map((token) => token.surface).join(",");
  assert.equal(surfaces, "俺,は,公園,に,行きたい");
  assert.equal(result.tokens?.[1]?.frequencyRank, undefined);
  assert.equal(result.tokens?.[3]?.frequencyRank, undefined);
  assert.equal(result.tokens?.[4]?.frequencyRank, 1500);
});
test("tokenizeSubtitleService still assigns frequency to non-known Yomitan tokens", async () => {
const result = await tokenizeSubtitleService(
"小園に",
@@ -914,7 +1226,7 @@ test("tokenizeSubtitleService still assigns frequency to non-known Yomitan token
assert.equal(result.tokens?.[0]?.isKnown, true);
assert.equal(result.tokens?.[0]?.frequencyRank, 75);
assert.equal(result.tokens?.[1]?.isKnown, false);
assert.equal(result.tokens?.[1]?.frequencyRank, undefined);
assert.equal(result.tokens?.[1]?.frequencyRank, 3000);
});
test("tokenizeSubtitleService marks tokens as known using callback", async () => {

View File

@@ -13,6 +13,7 @@ import {
shouldIgnoreJlptForMecabPos1,
shouldIgnoreJlptByTerm,
} from "./jlpt-token-filter";
import { createLogger } from "../../logger";
interface YomitanParseHeadword {
term?: unknown;
@@ -37,6 +38,7 @@ const KATAKANA_CODEPOINT_START = 0x30a1;
const KATAKANA_CODEPOINT_END = 0x30f6;
const JLPT_LEVEL_LOOKUP_CACHE_LIMIT = 2048;
const FREQUENCY_RANK_LOOKUP_CACHE_LIMIT = 2048;
const logger = createLogger("main:tokenizer");
const jlptLevelLookupCaches = new WeakMap<
(text: string) => JlptLevel | null,
@@ -70,6 +72,7 @@ export interface TokenizerServiceDeps {
getFrequencyDictionaryEnabled?: () => boolean;
getFrequencyRank?: FrequencyDictionaryLookup;
getMinSentenceWordsForNPlusOne?: () => number;
getYomitanGroupDebugEnabled?: () => boolean;
tokenizeWithMecab: (text: string) => Promise<MergedToken[] | null>;
}
@@ -92,6 +95,7 @@ export interface TokenizerDepsRuntimeOptions {
getFrequencyDictionaryEnabled?: () => boolean;
getFrequencyRank?: FrequencyDictionaryLookup;
getMinSentenceWordsForNPlusOne?: () => number;
getYomitanGroupDebugEnabled?: () => boolean;
getMecabTokenizer: () => MecabTokenizerLike | null;
}
@@ -197,6 +201,8 @@ export function createTokenizerDepsRuntimeService(
getFrequencyRank: options.getFrequencyRank,
getMinSentenceWordsForNPlusOne:
options.getMinSentenceWordsForNPlusOne ?? (() => 3),
getYomitanGroupDebugEnabled:
options.getYomitanGroupDebugEnabled ?? (() => false),
tokenizeWithMecab: async (text) => {
const mecabTokenizer = options.getMecabTokenizer();
if (!mecabTokenizer) {
@@ -253,40 +259,19 @@ function resolveFrequencyLookupText(token: MergedToken): string {
}
function getFrequencyLookupTextCandidates(token: MergedToken): string[] {
const tokenWithCandidates = token as MergedToken & {
frequencyLookupTerms?: string[];
};
const lookupTextCandidates: string[] = [];
const addLookupText = (text: string | undefined): void => {
if (!text) {
return;
}
const trimmed = text.trim();
if (!trimmed) {
return;
}
lookupTextCandidates.push(trimmed);
};
const lookupText = resolveFrequencyLookupText(token).trim();
return lookupText ? [lookupText] : [];
}
if (Array.isArray(tokenWithCandidates.frequencyLookupTerms)) {
for (const term of tokenWithCandidates.frequencyLookupTerms) {
addLookupText(term);
}
function isFrequencyExcludedByPos(token: MergedToken): boolean {
if (
token.partOfSpeech === PartOfSpeech.particle ||
token.partOfSpeech === PartOfSpeech.bound_auxiliary
) {
return true;
}
addLookupText(resolveFrequencyLookupText(token));
const uniqueLookupTerms: string[] = [];
const seen = new Set<string>();
for (const term of lookupTextCandidates) {
if (seen.has(term)) {
continue;
}
seen.add(term);
uniqueLookupTerms.push(term);
}
return uniqueLookupTerms;
return token.pos1 === "助詞" || token.pos1 === "助動詞";
}
function applyFrequencyMarking(
@@ -294,6 +279,10 @@ function applyFrequencyMarking(
getFrequencyRank: FrequencyDictionaryLookup,
): MergedToken[] {
return tokens.map((token) => {
if (isFrequencyExcludedByPos(token)) {
return { ...token, frequencyRank: undefined };
}
const lookupTexts = getFrequencyLookupTextCandidates(token);
if (lookupTexts.length === 0) {
return { ...token, frequencyRank: undefined };
@@ -499,27 +488,6 @@ function extractYomitanHeadword(segment: YomitanParseSegment): string {
return "";
}
function extractYomitanHeadwords(segment: YomitanParseSegment): string[] {
  // Collect every non-empty, trimmed headword term across all candidate
  // rows, preserving row order. Returns [] when the segment carries no
  // well-formed headword rows.
  const rows = segment.headwords;
  if (!isYomitanHeadwordRows(rows)) {
    return [];
  }
  return rows.flatMap((row) =>
    row
      .map((candidate) => (isString(candidate.term) ? candidate.term.trim() : ""))
      .filter((term) => term.length > 0),
  );
}
function applyJlptMarking(
tokens: MergedToken[],
getJlptLevel: (text: string) => JlptLevel | null,
@@ -575,41 +543,53 @@ function mapYomitanParseResultItemToMergedTokens(
}
validLineCount += 1;
let combinedSurface = "";
let combinedReading = "";
let combinedHeadword = "";
for (const segment of line) {
const segmentText = segment.text;
if (!segmentText || segmentText.length === 0) {
continue;
}
const start = charOffset;
const end = start + segmentText.length;
charOffset = end;
const headword = extractYomitanHeadword(segment) || segmentText;
const frequencyLookupTerms = extractYomitanHeadwords(segment);
tokens.push({
surface: segmentText,
reading: typeof segment.reading === "string" ? segment.reading : "",
headword,
startPos: start,
endPos: end,
partOfSpeech: PartOfSpeech.other,
pos1: "",
isMerged: true,
isNPlusOneTarget: false,
isKnown: (() => {
const matchText = resolveKnownWordText(
segmentText,
headword,
knownWordMatchMode,
);
return matchText ? isKnownWord(matchText) : false;
})(),
frequencyLookupTerms:
frequencyLookupTerms.length > 0 ? frequencyLookupTerms : undefined,
});
combinedSurface += segmentText;
if (typeof segment.reading === "string") {
combinedReading += segment.reading;
}
if (!combinedHeadword) {
combinedHeadword = extractYomitanHeadword(segment);
}
}
if (!combinedSurface) {
continue;
}
const start = charOffset;
const end = start + combinedSurface.length;
charOffset = end;
const headword = combinedHeadword || combinedSurface;
tokens.push({
surface: combinedSurface,
reading: combinedReading,
headword,
startPos: start,
endPos: end,
partOfSpeech: PartOfSpeech.other,
pos1: "",
isMerged: true,
isNPlusOneTarget: false,
isKnown: (() => {
const matchText = resolveKnownWordText(
combinedSurface,
headword,
knownWordMatchMode,
);
return matchText ? isKnownWord(matchText) : false;
})(),
});
}
if (validLineCount === 0 || tokens.length === 0) {
@@ -641,13 +621,52 @@ function selectBestYomitanParseCandidate(
current.tokens.length > best.tokens.length ? current : best,
);
const getCandidateScore = (candidate: YomitanParseCandidate): number => {
const readableTokenCount = candidate.tokens.filter(
(token) => token.reading.trim().length > 0,
).length;
const suspiciousKanaFragmentCount = candidate.tokens.filter((token) =>
token.reading.trim().length === 0 &&
token.surface.length >= 2 &&
Array.from(token.surface).every((char) => isKanaChar(char))
).length;
return (
readableTokenCount * 100 -
suspiciousKanaFragmentCount * 50 -
candidate.tokens.length
);
};
const chooseBestCandidate = (
items: YomitanParseCandidate[],
): YomitanParseCandidate | null => {
if (items.length === 0) {
return null;
}
return items.reduce((best, current) => {
const bestScore = getCandidateScore(best);
const currentScore = getCandidateScore(current);
if (currentScore !== bestScore) {
return currentScore > bestScore ? current : best;
}
if (current.tokens.length !== best.tokens.length) {
return current.tokens.length < best.tokens.length ? current : best;
}
return best;
});
};
if (scanningCandidates.length > 0) {
const bestScanning = getBestByTokenCount(scanningCandidates);
if (bestScanning && bestScanning.tokens.length > 1) {
return bestScanning.tokens;
}
const bestMecab = getBestByTokenCount(mecabCandidates);
const bestMecab = chooseBestCandidate(mecabCandidates);
if (
bestMecab &&
bestMecab.tokens.length > (bestScanning?.tokens.length ?? 0)
@@ -658,7 +677,11 @@ function selectBestYomitanParseCandidate(
return bestScanning ? bestScanning.tokens : null;
}
const bestCandidate = getBestByTokenCount(candidates);
const multiTokenCandidates = candidates.filter(
(candidate) => candidate.tokens.length > 1,
);
const pool = multiTokenCandidates.length > 0 ? multiTokenCandidates : candidates;
const bestCandidate = chooseBestCandidate(pool);
return bestCandidate ? bestCandidate.tokens : null;
}
@@ -688,6 +711,25 @@ function mapYomitanParseResultsToMergedTokens(
return bestCandidate;
}
function logSelectedYomitanGroups(text: string, tokens: MergedToken[]): void {
  // Debug hook: emit the chosen Yomitan token grouping for a subtitle line.
  // Nothing useful to report for a missing or empty token list.
  if (!tokens?.length) {
    return;
  }
  const groups = tokens.map((token, index) => ({
    index,
    surface: token.surface,
    headword: token.headword,
    reading: token.reading,
    startPos: token.startPos,
    endPos: token.endPos,
  }));
  logger.info("Selected Yomitan token groups", {
    text,
    tokenCount: tokens.length,
    groups,
  });
}
function pickClosestMecabPos1(
token: MergedToken,
mecabTokens: MergedToken[],
@@ -930,6 +972,10 @@ async function parseWithYomitanInternalParser(
return null;
}
if (deps.getYomitanGroupDebugEnabled?.() === true) {
logSelectedYomitanGroups(text, yomitanTokens);
}
return enrichYomitanPos1(yomitanTokens, deps, text);
} catch (err) {
console.error("Yomitan parser request failed:", (err as Error).message);