Fix Yomitan token headword frequency matching and add frequency tests

2026-06-15 15:13:31 -07:00 · 2026-02-16 13:21:19 -08:00
parent e142d2dc3b
commit 0eb2868805
7 changed files with 1586 additions and 80 deletions
@@ -228,6 +228,223 @@ test("tokenizeSubtitleService applies frequency dictionary ranks", async () => {
  assert.equal(result.tokens?.[1]?.frequencyRank, 1200);
 });

+// The mocked parser returns one segment ("猫です") carrying two headword groups.
+// The rank stub knows both headwords (猫 -> 40, 猫です -> 1200); the token is
+// expected to end up with the lower of the two ranks.
+test("tokenizeSubtitleService uses all Yomitan headword candidates for frequency lookup", async () => {
+  const result = await tokenizeSubtitleService(
+    "猫です",
+    makeDeps({
+      getFrequencyDictionaryEnabled: () => true,
+      getYomitanExt: () => ({ id: "dummy-ext" } as any),
+      getYomitanParserWindow: () => ({
+        isDestroyed: () => false,
+        webContents: {
+          executeJavaScript: async () => [
+            {
+              source: "scanning-parser",
+              index: 0,
+              content: [
+                [
+                  {
+                    text: "猫です",
+                    reading: "ねこです",
+                    headwords: [
+                      [{ term: "猫です" }],
+                      [{ term: "猫" }],
+                    ],
+                  },
+                ],
+              ],
+            },
+          ],
+        },
+      } as unknown as Electron.BrowserWindow),
+      getFrequencyRank: (text) => (text === "猫" ? 40 : text === "猫です" ? 1200 : null),
+    }),
+  );
+
+  assert.equal(result.tokens?.length, 1);
+  assert.equal(result.tokens?.[0]?.frequencyRank, 40);
+});
+
+// Surface "猫" has rank 1200 while its only headword "ネコ" has rank 8; the
+// token is expected to carry the headword's rank.
+test("tokenizeSubtitleService prefers exact headword frequency over surface/reading when available", async () => {
+  const result = await tokenizeSubtitleService(
+    "猫です",
+    makeDeps({
+      getFrequencyDictionaryEnabled: () => true,
+      getYomitanExt: () => ({ id: "dummy-ext" } as any),
+      getYomitanParserWindow: () => ({
+        isDestroyed: () => false,
+        webContents: {
+          executeJavaScript: async () => [
+            {
+              source: "scanning-parser",
+              index: 0,
+              content: [
+                [
+                  {
+                    text: "猫",
+                    reading: "ねこ",
+                    headwords: [[{ term: "ネコ" }]],
+                  },
+                ],
+              ],
+            },
+          ],
+        },
+      } as unknown as Electron.BrowserWindow),
+      getFrequencyRank: (text) => (text === "猫" ? 1200 : text === "ネコ" ? 8 : null),
+    }),
+  );
+
+  assert.equal(result.tokens?.length, 1);
+  assert.equal(result.tokens?.[0]?.frequencyRank, 8);
+});
+
+// Only the reading "ねこ" has a rank in the stub (77); the surface "猫" and the
+// headword "猫です" have none. The token is expected to stay without a rank,
+// i.e. the reading alone must not produce a match here.
+test("tokenizeSubtitleService keeps no frequency when only reading matches and headword candidates miss", async () => {
+  const result = await tokenizeSubtitleService(
+    "猫です",
+    makeDeps({
+      getFrequencyDictionaryEnabled: () => true,
+      getYomitanExt: () => ({ id: "dummy-ext" } as any),
+      getYomitanParserWindow: () => ({
+        isDestroyed: () => false,
+        webContents: {
+          executeJavaScript: async () => [
+            {
+              source: "scanning-parser",
+              index: 0,
+              content: [
+                [
+                  {
+                    text: "猫",
+                    reading: "ねこ",
+                    headwords: [[{ term: "猫です" }]],
+                  },
+                ],
+              ],
+            },
+          ],
+        },
+      } as unknown as Electron.BrowserWindow),
+      getFrequencyRank: (text) => (text === "ねこ" ? 77 : null),
+    }),
+  );
+
+  assert.equal(result.tokens?.length, 1);
+  assert.equal(result.tokens?.[0]?.frequencyRank, undefined);
+});
+
+// The first headword ("猫") yields NaN from the rank stub and the second
+// ("猫です") yields 500. The NaN must be discarded so that 500 is reported.
+test("tokenizeSubtitleService ignores invalid frequency ranks and takes best valid headword candidate", async () => {
+  const result = await tokenizeSubtitleService(
+    "猫です",
+    makeDeps({
+      getFrequencyDictionaryEnabled: () => true,
+      getYomitanExt: () => ({ id: "dummy-ext" } as any),
+      getYomitanParserWindow: () => ({
+        isDestroyed: () => false,
+        webContents: {
+          executeJavaScript: async () => [
+            {
+              source: "scanning-parser",
+              index: 0,
+              content: [
+                [
+                  {
+                    text: "猫です",
+                    reading: "ねこです",
+                    headwords: [
+                      [{ term: "猫" }],
+                      [{ term: "猫です" }],
+                    ],
+                  },
+                ],
+              ],
+            },
+          ],
+        },
+      } as unknown as Electron.BrowserWindow),
+      getFrequencyRank: (text) => (text === "猫" ? Number.NaN : text === "猫です" ? 500 : null),
+    }),
+  );
+
+  assert.equal(result.tokens?.length, 1);
+  assert.equal(result.tokens?.[0]?.frequencyRank, 500);
+});
+
+// Both headwords of "昨日" have ranks (昨日 -> 40, きのう -> 120); the token is
+// expected to report the numerically lowest rank, 40.
+test("tokenizeSubtitleService handles real-word frequency candidates and prefers most frequent term", async () => {
+  const result = await tokenizeSubtitleService(
+    "昨日",
+    makeDeps({
+      getFrequencyDictionaryEnabled: () => true,
+      getYomitanExt: () => ({ id: "dummy-ext" } as any),
+      getYomitanParserWindow: () => ({
+        isDestroyed: () => false,
+        webContents: {
+          executeJavaScript: async () => [
+            {
+              source: "scanning-parser",
+              index: 0,
+              content: [
+                [
+                  {
+                    text: "昨日",
+                    reading: "きのう",
+                    headwords: [
+                      [{ term: "昨日" }],
+                      [{ term: "きのう" }],
+                    ],
+                  },
+                ],
+              ],
+            },
+          ],
+        },
+      } as unknown as Electron.BrowserWindow),
+      getFrequencyRank: (text) => (text === "きのう" ? 120 : text === "昨日" ? 40 : null),
+    }),
+  );
+
+  assert.equal(result.tokens?.length, 1);
+  assert.equal(result.tokens?.[0]?.frequencyRank, 40);
+});
+
+// Three headwords: "猫" (88), "猫です" (9000) and "unknown-term", for which the
+// stub returns -1. The negative value must not win the "lowest rank" comparison;
+// 88 is the expected result.
+test("tokenizeSubtitleService ignores candidates with no dictionary rank when higher-frequency candidate exists", async () => {
+  const result = await tokenizeSubtitleService(
+    "猫です",
+    makeDeps({
+      getFrequencyDictionaryEnabled: () => true,
+      getYomitanExt: () => ({ id: "dummy-ext" } as any),
+      getYomitanParserWindow: () => ({
+        isDestroyed: () => false,
+        webContents: {
+          executeJavaScript: async () => [
+            {
+              source: "scanning-parser",
+              index: 0,
+              content: [
+                [
+                  {
+                    text: "猫",
+                    reading: "ねこ",
+                    headwords: [
+                      [{ term: "猫" }],
+                      [{ term: "猫です" }],
+                      [{ term: "unknown-term" }],
+                    ],
+                  },
+                ],
+              ],
+            },
+          ],
+        },
+      } as unknown as Electron.BrowserWindow),
+      getFrequencyRank: (text) => (text === "unknown-term" ? -1 : text === "猫" ? 88 : text === "猫です" ? 9000 : null),
+    }),
+  );
+
+  assert.equal(result.tokens?.length, 1);
+  assert.equal(result.tokens?.[0]?.frequencyRank, 88);
+});
+
 test("tokenizeSubtitleService ignores frequency lookup failures", async () => {
  const result = await tokenizeSubtitleService(
    "猫",
@@ -557,10 +774,147 @@ test("tokenizeSubtitleService uses Yomitan parser result when available", async
  );

  assert.equal(result.text, "猫です");
-  assert.equal(result.tokens?.length, 1);
-  assert.equal(result.tokens?.[0]?.surface, "猫です");
-  assert.equal(result.tokens?.[0]?.reading, "ねこです");
+  assert.equal(result.tokens?.length, 2);
+  assert.equal(result.tokens?.[0]?.surface, "猫");
+  assert.equal(result.tokens?.[0]?.reading, "ねこ");
  assert.equal(result.tokens?.[0]?.isKnown, false);
+  assert.equal(result.tokens?.[1]?.surface, "です");
+  assert.equal(result.tokens?.[1]?.reading, "です");
+  assert.equal(result.tokens?.[1]?.isKnown, false);
+});
+
+// The mocked parser returns two results: a "scanning-parser" entry with a single
+// unsplit segment and a "mecab" entry with five segments. The five-token
+// segmentation is expected, and the 小園 token should pick up rank 25.
+test("tokenizeSubtitleService prefers mecab parser tokens when scanning parser returns one token", async () => {
+  const result = await tokenizeSubtitleService(
+    "俺は小園にいきたい",
+    makeDeps({
+      getYomitanExt: () => ({ id: "dummy-ext" } as any),
+      getYomitanParserWindow: () => ({
+        isDestroyed: () => false,
+        webContents: {
+          executeJavaScript: async () => [
+            {
+              source: "scanning-parser",
+              index: 0,
+              content: [
+                [
+                  {
+                    text: "俺は小園にいきたい",
+                    reading: "おれは小園にいきたい",
+                    headwords: [[{ term: "俺は小園にいきたい" }]],
+                  },
+                ],
+              ],
+            },
+            {
+              source: "mecab",
+              index: 0,
+              content: [
+                [{ text: "俺", reading: "おれ", headwords: [[{ term: "俺" }]] }],
+                [{ text: "は", reading: "は", headwords: [[{ term: "は" }]] }],
+                [{ text: "小園", reading: "おうえん", headwords: [[{ term: "小園" }]] }],
+                [{ text: "に", reading: "に", headwords: [[{ term: "に" }]] }],
+                [{ text: "いきたい", reading: "いきたい", headwords: [[{ term: "いきたい" }]] }],
+              ],
+            },
+          ],
+        },
+      } as unknown as Electron.BrowserWindow),
+      getFrequencyDictionaryEnabled: () => true,
+      tokenizeWithMecab: async () => null,
+      getFrequencyRank: (text) =>
+        text === "小園" ? 25 : text === "いきたい" ? 1500 : null,
+    }),
+  );
+
+  assert.equal(result.tokens?.length, 5);
+  assert.equal(result.tokens?.map((token) => token.surface).join(","), "俺,は,小園,に,いきたい");
+  assert.equal(result.tokens?.[2]?.surface, "小園");
+  assert.equal(result.tokens?.[2]?.frequencyRank, 25);
+});
+
+// Both sources are already split (scanning-parser: 3 segments, mecab: 5). The
+// scanning-parser segmentation is expected to be kept even though the mecab
+// entry produced more tokens. Only 小園 has a rank in the stub.
+test("tokenizeSubtitleService keeps scanning parser tokens when they are already split", async () => {
+  const result = await tokenizeSubtitleService(
+    "小園に行きたい",
+    makeDeps({
+      getYomitanExt: () => ({ id: "dummy-ext" } as any),
+      getYomitanParserWindow: () => ({
+        isDestroyed: () => false,
+        webContents: {
+          executeJavaScript: async () => [
+            {
+              source: "scanning-parser",
+              index: 0,
+              content: [
+                [{ text: "小園", reading: "おうえん", headwords: [[{ term: "小園" }]] }],
+                [{ text: "に", reading: "に", headwords: [[{ term: "に" }]] }],
+                [{ text: "行きたい", reading: "いきたい", headwords: [[{ term: "行きたい" }]] }],
+              ],
+            },
+            {
+              source: "mecab",
+              index: 0,
+              content: [
+                [{ text: "小", reading: "お", headwords: [[{ term: "小" }]] }],
+                [{ text: "園", reading: "えん", headwords: [[{ term: "園" }]] }],
+                [{ text: "に", reading: "に", headwords: [[{ term: "に" }]] }],
+                [{ text: "行き", reading: "いき", headwords: [[{ term: "行き" }]] }],
+                [{ text: "たい", reading: "たい", headwords: [[{ term: "たい" }]] }],
+              ],
+            },
+          ],
+        },
+      } as unknown as Electron.BrowserWindow),
+      getFrequencyDictionaryEnabled: () => true,
+      getFrequencyRank: (text) => (text === "小園" ? 20 : null),
+      tokenizeWithMecab: async () => null,
+    }),
+  );
+
+  assert.equal(result.tokens?.length, 3);
+  assert.equal(
+    result.tokens?.map((token) => token.surface).join(","),
+    "小園,に,行きたい",
+  );
+  assert.equal(result.tokens?.[0]?.frequencyRank, 20);
+  assert.equal(result.tokens?.[1]?.frequencyRank, undefined);
+  assert.equal(result.tokens?.[2]?.frequencyRank, undefined);
+});
+
+// Two scanning-parser segments: 小園 (reported as known by isKnownWord, rank 75)
+// and に (not known, rank stub returns 3000).
+// NOTE(review): the title says frequency is still assigned to non-known tokens,
+// but the non-known token (に) is asserted to have NO rank while the known token
+// keeps its rank. The reason に is expected to be unranked is not visible in
+// this chunk — confirm the intended behaviour and consider renaming the test.
+test("tokenizeSubtitleService still assigns frequency to non-known Yomitan tokens", async () => {
+  const result = await tokenizeSubtitleService(
+    "小園に",
+    makeDeps({
+      getYomitanExt: () => ({ id: "dummy-ext" } as any),
+      getYomitanParserWindow: () => ({
+        isDestroyed: () => false,
+        webContents: {
+          executeJavaScript: async () => [
+            {
+              source: "scanning-parser",
+              index: 0,
+              content: [
+                [
+                  { text: "小園", reading: "おうえん", headwords: [[{ term: "小園" }]] },
+                ],
+                [
+                  { text: "に", reading: "に", headwords: [[{ term: "に" }]] },
+                ],
+              ],
+            },
+          ],
+        },
+      } as unknown as Electron.BrowserWindow),
+      getFrequencyDictionaryEnabled: () => true,
+      getFrequencyRank: (text) => (text === "小園" ? 75 : text === "に" ? 3000 : null),
+      isKnownWord: (text) => text === "小園",
+    }),
+  );
+
+  assert.equal(result.tokens?.length, 2);
+  assert.equal(result.tokens?.[0]?.isKnown, true);
+  assert.equal(result.tokens?.[0]?.frequencyRank, 75);
+  assert.equal(result.tokens?.[1]?.isKnown, false);
+  assert.equal(result.tokens?.[1]?.frequencyRank, undefined);
 });

 test("tokenizeSubtitleService marks tokens as known using callback", async () => {
@@ -589,6 +943,63 @@ test("tokenizeSubtitleService marks tokens as known using callback", async () =>
  assert.equal(result.tokens?.[0]?.isKnown, true);
 });

+// Tokens are supplied through the tokenizeWithMecab dependency (no Yomitan
+// mocks). isKnownWord reports only 既知 as known; both tokens are expected to
+// receive their rank from the stub (20 and 30) regardless of known status.
+test("tokenizeSubtitleService still assigns frequency rank to non-known tokens", async () => {
+  const result = await tokenizeSubtitleService(
+    "既知未知",
+    makeDeps({
+      tokenizeWithMecab: async () => [
+        {
+          surface: "既知",
+          reading: "キチ",
+          partOfSpeech: PartOfSpeech.noun,
+          pos1: "",
+          pos2: "",
+          pos3: "",
+          pos4: "",
+          inflectionType: "",
+          inflectionForm: "",
+          headword: "既知",
+          katakanaReading: "キチ",
+          pronunciation: "キチ",
+          startPos: 0,
+          endPos: 2,
+          isMerged: false,
+          isKnown: false,
+          isNPlusOneTarget: false,
+        },
+        {
+          surface: "未知",
+          reading: "ミチ",
+          partOfSpeech: PartOfSpeech.noun,
+          pos1: "",
+          pos2: "",
+          pos3: "",
+          pos4: "",
+          inflectionType: "",
+          inflectionForm: "",
+          headword: "未知",
+          katakanaReading: "ミチ",
+          pronunciation: "ミチ",
+          startPos: 2,
+          endPos: 4,
+          isMerged: false,
+          isKnown: false,
+          isNPlusOneTarget: false,
+        },
+      ],
+      getFrequencyDictionaryEnabled: () => true,
+      getFrequencyRank: (text) => (text === "既知" ? 20 : text === "未知" ? 30 : null),
+      isKnownWord: (text) => text === "既知",
+    }),
+  );
+
+  assert.equal(result.tokens?.length, 2);
+  assert.equal(result.tokens?.[0]?.isKnown, true);
+  assert.equal(result.tokens?.[0]?.frequencyRank, 20);
+  assert.equal(result.tokens?.[1]?.isKnown, false);
+  assert.equal(result.tokens?.[1]?.frequencyRank, 30);
+});
+
 test("tokenizeSubtitleService selects one N+1 target token", async () => {
  const result = await tokenizeSubtitleService(
    "猫です",
@@ -1,4 +1,4 @@
-import { BrowserWindow, Extension, session } from "electron";
+import type { BrowserWindow, Extension } from "electron";
 import { markNPlusOneTargets, mergeTokens } from "../../token-merger";
 import {
  JlptLevel,
@@ -252,20 +252,67 @@ function resolveFrequencyLookupText(token: MergedToken): string {
  return token.surface;
 }

+/**
+ * Collects the distinct strings to try when looking up a token's frequency rank.
+ *
+ * Candidates are, in order: every entry of `token.frequencyLookupTerms` (when
+ * that array is present), followed by the text chosen by
+ * `resolveFrequencyLookupText`. Each candidate is trimmed; candidates that are
+ * missing or empty after trimming are dropped, and duplicates are removed
+ * while keeping first-seen order.
+ *
+ * @param token Token whose lookup candidates are gathered.
+ * @returns Trimmed, de-duplicated candidate strings (possibly empty).
+ */
+function getFrequencyLookupTextCandidates(token: MergedToken): string[] {
+  // `frequencyLookupTerms` is an optional field of MergedToken, so it can be
+  // read directly without widening the type through an assertion.
+  const rawCandidates: Array<string | undefined> = [
+    ...(Array.isArray(token.frequencyLookupTerms)
+      ? token.frequencyLookupTerms
+      : []),
+    resolveFrequencyLookupText(token),
+  ];
+
+  const uniqueLookupTerms = new Set<string>();
+  for (const candidate of rawCandidates) {
+    const trimmed = candidate?.trim();
+    if (trimmed) {
+      uniqueLookupTerms.add(trimmed);
+    }
+  }
+
+  // A Set iterates in insertion order, which preserves candidate priority.
+  return [...uniqueLookupTerms];
+}
+
 function applyFrequencyMarking(
  tokens: MergedToken[],
  getFrequencyRank: FrequencyDictionaryLookup,
 ): MergedToken[] {
  return tokens.map((token) => {
-    const lookupText = resolveFrequencyLookupText(token);
-    if (!lookupText) {
+    const lookupTexts = getFrequencyLookupTextCandidates(token);
+    if (lookupTexts.length === 0) {
      return { ...token, frequencyRank: undefined };
    }

-    const rank = getCachedFrequencyRank(lookupText, getFrequencyRank);
+    let bestRank: number | null = null;
+    for (const lookupText of lookupTexts) {
+      const rank = getCachedFrequencyRank(lookupText, getFrequencyRank);
+      if (rank === null) {
+        continue;
+      }
+      if (bestRank === null || rank < bestRank) {
+        bestRank = rank;
+      }
+    }
+
    return {
      ...token,
-      frequencyRank: rank ?? undefined,
+      frequencyRank: bestRank ?? undefined,
    };
  });
 }
@@ -397,7 +444,7 @@ function isYomitanParseResultItem(
  if (!isObject(value)) {
    return false;
  }
-  if ((value as YomitanParseResultItem).source !== "scanning-parser") {
+  if (!isString((value as YomitanParseResultItem).source)) {
    return false;
  }
  if (!Array.isArray((value as YomitanParseResultItem).content)) {
@@ -452,6 +499,27 @@ function extractYomitanHeadword(segment: YomitanParseSegment): string {
  return "";
 }

+/**
+ * Flattens every headword term of a Yomitan parse segment into one list.
+ *
+ * Walks all headword groups in order and keeps each candidate whose `term` is
+ * a string that is non-empty after trimming. Duplicates are not removed here.
+ *
+ * @param segment Parse segment whose `headwords` are read.
+ * @returns Trimmed terms in encounter order, or an empty array when
+ *   `segment.headwords` does not pass `isYomitanHeadwordRows`.
+ */
+function extractYomitanHeadwords(segment: YomitanParseSegment): string[] {
+  const headwords = segment.headwords;
+  if (!isYomitanHeadwordRows(headwords)) {
+    return [];
+  }
+
+  const results: string[] = [];
+  for (const group of headwords) {
+    for (const candidate of group) {
+      if (isString(candidate.term)) {
+        const term = candidate.term.trim();
+        if (term.length > 0) {
+          results.push(term);
+        }
+      }
+    }
+  }
+
+  return results;
+}
+
 function applyJlptMarking(
  tokens: MergedToken[],
  getJlptLevel: (text: string) => JlptLevel | null,
@@ -475,29 +543,27 @@ function applyJlptMarking(
  });
 }

-function mapYomitanParseResultsToMergedTokens(
-  parseResults: unknown,
+/** Tokens built from a single Yomitan parse result item, tagged with where they came from. */
+interface YomitanParseCandidate {
+  /** The parse result's `source` value as a string (e.g. "scanning-parser" or "mecab"). */
+  source: string;
+  /** The parse result's `index` when it is an integer, otherwise 0. */
+  index: number;
+  /** Tokens produced from the parse result's content. */
+  tokens: MergedToken[];
+}
+
+function mapYomitanParseResultItemToMergedTokens(
+  parseResult: YomitanParseResultItem,
  isKnownWord: (text: string) => boolean,
  knownWordMatchMode: NPlusOneMatchMode,
-): MergedToken[] | null {
-  if (!Array.isArray(parseResults) || parseResults.length === 0) {
+): YomitanParseCandidate | null {
+  const content = parseResult.content;
+  if (!Array.isArray(content) || content.length === 0) {
    return null;
  }

-  const scanningItems = parseResults.filter(
-    (item): item is YomitanParseResultItem => isYomitanParseResultItem(item),
-  );
-
-  if (scanningItems.length === 0) {
-    return null;
-  }
-
-  const primaryItem =
-    scanningItems.find((item) => item.index === 0) || scanningItems[0];
-  const content = primaryItem.content;
-  if (!Array.isArray(content)) {
-    return null;
-  }
+  const source = String(parseResult.source ?? "");
+  const index =
+    typeof parseResult.index === "number" && Number.isInteger(parseResult.index)
+      ? parseResult.index
+      : 0;

  const tokens: MergedToken[] = [];
  let charOffset = 0;
@@ -509,60 +575,117 @@ function mapYomitanParseResultsToMergedTokens(
    }
    validLineCount += 1;

-    let surface = "";
-    let reading = "";
-    let headword = "";
-
    for (const segment of line) {
      const segmentText = segment.text;
      if (!segmentText || segmentText.length === 0) {
        continue;
      }

-      surface += segmentText;
+      const start = charOffset;
+      const end = start + segmentText.length;
+      charOffset = end;

-      if (typeof segment.reading === "string") {
-        reading += segment.reading;
-      }
+      const headword = extractYomitanHeadword(segment) || segmentText;
+      const frequencyLookupTerms = extractYomitanHeadwords(segment);

-      if (!headword) {
-        headword = extractYomitanHeadword(segment);
-      }
+      tokens.push({
+        surface: segmentText,
+        reading: typeof segment.reading === "string" ? segment.reading : "",
+        headword,
+        startPos: start,
+        endPos: end,
+        partOfSpeech: PartOfSpeech.other,
+        pos1: "",
+        isMerged: true,
+        isNPlusOneTarget: false,
+        isKnown: (() => {
+          const matchText = resolveKnownWordText(
+            segmentText,
+            headword,
+            knownWordMatchMode,
+          );
+          return matchText ? isKnownWord(matchText) : false;
+        })(),
+        frequencyLookupTerms:
+          frequencyLookupTerms.length > 0 ? frequencyLookupTerms : undefined,
+      });
    }
-
-    if (!surface) {
-      continue;
-    }
-
-    const start = charOffset;
-    const end = start + surface.length;
-    charOffset = end;
-
-    tokens.push({
-      surface,
-      reading,
-      headword: headword || surface,
-      startPos: start,
-      endPos: end,
-      partOfSpeech: PartOfSpeech.other,
-      pos1: "",
-      isMerged: true,
-      isNPlusOneTarget: false,
-      isKnown: (() => {
-        const matchText = resolveKnownWordText(
-          surface,
-          headword,
-          knownWordMatchMode,
-        );
-        return matchText ? isKnownWord(matchText) : false;
-      })(),
-    });
  }

-  if (validLineCount === 0) {
+  if (validLineCount === 0 || tokens.length === 0) {
    return null;
  }
-  return tokens.length > 0 ? tokens : null;
+
+  return { source, index, tokens };
+}
+
+/**
+ * Chooses which parse candidate's tokens to use.
+ *
+ * - When at least one "scanning-parser" candidate exists: the scanning
+ *   candidate with the most tokens wins if it has more than one token.
+ *   Otherwise the "mecab" candidate with the most tokens is used when it has
+ *   strictly more tokens than that scanning candidate; failing that, the
+ *   scanning candidate's tokens are returned.
+ * - When no "scanning-parser" candidate exists: the candidate with the most
+ *   tokens, from any source, is returned.
+ *
+ * Ties on token count keep the earliest candidate in input order.
+ *
+ * @param candidates Candidates built from the parse results.
+ * @returns The selected token list, or null when `candidates` is empty.
+ */
+function selectBestYomitanParseCandidate(
+  candidates: YomitanParseCandidate[],
+): MergedToken[] | null {
+  if (candidates.length === 0) {
+    return null;
+  }
+
+  const scanningCandidates = candidates.filter(
+    (candidate) => candidate.source === "scanning-parser",
+  );
+  const mecabCandidates = candidates.filter(
+    (candidate) => candidate.source === "mecab",
+  );
+
+  // Picks the candidate with the largest token count; the strict `>` means the
+  // first candidate wins a tie.
+  const getBestByTokenCount = (
+    items: YomitanParseCandidate[],
+  ): YomitanParseCandidate | null => items.length === 0
+    ? null
+    : items.reduce((best, current) =>
+      current.tokens.length > best.tokens.length ? current : best,
+    );
+
+  if (scanningCandidates.length > 0) {
+    const bestScanning = getBestByTokenCount(scanningCandidates);
+    // A scanning result that is already split into several tokens is used as-is.
+    if (bestScanning && bestScanning.tokens.length > 1) {
+      return bestScanning.tokens;
+    }
+
+    // Scanning produced at most one token: use mecab only if it splits further.
+    const bestMecab = getBestByTokenCount(mecabCandidates);
+    if (
+      bestMecab &&
+      bestMecab.tokens.length > (bestScanning?.tokens.length ?? 0)
+    ) {
+      return bestMecab.tokens;
+    }
+
+    return bestScanning ? bestScanning.tokens : null;
+  }
+
+  // No scanning-parser output at all: take whichever source produced the most tokens.
+  const bestCandidate = getBestByTokenCount(candidates);
+  return bestCandidate ? bestCandidate.tokens : null;
+}
+
+/**
+ * Converts raw Yomitan parse output into merged tokens.
+ *
+ * Entries that fail `isYomitanParseResultItem` are dropped, each remaining
+ * entry is mapped to a candidate (entries that map to null are dropped), and
+ * `selectBestYomitanParseCandidate` picks the candidate whose tokens are used.
+ *
+ * @param parseResults Untyped value returned by the parser.
+ * @param isKnownWord Predicate forwarded to the per-item mapper.
+ * @param knownWordMatchMode Match mode forwarded to the per-item mapper.
+ * @returns The selected tokens, or null when `parseResults` is not a non-empty
+ *   array or no candidate is selected.
+ */
+function mapYomitanParseResultsToMergedTokens(
+  parseResults: unknown,
+  isKnownWord: (text: string) => boolean,
+  knownWordMatchMode: NPlusOneMatchMode,
+): MergedToken[] | null {
+  if (!Array.isArray(parseResults) || parseResults.length === 0) {
+    return null;
+  }
+
+  const candidates = parseResults
+    .filter((item): item is YomitanParseResultItem =>
+      isYomitanParseResultItem(item),
+    )
+    .map((item) =>
+      mapYomitanParseResultItemToMergedTokens(
+        item,
+        isKnownWord,
+        knownWordMatchMode,
+      ),
+    )
+    .filter((candidate): candidate is YomitanParseCandidate => candidate !== null);
+
+  const bestCandidate = selectBestYomitanParseCandidate(candidates);
+  return bestCandidate;
 }

 function pickClosestMecabPos1(
@@ -664,6 +787,7 @@ async function enrichYomitanPos1(
 async function ensureYomitanParserWindow(
  deps: TokenizerServiceDeps,
 ): Promise<boolean> {
+  const electron = await import("electron");
  const yomitanExt = deps.getYomitanExt();
  if (!yomitanExt) {
    return false;
@@ -680,6 +804,7 @@ async function ensureYomitanParserWindow(
  }

  const initPromise = (async () => {
+    const { BrowserWindow, session } = electron;
    const parserWindow = new BrowserWindow({
      show: false,
      width: 800,
@@ -786,7 +911,7 @@ async function parseWithYomitanInternalParser(
        optionsContext: { index: profileIndex },
        scanLength,
        useInternalParser: true,
-        useMecabParser: false
+        useMecabParser: true
      });
    })();
  `;
@@ -86,14 +86,29 @@ export function parseMecabLine(line: string): Token | null {
  };
 }

+/** Optional settings accepted by the MecabTokenizer constructor. */
+export interface MecabTokenizerOptions {
+  /**
+   * Command used to run MeCab. Falls back to "mecab" when omitted or blank.
+   * A value containing "/" is used as the executable path as-is; any other
+   * value is resolved with `which`.
+   */
+  mecabCommand?: string;
+  /** When set and non-blank, passed to MeCab through its `-d` argument. */
+  dictionaryPath?: string;
+}
+
 export class MecabTokenizer {
  private mecabPath: string | null = null;
+  private mecabCommand: string;
+  private dictionaryPath: string | null;
  private available: boolean = false;
  private enabled: boolean = true;

+  /**
+   * Stores the tokenizer settings.
+   *
+   * `mecabCommand` is trimmed and falls back to "mecab" when missing or blank;
+   * `dictionaryPath` is trimmed and becomes null when missing or blank.
+   *
+   * @param options Optional overrides; defaults to an empty object.
+   */
+  constructor(options: MecabTokenizerOptions = {}) {
+    this.mecabCommand = options.mecabCommand?.trim() || "mecab";
+    this.dictionaryPath = options.dictionaryPath?.trim() || null;
+  }
+
  async checkAvailability(): Promise<boolean> {
    try {
-      const result = execSync("which mecab", { encoding: "utf-8" }).trim();
+      const command = this.mecabCommand;
+      const result = command.includes("/")
+        ? command
+        : execSync(`which ${command}`, { encoding: "utf-8" }).trim();
      if (result) {
        this.mecabPath = result;
        this.available = true;
@@ -114,7 +129,11 @@ export class MecabTokenizer {
    }

    return new Promise((resolve) => {
-      const mecab = spawn("mecab", [], {
+      const mecabArgs: string[] = [];
+      if (this.dictionaryPath) {
+        mecabArgs.push("-d", this.dictionaryPath);
+      }
+      const mecab = spawn(this.mecabPath ?? this.mecabCommand, mecabArgs, {
        stdio: ["pipe", "pipe", "pipe"],
      });

@@ -149,6 +168,21 @@ export class MecabTokenizer {
          }
        }

+        if (tokens.length === 0 && text.trim().length > 0) {
+          const trimmedStdout = stdout.trim();
+          const trimmedStderr = stderr.trim();
+          if (trimmedStdout) {
+            log.warn(
+              "MeCab returned no parseable tokens.",
+              `command=${this.mecabPath ?? this.mecabCommand}`,
+              `stdout=${trimmedStdout.slice(0, 1024)}`,
+            );
+          }
+          if (trimmedStderr) {
+            log.warn("MeCab stderr while tokenizing:", trimmedStderr);
+          }
+        }
+
        resolve(tokens);
      });

@@ -23,15 +23,35 @@ function createToken(overrides: Partial<MergedToken>): MergedToken {
 }

+/**
+ * Returns the declaration body of the CSS rule for `selector`.
+ *
+ * Scans `cssText` for `selectors { body }` rules and splits each selector list
+ * on commas. The body of the first rule whose list is exactly `selector` is
+ * returned immediately; otherwise the body of the first rule whose list merely
+ * includes `selector` is returned. Returns "" when nothing matches.
+ *
+ * The body pattern stops at the first "}", so rules nested inside another
+ * block are not parsed as a unit.
+ */
 function extractClassBlock(cssText: string, selector: string): string {
-  const start = cssText.indexOf(selector);
-  if (start < 0) return "";
+  const ruleRegex = /([^{}]+)\{([^}]*)\}/g;
+  let match: RegExpExecArray | null = null;
+  let fallbackBlock = "";

-  const openBrace = cssText.indexOf("{", start);
-  if (openBrace < 0) return "";
-  const closeBrace = cssText.indexOf("}", openBrace);
-  if (closeBrace < 0) return "";
+  while ((match = ruleRegex.exec(cssText)) !== null) {
+    const selectorsBlock = match[1]?.trim() ?? "";
+    const selectorBlock = match[2] ?? "";

-  return cssText.slice(openBrace + 1, closeBrace);
+    const selectors = selectorsBlock
+      .split(",")
+      .map((entry) => entry.trim())
+      .filter((entry) => entry.length > 0);
+
+    if (selectors.includes(selector)) {
+      // A rule that targets only this selector is the preferred match.
+      if (selectors.length === 1) {
+        return selectorBlock;
+      }
+
+      // Remember the first grouped rule in case no dedicated rule exists.
+      if (!fallbackBlock) {
+        fallbackBlock = selectorBlock;
+      }
+    }
+  }
+
+  if (fallbackBlock) {
+    return fallbackBlock;
+  }
+
+  return "";
 }

 test("computeWordClass preserves known and n+1 classes while adding JLPT classes", () => {
@@ -173,10 +193,16 @@ test("computeWordClass uses configured band count for banded mode", () => {
    topX: 4,
    mode: "banded",
    singleColor: "#000000",
-    bandedColors: ["#111111", "#222222", "#333333"] as any,
+    bandedColors: [
+      "#111111",
+      "#222222",
+      "#333333",
+      "#444444",
+      "#555555",
+    ],
  } as any);

-  assert.equal(actual, "word word-frequency-band-1");
+  assert.equal(actual, "word word-frequency-band-3");
 });

 test("computeWordClass skips frequency class when rank is out of topX", () => {
@@ -56,6 +56,7 @@ export interface MergedToken {
  isNPlusOneTarget: boolean;
  jlptLevel?: JlptLevel;
  frequencyRank?: number;
+  frequencyLookupTerms?: string[];
 }

+/** Looks up the frequency rank for a term; returns null when it has no rank to report. */
 export type FrequencyDictionaryLookup = (term: string) => number | null;