Fix Yomitan token headword frequency matching and add frequency tests

This commit is contained in:
kyasuda
2026-02-16 13:21:19 -08:00
parent e142d2dc3b
commit 0eb2868805
7 changed files with 1586 additions and 80 deletions

View File

@@ -228,6 +228,223 @@ test("tokenizeSubtitleService applies frequency dictionary ranks", async () => {
assert.equal(result.tokens?.[1]?.frequencyRank, 1200);
});
test("tokenizeSubtitleService uses all Yomitan headword candidates for frequency lookup", async () => {
const result = await tokenizeSubtitleService(
"猫です",
makeDeps({
getFrequencyDictionaryEnabled: () => true,
getYomitanExt: () => ({ id: "dummy-ext" } as any),
getYomitanParserWindow: () => ({
isDestroyed: () => false,
webContents: {
executeJavaScript: async () => [
{
source: "scanning-parser",
index: 0,
content: [
[
{
text: "猫です",
reading: "ねこです",
headwords: [
[{ term: "猫です" }],
[{ term: "猫" }],
],
},
],
],
},
],
},
} as unknown as Electron.BrowserWindow),
getFrequencyRank: (text) => (text === "猫" ? 40 : text === "猫です" ? 1200 : null),
}),
);
assert.equal(result.tokens?.length, 1);
assert.equal(result.tokens?.[0]?.frequencyRank, 40);
});
test("tokenizeSubtitleService prefers exact headword frequency over surface/reading when available", async () => {
const result = await tokenizeSubtitleService(
"猫です",
makeDeps({
getFrequencyDictionaryEnabled: () => true,
getYomitanExt: () => ({ id: "dummy-ext" } as any),
getYomitanParserWindow: () => ({
isDestroyed: () => false,
webContents: {
executeJavaScript: async () => [
{
source: "scanning-parser",
index: 0,
content: [
[
{
text: "猫",
reading: "ねこ",
headwords: [[{ term: "ネコ" }]],
},
],
],
},
],
},
} as unknown as Electron.BrowserWindow),
getFrequencyRank: (text) => (text === "猫" ? 1200 : text === "ネコ" ? 8 : null),
}),
);
assert.equal(result.tokens?.length, 1);
assert.equal(result.tokens?.[0]?.frequencyRank, 8);
});
test("tokenizeSubtitleService keeps no frequency when only reading matches and headword candidates miss", async () => {
const result = await tokenizeSubtitleService(
"猫です",
makeDeps({
getFrequencyDictionaryEnabled: () => true,
getYomitanExt: () => ({ id: "dummy-ext" } as any),
getYomitanParserWindow: () => ({
isDestroyed: () => false,
webContents: {
executeJavaScript: async () => [
{
source: "scanning-parser",
index: 0,
content: [
[
{
text: "猫",
reading: "ねこ",
headwords: [[{ term: "猫です" }]],
},
],
],
},
],
},
} as unknown as Electron.BrowserWindow),
getFrequencyRank: (text) => (text === "ねこ" ? 77 : null),
}),
);
assert.equal(result.tokens?.length, 1);
assert.equal(result.tokens?.[0]?.frequencyRank, undefined);
});
test("tokenizeSubtitleService ignores invalid frequency ranks and takes best valid headword candidate", async () => {
const result = await tokenizeSubtitleService(
"猫です",
makeDeps({
getFrequencyDictionaryEnabled: () => true,
getYomitanExt: () => ({ id: "dummy-ext" } as any),
getYomitanParserWindow: () => ({
isDestroyed: () => false,
webContents: {
executeJavaScript: async () => [
{
source: "scanning-parser",
index: 0,
content: [
[
{
text: "猫です",
reading: "ねこです",
headwords: [
[{ term: "猫" }],
[{ term: "猫です" }],
],
},
],
],
},
],
},
} as unknown as Electron.BrowserWindow),
getFrequencyRank: (text) => (text === "猫" ? Number.NaN : text === "猫です" ? 500 : null),
}),
);
assert.equal(result.tokens?.length, 1);
assert.equal(result.tokens?.[0]?.frequencyRank, 500);
});
test("tokenizeSubtitleService handles real-word frequency candidates and prefers most frequent term", async () => {
const result = await tokenizeSubtitleService(
"昨日",
makeDeps({
getFrequencyDictionaryEnabled: () => true,
getYomitanExt: () => ({ id: "dummy-ext" } as any),
getYomitanParserWindow: () => ({
isDestroyed: () => false,
webContents: {
executeJavaScript: async () => [
{
source: "scanning-parser",
index: 0,
content: [
[
{
text: "昨日",
reading: "きのう",
headwords: [
[{ term: "昨日" }],
[{ term: "きのう" }],
],
},
],
],
},
],
},
} as unknown as Electron.BrowserWindow),
getFrequencyRank: (text) => (text === "きのう" ? 120 : text === "昨日" ? 40 : null),
}),
);
assert.equal(result.tokens?.length, 1);
assert.equal(result.tokens?.[0]?.frequencyRank, 40);
});
test("tokenizeSubtitleService ignores candidates with no dictionary rank when higher-frequency candidate exists", async () => {
const result = await tokenizeSubtitleService(
"猫です",
makeDeps({
getFrequencyDictionaryEnabled: () => true,
getYomitanExt: () => ({ id: "dummy-ext" } as any),
getYomitanParserWindow: () => ({
isDestroyed: () => false,
webContents: {
executeJavaScript: async () => [
{
source: "scanning-parser",
index: 0,
content: [
[
{
text: "猫",
reading: "ねこ",
headwords: [
[{ term: "猫" }],
[{ term: "猫です" }],
[{ term: "unknown-term" }],
],
},
],
],
},
],
},
} as unknown as Electron.BrowserWindow),
getFrequencyRank: (text) => (text === "unknown-term" ? -1 : text === "猫" ? 88 : text === "猫です" ? 9000 : null),
}),
);
assert.equal(result.tokens?.length, 1);
assert.equal(result.tokens?.[0]?.frequencyRank, 88);
});
test("tokenizeSubtitleService ignores frequency lookup failures", async () => {
const result = await tokenizeSubtitleService(
"猫",
@@ -557,10 +774,147 @@ test("tokenizeSubtitleService uses Yomitan parser result when available", async
);
assert.equal(result.text, "猫です");
assert.equal(result.tokens?.length, 1);
assert.equal(result.tokens?.[0]?.surface, "猫です");
assert.equal(result.tokens?.[0]?.reading, "ねこです");
assert.equal(result.tokens?.length, 2);
assert.equal(result.tokens?.[0]?.surface, "猫");
assert.equal(result.tokens?.[0]?.reading, "ねこ");
assert.equal(result.tokens?.[0]?.isKnown, false);
assert.equal(result.tokens?.[1]?.surface, "です");
assert.equal(result.tokens?.[1]?.reading, "です");
assert.equal(result.tokens?.[1]?.isKnown, false);
});
test("tokenizeSubtitleService prefers mecab parser tokens when scanning parser returns one token", async () => {
const result = await tokenizeSubtitleService(
"俺は小園にいきたい",
makeDeps({
getYomitanExt: () => ({ id: "dummy-ext" } as any),
getYomitanParserWindow: () => ({
isDestroyed: () => false,
webContents: {
executeJavaScript: async () => [
{
source: "scanning-parser",
index: 0,
content: [
[
{
text: "俺は小園にいきたい",
reading: "おれは小園にいきたい",
headwords: [[{ term: "俺は小園にいきたい" }]],
},
],
],
},
{
source: "mecab",
index: 0,
content: [
[{ text: "俺", reading: "おれ", headwords: [[{ term: "俺" }]] }],
[{ text: "は", reading: "は", headwords: [[{ term: "は" }]] }],
[{ text: "小園", reading: "おうえん", headwords: [[{ term: "小園" }]] }],
[{ text: "に", reading: "に", headwords: [[{ term: "に" }]] }],
[{ text: "いきたい", reading: "いきたい", headwords: [[{ term: "いきたい" }]] }],
],
},
],
},
} as unknown as Electron.BrowserWindow),
getFrequencyDictionaryEnabled: () => true,
tokenizeWithMecab: async () => null,
getFrequencyRank: (text) =>
text === "小園" ? 25 : text === "いきたい" ? 1500 : null,
}),
);
assert.equal(result.tokens?.length, 5);
assert.equal(result.tokens?.map((token) => token.surface).join(","), "俺,は,小園,に,いきたい");
assert.equal(result.tokens?.[2]?.surface, "小園");
assert.equal(result.tokens?.[2]?.frequencyRank, 25);
});
test("tokenizeSubtitleService keeps scanning parser tokens when they are already split", async () => {
const result = await tokenizeSubtitleService(
"小園に行きたい",
makeDeps({
getYomitanExt: () => ({ id: "dummy-ext" } as any),
getYomitanParserWindow: () => ({
isDestroyed: () => false,
webContents: {
executeJavaScript: async () => [
{
source: "scanning-parser",
index: 0,
content: [
[{ text: "小園", reading: "おうえん", headwords: [[{ term: "小園" }]] }],
[{ text: "に", reading: "に", headwords: [[{ term: "に" }]] }],
[{ text: "行きたい", reading: "いきたい", headwords: [[{ term: "行きたい" }]] }],
],
},
{
source: "mecab",
index: 0,
content: [
[{ text: "小", reading: "お", headwords: [[{ term: "小" }]] }],
[{ text: "園", reading: "えん", headwords: [[{ term: "園" }]] }],
[{ text: "に", reading: "に", headwords: [[{ term: "に" }]] }],
[{ text: "行き", reading: "いき", headwords: [[{ term: "行き" }]] }],
[{ text: "たい", reading: "たい", headwords: [[{ term: "たい" }]] }],
],
},
],
},
} as unknown as Electron.BrowserWindow),
getFrequencyDictionaryEnabled: () => true,
getFrequencyRank: (text) => (text === "小園" ? 20 : null),
tokenizeWithMecab: async () => null,
}),
);
assert.equal(result.tokens?.length, 3);
assert.equal(
result.tokens?.map((token) => token.surface).join(","),
"小園,に,行きたい",
);
assert.equal(result.tokens?.[0]?.frequencyRank, 20);
assert.equal(result.tokens?.[1]?.frequencyRank, undefined);
assert.equal(result.tokens?.[2]?.frequencyRank, undefined);
});
test("tokenizeSubtitleService still assigns frequency to non-known Yomitan tokens", async () => {
const result = await tokenizeSubtitleService(
"小園に",
makeDeps({
getYomitanExt: () => ({ id: "dummy-ext" } as any),
getYomitanParserWindow: () => ({
isDestroyed: () => false,
webContents: {
executeJavaScript: async () => [
{
source: "scanning-parser",
index: 0,
content: [
[
{ text: "小園", reading: "おうえん", headwords: [[{ term: "小園" }]] },
],
[
{ text: "に", reading: "に", headwords: [[{ term: "に" }]] },
],
],
},
],
},
} as unknown as Electron.BrowserWindow),
getFrequencyDictionaryEnabled: () => true,
getFrequencyRank: (text) => (text === "小園" ? 75 : text === "に" ? 3000 : null),
isKnownWord: (text) => text === "小園",
}),
);
assert.equal(result.tokens?.length, 2);
assert.equal(result.tokens?.[0]?.isKnown, true);
assert.equal(result.tokens?.[0]?.frequencyRank, 75);
assert.equal(result.tokens?.[1]?.isKnown, false);
assert.equal(result.tokens?.[1]?.frequencyRank, undefined);
});
test("tokenizeSubtitleService marks tokens as known using callback", async () => {
@@ -589,6 +943,63 @@ test("tokenizeSubtitleService marks tokens as known using callback", async () =>
assert.equal(result.tokens?.[0]?.isKnown, true);
});
test("tokenizeSubtitleService still assigns frequency rank to non-known tokens", async () => {
const result = await tokenizeSubtitleService(
"既知未知",
makeDeps({
tokenizeWithMecab: async () => [
{
surface: "既知",
reading: "キチ",
partOfSpeech: PartOfSpeech.noun,
pos1: "",
pos2: "",
pos3: "",
pos4: "",
inflectionType: "",
inflectionForm: "",
headword: "既知",
katakanaReading: "キチ",
pronunciation: "キチ",
startPos: 0,
endPos: 2,
isMerged: false,
isKnown: false,
isNPlusOneTarget: false,
},
{
surface: "未知",
reading: "ミチ",
partOfSpeech: PartOfSpeech.noun,
pos1: "",
pos2: "",
pos3: "",
pos4: "",
inflectionType: "",
inflectionForm: "",
headword: "未知",
katakanaReading: "ミチ",
pronunciation: "ミチ",
startPos: 2,
endPos: 4,
isMerged: false,
isKnown: false,
isNPlusOneTarget: false,
},
],
getFrequencyDictionaryEnabled: () => true,
getFrequencyRank: (text) => (text === "既知" ? 20 : text === "未知" ? 30 : null),
isKnownWord: (text) => text === "既知",
}),
);
assert.equal(result.tokens?.length, 2);
assert.equal(result.tokens?.[0]?.isKnown, true);
assert.equal(result.tokens?.[0]?.frequencyRank, 20);
assert.equal(result.tokens?.[1]?.isKnown, false);
assert.equal(result.tokens?.[1]?.frequencyRank, 30);
});
test("tokenizeSubtitleService selects one N+1 target token", async () => {
const result = await tokenizeSubtitleService(
"猫です",

View File

@@ -1,4 +1,4 @@
import { BrowserWindow, Extension, session } from "electron";
import type { BrowserWindow, Extension } from "electron";
import { markNPlusOneTargets, mergeTokens } from "../../token-merger";
import {
JlptLevel,
@@ -252,20 +252,67 @@ function resolveFrequencyLookupText(token: MergedToken): string {
return token.surface;
}
/**
 * Collects the distinct frequency-lookup strings for a token: any
 * parser-provided headword candidates first, followed by the default lookup
 * text. Entries are trimmed; blanks and duplicates are dropped (the first
 * occurrence of a term wins, preserving candidate priority order).
 */
function getFrequencyLookupTextCandidates(token: MergedToken): string[] {
  const { frequencyLookupTerms } = token as MergedToken & {
    frequencyLookupTerms?: string[];
  };
  // Headword candidates (when present) take priority over the default text.
  const rawTerms: (string | undefined)[] = Array.isArray(frequencyLookupTerms)
    ? [...frequencyLookupTerms, resolveFrequencyLookupText(token)]
    : [resolveFrequencyLookupText(token)];
  const seen = new Set<string>();
  const candidates: string[] = [];
  for (const raw of rawTerms) {
    const trimmed = raw?.trim();
    if (!trimmed || seen.has(trimmed)) {
      continue;
    }
    seen.add(trimmed);
    candidates.push(trimmed);
  }
  return candidates;
}
function applyFrequencyMarking(
tokens: MergedToken[],
getFrequencyRank: FrequencyDictionaryLookup,
): MergedToken[] {
return tokens.map((token) => {
const lookupText = resolveFrequencyLookupText(token);
if (!lookupText) {
const lookupTexts = getFrequencyLookupTextCandidates(token);
if (lookupTexts.length === 0) {
return { ...token, frequencyRank: undefined };
}
const rank = getCachedFrequencyRank(lookupText, getFrequencyRank);
let bestRank: number | null = null;
for (const lookupText of lookupTexts) {
const rank = getCachedFrequencyRank(lookupText, getFrequencyRank);
if (rank === null) {
continue;
}
if (bestRank === null || rank < bestRank) {
bestRank = rank;
}
}
return {
...token,
frequencyRank: rank ?? undefined,
frequencyRank: bestRank ?? undefined,
};
});
}
@@ -397,7 +444,7 @@ function isYomitanParseResultItem(
if (!isObject(value)) {
return false;
}
if ((value as YomitanParseResultItem).source !== "scanning-parser") {
if (!isString((value as YomitanParseResultItem).source)) {
return false;
}
if (!Array.isArray((value as YomitanParseResultItem).content)) {
@@ -452,6 +499,27 @@ function extractYomitanHeadword(segment: YomitanParseSegment): string {
return "";
}
/**
 * Flattens every headword term found in a parser segment into a list of
 * non-empty, trimmed strings, preserving row order. Returns an empty array
 * when the segment carries no well-formed headword rows.
 */
function extractYomitanHeadwords(segment: YomitanParseSegment): string[] {
  if (!isYomitanHeadwordRows(segment.headwords)) {
    return [];
  }
  return segment.headwords
    .flat()
    .map((candidate) => (isString(candidate.term) ? candidate.term.trim() : ""))
    .filter((term) => term.length > 0);
}
function applyJlptMarking(
tokens: MergedToken[],
getJlptLevel: (text: string) => JlptLevel | null,
@@ -475,29 +543,27 @@ function applyJlptMarking(
});
}
function mapYomitanParseResultsToMergedTokens(
parseResults: unknown,
// One parsed-token list produced from a single Yomitan parser result item,
// tagged with the parser that produced it (e.g. "scanning-parser" or
// "mecab") and the result's index, so the best candidate can be selected
// afterwards.
interface YomitanParseCandidate {
  source: string;
  index: number;
  tokens: MergedToken[];
}
function mapYomitanParseResultItemToMergedTokens(
parseResult: YomitanParseResultItem,
isKnownWord: (text: string) => boolean,
knownWordMatchMode: NPlusOneMatchMode,
): MergedToken[] | null {
if (!Array.isArray(parseResults) || parseResults.length === 0) {
): YomitanParseCandidate | null {
const content = parseResult.content;
if (!Array.isArray(content) || content.length === 0) {
return null;
}
const scanningItems = parseResults.filter(
(item): item is YomitanParseResultItem => isYomitanParseResultItem(item),
);
if (scanningItems.length === 0) {
return null;
}
const primaryItem =
scanningItems.find((item) => item.index === 0) || scanningItems[0];
const content = primaryItem.content;
if (!Array.isArray(content)) {
return null;
}
const source = String(parseResult.source ?? "");
const index =
typeof parseResult.index === "number" && Number.isInteger(parseResult.index)
? parseResult.index
: 0;
const tokens: MergedToken[] = [];
let charOffset = 0;
@@ -509,60 +575,117 @@ function mapYomitanParseResultsToMergedTokens(
}
validLineCount += 1;
let surface = "";
let reading = "";
let headword = "";
for (const segment of line) {
const segmentText = segment.text;
if (!segmentText || segmentText.length === 0) {
continue;
}
surface += segmentText;
const start = charOffset;
const end = start + segmentText.length;
charOffset = end;
if (typeof segment.reading === "string") {
reading += segment.reading;
}
const headword = extractYomitanHeadword(segment) || segmentText;
const frequencyLookupTerms = extractYomitanHeadwords(segment);
if (!headword) {
headword = extractYomitanHeadword(segment);
}
tokens.push({
surface: segmentText,
reading: typeof segment.reading === "string" ? segment.reading : "",
headword,
startPos: start,
endPos: end,
partOfSpeech: PartOfSpeech.other,
pos1: "",
isMerged: true,
isNPlusOneTarget: false,
isKnown: (() => {
const matchText = resolveKnownWordText(
segmentText,
headword,
knownWordMatchMode,
);
return matchText ? isKnownWord(matchText) : false;
})(),
frequencyLookupTerms:
frequencyLookupTerms.length > 0 ? frequencyLookupTerms : undefined,
});
}
if (!surface) {
continue;
}
const start = charOffset;
const end = start + surface.length;
charOffset = end;
tokens.push({
surface,
reading,
headword: headword || surface,
startPos: start,
endPos: end,
partOfSpeech: PartOfSpeech.other,
pos1: "",
isMerged: true,
isNPlusOneTarget: false,
isKnown: (() => {
const matchText = resolveKnownWordText(
surface,
headword,
knownWordMatchMode,
);
return matchText ? isKnownWord(matchText) : false;
})(),
});
}
if (validLineCount === 0) {
if (validLineCount === 0 || tokens.length === 0) {
return null;
}
return tokens.length > 0 ? tokens : null;
return { source, index, tokens };
}
/**
 * Picks which candidate's token list to use.
 *
 * Scanning-parser output is preferred whenever it is genuinely split into
 * more than one token. If the best scanning candidate is a single unsplit
 * token, a mecab candidate may override it — but only when the mecab split
 * actually produced more tokens. With no scanning candidates at all, the
 * candidate with the most tokens wins outright. Ties on token count keep
 * the earlier candidate. Returns null when there are no candidates.
 */
function selectBestYomitanParseCandidate(
  candidates: YomitanParseCandidate[],
): MergedToken[] | null {
  // Longest-token-list selection; strict '>' keeps the first on ties.
  const longest = (
    items: YomitanParseCandidate[],
  ): YomitanParseCandidate | null => {
    let best: YomitanParseCandidate | null = null;
    for (const item of items) {
      if (best === null || item.tokens.length > best.tokens.length) {
        best = item;
      }
    }
    return best;
  };
  const bestScanning = longest(
    candidates.filter((candidate) => candidate.source === "scanning-parser"),
  );
  if (bestScanning === null) {
    // No scanning-parser output: fall back to the longest candidate overall.
    const fallback = longest(candidates);
    return fallback === null ? null : fallback.tokens;
  }
  if (bestScanning.tokens.length > 1) {
    return bestScanning.tokens;
  }
  // Scanning parser produced one unsplit token; prefer mecab only when it
  // actually splits the text further.
  const bestMecab = longest(
    candidates.filter((candidate) => candidate.source === "mecab"),
  );
  if (bestMecab !== null && bestMecab.tokens.length > bestScanning.tokens.length) {
    return bestMecab.tokens;
  }
  return bestScanning.tokens;
}
/**
 * Converts raw Yomitan parser output (untrusted shape) into merged tokens.
 * Each well-formed result item is mapped to a candidate token list; the
 * best candidate's tokens are returned, or null when nothing usable was
 * produced.
 */
function mapYomitanParseResultsToMergedTokens(
  parseResults: unknown,
  isKnownWord: (text: string) => boolean,
  knownWordMatchMode: NPlusOneMatchMode,
): MergedToken[] | null {
  if (!Array.isArray(parseResults) || parseResults.length === 0) {
    return null;
  }
  const candidates: YomitanParseCandidate[] = [];
  for (const item of parseResults) {
    // Skip items that do not look like parser results at all.
    if (!isYomitanParseResultItem(item)) {
      continue;
    }
    const candidate = mapYomitanParseResultItemToMergedTokens(
      item,
      isKnownWord,
      knownWordMatchMode,
    );
    if (candidate !== null) {
      candidates.push(candidate);
    }
  }
  return selectBestYomitanParseCandidate(candidates);
}
function pickClosestMecabPos1(
@@ -664,6 +787,7 @@ async function enrichYomitanPos1(
async function ensureYomitanParserWindow(
deps: TokenizerServiceDeps,
): Promise<boolean> {
const electron = await import("electron");
const yomitanExt = deps.getYomitanExt();
if (!yomitanExt) {
return false;
@@ -680,6 +804,7 @@ async function ensureYomitanParserWindow(
}
const initPromise = (async () => {
const { BrowserWindow, session } = electron;
const parserWindow = new BrowserWindow({
show: false,
width: 800,
@@ -786,7 +911,7 @@ async function parseWithYomitanInternalParser(
optionsContext: { index: profileIndex },
scanLength,
useInternalParser: true,
useMecabParser: false
useMecabParser: true
});
})();
`;