mirror of
https://github.com/ksyasuda/SubMiner.git
synced 2026-02-28 06:22:45 -08:00
Fix Yomitan token headword frequency matching and add frequency tests
This commit is contained in:
@@ -228,6 +228,223 @@ test("tokenizeSubtitleService applies frequency dictionary ranks", async () => {
|
||||
assert.equal(result.tokens?.[1]?.frequencyRank, 1200);
|
||||
});
|
||||
|
||||
test("tokenizeSubtitleService uses all Yomitan headword candidates for frequency lookup", async () => {
  // One scanning-parser segment whose headwords list two separate candidate groups.
  const buildParseResults = () => [
    {
      source: "scanning-parser",
      index: 0,
      content: [
        [
          {
            text: "猫です",
            reading: "ねこです",
            headwords: [[{ term: "猫です" }], [{ term: "猫" }]],
          },
        ],
      ],
    },
  ];
  const buildParserWindow = () =>
    ({
      isDestroyed: () => false,
      webContents: { executeJavaScript: async () => buildParseResults() },
    } as unknown as Electron.BrowserWindow);

  const deps = makeDeps({
    getFrequencyDictionaryEnabled: () => true,
    getYomitanExt: () => ({ id: "dummy-ext" } as any),
    getYomitanParserWindow: buildParserWindow,
    getFrequencyRank: (text) => {
      if (text === "猫") return 40;
      if (text === "猫です") return 1200;
      return null;
    },
  });
  const result = await tokenizeSubtitleService("猫です", deps);

  // Of the two ranked headwords (1200 and 40), the smaller rank is expected on the token.
  const tokens = result.tokens;
  assert.equal(tokens?.length, 1);
  assert.equal(tokens?.[0]?.frequencyRank, 40);
});
|
||||
|
||||
test("tokenizeSubtitleService prefers exact headword frequency over surface/reading when available", async () => {
  // The segment's surface is 猫 while its only headword is the katakana form ネコ.
  const buildParseResults = () => [
    {
      source: "scanning-parser",
      index: 0,
      content: [
        [
          {
            text: "猫",
            reading: "ねこ",
            headwords: [[{ term: "ネコ" }]],
          },
        ],
      ],
    },
  ];
  const buildParserWindow = () =>
    ({
      isDestroyed: () => false,
      webContents: { executeJavaScript: async () => buildParseResults() },
    } as unknown as Electron.BrowserWindow);

  const deps = makeDeps({
    getFrequencyDictionaryEnabled: () => true,
    getYomitanExt: () => ({ id: "dummy-ext" } as any),
    getYomitanParserWindow: buildParserWindow,
    getFrequencyRank: (text) => {
      if (text === "猫") return 1200;
      if (text === "ネコ") return 8;
      return null;
    },
  });
  const result = await tokenizeSubtitleService("猫です", deps);

  // The headword's rank (8) is expected rather than the surface's rank (1200).
  const tokens = result.tokens;
  assert.equal(tokens?.length, 1);
  assert.equal(tokens?.[0]?.frequencyRank, 8);
});
|
||||
|
||||
test("tokenizeSubtitleService keeps no frequency when only reading matches and headword candidates miss", async () => {
  // Only the reading ねこ has a rank in the stub dictionary; surface and headword do not.
  const buildParseResults = () => [
    {
      source: "scanning-parser",
      index: 0,
      content: [
        [
          {
            text: "猫",
            reading: "ねこ",
            headwords: [[{ term: "猫です" }]],
          },
        ],
      ],
    },
  ];
  const buildParserWindow = () =>
    ({
      isDestroyed: () => false,
      webContents: { executeJavaScript: async () => buildParseResults() },
    } as unknown as Electron.BrowserWindow);

  const deps = makeDeps({
    getFrequencyDictionaryEnabled: () => true,
    getYomitanExt: () => ({ id: "dummy-ext" } as any),
    getYomitanParserWindow: buildParserWindow,
    getFrequencyRank: (text) => {
      if (text === "ねこ") return 77;
      return null;
    },
  });
  const result = await tokenizeSubtitleService("猫です", deps);

  // A rank that is only reachable through the reading must not be applied.
  const tokens = result.tokens;
  assert.equal(tokens?.length, 1);
  assert.equal(tokens?.[0]?.frequencyRank, undefined);
});
|
||||
|
||||
test("tokenizeSubtitleService ignores invalid frequency ranks and takes best valid headword candidate", async () => {
  // Two headword groups: the first resolves to NaN in the stub dictionary, the second to 500.
  const buildParseResults = () => [
    {
      source: "scanning-parser",
      index: 0,
      content: [
        [
          {
            text: "猫です",
            reading: "ねこです",
            headwords: [[{ term: "猫" }], [{ term: "猫です" }]],
          },
        ],
      ],
    },
  ];
  const buildParserWindow = () =>
    ({
      isDestroyed: () => false,
      webContents: { executeJavaScript: async () => buildParseResults() },
    } as unknown as Electron.BrowserWindow);

  const deps = makeDeps({
    getFrequencyDictionaryEnabled: () => true,
    getYomitanExt: () => ({ id: "dummy-ext" } as any),
    getYomitanParserWindow: buildParserWindow,
    getFrequencyRank: (text) => {
      if (text === "猫") return Number.NaN;
      if (text === "猫です") return 500;
      return null;
    },
  });
  const result = await tokenizeSubtitleService("猫です", deps);

  // The NaN rank must be discarded, leaving 500 as the only usable rank.
  const tokens = result.tokens;
  assert.equal(tokens?.length, 1);
  assert.equal(tokens?.[0]?.frequencyRank, 500);
});
|
||||
|
||||
test("tokenizeSubtitleService handles real-word frequency candidates and prefers most frequent term", async () => {
  // Kanji and kana spellings of the same word appear as separate headword groups.
  const buildParseResults = () => [
    {
      source: "scanning-parser",
      index: 0,
      content: [
        [
          {
            text: "昨日",
            reading: "きのう",
            headwords: [[{ term: "昨日" }], [{ term: "きのう" }]],
          },
        ],
      ],
    },
  ];
  const buildParserWindow = () =>
    ({
      isDestroyed: () => false,
      webContents: { executeJavaScript: async () => buildParseResults() },
    } as unknown as Electron.BrowserWindow);

  const deps = makeDeps({
    getFrequencyDictionaryEnabled: () => true,
    getYomitanExt: () => ({ id: "dummy-ext" } as any),
    getYomitanParserWindow: buildParserWindow,
    getFrequencyRank: (text) => {
      if (text === "きのう") return 120;
      if (text === "昨日") return 40;
      return null;
    },
  });
  const result = await tokenizeSubtitleService("昨日", deps);

  // Of the ranks 40 and 120, the smaller one is expected.
  const tokens = result.tokens;
  assert.equal(tokens?.length, 1);
  assert.equal(tokens?.[0]?.frequencyRank, 40);
});
|
||||
|
||||
test("tokenizeSubtitleService ignores candidates with no dictionary rank when higher-frequency candidate exists", async () => {
  // Three headword groups; the stub dictionary answers -1 for the last one.
  const buildParseResults = () => [
    {
      source: "scanning-parser",
      index: 0,
      content: [
        [
          {
            text: "猫",
            reading: "ねこ",
            headwords: [
              [{ term: "猫" }],
              [{ term: "猫です" }],
              [{ term: "unknown-term" }],
            ],
          },
        ],
      ],
    },
  ];
  const buildParserWindow = () =>
    ({
      isDestroyed: () => false,
      webContents: { executeJavaScript: async () => buildParseResults() },
    } as unknown as Electron.BrowserWindow);

  const deps = makeDeps({
    getFrequencyDictionaryEnabled: () => true,
    getYomitanExt: () => ({ id: "dummy-ext" } as any),
    getYomitanParserWindow: buildParserWindow,
    getFrequencyRank: (text) => {
      if (text === "unknown-term") return -1;
      if (text === "猫") return 88;
      if (text === "猫です") return 9000;
      return null;
    },
  });
  const result = await tokenizeSubtitleService("猫です", deps);

  // The -1 answer must not win; 88 is the smallest of the remaining ranks.
  const tokens = result.tokens;
  assert.equal(tokens?.length, 1);
  assert.equal(tokens?.[0]?.frequencyRank, 88);
});
|
||||
|
||||
test("tokenizeSubtitleService ignores frequency lookup failures", async () => {
|
||||
const result = await tokenizeSubtitleService(
|
||||
"猫",
|
||||
@@ -557,10 +774,147 @@ test("tokenizeSubtitleService uses Yomitan parser result when available", async
|
||||
);
|
||||
|
||||
assert.equal(result.text, "猫です");
|
||||
assert.equal(result.tokens?.length, 1);
|
||||
assert.equal(result.tokens?.[0]?.surface, "猫です");
|
||||
assert.equal(result.tokens?.[0]?.reading, "ねこです");
|
||||
assert.equal(result.tokens?.length, 2);
|
||||
assert.equal(result.tokens?.[0]?.surface, "猫");
|
||||
assert.equal(result.tokens?.[0]?.reading, "ねこ");
|
||||
assert.equal(result.tokens?.[0]?.isKnown, false);
|
||||
assert.equal(result.tokens?.[1]?.surface, "です");
|
||||
assert.equal(result.tokens?.[1]?.reading, "です");
|
||||
assert.equal(result.tokens?.[1]?.isKnown, false);
|
||||
});
|
||||
|
||||
test("tokenizeSubtitleService prefers mecab parser tokens when scanning parser returns one token", async () => {
  // Builds one parser line holding a single segment whose headword equals its text.
  const line = (text: string, reading: string) => [
    { text, reading, headwords: [[{ term: text }]] },
  ];
  // The scanning parser yields the whole sentence as one token; the mecab source splits it into five.
  const buildParseResults = () => [
    {
      source: "scanning-parser",
      index: 0,
      content: [line("俺は小園にいきたい", "おれは小園にいきたい")],
    },
    {
      source: "mecab",
      index: 0,
      content: [
        line("俺", "おれ"),
        line("は", "は"),
        line("小園", "おうえん"),
        line("に", "に"),
        line("いきたい", "いきたい"),
      ],
    },
  ];
  const buildParserWindow = () =>
    ({
      isDestroyed: () => false,
      webContents: { executeJavaScript: async () => buildParseResults() },
    } as unknown as Electron.BrowserWindow);

  const deps = makeDeps({
    getYomitanExt: () => ({ id: "dummy-ext" } as any),
    getYomitanParserWindow: buildParserWindow,
    getFrequencyDictionaryEnabled: () => true,
    tokenizeWithMecab: async () => null,
    getFrequencyRank: (text) => {
      if (text === "小園") return 25;
      if (text === "いきたい") return 1500;
      return null;
    },
  });
  const result = await tokenizeSubtitleService("俺は小園にいきたい", deps);

  // The five-way mecab split is expected to be chosen over the single scanning token.
  const tokens = result.tokens;
  assert.equal(tokens?.length, 5);
  assert.equal(tokens?.map((token) => token.surface).join(","), "俺,は,小園,に,いきたい");
  assert.equal(tokens?.[2]?.surface, "小園");
  assert.equal(tokens?.[2]?.frequencyRank, 25);
});
|
||||
|
||||
test("tokenizeSubtitleService keeps scanning parser tokens when they are already split", async () => {
  // Builds one parser line holding a single segment whose headword equals its text.
  const line = (text: string, reading: string) => [
    { text, reading, headwords: [[{ term: text }]] },
  ];
  // The scanning parser already yields three tokens; the mecab source yields a finer five-way split.
  const buildParseResults = () => [
    {
      source: "scanning-parser",
      index: 0,
      content: [
        line("小園", "おうえん"),
        line("に", "に"),
        line("行きたい", "いきたい"),
      ],
    },
    {
      source: "mecab",
      index: 0,
      content: [
        line("小", "お"),
        line("園", "えん"),
        line("に", "に"),
        line("行き", "いき"),
        line("たい", "たい"),
      ],
    },
  ];
  const buildParserWindow = () =>
    ({
      isDestroyed: () => false,
      webContents: { executeJavaScript: async () => buildParseResults() },
    } as unknown as Electron.BrowserWindow);

  const deps = makeDeps({
    getYomitanExt: () => ({ id: "dummy-ext" } as any),
    getYomitanParserWindow: buildParserWindow,
    getFrequencyDictionaryEnabled: () => true,
    getFrequencyRank: (text) => {
      if (text === "小園") return 20;
      return null;
    },
    tokenizeWithMecab: async () => null,
  });
  const result = await tokenizeSubtitleService("小園に行きたい", deps);

  // The scanning split must be kept even though the mecab source has more tokens.
  const tokens = result.tokens;
  assert.equal(tokens?.length, 3);
  assert.equal(tokens?.map((token) => token.surface).join(","), "小園,に,行きたい");
  assert.equal(tokens?.[0]?.frequencyRank, 20);
  assert.equal(tokens?.[1]?.frequencyRank, undefined);
  assert.equal(tokens?.[2]?.frequencyRank, undefined);
});
|
||||
|
||||
test("tokenizeSubtitleService still assigns frequency to non-known Yomitan tokens", async () => {
  // Builds one parser line holding a single segment whose headword equals its text.
  const line = (text: string, reading: string) => [
    { text, reading, headwords: [[{ term: text }]] },
  ];
  const buildParseResults = () => [
    {
      source: "scanning-parser",
      index: 0,
      content: [line("小園", "おうえん"), line("に", "に")],
    },
  ];
  const buildParserWindow = () =>
    ({
      isDestroyed: () => false,
      webContents: { executeJavaScript: async () => buildParseResults() },
    } as unknown as Electron.BrowserWindow);

  const deps = makeDeps({
    getYomitanExt: () => ({ id: "dummy-ext" } as any),
    getYomitanParserWindow: buildParserWindow,
    getFrequencyDictionaryEnabled: () => true,
    getFrequencyRank: (text) => {
      if (text === "小園") return 75;
      if (text === "に") return 3000;
      return null;
    },
    isKnownWord: (text) => text === "小園",
  });
  const result = await tokenizeSubtitleService("小園に", deps);

  // NOTE(review): the stub ranks に at 3000, yet no rank is expected on that token;
  // the rule that suppresses it is not visible from this test — confirm against the service.
  const tokens = result.tokens;
  assert.equal(tokens?.length, 2);
  assert.equal(tokens?.[0]?.isKnown, true);
  assert.equal(tokens?.[0]?.frequencyRank, 75);
  assert.equal(tokens?.[1]?.isKnown, false);
  assert.equal(tokens?.[1]?.frequencyRank, undefined);
});
|
||||
|
||||
test("tokenizeSubtitleService marks tokens as known using callback", async () => {
|
||||
@@ -589,6 +943,63 @@ test("tokenizeSubtitleService marks tokens as known using callback", async () =>
|
||||
assert.equal(result.tokens?.[0]?.isKnown, true);
|
||||
});
|
||||
|
||||
test("tokenizeSubtitleService still assigns frequency rank to non-known tokens", async () => {
  // Builds a MeCab-style noun token that starts out with isKnown set to false;
  // the headword mirrors the surface and all reading fields share the same katakana.
  const unknownNoun = (
    surface: string,
    katakana: string,
    startPos: number,
    endPos: number,
  ) => ({
    surface,
    reading: katakana,
    partOfSpeech: PartOfSpeech.noun,
    pos1: "",
    pos2: "",
    pos3: "",
    pos4: "",
    inflectionType: "",
    inflectionForm: "",
    headword: surface,
    katakanaReading: katakana,
    pronunciation: katakana,
    startPos,
    endPos,
    isMerged: false,
    isKnown: false,
    isNPlusOneTarget: false,
  });

  const deps = makeDeps({
    tokenizeWithMecab: async () => [
      unknownNoun("既知", "キチ", 0, 2),
      unknownNoun("未知", "ミチ", 2, 4),
    ],
    getFrequencyDictionaryEnabled: () => true,
    getFrequencyRank: (text) => {
      if (text === "既知") return 20;
      if (text === "未知") return 30;
      return null;
    },
    isKnownWord: (text) => text === "既知",
  });
  const result = await tokenizeSubtitleService("既知未知", deps);

  // Only 既知 is reported as known, but both tokens must carry their rank.
  const tokens = result.tokens;
  assert.equal(tokens?.length, 2);
  assert.equal(tokens?.[0]?.isKnown, true);
  assert.equal(tokens?.[0]?.frequencyRank, 20);
  assert.equal(tokens?.[1]?.isKnown, false);
  assert.equal(tokens?.[1]?.frequencyRank, 30);
});
|
||||
|
||||
test("tokenizeSubtitleService selects one N+1 target token", async () => {
|
||||
const result = await tokenizeSubtitleService(
|
||||
"猫です",
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
import { BrowserWindow, Extension, session } from "electron";
|
||||
import type { BrowserWindow, Extension } from "electron";
|
||||
import { markNPlusOneTargets, mergeTokens } from "../../token-merger";
|
||||
import {
|
||||
JlptLevel,
|
||||
@@ -252,20 +252,67 @@ function resolveFrequencyLookupText(token: MergedToken): string {
|
||||
return token.surface;
|
||||
}
|
||||
|
||||
/**
 * Collects every distinct text that should be tried against the frequency
 * dictionary for a token.
 *
 * Candidates are the token's `frequencyLookupTerms` (when it is an array), in
 * their original order, followed by the text returned by
 * `resolveFrequencyLookupText(token)`. Each candidate is trimmed; empty values
 * are dropped and a repeated value keeps only its first position.
 *
 * @param token - Token whose lookup candidates are gathered.
 * @returns Trimmed, de-duplicated candidates in priority order (possibly empty).
 */
function getFrequencyLookupTextCandidates(token: MergedToken): string[] {
  // A Set iterates in insertion order, so it removes duplicates while keeping
  // each term at the position of its first occurrence.
  const uniqueLookupTerms = new Set<string>();

  const addLookupText = (text: string | undefined): void => {
    const trimmed = text?.trim();
    if (trimmed) {
      uniqueLookupTerms.add(trimmed);
    }
  };

  // `frequencyLookupTerms` is an optional field of MergedToken, so no type
  // assertion is needed to read it.
  if (Array.isArray(token.frequencyLookupTerms)) {
    for (const term of token.frequencyLookupTerms) {
      addLookupText(term);
    }
  }

  // The single-text lookup is always tried last.
  addLookupText(resolveFrequencyLookupText(token));

  return [...uniqueLookupTerms];
}
|
||||
|
||||
function applyFrequencyMarking(
|
||||
tokens: MergedToken[],
|
||||
getFrequencyRank: FrequencyDictionaryLookup,
|
||||
): MergedToken[] {
|
||||
return tokens.map((token) => {
|
||||
const lookupText = resolveFrequencyLookupText(token);
|
||||
if (!lookupText) {
|
||||
const lookupTexts = getFrequencyLookupTextCandidates(token);
|
||||
if (lookupTexts.length === 0) {
|
||||
return { ...token, frequencyRank: undefined };
|
||||
}
|
||||
|
||||
const rank = getCachedFrequencyRank(lookupText, getFrequencyRank);
|
||||
let bestRank: number | null = null;
|
||||
for (const lookupText of lookupTexts) {
|
||||
const rank = getCachedFrequencyRank(lookupText, getFrequencyRank);
|
||||
if (rank === null) {
|
||||
continue;
|
||||
}
|
||||
if (bestRank === null || rank < bestRank) {
|
||||
bestRank = rank;
|
||||
}
|
||||
}
|
||||
|
||||
return {
|
||||
...token,
|
||||
frequencyRank: rank ?? undefined,
|
||||
frequencyRank: bestRank ?? undefined,
|
||||
};
|
||||
});
|
||||
}
|
||||
@@ -397,7 +444,7 @@ function isYomitanParseResultItem(
|
||||
if (!isObject(value)) {
|
||||
return false;
|
||||
}
|
||||
if ((value as YomitanParseResultItem).source !== "scanning-parser") {
|
||||
if (!isString((value as YomitanParseResultItem).source)) {
|
||||
return false;
|
||||
}
|
||||
if (!Array.isArray((value as YomitanParseResultItem).content)) {
|
||||
@@ -452,6 +499,27 @@ function extractYomitanHeadword(segment: YomitanParseSegment): string {
|
||||
return "";
|
||||
}
|
||||
|
||||
/**
 * Flattens every headword group of a Yomitan parse segment into a list of
 * term strings.
 *
 * Terms are trimmed; entries whose `term` is not a string or is blank after
 * trimming are skipped. Group order and in-group order are preserved, and
 * duplicates are not removed.
 *
 * @param segment - Parse segment whose `headwords` are read.
 * @returns The usable terms, or an empty array when `headwords` does not pass
 *   `isYomitanHeadwordRows`.
 */
function extractYomitanHeadwords(segment: YomitanParseSegment): string[] {
  const rows = segment.headwords;
  if (!isYomitanHeadwordRows(rows)) {
    return [];
  }

  const terms: string[] = [];
  for (const row of rows) {
    for (const entry of row) {
      if (!isString(entry.term)) {
        continue;
      }
      const cleaned = entry.term.trim();
      if (cleaned.length === 0) {
        continue;
      }
      terms.push(cleaned);
    }
  }
  return terms;
}
|
||||
|
||||
function applyJlptMarking(
|
||||
tokens: MergedToken[],
|
||||
getJlptLevel: (text: string) => JlptLevel | null,
|
||||
@@ -475,29 +543,27 @@ function applyJlptMarking(
|
||||
});
|
||||
}
|
||||
|
||||
function mapYomitanParseResultsToMergedTokens(
|
||||
parseResults: unknown,
|
||||
interface YomitanParseCandidate {
|
||||
source: string;
|
||||
index: number;
|
||||
tokens: MergedToken[];
|
||||
}
|
||||
|
||||
function mapYomitanParseResultItemToMergedTokens(
|
||||
parseResult: YomitanParseResultItem,
|
||||
isKnownWord: (text: string) => boolean,
|
||||
knownWordMatchMode: NPlusOneMatchMode,
|
||||
): MergedToken[] | null {
|
||||
if (!Array.isArray(parseResults) || parseResults.length === 0) {
|
||||
): YomitanParseCandidate | null {
|
||||
const content = parseResult.content;
|
||||
if (!Array.isArray(content) || content.length === 0) {
|
||||
return null;
|
||||
}
|
||||
|
||||
const scanningItems = parseResults.filter(
|
||||
(item): item is YomitanParseResultItem => isYomitanParseResultItem(item),
|
||||
);
|
||||
|
||||
if (scanningItems.length === 0) {
|
||||
return null;
|
||||
}
|
||||
|
||||
const primaryItem =
|
||||
scanningItems.find((item) => item.index === 0) || scanningItems[0];
|
||||
const content = primaryItem.content;
|
||||
if (!Array.isArray(content)) {
|
||||
return null;
|
||||
}
|
||||
const source = String(parseResult.source ?? "");
|
||||
const index =
|
||||
typeof parseResult.index === "number" && Number.isInteger(parseResult.index)
|
||||
? parseResult.index
|
||||
: 0;
|
||||
|
||||
const tokens: MergedToken[] = [];
|
||||
let charOffset = 0;
|
||||
@@ -509,60 +575,117 @@ function mapYomitanParseResultsToMergedTokens(
|
||||
}
|
||||
validLineCount += 1;
|
||||
|
||||
let surface = "";
|
||||
let reading = "";
|
||||
let headword = "";
|
||||
|
||||
for (const segment of line) {
|
||||
const segmentText = segment.text;
|
||||
if (!segmentText || segmentText.length === 0) {
|
||||
continue;
|
||||
}
|
||||
|
||||
surface += segmentText;
|
||||
const start = charOffset;
|
||||
const end = start + segmentText.length;
|
||||
charOffset = end;
|
||||
|
||||
if (typeof segment.reading === "string") {
|
||||
reading += segment.reading;
|
||||
}
|
||||
const headword = extractYomitanHeadword(segment) || segmentText;
|
||||
const frequencyLookupTerms = extractYomitanHeadwords(segment);
|
||||
|
||||
if (!headword) {
|
||||
headword = extractYomitanHeadword(segment);
|
||||
}
|
||||
tokens.push({
|
||||
surface: segmentText,
|
||||
reading: typeof segment.reading === "string" ? segment.reading : "",
|
||||
headword,
|
||||
startPos: start,
|
||||
endPos: end,
|
||||
partOfSpeech: PartOfSpeech.other,
|
||||
pos1: "",
|
||||
isMerged: true,
|
||||
isNPlusOneTarget: false,
|
||||
isKnown: (() => {
|
||||
const matchText = resolveKnownWordText(
|
||||
segmentText,
|
||||
headword,
|
||||
knownWordMatchMode,
|
||||
);
|
||||
return matchText ? isKnownWord(matchText) : false;
|
||||
})(),
|
||||
frequencyLookupTerms:
|
||||
frequencyLookupTerms.length > 0 ? frequencyLookupTerms : undefined,
|
||||
});
|
||||
}
|
||||
|
||||
if (!surface) {
|
||||
continue;
|
||||
}
|
||||
|
||||
const start = charOffset;
|
||||
const end = start + surface.length;
|
||||
charOffset = end;
|
||||
|
||||
tokens.push({
|
||||
surface,
|
||||
reading,
|
||||
headword: headword || surface,
|
||||
startPos: start,
|
||||
endPos: end,
|
||||
partOfSpeech: PartOfSpeech.other,
|
||||
pos1: "",
|
||||
isMerged: true,
|
||||
isNPlusOneTarget: false,
|
||||
isKnown: (() => {
|
||||
const matchText = resolveKnownWordText(
|
||||
surface,
|
||||
headword,
|
||||
knownWordMatchMode,
|
||||
);
|
||||
return matchText ? isKnownWord(matchText) : false;
|
||||
})(),
|
||||
});
|
||||
}
|
||||
|
||||
if (validLineCount === 0) {
|
||||
if (validLineCount === 0 || tokens.length === 0) {
|
||||
return null;
|
||||
}
|
||||
return tokens.length > 0 ? tokens : null;
|
||||
|
||||
return { source, index, tokens };
|
||||
}
|
||||
|
||||
/**
 * Chooses which parse candidate's tokens to use.
 *
 * Selection rules, in order:
 * 1. No candidates: `null`.
 * 2. No "scanning-parser" candidate: the candidate with the most tokens,
 *    regardless of source.
 * 3. The scanning-parser candidate with the most tokens is used when it has
 *    more than one token.
 * 4. Otherwise a "mecab" candidate is used if it has strictly more tokens than
 *    that scanning-parser candidate.
 * 5. Otherwise the scanning-parser candidate is used.
 *
 * When token counts tie, the candidate that appears first in the input wins.
 *
 * @param candidates - Candidates built from the individual parse results.
 * @returns The tokens of the selected candidate, or `null` for an empty input.
 */
function selectBestYomitanParseCandidate(
  candidates: YomitanParseCandidate[],
): MergedToken[] | null {
  if (candidates.length === 0) {
    return null;
  }

  // Reducer that keeps the earlier candidate unless the later one is strictly longer.
  const mostTokens = (
    best: YomitanParseCandidate,
    current: YomitanParseCandidate,
  ): YomitanParseCandidate =>
    current.tokens.length > best.tokens.length ? current : best;

  const scanningCandidates = candidates.filter(
    (candidate) => candidate.source === "scanning-parser",
  );
  if (scanningCandidates.length === 0) {
    // `candidates` is non-empty here, so reduce() without a seed is safe.
    return candidates.reduce(mostTokens).tokens;
  }

  const bestScanning = scanningCandidates.reduce(mostTokens);
  if (bestScanning.tokens.length > 1) {
    return bestScanning.tokens;
  }

  // The scanning parser did not split the text; fall back to mecab only when
  // it produced a finer split.
  const mecabCandidates = candidates.filter(
    (candidate) => candidate.source === "mecab",
  );
  if (mecabCandidates.length > 0) {
    const bestMecab = mecabCandidates.reduce(mostTokens);
    if (bestMecab.tokens.length > bestScanning.tokens.length) {
      return bestMecab.tokens;
    }
  }

  return bestScanning.tokens;
}
|
||||
|
||||
/**
 * Turns the raw value returned by the Yomitan parser into merged tokens.
 *
 * Every entry that passes `isYomitanParseResultItem` is converted with
 * `mapYomitanParseResultItemToMergedTokens`; entries that convert to `null`
 * are dropped, and `selectBestYomitanParseCandidate` picks among the rest.
 *
 * @param parseResults - Untyped parser output; anything other than a
 *   non-empty array yields `null`.
 * @param isKnownWord - Predicate forwarded to the per-item mapper.
 * @param knownWordMatchMode - Match mode forwarded to the per-item mapper.
 * @returns Tokens of the selected candidate, or `null` when none is usable.
 */
function mapYomitanParseResultsToMergedTokens(
  parseResults: unknown,
  isKnownWord: (text: string) => boolean,
  knownWordMatchMode: NPlusOneMatchMode,
): MergedToken[] | null {
  if (!Array.isArray(parseResults) || parseResults.length === 0) {
    return null;
  }

  const candidates: YomitanParseCandidate[] = [];
  for (const item of parseResults) {
    if (!isYomitanParseResultItem(item)) {
      continue;
    }
    const candidate = mapYomitanParseResultItemToMergedTokens(
      item,
      isKnownWord,
      knownWordMatchMode,
    );
    if (candidate !== null) {
      candidates.push(candidate);
    }
  }

  return selectBestYomitanParseCandidate(candidates);
}
|
||||
|
||||
function pickClosestMecabPos1(
|
||||
@@ -664,6 +787,7 @@ async function enrichYomitanPos1(
|
||||
async function ensureYomitanParserWindow(
|
||||
deps: TokenizerServiceDeps,
|
||||
): Promise<boolean> {
|
||||
const electron = await import("electron");
|
||||
const yomitanExt = deps.getYomitanExt();
|
||||
if (!yomitanExt) {
|
||||
return false;
|
||||
@@ -680,6 +804,7 @@ async function ensureYomitanParserWindow(
|
||||
}
|
||||
|
||||
const initPromise = (async () => {
|
||||
const { BrowserWindow, session } = electron;
|
||||
const parserWindow = new BrowserWindow({
|
||||
show: false,
|
||||
width: 800,
|
||||
@@ -786,7 +911,7 @@ async function parseWithYomitanInternalParser(
|
||||
optionsContext: { index: profileIndex },
|
||||
scanLength,
|
||||
useInternalParser: true,
|
||||
useMecabParser: false
|
||||
useMecabParser: true
|
||||
});
|
||||
})();
|
||||
`;
|
||||
|
||||
@@ -86,14 +86,29 @@ export function parseMecabLine(line: string): Token | null {
|
||||
};
|
||||
}
|
||||
|
||||
/** Optional settings accepted by the `MecabTokenizer` constructor. */
export interface MecabTokenizerOptions {
|
||||
/**
 * MeCab executable to run. The constructor trims it and falls back to
 * "mecab" when it is missing or blank. A value containing "/" is used as a
 * path as-is; otherwise it is resolved with `which`.
 */
mecabCommand?: string;
|
||||
/**
 * Dictionary location. The constructor trims it and stores `null` when it is
 * missing or blank; when set, it is passed to MeCab as `-d <dictionaryPath>`.
 */
dictionaryPath?: string;
|
||||
}
|
||||
|
||||
export class MecabTokenizer {
|
||||
private mecabPath: string | null = null;
|
||||
private mecabCommand: string;
|
||||
private dictionaryPath: string | null;
|
||||
private available: boolean = false;
|
||||
private enabled: boolean = true;
|
||||
|
||||
constructor(options: MecabTokenizerOptions = {}) {
|
||||
this.mecabCommand = options.mecabCommand?.trim() || "mecab";
|
||||
this.dictionaryPath = options.dictionaryPath?.trim() || null;
|
||||
}
|
||||
|
||||
async checkAvailability(): Promise<boolean> {
|
||||
try {
|
||||
const result = execSync("which mecab", { encoding: "utf-8" }).trim();
|
||||
const command = this.mecabCommand;
|
||||
const result = command.includes("/")
|
||||
? command
|
||||
: execSync(`which ${command}`, { encoding: "utf-8" }).trim();
|
||||
if (result) {
|
||||
this.mecabPath = result;
|
||||
this.available = true;
|
||||
@@ -114,7 +129,11 @@ export class MecabTokenizer {
|
||||
}
|
||||
|
||||
return new Promise((resolve) => {
|
||||
const mecab = spawn("mecab", [], {
|
||||
const mecabArgs: string[] = [];
|
||||
if (this.dictionaryPath) {
|
||||
mecabArgs.push("-d", this.dictionaryPath);
|
||||
}
|
||||
const mecab = spawn(this.mecabPath ?? this.mecabCommand, mecabArgs, {
|
||||
stdio: ["pipe", "pipe", "pipe"],
|
||||
});
|
||||
|
||||
@@ -149,6 +168,21 @@ export class MecabTokenizer {
|
||||
}
|
||||
}
|
||||
|
||||
if (tokens.length === 0 && text.trim().length > 0) {
|
||||
const trimmedStdout = stdout.trim();
|
||||
const trimmedStderr = stderr.trim();
|
||||
if (trimmedStdout) {
|
||||
log.warn(
|
||||
"MeCab returned no parseable tokens.",
|
||||
`command=${this.mecabPath ?? this.mecabCommand}`,
|
||||
`stdout=${trimmedStdout.slice(0, 1024)}`,
|
||||
);
|
||||
}
|
||||
if (trimmedStderr) {
|
||||
log.warn("MeCab stderr while tokenizing:", trimmedStderr);
|
||||
}
|
||||
}
|
||||
|
||||
resolve(tokens);
|
||||
});
|
||||
|
||||
|
||||
@@ -23,15 +23,35 @@ function createToken(overrides: Partial<MergedToken>): MergedToken {
|
||||
}
|
||||
|
||||
function extractClassBlock(cssText: string, selector: string): string {
|
||||
const start = cssText.indexOf(selector);
|
||||
if (start < 0) return "";
|
||||
const ruleRegex = /([^{}]+)\{([^}]*)\}/g;
|
||||
let match: RegExpExecArray | null = null;
|
||||
let fallbackBlock = "";
|
||||
|
||||
const openBrace = cssText.indexOf("{", start);
|
||||
if (openBrace < 0) return "";
|
||||
const closeBrace = cssText.indexOf("}", openBrace);
|
||||
if (closeBrace < 0) return "";
|
||||
while ((match = ruleRegex.exec(cssText)) !== null) {
|
||||
const selectorsBlock = match[1]?.trim() ?? "";
|
||||
const selectorBlock = match[2] ?? "";
|
||||
|
||||
return cssText.slice(openBrace + 1, closeBrace);
|
||||
const selectors = selectorsBlock
|
||||
.split(",")
|
||||
.map((entry) => entry.trim())
|
||||
.filter((entry) => entry.length > 0);
|
||||
|
||||
if (selectors.includes(selector)) {
|
||||
if (selectors.length === 1) {
|
||||
return selectorBlock;
|
||||
}
|
||||
|
||||
if (!fallbackBlock) {
|
||||
fallbackBlock = selectorBlock;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (fallbackBlock) {
|
||||
return fallbackBlock;
|
||||
}
|
||||
|
||||
return "";
|
||||
}
|
||||
|
||||
test("computeWordClass preserves known and n+1 classes while adding JLPT classes", () => {
|
||||
@@ -173,10 +193,16 @@ test("computeWordClass uses configured band count for banded mode", () => {
|
||||
topX: 4,
|
||||
mode: "banded",
|
||||
singleColor: "#000000",
|
||||
bandedColors: ["#111111", "#222222", "#333333"] as any,
|
||||
bandedColors: [
|
||||
"#111111",
|
||||
"#222222",
|
||||
"#333333",
|
||||
"#444444",
|
||||
"#555555",
|
||||
],
|
||||
} as any);
|
||||
|
||||
assert.equal(actual, "word word-frequency-band-1");
|
||||
assert.equal(actual, "word word-frequency-band-3");
|
||||
});
|
||||
|
||||
test("computeWordClass skips frequency class when rank is out of topX", () => {
|
||||
|
||||
@@ -56,6 +56,7 @@ export interface MergedToken {
|
||||
isNPlusOneTarget: boolean;
|
||||
jlptLevel?: JlptLevel;
|
||||
frequencyRank?: number;
|
||||
frequencyLookupTerms?: string[];
|
||||
}
|
||||
|
||||
export type FrequencyDictionaryLookup = (term: string) => number | null;
|
||||
|
||||
Reference in New Issue
Block a user