mirror of
https://github.com/ksyasuda/SubMiner.git
synced 2026-02-28 06:22:45 -08:00
feat(tokenizer): refine Yomitan grouping and parser tooling
- map segmented Yomitan lines into single logical tokens and improve candidate selection heuristics
- limit frequency lookup to selected token text with POS-based exclusions and add debug logging hook
- add standalone Yomitan parser test script, deterministic utility-script shutdown, and docs/backlog updates
This commit is contained in:
@@ -213,7 +213,7 @@ test("tokenizeSubtitleService applies frequency dictionary ranks", async () => {
|
||||
reading: "デス",
|
||||
startPos: 1,
|
||||
endPos: 2,
|
||||
partOfSpeech: PartOfSpeech.bound_auxiliary,
|
||||
partOfSpeech: PartOfSpeech.other,
|
||||
isMerged: false,
|
||||
isKnown: false,
|
||||
isNPlusOneTarget: false,
|
||||
@@ -228,7 +228,7 @@ test("tokenizeSubtitleService applies frequency dictionary ranks", async () => {
|
||||
assert.equal(result.tokens?.[1]?.frequencyRank, 1200);
|
||||
});
|
||||
|
||||
test("tokenizeSubtitleService uses all Yomitan headword candidates for frequency lookup", async () => {
|
||||
test("tokenizeSubtitleService uses only selected Yomitan headword for frequency lookup", async () => {
|
||||
const result = await tokenizeSubtitleService(
|
||||
"猫です",
|
||||
makeDeps({
|
||||
@@ -262,7 +262,66 @@ test("tokenizeSubtitleService uses all Yomitan headword candidates for frequency
|
||||
);
|
||||
|
||||
assert.equal(result.tokens?.length, 1);
|
||||
assert.equal(result.tokens?.[0]?.frequencyRank, 40);
|
||||
assert.equal(result.tokens?.[0]?.frequencyRank, 1200);
|
||||
});
|
||||
|
||||
test("tokenizeSubtitleService keeps furigana-split Yomitan segments as one token", async () => {
|
||||
const result = await tokenizeSubtitleService(
|
||||
"友達と話した",
|
||||
makeDeps({
|
||||
getFrequencyDictionaryEnabled: () => true,
|
||||
getYomitanExt: () => ({ id: "dummy-ext" } as any),
|
||||
getYomitanParserWindow: () => ({
|
||||
isDestroyed: () => false,
|
||||
webContents: {
|
||||
executeJavaScript: async () => [
|
||||
{
|
||||
source: "scanning-parser",
|
||||
index: 0,
|
||||
content: [
|
||||
[
|
||||
{
|
||||
text: "友",
|
||||
reading: "とも",
|
||||
headwords: [[{ term: "友達" }]],
|
||||
},
|
||||
{
|
||||
text: "達",
|
||||
reading: "だち",
|
||||
},
|
||||
],
|
||||
[
|
||||
{
|
||||
text: "と",
|
||||
reading: "と",
|
||||
headwords: [[{ term: "と" }]],
|
||||
},
|
||||
],
|
||||
[
|
||||
{
|
||||
text: "話した",
|
||||
reading: "はなした",
|
||||
headwords: [[{ term: "話す" }]],
|
||||
},
|
||||
],
|
||||
],
|
||||
},
|
||||
],
|
||||
},
|
||||
} as unknown as Electron.BrowserWindow),
|
||||
getFrequencyRank: (text) => (text === "友達" ? 22 : text === "話す" ? 90 : null),
|
||||
}),
|
||||
);
|
||||
|
||||
assert.equal(result.tokens?.length, 3);
|
||||
assert.equal(result.tokens?.[0]?.surface, "友達");
|
||||
assert.equal(result.tokens?.[0]?.reading, "ともだち");
|
||||
assert.equal(result.tokens?.[0]?.headword, "友達");
|
||||
assert.equal(result.tokens?.[0]?.frequencyRank, 22);
|
||||
assert.equal(result.tokens?.[1]?.surface, "と");
|
||||
assert.equal(result.tokens?.[1]?.frequencyRank, undefined);
|
||||
assert.equal(result.tokens?.[2]?.surface, "話した");
|
||||
assert.equal(result.tokens?.[2]?.frequencyRank, 90);
|
||||
});
|
||||
|
||||
test("tokenizeSubtitleService prefers exact headword frequency over surface/reading when available", async () => {
|
||||
@@ -299,7 +358,7 @@ test("tokenizeSubtitleService prefers exact headword frequency over surface/read
|
||||
assert.equal(result.tokens?.[0]?.frequencyRank, 8);
|
||||
});
|
||||
|
||||
test("tokenizeSubtitleService keeps no frequency when only reading matches and headword candidates miss", async () => {
|
||||
test("tokenizeSubtitleService keeps no frequency when only reading matches and headword misses", async () => {
|
||||
const result = await tokenizeSubtitleService(
|
||||
"猫です",
|
||||
makeDeps({
|
||||
@@ -333,7 +392,7 @@ test("tokenizeSubtitleService keeps no frequency when only reading matches and h
|
||||
assert.equal(result.tokens?.[0]?.frequencyRank, undefined);
|
||||
});
|
||||
|
||||
test("tokenizeSubtitleService ignores invalid frequency ranks and takes best valid headword candidate", async () => {
|
||||
test("tokenizeSubtitleService ignores invalid frequency rank on selected headword", async () => {
|
||||
const result = await tokenizeSubtitleService(
|
||||
"猫です",
|
||||
makeDeps({
|
||||
@@ -367,7 +426,7 @@ test("tokenizeSubtitleService ignores invalid frequency ranks and takes best val
|
||||
);
|
||||
|
||||
assert.equal(result.tokens?.length, 1);
|
||||
assert.equal(result.tokens?.[0]?.frequencyRank, 500);
|
||||
assert.equal(result.tokens?.[0]?.frequencyRank, undefined);
|
||||
});
|
||||
|
||||
test("tokenizeSubtitleService handles real-word frequency candidates and prefers most frequent term", async () => {
|
||||
@@ -472,6 +531,55 @@ test("tokenizeSubtitleService ignores frequency lookup failures", async () => {
|
||||
assert.equal(result.tokens?.[0]?.frequencyRank, undefined);
|
||||
});
|
||||
|
||||
test("tokenizeSubtitleService skips frequency rank when Yomitan token is enriched as particle by mecab pos1", async () => {
|
||||
const result = await tokenizeSubtitleService(
|
||||
"は",
|
||||
makeDeps({
|
||||
getFrequencyDictionaryEnabled: () => true,
|
||||
getYomitanExt: () => ({ id: "dummy-ext" } as any),
|
||||
getYomitanParserWindow: () => ({
|
||||
isDestroyed: () => false,
|
||||
webContents: {
|
||||
executeJavaScript: async () => [
|
||||
{
|
||||
source: "scanning-parser",
|
||||
index: 0,
|
||||
content: [
|
||||
[
|
||||
{
|
||||
text: "は",
|
||||
reading: "は",
|
||||
headwords: [[{ term: "は" }]],
|
||||
},
|
||||
],
|
||||
],
|
||||
},
|
||||
],
|
||||
},
|
||||
} as unknown as Electron.BrowserWindow),
|
||||
tokenizeWithMecab: async () => [
|
||||
{
|
||||
headword: "は",
|
||||
surface: "は",
|
||||
reading: "ハ",
|
||||
startPos: 0,
|
||||
endPos: 1,
|
||||
partOfSpeech: PartOfSpeech.particle,
|
||||
pos1: "助詞",
|
||||
isMerged: false,
|
||||
isKnown: false,
|
||||
isNPlusOneTarget: false,
|
||||
},
|
||||
],
|
||||
getFrequencyRank: (text) => (text === "は" ? 10 : null),
|
||||
}),
|
||||
);
|
||||
|
||||
assert.equal(result.tokens?.length, 1);
|
||||
assert.equal(result.tokens?.[0]?.pos1, "助詞");
|
||||
assert.equal(result.tokens?.[0]?.frequencyRank, undefined);
|
||||
});
|
||||
|
||||
test("tokenizeSubtitleService ignores invalid frequency ranks", async () => {
|
||||
const result = await tokenizeSubtitleService(
|
||||
"猫",
|
||||
@@ -753,6 +861,8 @@ test("tokenizeSubtitleService uses Yomitan parser result when available", async
|
||||
reading: "ねこ",
|
||||
headwords: [[{ term: "猫" }]],
|
||||
},
|
||||
],
|
||||
[
|
||||
{
|
||||
text: "です",
|
||||
reading: "です",
|
||||
@@ -783,6 +893,155 @@ test("tokenizeSubtitleService uses Yomitan parser result when available", async
|
||||
assert.equal(result.tokens?.[1]?.isKnown, false);
|
||||
});
|
||||
|
||||
test("tokenizeSubtitleService logs selected Yomitan groups when debug toggle is enabled", async () => {
|
||||
const infoLogs: string[] = [];
|
||||
const originalInfo = console.info;
|
||||
console.info = (...args: unknown[]) => {
|
||||
infoLogs.push(args.map((value) => String(value)).join(" "));
|
||||
};
|
||||
|
||||
try {
|
||||
await tokenizeSubtitleService(
|
||||
"友達と話した",
|
||||
makeDeps({
|
||||
getYomitanExt: () => ({ id: "dummy-ext" } as any),
|
||||
getYomitanParserWindow: () => ({
|
||||
isDestroyed: () => false,
|
||||
webContents: {
|
||||
executeJavaScript: async () => [
|
||||
{
|
||||
source: "scanning-parser",
|
||||
index: 0,
|
||||
content: [
|
||||
[
|
||||
{
|
||||
text: "友",
|
||||
reading: "とも",
|
||||
headwords: [[{ term: "友達" }]],
|
||||
},
|
||||
{
|
||||
text: "達",
|
||||
reading: "だち",
|
||||
},
|
||||
],
|
||||
[
|
||||
{
|
||||
text: "と",
|
||||
reading: "と",
|
||||
headwords: [[{ term: "と" }]],
|
||||
},
|
||||
],
|
||||
],
|
||||
},
|
||||
],
|
||||
},
|
||||
} as unknown as Electron.BrowserWindow),
|
||||
tokenizeWithMecab: async () => null,
|
||||
getYomitanGroupDebugEnabled: () => true,
|
||||
}),
|
||||
);
|
||||
} finally {
|
||||
console.info = originalInfo;
|
||||
}
|
||||
|
||||
assert.ok(
|
||||
infoLogs.some((line) => line.includes("Selected Yomitan token groups")),
|
||||
);
|
||||
});
|
||||
|
||||
test("tokenizeSubtitleService does not log Yomitan groups when debug toggle is disabled", async () => {
|
||||
const infoLogs: string[] = [];
|
||||
const originalInfo = console.info;
|
||||
console.info = (...args: unknown[]) => {
|
||||
infoLogs.push(args.map((value) => String(value)).join(" "));
|
||||
};
|
||||
|
||||
try {
|
||||
await tokenizeSubtitleService(
|
||||
"友達と話した",
|
||||
makeDeps({
|
||||
getYomitanExt: () => ({ id: "dummy-ext" } as any),
|
||||
getYomitanParserWindow: () => ({
|
||||
isDestroyed: () => false,
|
||||
webContents: {
|
||||
executeJavaScript: async () => [
|
||||
{
|
||||
source: "scanning-parser",
|
||||
index: 0,
|
||||
content: [
|
||||
[
|
||||
{
|
||||
text: "友",
|
||||
reading: "とも",
|
||||
headwords: [[{ term: "友達" }]],
|
||||
},
|
||||
{
|
||||
text: "達",
|
||||
reading: "だち",
|
||||
},
|
||||
],
|
||||
],
|
||||
},
|
||||
],
|
||||
},
|
||||
} as unknown as Electron.BrowserWindow),
|
||||
tokenizeWithMecab: async () => null,
|
||||
getYomitanGroupDebugEnabled: () => false,
|
||||
}),
|
||||
);
|
||||
} finally {
|
||||
console.info = originalInfo;
|
||||
}
|
||||
|
||||
assert.equal(
|
||||
infoLogs.some((line) => line.includes("Selected Yomitan token groups")),
|
||||
false,
|
||||
);
|
||||
});
|
||||
|
||||
test("tokenizeSubtitleService preserves segmented Yomitan line as one token", async () => {
|
||||
const parserWindow = {
|
||||
isDestroyed: () => false,
|
||||
webContents: {
|
||||
executeJavaScript: async () => [
|
||||
{
|
||||
source: "scanning-parser",
|
||||
index: 0,
|
||||
content: [
|
||||
[
|
||||
{
|
||||
text: "猫",
|
||||
reading: "ねこ",
|
||||
headwords: [[{ term: "猫です" }]],
|
||||
},
|
||||
{
|
||||
text: "です",
|
||||
reading: "です",
|
||||
},
|
||||
],
|
||||
],
|
||||
},
|
||||
],
|
||||
},
|
||||
} as unknown as Electron.BrowserWindow;
|
||||
|
||||
const result = await tokenizeSubtitleService(
|
||||
"猫です",
|
||||
makeDeps({
|
||||
getYomitanExt: () => ({ id: "dummy-ext" } as any),
|
||||
getYomitanParserWindow: () => parserWindow,
|
||||
tokenizeWithMecab: async () => null,
|
||||
}),
|
||||
);
|
||||
|
||||
assert.equal(result.text, "猫です");
|
||||
assert.equal(result.tokens?.length, 1);
|
||||
assert.equal(result.tokens?.[0]?.surface, "猫です");
|
||||
assert.equal(result.tokens?.[0]?.reading, "ねこです");
|
||||
assert.equal(result.tokens?.[0]?.headword, "猫です");
|
||||
assert.equal(result.tokens?.[0]?.isKnown, false);
|
||||
});
|
||||
|
||||
test("tokenizeSubtitleService prefers mecab parser tokens when scanning parser returns one token", async () => {
|
||||
const result = await tokenizeSubtitleService(
|
||||
"俺は小園にいきたい",
|
||||
@@ -880,6 +1139,59 @@ test("tokenizeSubtitleService keeps scanning parser tokens when they are already
|
||||
assert.equal(result.tokens?.[2]?.frequencyRank, undefined);
|
||||
});
|
||||
|
||||
test("tokenizeSubtitleService prefers parse candidates with fewer fragment-only kana tokens when source priority is equal", async () => {
|
||||
const result = await tokenizeSubtitleService(
|
||||
"俺は公園にいきたい",
|
||||
makeDeps({
|
||||
getYomitanExt: () => ({ id: "dummy-ext" } as any),
|
||||
getYomitanParserWindow: () => ({
|
||||
isDestroyed: () => false,
|
||||
webContents: {
|
||||
executeJavaScript: async () => [
|
||||
{
|
||||
source: "mecab-fragmented",
|
||||
index: 0,
|
||||
content: [
|
||||
[{ text: "俺", reading: "おれ", headwords: [[{ term: "俺" }]] }],
|
||||
[{ text: "は", reading: "", headwords: [[{ term: "は" }]] }],
|
||||
[{ text: "公園", reading: "こうえん", headwords: [[{ term: "公園" }]] }],
|
||||
[{ text: "にい", reading: "", headwords: [[{ term: "兄" }], [{ term: "二位" }]] }],
|
||||
[{ text: "きたい", reading: "", headwords: [[{ term: "期待" }], [{ term: "来る" }]] }],
|
||||
],
|
||||
},
|
||||
{
|
||||
source: "mecab",
|
||||
index: 0,
|
||||
content: [
|
||||
[{ text: "俺", reading: "おれ", headwords: [[{ term: "俺" }]] }],
|
||||
[{ text: "は", reading: "は", headwords: [[{ term: "は" }]] }],
|
||||
[{ text: "公園", reading: "こうえん", headwords: [[{ term: "公園" }]] }],
|
||||
[{ text: "に", reading: "に", headwords: [[{ term: "に" }]] }],
|
||||
[{ text: "行きたい", reading: "いきたい", headwords: [[{ term: "行きたい" }]] }],
|
||||
],
|
||||
},
|
||||
],
|
||||
},
|
||||
} as unknown as Electron.BrowserWindow),
|
||||
getFrequencyDictionaryEnabled: () => true,
|
||||
getFrequencyRank: (text) =>
|
||||
text === "俺"
|
||||
? 51
|
||||
: text === "公園"
|
||||
? 2304
|
||||
: text === "行きたい"
|
||||
? 1500
|
||||
: null,
|
||||
tokenizeWithMecab: async () => null,
|
||||
}),
|
||||
);
|
||||
|
||||
assert.equal(result.tokens?.map((token) => token.surface).join(","), "俺,は,公園,に,行きたい");
|
||||
assert.equal(result.tokens?.[1]?.frequencyRank, undefined);
|
||||
assert.equal(result.tokens?.[3]?.frequencyRank, undefined);
|
||||
assert.equal(result.tokens?.[4]?.frequencyRank, 1500);
|
||||
});
|
||||
|
||||
test("tokenizeSubtitleService still assigns frequency to non-known Yomitan tokens", async () => {
|
||||
const result = await tokenizeSubtitleService(
|
||||
"小園に",
|
||||
@@ -914,7 +1226,7 @@ test("tokenizeSubtitleService still assigns frequency to non-known Yomitan token
|
||||
assert.equal(result.tokens?.[0]?.isKnown, true);
|
||||
assert.equal(result.tokens?.[0]?.frequencyRank, 75);
|
||||
assert.equal(result.tokens?.[1]?.isKnown, false);
|
||||
assert.equal(result.tokens?.[1]?.frequencyRank, undefined);
|
||||
assert.equal(result.tokens?.[1]?.frequencyRank, 3000);
|
||||
});
|
||||
|
||||
test("tokenizeSubtitleService marks tokens as known using callback", async () => {
|
||||
|
||||
@@ -13,6 +13,7 @@ import {
|
||||
shouldIgnoreJlptForMecabPos1,
|
||||
shouldIgnoreJlptByTerm,
|
||||
} from "./jlpt-token-filter";
|
||||
import { createLogger } from "../../logger";
|
||||
|
||||
interface YomitanParseHeadword {
|
||||
term?: unknown;
|
||||
@@ -37,6 +38,7 @@ const KATAKANA_CODEPOINT_START = 0x30a1;
|
||||
const KATAKANA_CODEPOINT_END = 0x30f6;
|
||||
const JLPT_LEVEL_LOOKUP_CACHE_LIMIT = 2048;
|
||||
const FREQUENCY_RANK_LOOKUP_CACHE_LIMIT = 2048;
|
||||
const logger = createLogger("main:tokenizer");
|
||||
|
||||
const jlptLevelLookupCaches = new WeakMap<
|
||||
(text: string) => JlptLevel | null,
|
||||
@@ -70,6 +72,7 @@ export interface TokenizerServiceDeps {
|
||||
getFrequencyDictionaryEnabled?: () => boolean;
|
||||
getFrequencyRank?: FrequencyDictionaryLookup;
|
||||
getMinSentenceWordsForNPlusOne?: () => number;
|
||||
getYomitanGroupDebugEnabled?: () => boolean;
|
||||
tokenizeWithMecab: (text: string) => Promise<MergedToken[] | null>;
|
||||
}
|
||||
|
||||
@@ -92,6 +95,7 @@ export interface TokenizerDepsRuntimeOptions {
|
||||
getFrequencyDictionaryEnabled?: () => boolean;
|
||||
getFrequencyRank?: FrequencyDictionaryLookup;
|
||||
getMinSentenceWordsForNPlusOne?: () => number;
|
||||
getYomitanGroupDebugEnabled?: () => boolean;
|
||||
getMecabTokenizer: () => MecabTokenizerLike | null;
|
||||
}
|
||||
|
||||
@@ -197,6 +201,8 @@ export function createTokenizerDepsRuntimeService(
|
||||
getFrequencyRank: options.getFrequencyRank,
|
||||
getMinSentenceWordsForNPlusOne:
|
||||
options.getMinSentenceWordsForNPlusOne ?? (() => 3),
|
||||
getYomitanGroupDebugEnabled:
|
||||
options.getYomitanGroupDebugEnabled ?? (() => false),
|
||||
tokenizeWithMecab: async (text) => {
|
||||
const mecabTokenizer = options.getMecabTokenizer();
|
||||
if (!mecabTokenizer) {
|
||||
@@ -253,40 +259,19 @@ function resolveFrequencyLookupText(token: MergedToken): string {
|
||||
}
|
||||
|
||||
function getFrequencyLookupTextCandidates(token: MergedToken): string[] {
|
||||
const tokenWithCandidates = token as MergedToken & {
|
||||
frequencyLookupTerms?: string[];
|
||||
};
|
||||
const lookupTextCandidates: string[] = [];
|
||||
const addLookupText = (text: string | undefined): void => {
|
||||
if (!text) {
|
||||
return;
|
||||
}
|
||||
const trimmed = text.trim();
|
||||
if (!trimmed) {
|
||||
return;
|
||||
}
|
||||
lookupTextCandidates.push(trimmed);
|
||||
};
|
||||
const lookupText = resolveFrequencyLookupText(token).trim();
|
||||
return lookupText ? [lookupText] : [];
|
||||
}
|
||||
|
||||
if (Array.isArray(tokenWithCandidates.frequencyLookupTerms)) {
|
||||
for (const term of tokenWithCandidates.frequencyLookupTerms) {
|
||||
addLookupText(term);
|
||||
}
|
||||
function isFrequencyExcludedByPos(token: MergedToken): boolean {
|
||||
if (
|
||||
token.partOfSpeech === PartOfSpeech.particle ||
|
||||
token.partOfSpeech === PartOfSpeech.bound_auxiliary
|
||||
) {
|
||||
return true;
|
||||
}
|
||||
|
||||
addLookupText(resolveFrequencyLookupText(token));
|
||||
|
||||
const uniqueLookupTerms: string[] = [];
|
||||
const seen = new Set<string>();
|
||||
for (const term of lookupTextCandidates) {
|
||||
if (seen.has(term)) {
|
||||
continue;
|
||||
}
|
||||
seen.add(term);
|
||||
uniqueLookupTerms.push(term);
|
||||
}
|
||||
|
||||
return uniqueLookupTerms;
|
||||
return token.pos1 === "助詞" || token.pos1 === "助動詞";
|
||||
}
|
||||
|
||||
function applyFrequencyMarking(
|
||||
@@ -294,6 +279,10 @@ function applyFrequencyMarking(
|
||||
getFrequencyRank: FrequencyDictionaryLookup,
|
||||
): MergedToken[] {
|
||||
return tokens.map((token) => {
|
||||
if (isFrequencyExcludedByPos(token)) {
|
||||
return { ...token, frequencyRank: undefined };
|
||||
}
|
||||
|
||||
const lookupTexts = getFrequencyLookupTextCandidates(token);
|
||||
if (lookupTexts.length === 0) {
|
||||
return { ...token, frequencyRank: undefined };
|
||||
@@ -499,27 +488,6 @@ function extractYomitanHeadword(segment: YomitanParseSegment): string {
|
||||
return "";
|
||||
}
|
||||
|
||||
function extractYomitanHeadwords(segment: YomitanParseSegment): string[] {
|
||||
const headwords = segment.headwords;
|
||||
if (!isYomitanHeadwordRows(headwords)) {
|
||||
return [];
|
||||
}
|
||||
|
||||
const results: string[] = [];
|
||||
for (const group of headwords) {
|
||||
for (const candidate of group) {
|
||||
if (isString(candidate.term)) {
|
||||
const term = candidate.term.trim();
|
||||
if (term.length > 0) {
|
||||
results.push(term);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return results;
|
||||
}
|
||||
|
||||
function applyJlptMarking(
|
||||
tokens: MergedToken[],
|
||||
getJlptLevel: (text: string) => JlptLevel | null,
|
||||
@@ -575,41 +543,53 @@ function mapYomitanParseResultItemToMergedTokens(
|
||||
}
|
||||
validLineCount += 1;
|
||||
|
||||
let combinedSurface = "";
|
||||
let combinedReading = "";
|
||||
let combinedHeadword = "";
|
||||
|
||||
for (const segment of line) {
|
||||
const segmentText = segment.text;
|
||||
if (!segmentText || segmentText.length === 0) {
|
||||
continue;
|
||||
}
|
||||
|
||||
const start = charOffset;
|
||||
const end = start + segmentText.length;
|
||||
charOffset = end;
|
||||
|
||||
const headword = extractYomitanHeadword(segment) || segmentText;
|
||||
const frequencyLookupTerms = extractYomitanHeadwords(segment);
|
||||
|
||||
tokens.push({
|
||||
surface: segmentText,
|
||||
reading: typeof segment.reading === "string" ? segment.reading : "",
|
||||
headword,
|
||||
startPos: start,
|
||||
endPos: end,
|
||||
partOfSpeech: PartOfSpeech.other,
|
||||
pos1: "",
|
||||
isMerged: true,
|
||||
isNPlusOneTarget: false,
|
||||
isKnown: (() => {
|
||||
const matchText = resolveKnownWordText(
|
||||
segmentText,
|
||||
headword,
|
||||
knownWordMatchMode,
|
||||
);
|
||||
return matchText ? isKnownWord(matchText) : false;
|
||||
})(),
|
||||
frequencyLookupTerms:
|
||||
frequencyLookupTerms.length > 0 ? frequencyLookupTerms : undefined,
|
||||
});
|
||||
combinedSurface += segmentText;
|
||||
if (typeof segment.reading === "string") {
|
||||
combinedReading += segment.reading;
|
||||
}
|
||||
if (!combinedHeadword) {
|
||||
combinedHeadword = extractYomitanHeadword(segment);
|
||||
}
|
||||
}
|
||||
|
||||
if (!combinedSurface) {
|
||||
continue;
|
||||
}
|
||||
|
||||
const start = charOffset;
|
||||
const end = start + combinedSurface.length;
|
||||
charOffset = end;
|
||||
const headword = combinedHeadword || combinedSurface;
|
||||
|
||||
tokens.push({
|
||||
surface: combinedSurface,
|
||||
reading: combinedReading,
|
||||
headword,
|
||||
startPos: start,
|
||||
endPos: end,
|
||||
partOfSpeech: PartOfSpeech.other,
|
||||
pos1: "",
|
||||
isMerged: true,
|
||||
isNPlusOneTarget: false,
|
||||
isKnown: (() => {
|
||||
const matchText = resolveKnownWordText(
|
||||
combinedSurface,
|
||||
headword,
|
||||
knownWordMatchMode,
|
||||
);
|
||||
return matchText ? isKnownWord(matchText) : false;
|
||||
})(),
|
||||
});
|
||||
}
|
||||
|
||||
if (validLineCount === 0 || tokens.length === 0) {
|
||||
@@ -641,13 +621,52 @@ function selectBestYomitanParseCandidate(
|
||||
current.tokens.length > best.tokens.length ? current : best,
|
||||
);
|
||||
|
||||
const getCandidateScore = (candidate: YomitanParseCandidate): number => {
|
||||
const readableTokenCount = candidate.tokens.filter(
|
||||
(token) => token.reading.trim().length > 0,
|
||||
).length;
|
||||
const suspiciousKanaFragmentCount = candidate.tokens.filter((token) =>
|
||||
token.reading.trim().length === 0 &&
|
||||
token.surface.length >= 2 &&
|
||||
Array.from(token.surface).every((char) => isKanaChar(char))
|
||||
).length;
|
||||
|
||||
return (
|
||||
readableTokenCount * 100 -
|
||||
suspiciousKanaFragmentCount * 50 -
|
||||
candidate.tokens.length
|
||||
);
|
||||
};
|
||||
|
||||
const chooseBestCandidate = (
|
||||
items: YomitanParseCandidate[],
|
||||
): YomitanParseCandidate | null => {
|
||||
if (items.length === 0) {
|
||||
return null;
|
||||
}
|
||||
|
||||
return items.reduce((best, current) => {
|
||||
const bestScore = getCandidateScore(best);
|
||||
const currentScore = getCandidateScore(current);
|
||||
if (currentScore !== bestScore) {
|
||||
return currentScore > bestScore ? current : best;
|
||||
}
|
||||
|
||||
if (current.tokens.length !== best.tokens.length) {
|
||||
return current.tokens.length < best.tokens.length ? current : best;
|
||||
}
|
||||
|
||||
return best;
|
||||
});
|
||||
};
|
||||
|
||||
if (scanningCandidates.length > 0) {
|
||||
const bestScanning = getBestByTokenCount(scanningCandidates);
|
||||
if (bestScanning && bestScanning.tokens.length > 1) {
|
||||
return bestScanning.tokens;
|
||||
}
|
||||
|
||||
const bestMecab = getBestByTokenCount(mecabCandidates);
|
||||
const bestMecab = chooseBestCandidate(mecabCandidates);
|
||||
if (
|
||||
bestMecab &&
|
||||
bestMecab.tokens.length > (bestScanning?.tokens.length ?? 0)
|
||||
@@ -658,7 +677,11 @@ function selectBestYomitanParseCandidate(
|
||||
return bestScanning ? bestScanning.tokens : null;
|
||||
}
|
||||
|
||||
const bestCandidate = getBestByTokenCount(candidates);
|
||||
const multiTokenCandidates = candidates.filter(
|
||||
(candidate) => candidate.tokens.length > 1,
|
||||
);
|
||||
const pool = multiTokenCandidates.length > 0 ? multiTokenCandidates : candidates;
|
||||
const bestCandidate = chooseBestCandidate(pool);
|
||||
return bestCandidate ? bestCandidate.tokens : null;
|
||||
}
|
||||
|
||||
@@ -688,6 +711,25 @@ function mapYomitanParseResultsToMergedTokens(
|
||||
return bestCandidate;
|
||||
}
|
||||
|
||||
/**
 * Emits one `logger.info` record describing the token groups selected from a
 * Yomitan parse.
 *
 * The record carries the original text, the number of tokens, and for each
 * token its index, surface, headword, reading, startPos and endPos.
 *
 * @param text - The text that was tokenized; logged as-is.
 * @param tokens - The selected tokens; nothing is logged when this is falsy or empty.
 */
function logSelectedYomitanGroups(text: string, tokens: MergedToken[]): void {
|
||||
// Nothing to report for a missing or empty token list.
if (!tokens || tokens.length === 0) {
|
||||
return;
|
||||
}
|
||||
|
||||
logger.info("Selected Yomitan token groups", {
|
||||
text,
|
||||
tokenCount: tokens.length,
|
||||
groups: tokens.map((token, index) => ({
|
||||
index,
|
||||
surface: token.surface,
|
||||
headword: token.headword,
|
||||
reading: token.reading,
|
||||
startPos: token.startPos,
|
||||
endPos: token.endPos,
|
||||
})),
|
||||
});
|
||||
}
|
||||
|
||||
function pickClosestMecabPos1(
|
||||
token: MergedToken,
|
||||
mecabTokens: MergedToken[],
|
||||
@@ -930,6 +972,10 @@ async function parseWithYomitanInternalParser(
|
||||
return null;
|
||||
}
|
||||
|
||||
if (deps.getYomitanGroupDebugEnabled?.() === true) {
|
||||
logSelectedYomitanGroups(text, yomitanTokens);
|
||||
}
|
||||
|
||||
return enrichYomitanPos1(yomitanTokens, deps, text);
|
||||
} catch (err) {
|
||||
console.error("Yomitan parser request failed:", (err as Error).message);
|
||||
|
||||
Reference in New Issue
Block a user