refactor(tokenizer): split pipeline into explicit stages

commit b71a1a3d29
parent 7e1a7df403
Date: 2026-02-21 15:51:37 -08:00
10 changed files with 1368 additions and 952 deletions

(File diff suppressed because it is too large.)


@@ -0,0 +1,159 @@
import assert from 'node:assert/strict';
import test from 'node:test';
import { MergedToken, PartOfSpeech } from '../../../types';
import { annotateTokens, AnnotationStageDeps } from './annotation-stage';
function makeToken(overrides: Partial<MergedToken> = {}): MergedToken {
return {
surface: '猫',
reading: 'ネコ',
headword: '猫',
startPos: 0,
endPos: 1,
partOfSpeech: PartOfSpeech.noun,
isMerged: false,
isKnown: false,
isNPlusOneTarget: false,
...overrides,
};
}
function makeDeps(overrides: Partial<AnnotationStageDeps> = {}): AnnotationStageDeps {
return {
isKnownWord: () => false,
knownWordMatchMode: 'headword',
getJlptLevel: () => null,
...overrides,
};
}
test('annotateTokens known-word match mode uses headword vs surface', () => {
const tokens = [makeToken({ surface: '食べた', headword: '食べる', reading: 'タベタ' })];
const isKnownWord = (text: string): boolean => text === '食べる';
const headwordResult = annotateTokens(
tokens,
makeDeps({
isKnownWord,
knownWordMatchMode: 'headword',
}),
);
const surfaceResult = annotateTokens(
tokens,
makeDeps({
isKnownWord,
knownWordMatchMode: 'surface',
}),
);
assert.equal(headwordResult[0]?.isKnown, true);
assert.equal(surfaceResult[0]?.isKnown, false);
});
test('annotateTokens excludes frequency for particle/bound_auxiliary and pos1 exclusions', () => {
const lookupCalls: string[] = [];
const tokens = [
makeToken({ surface: 'は', headword: 'は', partOfSpeech: PartOfSpeech.particle }),
makeToken({
surface: 'です',
headword: 'です',
partOfSpeech: PartOfSpeech.bound_auxiliary,
startPos: 1,
endPos: 3,
}),
makeToken({
surface: 'の',
headword: 'の',
partOfSpeech: PartOfSpeech.other,
pos1: '助詞',
startPos: 3,
endPos: 4,
}),
makeToken({
surface: '猫',
headword: '猫',
partOfSpeech: PartOfSpeech.noun,
startPos: 4,
endPos: 5,
}),
];
const result = annotateTokens(
tokens,
makeDeps({
getFrequencyRank: (text) => {
lookupCalls.push(text);
return text === '猫' ? 11 : 999;
},
}),
);
assert.equal(result[0]?.frequencyRank, undefined);
assert.equal(result[1]?.frequencyRank, undefined);
assert.equal(result[2]?.frequencyRank, undefined);
assert.equal(result[3]?.frequencyRank, 11);
assert.deepEqual(lookupCalls, ['猫']);
});
test('annotateTokens handles JLPT disabled and eligibility exclusion paths', () => {
let disabledLookupCalls = 0;
const disabledResult = annotateTokens(
[makeToken({ surface: '猫', headword: '猫' })],
makeDeps({
getJlptLevel: () => {
disabledLookupCalls += 1;
return 'N5';
},
}),
{ jlptEnabled: false },
);
assert.equal(disabledResult[0]?.jlptLevel, undefined);
assert.equal(disabledLookupCalls, 0);
let excludedLookupCalls = 0;
const excludedResult = annotateTokens(
[
makeToken({
surface: '',
headword: '',
reading: '',
pos1: '記号',
partOfSpeech: PartOfSpeech.symbol,
}),
],
makeDeps({
getJlptLevel: () => {
excludedLookupCalls += 1;
return 'N5';
},
}),
);
assert.equal(excludedResult[0]?.jlptLevel, undefined);
assert.equal(excludedLookupCalls, 0);
});
test('annotateTokens N+1 handoff marks expected target when threshold is satisfied', () => {
const tokens = [
makeToken({ surface: '私', headword: '私', startPos: 0, endPos: 1 }),
makeToken({ surface: '猫', headword: '猫', startPos: 1, endPos: 2 }),
makeToken({
surface: '見る',
headword: '見る',
partOfSpeech: PartOfSpeech.verb,
startPos: 2,
endPos: 4,
}),
];
const result = annotateTokens(
tokens,
makeDeps({
isKnownWord: (text) => text === '私' || text === '見る',
}),
{ minSentenceWordsForNPlusOne: 3 },
);
assert.equal(result[0]?.isNPlusOneTarget, false);
assert.equal(result[1]?.isNPlusOneTarget, true);
assert.equal(result[2]?.isNPlusOneTarget, false);
});


@@ -0,0 +1,375 @@
import { markNPlusOneTargets } from '../../../token-merger';
import {
FrequencyDictionaryLookup,
JlptLevel,
MergedToken,
NPlusOneMatchMode,
PartOfSpeech,
} from '../../../types';
import { shouldIgnoreJlptByTerm, shouldIgnoreJlptForMecabPos1 } from '../jlpt-token-filter';
const KATAKANA_TO_HIRAGANA_OFFSET = 0x60;
const KATAKANA_CODEPOINT_START = 0x30a1;
const KATAKANA_CODEPOINT_END = 0x30f6;
const JLPT_LEVEL_LOOKUP_CACHE_LIMIT = 2048;
const FREQUENCY_RANK_LOOKUP_CACHE_LIMIT = 2048;
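// Lookup caches are keyed by the injected lookup function itself, so each
// dependency set gets an isolated cache that is garbage-collected along with
// its function. Eviction is FIFO: once a cache grows past its limit, the
// oldest-inserted entries are dropped first.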
const jlptLevelLookupCaches = new WeakMap<
(text: string) => JlptLevel | null,
Map<string, JlptLevel | null>
>();
const frequencyRankLookupCaches = new WeakMap<
FrequencyDictionaryLookup,
Map<string, number | null>
>();
export interface AnnotationStageDeps {
isKnownWord: (text: string) => boolean;
knownWordMatchMode: NPlusOneMatchMode;
getJlptLevel: (text: string) => JlptLevel | null;
getFrequencyRank?: FrequencyDictionaryLookup;
}
export interface AnnotationStageOptions {
jlptEnabled?: boolean;
frequencyEnabled?: boolean;
minSentenceWordsForNPlusOne?: number;
}
function resolveKnownWordText(
surface: string,
headword: string,
matchMode: NPlusOneMatchMode,
): string {
return matchMode === 'surface' ? surface : headword;
}
function applyKnownWordMarking(
tokens: MergedToken[],
isKnownWord: (text: string) => boolean,
knownWordMatchMode: NPlusOneMatchMode,
): MergedToken[] {
return tokens.map((token) => {
const matchText = resolveKnownWordText(token.surface, token.headword, knownWordMatchMode);
return {
...token,
isKnown: token.isKnown || (matchText ? isKnownWord(matchText) : false),
};
});
}
function normalizeFrequencyLookupText(rawText: string): string {
return rawText.trim().toLowerCase();
}
function getCachedFrequencyRank(
lookupText: string,
getFrequencyRank: FrequencyDictionaryLookup,
): number | null {
const normalizedText = normalizeFrequencyLookupText(lookupText);
if (!normalizedText) {
return null;
}
let cache = frequencyRankLookupCaches.get(getFrequencyRank);
if (!cache) {
cache = new Map<string, number | null>();
frequencyRankLookupCaches.set(getFrequencyRank, cache);
}
if (cache.has(normalizedText)) {
return cache.get(normalizedText) ?? null;
}
let rank: number | null;
try {
rank = getFrequencyRank(normalizedText);
} catch {
rank = null;
}
if (rank !== null && (!Number.isFinite(rank) || rank <= 0)) {
rank = null;
}
cache.set(normalizedText, rank);
while (cache.size > FREQUENCY_RANK_LOOKUP_CACHE_LIMIT) {
const firstKey = cache.keys().next().value;
if (firstKey !== undefined) {
cache.delete(firstKey);
}
}
return rank;
}
function resolveFrequencyLookupText(token: MergedToken): string {
if (token.headword && token.headword.length > 0) {
return token.headword;
}
if (token.reading && token.reading.length > 0) {
return token.reading;
}
return token.surface;
}
function getFrequencyLookupTextCandidates(token: MergedToken): string[] {
const lookupText = resolveFrequencyLookupText(token).trim();
return lookupText ? [lookupText] : [];
}
function isFrequencyExcludedByPos(token: MergedToken): boolean {
if (
token.partOfSpeech === PartOfSpeech.particle ||
token.partOfSpeech === PartOfSpeech.bound_auxiliary
) {
return true;
}
return token.pos1 === '助詞' || token.pos1 === '助動詞';
}
function applyFrequencyMarking(
tokens: MergedToken[],
getFrequencyRank: FrequencyDictionaryLookup,
): MergedToken[] {
return tokens.map((token) => {
if (isFrequencyExcludedByPos(token)) {
return { ...token, frequencyRank: undefined };
}
const lookupTexts = getFrequencyLookupTextCandidates(token);
if (lookupTexts.length === 0) {
return { ...token, frequencyRank: undefined };
}
let bestRank: number | null = null;
for (const lookupText of lookupTexts) {
const rank = getCachedFrequencyRank(lookupText, getFrequencyRank);
if (rank === null) {
continue;
}
if (bestRank === null || rank < bestRank) {
bestRank = rank;
}
}
return {
...token,
frequencyRank: bestRank ?? undefined,
};
});
}
function getCachedJlptLevel(
lookupText: string,
getJlptLevel: (text: string) => JlptLevel | null,
): JlptLevel | null {
const normalizedText = lookupText.trim();
if (!normalizedText) {
return null;
}
let cache = jlptLevelLookupCaches.get(getJlptLevel);
if (!cache) {
cache = new Map<string, JlptLevel | null>();
jlptLevelLookupCaches.set(getJlptLevel, cache);
}
if (cache.has(normalizedText)) {
return cache.get(normalizedText) ?? null;
}
let level: JlptLevel | null;
try {
level = getJlptLevel(normalizedText);
} catch {
level = null;
}
cache.set(normalizedText, level);
while (cache.size > JLPT_LEVEL_LOOKUP_CACHE_LIMIT) {
const firstKey = cache.keys().next().value;
if (firstKey !== undefined) {
cache.delete(firstKey);
}
}
return level;
}
function resolveJlptLookupText(token: MergedToken): string {
if (token.headword && token.headword.length > 0) {
return token.headword;
}
if (token.reading && token.reading.length > 0) {
return token.reading;
}
return token.surface;
}
function normalizeJlptTextForExclusion(text: string): string {
const raw = text.trim();
if (!raw) {
return '';
}
let normalized = '';
for (const char of raw) {
const code = char.codePointAt(0);
if (code === undefined) {
continue;
}
if (code >= KATAKANA_CODEPOINT_START && code <= KATAKANA_CODEPOINT_END) {
normalized += String.fromCodePoint(code - KATAKANA_TO_HIRAGANA_OFFSET);
continue;
}
normalized += char;
}
return normalized;
}
function isKanaChar(char: string): boolean {
const code = char.codePointAt(0);
if (code === undefined) {
return false;
}
return (
(code >= 0x3041 && code <= 0x3096) ||
(code >= 0x309b && code <= 0x309f) ||
(code >= 0x30a0 && code <= 0x30fa) ||
(code >= 0x30fd && code <= 0x30ff)
);
}
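// Heuristic for kana SFX/onomatopoeia runs (e.g. 「ざわざわ」): all-kana text
// where some character repeats adjacently, or a single character accounts
// for at least half of the string.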
function isRepeatedKanaSfx(text: string): boolean {
const normalized = text.trim();
if (!normalized) {
return false;
}
const chars = [...normalized];
if (!chars.every(isKanaChar)) {
return false;
}
const counts = new Map<string, number>();
let hasAdjacentRepeat = false;
for (let i = 0; i < chars.length; i += 1) {
const char = chars[i]!;
counts.set(char, (counts.get(char) ?? 0) + 1);
if (i > 0 && chars[i] === chars[i - 1]) {
hasAdjacentRepeat = true;
}
}
const topCount = Math.max(...counts.values());
if (chars.length <= 2) {
return hasAdjacentRepeat || topCount >= 2;
}
if (hasAdjacentRepeat) {
return true;
}
return topCount >= Math.ceil(chars.length / 2);
}
function isJlptEligibleToken(token: MergedToken): boolean {
if (token.pos1 && shouldIgnoreJlptForMecabPos1(token.pos1)) {
return false;
}
const candidates = [
resolveJlptLookupText(token),
token.surface,
token.reading,
token.headword,
].filter(
(candidate): candidate is string => typeof candidate === 'string' && candidate.length > 0,
);
for (const candidate of candidates) {
const normalizedCandidate = normalizeJlptTextForExclusion(candidate);
if (!normalizedCandidate) {
continue;
}
const trimmedCandidate = candidate.trim();
if (shouldIgnoreJlptByTerm(trimmedCandidate) || shouldIgnoreJlptByTerm(normalizedCandidate)) {
return false;
}
if (isRepeatedKanaSfx(candidate) || isRepeatedKanaSfx(normalizedCandidate)) {
return false;
}
}
return true;
}
function applyJlptMarking(
tokens: MergedToken[],
getJlptLevel: (text: string) => JlptLevel | null,
): MergedToken[] {
return tokens.map((token) => {
if (!isJlptEligibleToken(token)) {
return { ...token, jlptLevel: undefined };
}
const primaryLevel = getCachedJlptLevel(resolveJlptLookupText(token), getJlptLevel);
const fallbackLevel =
primaryLevel === null ? getCachedJlptLevel(token.surface, getJlptLevel) : null;
return {
...token,
jlptLevel: primaryLevel ?? fallbackLevel ?? token.jlptLevel,
};
});
}
export function annotateTokens(
tokens: MergedToken[],
deps: AnnotationStageDeps,
options: AnnotationStageOptions = {},
): MergedToken[] {
const knownMarkedTokens = applyKnownWordMarking(
tokens,
deps.isKnownWord,
deps.knownWordMatchMode,
);
const frequencyEnabled = options.frequencyEnabled !== false;
const frequencyMarkedTokens =
frequencyEnabled && deps.getFrequencyRank
? applyFrequencyMarking(knownMarkedTokens, deps.getFrequencyRank)
: knownMarkedTokens.map((token) => ({
...token,
frequencyRank: undefined,
}));
const jlptEnabled = options.jlptEnabled !== false;
const jlptMarkedTokens = jlptEnabled
? applyJlptMarking(frequencyMarkedTokens, deps.getJlptLevel)
: frequencyMarkedTokens.map((token) => ({
...token,
jlptLevel: undefined,
}));
const minSentenceWordsForNPlusOne = options.minSentenceWordsForNPlusOne;
const sanitizedMinSentenceWordsForNPlusOne =
minSentenceWordsForNPlusOne !== undefined &&
Number.isInteger(minSentenceWordsForNPlusOne) &&
minSentenceWordsForNPlusOne > 0
? minSentenceWordsForNPlusOne
: 3;
return markNPlusOneTargets(jlptMarkedTokens, sanitizedMinSentenceWordsForNPlusOne);
}
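
For orientation, a minimal wiring sketch for this stage, reusing the module's own imports. The `knownWords` set and the inline dictionary lambdas are hypothetical stand-ins for the services the real pipeline injects; the token literal mirrors the test fixtures above.

const knownWords = new Set(['食べる', '見る']);
const deps: AnnotationStageDeps = {
  isKnownWord: (text) => knownWords.has(text),
  knownWordMatchMode: 'headword',
  getJlptLevel: (text) => (text === '猫' ? 'N5' : null),
  getFrequencyRank: (text) => (text === '猫' ? 11 : null),
};
const tokens: MergedToken[] = [
  {
    surface: '猫',
    reading: 'ネコ',
    headword: '猫',
    startPos: 0,
    endPos: 1,
    partOfSpeech: PartOfSpeech.noun,
    isMerged: false,
    isKnown: false,
    isNPlusOneTarget: false,
  },
];
// Stages run in order: known-word marking, frequency ranks, JLPT levels,
// then N+1 target selection with a sanitized minimum sentence word count.
const annotated = annotateTokens(tokens, deps, { minSentenceWordsForNPlusOne: 3 });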


@@ -0,0 +1,49 @@
import test from 'node:test';
import assert from 'node:assert/strict';
import { MergedToken, PartOfSpeech } from '../../../types';
import { enrichTokensWithMecabPos1 } from './parser-enrichment-stage';
function makeToken(overrides: Partial<MergedToken>): MergedToken {
return {
surface: 'token',
reading: '',
headword: 'token',
startPos: 0,
endPos: 1,
partOfSpeech: PartOfSpeech.other,
isMerged: true,
isKnown: false,
isNPlusOneTarget: false,
pos1: '',
...overrides,
};
}
test('enrichTokensWithMecabPos1 picks pos1 by best overlap when no surface match exists', () => {
const tokens = [makeToken({ surface: 'grouped', startPos: 2, endPos: 7 })];
const mecabTokens = [
makeToken({ surface: 'left', startPos: 0, endPos: 4, pos1: 'A' }),
makeToken({ surface: 'right', startPos: 2, endPos: 6, pos1: 'B' }),
];
const enriched = enrichTokensWithMecabPos1(tokens, mecabTokens);
assert.equal(enriched[0]?.pos1, 'B');
});
test('enrichTokensWithMecabPos1 fills missing pos1 using surface-sequence fallback', () => {
const tokens = [makeToken({ surface: ' は ', startPos: 10, endPos: 13 })];
const mecabTokens = [makeToken({ surface: 'は', startPos: 0, endPos: 1, pos1: '助詞' })];
const enriched = enrichTokensWithMecabPos1(tokens, mecabTokens);
assert.equal(enriched[0]?.pos1, '助詞');
});
test('enrichTokensWithMecabPos1 passes through unchanged when mecab tokens are null or empty', () => {
const tokens = [makeToken({ surface: '猫', startPos: 0, endPos: 1 })];
const nullResult = enrichTokensWithMecabPos1(tokens, null);
assert.strictEqual(nullResult, tokens);
const emptyResult = enrichTokensWithMecabPos1(tokens, []);
assert.strictEqual(emptyResult, tokens);
});


@@ -0,0 +1,167 @@
import { MergedToken } from '../../../types';
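// Best pos1 for a merged token: prefer an exact surface match nearest in
// position; otherwise take the MeCab token with the largest span overlap,
// breaking ties by start distance, then span length, then start position.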
function pickClosestMecabPos1(token: MergedToken, mecabTokens: MergedToken[]): string | undefined {
if (mecabTokens.length === 0) {
return undefined;
}
const tokenStart = token.startPos ?? 0;
const tokenEnd = token.endPos ?? tokenStart + token.surface.length;
let bestSurfaceMatchPos1: string | undefined;
let bestSurfaceMatchDistance = Number.MAX_SAFE_INTEGER;
let bestSurfaceMatchEndDistance = Number.MAX_SAFE_INTEGER;
for (const mecabToken of mecabTokens) {
if (!mecabToken.pos1) {
continue;
}
if (mecabToken.surface !== token.surface) {
continue;
}
const mecabStart = mecabToken.startPos ?? 0;
const mecabEnd = mecabToken.endPos ?? mecabStart + mecabToken.surface.length;
const startDistance = Math.abs(mecabStart - tokenStart);
const endDistance = Math.abs(mecabEnd - tokenEnd);
if (
startDistance < bestSurfaceMatchDistance ||
(startDistance === bestSurfaceMatchDistance && endDistance < bestSurfaceMatchEndDistance)
) {
bestSurfaceMatchDistance = startDistance;
bestSurfaceMatchEndDistance = endDistance;
bestSurfaceMatchPos1 = mecabToken.pos1;
}
}
if (bestSurfaceMatchPos1) {
return bestSurfaceMatchPos1;
}
let bestPos1: string | undefined;
let bestOverlap = 0;
let bestSpan = 0;
let bestStartDistance = Number.MAX_SAFE_INTEGER;
let bestStart = Number.MAX_SAFE_INTEGER;
for (const mecabToken of mecabTokens) {
if (!mecabToken.pos1) {
continue;
}
const mecabStart = mecabToken.startPos ?? 0;
const mecabEnd = mecabToken.endPos ?? mecabStart + mecabToken.surface.length;
const overlapStart = Math.max(tokenStart, mecabStart);
const overlapEnd = Math.min(tokenEnd, mecabEnd);
const overlap = Math.max(0, overlapEnd - overlapStart);
if (overlap === 0) {
continue;
}
const span = mecabEnd - mecabStart;
if (
overlap > bestOverlap ||
(overlap === bestOverlap &&
(Math.abs(mecabStart - tokenStart) < bestStartDistance ||
(Math.abs(mecabStart - tokenStart) === bestStartDistance &&
(span > bestSpan || (span === bestSpan && mecabStart < bestStart)))))
) {
bestOverlap = overlap;
bestSpan = span;
bestStartDistance = Math.abs(mecabStart - tokenStart);
bestStart = mecabStart;
bestPos1 = mecabToken.pos1;
}
}
return bestOverlap > 0 ? bestPos1 : undefined;
}
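// Second pass for tokens the overlap stage left blank: scan MeCab tokens
// left to right, copying pos1 from the next unconsumed exact-surface match,
// and fall back to any earlier match once the cursor has passed them all.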
function fillMissingPos1BySurfaceSequence(
tokens: MergedToken[],
mecabTokens: MergedToken[],
): MergedToken[] {
const indexedMecabTokens = mecabTokens
.map((token, index) => ({ token, index }))
.filter(({ token }) => token.pos1 && token.surface.trim().length > 0);
if (indexedMecabTokens.length === 0) {
return tokens;
}
let cursor = 0;
return tokens.map((token) => {
if (token.pos1 && token.pos1.trim().length > 0) {
return token;
}
const surface = token.surface.trim();
if (!surface) {
return token;
}
let best: { pos1: string; index: number } | null = null;
for (const candidate of indexedMecabTokens) {
if (candidate.token.surface !== surface) {
continue;
}
if (candidate.index < cursor) {
continue;
}
best = { pos1: candidate.token.pos1 as string, index: candidate.index };
break;
}
if (!best) {
for (const candidate of indexedMecabTokens) {
if (candidate.token.surface !== surface) {
continue;
}
best = { pos1: candidate.token.pos1 as string, index: candidate.index };
break;
}
}
if (!best) {
return token;
}
cursor = best.index + 1;
return {
...token,
pos1: best.pos1,
};
});
}
export function enrichTokensWithMecabPos1(
tokens: MergedToken[],
mecabTokens: MergedToken[] | null,
): MergedToken[] {
if (!tokens || tokens.length === 0) {
return tokens;
}
if (!mecabTokens || mecabTokens.length === 0) {
return tokens;
}
const overlapEnriched = tokens.map((token) => {
if (token.pos1) {
return token;
}
const pos1 = pickClosestMecabPos1(token, mecabTokens);
if (!pos1) {
return token;
}
return {
...token,
pos1,
};
});
return fillMissingPos1BySurfaceSequence(overlapEnriched, mecabTokens);
}
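
A rough usage sketch; the two arrays are illustrative stand-ins for the outputs of the selection stage and a raw MeCab pass.

const primaryTokens: MergedToken[] = []; // selection-stage tokens, pos1 often ''
const mecabTokens: MergedToken[] = []; // raw MeCab tokens carrying pos1
// Positional overlap runs first; the surface-sequence fallback then fills
// whatever is still blank. Null or empty MeCab input is a pass-through:
const enriched = enrichTokensWithMecabPos1(primaryTokens, mecabTokens);
const untouched = enrichTokensWithMecabPos1(primaryTokens, null); // === primaryTokens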


@@ -0,0 +1,85 @@
import test from 'node:test';
import assert from 'node:assert/strict';
import { selectYomitanParseTokens } from './parser-selection-stage';
interface ParseSegmentInput {
text: string;
reading?: string;
headword?: string;
}
function makeParseItem(
source: string,
lines: ParseSegmentInput[][],
): {
source: string;
index: number;
content: Array<
Array<{ text: string; reading?: string; headwords?: Array<Array<{ term: string }>> }>
>;
} {
return {
source,
index: 0,
content: lines.map((line) =>
line.map((segment) => ({
text: segment.text,
reading: segment.reading,
headwords: segment.headword ? [[{ term: segment.headword }]] : undefined,
})),
),
};
}
test('prefers scanning parser when scanning candidate has more than one token', () => {
const parseResults = [
makeParseItem('scanning-parser', [
[{ text: '小園', reading: 'おうえん', headword: '小園' }],
[{ text: 'に', reading: 'に', headword: 'に' }],
]),
makeParseItem('mecab', [
[{ text: '小', reading: 'お', headword: '小' }],
[{ text: '園', reading: 'えん', headword: '園' }],
[{ text: 'に', reading: 'に', headword: 'に' }],
]),
];
const tokens = selectYomitanParseTokens(parseResults, () => false, 'headword');
assert.equal(tokens?.map((token) => token.surface).join(','), '小園,に');
});
test('prefers mecab candidate when scanning candidate is single token and mecab has better split', () => {
const parseResults = [
makeParseItem('scanning-parser', [
[{ text: '俺は公園にいきたい', reading: 'おれはこうえんにいきたい' }],
]),
makeParseItem('mecab', [
[{ text: '俺', reading: 'おれ', headword: '俺' }],
[{ text: 'は', reading: 'は', headword: 'は' }],
[{ text: '公園', reading: 'こうえん', headword: '公園' }],
[{ text: 'に', reading: 'に', headword: 'に' }],
[{ text: 'いきたい', reading: 'いきたい', headword: '行きたい' }],
]),
];
const tokens = selectYomitanParseTokens(parseResults, () => false, 'headword');
assert.equal(tokens?.map((token) => token.surface).join(','), '俺,は,公園,に,いきたい');
});
test('tie-break prefers fewer suspicious kana fragments', () => {
const parseResults = [
makeParseItem('mecab-fragmented', [
[{ text: '俺', reading: 'おれ', headword: '俺' }],
[{ text: 'にい', reading: '', headword: '兄' }],
[{ text: 'きたい', reading: '', headword: '期待' }],
]),
makeParseItem('mecab', [
[{ text: '俺', reading: 'おれ', headword: '俺' }],
[{ text: 'に', reading: 'に', headword: 'に' }],
[{ text: '行きたい', reading: 'いきたい', headword: '行きたい' }],
]),
];
const tokens = selectYomitanParseTokens(parseResults, () => false, 'headword');
assert.equal(tokens?.map((token) => token.surface).join(','), '俺,に,行きたい');
});


@@ -0,0 +1,281 @@
import { MergedToken, NPlusOneMatchMode, PartOfSpeech } from '../../../types';
interface YomitanParseHeadword {
term?: unknown;
}
interface YomitanParseSegment {
text?: string;
reading?: string;
headwords?: unknown;
}
interface YomitanParseResultItem {
source?: unknown;
index?: unknown;
content?: unknown;
}
type YomitanParseLine = YomitanParseSegment[];
export interface YomitanParseCandidate {
source: string;
index: number;
tokens: MergedToken[];
}
function isObject(value: unknown): value is Record<string, unknown> {
return Boolean(value && typeof value === 'object');
}
function isString(value: unknown): value is string {
return typeof value === 'string';
}
function resolveKnownWordText(
surface: string,
headword: string,
matchMode: NPlusOneMatchMode,
): string {
return matchMode === 'surface' ? surface : headword;
}
function isKanaChar(char: string): boolean {
const code = char.codePointAt(0);
if (code === undefined) {
return false;
}
return (
(code >= 0x3041 && code <= 0x3096) ||
(code >= 0x309b && code <= 0x309f) ||
(code >= 0x30a0 && code <= 0x30fa) ||
(code >= 0x30fd && code <= 0x30ff)
);
}
function isYomitanParseLine(value: unknown): value is YomitanParseLine {
if (!Array.isArray(value)) {
return false;
}
return value.every((segment) => {
if (!isObject(segment)) {
return false;
}
const candidate = segment as YomitanParseSegment;
return isString(candidate.text);
});
}
export function isYomitanParseResultItem(value: unknown): value is YomitanParseResultItem {
if (!isObject(value)) {
return false;
}
if (!isString((value as YomitanParseResultItem).source)) {
return false;
}
if (!Array.isArray((value as YomitanParseResultItem).content)) {
return false;
}
return true;
}
function isYomitanHeadwordRows(value: unknown): value is YomitanParseHeadword[][] {
return (
Array.isArray(value) &&
value.every(
(group) =>
Array.isArray(group) &&
group.every((item) => isObject(item) && isString((item as YomitanParseHeadword).term)),
)
);
}
function extractYomitanHeadword(segment: YomitanParseSegment): string {
const headwords = segment.headwords;
if (!isYomitanHeadwordRows(headwords)) {
return '';
}
for (const group of headwords) {
if (group.length > 0) {
const firstHeadword = group[0] as YomitanParseHeadword;
if (isString(firstHeadword?.term)) {
return firstHeadword.term;
}
}
}
return '';
}
export function mapYomitanParseResultItemToMergedTokens(
parseResult: YomitanParseResultItem,
isKnownWord: (text: string) => boolean,
knownWordMatchMode: NPlusOneMatchMode,
): YomitanParseCandidate | null {
const content = parseResult.content;
if (!Array.isArray(content) || content.length === 0) {
return null;
}
const source = String(parseResult.source ?? '');
const index =
typeof parseResult.index === 'number' && Number.isInteger(parseResult.index)
? parseResult.index
: 0;
const tokens: MergedToken[] = [];
let charOffset = 0;
let validLineCount = 0;
for (const line of content) {
if (!isYomitanParseLine(line)) {
continue;
}
validLineCount += 1;
let combinedSurface = '';
let combinedReading = '';
let combinedHeadword = '';
for (const segment of line) {
const segmentText = segment.text;
if (!segmentText || segmentText.length === 0) {
continue;
}
combinedSurface += segmentText;
if (typeof segment.reading === 'string') {
combinedReading += segment.reading;
}
if (!combinedHeadword) {
combinedHeadword = extractYomitanHeadword(segment);
}
}
if (!combinedSurface) {
continue;
}
const start = charOffset;
const end = start + combinedSurface.length;
charOffset = end;
const headword = combinedHeadword || combinedSurface;
const matchText = resolveKnownWordText(combinedSurface, headword, knownWordMatchMode);
tokens.push({
surface: combinedSurface,
reading: combinedReading,
headword,
startPos: start,
endPos: end,
partOfSpeech: PartOfSpeech.other,
pos1: '',
isMerged: true,
isNPlusOneTarget: false,
isKnown: matchText ? isKnownWord(matchText) : false,
});
}
if (validLineCount === 0 || tokens.length === 0) {
return null;
}
return { source, index, tokens };
}
export function selectBestYomitanParseCandidate(
candidates: YomitanParseCandidate[],
): MergedToken[] | null {
if (candidates.length === 0) {
return null;
}
const scanningCandidates = candidates.filter(
(candidate) => candidate.source === 'scanning-parser',
);
const mecabCandidates = candidates.filter((candidate) => candidate.source === 'mecab');
const getBestByTokenCount = (items: YomitanParseCandidate[]): YomitanParseCandidate | null =>
items.length === 0
? null
: items.reduce((best, current) =>
current.tokens.length > best.tokens.length ? current : best,
);
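// Heuristic score: a reading is strong evidence of a good lookup (+100 per
// token), all-kana fragments with no reading suggest over-splitting (-50
// each), and total token count is a mild tie-breaker toward coarser splits.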
const getCandidateScore = (candidate: YomitanParseCandidate): number => {
const readableTokenCount = candidate.tokens.filter(
(token) => token.reading.trim().length > 0,
).length;
const suspiciousKanaFragmentCount = candidate.tokens.filter(
(token) =>
token.reading.trim().length === 0 &&
token.surface.length >= 2 &&
Array.from(token.surface).every((char) => isKanaChar(char)),
).length;
return readableTokenCount * 100 - suspiciousKanaFragmentCount * 50 - candidate.tokens.length;
};
const chooseBestCandidate = (items: YomitanParseCandidate[]): YomitanParseCandidate | null => {
if (items.length === 0) {
return null;
}
return items.reduce((best, current) => {
const bestScore = getCandidateScore(best);
const currentScore = getCandidateScore(current);
if (currentScore !== bestScore) {
return currentScore > bestScore ? current : best;
}
if (current.tokens.length !== best.tokens.length) {
return current.tokens.length < best.tokens.length ? current : best;
}
return best;
});
};
if (scanningCandidates.length > 0) {
const bestScanning = getBestByTokenCount(scanningCandidates);
if (bestScanning && bestScanning.tokens.length > 1) {
return bestScanning.tokens;
}
const bestMecab = chooseBestCandidate(mecabCandidates);
if (bestMecab && bestMecab.tokens.length > (bestScanning?.tokens.length ?? 0)) {
return bestMecab.tokens;
}
return bestScanning ? bestScanning.tokens : null;
}
const multiTokenCandidates = candidates.filter((candidate) => candidate.tokens.length > 1);
const pool = multiTokenCandidates.length > 0 ? multiTokenCandidates : candidates;
const bestCandidate = chooseBestCandidate(pool);
return bestCandidate ? bestCandidate.tokens : null;
}
export function selectYomitanParseTokens(
parseResults: unknown,
isKnownWord: (text: string) => boolean,
knownWordMatchMode: NPlusOneMatchMode,
): MergedToken[] | null {
if (!Array.isArray(parseResults) || parseResults.length === 0) {
return null;
}
const candidates = parseResults
.filter((item): item is YomitanParseResultItem => isYomitanParseResultItem(item))
.map((item) => mapYomitanParseResultItemToMergedTokens(item, isKnownWord, knownWordMatchMode))
.filter((candidate): candidate is YomitanParseCandidate => candidate !== null);
const bestCandidate = selectBestYomitanParseCandidate(candidates);
return bestCandidate;
}
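
A short sketch of the entry point; `rawParseResults` is assumed to be whatever the parser window returned, and the selector validates its shape internally.

declare const rawParseResults: unknown; // e.g. from requestYomitanParseResults
const knownWords = new Set(['俺']);
const selected = selectYomitanParseTokens(
  rawParseResults,
  (text) => knownWords.has(text),
  'headword',
);
if (selected === null) {
  // No candidate produced usable tokens; fall back to another tokenizer.
}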


@@ -0,0 +1,154 @@
import type { BrowserWindow, Extension } from 'electron';
interface LoggerLike {
error: (message: string, ...args: unknown[]) => void;
}
interface YomitanParserRuntimeDeps {
getYomitanExt: () => Extension | null;
getYomitanParserWindow: () => BrowserWindow | null;
setYomitanParserWindow: (window: BrowserWindow | null) => void;
getYomitanParserReadyPromise: () => Promise<void> | null;
setYomitanParserReadyPromise: (promise: Promise<void> | null) => void;
getYomitanParserInitPromise: () => Promise<boolean> | null;
setYomitanParserInitPromise: (promise: Promise<boolean> | null) => void;
}
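// Lazily creates (or reuses) a hidden BrowserWindow hosting the Yomitan
// extension's search page. Concurrent callers share the single in-flight
// init promise, so at most one window is ever constructed.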
async function ensureYomitanParserWindow(
deps: YomitanParserRuntimeDeps,
logger: LoggerLike,
): Promise<boolean> {
const electron = await import('electron');
const yomitanExt = deps.getYomitanExt();
if (!yomitanExt) {
return false;
}
const currentWindow = deps.getYomitanParserWindow();
if (currentWindow && !currentWindow.isDestroyed()) {
return true;
}
const existingInitPromise = deps.getYomitanParserInitPromise();
if (existingInitPromise) {
return existingInitPromise;
}
const initPromise = (async () => {
const { BrowserWindow, session } = electron;
const parserWindow = new BrowserWindow({
show: false,
width: 800,
height: 600,
webPreferences: {
contextIsolation: true,
nodeIntegration: false,
session: session.defaultSession,
},
});
deps.setYomitanParserWindow(parserWindow);
deps.setYomitanParserReadyPromise(
new Promise((resolve, reject) => {
parserWindow.webContents.once('did-finish-load', () => resolve());
parserWindow.webContents.once('did-fail-load', (_event, _errorCode, errorDescription) => {
reject(new Error(errorDescription));
});
}),
);
parserWindow.on('closed', () => {
if (deps.getYomitanParserWindow() === parserWindow) {
deps.setYomitanParserWindow(null);
deps.setYomitanParserReadyPromise(null);
}
});
try {
await parserWindow.loadURL(`chrome-extension://${yomitanExt.id}/search.html`);
const readyPromise = deps.getYomitanParserReadyPromise();
if (readyPromise) {
await readyPromise;
}
return true;
} catch (err) {
logger.error('Failed to initialize Yomitan parser window:', (err as Error).message);
if (!parserWindow.isDestroyed()) {
parserWindow.destroy();
}
if (deps.getYomitanParserWindow() === parserWindow) {
deps.setYomitanParserWindow(null);
deps.setYomitanParserReadyPromise(null);
}
return false;
} finally {
deps.setYomitanParserInitPromise(null);
}
})();
deps.setYomitanParserInitPromise(initPromise);
return initPromise;
}
export async function requestYomitanParseResults(
text: string,
deps: YomitanParserRuntimeDeps,
logger: LoggerLike,
): Promise<unknown[] | null> {
const yomitanExt = deps.getYomitanExt();
if (!text || !yomitanExt) {
return null;
}
const isReady = await ensureYomitanParserWindow(deps, logger);
const parserWindow = deps.getYomitanParserWindow();
if (!isReady || !parserWindow || parserWindow.isDestroyed()) {
return null;
}
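// The injected script runs inside the extension page and talks to the
// Yomitan backend via chrome.runtime.sendMessage, resolving with the raw
// parseText result.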
const script = `
(async () => {
const invoke = (action, params) =>
new Promise((resolve, reject) => {
chrome.runtime.sendMessage({ action, params }, (response) => {
if (chrome.runtime.lastError) {
reject(new Error(chrome.runtime.lastError.message));
return;
}
if (!response || typeof response !== "object") {
reject(new Error("Invalid response from Yomitan backend"));
return;
}
if (response.error) {
reject(new Error(response.error.message || "Yomitan backend error"));
return;
}
resolve(response.result);
});
});
const optionsFull = await invoke("optionsGetFull", undefined);
const profileIndex = optionsFull.profileCurrent;
const scanLength =
optionsFull.profiles?.[profileIndex]?.options?.scanning?.length ?? 40;
return await invoke("parseText", {
text: ${JSON.stringify(text)},
optionsContext: { index: profileIndex },
scanLength,
useInternalParser: true,
useMecabParser: true
});
})();
`;
try {
const parseResults = await parserWindow.webContents.executeJavaScript(script, true);
return Array.isArray(parseResults) ? parseResults : null;
} catch (err) {
logger.error('Yomitan parser request failed:', (err as Error).message);
return null;
}
}
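
For context, a minimal sketch of how a caller might satisfy the deps bag with module-level state. `loadedYomitanExtension` is a hypothetical handle to the installed extension, and `YomitanParserRuntimeDeps` would need to be exported for this to compile as-is.

declare const loadedYomitanExtension: Extension | null; // hypothetical handle
let parserWindow: BrowserWindow | null = null;
let readyPromise: Promise<void> | null = null;
let initPromise: Promise<boolean> | null = null;
const deps: YomitanParserRuntimeDeps = {
  getYomitanExt: () => loadedYomitanExtension,
  getYomitanParserWindow: () => parserWindow,
  setYomitanParserWindow: (window) => { parserWindow = window; },
  getYomitanParserReadyPromise: () => readyPromise,
  setYomitanParserReadyPromise: (promise) => { readyPromise = promise; },
  getYomitanParserInitPromise: () => initPromise,
  setYomitanParserInitPromise: (promise) => { initPromise = promise; },
};
// console satisfies LoggerLike; results are null when Yomitan is unavailable.
const parseResults = await requestYomitanParseResults('猫を見る', deps, console);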