Overlay 2.0 (#12)

This commit is contained in:
2026-03-01 02:36:51 -08:00
committed by GitHub
parent 45df3c466b
commit 44c7761c7c
397 changed files with 15139 additions and 7127 deletions

View File

@@ -51,15 +51,20 @@ test('annotateTokens known-word match mode uses headword vs surface', () => {
});
test('annotateTokens excludes frequency for particle/bound_auxiliary and pos1 exclusions', () => {
const lookupCalls: string[] = [];
const tokens = [
makeToken({ surface: 'は', headword: 'は', partOfSpeech: PartOfSpeech.particle }),
makeToken({
surface: 'は',
headword: 'は',
partOfSpeech: PartOfSpeech.particle,
frequencyRank: 3,
}),
makeToken({
surface: 'です',
headword: 'です',
partOfSpeech: PartOfSpeech.bound_auxiliary,
startPos: 1,
endPos: 3,
frequencyRank: 4,
}),
makeToken({
surface: 'の',
@@ -68,6 +73,7 @@ test('annotateTokens excludes frequency for particle/bound_auxiliary and pos1 ex
pos1: '助詞',
startPos: 3,
endPos: 4,
frequencyRank: 5,
}),
makeToken({
surface: '猫',
@@ -75,24 +81,36 @@ test('annotateTokens excludes frequency for particle/bound_auxiliary and pos1 ex
partOfSpeech: PartOfSpeech.noun,
startPos: 4,
endPos: 5,
frequencyRank: 11,
}),
];
const result = annotateTokens(
tokens,
makeDeps({
getFrequencyRank: (text) => {
lookupCalls.push(text);
return text === '猫' ? 11 : 999;
},
}),
);
const result = annotateTokens(tokens, makeDeps());
assert.equal(result[0]?.frequencyRank, undefined);
assert.equal(result[1]?.frequencyRank, undefined);
assert.equal(result[2]?.frequencyRank, undefined);
assert.equal(result[3]?.frequencyRank, 11);
assert.deepEqual(lookupCalls, ['猫']);
});
test('annotateTokens preserves existing frequency rank when frequency is enabled', () => {
const tokens = [makeToken({ surface: '猫', headword: '猫', frequencyRank: 42 })];
const result = annotateTokens(tokens, makeDeps());
assert.equal(result[0]?.frequencyRank, 42);
});
test('annotateTokens drops invalid frequency rank values', () => {
const tokens = [makeToken({ surface: '猫', headword: '猫', frequencyRank: Number.NaN })];
const result = annotateTokens(tokens, makeDeps());
assert.equal(result[0]?.frequencyRank, undefined);
});
test('annotateTokens clears frequency rank when frequency is disabled', () => {
const tokens = [makeToken({ surface: '猫', headword: '猫', frequencyRank: 42 })];
const result = annotateTokens(tokens, makeDeps(), { frequencyEnabled: false });
assert.equal(result[0]?.frequencyRank, undefined);
});
test('annotateTokens handles JLPT disabled and eligibility exclusion paths', () => {
@@ -157,3 +175,206 @@ test('annotateTokens N+1 handoff marks expected target when threshold is satisfi
assert.equal(result[1]?.isNPlusOneTarget, true);
assert.equal(result[2]?.isNPlusOneTarget, false);
});
test('annotateTokens N+1 minimum sentence words counts only eligible word tokens', () => {
const tokens = [
makeToken({ surface: '猫', headword: '猫', startPos: 0, endPos: 1 }),
makeToken({
surface: 'が',
headword: 'が',
partOfSpeech: PartOfSpeech.particle,
pos1: '助詞',
startPos: 1,
endPos: 2,
}),
makeToken({
surface: 'です',
headword: 'です',
partOfSpeech: PartOfSpeech.bound_auxiliary,
pos1: '助動詞',
startPos: 2,
endPos: 4,
}),
];
const result = annotateTokens(
tokens,
makeDeps({
isKnownWord: (text) => text === 'が' || text === 'です',
}),
{ minSentenceWordsForNPlusOne: 3 },
);
assert.equal(result[0]?.isKnown, false);
assert.equal(result[1]?.isKnown, true);
assert.equal(result[2]?.isKnown, true);
assert.equal(result[0]?.isNPlusOneTarget, false);
});
test('annotateTokens applies configured pos1 exclusions to both frequency and N+1', () => {
const tokens = [
makeToken({
surface: '猫',
headword: '猫',
pos1: '名詞',
frequencyRank: 21,
startPos: 0,
endPos: 1,
}),
makeToken({
surface: '走る',
headword: '走る',
pos1: '動詞',
partOfSpeech: PartOfSpeech.verb,
startPos: 1,
endPos: 3,
frequencyRank: 22,
}),
];
const result = annotateTokens(
tokens,
makeDeps({
isKnownWord: (text) => text === '走る',
}),
{
minSentenceWordsForNPlusOne: 1,
pos1Exclusions: new Set(['名詞']),
},
);
assert.equal(result[0]?.frequencyRank, undefined);
assert.equal(result[1]?.frequencyRank, 22);
assert.equal(result[0]?.isNPlusOneTarget, false);
assert.equal(result[1]?.isNPlusOneTarget, false);
});
test('annotateTokens allows previously default-excluded pos1 when removed from effective set', () => {
const tokens = [
makeToken({
surface: 'は',
headword: 'は',
partOfSpeech: PartOfSpeech.other,
pos1: '助詞',
startPos: 0,
endPos: 1,
frequencyRank: 8,
}),
];
const result = annotateTokens(tokens, makeDeps(), {
minSentenceWordsForNPlusOne: 1,
pos1Exclusions: new Set(),
});
assert.equal(result[0]?.frequencyRank, 8);
assert.equal(result[0]?.isNPlusOneTarget, true);
});
test('annotateTokens excludes default non-independent pos2 from frequency and N+1', () => {
const tokens = [
makeToken({
surface: 'になれば',
headword: 'なる',
partOfSpeech: PartOfSpeech.verb,
pos1: '動詞',
pos2: '非自立',
startPos: 0,
endPos: 4,
frequencyRank: 7,
}),
];
const result = annotateTokens(tokens, makeDeps(), {
minSentenceWordsForNPlusOne: 1,
});
assert.equal(result[0]?.frequencyRank, undefined);
assert.equal(result[0]?.isNPlusOneTarget, false);
});
test('annotateTokens excludes likely kana SFX tokens from frequency when POS tags are missing', () => {
const tokens = [
makeToken({
surface: 'ぐわっ',
reading: 'ぐわっ',
headword: 'ぐわっ',
pos1: '',
pos2: '',
frequencyRank: 12,
startPos: 0,
endPos: 3,
}),
];
const result = annotateTokens(tokens, makeDeps(), {
minSentenceWordsForNPlusOne: 1,
});
assert.equal(result[0]?.frequencyRank, undefined);
});
test('annotateTokens allows previously default-excluded pos2 when removed from effective set', () => {
const tokens = [
makeToken({
surface: 'になれば',
headword: 'なる',
partOfSpeech: PartOfSpeech.verb,
pos1: '動詞',
pos2: '非自立',
startPos: 0,
endPos: 4,
frequencyRank: 9,
}),
];
const result = annotateTokens(tokens, makeDeps(), {
minSentenceWordsForNPlusOne: 1,
pos2Exclusions: new Set(),
});
assert.equal(result[0]?.frequencyRank, 9);
assert.equal(result[0]?.isNPlusOneTarget, true);
});
test('annotateTokens keeps composite tokens when any component pos tag is content-bearing', () => {
const tokens = [
makeToken({
surface: 'になれば',
headword: 'なる',
pos1: '助詞|動詞',
pos2: '格助詞|自立|接続助詞',
startPos: 0,
endPos: 4,
frequencyRank: 5,
}),
];
const result = annotateTokens(tokens, makeDeps(), {
minSentenceWordsForNPlusOne: 1,
});
assert.equal(result[0]?.frequencyRank, 5);
assert.equal(result[0]?.isNPlusOneTarget, true);
});
test('annotateTokens excludes composite tokens when all component pos tags are excluded', () => {
const tokens = [
makeToken({
surface: 'けど',
headword: 'けど',
pos1: '助詞|助詞',
pos2: '接続助詞|終助詞',
startPos: 0,
endPos: 2,
frequencyRank: 6,
}),
];
const result = annotateTokens(tokens, makeDeps(), {
minSentenceWordsForNPlusOne: 1,
});
assert.equal(result[0]?.frequencyRank, undefined);
assert.equal(result[0]?.isNPlusOneTarget, false);
});

View File

@@ -1,39 +1,38 @@
import { markNPlusOneTargets } from '../../../token-merger';
import {
FrequencyDictionaryLookup,
JlptLevel,
MergedToken,
NPlusOneMatchMode,
PartOfSpeech,
} from '../../../types';
DEFAULT_ANNOTATION_POS1_EXCLUSION_CONFIG,
resolveAnnotationPos1ExclusionSet,
} from '../../../token-pos1-exclusions';
import {
DEFAULT_ANNOTATION_POS2_EXCLUSION_CONFIG,
resolveAnnotationPos2ExclusionSet,
} from '../../../token-pos2-exclusions';
import { JlptLevel, MergedToken, NPlusOneMatchMode, PartOfSpeech } from '../../../types';
import { shouldIgnoreJlptByTerm, shouldIgnoreJlptForMecabPos1 } from '../jlpt-token-filter';
const KATAKANA_TO_HIRAGANA_OFFSET = 0x60;
const KATAKANA_CODEPOINT_START = 0x30a1;
const KATAKANA_CODEPOINT_END = 0x30f6;
const JLPT_LEVEL_LOOKUP_CACHE_LIMIT = 2048;
const FREQUENCY_RANK_LOOKUP_CACHE_LIMIT = 2048;
const jlptLevelLookupCaches = new WeakMap<
(text: string) => JlptLevel | null,
Map<string, JlptLevel | null>
>();
const frequencyRankLookupCaches = new WeakMap<
FrequencyDictionaryLookup,
Map<string, number | null>
>();
export interface AnnotationStageDeps {
isKnownWord: (text: string) => boolean;
knownWordMatchMode: NPlusOneMatchMode;
getJlptLevel: (text: string) => JlptLevel | null;
getFrequencyRank?: FrequencyDictionaryLookup;
}
export interface AnnotationStageOptions {
nPlusOneEnabled?: boolean;
jlptEnabled?: boolean;
frequencyEnabled?: boolean;
minSentenceWordsForNPlusOne?: number;
pos1Exclusions?: ReadonlySet<string>;
pos2Exclusions?: ReadonlySet<string>;
}
function resolveKnownWordText(
@@ -59,106 +58,94 @@ function applyKnownWordMarking(
});
}
function normalizeFrequencyLookupText(rawText: string): string {
return rawText.trim().toLowerCase();
function normalizePos1Tag(pos1: string | undefined): string {
return typeof pos1 === 'string' ? pos1.trim() : '';
}
function getCachedFrequencyRank(
lookupText: string,
getFrequencyRank: FrequencyDictionaryLookup,
): number | null {
const normalizedText = normalizeFrequencyLookupText(lookupText);
if (!normalizedText) {
return null;
function isExcludedByTagSet(normalizedTag: string, exclusions: ReadonlySet<string>): boolean {
if (!normalizedTag) {
return false;
}
let cache = frequencyRankLookupCaches.get(getFrequencyRank);
if (!cache) {
cache = new Map<string, number | null>();
frequencyRankLookupCaches.set(getFrequencyRank, cache);
const parts = normalizedTag
.split('|')
.map((part) => part.trim())
.filter((part) => part.length > 0);
if (parts.length === 0) {
return false;
}
if (cache.has(normalizedText)) {
return cache.get(normalizedText) ?? null;
}
let rank: number | null;
try {
rank = getFrequencyRank(normalizedText);
} catch {
rank = null;
}
if (rank !== null) {
if (!Number.isFinite(rank) || rank <= 0) {
rank = null;
}
}
cache.set(normalizedText, rank);
while (cache.size > FREQUENCY_RANK_LOOKUP_CACHE_LIMIT) {
const firstKey = cache.keys().next().value;
if (firstKey !== undefined) {
cache.delete(firstKey);
}
}
return rank;
// Composite tags like "助詞|名詞" stay eligible unless every component is excluded.
return parts.every((part) => exclusions.has(part));
}
function resolveFrequencyLookupText(token: MergedToken): string {
if (token.headword && token.headword.length > 0) {
return token.headword;
function resolvePos1Exclusions(options: AnnotationStageOptions): ReadonlySet<string> {
if (options.pos1Exclusions) {
return options.pos1Exclusions;
}
if (token.reading && token.reading.length > 0) {
return token.reading;
}
return token.surface;
return resolveAnnotationPos1ExclusionSet(DEFAULT_ANNOTATION_POS1_EXCLUSION_CONFIG);
}
function getFrequencyLookupTextCandidates(token: MergedToken): string[] {
const lookupText = resolveFrequencyLookupText(token).trim();
return lookupText ? [lookupText] : [];
function resolvePos2Exclusions(options: AnnotationStageOptions): ReadonlySet<string> {
if (options.pos2Exclusions) {
return options.pos2Exclusions;
}
return resolveAnnotationPos2ExclusionSet(DEFAULT_ANNOTATION_POS2_EXCLUSION_CONFIG);
}
function isFrequencyExcludedByPos(token: MergedToken): boolean {
if (
token.partOfSpeech === PartOfSpeech.particle ||
token.partOfSpeech === PartOfSpeech.bound_auxiliary
) {
function normalizePos2Tag(pos2: string | undefined): string {
return typeof pos2 === 'string' ? pos2.trim() : '';
}
function isFrequencyExcludedByPos(
token: MergedToken,
pos1Exclusions: ReadonlySet<string>,
pos2Exclusions: ReadonlySet<string>,
): boolean {
const normalizedPos1 = normalizePos1Tag(token.pos1);
const hasPos1 = normalizedPos1.length > 0;
if (isExcludedByTagSet(normalizedPos1, pos1Exclusions)) {
return true;
}
return token.pos1 === '助詞' || token.pos1 === '助動詞';
const normalizedPos2 = normalizePos2Tag(token.pos2);
const hasPos2 = normalizedPos2.length > 0;
if (isExcludedByTagSet(normalizedPos2, pos2Exclusions)) {
return true;
}
if (hasPos1 || hasPos2) {
return false;
}
if (isLikelyFrequencyNoiseToken(token)) {
return true;
}
return (
token.partOfSpeech === PartOfSpeech.particle ||
token.partOfSpeech === PartOfSpeech.bound_auxiliary
);
}
function applyFrequencyMarking(
tokens: MergedToken[],
getFrequencyRank: FrequencyDictionaryLookup,
pos1Exclusions: ReadonlySet<string>,
pos2Exclusions: ReadonlySet<string>,
): MergedToken[] {
return tokens.map((token) => {
if (isFrequencyExcludedByPos(token)) {
if (isFrequencyExcludedByPos(token, pos1Exclusions, pos2Exclusions)) {
return { ...token, frequencyRank: undefined };
}
const lookupTexts = getFrequencyLookupTextCandidates(token);
if (lookupTexts.length === 0) {
return { ...token, frequencyRank: undefined };
}
let bestRank: number | null = null;
for (const lookupText of lookupTexts) {
const rank = getCachedFrequencyRank(lookupText, getFrequencyRank);
if (rank === null) {
continue;
}
if (bestRank === null || rank < bestRank) {
bestRank = rank;
}
if (typeof token.frequencyRank === 'number' && Number.isFinite(token.frequencyRank)) {
const rank = Math.max(1, Math.floor(token.frequencyRank));
return { ...token, frequencyRank: rank };
}
return {
...token,
frequencyRank: bestRank ?? undefined,
frequencyRank: undefined,
};
});
}
@@ -282,6 +269,98 @@ function isRepeatedKanaSfx(text: string): boolean {
return topCount >= Math.ceil(chars.length / 2);
}
/**
 * Reports whether `text`, after JLPT-exclusion normalization, is a 2–4 character
 * all-kana string that ends in a small "っ" (the shape of clipped sound effects
 * such as "ぐわっ").
 *
 * @param text Raw token text to inspect.
 * @returns `true` only when every condition above holds.
 */
function isTrailingSmallTsuKanaSfx(text: string): boolean {
  const normalized = normalizeJlptTextForExclusion(text);
  if (!normalized) {
    return false;
  }

  // Split by code point so length limits count characters, not UTF-16 units.
  const codePoints = Array.from(normalized);
  const hasSfxLength = codePoints.length >= 2 && codePoints.length <= 4;

  return (
    hasSfxLength &&
    codePoints.every(isKanaChar) &&
    codePoints[codePoints.length - 1] === 'っ'
  );
}
/**
 * Reports whether `text`, after JLPT-exclusion normalization, is an all-kana
 * string of even length (at least 4 characters) whose second half repeats its
 * first half exactly, e.g. an "ABAB" pattern.
 *
 * @param text Raw token text to inspect.
 * @returns `true` when the normalized text is a kana reduplication.
 */
function isReduplicatedKanaSfx(text: string): boolean {
  const normalized = normalizeJlptTextForExclusion(text);
  if (!normalized) {
    return false;
  }

  const codePoints = Array.from(normalized);
  const count = codePoints.length;
  if (count < 4 || count % 2 !== 0 || !codePoints.every(isKanaChar)) {
    return false;
  }

  // Compare the two halves position by position.
  const midpoint = count / 2;
  return codePoints
    .slice(0, midpoint)
    .every((char, offset) => char === codePoints[midpoint + offset]);
}
/**
 * Reports whether `text`, after JLPT-exclusion normalization, consists solely of
 * kana and contains at least one character immediately followed by itself.
 *
 * @param text Raw token text to inspect.
 * @returns `true` when an adjacent repeated kana character is found.
 */
function hasAdjacentKanaRepeat(text: string): boolean {
  const normalized = normalizeJlptTextForExclusion(text);
  if (!normalized) {
    return false;
  }

  const codePoints = Array.from(normalized);
  if (!codePoints.every(isKanaChar)) {
    return false;
  }

  return codePoints.some(
    (char, position) => position > 0 && char === codePoints[position - 1],
  );
}
/**
 * Heuristically flags a token as frequency "noise" based on its headword and
 * surface text alone.
 *
 * For each non-empty candidate (headword first, then surface) the trimmed text
 * and its JLPT-exclusion-normalized form are checked. The token is noise when
 * either form is ignored by `shouldIgnoreJlptByTerm`, or matches one of the
 * kana sound-effect shapes (adjacent repeat, reduplication, trailing small tsu).
 *
 * @param token Token whose headword/surface are inspected.
 * @returns `true` as soon as any candidate form matches a noise rule.
 */
function isLikelyFrequencyNoiseToken(token: MergedToken): boolean {
  const kanaSfxChecks = [hasAdjacentKanaRepeat, isReduplicatedKanaSfx, isTrailingSmallTsuKanaSfx];

  for (const rawText of [token.headword, token.surface]) {
    if (typeof rawText !== 'string' || rawText.length === 0) {
      continue;
    }

    const trimmed = rawText.trim();
    if (!trimmed) {
      continue;
    }

    const normalized = normalizeJlptTextForExclusion(trimmed);
    if (!normalized) {
      continue;
    }

    // Each rule is tried against the trimmed form first, then the normalized form.
    const forms = [trimmed, normalized];
    if (forms.some((form) => shouldIgnoreJlptByTerm(form))) {
      return true;
    }
    if (kanaSfxChecks.some((check) => forms.some((form) => check(form)))) {
      return true;
    }
  }

  return false;
}
function isJlptEligibleToken(token: MergedToken): boolean {
if (token.pos1 && shouldIgnoreJlptForMecabPos1(token.pos1)) {
return false;
@@ -340,20 +419,24 @@ export function annotateTokens(
deps: AnnotationStageDeps,
options: AnnotationStageOptions = {},
): MergedToken[] {
const knownMarkedTokens = applyKnownWordMarking(
tokens,
deps.isKnownWord,
deps.knownWordMatchMode,
);
const pos1Exclusions = resolvePos1Exclusions(options);
const pos2Exclusions = resolvePos2Exclusions(options);
const nPlusOneEnabled = options.nPlusOneEnabled !== false;
const knownMarkedTokens = nPlusOneEnabled
? applyKnownWordMarking(tokens, deps.isKnownWord, deps.knownWordMatchMode)
: tokens.map((token) => ({
...token,
isKnown: false,
isNPlusOneTarget: false,
}));
const frequencyEnabled = options.frequencyEnabled !== false;
const frequencyMarkedTokens =
frequencyEnabled && deps.getFrequencyRank
? applyFrequencyMarking(knownMarkedTokens, deps.getFrequencyRank)
: knownMarkedTokens.map((token) => ({
...token,
frequencyRank: undefined,
}));
const frequencyMarkedTokens = frequencyEnabled
? applyFrequencyMarking(knownMarkedTokens, pos1Exclusions, pos2Exclusions)
: knownMarkedTokens.map((token) => ({
...token,
frequencyRank: undefined,
}));
const jlptEnabled = options.jlptEnabled !== false;
const jlptMarkedTokens = jlptEnabled
@@ -363,6 +446,14 @@ export function annotateTokens(
jlptLevel: undefined,
}));
if (!nPlusOneEnabled) {
return jlptMarkedTokens.map((token) => ({
...token,
isKnown: false,
isNPlusOneTarget: false,
}));
}
const minSentenceWordsForNPlusOne = options.minSentenceWordsForNPlusOne;
const sanitizedMinSentenceWordsForNPlusOne =
minSentenceWordsForNPlusOne !== undefined &&
@@ -371,5 +462,10 @@ export function annotateTokens(
? minSentenceWordsForNPlusOne
: 3;
return markNPlusOneTargets(jlptMarkedTokens, sanitizedMinSentenceWordsForNPlusOne);
return markNPlusOneTargets(
jlptMarkedTokens,
sanitizedMinSentenceWordsForNPlusOne,
pos1Exclusions,
pos2Exclusions,
);
}

View File

@@ -22,12 +22,13 @@ function makeToken(overrides: Partial<MergedToken>): MergedToken {
test('enrichTokensWithMecabPos1 picks pos1 by best overlap when no surface match exists', () => {
const tokens = [makeToken({ surface: 'grouped', startPos: 2, endPos: 7 })];
const mecabTokens = [
makeToken({ surface: 'left', startPos: 0, endPos: 4, pos1: 'A' }),
makeToken({ surface: 'right', startPos: 2, endPos: 6, pos1: 'B' }),
makeToken({ surface: 'left', startPos: 0, endPos: 4, pos1: 'A', pos2: 'L2' }),
makeToken({ surface: 'right', startPos: 2, endPos: 6, pos1: 'B', pos2: '非自立' }),
];
const enriched = enrichTokensWithMecabPos1(tokens, mecabTokens);
assert.equal(enriched[0]?.pos1, 'B');
assert.equal(enriched[0]?.pos1, 'A|B');
assert.equal(enriched[0]?.pos2, 'L2|非自立');
});
test('enrichTokensWithMecabPos1 fills missing pos1 using surface-sequence fallback', () => {

View File

@@ -1,13 +1,45 @@
import { MergedToken } from '../../../types';
function pickClosestMecabPos1(token: MergedToken, mecabTokens: MergedToken[]): string | undefined {
if (mecabTokens.length === 0) {
type MecabPosMetadata = {
pos1: string;
pos2?: string;
pos3?: string;
};
/**
 * Collapses a list of optional tag strings into a single "|"-separated tag.
 *
 * Missing, empty, and whitespace-only entries are skipped; remaining entries
 * are trimmed and de-duplicated, keeping first-seen order.
 *
 * @param values Candidate tags, possibly containing `undefined` or blanks.
 * @returns The joined tag string, or `undefined` when no usable tag remains.
 */
function joinUniqueTags(values: Array<string | undefined>): string | undefined {
  // A Set keeps insertion order, so the output order matches first appearance.
  const seen = new Set<string>();
  for (const raw of values) {
    const tag = raw?.trim();
    if (tag) {
      seen.add(tag);
    }
  }
  return seen.size > 0 ? [...seen].join('|') : undefined;
}
function pickClosestMecabPosMetadata(
token: MergedToken,
mecabTokens: MergedToken[],
): MecabPosMetadata | null {
if (mecabTokens.length === 0) {
return null;
}
const tokenStart = token.startPos ?? 0;
const tokenEnd = token.endPos ?? tokenStart + token.surface.length;
let bestSurfaceMatchPos1: string | undefined;
let bestSurfaceMatchToken: MergedToken | null = null;
let bestSurfaceMatchDistance = Number.MAX_SAFE_INTEGER;
let bestSurfaceMatchEndDistance = Number.MAX_SAFE_INTEGER;
@@ -31,19 +63,24 @@ function pickClosestMecabPos1(token: MergedToken, mecabTokens: MergedToken[]): s
) {
bestSurfaceMatchDistance = startDistance;
bestSurfaceMatchEndDistance = endDistance;
bestSurfaceMatchPos1 = mecabToken.pos1;
bestSurfaceMatchToken = mecabToken;
}
}
if (bestSurfaceMatchPos1) {
return bestSurfaceMatchPos1;
if (bestSurfaceMatchToken) {
return {
pos1: bestSurfaceMatchToken.pos1 as string,
pos2: bestSurfaceMatchToken.pos2,
pos3: bestSurfaceMatchToken.pos3,
};
}
let bestPos1: string | undefined;
let bestToken: MergedToken | null = null;
let bestOverlap = 0;
let bestSpan = 0;
let bestStartDistance = Number.MAX_SAFE_INTEGER;
let bestStart = Number.MAX_SAFE_INTEGER;
const overlappingTokens: MergedToken[] = [];
for (const mecabToken of mecabTokens) {
if (!mecabToken.pos1) {
@@ -58,6 +95,7 @@ function pickClosestMecabPos1(token: MergedToken, mecabTokens: MergedToken[]): s
if (overlap === 0) {
continue;
}
overlappingTokens.push(mecabToken);
const span = mecabEnd - mecabStart;
if (
@@ -71,11 +109,23 @@ function pickClosestMecabPos1(token: MergedToken, mecabTokens: MergedToken[]): s
bestSpan = span;
bestStartDistance = Math.abs(mecabStart - tokenStart);
bestStart = mecabStart;
bestPos1 = mecabToken.pos1;
bestToken = mecabToken;
}
}
return bestOverlap > 0 ? bestPos1 : undefined;
if (bestOverlap === 0 || !bestToken) {
return null;
}
const overlapPos1 = joinUniqueTags(overlappingTokens.map((token) => token.pos1));
const overlapPos2 = joinUniqueTags(overlappingTokens.map((token) => token.pos2));
const overlapPos3 = joinUniqueTags(overlappingTokens.map((token) => token.pos3));
return {
pos1: overlapPos1 ?? (bestToken.pos1 as string),
pos2: overlapPos2 ?? bestToken.pos2,
pos3: overlapPos3 ?? bestToken.pos3,
};
}
function fillMissingPos1BySurfaceSequence(
@@ -101,7 +151,7 @@ function fillMissingPos1BySurfaceSequence(
return token;
}
let best: { pos1: string; index: number } | null = null;
let best: { token: MergedToken; index: number } | null = null;
for (const candidate of indexedMecabTokens) {
if (candidate.token.surface !== surface) {
continue;
@@ -109,7 +159,7 @@ function fillMissingPos1BySurfaceSequence(
if (candidate.index < cursor) {
continue;
}
best = { pos1: candidate.token.pos1 as string, index: candidate.index };
best = { token: candidate.token, index: candidate.index };
break;
}
@@ -118,7 +168,7 @@ function fillMissingPos1BySurfaceSequence(
if (candidate.token.surface !== surface) {
continue;
}
best = { pos1: candidate.token.pos1 as string, index: candidate.index };
best = { token: candidate.token, index: candidate.index };
break;
}
}
@@ -130,7 +180,9 @@ function fillMissingPos1BySurfaceSequence(
cursor = best.index + 1;
return {
...token,
pos1: best.pos1,
pos1: best.token.pos1,
pos2: best.token.pos2,
pos3: best.token.pos3,
};
});
}
@@ -152,14 +204,16 @@ export function enrichTokensWithMecabPos1(
return token;
}
const pos1 = pickClosestMecabPos1(token, mecabTokens);
if (!pos1) {
const metadata = pickClosestMecabPosMetadata(token, mecabTokens);
if (!metadata) {
return token;
}
return {
...token,
pos1,
pos1: metadata.pos1,
pos2: metadata.pos2,
pos3: metadata.pos3,
};
});

View File

@@ -0,0 +1,149 @@
import type { MergedToken } from '../../../types';
import { createLogger } from '../../../logger';
import { enrichTokensWithMecabPos1 } from './parser-enrichment-stage';
const logger = createLogger('main:tokenizer');
const DISABLE_WORKER_ENV = 'SUBMINER_DISABLE_MECAB_ENRICHMENT_WORKER';
interface WorkerRequest {
id: number;
tokens: MergedToken[];
mecabTokens: MergedToken[] | null;
}
interface WorkerResponse {
id?: unknown;
result?: unknown;
error?: unknown;
}
type PendingRequest = {
resolve: (value: MergedToken[]) => void;
reject: (reason?: unknown) => void;
};
/**
 * Runs MeCab POS enrichment on a lazily-started worker thread.
 *
 * Worker start-up is attempted at most once. When the worker is disabled via
 * the environment, cannot be started, or has gone away, `enrichTokens` runs the
 * enrichment synchronously on the calling thread instead. Every request that
 * was handed to the worker is settled: resolved by its response, or rejected
 * when the worker errors, exits, or the message cannot be posted.
 */
class ParserEnrichmentWorkerRuntime {
  private worker: import('node:worker_threads').Worker | null = null;
  private nextRequestId = 1;
  private pending = new Map<number, PendingRequest>();
  private initAttempted = false;

  /**
   * Enriches `tokens` with MeCab POS data, on the worker when one is available.
   *
   * @param tokens Tokens to enrich.
   * @param mecabTokens MeCab tokens to draw POS data from, or `null`.
   * @returns The enriched tokens; rejects when the worker fails for this request.
   */
  async enrichTokens(
    tokens: MergedToken[],
    mecabTokens: MergedToken[] | null,
  ): Promise<MergedToken[]> {
    const worker = await this.getWorker();
    if (!worker) {
      return enrichTokensWithMecabPos1(tokens, mecabTokens);
    }

    return new Promise<MergedToken[]>((resolve, reject) => {
      const id = this.nextRequestId++;
      this.pending.set(id, { resolve, reject });
      const request: WorkerRequest = { id, tokens, mecabTokens };
      try {
        worker.postMessage(request);
      } catch (error) {
        // postMessage can throw (e.g. payload not cloneable); do not leave a
        // stale entry behind that no response will ever clear.
        this.pending.delete(id);
        reject(error);
      }
    });
  }

  /**
   * Returns the running worker, starting it on first use.
   *
   * @returns The worker, or `null` when it is disabled, already attempted and
   *   unavailable, or failed to start.
   */
  private async getWorker(): Promise<import('node:worker_threads').Worker | null> {
    if (process.env[DISABLE_WORKER_ENV] === '1') {
      return null;
    }
    if (this.worker) {
      return this.worker;
    }
    if (this.initAttempted) {
      return null;
    }
    // Set before the first await so concurrent callers do not start a second worker.
    this.initAttempted = true;

    let workerThreads: typeof import('node:worker_threads');
    try {
      workerThreads = await import('node:worker_threads');
    } catch {
      return null;
    }

    let workerPath = '';
    try {
      workerPath = require.resolve('./parser-enrichment-worker-thread.js');
    } catch {
      return null;
    }

    try {
      const worker = new workerThreads.Worker(workerPath);
      worker.on('message', (message: WorkerResponse) => this.handleWorkerMessage(message));
      worker.on('error', (error: Error) => this.handleWorkerFailure(error));
      worker.on('exit', (code: number) => {
        if (code !== 0) {
          this.handleWorkerFailure(new Error(`parser enrichment worker exited with code ${code}`));
          return;
        }
        // A clean exit still means outstanding requests will never be answered;
        // reject them so callers can fall back instead of waiting forever.
        this.rejectAllPending(new Error('parser enrichment worker exited before responding'));
        this.worker = null;
      });
      this.worker = worker;
      return worker;
    } catch (error) {
      const reason = error instanceof Error ? error.message : String(error);
      logger.debug(`Failed to start parser enrichment worker: ${reason}`);
      return null;
    }
  }

  /**
   * Settles the pending request matching a worker response. Responses without
   * a numeric id, or for an unknown id, are ignored.
   */
  private handleWorkerMessage(message: WorkerResponse): void {
    if (typeof message.id !== 'number') {
      return;
    }
    const request = this.pending.get(message.id);
    if (!request) {
      return;
    }
    this.pending.delete(message.id);

    if (typeof message.error === 'string' && message.error.length > 0) {
      request.reject(new Error(message.error));
      return;
    }
    if (!Array.isArray(message.result)) {
      request.reject(new Error('Parser enrichment worker returned invalid payload'));
      return;
    }
    request.resolve(message.result as MergedToken[]);
  }

  /** Logs the failure, rejects every pending request, and drops the worker. */
  private handleWorkerFailure(error: Error): void {
    logger.debug(
      `Parser enrichment worker unavailable, falling back to main thread: ${error.message}`,
    );
    this.rejectAllPending(error);
    if (this.worker) {
      this.worker.removeAllListeners();
      this.worker = null;
    }
  }

  /** Rejects all outstanding requests with `error` and empties the pending map. */
  private rejectAllPending(error: Error): void {
    for (const request of this.pending.values()) {
      request.reject(error);
    }
    this.pending.clear();
  }
}
let runtime: ParserEnrichmentWorkerRuntime | null = null;
/**
 * Enriches tokens with MeCab POS data through the shared worker runtime,
 * creating that runtime on first use. If the runtime rejects for any reason,
 * the enrichment is performed synchronously on the calling thread instead.
 *
 * @param tokens Tokens to enrich.
 * @param mecabTokens MeCab tokens to draw POS data from, or `null`.
 * @returns The enriched tokens.
 */
export async function enrichTokensWithMecabPos1Async(
  tokens: MergedToken[],
  mecabTokens: MergedToken[] | null,
): Promise<MergedToken[]> {
  const activeRuntime = runtime ?? (runtime = new ParserEnrichmentWorkerRuntime());
  return activeRuntime
    .enrichTokens(tokens, mecabTokens)
    .catch(() => enrichTokensWithMecabPos1(tokens, mecabTokens));
}

View File

@@ -0,0 +1,25 @@
import { parentPort } from 'node:worker_threads';
import type { MergedToken } from '../../../types';
import { enrichTokensWithMecabPos1 } from './parser-enrichment-stage';
interface WorkerRequest {
id: number;
tokens: MergedToken[];
mecabTokens: MergedToken[] | null;
}
// Worker-thread entry point. This module is only meaningful when loaded as a
// worker; without a parent port there is nobody to answer, so fail fast.
if (!parentPort) {
throw new Error('parser-enrichment worker missing parent port');
}
// Captured in a const so the non-null narrowing carries into the handler below.
const port = parentPort;
// For every request, run the enrichment and reply with `{ id, result }`.
// Anything thrown inside the try (including by the success postMessage) is
// reported back as `{ id, error }` carrying the error message text.
port.on('message', (message: WorkerRequest) => {
try {
const result = enrichTokensWithMecabPos1(message.tokens, message.mecabTokens);
port.postMessage({ id: message.id, result });
} catch (error) {
const messageText = error instanceof Error ? error.message : String(error);
port.postMessage({ id: message.id, error: messageText });
}
});

View File

@@ -0,0 +1,248 @@
import assert from 'node:assert/strict';
import test from 'node:test';
import {
requestYomitanTermFrequencies,
syncYomitanDefaultAnkiServer,
} from './yomitan-parser-runtime';
/**
 * Builds a stub dependency bundle for the Yomitan parser runtime tests.
 *
 * The fake parser window is never destroyed and forwards every
 * `webContents.executeJavaScript` call to the supplied callback. The extension
 * getter returns `{ id: 'ext-id' }`, promise getters return `null`, and all
 * setters are no-ops.
 *
 * @param executeJavaScript Callback standing in for script execution.
 * @returns The stub dependency object.
 */
function createDeps(executeJavaScript: (script: string) => Promise<unknown>) {
  const ignore = () => undefined;
  const noPromise = () => null;

  const webContents = {
    executeJavaScript: async (script: string) => executeJavaScript(script),
  };
  const parserWindow = { isDestroyed: () => false, webContents };

  return {
    getYomitanExt: () => ({ id: 'ext-id' }) as never,
    getYomitanParserWindow: () => parserWindow as never,
    setYomitanParserWindow: ignore,
    getYomitanParserReadyPromise: noPromise,
    setYomitanParserReadyPromise: ignore,
    getYomitanParserInitPromise: noPromise,
    setYomitanParserInitPromise: ignore,
  };
}
// Happy path: the executed script reports `{ updated: true }`, so the call
// should return true, log exactly one info message, and the generated script
// should reference both `optionsGetFull` and `setAllSettings`.
test('syncYomitanDefaultAnkiServer updates default profile server when script reports update', async () => {
let scriptValue = '';
const deps = createDeps(async (script) => {
// Capture the script text so its contents can be asserted below.
scriptValue = script;
return { updated: true };
});
const infoLogs: string[] = [];
const updated = await syncYomitanDefaultAnkiServer('http://127.0.0.1:8766', deps, {
error: () => undefined,
info: (message) => infoLogs.push(message),
});
assert.equal(updated, true);
assert.match(scriptValue, /optionsGetFull/);
assert.match(scriptValue, /setAllSettings/);
assert.equal(infoLogs.length, 1);
});
// When the executed script reports `{ updated: false }`, the call returns false.
test('syncYomitanDefaultAnkiServer returns false when script reports no change', async () => {
const deps = createDeps(async () => ({ updated: false }));
const updated = await syncYomitanDefaultAnkiServer('http://127.0.0.1:8766', deps, {
error: () => undefined,
info: () => undefined,
});
assert.equal(updated, false);
});
// A rejected script execution must not propagate: the call returns false and
// reports the failure through exactly one error log.
test('syncYomitanDefaultAnkiServer logs and returns false on script failure', async () => {
const deps = createDeps(async () => {
throw new Error('execute failed');
});
const errorLogs: string[] = [];
const updated = await syncYomitanDefaultAnkiServer('http://127.0.0.1:8766', deps, {
error: (message) => errorLogs.push(message),
info: () => undefined,
});
assert.equal(updated, false);
assert.equal(errorLogs.length, 1);
});
// A whitespace-only target URL is treated as empty: the call returns false
// without executing any script (executeCount stays at 0).
test('syncYomitanDefaultAnkiServer no-ops for empty target url', async () => {
let executeCount = 0;
const deps = createDeps(async () => {
executeCount += 1;
return { updated: true };
});
const updated = await syncYomitanDefaultAnkiServer(' ', deps, {
error: () => undefined,
info: () => undefined,
});
assert.equal(updated, false);
assert.equal(executeCount, 0);
});
// Feeds three raw entries through the stubbed script result and expects two
// normalized entries back:
//  - the first entry is kept as-is (frequency 77);
//  - the second is expected to report 2847, the leading number of its
//    displayValue '2847,46961', rather than the raw frequency 46961;
//  - the third (frequency 0, no reading) is expected to be dropped.
test('requestYomitanTermFrequencies returns normalized frequency entries', async () => {
let scriptValue = '';
const deps = createDeps(async (script) => {
scriptValue = script;
return [
{
term: '猫',
reading: 'ねこ',
dictionary: 'freq-dict',
dictionaryPriority: 0,
frequency: 77,
displayValue: '77',
displayValueParsed: true,
},
{
term: '鍛える',
reading: 'きたえる',
dictionary: 'freq-dict',
dictionaryPriority: 1,
frequency: 46961,
displayValue: '2847,46961',
displayValueParsed: true,
},
{
term: 'invalid',
dictionary: 'freq-dict',
frequency: 0,
},
];
});
const result = await requestYomitanTermFrequencies([{ term: '猫', reading: 'ねこ' }], deps, {
error: () => undefined,
});
assert.equal(result.length, 2);
assert.equal(result[0]?.term, '猫');
assert.equal(result[0]?.frequency, 77);
assert.equal(result[0]?.dictionaryPriority, 0);
assert.equal(result[1]?.term, '鍛える');
assert.equal(result[1]?.frequency, 2847);
// The last script captured is expected to mention both API names.
assert.match(scriptValue, /getTermFrequencies/);
assert.match(scriptValue, /optionsGetFull/);
});
// When displayValue arrives as a numeric pair ([7141, 157632]), the first
// element (7141) is expected to win over the raw frequency field (157632).
test('requestYomitanTermFrequencies prefers primary rank from displayValue array pair', async () => {
const deps = createDeps(async () => [
{
term: '無人',
reading: 'むじん',
dictionary: 'freq-dict',
dictionaryPriority: 0,
frequency: 157632,
displayValue: [7141, 157632],
displayValueParsed: true,
},
]);
const result = await requestYomitanTermFrequencies([{ term: '無人', reading: 'むじん' }], deps, {
error: () => undefined,
});
assert.equal(result.length, 1);
assert.equal(result[0]?.term, '無人');
assert.equal(result[0]?.frequency, 7141);
});
// Two lookups for different terms share the same deps/window. The stub answers
// `optionsGetFull` scripts with profile data and other scripts with frequency
// rows; only one `optionsGetFull` script is expected across both calls.
test('requestYomitanTermFrequencies caches profile metadata between calls', async () => {
const scripts: string[] = [];
const deps = createDeps(async (script) => {
// Record every executed script so the calls can be counted afterwards.
scripts.push(script);
if (script.includes('optionsGetFull')) {
return {
profileCurrent: 0,
profiles: [
{
options: {
scanning: { length: 40 },
dictionaries: [{ name: 'freq-dict', enabled: true, id: 0 }],
},
},
],
};
}
// Route by the term embedded in the script text.
if (script.includes('"term":"犬"')) {
return [
{
term: '犬',
reading: 'いぬ',
dictionary: 'freq-dict',
frequency: 12,
displayValue: '12',
displayValueParsed: true,
},
];
}
return [
{
term: '猫',
reading: 'ねこ',
dictionary: 'freq-dict',
frequency: 77,
displayValue: '77',
displayValueParsed: true,
},
];
});
await requestYomitanTermFrequencies([{ term: '猫', reading: 'ねこ' }], deps, {
error: () => undefined,
});
await requestYomitanTermFrequencies([{ term: '犬', reading: 'いぬ' }], deps, {
error: () => undefined,
});
const optionsCalls = scripts.filter((script) => script.includes('optionsGetFull')).length;
assert.equal(optionsCalls, 1);
});
// Requesting the same term+reading pair twice against the same deps/window is
// expected to execute only one `getTermFrequencies` script; the second call
// should be answered without another frequency script.
test('requestYomitanTermFrequencies caches repeated term+reading lookups', async () => {
const scripts: string[] = [];
const deps = createDeps(async (script) => {
// Record every executed script so the calls can be counted afterwards.
scripts.push(script);
if (script.includes('optionsGetFull')) {
return {
profileCurrent: 0,
profiles: [
{
options: {
scanning: { length: 40 },
dictionaries: [{ name: 'freq-dict', enabled: true, id: 0 }],
},
},
],
};
}
return [
{
term: '猫',
reading: 'ねこ',
dictionary: 'freq-dict',
frequency: 77,
displayValue: '77',
displayValueParsed: true,
},
];
});
await requestYomitanTermFrequencies([{ term: '猫', reading: 'ねこ' }], deps, {
error: () => undefined,
});
await requestYomitanTermFrequencies([{ term: '猫', reading: 'ねこ' }], deps, {
error: () => undefined,
});
const frequencyCalls = scripts.filter((script) => script.includes('getTermFrequencies')).length;
assert.equal(frequencyCalls, 1);
});

View File

@@ -2,6 +2,7 @@ import type { BrowserWindow, Extension } from 'electron';
/** Minimal logging surface accepted by the Yomitan parser runtime helpers. */
interface LoggerLike {
/** Reports a failure; extra arguments accompany the message. */
error: (message: string, ...args: unknown[]) => void;
/** Optional informational logging; may be absent, so it is invoked via optional call. */
info?: (message: string, ...args: unknown[]) => void;
}
interface YomitanParserRuntimeDeps {
@@ -14,6 +15,395 @@ interface YomitanParserRuntimeDeps {
setYomitanParserInitPromise: (promise: Promise<boolean> | null) => void;
}
/** One normalized frequency entry for a term, as produced by toYomitanTermFrequency. */
export interface YomitanTermFrequency {
/** Term text (trimmed during normalization). */
term: string;
/** Reading reported with the entry, or null when none was given as a string. */
reading: string | null;
/** Name of the dictionary the entry came from (trimmed during normalization). */
dictionary: string;
/**
* Non-negative rank of the dictionary; Number.MAX_SAFE_INTEGER is used when no
* priority is known for the dictionary.
*/
dictionaryPriority: number;
/** Positive integer frequency value. */
frequency: number;
/** Raw display string from the source entry, or null when it was not a string. */
displayValue: string | null;
/** True only when the source entry explicitly set `displayValueParsed` to true. */
displayValueParsed: boolean;
}
/** A term together with an optional reading, used as a frequency lookup request. */
export interface YomitanTermReadingPair {
/** Term text to look up. */
term: string;
/** Reading for the term, or null when no reading is specified. */
reading: string | null;
}
/** Normalized subset of the Yomitan options that the parser helpers need. */
interface YomitanProfileMetadata {
/** Non-negative index of the profile the options were read from. */
profileIndex: number;
/** Scan length (at least 1) forwarded to `parseText`. */
scanLength: number;
/** Dictionary names, ordered by ascending priority value. */
dictionaries: string[];
/** Maps a dictionary name to its non-negative priority value. */
dictionaryPriorityByName: Record<string, number>;
}
// Scan length used when the profile does not supply a finite numeric scanning length.
const DEFAULT_YOMITAN_SCAN_LENGTH = 40;
// Profile metadata cached per parser window; entries are removed by clearWindowCaches.
const yomitanProfileMetadataByWindow = new WeakMap<BrowserWindow, YomitanProfileMetadata>();
// Frequency entries cached per parser window, keyed by makeTermReadingCacheKey(term, reading).
const yomitanFrequencyCacheByWindow = new WeakMap<BrowserWindow, Map<string, YomitanTermFrequency[]>>();
/**
 * Type guard for "some non-null object".
 *
 * Arrays also pass, because their `typeof` is `'object'`.
 */
function isObject(value: unknown): value is Record<string, unknown> {
  return typeof value === 'object' && value !== null;
}
/**
 * Builds the cache key for a term/reading pair.
 *
 * The two parts are joined with a NUL character; a missing reading contributes
 * an empty string.
 */
function makeTermReadingCacheKey(term: string, reading: string | null): string {
  const readingPart = reading ?? '';
  return term + '\u0000' + readingPart;
}
/**
 * Returns the frequency cache associated with `window`, creating and
 * registering an empty one on first use.
 */
function getWindowFrequencyCache(window: BrowserWindow): Map<string, YomitanTermFrequency[]> {
  const existing = yomitanFrequencyCacheByWindow.get(window);
  if (existing) {
    return existing;
  }

  const created = new Map<string, YomitanTermFrequency[]>();
  yomitanFrequencyCacheByWindow.set(window, created);
  return created;
}
/** Forgets both the cached frequency entries and the cached profile metadata for `window`. */
function clearWindowCaches(window: BrowserWindow): void {
  yomitanFrequencyCacheByWindow.delete(window);
  yomitanProfileMetadataByWindow.delete(window);
}
/**
 * Exported entry point for dropping the cached profile metadata and frequency
 * lookups associated with `window`; delegates to clearWindowCaches.
 */
export function clearYomitanParserCachesForWindow(window: BrowserWindow): void {
clearWindowCaches(window);
}
/**
 * Converts a finite number greater than zero into a positive integer.
 *
 * The value is floored, and results below 1 (inputs in the open range 0..1)
 * are raised to 1. Anything else yields null.
 */
function asPositiveInteger(value: unknown): number | null {
  if (typeof value === 'number' && Number.isFinite(value) && value > 0) {
    const floored = Math.floor(value);
    return floored < 1 ? 1 : floored;
  }
  return null;
}
/**
 * Extracts a positive integer from the start of a display string.
 *
 * Only the leading run of ASCII digits and commas is considered. Commas are
 * treated as thousands separators when every group after the first is exactly
 * three digits; otherwise only the digits before the first comma are used.
 * Returns null when there is no leading digit or the number is not positive.
 */
function parsePositiveFrequencyString(value: string): number | null {
  const numericPrefix = /^\d[\d,]*/.exec(value.trim())?.[0];
  if (!numericPrefix) {
    return null;
  }

  const [leadingGroup = '', ...trailingGroups] = numericPrefix.split(',');
  const isThousandsGrouped = trailingGroups.every((group) => /^\d{3}$/.test(group));
  const digits = isThousandsGrouped ? leadingGroup + trailingGroups.join('') : leadingGroup;

  const parsed = Number.parseInt(digits, 10);
  return Number.isFinite(parsed) && parsed > 0 ? parsed : null;
}
/**
 * Parses a positive integer frequency out of a loosely typed value.
 *
 * Numbers go through asPositiveInteger, strings through
 * parsePositiveFrequencyString, and arrays yield the first element (searched
 * recursively, in order) that parses. Every other input yields null.
 */
function parsePositiveFrequencyValue(value: unknown): number | null {
  if (typeof value === 'string') {
    return parsePositiveFrequencyString(value);
  }

  if (Array.isArray(value)) {
    for (const candidate of value) {
      const parsedCandidate = parsePositiveFrequencyValue(candidate);
      if (parsedCandidate !== null) {
        return parsedCandidate;
      }
    }
    return null;
  }

  // Numbers are validated here; every remaining type falls out as null.
  return asPositiveInteger(value);
}
/**
 * Validates and normalizes one raw frequency entry.
 *
 * Returns null unless the entry is an object with a non-blank string `term`,
 * a non-blank string `dictionary`, and a positive frequency. A frequency
 * parsed from `displayValue` takes precedence over the raw `frequency` field.
 */
function toYomitanTermFrequency(value: unknown): YomitanTermFrequency | null {
  if (!isObject(value)) {
    return null;
  }

  const {
    term: termRaw,
    dictionary: dictionaryRaw,
    frequency: frequencyRaw,
    displayValue: displayValueRaw,
    dictionaryPriority: priorityRaw,
    reading: readingRaw,
    displayValueParsed: displayValueParsedRaw,
  } = value;

  const term = typeof termRaw === 'string' ? termRaw.trim() : '';
  const dictionary = typeof dictionaryRaw === 'string' ? dictionaryRaw.trim() : '';
  if (term.length === 0 || dictionary.length === 0) {
    return null;
  }

  const frequencyFromDisplay =
    displayValueRaw === null || displayValueRaw === undefined
      ? null
      : parsePositiveFrequencyValue(displayValueRaw);
  const frequency = frequencyFromDisplay ?? parsePositiveFrequencyValue(frequencyRaw);
  if (frequency === null) {
    return null;
  }

  // Entries without a usable priority sort after every known dictionary.
  let dictionaryPriority = Number.MAX_SAFE_INTEGER;
  if (typeof priorityRaw === 'number' && Number.isFinite(priorityRaw)) {
    dictionaryPriority = Math.max(0, Math.floor(priorityRaw));
  }

  return {
    term,
    reading: typeof readingRaw === 'string' ? readingRaw : null,
    dictionary,
    dictionaryPriority,
    frequency,
    displayValue: typeof displayValueRaw === 'string' ? displayValueRaw : null,
    displayValueParsed: displayValueParsedRaw === true,
  };
}
/**
 * Cleans up a lookup request list.
 *
 * Terms and readings are trimmed, pairs with a blank term are dropped, blank
 * readings become null, and duplicate term+reading pairs are removed while
 * keeping the first occurrence's position.
 */
function normalizeTermReadingList(termReadingList: YomitanTermReadingPair[]): YomitanTermReadingPair[] {
  // A Map keeps insertion order, so the output order matches first appearances.
  const uniquePairs = new Map<string, YomitanTermReadingPair>();

  for (const { term: rawTerm, reading: rawReading } of termReadingList) {
    const term = typeof rawTerm === 'string' ? rawTerm.trim() : '';
    if (term === '') {
      continue;
    }

    const trimmedReading = typeof rawReading === 'string' ? rawReading.trim() : '';
    const reading = trimmedReading.length > 0 ? trimmedReading : null;

    const dedupeKey = `${term}\u0000${reading ?? ''}`;
    if (!uniquePairs.has(dedupeKey)) {
      uniquePairs.set(dedupeKey, { term, reading });
    }
  }

  return [...uniquePairs.values()];
}
/**
 * Normalizes profile metadata from either of two shapes: the already reduced
 * `{ profileIndex, scanLength, dictionaries, dictionaryPriorityByName }` object,
 * or a raw options payload with `profileCurrent` and `profiles[]`.
 *
 * - `profileIndex` falls back to 0 when missing or not a finite number.
 * - `scanLength` falls back to DEFAULT_YOMITAN_SCAN_LENGTH.
 * - Dictionary entries may be plain names or objects; objects with
 *   `enabled === false`, a non-string name, or a blank name are skipped.
 *   Names are ordered by ascending numeric `id`, or by array position when
 *   `id` is absent.
 * - Explicit priorities from `dictionaryPriorityByName` win; any listed
 *   dictionary without one gets its position in the ordered list.
 *
 * Priorities are accumulated in a Map so that dictionary names colliding with
 * `Object.prototype` members (for example "constructor") are still assigned
 * a priority.
 *
 * @returns The normalized metadata, or null when `value` is not an object.
 */
function toYomitanProfileMetadata(value: unknown): YomitanProfileMetadata | null {
  if (!isObject(value)) {
    return null;
  }

  const profileIndexRaw = value.profileIndex ?? value.profileCurrent;
  const profileIndex =
    typeof profileIndexRaw === 'number' && Number.isFinite(profileIndexRaw)
      ? Math.max(0, Math.floor(profileIndexRaw))
      : 0;

  // Options of the selected profile, present only for the raw payload shape.
  const profilesRaw = value.profiles;
  const activeProfile: unknown = Array.isArray(profilesRaw) ? profilesRaw[profileIndex] : undefined;
  const activeProfileOptions = isObject(activeProfile)
    ? (
        activeProfile as {
          options?: { scanning?: { length?: unknown }; dictionaries?: unknown };
        }
      ).options
    : undefined;

  const scanLengthRaw = value.scanLength ?? activeProfileOptions?.scanning?.length;
  const scanLength =
    typeof scanLengthRaw === 'number' && Number.isFinite(scanLengthRaw)
      ? Math.max(1, Math.floor(scanLengthRaw))
      : DEFAULT_YOMITAN_SCAN_LENGTH;

  const dictionariesRaw: unknown = value.dictionaries ?? activeProfileOptions?.dictionaries;
  const rankedDictionaries: { name: string; priority: number }[] = [];
  if (Array.isArray(dictionariesRaw)) {
    for (let index = 0; index < dictionariesRaw.length; index += 1) {
      const entry: unknown = dictionariesRaw[index];

      if (typeof entry === 'string') {
        const name = entry.trim();
        if (name.length > 0) {
          rankedDictionaries.push({ name, priority: index });
        }
        continue;
      }

      if (!isObject(entry) || entry.enabled === false || typeof entry.name !== 'string') {
        continue;
      }
      const name = entry.name.trim();
      if (name.length === 0) {
        continue;
      }

      const idRaw = entry.id;
      const priority =
        typeof idRaw === 'number' && Number.isFinite(idRaw)
          ? Math.max(0, Math.floor(idRaw))
          : index;
      rankedDictionaries.push({ name, priority });
    }
  }
  // Array.prototype.sort is stable, so equal priorities keep their input order.
  const dictionaries = rankedDictionaries
    .sort((a, b) => a.priority - b.priority)
    .map((entry) => entry.name);

  const priorityByName = new Map<string, number>();
  const explicitPriorities = value.dictionaryPriorityByName;
  if (isObject(explicitPriorities)) {
    for (const [name, priorityRaw] of Object.entries(explicitPriorities)) {
      if (typeof priorityRaw !== 'number' || !Number.isFinite(priorityRaw)) {
        continue;
      }
      const normalizedName = name.trim();
      if (normalizedName.length === 0) {
        continue;
      }
      priorityByName.set(normalizedName, Math.max(0, Math.floor(priorityRaw)));
    }
  }
  dictionaries.forEach((dictionary, index) => {
    if (!priorityByName.has(dictionary)) {
      priorityByName.set(dictionary, index);
    }
  });

  return {
    profileIndex,
    scanLength,
    dictionaries,
    // fromEntries defines own data properties, so even "__proto__" is stored as a key.
    dictionaryPriorityByName: Object.fromEntries(priorityByName),
  };
}
/**
 * Normalizes raw frequency entries and stamps each with its dictionary's
 * priority from `dictionaryPriorityByName`.
 *
 * Entries rejected by toYomitanTermFrequency are dropped. When the map has no
 * own entry for an entry's dictionary, the priority already on the normalized
 * entry is kept.
 *
 * Only own properties of the map are consulted, so a dictionary named like an
 * `Object.prototype` member (for example "toString") cannot pick up an
 * inherited non-numeric value as its priority.
 */
function normalizeFrequencyEntriesWithPriority(
  rawResult: unknown[],
  dictionaryPriorityByName: Record<string, number>,
): YomitanTermFrequency[] {
  const normalized: YomitanTermFrequency[] = [];
  for (const entry of rawResult) {
    const frequency = toYomitanTermFrequency(entry);
    if (!frequency) {
      continue;
    }
    const mappedPriority = Object.prototype.hasOwnProperty.call(
      dictionaryPriorityByName,
      frequency.dictionary,
    )
      ? dictionaryPriorityByName[frequency.dictionary]
      : undefined;
    normalized.push({
      ...frequency,
      dictionaryPriority:
        mappedPriority !== undefined ? mappedPriority : frequency.dictionaryPriority,
    });
  }
  return normalized;
}
/**
 * Buckets entries by their term+reading cache key.
 *
 * The term is trimmed, and a blank or non-string reading is treated as null
 * when building the key. Entries keep their input order within a bucket.
 */
function groupFrequencyEntriesByPair(
  entries: YomitanTermFrequency[],
): Map<string, YomitanTermFrequency[]> {
  const buckets = new Map<string, YomitanTermFrequency[]>();

  for (const entry of entries) {
    const trimmedReading = typeof entry.reading === 'string' ? entry.reading.trim() : '';
    const key = makeTermReadingCacheKey(
      entry.term.trim(),
      trimmedReading.length > 0 ? trimmedReading : null,
    );

    const bucket = buckets.get(key);
    if (bucket === undefined) {
      buckets.set(key, [entry]);
    } else {
      bucket.push(entry);
    }
  }

  return buckets;
}
/**
 * Buckets entries by trimmed term, ignoring the reading.
 *
 * Entries whose term is blank after trimming are skipped. Entries keep their
 * input order within a bucket.
 */
function groupFrequencyEntriesByTerm(
  entries: YomitanTermFrequency[],
): Map<string, YomitanTermFrequency[]> {
  const buckets = new Map<string, YomitanTermFrequency[]>();

  for (const entry of entries) {
    const term = entry.term.trim();
    if (term.length === 0) {
      continue;
    }

    let bucket = buckets.get(term);
    if (bucket === undefined) {
      bucket = [];
      buckets.set(term, bucket);
    }
    bucket.push(entry);
  }

  return buckets;
}
/**
 * Returns the profile metadata for `parserWindow`, fetching it on first use.
 *
 * A cached value for the window is returned as-is. Otherwise a script is run
 * in the window that calls the backend's `optionsGetFull` and reduces the
 * result to the profile index, scan length, enabled dictionary names and
 * their priorities. The normalized result is cached per window.
 *
 * @returns The metadata, or null when the script result cannot be normalized
 * or the request fails (failures are logged through `logger.error`).
 */
async function requestYomitanProfileMetadata(
  parserWindow: BrowserWindow,
  logger: LoggerLike,
): Promise<YomitanProfileMetadata | null> {
  const cached = yomitanProfileMetadataByWindow.get(parserWindow);
  if (cached) {
    return cached;
  }

  const script = `
(async () => {
const invoke = (action, params) =>
new Promise((resolve, reject) => {
chrome.runtime.sendMessage({ action, params }, (response) => {
if (chrome.runtime.lastError) {
reject(new Error(chrome.runtime.lastError.message));
return;
}
if (!response || typeof response !== "object") {
reject(new Error("Invalid response from Yomitan backend"));
return;
}
if (response.error) {
reject(new Error(response.error.message || "Yomitan backend error"));
return;
}
resolve(response.result);
});
});
const optionsFull = await invoke("optionsGetFull", undefined);
const profileIndex =
typeof optionsFull.profileCurrent === "number" && Number.isFinite(optionsFull.profileCurrent)
? Math.max(0, Math.floor(optionsFull.profileCurrent))
: 0;
const scanLengthRaw = optionsFull.profiles?.[profileIndex]?.options?.scanning?.length;
const scanLength =
typeof scanLengthRaw === "number" && Number.isFinite(scanLengthRaw)
? Math.max(1, Math.floor(scanLengthRaw))
: ${DEFAULT_YOMITAN_SCAN_LENGTH};
const dictionariesRaw = optionsFull.profiles?.[profileIndex]?.options?.dictionaries ?? [];
const dictionaryEntries = Array.isArray(dictionariesRaw)
? dictionariesRaw
.filter((entry) => entry && typeof entry === "object" && entry.enabled === true && typeof entry.name === "string")
.map((entry, index) => ({
name: entry.name,
id: typeof entry.id === "number" && Number.isFinite(entry.id) ? Math.max(0, Math.floor(entry.id)) : index
}))
.sort((a, b) => a.id - b.id)
: [];
const dictionaries = dictionaryEntries.map((entry) => entry.name);
const dictionaryPriorityByName = dictionaryEntries.reduce((acc, entry, index) => {
acc[entry.name] = index;
return acc;
}, {});
return { profileIndex, scanLength, dictionaries, dictionaryPriorityByName };
})();
`;

  try {
    const rawMetadata: unknown = await parserWindow.webContents.executeJavaScript(script, true);
    const metadata = toYomitanProfileMetadata(rawMetadata);
    if (!metadata) {
      return null;
    }
    yomitanProfileMetadataByWindow.set(parserWindow, metadata);
    return metadata;
  } catch (err) {
    // A rejection is not guaranteed to be an Error; avoid logging `undefined`.
    const reason = err instanceof Error ? err.message : String(err);
    logger.error('Yomitan parser metadata request failed:', reason);
    return null;
  }
}
async function ensureYomitanParserWindow(
deps: YomitanParserRuntimeDeps,
logger: LoggerLike,
@@ -58,6 +448,7 @@ async function ensureYomitanParserWindow(
);
parserWindow.on('closed', () => {
clearWindowCaches(parserWindow);
if (deps.getYomitanParserWindow() === parserWindow) {
deps.setYomitanParserWindow(null);
deps.setYomitanParserReadyPromise(null);
@@ -77,6 +468,7 @@ async function ensureYomitanParserWindow(
if (!parserWindow.isDestroyed()) {
parserWindow.destroy();
}
clearWindowCaches(parserWindow);
if (deps.getYomitanParserWindow() === parserWindow) {
deps.setYomitanParserWindow(null);
deps.setYomitanParserReadyPromise(null);
@@ -108,7 +500,40 @@ export async function requestYomitanParseResults(
return null;
}
const script = `
const metadata = await requestYomitanProfileMetadata(parserWindow, logger);
const script =
metadata !== null
? `
(async () => {
const invoke = (action, params) =>
new Promise((resolve, reject) => {
chrome.runtime.sendMessage({ action, params }, (response) => {
if (chrome.runtime.lastError) {
reject(new Error(chrome.runtime.lastError.message));
return;
}
if (!response || typeof response !== "object") {
reject(new Error("Invalid response from Yomitan backend"));
return;
}
if (response.error) {
reject(new Error(response.error.message || "Yomitan backend error"));
return;
}
resolve(response.result);
});
});
return await invoke("parseText", {
text: ${JSON.stringify(text)},
optionsContext: { index: ${metadata.profileIndex} },
scanLength: ${metadata.scanLength},
useInternalParser: true,
useMecabParser: true
});
})();
`
: `
(async () => {
const invoke = (action, params) =>
new Promise((resolve, reject) => {
@@ -132,7 +557,7 @@ export async function requestYomitanParseResults(
const optionsFull = await invoke("optionsGetFull", undefined);
const profileIndex = optionsFull.profileCurrent;
const scanLength =
optionsFull.profiles?.[profileIndex]?.options?.scanning?.length ?? 40;
optionsFull.profiles?.[profileIndex]?.options?.scanning?.length ?? ${DEFAULT_YOMITAN_SCAN_LENGTH};
return await invoke("parseText", {
text: ${JSON.stringify(text)},
@@ -152,3 +577,278 @@ export async function requestYomitanParseResults(
return null;
}
}
/**
 * Looks up frequency entries for the given term/reading pairs through the
 * Yomitan parser window, caching results per window.
 *
 * Flow:
 * 1. The request list is normalized. An empty list, a missing extension, or an
 *    unavailable parser window yields `[]`.
 * 2. Pairs already in the window's cache are served from it; only the missing
 *    pairs are sent to the backend's `getTermFrequencies`.
 * 3. When cached profile metadata lists at least one dictionary, those
 *    dictionaries are queried directly. Otherwise a fallback script resolves
 *    the enabled dictionaries itself via `optionsGetFull`.
 * 4. Each missing pair is cached with its exact term+reading matches, else
 *    with all entries for the term, else with an empty list.
 *
 * @returns Cached entries for every requested pair (in request order),
 * followed by fetched entries whose term was not among the missing pairs.
 * If the request fails, the error is logged and whatever is cached is returned.
 */
export async function requestYomitanTermFrequencies(
  termReadingList: YomitanTermReadingPair[],
  deps: YomitanParserRuntimeDeps,
  logger: LoggerLike,
): Promise<YomitanTermFrequency[]> {
  const normalizedTermReadingList = normalizeTermReadingList(termReadingList);
  const yomitanExt = deps.getYomitanExt();
  if (normalizedTermReadingList.length === 0 || !yomitanExt) {
    return [];
  }

  const isReady = await ensureYomitanParserWindow(deps, logger);
  const parserWindow = deps.getYomitanParserWindow();
  if (!isReady || !parserWindow || parserWindow.isDestroyed()) {
    return [];
  }

  const metadata = await requestYomitanProfileMetadata(parserWindow, logger);
  const frequencyCache = getWindowFrequencyCache(parserWindow);

  // Collects the cached entries of every requested pair, in request order.
  const buildCachedResult = (): YomitanTermFrequency[] =>
    normalizedTermReadingList.flatMap(
      (pair) => frequencyCache.get(makeTermReadingCacheKey(pair.term, pair.reading)) ?? [],
    );

  const missingTermReadingList = normalizedTermReadingList.filter(
    (pair) => !frequencyCache.has(makeTermReadingCacheKey(pair.term, pair.reading)),
  );
  if (missingTermReadingList.length === 0) {
    return buildCachedResult();
  }

  // Non-null only when the cached metadata can drive the lookup directly.
  const activeMetadata = metadata !== null && metadata.dictionaries.length > 0 ? metadata : null;

  let script: string;
  if (activeMetadata !== null) {
    script = `
(async () => {
const invoke = (action, params) =>
new Promise((resolve, reject) => {
chrome.runtime.sendMessage({ action, params }, (response) => {
if (chrome.runtime.lastError) {
reject(new Error(chrome.runtime.lastError.message));
return;
}
if (!response || typeof response !== "object") {
reject(new Error("Invalid response from Yomitan backend"));
return;
}
if (response.error) {
reject(new Error(response.error.message || "Yomitan backend error"));
return;
}
resolve(response.result);
});
});
return await invoke("getTermFrequencies", {
termReadingList: ${JSON.stringify(missingTermReadingList)},
dictionaries: ${JSON.stringify(activeMetadata.dictionaries)}
});
})();
`;
  } else {
    script = `
(async () => {
const invoke = (action, params) =>
new Promise((resolve, reject) => {
chrome.runtime.sendMessage({ action, params }, (response) => {
if (chrome.runtime.lastError) {
reject(new Error(chrome.runtime.lastError.message));
return;
}
if (!response || typeof response !== "object") {
reject(new Error("Invalid response from Yomitan backend"));
return;
}
if (response.error) {
reject(new Error(response.error.message || "Yomitan backend error"));
return;
}
resolve(response.result);
});
});
const optionsFull = await invoke("optionsGetFull", undefined);
const profileIndex = optionsFull.profileCurrent;
const dictionariesRaw = optionsFull.profiles?.[profileIndex]?.options?.dictionaries ?? [];
const dictionaryEntries = Array.isArray(dictionariesRaw)
? dictionariesRaw
.filter((entry) => entry && typeof entry === "object" && entry.enabled === true && typeof entry.name === "string")
.map((entry, index) => ({
name: entry.name,
id: typeof entry.id === "number" && Number.isFinite(entry.id) ? Math.floor(entry.id) : index
}))
.sort((a, b) => a.id - b.id)
: [];
const dictionaries = dictionaryEntries.map((entry) => entry.name);
const dictionaryPriorityByName = dictionaryEntries.reduce((acc, entry, index) => {
acc[entry.name] = index;
return acc;
}, {});
if (dictionaries.length === 0) {
return [];
}
const rawFrequencies = await invoke("getTermFrequencies", {
termReadingList: ${JSON.stringify(missingTermReadingList)},
dictionaries
});
if (!Array.isArray(rawFrequencies)) {
return [];
}
return rawFrequencies
.filter((entry) => entry && typeof entry === "object")
.map((entry) => ({
...entry,
dictionaryPriority:
typeof entry.dictionary === "string" && dictionaryPriorityByName[entry.dictionary] !== undefined
? dictionaryPriorityByName[entry.dictionary]
: Number.MAX_SAFE_INTEGER
}));
})();
`;
  }

  try {
    const rawResult: unknown = await parserWindow.webContents.executeJavaScript(script, true);

    let fetchedEntries: YomitanTermFrequency[] = [];
    if (Array.isArray(rawResult)) {
      fetchedEntries =
        activeMetadata !== null
          ? // Direct lookup: priorities come from the cached metadata.
            normalizeFrequencyEntriesWithPriority(rawResult, activeMetadata.dictionaryPriorityByName)
          : // Fallback lookup: the script already attached dictionaryPriority.
            rawResult
              .map((entry) => toYomitanTermFrequency(entry))
              .filter((entry): entry is YomitanTermFrequency => entry !== null);
    }

    const groupedByPair = groupFrequencyEntriesByPair(fetchedEntries);
    const groupedByTerm = groupFrequencyEntriesByTerm(fetchedEntries);
    for (const pair of missingTermReadingList) {
      const key = makeTermReadingCacheKey(pair.term, pair.reading);
      // Prefer exact term+reading matches; otherwise cache every entry for the term.
      frequencyCache.set(key, groupedByPair.get(key) ?? groupedByTerm.get(pair.term) ?? []);
    }

    const missingTerms = new Set(missingTermReadingList.map((pair) => pair.term));
    const unmatchedEntries = fetchedEntries.filter(
      (entry) => !missingTerms.has(entry.term.trim()),
    );
    return [...buildCachedResult(), ...unmatchedEntries];
  } catch (err) {
    // A rejection is not guaranteed to be an Error; avoid logging `undefined`.
    const reason = err instanceof Error ? err.message : String(err);
    logger.error('Yomitan term frequency request failed:', reason);
    return buildCachedResult();
  }
}
/**
 * Points the Anki server setting of the first Yomitan profile at `serverUrl`.
 *
 * The script run in the parser window only overwrites the setting when the
 * current value is empty or equal to "http://127.0.0.1:8765", and differs from
 * the target. The change is persisted through the backend's `setAllSettings`.
 *
 * @param serverUrl Target server URL; surrounding whitespace is ignored.
 * @returns true only when the script reports that the setting was updated.
 * Returns false for a blank URL, an unavailable parser window, no change, or
 * a failed request (failures are logged through `logger.error`).
 */
export async function syncYomitanDefaultAnkiServer(
  serverUrl: string,
  deps: YomitanParserRuntimeDeps,
  logger: LoggerLike,
): Promise<boolean> {
  const normalizedTargetServer = serverUrl.trim();
  if (!normalizedTargetServer) {
    return false;
  }

  const isReady = await ensureYomitanParserWindow(deps, logger);
  const parserWindow = deps.getYomitanParserWindow();
  if (!isReady || !parserWindow || parserWindow.isDestroyed()) {
    return false;
  }

  const script = `
(async () => {
const invoke = (action, params) =>
new Promise((resolve, reject) => {
chrome.runtime.sendMessage({ action, params }, (response) => {
if (chrome.runtime.lastError) {
reject(new Error(chrome.runtime.lastError.message));
return;
}
if (!response || typeof response !== "object") {
reject(new Error("Invalid response from Yomitan backend"));
return;
}
if (response.error) {
reject(new Error(response.error.message || "Yomitan backend error"));
return;
}
resolve(response.result);
});
});
const targetServer = ${JSON.stringify(normalizedTargetServer)};
const optionsFull = await invoke("optionsGetFull", undefined);
const profiles = Array.isArray(optionsFull.profiles) ? optionsFull.profiles : [];
if (profiles.length === 0) {
return { updated: false, reason: "no-profiles" };
}
const defaultProfile = profiles[0];
if (!defaultProfile || typeof defaultProfile !== "object") {
return { updated: false, reason: "invalid-default-profile" };
}
defaultProfile.options = defaultProfile.options && typeof defaultProfile.options === "object"
? defaultProfile.options
: {};
defaultProfile.options.anki = defaultProfile.options.anki && typeof defaultProfile.options.anki === "object"
? defaultProfile.options.anki
: {};
const currentServerRaw = defaultProfile.options.anki.server;
const currentServer = typeof currentServerRaw === "string" ? currentServerRaw.trim() : "";
const canReplaceDefault =
currentServer.length === 0 || currentServer === "http://127.0.0.1:8765";
if (!canReplaceDefault || currentServer === targetServer) {
return { updated: false, reason: "no-change", currentServer, targetServer };
}
defaultProfile.options.anki.server = targetServer;
await invoke("setAllSettings", { value: optionsFull, source: "subminer" });
return { updated: true, currentServer, targetServer };
})();
`;

  try {
    const result: unknown = await parserWindow.webContents.executeJavaScript(script, true);
    const updated =
      typeof result === 'object' &&
      result !== null &&
      (result as { updated?: unknown }).updated === true;
    if (updated) {
      logger.info?.(`Updated Yomitan default profile Anki server to ${normalizedTargetServer}`);
      return true;
    }
    return false;
  } catch (err) {
    // A rejection is not guaranteed to be an Error; avoid logging `undefined`.
    const reason = err instanceof Error ? err.message : String(err);
    logger.error('Failed to sync Yomitan default profile Anki server:', reason);
    return false;
  }
}