fix(subtitle): stabilize frequency highlighting with yomitan ranks

This commit is contained in:
2026-02-28 16:44:28 -08:00
parent 9c2618c4c7
commit d2af09d941
22 changed files with 536 additions and 189 deletions

View File

@@ -33,6 +33,7 @@ export const SUBTITLE_DEFAULT_CONFIG: Pick<ResolvedConfig, 'subtitleStyle'> = {
sourcePath: '',
topX: 1000,
mode: 'single',
matchMode: 'headword',
singleColor: '#f5a97f',
bandedColors: ['#ed8796', '#f5a97f', '#f9e2af', '#a6e3a1', '#8aadf4'],
},

View File

@@ -61,6 +61,14 @@ export function buildSubtitleConfigOptionRegistry(
description:
'single: use one color for all matching tokens. banded: use color ramp by frequency band.',
},
{
path: 'subtitleStyle.frequencyDictionary.matchMode',
kind: 'enum',
enumValues: ['headword', 'surface'],
defaultValue: defaultConfig.subtitleStyle.frequencyDictionary.matchMode,
description:
'headword: frequency lookup uses dictionary form. surface: lookup uses subtitle-visible token text.',
},
{
path: 'subtitleStyle.frequencyDictionary.singleColor',
kind: 'string',

View File

@@ -102,9 +102,18 @@ export function applySubtitleDomainConfig(context: ResolveContext): void {
const fallbackSubtitleStyleHoverTokenColor = resolved.subtitleStyle.hoverTokenColor;
const fallbackSubtitleStyleHoverTokenBackgroundColor =
resolved.subtitleStyle.hoverTokenBackgroundColor;
const fallbackFrequencyDictionary = {
...resolved.subtitleStyle.frequencyDictionary,
};
resolved.subtitleStyle = {
...resolved.subtitleStyle,
...(src.subtitleStyle as ResolvedConfig['subtitleStyle']),
frequencyDictionary: {
...resolved.subtitleStyle.frequencyDictionary,
...(isObject((src.subtitleStyle as { frequencyDictionary?: unknown }).frequencyDictionary)
? ((src.subtitleStyle as { frequencyDictionary?: unknown }).frequencyDictionary as ResolvedConfig['subtitleStyle']['frequencyDictionary'])
: {}),
},
secondary: {
...resolved.subtitleStyle.secondary,
...(isObject(src.subtitleStyle.secondary)
@@ -186,6 +195,7 @@ export function applySubtitleDomainConfig(context: ResolveContext): void {
if (frequencyEnabled !== undefined) {
resolved.subtitleStyle.frequencyDictionary.enabled = frequencyEnabled;
} else if ((frequencyDictionary as { enabled?: unknown }).enabled !== undefined) {
resolved.subtitleStyle.frequencyDictionary.enabled = fallbackFrequencyDictionary.enabled;
warn(
'subtitleStyle.frequencyDictionary.enabled',
(frequencyDictionary as { enabled?: unknown }).enabled,
@@ -198,6 +208,7 @@ export function applySubtitleDomainConfig(context: ResolveContext): void {
if (sourcePath !== undefined) {
resolved.subtitleStyle.frequencyDictionary.sourcePath = sourcePath;
} else if ((frequencyDictionary as { sourcePath?: unknown }).sourcePath !== undefined) {
resolved.subtitleStyle.frequencyDictionary.sourcePath = fallbackFrequencyDictionary.sourcePath;
warn(
'subtitleStyle.frequencyDictionary.sourcePath',
(frequencyDictionary as { sourcePath?: unknown }).sourcePath,
@@ -210,6 +221,7 @@ export function applySubtitleDomainConfig(context: ResolveContext): void {
if (topX !== undefined && Number.isInteger(topX) && topX > 0) {
resolved.subtitleStyle.frequencyDictionary.topX = Math.floor(topX);
} else if ((frequencyDictionary as { topX?: unknown }).topX !== undefined) {
resolved.subtitleStyle.frequencyDictionary.topX = fallbackFrequencyDictionary.topX;
warn(
'subtitleStyle.frequencyDictionary.topX',
(frequencyDictionary as { topX?: unknown }).topX,
@@ -222,6 +234,7 @@ export function applySubtitleDomainConfig(context: ResolveContext): void {
if (frequencyMode === 'single' || frequencyMode === 'banded') {
resolved.subtitleStyle.frequencyDictionary.mode = frequencyMode;
} else if (frequencyMode !== undefined) {
resolved.subtitleStyle.frequencyDictionary.mode = fallbackFrequencyDictionary.mode;
warn(
'subtitleStyle.frequencyDictionary.mode',
frequencyDictionary.mode,
@@ -230,10 +243,24 @@ export function applySubtitleDomainConfig(context: ResolveContext): void {
);
}
const frequencyMatchMode = (frequencyDictionary as { matchMode?: unknown }).matchMode;
if (frequencyMatchMode === 'headword' || frequencyMatchMode === 'surface') {
resolved.subtitleStyle.frequencyDictionary.matchMode = frequencyMatchMode;
} else if (frequencyMatchMode !== undefined) {
resolved.subtitleStyle.frequencyDictionary.matchMode = fallbackFrequencyDictionary.matchMode;
warn(
'subtitleStyle.frequencyDictionary.matchMode',
frequencyMatchMode,
resolved.subtitleStyle.frequencyDictionary.matchMode,
"Expected 'headword' or 'surface'.",
);
}
const singleColor = asColor((frequencyDictionary as { singleColor?: unknown }).singleColor);
if (singleColor !== undefined) {
resolved.subtitleStyle.frequencyDictionary.singleColor = singleColor;
} else if ((frequencyDictionary as { singleColor?: unknown }).singleColor !== undefined) {
resolved.subtitleStyle.frequencyDictionary.singleColor = fallbackFrequencyDictionary.singleColor;
warn(
'subtitleStyle.frequencyDictionary.singleColor',
(frequencyDictionary as { singleColor?: unknown }).singleColor,
@@ -248,6 +275,8 @@ export function applySubtitleDomainConfig(context: ResolveContext): void {
if (bandedColors !== undefined) {
resolved.subtitleStyle.frequencyDictionary.bandedColors = bandedColors;
} else if ((frequencyDictionary as { bandedColors?: unknown }).bandedColors !== undefined) {
resolved.subtitleStyle.frequencyDictionary.bandedColors =
fallbackFrequencyDictionary.bandedColors;
warn(
'subtitleStyle.frequencyDictionary.bandedColors',
(frequencyDictionary as { bandedColors?: unknown }).bandedColors,

View File

@@ -27,3 +27,32 @@ test('subtitleStyle preserveLineBreaks falls back while merge is preserved', ()
),
);
});
// Checks both sides of matchMode validation: a supported value is kept as-is,
// while an unsupported one resolves to 'headword' and records a warning.
test('subtitleStyle frequencyDictionary.matchMode accepts valid values and warns on invalid', () => {
  // Supported value: must survive resolution untouched.
  const accepted = createResolveContext({
    subtitleStyle: { frequencyDictionary: { matchMode: 'surface' } },
  });
  applySubtitleDomainConfig(accepted.context);
  const acceptedMode = accepted.context.resolved.subtitleStyle.frequencyDictionary.matchMode;
  assert.equal(acceptedMode, 'surface');

  // Unsupported value: the cast only bypasses the compiler so the runtime
  // validation path can be exercised.
  const rejected = createResolveContext({
    subtitleStyle: {
      frequencyDictionary: { matchMode: 'reading' as unknown as 'headword' | 'surface' },
    },
  });
  applySubtitleDomainConfig(rejected.context);
  const rejectedMode = rejected.context.resolved.subtitleStyle.frequencyDictionary.matchMode;
  assert.equal(rejectedMode, 'headword');

  const hasMatchModeWarning = rejected.warnings.some((warning) => {
    if (warning.path !== 'subtitleStyle.frequencyDictionary.matchMode') {
      return false;
    }
    return warning.message === "Expected 'headword' or 'surface'.";
  });
  assert.ok(hasMatchModeWarning);
});

View File

@@ -80,7 +80,7 @@ test('createFrequencyDictionaryLookup aggregates duplicate-term logs into a sing
);
});
test('createFrequencyDictionaryLookup prefers frequency.value over displayValue', async () => {
test('createFrequencyDictionaryLookup prefers frequency.displayValue over value when both exist', async () => {
const logs: string[] = [];
const tempDir = fs.mkdtempSync(path.join(os.tmpdir(), 'subminer-frequency-dict-'));
const bankPath = path.join(tempDir, 'term_meta_bank_1.json');
@@ -88,6 +88,7 @@ test('createFrequencyDictionaryLookup prefers frequency.value over displayValue'
bankPath,
JSON.stringify([
['猫', 1, { frequency: { value: 1234, displayValue: 1200 } }],
['鍛える', 2, { frequency: { value: 46961, displayValue: 2847 } }],
['犬', 2, { frequency: { displayValue: 88 } }],
]),
);
@@ -99,10 +100,31 @@ test('createFrequencyDictionaryLookup prefers frequency.value over displayValue'
},
});
assert.equal(lookup('猫'), 1234);
assert.equal(lookup('猫'), 1200);
assert.equal(lookup('鍛える'), 2847);
assert.equal(lookup('犬'), 88);
assert.equal(
logs.some((entry) => entry.includes('Frequency dictionary loaded from')),
true,
);
});
// A composite displayValue such as "3272,52377" must resolve to its first
// component (the primary rank), not to the digits joined together.
test('createFrequencyDictionaryLookup parses composite displayValue by primary rank', async () => {
  const dictionaryDir = fs.mkdtempSync(path.join(os.tmpdir(), 'subminer-frequency-dict-'));
  const termMetaBank = [
    ['鍛える', 1, { frequency: { displayValue: '3272,52377' } }],
    ['高み', 2, { frequency: { displayValue: '9933,108961' } }],
  ];
  fs.writeFileSync(
    path.join(dictionaryDir, 'term_meta_bank_1.json'),
    JSON.stringify(termMetaBank),
  );

  const lookup = await createFrequencyDictionaryLookup({
    searchPaths: [dictionaryDir],
    log: () => undefined,
  });

  const rankForKitaeru = lookup('鍛える');
  const rankForTakami = lookup('高み');
  assert.equal(rankForKitaeru, 3272);
  assert.equal(rankForTakami, 9933);
});

View File

@@ -18,6 +18,32 @@ function normalizeFrequencyTerm(value: string): string {
return value.trim().toLowerCase();
}
/**
 * Parses the leading positive integer out of a frequency string.
 *
 * Two comma conventions are supported:
 * - thousands separators ("1,234" -> 1234): the leading group has 1-3 digits
 *   and every following group has exactly 3 digits;
 * - composite values ("3272,52377" -> 3272): anything else that contains a
 *   comma; only the first component (the primary rank) is used.
 *
 * The leading group is validated as well, so a composite value whose second
 * component happens to be three digits long ("3272,523") is not mistaken for
 * the single number 3272523.
 *
 * @param value Raw frequency text; surrounding whitespace and any non-numeric
 *   suffix are ignored.
 * @returns The parsed integer, or `null` when the text does not start with a
 *   digit or the parsed value is not a positive finite number.
 */
function parsePositiveFrequencyString(value: string): number | null {
  // Only the leading run of digits and commas is considered.
  const numericPrefix = value.trim().match(/^\d[\d,]*/)?.[0];
  if (!numericPrefix) {
    return null;
  }

  const [leadingChunk = '', ...trailingChunks] = numericPrefix.split(',');
  const isThousandsGrouped =
    trailingChunks.length > 0 &&
    /^\d{1,3}$/.test(leadingChunk) &&
    trailingChunks.every((chunk) => /^\d{3}$/.test(chunk));
  const digits = isThousandsGrouped ? leadingChunk + trailingChunks.join('') : leadingChunk;

  const parsed = Number.parseInt(digits, 10);
  if (!Number.isFinite(parsed) || parsed <= 0) {
    return null;
  }
  return parsed;
}
function parsePositiveFrequencyNumber(value: unknown): number | null {
if (typeof value === 'number') {
if (!Number.isFinite(value) || value <= 0) return null;
@@ -25,10 +51,7 @@ function parsePositiveFrequencyNumber(value: unknown): number | null {
}
if (typeof value === 'string') {
const normalized = value.trim().replace(/,/g, '');
const parsed = Number.parseInt(normalized, 10);
if (!Number.isFinite(parsed) || parsed <= 0) return null;
return parsed;
return parsePositiveFrequencyString(value);
}
return null;
@@ -38,14 +61,14 @@ function extractFrequencyDisplayValue(meta: unknown): number | null {
if (!meta || typeof meta !== 'object') return null;
const frequency = (meta as { frequency?: unknown }).frequency;
if (!frequency || typeof frequency !== 'object') return null;
const rawValue = (frequency as { value?: unknown }).value;
const parsedValue = parsePositiveFrequencyNumber(rawValue);
if (parsedValue !== null) {
return parsedValue;
const displayValue = (frequency as { displayValue?: unknown }).displayValue;
const parsedDisplayValue = parsePositiveFrequencyNumber(displayValue);
if (parsedDisplayValue !== null) {
return parsedDisplayValue;
}
const displayValue = (frequency as { displayValue?: unknown }).displayValue;
return parsePositiveFrequencyNumber(displayValue);
const rawValue = (frequency as { value?: unknown }).value;
return parsePositiveFrequencyNumber(rawValue);
}
function asFrequencyDictionaryEntry(entry: unknown): FrequencyDictionaryEntry | null {

View File

@@ -218,6 +218,119 @@ test('tokenizeSubtitle loads frequency ranks from Yomitan installed dictionaries
assert.equal(result.tokens?.[0]?.frequencyRank, 77);
});
// Verifies the shape of the frequency request: the mock parser window only
// returns a frequency entry when the serialized script contains the headword
// 鍛える paired with a null reading. Any other request yields an empty list, in
// which case the final frequencyRank assertion could not be satisfied by this mock.
test('tokenizeSubtitle queries headword frequencies without forcing surface reading', async () => {
const result = await tokenizeSubtitle(
'鍛えた',
makeDeps({
getFrequencyDictionaryEnabled: () => true,
getYomitanExt: () => ({ id: 'dummy-ext' }) as any,
getYomitanParserWindow: () =>
({
isDestroyed: () => false,
webContents: {
// The mock tells the two kinds of injected script apart by substring.
executeJavaScript: async (script: string) => {
if (script.includes('getTermFrequencies')) {
// Reject requests that do not carry the headword with reading: null.
if (!script.includes('"term":"鍛える","reading":null')) {
return [];
}
return [
{
term: '鍛える',
reading: 'きたえる',
dictionary: 'freq-dict',
// The raw frequency differs from the first component of displayValue;
// the test expects 2847 (from displayValue), not 46961.
frequency: 46961,
displayValue: '2847,46961',
displayValueParsed: true,
},
];
}
// Every other script receives the parse result: one token whose surface
// text is 鍛えた and whose headword is 鍛える.
return [
{
source: 'scanning-parser',
index: 0,
content: [
[
{
text: '鍛えた',
reading: 'きた',
headwords: [[{ term: '鍛える' }]],
},
],
],
},
];
},
},
}) as unknown as Electron.BrowserWindow,
}),
);
assert.equal(result.tokens?.length, 1);
assert.equal(result.tokens?.[0]?.headword, '鍛える');
// The token keeps the reading reported for the surface text.
assert.equal(result.tokens?.[0]?.reading, 'きた');
assert.equal(result.tokens?.[0]?.frequencyRank, 2847);
});
// Verifies dictionary-priority selection: the mock returns two frequency
// entries for 猫, and the expected rank is the one from the entry with the
// numerically lower dictionaryPriority (0), even though the other entry has
// the numerically smaller frequency.
test('tokenizeSubtitle prefers Yomitan frequency from highest-priority dictionary', async () => {
const result = await tokenizeSubtitle(
'猫',
makeDeps({
getFrequencyDictionaryEnabled: () => true,
getYomitanExt: () => ({ id: 'dummy-ext' }) as any,
getYomitanParserWindow: () =>
({
isDestroyed: () => false,
webContents: {
// The mock tells the two kinds of injected script apart by substring.
executeJavaScript: async (script: string) => {
if (script.includes('getTermFrequencies')) {
return [
// Listed first and with the smaller frequency, but priority 2.
{
term: '猫',
reading: 'ねこ',
dictionary: 'low-priority',
dictionaryPriority: 2,
frequency: 5,
displayValue: '5',
displayValueParsed: true,
},
// Priority 0: this entry's frequency is the expected result.
{
term: '猫',
reading: 'ねこ',
dictionary: 'high-priority',
dictionaryPriority: 0,
frequency: 100,
displayValue: '100',
displayValueParsed: true,
},
];
}
// Every other script receives the parse result: a single token 猫.
return [
{
source: 'scanning-parser',
index: 0,
content: [
[
{
text: '猫',
reading: 'ねこ',
headwords: [[{ term: '猫' }]],
},
],
],
},
];
},
},
}) as unknown as Electron.BrowserWindow,
}),
);
assert.equal(result.tokens?.length, 1);
assert.equal(result.tokens?.[0]?.frequencyRank, 100);
});
test('tokenizeSubtitle uses only selected Yomitan headword for frequency lookup', async () => {
const result = await tokenizeSubtitle(
'猫です',
@@ -1693,6 +1806,20 @@ test('tokenizeSubtitle checks known words by surface when configured', async ()
assert.equal(result.tokens?.[0]?.isKnown, true);
});
// With matchMode 'surface', the rank lookup must be keyed by the visible token
// text (鍛えた): the stub lookup returns a rank for that text only.
test('tokenizeSubtitle uses frequency surface match mode when configured', async () => {
  const yomitanTokens = [{ surface: '鍛えた', reading: 'きたえた', headword: '鍛える' }];
  const deps = makeDepsFromYomitanTokens(yomitanTokens, {
    getFrequencyDictionaryEnabled: () => true,
    getFrequencyDictionaryMatchMode: () => 'surface',
    getFrequencyRank: (text) => {
      if (text === '鍛えた') {
        return 2847;
      }
      return null;
    },
  });

  const result = await tokenizeSubtitle('鍛えた', deps);

  assert.equal(result.text, '鍛えた');
  const firstToken = result.tokens?.[0];
  assert.equal(firstToken?.frequencyRank, 2847);
});
test('createTokenizerDepsRuntime checks MeCab availability before first tokenizeWithMecab call', async () => {
let available = false;
let checkCalls = 0;

View File

@@ -2,6 +2,7 @@ import type { BrowserWindow, Extension } from 'electron';
import { mergeTokens } from '../../token-merger';
import { createLogger } from '../../logger';
import {
FrequencyDictionaryMatchMode,
MergedToken,
NPlusOneMatchMode,
SubtitleData,
@@ -36,6 +37,7 @@ export interface TokenizerServiceDeps {
getNPlusOneEnabled?: () => boolean;
getJlptEnabled?: () => boolean;
getFrequencyDictionaryEnabled?: () => boolean;
getFrequencyDictionaryMatchMode?: () => FrequencyDictionaryMatchMode;
getFrequencyRank?: FrequencyDictionaryLookup;
getMinSentenceWordsForNPlusOne?: () => number;
getYomitanGroupDebugEnabled?: () => boolean;
@@ -63,6 +65,7 @@ export interface TokenizerDepsRuntimeOptions {
getNPlusOneEnabled?: () => boolean;
getJlptEnabled?: () => boolean;
getFrequencyDictionaryEnabled?: () => boolean;
getFrequencyDictionaryMatchMode?: () => FrequencyDictionaryMatchMode;
getFrequencyRank?: FrequencyDictionaryLookup;
getMinSentenceWordsForNPlusOne?: () => number;
getYomitanGroupDebugEnabled?: () => boolean;
@@ -73,6 +76,7 @@ interface TokenizerAnnotationOptions {
nPlusOneEnabled: boolean;
jlptEnabled: boolean;
frequencyEnabled: boolean;
frequencyMatchMode: FrequencyDictionaryMatchMode;
minSentenceWordsForNPlusOne: number | undefined;
}
@@ -139,7 +143,6 @@ async function applyAnnotationStage(
isKnownWord: getKnownWordLookup(deps, options),
knownWordMatchMode: deps.getKnownWordMatchMode(),
getJlptLevel: deps.getJlptLevel,
getFrequencyRank: deps.getFrequencyRank,
},
options,
);
@@ -164,6 +167,8 @@ export function createTokenizerDepsRuntime(
getNPlusOneEnabled: options.getNPlusOneEnabled,
getJlptEnabled: options.getJlptEnabled,
getFrequencyDictionaryEnabled: options.getFrequencyDictionaryEnabled,
getFrequencyDictionaryMatchMode:
options.getFrequencyDictionaryMatchMode ?? (() => 'headword'),
getFrequencyRank: options.getFrequencyRank,
getMinSentenceWordsForNPlusOne: options.getMinSentenceWordsForNPlusOne ?? (() => 3),
getYomitanGroupDebugEnabled: options.getYomitanGroupDebugEnabled ?? (() => false),
@@ -224,7 +229,24 @@ function normalizePositiveFrequencyRank(value: unknown): number | null {
return Math.max(1, Math.floor(value));
}
function resolveFrequencyLookupText(token: MergedToken): string {
/**
 * Canonicalizes text used as a local frequency lookup key by stripping
 * surrounding whitespace and lower-casing the remainder.
 */
function normalizeFrequencyLookupText(rawText: string): string {
  const withoutPadding = rawText.trim();
  return withoutPadding.toLowerCase();
}
function resolveFrequencyLookupText(
token: MergedToken,
matchMode: FrequencyDictionaryMatchMode,
): string {
if (matchMode === 'surface') {
if (token.surface && token.surface.length > 0) {
return token.surface;
}
if (token.headword && token.headword.length > 0) {
return token.headword;
}
return token.reading;
}
if (token.headword && token.headword.length > 0) {
return token.headword;
}
@@ -234,43 +256,128 @@ function resolveFrequencyLookupText(token: MergedToken): string {
return token.surface;
}
function applyYomitanFrequencyRanks(
function buildYomitanFrequencyTermReadingList(
tokens: MergedToken[],
frequencies: ReadonlyArray<{ term: string; frequency: number }>,
): MergedToken[] {
if (tokens.length === 0 || frequencies.length === 0) {
return tokens;
}
matchMode: FrequencyDictionaryMatchMode,
): Array<{ term: string; reading: string | null }> {
return tokens
.map((token) => {
const term = resolveFrequencyLookupText(token, matchMode).trim();
if (!term) {
return null;
}
const readingRaw =
token.reading && token.reading.trim().length > 0 ? token.reading.trim() : null;
const reading = matchMode === 'headword' ? null : readingRaw;
return { term, reading };
})
.filter((pair): pair is { term: string; reading: string | null } => pair !== null);
}
const rankByTerm = new Map<string, number>();
function buildYomitanFrequencyRankMap(
frequencies: ReadonlyArray<{ term: string; frequency: number; dictionaryPriority?: number }>,
): Map<string, number> {
const rankByTerm = new Map<string, { rank: number; dictionaryPriority: number }>();
for (const frequency of frequencies) {
const normalizedTerm = frequency.term.trim();
const rank = normalizePositiveFrequencyRank(frequency.frequency);
if (!normalizedTerm || rank === null) {
continue;
}
const dictionaryPriority =
typeof frequency.dictionaryPriority === 'number' && Number.isFinite(frequency.dictionaryPriority)
? Math.max(0, Math.floor(frequency.dictionaryPriority))
: Number.MAX_SAFE_INTEGER;
const current = rankByTerm.get(normalizedTerm);
if (current === undefined || rank < current) {
rankByTerm.set(normalizedTerm, rank);
if (
current === undefined ||
dictionaryPriority < current.dictionaryPriority ||
(dictionaryPriority === current.dictionaryPriority && rank < current.rank)
) {
rankByTerm.set(normalizedTerm, { rank, dictionaryPriority });
}
}
if (rankByTerm.size === 0) {
const collapsedRankByTerm = new Map<string, number>();
for (const [term, entry] of rankByTerm.entries()) {
collapsedRankByTerm.set(term, entry.rank);
}
return collapsedRankByTerm;
}
/**
 * Resolves a rank through the supplied lookup function, memoizing results in
 * the caller-provided cache.
 *
 * The lookup text is normalized first and that normalized form is used both as
 * the cache key and as the argument passed to `getFrequencyRank`. Misses are
 * cached as `null`, so the lookup is not repeated for the same key. A lookup
 * that throws is treated as "no rank".
 *
 * @param lookupText Raw text to look up.
 * @param getFrequencyRank Lookup function to consult on a cache miss.
 * @param cache Cache of normalized text to rank (or `null`); mutated in place.
 * @returns The rank as returned by `normalizePositiveFrequencyRank`, or `null`
 *   when the normalized text is empty.
 */
function getLocalFrequencyRank(
  lookupText: string,
  getFrequencyRank: FrequencyDictionaryLookup,
  cache: Map<string, number | null>,
): number | null {
  const cacheKey = normalizeFrequencyLookupText(lookupText);
  if (!cacheKey) {
    return null;
  }

  if (cache.has(cacheKey)) {
    const cachedRank = cache.get(cacheKey);
    return cachedRank === undefined ? null : cachedRank;
  }

  const lookupRank = (): number | null => {
    try {
      return getFrequencyRank(cacheKey);
    } catch {
      // Best-effort: a failing lookup counts as a miss.
      return null;
    }
  };

  const resolvedRank = normalizePositiveFrequencyRank(lookupRank());
  cache.set(cacheKey, resolvedRank);
  return resolvedRank;
}
function applyFrequencyRanks(
tokens: MergedToken[],
matchMode: FrequencyDictionaryMatchMode,
yomitanRankByTerm: Map<string, number>,
getFrequencyRank: FrequencyDictionaryLookup | undefined,
): MergedToken[] {
if (tokens.length === 0) {
return tokens;
}
const localLookupCache = new Map<string, number | null>();
return tokens.map((token) => {
const lookupText = resolveFrequencyLookupText(token).trim();
const existingRank = normalizePositiveFrequencyRank(token.frequencyRank);
if (existingRank !== null) {
return {
...token,
frequencyRank: existingRank,
};
}
const lookupText = resolveFrequencyLookupText(token, matchMode).trim();
if (!lookupText) {
return token;
return {
...token,
frequencyRank: undefined,
};
}
const rank = rankByTerm.get(lookupText);
if (rank === undefined) {
return token;
const yomitanRank = yomitanRankByTerm.get(lookupText);
if (yomitanRank !== undefined) {
return {
...token,
frequencyRank: yomitanRank,
};
}
if (!getFrequencyRank) {
return {
...token,
frequencyRank: undefined,
};
}
const localRank = getLocalFrequencyRank(lookupText, getFrequencyRank, localLookupCache);
return {
...token,
frequencyRank: rank,
frequencyRank: localRank ?? undefined,
};
});
}
@@ -280,6 +387,7 @@ function getAnnotationOptions(deps: TokenizerServiceDeps): TokenizerAnnotationOp
nPlusOneEnabled: deps.getNPlusOneEnabled?.() !== false,
jlptEnabled: deps.getJlptEnabled?.() !== false,
frequencyEnabled: deps.getFrequencyDictionaryEnabled?.() !== false,
frequencyMatchMode: deps.getFrequencyDictionaryMatchMode?.() ?? 'headword',
minSentenceWordsForNPlusOne: deps.getMinSentenceWordsForNPlusOne?.(),
};
}
@@ -307,34 +415,44 @@ async function parseWithYomitanInternalParser(
logSelectedYomitanGroups(text, selectedTokens);
}
let tokensWithFrequency = selectedTokens;
let yomitanRankByTerm = new Map<string, number>();
if (options.frequencyEnabled) {
const termReadingList = selectedTokens.map((token) => ({
term: resolveFrequencyLookupText(token),
reading: token.reading && token.reading.trim().length > 0 ? token.reading.trim() : null,
}));
const yomitanFrequencies = await requestYomitanTermFrequencies(termReadingList, deps, logger);
tokensWithFrequency = applyYomitanFrequencyRanks(selectedTokens, yomitanFrequencies);
}
if (!needsMecabPosEnrichment(options)) {
return tokensWithFrequency;
}
try {
const mecabTokens = await deps.tokenizeWithMecab(text);
const enrichTokensWithMecab = deps.enrichTokensWithMecab ?? enrichTokensWithMecabAsync;
return await enrichTokensWithMecab(tokensWithFrequency, mecabTokens);
} catch (err) {
const error = err as Error;
logger.warn(
'Failed to enrich Yomitan tokens with MeCab POS:',
error.message,
`tokenCount=${selectedTokens.length}`,
`textLength=${text.length}`,
const frequencyMatchMode = options.frequencyMatchMode;
const termReadingList = buildYomitanFrequencyTermReadingList(
selectedTokens,
frequencyMatchMode,
);
return tokensWithFrequency;
const yomitanFrequencies = await requestYomitanTermFrequencies(termReadingList, deps, logger);
yomitanRankByTerm = buildYomitanFrequencyRankMap(yomitanFrequencies);
}
let enrichedTokens = selectedTokens;
if (needsMecabPosEnrichment(options)) {
try {
const mecabTokens = await deps.tokenizeWithMecab(text);
const enrichTokensWithMecab = deps.enrichTokensWithMecab ?? enrichTokensWithMecabAsync;
enrichedTokens = await enrichTokensWithMecab(enrichedTokens, mecabTokens);
} catch (err) {
const error = err as Error;
logger.warn(
'Failed to enrich Yomitan tokens with MeCab POS:',
error.message,
`tokenCount=${selectedTokens.length}`,
`textLength=${text.length}`,
);
}
}
if (options.frequencyEnabled) {
return applyFrequencyRanks(
enrichedTokens,
options.frequencyMatchMode,
yomitanRankByTerm,
deps.getFrequencyRank,
);
}
return enrichedTokens;
}
export async function tokenizeSubtitle(

View File

@@ -51,15 +51,15 @@ test('annotateTokens known-word match mode uses headword vs surface', () => {
});
test('annotateTokens excludes frequency for particle/bound_auxiliary and pos1 exclusions', () => {
const lookupCalls: string[] = [];
const tokens = [
makeToken({ surface: 'は', headword: 'は', partOfSpeech: PartOfSpeech.particle }),
makeToken({ surface: 'は', headword: 'は', partOfSpeech: PartOfSpeech.particle, frequencyRank: 3 }),
makeToken({
surface: 'です',
headword: 'です',
partOfSpeech: PartOfSpeech.bound_auxiliary,
startPos: 1,
endPos: 3,
frequencyRank: 4,
}),
makeToken({
surface: 'の',
@@ -68,6 +68,7 @@ test('annotateTokens excludes frequency for particle/bound_auxiliary and pos1 ex
pos1: '助詞',
startPos: 3,
endPos: 4,
frequencyRank: 5,
}),
makeToken({
surface: '猫',
@@ -75,45 +76,36 @@ test('annotateTokens excludes frequency for particle/bound_auxiliary and pos1 ex
partOfSpeech: PartOfSpeech.noun,
startPos: 4,
endPos: 5,
frequencyRank: 11,
}),
];
const result = annotateTokens(
tokens,
makeDeps({
getFrequencyRank: (text) => {
lookupCalls.push(text);
return text === '猫' ? 11 : 999;
},
}),
);
const result = annotateTokens(tokens, makeDeps());
assert.equal(result[0]?.frequencyRank, undefined);
assert.equal(result[1]?.frequencyRank, undefined);
assert.equal(result[2]?.frequencyRank, undefined);
assert.equal(result[3]?.frequencyRank, 11);
assert.deepEqual(lookupCalls, ['猫']);
});
test('annotateTokens preserves existing frequency rank when lookup is unavailable', () => {
test('annotateTokens preserves existing frequency rank when frequency is enabled', () => {
const tokens = [makeToken({ surface: '猫', headword: '猫', frequencyRank: 42 })];
const result = annotateTokens(tokens, makeDeps({ getFrequencyRank: undefined }));
const result = annotateTokens(tokens, makeDeps());
assert.equal(result[0]?.frequencyRank, 42);
});
test('annotateTokens prefers existing frequency rank over fallback lookup', () => {
test('annotateTokens drops invalid frequency rank values', () => {
const tokens = [makeToken({ surface: '猫', headword: '猫', frequencyRank: Number.NaN })];
const result = annotateTokens(tokens, makeDeps());
assert.equal(result[0]?.frequencyRank, undefined);
});
test('annotateTokens clears frequency rank when frequency is disabled', () => {
const tokens = [makeToken({ surface: '猫', headword: '猫', frequencyRank: 42 })];
const result = annotateTokens(
tokens,
makeDeps({
getFrequencyRank: () => 9,
}),
);
assert.equal(result[0]?.frequencyRank, 42);
const result = annotateTokens(tokens, makeDeps(), { frequencyEnabled: false });
assert.equal(result[0]?.frequencyRank, undefined);
});
test('annotateTokens handles JLPT disabled and eligibility exclusion paths', () => {

View File

@@ -1,6 +1,5 @@
import { markNPlusOneTargets } from '../../../token-merger';
import {
FrequencyDictionaryLookup,
JlptLevel,
MergedToken,
NPlusOneMatchMode,
@@ -12,22 +11,16 @@ const KATAKANA_TO_HIRAGANA_OFFSET = 0x60;
const KATAKANA_CODEPOINT_START = 0x30a1;
const KATAKANA_CODEPOINT_END = 0x30f6;
const JLPT_LEVEL_LOOKUP_CACHE_LIMIT = 2048;
const FREQUENCY_RANK_LOOKUP_CACHE_LIMIT = 2048;
const jlptLevelLookupCaches = new WeakMap<
(text: string) => JlptLevel | null,
Map<string, JlptLevel | null>
>();
const frequencyRankLookupCaches = new WeakMap<
FrequencyDictionaryLookup,
Map<string, number | null>
>();
export interface AnnotationStageDeps {
isKnownWord: (text: string) => boolean;
knownWordMatchMode: NPlusOneMatchMode;
getJlptLevel: (text: string) => JlptLevel | null;
getFrequencyRank?: FrequencyDictionaryLookup;
}
export interface AnnotationStageOptions {
@@ -60,67 +53,6 @@ function applyKnownWordMarking(
});
}
function normalizeFrequencyLookupText(rawText: string): string {
return rawText.trim().toLowerCase();
}
function getCachedFrequencyRank(
lookupText: string,
getFrequencyRank: FrequencyDictionaryLookup,
): number | null {
const normalizedText = normalizeFrequencyLookupText(lookupText);
if (!normalizedText) {
return null;
}
let cache = frequencyRankLookupCaches.get(getFrequencyRank);
if (!cache) {
cache = new Map<string, number | null>();
frequencyRankLookupCaches.set(getFrequencyRank, cache);
}
if (cache.has(normalizedText)) {
return cache.get(normalizedText) ?? null;
}
let rank: number | null;
try {
rank = getFrequencyRank(normalizedText);
} catch {
rank = null;
}
if (rank !== null) {
if (!Number.isFinite(rank) || rank <= 0) {
rank = null;
}
}
cache.set(normalizedText, rank);
while (cache.size > FREQUENCY_RANK_LOOKUP_CACHE_LIMIT) {
const firstKey = cache.keys().next().value;
if (firstKey !== undefined) {
cache.delete(firstKey);
}
}
return rank;
}
function resolveFrequencyLookupText(token: MergedToken): string {
if (token.headword && token.headword.length > 0) {
return token.headword;
}
if (token.reading && token.reading.length > 0) {
return token.reading;
}
return token.surface;
}
function getFrequencyLookupTextCandidates(token: MergedToken): string[] {
const lookupText = resolveFrequencyLookupText(token).trim();
return lookupText ? [lookupText] : [];
}
function isFrequencyExcludedByPos(token: MergedToken): boolean {
if (
token.partOfSpeech === PartOfSpeech.particle ||
@@ -134,7 +66,6 @@ function isFrequencyExcludedByPos(token: MergedToken): boolean {
function applyFrequencyMarking(
tokens: MergedToken[],
getFrequencyRank: FrequencyDictionaryLookup,
): MergedToken[] {
return tokens.map((token) => {
if (isFrequencyExcludedByPos(token)) {
@@ -146,25 +77,9 @@ function applyFrequencyMarking(
return { ...token, frequencyRank: rank };
}
const lookupTexts = getFrequencyLookupTextCandidates(token);
if (lookupTexts.length === 0) {
return { ...token, frequencyRank: undefined };
}
let bestRank: number | null = null;
for (const lookupText of lookupTexts) {
const rank = getCachedFrequencyRank(lookupText, getFrequencyRank);
if (rank === null) {
continue;
}
if (bestRank === null || rank < bestRank) {
bestRank = rank;
}
}
return {
...token,
frequencyRank: bestRank ?? undefined,
frequencyRank: undefined,
};
});
}
@@ -357,16 +272,8 @@ export function annotateTokens(
const frequencyEnabled = options.frequencyEnabled !== false;
const frequencyMarkedTokens =
frequencyEnabled && deps.getFrequencyRank
? applyFrequencyMarking(knownMarkedTokens, deps.getFrequencyRank)
: frequencyEnabled
? knownMarkedTokens.map((token) => ({
...token,
frequencyRank:
typeof token.frequencyRank === 'number' && Number.isFinite(token.frequencyRank)
? Math.max(1, Math.floor(token.frequencyRank))
: undefined,
}))
frequencyEnabled
? applyFrequencyMarking(knownMarkedTokens)
: knownMarkedTokens.map((token) => ({
...token,
frequencyRank: undefined,

View File

@@ -94,10 +94,20 @@ test('requestYomitanTermFrequencies returns normalized frequency entries', async
term: '猫',
reading: 'ねこ',
dictionary: 'freq-dict',
dictionaryPriority: 0,
frequency: 77,
displayValue: '77',
displayValueParsed: true,
},
{
term: '鍛える',
reading: 'きたえる',
dictionary: 'freq-dict',
dictionaryPriority: 1,
frequency: 46961,
displayValue: '2847,46961',
displayValueParsed: true,
},
{
term: 'invalid',
dictionary: 'freq-dict',
@@ -110,9 +120,12 @@ test('requestYomitanTermFrequencies returns normalized frequency entries', async
error: () => undefined,
});
assert.equal(result.length, 1);
assert.equal(result.length, 2);
assert.equal(result[0]?.term, '猫');
assert.equal(result[0]?.frequency, 77);
assert.equal(result[0]?.dictionaryPriority, 0);
assert.equal(result[1]?.term, '鍛える');
assert.equal(result[1]?.frequency, 2847);
assert.match(scriptValue, /getTermFrequencies/);
assert.match(scriptValue, /optionsGetFull/);
});

View File

@@ -19,6 +19,7 @@ export interface YomitanTermFrequency {
term: string;
reading: string | null;
dictionary: string;
dictionaryPriority: number;
frequency: number;
displayValue: string | null;
displayValueParsed: boolean;
@@ -40,6 +41,32 @@ function asPositiveInteger(value: unknown): number | null {
return Math.max(1, Math.floor(value));
}
/**
 * Parses the leading positive integer out of a frequency display string.
 *
 * Two comma conventions are supported:
 * - thousands separators ("1,234" -> 1234): the leading group has 1-3 digits
 *   and every following group has exactly 3 digits;
 * - composite values ("2847,46961" -> 2847): anything else that contains a
 *   comma; only the first component (the primary rank) is used.
 *
 * The leading group is validated as well, so a composite value whose second
 * component happens to be three digits long ("2847,469") is not mistaken for
 * the single number 2847469.
 *
 * @param value Raw display text; surrounding whitespace and any non-numeric
 *   suffix are ignored.
 * @returns The parsed integer, or `null` when the text does not start with a
 *   digit or the parsed value is not a positive finite number.
 */
function parsePositiveFrequencyString(value: string): number | null {
  // Only the leading run of digits and commas is considered.
  const numericPrefix = value.trim().match(/^\d[\d,]*/)?.[0];
  if (!numericPrefix) {
    return null;
  }

  const [leadingChunk = '', ...trailingChunks] = numericPrefix.split(',');
  const isThousandsGrouped =
    trailingChunks.length > 0 &&
    /^\d{1,3}$/.test(leadingChunk) &&
    trailingChunks.every((chunk) => /^\d{3}$/.test(chunk));
  const digits = isThousandsGrouped ? leadingChunk + trailingChunks.join('') : leadingChunk;

  const parsed = Number.parseInt(digits, 10);
  if (!Number.isFinite(parsed) || parsed <= 0) {
    return null;
  }
  return parsed;
}
function toYomitanTermFrequency(value: unknown): YomitanTermFrequency | null {
if (!isObject(value)) {
return null;
@@ -47,10 +74,24 @@ function toYomitanTermFrequency(value: unknown): YomitanTermFrequency | null {
const term = typeof value.term === 'string' ? value.term.trim() : '';
const dictionary = typeof value.dictionary === 'string' ? value.dictionary.trim() : '';
const frequency = asPositiveInteger(value.frequency);
const rawFrequency = asPositiveInteger(value.frequency);
const displayValueRaw =
value.displayValue === null
? null
: typeof value.displayValue === 'string'
? value.displayValue
: null;
const parsedDisplayFrequency =
displayValueRaw !== null ? parsePositiveFrequencyString(displayValueRaw) : null;
const frequency = parsedDisplayFrequency ?? rawFrequency;
if (!term || !dictionary || frequency === null) {
return null;
}
const dictionaryPriorityRaw = (value as { dictionaryPriority?: unknown }).dictionaryPriority;
const dictionaryPriority =
typeof dictionaryPriorityRaw === 'number' && Number.isFinite(dictionaryPriorityRaw)
? Math.max(0, Math.floor(dictionaryPriorityRaw))
: Number.MAX_SAFE_INTEGER;
const reading =
value.reading === null
@@ -58,18 +99,14 @@ function toYomitanTermFrequency(value: unknown): YomitanTermFrequency | null {
: typeof value.reading === 'string'
? value.reading
: null;
const displayValue =
value.displayValue === null
? null
: typeof value.displayValue === 'string'
? value.displayValue
: null;
const displayValue = displayValueRaw;
const displayValueParsed = value.displayValueParsed === true;
return {
term,
reading,
dictionary,
dictionaryPriority,
frequency,
displayValue,
displayValueParsed,
@@ -278,20 +315,43 @@ export async function requestYomitanTermFrequencies(
const optionsFull = await invoke("optionsGetFull", undefined);
const profileIndex = optionsFull.profileCurrent;
const dictionariesRaw = optionsFull.profiles?.[profileIndex]?.options?.dictionaries ?? [];
const dictionaries = Array.isArray(dictionariesRaw)
const dictionaryEntries = Array.isArray(dictionariesRaw)
? dictionariesRaw
.filter((entry) => entry && typeof entry === "object" && entry.enabled === true && typeof entry.name === "string")
.map((entry) => entry.name)
.map((entry, index) => ({
name: entry.name,
id: typeof entry.id === "number" && Number.isFinite(entry.id) ? Math.floor(entry.id) : index
}))
.sort((a, b) => a.id - b.id)
: [];
const dictionaries = dictionaryEntries.map((entry) => entry.name);
const dictionaryPriorityByName = dictionaryEntries.reduce((acc, entry, index) => {
acc[entry.name] = index;
return acc;
}, {});
if (dictionaries.length === 0) {
return [];
}
return await invoke("getTermFrequencies", {
const rawFrequencies = await invoke("getTermFrequencies", {
termReadingList: ${JSON.stringify(normalizedTermReadingList)},
dictionaries
});
if (!Array.isArray(rawFrequencies)) {
return [];
}
return rawFrequencies
.filter((entry) => entry && typeof entry === "object")
.map((entry) => ({
...entry,
dictionaryPriority:
typeof entry.dictionary === "string" && dictionaryPriorityByName[entry.dictionary] !== undefined
? dictionaryPriorityByName[entry.dictionary]
: Number.MAX_SAFE_INTEGER
}));
})();
`;

View File

@@ -2303,6 +2303,8 @@ const {
getJlptEnabled: () => getResolvedConfig().subtitleStyle.enableJlpt,
getFrequencyDictionaryEnabled: () =>
getResolvedConfig().subtitleStyle.frequencyDictionary.enabled,
getFrequencyDictionaryMatchMode: () =>
getResolvedConfig().subtitleStyle.frequencyDictionary.matchMode,
getFrequencyRank: (text) => appState.frequencyRankLookup(text),
getYomitanGroupDebugEnabled: () => appState.overlayDebugVisualizationEnabled,
getMecabTokenizer: () => appState.mecabTokenizer,

View File

@@ -128,6 +128,7 @@ test('composeMpvRuntimeHandlers returns callable handlers and forwards to inject
getJlptLevel: () => null,
getJlptEnabled: () => true,
getFrequencyDictionaryEnabled: () => true,
getFrequencyDictionaryMatchMode: () => 'headword',
getFrequencyRank: () => null,
getYomitanGroupDebugEnabled: () => false,
getMecabTokenizer: () => null,

View File

@@ -35,6 +35,7 @@ test('tokenizer deps builder records known-word lookups and maps readers', () =>
getJlptLevel: () => 'N2',
getJlptEnabled: () => true,
getFrequencyDictionaryEnabled: () => true,
getFrequencyDictionaryMatchMode: () => 'surface',
getFrequencyRank: () => 5,
getYomitanGroupDebugEnabled: () => false,
getMecabTokenizer: () => null,
@@ -47,6 +48,7 @@ test('tokenizer deps builder records known-word lookups and maps readers', () =>
deps.setYomitanParserInitPromise(null);
assert.equal(deps.getNPlusOneEnabled?.(), true);
assert.equal(deps.getMinSentenceWordsForNPlusOne?.(), 3);
assert.equal(deps.getFrequencyDictionaryMatchMode?.(), 'surface');
assert.deepEqual(calls, ['lookup:true', 'lookup:false', 'set-window', 'set-ready', 'set-init']);
});

View File

@@ -5,6 +5,9 @@ type TokenizerMainDeps = TokenizerDepsRuntimeOptions & {
getFrequencyDictionaryEnabled: NonNullable<
TokenizerDepsRuntimeOptions['getFrequencyDictionaryEnabled']
>;
getFrequencyDictionaryMatchMode: NonNullable<
TokenizerDepsRuntimeOptions['getFrequencyDictionaryMatchMode']
>;
getFrequencyRank: NonNullable<TokenizerDepsRuntimeOptions['getFrequencyRank']>;
getMinSentenceWordsForNPlusOne: NonNullable<
TokenizerDepsRuntimeOptions['getMinSentenceWordsForNPlusOne']
@@ -41,6 +44,7 @@ export function createBuildTokenizerDepsMainHandler(deps: TokenizerMainDeps) {
getJlptLevel: (text: string) => deps.getJlptLevel(text),
getJlptEnabled: () => deps.getJlptEnabled(),
getFrequencyDictionaryEnabled: () => deps.getFrequencyDictionaryEnabled(),
getFrequencyDictionaryMatchMode: () => deps.getFrequencyDictionaryMatchMode(),
getFrequencyRank: (text: string) => deps.getFrequencyRank(text),
getYomitanGroupDebugEnabled: () => deps.getYomitanGroupDebugEnabled(),
getMecabTokenizer: () => deps.getMecabTokenizer(),

View File

@@ -79,7 +79,7 @@ test('computeWordClass preserves known and n+1 classes while adding JLPT classes
assert.equal(computeWordClass(nPlusOneJlpt), 'word word-n-plus-one word-jlpt-n2');
});
test('computeWordClass does not add frequency class to known or N+1 terms', () => {
test('computeWordClass keeps known/N+1 color classes exclusive over frequency classes', () => {
const known = createToken({
isKnown: true,
frequencyRank: 10,
@@ -231,7 +231,7 @@ test('getFrequencyRankLabelForToken returns rank only for frequency-colored toke
const outOfRangeToken = createToken({ surface: '圏外', frequencyRank: 1000 });
assert.equal(getFrequencyRankLabelForToken(frequencyToken, settings), '20');
assert.equal(getFrequencyRankLabelForToken(knownToken, settings), null);
assert.equal(getFrequencyRankLabelForToken(knownToken, settings), '20');
assert.equal(getFrequencyRankLabelForToken(outOfRangeToken, settings), null);
});

View File

@@ -184,7 +184,7 @@ export function getFrequencyRankLabelForToken(
token: MergedToken,
frequencySettings?: Partial<FrequencyRenderSettings>,
): string | null {
if (token.isKnown || token.isNPlusOneTarget) {
if (token.isNPlusOneTarget) {
return null;
}

View File

@@ -177,6 +177,7 @@ export type RuntimeOptionValueType = 'boolean' | 'enum';
export type RuntimeOptionValue = boolean | string;
export type NPlusOneMatchMode = 'headword' | 'surface';
/**
 * Which form of a token is used for frequency dictionary lookup:
 * `headword` looks up the dictionary form, `surface` looks up the token text
 * as it appears in the subtitle.
 */
export type FrequencyDictionaryMatchMode = 'headword' | 'surface';
export interface RuntimeOptionState {
id: RuntimeOptionId;
@@ -312,6 +313,7 @@ export interface SubtitleStyleConfig {
sourcePath?: string;
topX?: number;
mode?: FrequencyDictionaryMode;
matchMode?: FrequencyDictionaryMatchMode;
singleColor?: string;
bandedColors?: [string, string, string, string, string];
};
@@ -536,6 +538,7 @@ export interface ResolvedConfig {
sourcePath: string;
topX: number;
mode: FrequencyDictionaryMode;
matchMode: FrequencyDictionaryMatchMode;
singleColor: string;
bandedColors: [string, string, string, string, string];
};