fix(subtitle): stabilize frequency highlighting with yomitan ranks

This commit is contained in:
2026-02-28 16:44:28 -08:00
parent 9c2618c4c7
commit d2af09d941
22 changed files with 536 additions and 189 deletions

View File

@@ -136,6 +136,7 @@
"sourcePath": "", // Optional absolute path to a frequency dictionary directory. If empty, SubMiner searches installed/default frequency-dictionary locations. "sourcePath": "", // Optional absolute path to a frequency dictionary directory. If empty, SubMiner searches installed/default frequency-dictionary locations.
"topX": 1000, // Only color tokens with frequency rank <= topX (default: 1000). "topX": 1000, // Only color tokens with frequency rank <= topX (default: 1000).
"mode": "single", // single: use one color for all matching tokens. banded: use color ramp by frequency band. Values: single | banded "mode": "single", // single: use one color for all matching tokens. banded: use color ramp by frequency band. Values: single | banded
"matchMode": "headword", // Frequency lookup text selection mode. Values: headword | surface
"singleColor": "#f5a97f", // Color used when frequencyDictionary.mode is `single`. "singleColor": "#f5a97f", // Color used when frequencyDictionary.mode is `single`.
"bandedColors": [ "bandedColors": [
"#ed8796", "#ed8796",

View File

@@ -601,6 +601,8 @@ See `config.example.jsonc` for detailed configuration options and more examples.
**Supported commands:** Any valid mpv JSON IPC command array (`["cycle", "pause"]`, `["seek", 5]`, `["script-binding", "..."]`, etc.) **Supported commands:** Any valid mpv JSON IPC command array (`["cycle", "pause"]`, `["seek", 5]`, `["script-binding", "..."]`, etc.)
For subtitle-position and subtitle-track proxy commands (`sub-pos`, `sid`, `secondary-sid`), SubMiner also shows an mpv OSD notification after the command runs.
**See `config.example.jsonc`** for more keybinding examples and configuration options. **See `config.example.jsonc`** for more keybinding examples and configuration options.
### Runtime Option Palette ### Runtime Option Palette
@@ -760,6 +762,7 @@ See `config.example.jsonc` for detailed configuration options.
| `frequencyDictionary.sourcePath` | string | Path to a local frequency dictionary root. Leave empty or omit to use installed/default frequency-dictionary search paths. | | `frequencyDictionary.sourcePath` | string | Path to a local frequency dictionary root. Leave empty or omit to use installed/default frequency-dictionary search paths. |
| `frequencyDictionary.topX` | number | Only color tokens whose frequency rank is `<= topX` (`1000` by default) | | `frequencyDictionary.topX` | number | Only color tokens whose frequency rank is `<= topX` (`1000` by default) |
| `frequencyDictionary.mode` | string | `"single"` or `"banded"` (`"single"` by default) | | `frequencyDictionary.mode` | string | `"single"` or `"banded"` (`"single"` by default) |
| `frequencyDictionary.matchMode` | string | `"headword"` or `"surface"` (`"headword"` by default) |
| `frequencyDictionary.singleColor` | string | Color used for all highlighted tokens in single mode | | `frequencyDictionary.singleColor` | string | Color used for all highlighted tokens in single mode |
| `frequencyDictionary.bandedColors` | string[] | Array of five hex colors used for ranked bands in banded mode | | `frequencyDictionary.bandedColors` | string[] | Array of five hex colors used for ranked bands in banded mode |
| `nPlusOneColor` | string | Existing n+1 highlight color (default: `#c6a0f6`) | | `nPlusOneColor` | string | Existing n+1 highlight color (default: `#c6a0f6`) |
@@ -776,6 +779,7 @@ Lookup behavior:
- Set `frequencyDictionary.sourcePath` to a directory containing `term_meta_bank_*.json` for a fully custom source. - Set `frequencyDictionary.sourcePath` to a directory containing `term_meta_bank_*.json` for a fully custom source.
- If `sourcePath` is missing or empty, SubMiner searches default install/runtime locations for `frequency-dictionary` directories (for example app resources, user data paths, and current working directory). - If `sourcePath` is missing or empty, SubMiner searches default install/runtime locations for `frequency-dictionary` directories (for example app resources, user data paths, and current working directory).
- In both cases, only terms with a valid `frequencyRank` are used; everything else falls back to no highlighting. - In both cases, only terms with a valid `frequencyRank` are used; everything else falls back to no highlighting.
- `frequencyDictionary.matchMode` controls which token text is used for frequency lookups: `headword` (dictionary form) or `surface` (visible subtitle text).
In `single` mode all highlights use `singleColor`; in `banded` mode tokens map to five ascending color bands from most common to least common inside the topX window. In `single` mode all highlights use `singleColor`; in `banded` mode tokens map to five ascending color bands from most common to least common inside the topX window.

View File

@@ -136,6 +136,7 @@
"sourcePath": "", // Optional absolute path to a frequency dictionary directory. If empty, SubMiner searches installed/default frequency-dictionary locations. "sourcePath": "", // Optional absolute path to a frequency dictionary directory. If empty, SubMiner searches installed/default frequency-dictionary locations.
"topX": 1000, // Only color tokens with frequency rank <= topX (default: 1000). "topX": 1000, // Only color tokens with frequency rank <= topX (default: 1000).
"mode": "single", // single: use one color for all matching tokens. banded: use color ramp by frequency band. Values: single | banded "mode": "single", // single: use one color for all matching tokens. banded: use color ramp by frequency band. Values: single | banded
"matchMode": "headword", // Frequency lookup text selection mode. Values: headword | surface
"singleColor": "#f5a97f", // Color used when frequencyDictionary.mode is `single`. "singleColor": "#f5a97f", // Color used when frequencyDictionary.mode is `single`.
"bandedColors": [ "bandedColors": [
"#ed8796", "#ed8796",

View File

@@ -33,6 +33,7 @@ export const SUBTITLE_DEFAULT_CONFIG: Pick<ResolvedConfig, 'subtitleStyle'> = {
sourcePath: '', sourcePath: '',
topX: 1000, topX: 1000,
mode: 'single', mode: 'single',
matchMode: 'headword',
singleColor: '#f5a97f', singleColor: '#f5a97f',
bandedColors: ['#ed8796', '#f5a97f', '#f9e2af', '#a6e3a1', '#8aadf4'], bandedColors: ['#ed8796', '#f5a97f', '#f9e2af', '#a6e3a1', '#8aadf4'],
}, },

View File

@@ -61,6 +61,14 @@ export function buildSubtitleConfigOptionRegistry(
description: description:
'single: use one color for all matching tokens. banded: use color ramp by frequency band.', 'single: use one color for all matching tokens. banded: use color ramp by frequency band.',
}, },
{
path: 'subtitleStyle.frequencyDictionary.matchMode',
kind: 'enum',
enumValues: ['headword', 'surface'],
defaultValue: defaultConfig.subtitleStyle.frequencyDictionary.matchMode,
description:
'headword: frequency lookup uses dictionary form. surface: lookup uses subtitle-visible token text.',
},
{ {
path: 'subtitleStyle.frequencyDictionary.singleColor', path: 'subtitleStyle.frequencyDictionary.singleColor',
kind: 'string', kind: 'string',

View File

@@ -102,9 +102,18 @@ export function applySubtitleDomainConfig(context: ResolveContext): void {
const fallbackSubtitleStyleHoverTokenColor = resolved.subtitleStyle.hoverTokenColor; const fallbackSubtitleStyleHoverTokenColor = resolved.subtitleStyle.hoverTokenColor;
const fallbackSubtitleStyleHoverTokenBackgroundColor = const fallbackSubtitleStyleHoverTokenBackgroundColor =
resolved.subtitleStyle.hoverTokenBackgroundColor; resolved.subtitleStyle.hoverTokenBackgroundColor;
const fallbackFrequencyDictionary = {
...resolved.subtitleStyle.frequencyDictionary,
};
resolved.subtitleStyle = { resolved.subtitleStyle = {
...resolved.subtitleStyle, ...resolved.subtitleStyle,
...(src.subtitleStyle as ResolvedConfig['subtitleStyle']), ...(src.subtitleStyle as ResolvedConfig['subtitleStyle']),
frequencyDictionary: {
...resolved.subtitleStyle.frequencyDictionary,
...(isObject((src.subtitleStyle as { frequencyDictionary?: unknown }).frequencyDictionary)
? ((src.subtitleStyle as { frequencyDictionary?: unknown }).frequencyDictionary as ResolvedConfig['subtitleStyle']['frequencyDictionary'])
: {}),
},
secondary: { secondary: {
...resolved.subtitleStyle.secondary, ...resolved.subtitleStyle.secondary,
...(isObject(src.subtitleStyle.secondary) ...(isObject(src.subtitleStyle.secondary)
@@ -186,6 +195,7 @@ export function applySubtitleDomainConfig(context: ResolveContext): void {
if (frequencyEnabled !== undefined) { if (frequencyEnabled !== undefined) {
resolved.subtitleStyle.frequencyDictionary.enabled = frequencyEnabled; resolved.subtitleStyle.frequencyDictionary.enabled = frequencyEnabled;
} else if ((frequencyDictionary as { enabled?: unknown }).enabled !== undefined) { } else if ((frequencyDictionary as { enabled?: unknown }).enabled !== undefined) {
resolved.subtitleStyle.frequencyDictionary.enabled = fallbackFrequencyDictionary.enabled;
warn( warn(
'subtitleStyle.frequencyDictionary.enabled', 'subtitleStyle.frequencyDictionary.enabled',
(frequencyDictionary as { enabled?: unknown }).enabled, (frequencyDictionary as { enabled?: unknown }).enabled,
@@ -198,6 +208,7 @@ export function applySubtitleDomainConfig(context: ResolveContext): void {
if (sourcePath !== undefined) { if (sourcePath !== undefined) {
resolved.subtitleStyle.frequencyDictionary.sourcePath = sourcePath; resolved.subtitleStyle.frequencyDictionary.sourcePath = sourcePath;
} else if ((frequencyDictionary as { sourcePath?: unknown }).sourcePath !== undefined) { } else if ((frequencyDictionary as { sourcePath?: unknown }).sourcePath !== undefined) {
resolved.subtitleStyle.frequencyDictionary.sourcePath = fallbackFrequencyDictionary.sourcePath;
warn( warn(
'subtitleStyle.frequencyDictionary.sourcePath', 'subtitleStyle.frequencyDictionary.sourcePath',
(frequencyDictionary as { sourcePath?: unknown }).sourcePath, (frequencyDictionary as { sourcePath?: unknown }).sourcePath,
@@ -210,6 +221,7 @@ export function applySubtitleDomainConfig(context: ResolveContext): void {
if (topX !== undefined && Number.isInteger(topX) && topX > 0) { if (topX !== undefined && Number.isInteger(topX) && topX > 0) {
resolved.subtitleStyle.frequencyDictionary.topX = Math.floor(topX); resolved.subtitleStyle.frequencyDictionary.topX = Math.floor(topX);
} else if ((frequencyDictionary as { topX?: unknown }).topX !== undefined) { } else if ((frequencyDictionary as { topX?: unknown }).topX !== undefined) {
resolved.subtitleStyle.frequencyDictionary.topX = fallbackFrequencyDictionary.topX;
warn( warn(
'subtitleStyle.frequencyDictionary.topX', 'subtitleStyle.frequencyDictionary.topX',
(frequencyDictionary as { topX?: unknown }).topX, (frequencyDictionary as { topX?: unknown }).topX,
@@ -222,6 +234,7 @@ export function applySubtitleDomainConfig(context: ResolveContext): void {
if (frequencyMode === 'single' || frequencyMode === 'banded') { if (frequencyMode === 'single' || frequencyMode === 'banded') {
resolved.subtitleStyle.frequencyDictionary.mode = frequencyMode; resolved.subtitleStyle.frequencyDictionary.mode = frequencyMode;
} else if (frequencyMode !== undefined) { } else if (frequencyMode !== undefined) {
resolved.subtitleStyle.frequencyDictionary.mode = fallbackFrequencyDictionary.mode;
warn( warn(
'subtitleStyle.frequencyDictionary.mode', 'subtitleStyle.frequencyDictionary.mode',
frequencyDictionary.mode, frequencyDictionary.mode,
@@ -230,10 +243,24 @@ export function applySubtitleDomainConfig(context: ResolveContext): void {
); );
} }
const frequencyMatchMode = (frequencyDictionary as { matchMode?: unknown }).matchMode;
if (frequencyMatchMode === 'headword' || frequencyMatchMode === 'surface') {
resolved.subtitleStyle.frequencyDictionary.matchMode = frequencyMatchMode;
} else if (frequencyMatchMode !== undefined) {
resolved.subtitleStyle.frequencyDictionary.matchMode = fallbackFrequencyDictionary.matchMode;
warn(
'subtitleStyle.frequencyDictionary.matchMode',
frequencyMatchMode,
resolved.subtitleStyle.frequencyDictionary.matchMode,
"Expected 'headword' or 'surface'.",
);
}
const singleColor = asColor((frequencyDictionary as { singleColor?: unknown }).singleColor); const singleColor = asColor((frequencyDictionary as { singleColor?: unknown }).singleColor);
if (singleColor !== undefined) { if (singleColor !== undefined) {
resolved.subtitleStyle.frequencyDictionary.singleColor = singleColor; resolved.subtitleStyle.frequencyDictionary.singleColor = singleColor;
} else if ((frequencyDictionary as { singleColor?: unknown }).singleColor !== undefined) { } else if ((frequencyDictionary as { singleColor?: unknown }).singleColor !== undefined) {
resolved.subtitleStyle.frequencyDictionary.singleColor = fallbackFrequencyDictionary.singleColor;
warn( warn(
'subtitleStyle.frequencyDictionary.singleColor', 'subtitleStyle.frequencyDictionary.singleColor',
(frequencyDictionary as { singleColor?: unknown }).singleColor, (frequencyDictionary as { singleColor?: unknown }).singleColor,
@@ -248,6 +275,8 @@ export function applySubtitleDomainConfig(context: ResolveContext): void {
if (bandedColors !== undefined) { if (bandedColors !== undefined) {
resolved.subtitleStyle.frequencyDictionary.bandedColors = bandedColors; resolved.subtitleStyle.frequencyDictionary.bandedColors = bandedColors;
} else if ((frequencyDictionary as { bandedColors?: unknown }).bandedColors !== undefined) { } else if ((frequencyDictionary as { bandedColors?: unknown }).bandedColors !== undefined) {
resolved.subtitleStyle.frequencyDictionary.bandedColors =
fallbackFrequencyDictionary.bandedColors;
warn( warn(
'subtitleStyle.frequencyDictionary.bandedColors', 'subtitleStyle.frequencyDictionary.bandedColors',
(frequencyDictionary as { bandedColors?: unknown }).bandedColors, (frequencyDictionary as { bandedColors?: unknown }).bandedColors,

View File

@@ -27,3 +27,32 @@ test('subtitleStyle preserveLineBreaks falls back while merge is preserved', ()
), ),
); );
}); });
test('subtitleStyle frequencyDictionary.matchMode accepts valid values and warns on invalid', () => {
  // A supported mode is copied into the resolved config as-is.
  const accepted = createResolveContext({
    subtitleStyle: { frequencyDictionary: { matchMode: 'surface' } },
  });
  applySubtitleDomainConfig(accepted.context);
  assert.equal(accepted.context.resolved.subtitleStyle.frequencyDictionary.matchMode, 'surface');

  // An unsupported mode resolves to 'headword' and records a warning for the offending path.
  const rejected = createResolveContext({
    subtitleStyle: {
      frequencyDictionary: { matchMode: 'reading' as unknown as 'headword' | 'surface' },
    },
  });
  applySubtitleDomainConfig(rejected.context);
  assert.equal(rejected.context.resolved.subtitleStyle.frequencyDictionary.matchMode, 'headword');

  const warnedAboutMatchMode = rejected.warnings.some(
    (entry) =>
      entry.path === 'subtitleStyle.frequencyDictionary.matchMode' &&
      entry.message === "Expected 'headword' or 'surface'.",
  );
  assert.ok(warnedAboutMatchMode);
});

View File

@@ -80,7 +80,7 @@ test('createFrequencyDictionaryLookup aggregates duplicate-term logs into a sing
); );
}); });
test('createFrequencyDictionaryLookup prefers frequency.value over displayValue', async () => { test('createFrequencyDictionaryLookup prefers frequency.displayValue over value when both exist', async () => {
const logs: string[] = []; const logs: string[] = [];
const tempDir = fs.mkdtempSync(path.join(os.tmpdir(), 'subminer-frequency-dict-')); const tempDir = fs.mkdtempSync(path.join(os.tmpdir(), 'subminer-frequency-dict-'));
const bankPath = path.join(tempDir, 'term_meta_bank_1.json'); const bankPath = path.join(tempDir, 'term_meta_bank_1.json');
@@ -88,6 +88,7 @@ test('createFrequencyDictionaryLookup prefers frequency.value over displayValue'
bankPath, bankPath,
JSON.stringify([ JSON.stringify([
['猫', 1, { frequency: { value: 1234, displayValue: 1200 } }], ['猫', 1, { frequency: { value: 1234, displayValue: 1200 } }],
['鍛える', 2, { frequency: { value: 46961, displayValue: 2847 } }],
['犬', 2, { frequency: { displayValue: 88 } }], ['犬', 2, { frequency: { displayValue: 88 } }],
]), ]),
); );
@@ -99,10 +100,31 @@ test('createFrequencyDictionaryLookup prefers frequency.value over displayValue'
}, },
}); });
assert.equal(lookup('猫'), 1234); assert.equal(lookup('猫'), 1200);
assert.equal(lookup('鍛える'), 2847);
assert.equal(lookup('犬'), 88); assert.equal(lookup('犬'), 88);
assert.equal( assert.equal(
logs.some((entry) => entry.includes('Frequency dictionary loaded from')), logs.some((entry) => entry.includes('Frequency dictionary loaded from')),
true, true,
); );
}); });
test('createFrequencyDictionaryLookup parses composite displayValue by primary rank', async () => {
  const tempDir = fs.mkdtempSync(path.join(os.tmpdir(), 'subminer-frequency-dict-'));
  try {
    // Composite displayValue strings: the first comma-separated number is the
    // rank this test expects the lookup to report.
    const bankPath = path.join(tempDir, 'term_meta_bank_1.json');
    fs.writeFileSync(
      bankPath,
      JSON.stringify([
        ['鍛える', 1, { frequency: { displayValue: '3272,52377' } }],
        ['高み', 2, { frequency: { displayValue: '9933,108961' } }],
      ]),
    );

    const lookup = await createFrequencyDictionaryLookup({
      searchPaths: [tempDir],
      log: () => undefined,
    });

    assert.equal(lookup('鍛える'), 3272);
    assert.equal(lookup('高み'), 9933);
  } finally {
    // Remove the fixture directory even when an assertion fails, so repeated
    // test runs do not accumulate directories under the OS temp dir.
    fs.rmSync(tempDir, { recursive: true, force: true });
  }
});

View File

@@ -18,6 +18,32 @@ function normalizeFrequencyTerm(value: string): string {
return value.trim().toLowerCase(); return value.trim().toLowerCase();
} }
/**
 * Parses the leading positive integer from a frequency string.
 *
 * Recognised shapes:
 * - plain digits: `"2847"` -> 2847
 * - thousands-grouped digits: `"1,234,567"` -> 1234567
 * - composite comma-separated values: `"3272,52377"` -> 3272 (first value wins)
 *
 * A comma run counts as thousands grouping only when the first chunk has 1-3
 * digits and every following chunk has exactly 3 digits. Anything else is
 * treated as a composite value, so `"2847,469"` yields 2847 rather than 2847469.
 *
 * @param value Raw string to parse; surrounding whitespace is ignored.
 * @returns The parsed positive integer, or `null` when the string has no
 *   positive integer prefix.
 */
function parsePositiveFrequencyString(value: string): number | null {
  const trimmed = value.trim();
  if (!trimmed) {
    return null;
  }

  // Trailing commas carry no digits; drop them so "1,234," still reads as grouped.
  const numericPrefix = trimmed.match(/^\d[\d,]*/)?.[0]?.replace(/,+$/, '');
  if (!numericPrefix) {
    return null;
  }

  const [head = '', ...rest] = numericPrefix.split(',');
  const isThousandsGrouped =
    rest.length > 0 && /^\d{1,3}$/.test(head) && rest.every((chunk) => /^\d{3}$/.test(chunk));
  const normalizedNumber = isThousandsGrouped ? [head, ...rest].join('') : head;

  const parsed = Number.parseInt(normalizedNumber, 10);
  if (!Number.isFinite(parsed) || parsed <= 0) {
    return null;
  }

  return parsed;
}
function parsePositiveFrequencyNumber(value: unknown): number | null { function parsePositiveFrequencyNumber(value: unknown): number | null {
if (typeof value === 'number') { if (typeof value === 'number') {
if (!Number.isFinite(value) || value <= 0) return null; if (!Number.isFinite(value) || value <= 0) return null;
@@ -25,10 +51,7 @@ function parsePositiveFrequencyNumber(value: unknown): number | null {
} }
if (typeof value === 'string') { if (typeof value === 'string') {
const normalized = value.trim().replace(/,/g, ''); return parsePositiveFrequencyString(value);
const parsed = Number.parseInt(normalized, 10);
if (!Number.isFinite(parsed) || parsed <= 0) return null;
return parsed;
} }
return null; return null;
@@ -38,14 +61,14 @@ function extractFrequencyDisplayValue(meta: unknown): number | null {
if (!meta || typeof meta !== 'object') return null; if (!meta || typeof meta !== 'object') return null;
const frequency = (meta as { frequency?: unknown }).frequency; const frequency = (meta as { frequency?: unknown }).frequency;
if (!frequency || typeof frequency !== 'object') return null; if (!frequency || typeof frequency !== 'object') return null;
const rawValue = (frequency as { value?: unknown }).value; const displayValue = (frequency as { displayValue?: unknown }).displayValue;
const parsedValue = parsePositiveFrequencyNumber(rawValue); const parsedDisplayValue = parsePositiveFrequencyNumber(displayValue);
if (parsedValue !== null) { if (parsedDisplayValue !== null) {
return parsedValue; return parsedDisplayValue;
} }
const displayValue = (frequency as { displayValue?: unknown }).displayValue; const rawValue = (frequency as { value?: unknown }).value;
return parsePositiveFrequencyNumber(displayValue); return parsePositiveFrequencyNumber(rawValue);
} }
function asFrequencyDictionaryEntry(entry: unknown): FrequencyDictionaryEntry | null { function asFrequencyDictionaryEntry(entry: unknown): FrequencyDictionaryEntry | null {

View File

@@ -218,6 +218,119 @@ test('tokenizeSubtitle loads frequency ranks from Yomitan installed dictionaries
assert.equal(result.tokens?.[0]?.frequencyRank, 77); assert.equal(result.tokens?.[0]?.frequencyRank, 77);
}); });
test('tokenizeSubtitle queries headword frequencies without forcing surface reading', async () => {
  // Frequency rows the fake parser window hands back for a matching request.
  const frequencyRows = () => [
    {
      term: '鍛える',
      reading: 'きたえる',
      dictionary: 'freq-dict',
      frequency: 46961,
      displayValue: '2847,46961',
      displayValueParsed: true,
    },
  ];

  // Parse result for the subtitle text: one token whose headword differs from its surface.
  const parseRows = () => [
    {
      source: 'scanning-parser',
      index: 0,
      content: [
        [
          {
            text: '鍛えた',
            reading: 'きた',
            headwords: [[{ term: '鍛える' }]],
          },
        ],
      ],
    },
  ];

  const executeJavaScript = async (script: string) => {
    if (!script.includes('getTermFrequencies')) {
      return parseRows();
    }
    // Only a headword request with a null reading gets a frequency answer.
    return script.includes('"term":"鍛える","reading":null') ? frequencyRows() : [];
  };

  const deps = makeDeps({
    getFrequencyDictionaryEnabled: () => true,
    getYomitanExt: () => ({ id: 'dummy-ext' }) as any,
    getYomitanParserWindow: () =>
      ({
        isDestroyed: () => false,
        webContents: { executeJavaScript },
      }) as unknown as Electron.BrowserWindow,
  });

  const result = await tokenizeSubtitle('鍛えた', deps);
  const firstToken = result.tokens?.[0];

  assert.equal(result.tokens?.length, 1);
  assert.equal(firstToken?.headword, '鍛える');
  assert.equal(firstToken?.reading, 'きた');
  assert.equal(firstToken?.frequencyRank, 2847);
});
test('tokenizeSubtitle prefers Yomitan frequency from highest-priority dictionary', async () => {
  // Two dictionaries answer for the same term. The entry with dictionaryPriority 0
  // is the one whose frequency the assertion below expects.
  const frequencyRows = () => [
    {
      term: '猫',
      reading: 'ねこ',
      dictionary: 'low-priority',
      dictionaryPriority: 2,
      frequency: 5,
      displayValue: '5',
      displayValueParsed: true,
    },
    {
      term: '猫',
      reading: 'ねこ',
      dictionary: 'high-priority',
      dictionaryPriority: 0,
      frequency: 100,
      displayValue: '100',
      displayValueParsed: true,
    },
  ];

  // Parse result for the subtitle text: a single token.
  const parseRows = () => [
    {
      source: 'scanning-parser',
      index: 0,
      content: [
        [
          {
            text: '猫',
            reading: 'ねこ',
            headwords: [[{ term: '猫' }]],
          },
        ],
      ],
    },
  ];

  const executeJavaScript = async (script: string) => {
    if (script.includes('getTermFrequencies')) {
      return frequencyRows();
    }
    return parseRows();
  };

  const deps = makeDeps({
    getFrequencyDictionaryEnabled: () => true,
    getYomitanExt: () => ({ id: 'dummy-ext' }) as any,
    getYomitanParserWindow: () =>
      ({
        isDestroyed: () => false,
        webContents: { executeJavaScript },
      }) as unknown as Electron.BrowserWindow,
  });

  const result = await tokenizeSubtitle('猫', deps);

  assert.equal(result.tokens?.length, 1);
  assert.equal(result.tokens?.[0]?.frequencyRank, 100);
});
test('tokenizeSubtitle uses only selected Yomitan headword for frequency lookup', async () => { test('tokenizeSubtitle uses only selected Yomitan headword for frequency lookup', async () => {
const result = await tokenizeSubtitle( const result = await tokenizeSubtitle(
'猫です', '猫です',
@@ -1693,6 +1806,20 @@ test('tokenizeSubtitle checks known words by surface when configured', async ()
assert.equal(result.tokens?.[0]?.isKnown, true); assert.equal(result.tokens?.[0]?.isKnown, true);
}); });
test('tokenizeSubtitle uses frequency surface match mode when configured', async () => {
  // The local lookup answers only for the surface form, so a rank on the
  // token shows the surface text was the lookup key.
  const deps = makeDepsFromYomitanTokens(
    [{ surface: '鍛えた', reading: 'きたえた', headword: '鍛える' }],
    {
      getFrequencyDictionaryEnabled: () => true,
      getFrequencyDictionaryMatchMode: () => 'surface',
      getFrequencyRank: (text) => (text === '鍛えた' ? 2847 : null),
    },
  );

  const result = await tokenizeSubtitle('鍛えた', deps);
  const firstToken = result.tokens?.[0];

  assert.equal(result.text, '鍛えた');
  assert.equal(firstToken?.frequencyRank, 2847);
});
test('createTokenizerDepsRuntime checks MeCab availability before first tokenizeWithMecab call', async () => { test('createTokenizerDepsRuntime checks MeCab availability before first tokenizeWithMecab call', async () => {
let available = false; let available = false;
let checkCalls = 0; let checkCalls = 0;

View File

@@ -2,6 +2,7 @@ import type { BrowserWindow, Extension } from 'electron';
import { mergeTokens } from '../../token-merger'; import { mergeTokens } from '../../token-merger';
import { createLogger } from '../../logger'; import { createLogger } from '../../logger';
import { import {
FrequencyDictionaryMatchMode,
MergedToken, MergedToken,
NPlusOneMatchMode, NPlusOneMatchMode,
SubtitleData, SubtitleData,
@@ -36,6 +37,7 @@ export interface TokenizerServiceDeps {
getNPlusOneEnabled?: () => boolean; getNPlusOneEnabled?: () => boolean;
getJlptEnabled?: () => boolean; getJlptEnabled?: () => boolean;
getFrequencyDictionaryEnabled?: () => boolean; getFrequencyDictionaryEnabled?: () => boolean;
getFrequencyDictionaryMatchMode?: () => FrequencyDictionaryMatchMode;
getFrequencyRank?: FrequencyDictionaryLookup; getFrequencyRank?: FrequencyDictionaryLookup;
getMinSentenceWordsForNPlusOne?: () => number; getMinSentenceWordsForNPlusOne?: () => number;
getYomitanGroupDebugEnabled?: () => boolean; getYomitanGroupDebugEnabled?: () => boolean;
@@ -63,6 +65,7 @@ export interface TokenizerDepsRuntimeOptions {
getNPlusOneEnabled?: () => boolean; getNPlusOneEnabled?: () => boolean;
getJlptEnabled?: () => boolean; getJlptEnabled?: () => boolean;
getFrequencyDictionaryEnabled?: () => boolean; getFrequencyDictionaryEnabled?: () => boolean;
getFrequencyDictionaryMatchMode?: () => FrequencyDictionaryMatchMode;
getFrequencyRank?: FrequencyDictionaryLookup; getFrequencyRank?: FrequencyDictionaryLookup;
getMinSentenceWordsForNPlusOne?: () => number; getMinSentenceWordsForNPlusOne?: () => number;
getYomitanGroupDebugEnabled?: () => boolean; getYomitanGroupDebugEnabled?: () => boolean;
@@ -73,6 +76,7 @@ interface TokenizerAnnotationOptions {
nPlusOneEnabled: boolean; nPlusOneEnabled: boolean;
jlptEnabled: boolean; jlptEnabled: boolean;
frequencyEnabled: boolean; frequencyEnabled: boolean;
frequencyMatchMode: FrequencyDictionaryMatchMode;
minSentenceWordsForNPlusOne: number | undefined; minSentenceWordsForNPlusOne: number | undefined;
} }
@@ -139,7 +143,6 @@ async function applyAnnotationStage(
isKnownWord: getKnownWordLookup(deps, options), isKnownWord: getKnownWordLookup(deps, options),
knownWordMatchMode: deps.getKnownWordMatchMode(), knownWordMatchMode: deps.getKnownWordMatchMode(),
getJlptLevel: deps.getJlptLevel, getJlptLevel: deps.getJlptLevel,
getFrequencyRank: deps.getFrequencyRank,
}, },
options, options,
); );
@@ -164,6 +167,8 @@ export function createTokenizerDepsRuntime(
getNPlusOneEnabled: options.getNPlusOneEnabled, getNPlusOneEnabled: options.getNPlusOneEnabled,
getJlptEnabled: options.getJlptEnabled, getJlptEnabled: options.getJlptEnabled,
getFrequencyDictionaryEnabled: options.getFrequencyDictionaryEnabled, getFrequencyDictionaryEnabled: options.getFrequencyDictionaryEnabled,
getFrequencyDictionaryMatchMode:
options.getFrequencyDictionaryMatchMode ?? (() => 'headword'),
getFrequencyRank: options.getFrequencyRank, getFrequencyRank: options.getFrequencyRank,
getMinSentenceWordsForNPlusOne: options.getMinSentenceWordsForNPlusOne ?? (() => 3), getMinSentenceWordsForNPlusOne: options.getMinSentenceWordsForNPlusOne ?? (() => 3),
getYomitanGroupDebugEnabled: options.getYomitanGroupDebugEnabled ?? (() => false), getYomitanGroupDebugEnabled: options.getYomitanGroupDebugEnabled ?? (() => false),
@@ -224,7 +229,24 @@ function normalizePositiveFrequencyRank(value: unknown): number | null {
return Math.max(1, Math.floor(value)); return Math.max(1, Math.floor(value));
} }
function resolveFrequencyLookupText(token: MergedToken): string { function normalizeFrequencyLookupText(rawText: string): string {
return rawText.trim().toLowerCase();
}
function resolveFrequencyLookupText(
token: MergedToken,
matchMode: FrequencyDictionaryMatchMode,
): string {
if (matchMode === 'surface') {
if (token.surface && token.surface.length > 0) {
return token.surface;
}
if (token.headword && token.headword.length > 0) {
return token.headword;
}
return token.reading;
}
if (token.headword && token.headword.length > 0) { if (token.headword && token.headword.length > 0) {
return token.headword; return token.headword;
} }
@@ -234,43 +256,128 @@ function resolveFrequencyLookupText(token: MergedToken): string {
return token.surface; return token.surface;
} }
function applyYomitanFrequencyRanks( function buildYomitanFrequencyTermReadingList(
tokens: MergedToken[], tokens: MergedToken[],
frequencies: ReadonlyArray<{ term: string; frequency: number }>, matchMode: FrequencyDictionaryMatchMode,
): MergedToken[] { ): Array<{ term: string; reading: string | null }> {
if (tokens.length === 0 || frequencies.length === 0) { return tokens
return tokens; .map((token) => {
const term = resolveFrequencyLookupText(token, matchMode).trim();
if (!term) {
return null;
} }
const readingRaw =
token.reading && token.reading.trim().length > 0 ? token.reading.trim() : null;
const reading = matchMode === 'headword' ? null : readingRaw;
return { term, reading };
})
.filter((pair): pair is { term: string; reading: string | null } => pair !== null);
}
const rankByTerm = new Map<string, number>(); function buildYomitanFrequencyRankMap(
frequencies: ReadonlyArray<{ term: string; frequency: number; dictionaryPriority?: number }>,
): Map<string, number> {
const rankByTerm = new Map<string, { rank: number; dictionaryPriority: number }>();
for (const frequency of frequencies) { for (const frequency of frequencies) {
const normalizedTerm = frequency.term.trim(); const normalizedTerm = frequency.term.trim();
const rank = normalizePositiveFrequencyRank(frequency.frequency); const rank = normalizePositiveFrequencyRank(frequency.frequency);
if (!normalizedTerm || rank === null) { if (!normalizedTerm || rank === null) {
continue; continue;
} }
const dictionaryPriority =
typeof frequency.dictionaryPriority === 'number' && Number.isFinite(frequency.dictionaryPriority)
? Math.max(0, Math.floor(frequency.dictionaryPriority))
: Number.MAX_SAFE_INTEGER;
const current = rankByTerm.get(normalizedTerm); const current = rankByTerm.get(normalizedTerm);
if (current === undefined || rank < current) { if (
rankByTerm.set(normalizedTerm, rank); current === undefined ||
dictionaryPriority < current.dictionaryPriority ||
(dictionaryPriority === current.dictionaryPriority && rank < current.rank)
) {
rankByTerm.set(normalizedTerm, { rank, dictionaryPriority });
} }
} }
if (rankByTerm.size === 0) { const collapsedRankByTerm = new Map<string, number>();
for (const [term, entry] of rankByTerm.entries()) {
collapsedRankByTerm.set(term, entry.rank);
}
return collapsedRankByTerm;
}
/**
 * Resolves a frequency rank through the supplied lookup, memoising per normalised text.
 *
 * @param lookupText Text to look up; it is normalised before use as lookup argument and cache key.
 * @param getFrequencyRank Lookup callback. A throw is treated the same as "no rank".
 * @param cache Map of results keyed by normalised text; misses are stored as `null`.
 * @returns The normalised positive rank, or `null` when the text is empty or has no usable rank.
 */
function getLocalFrequencyRank(
  lookupText: string,
  getFrequencyRank: FrequencyDictionaryLookup,
  cache: Map<string, number | null>,
): number | null {
  const cacheKey = normalizeFrequencyLookupText(lookupText);
  if (cacheKey.length === 0) {
    return null;
  }

  if (cache.has(cacheKey)) {
    return cache.get(cacheKey) ?? null;
  }

  // Best-effort lookup: a failing lookup yields no rank instead of propagating.
  const lookUpSafely = (): number | null => {
    try {
      return getFrequencyRank(cacheKey);
    } catch {
      return null;
    }
  };

  const resolvedRank = normalizePositiveFrequencyRank(lookUpSafely());
  cache.set(cacheKey, resolvedRank);
  return resolvedRank;
}
function applyFrequencyRanks(
tokens: MergedToken[],
matchMode: FrequencyDictionaryMatchMode,
yomitanRankByTerm: Map<string, number>,
getFrequencyRank: FrequencyDictionaryLookup | undefined,
): MergedToken[] {
if (tokens.length === 0) {
return tokens; return tokens;
} }
const localLookupCache = new Map<string, number | null>();
return tokens.map((token) => { return tokens.map((token) => {
const lookupText = resolveFrequencyLookupText(token).trim(); const existingRank = normalizePositiveFrequencyRank(token.frequencyRank);
if (!lookupText) { if (existingRank !== null) {
return token;
}
const rank = rankByTerm.get(lookupText);
if (rank === undefined) {
return token;
}
return { return {
...token, ...token,
frequencyRank: rank, frequencyRank: existingRank,
};
}
const lookupText = resolveFrequencyLookupText(token, matchMode).trim();
if (!lookupText) {
return {
...token,
frequencyRank: undefined,
};
}
const yomitanRank = yomitanRankByTerm.get(lookupText);
if (yomitanRank !== undefined) {
return {
...token,
frequencyRank: yomitanRank,
};
}
if (!getFrequencyRank) {
return {
...token,
frequencyRank: undefined,
};
}
const localRank = getLocalFrequencyRank(lookupText, getFrequencyRank, localLookupCache);
return {
...token,
frequencyRank: localRank ?? undefined,
}; };
}); });
} }
@@ -280,6 +387,7 @@ function getAnnotationOptions(deps: TokenizerServiceDeps): TokenizerAnnotationOp
nPlusOneEnabled: deps.getNPlusOneEnabled?.() !== false, nPlusOneEnabled: deps.getNPlusOneEnabled?.() !== false,
jlptEnabled: deps.getJlptEnabled?.() !== false, jlptEnabled: deps.getJlptEnabled?.() !== false,
frequencyEnabled: deps.getFrequencyDictionaryEnabled?.() !== false, frequencyEnabled: deps.getFrequencyDictionaryEnabled?.() !== false,
frequencyMatchMode: deps.getFrequencyDictionaryMatchMode?.() ?? 'headword',
minSentenceWordsForNPlusOne: deps.getMinSentenceWordsForNPlusOne?.(), minSentenceWordsForNPlusOne: deps.getMinSentenceWordsForNPlusOne?.(),
}; };
} }
@@ -307,24 +415,23 @@ async function parseWithYomitanInternalParser(
logSelectedYomitanGroups(text, selectedTokens); logSelectedYomitanGroups(text, selectedTokens);
} }
let tokensWithFrequency = selectedTokens; let yomitanRankByTerm = new Map<string, number>();
if (options.frequencyEnabled) { if (options.frequencyEnabled) {
const termReadingList = selectedTokens.map((token) => ({ const frequencyMatchMode = options.frequencyMatchMode;
term: resolveFrequencyLookupText(token), const termReadingList = buildYomitanFrequencyTermReadingList(
reading: token.reading && token.reading.trim().length > 0 ? token.reading.trim() : null, selectedTokens,
})); frequencyMatchMode,
);
const yomitanFrequencies = await requestYomitanTermFrequencies(termReadingList, deps, logger); const yomitanFrequencies = await requestYomitanTermFrequencies(termReadingList, deps, logger);
tokensWithFrequency = applyYomitanFrequencyRanks(selectedTokens, yomitanFrequencies); yomitanRankByTerm = buildYomitanFrequencyRankMap(yomitanFrequencies);
}
if (!needsMecabPosEnrichment(options)) {
return tokensWithFrequency;
} }
let enrichedTokens = selectedTokens;
if (needsMecabPosEnrichment(options)) {
try { try {
const mecabTokens = await deps.tokenizeWithMecab(text); const mecabTokens = await deps.tokenizeWithMecab(text);
const enrichTokensWithMecab = deps.enrichTokensWithMecab ?? enrichTokensWithMecabAsync; const enrichTokensWithMecab = deps.enrichTokensWithMecab ?? enrichTokensWithMecabAsync;
return await enrichTokensWithMecab(tokensWithFrequency, mecabTokens); enrichedTokens = await enrichTokensWithMecab(enrichedTokens, mecabTokens);
} catch (err) { } catch (err) {
const error = err as Error; const error = err as Error;
logger.warn( logger.warn(
@@ -333,8 +440,19 @@ async function parseWithYomitanInternalParser(
`tokenCount=${selectedTokens.length}`, `tokenCount=${selectedTokens.length}`,
`textLength=${text.length}`, `textLength=${text.length}`,
); );
return tokensWithFrequency;
} }
}
if (options.frequencyEnabled) {
return applyFrequencyRanks(
enrichedTokens,
options.frequencyMatchMode,
yomitanRankByTerm,
deps.getFrequencyRank,
);
}
return enrichedTokens;
} }
export async function tokenizeSubtitle( export async function tokenizeSubtitle(

View File

@@ -51,15 +51,15 @@ test('annotateTokens known-word match mode uses headword vs surface', () => {
}); });
test('annotateTokens excludes frequency for particle/bound_auxiliary and pos1 exclusions', () => { test('annotateTokens excludes frequency for particle/bound_auxiliary and pos1 exclusions', () => {
const lookupCalls: string[] = [];
const tokens = [ const tokens = [
makeToken({ surface: 'は', headword: 'は', partOfSpeech: PartOfSpeech.particle }), makeToken({ surface: 'は', headword: 'は', partOfSpeech: PartOfSpeech.particle, frequencyRank: 3 }),
makeToken({ makeToken({
surface: 'です', surface: 'です',
headword: 'です', headword: 'です',
partOfSpeech: PartOfSpeech.bound_auxiliary, partOfSpeech: PartOfSpeech.bound_auxiliary,
startPos: 1, startPos: 1,
endPos: 3, endPos: 3,
frequencyRank: 4,
}), }),
makeToken({ makeToken({
surface: 'の', surface: 'の',
@@ -68,6 +68,7 @@ test('annotateTokens excludes frequency for particle/bound_auxiliary and pos1 ex
pos1: '助詞', pos1: '助詞',
startPos: 3, startPos: 3,
endPos: 4, endPos: 4,
frequencyRank: 5,
}), }),
makeToken({ makeToken({
surface: '猫', surface: '猫',
@@ -75,45 +76,36 @@ test('annotateTokens excludes frequency for particle/bound_auxiliary and pos1 ex
partOfSpeech: PartOfSpeech.noun, partOfSpeech: PartOfSpeech.noun,
startPos: 4, startPos: 4,
endPos: 5, endPos: 5,
frequencyRank: 11,
}), }),
]; ];
const result = annotateTokens( const result = annotateTokens(tokens, makeDeps());
tokens,
makeDeps({
getFrequencyRank: (text) => {
lookupCalls.push(text);
return text === '猫' ? 11 : 999;
},
}),
);
assert.equal(result[0]?.frequencyRank, undefined); assert.equal(result[0]?.frequencyRank, undefined);
assert.equal(result[1]?.frequencyRank, undefined); assert.equal(result[1]?.frequencyRank, undefined);
assert.equal(result[2]?.frequencyRank, undefined); assert.equal(result[2]?.frequencyRank, undefined);
assert.equal(result[3]?.frequencyRank, 11); assert.equal(result[3]?.frequencyRank, 11);
assert.deepEqual(lookupCalls, ['猫']);
}); });
// A valid rank that is already on the token is kept when frequency annotation is enabled.
test('annotateTokens preserves existing frequency rank when frequency is enabled', () => {
  const tokens = [makeToken({ surface: '猫', headword: '猫', frequencyRank: 42 })];

  const result = annotateTokens(tokens, makeDeps());

  assert.equal(result[0]?.frequencyRank, 42);
});

// A NaN rank is expected to be replaced by `undefined`.
test('annotateTokens drops invalid frequency rank values', () => {
  const tokens = [makeToken({ surface: '猫', headword: '猫', frequencyRank: Number.NaN })];

  const result = annotateTokens(tokens, makeDeps());

  assert.equal(result[0]?.frequencyRank, undefined);
});

// With `frequencyEnabled: false`, even a valid pre-existing rank is cleared.
test('annotateTokens clears frequency rank when frequency is disabled', () => {
  const tokens = [makeToken({ surface: '猫', headword: '猫', frequencyRank: 42 })];

  const result = annotateTokens(tokens, makeDeps(), { frequencyEnabled: false });

  assert.equal(result[0]?.frequencyRank, undefined);
});
test('annotateTokens handles JLPT disabled and eligibility exclusion paths', () => { test('annotateTokens handles JLPT disabled and eligibility exclusion paths', () => {

View File

@@ -1,6 +1,5 @@
import { markNPlusOneTargets } from '../../../token-merger'; import { markNPlusOneTargets } from '../../../token-merger';
import { import {
FrequencyDictionaryLookup,
JlptLevel, JlptLevel,
MergedToken, MergedToken,
NPlusOneMatchMode, NPlusOneMatchMode,
@@ -12,22 +11,16 @@ const KATAKANA_TO_HIRAGANA_OFFSET = 0x60;
const KATAKANA_CODEPOINT_START = 0x30a1; const KATAKANA_CODEPOINT_START = 0x30a1;
const KATAKANA_CODEPOINT_END = 0x30f6; const KATAKANA_CODEPOINT_END = 0x30f6;
const JLPT_LEVEL_LOOKUP_CACHE_LIMIT = 2048; const JLPT_LEVEL_LOOKUP_CACHE_LIMIT = 2048;
const FREQUENCY_RANK_LOOKUP_CACHE_LIMIT = 2048;
const jlptLevelLookupCaches = new WeakMap< const jlptLevelLookupCaches = new WeakMap<
(text: string) => JlptLevel | null, (text: string) => JlptLevel | null,
Map<string, JlptLevel | null> Map<string, JlptLevel | null>
>(); >();
const frequencyRankLookupCaches = new WeakMap<
FrequencyDictionaryLookup,
Map<string, number | null>
>();
export interface AnnotationStageDeps { export interface AnnotationStageDeps {
isKnownWord: (text: string) => boolean; isKnownWord: (text: string) => boolean;
knownWordMatchMode: NPlusOneMatchMode; knownWordMatchMode: NPlusOneMatchMode;
getJlptLevel: (text: string) => JlptLevel | null; getJlptLevel: (text: string) => JlptLevel | null;
getFrequencyRank?: FrequencyDictionaryLookup;
} }
export interface AnnotationStageOptions { export interface AnnotationStageOptions {
@@ -60,67 +53,6 @@ function applyKnownWordMarking(
}); });
} }
/**
 * Canonicalizes text for frequency lookups: surrounding whitespace is removed
 * and the remainder is lower-cased.
 */
function normalizeFrequencyLookupText(rawText: string): string {
  const trimmed = rawText.trim();
  return trimmed.toLowerCase();
}
/**
 * Resolves a frequency rank through `getFrequencyRank`, memoizing results in a
 * cache kept per lookup function in `frequencyRankLookupCaches`.
 *
 * @param lookupText - Raw text; normalized with `normalizeFrequencyLookupText`.
 * @param getFrequencyRank - Dictionary lookup callback; also the cache owner key.
 * @returns A finite, strictly positive rank, or `null` when the text is empty
 *   after normalization, the lookup throws, or the rank is not usable.
 */
function getCachedFrequencyRank(
  lookupText: string,
  getFrequencyRank: FrequencyDictionaryLookup,
): number | null {
  const cacheKey = normalizeFrequencyLookupText(lookupText);
  if (!cacheKey) {
    return null;
  }

  let lookupCache = frequencyRankLookupCaches.get(getFrequencyRank);
  if (lookupCache === undefined) {
    lookupCache = new Map<string, number | null>();
    frequencyRankLookupCaches.set(getFrequencyRank, lookupCache);
  }

  if (lookupCache.has(cacheKey)) {
    const memoized = lookupCache.get(cacheKey);
    return memoized ?? null;
  }

  let fetched: number | null = null;
  try {
    fetched = getFrequencyRank(cacheKey);
  } catch {
    // A throwing lookup counts as "no rank".
    fetched = null;
  }

  // Only finite, strictly positive ranks are kept; everything else becomes null.
  const rank = fetched !== null && Number.isFinite(fetched) && fetched > 0 ? fetched : null;
  lookupCache.set(cacheKey, rank);

  // Evict in insertion order until the cache is back within its size limit.
  for (const oldestKey of lookupCache.keys()) {
    if (lookupCache.size <= FREQUENCY_RANK_LOOKUP_CACHE_LIMIT) {
      break;
    }
    lookupCache.delete(oldestKey);
  }

  return rank;
}
/**
 * Picks the text used for a frequency lookup: the first non-empty value among
 * the token's headword and reading, otherwise its surface form.
 */
function resolveFrequencyLookupText(token: MergedToken): string {
  for (const candidate of [token.headword, token.reading]) {
    if (candidate && candidate.length > 0) {
      return candidate;
    }
  }
  return token.surface;
}
/**
 * Returns the trimmed lookup text for a token as a one-element list, or an
 * empty list when the trimmed text is empty.
 */
function getFrequencyLookupTextCandidates(token: MergedToken): string[] {
  const candidate = resolveFrequencyLookupText(token).trim();
  if (!candidate) {
    return [];
  }
  return [candidate];
}
function isFrequencyExcludedByPos(token: MergedToken): boolean { function isFrequencyExcludedByPos(token: MergedToken): boolean {
if ( if (
token.partOfSpeech === PartOfSpeech.particle || token.partOfSpeech === PartOfSpeech.particle ||
@@ -134,7 +66,6 @@ function isFrequencyExcludedByPos(token: MergedToken): boolean {
function applyFrequencyMarking( function applyFrequencyMarking(
tokens: MergedToken[], tokens: MergedToken[],
getFrequencyRank: FrequencyDictionaryLookup,
): MergedToken[] { ): MergedToken[] {
return tokens.map((token) => { return tokens.map((token) => {
if (isFrequencyExcludedByPos(token)) { if (isFrequencyExcludedByPos(token)) {
@@ -146,25 +77,9 @@ function applyFrequencyMarking(
return { ...token, frequencyRank: rank }; return { ...token, frequencyRank: rank };
} }
const lookupTexts = getFrequencyLookupTextCandidates(token);
if (lookupTexts.length === 0) {
return { ...token, frequencyRank: undefined };
}
let bestRank: number | null = null;
for (const lookupText of lookupTexts) {
const rank = getCachedFrequencyRank(lookupText, getFrequencyRank);
if (rank === null) {
continue;
}
if (bestRank === null || rank < bestRank) {
bestRank = rank;
}
}
return { return {
...token, ...token,
frequencyRank: bestRank ?? undefined, frequencyRank: undefined,
}; };
}); });
} }
@@ -357,16 +272,8 @@ export function annotateTokens(
const frequencyEnabled = options.frequencyEnabled !== false; const frequencyEnabled = options.frequencyEnabled !== false;
const frequencyMarkedTokens = const frequencyMarkedTokens =
frequencyEnabled && deps.getFrequencyRank frequencyEnabled
? applyFrequencyMarking(knownMarkedTokens, deps.getFrequencyRank) ? applyFrequencyMarking(knownMarkedTokens)
: frequencyEnabled
? knownMarkedTokens.map((token) => ({
...token,
frequencyRank:
typeof token.frequencyRank === 'number' && Number.isFinite(token.frequencyRank)
? Math.max(1, Math.floor(token.frequencyRank))
: undefined,
}))
: knownMarkedTokens.map((token) => ({ : knownMarkedTokens.map((token) => ({
...token, ...token,
frequencyRank: undefined, frequencyRank: undefined,

View File

@@ -94,10 +94,20 @@ test('requestYomitanTermFrequencies returns normalized frequency entries', async
term: '猫', term: '猫',
reading: 'ねこ', reading: 'ねこ',
dictionary: 'freq-dict', dictionary: 'freq-dict',
dictionaryPriority: 0,
frequency: 77, frequency: 77,
displayValue: '77', displayValue: '77',
displayValueParsed: true, displayValueParsed: true,
}, },
{
term: '鍛える',
reading: 'きたえる',
dictionary: 'freq-dict',
dictionaryPriority: 1,
frequency: 46961,
displayValue: '2847,46961',
displayValueParsed: true,
},
{ {
term: 'invalid', term: 'invalid',
dictionary: 'freq-dict', dictionary: 'freq-dict',
@@ -110,9 +120,12 @@ test('requestYomitanTermFrequencies returns normalized frequency entries', async
error: () => undefined, error: () => undefined,
}); });
assert.equal(result.length, 1); assert.equal(result.length, 2);
assert.equal(result[0]?.term, '猫'); assert.equal(result[0]?.term, '猫');
assert.equal(result[0]?.frequency, 77); assert.equal(result[0]?.frequency, 77);
assert.equal(result[0]?.dictionaryPriority, 0);
assert.equal(result[1]?.term, '鍛える');
assert.equal(result[1]?.frequency, 2847);
assert.match(scriptValue, /getTermFrequencies/); assert.match(scriptValue, /getTermFrequencies/);
assert.match(scriptValue, /optionsGetFull/); assert.match(scriptValue, /optionsGetFull/);
}); });

View File

@@ -19,6 +19,7 @@ export interface YomitanTermFrequency {
term: string; term: string;
reading: string | null; reading: string | null;
dictionary: string; dictionary: string;
dictionaryPriority: number;
frequency: number; frequency: number;
displayValue: string | null; displayValue: string | null;
displayValueParsed: boolean; displayValueParsed: boolean;
@@ -40,6 +41,32 @@ function asPositiveInteger(value: unknown): number | null {
return Math.max(1, Math.floor(value)); return Math.max(1, Math.floor(value));
} }
/**
 * Extracts a positive integer rank from the start of a frequency display string.
 *
 * The leading run of comma-separated digit groups is inspected:
 *   - a single group is parsed as-is (`"77"` -> 77);
 *   - groups that form a well-formed thousands-separated number (leading group
 *     of 1-3 digits, every following group exactly 3 digits) are joined
 *     (`"12,345,678"` -> 12345678);
 *   - anything else is treated as a comma-separated list and only the first
 *     group is used (`"2847,46961"` -> 2847, `"2847,469"` -> 2847).
 *
 * Text after the numeric prefix, including a dangling comma, is ignored.
 *
 * @param value - Raw display value.
 * @returns The parsed rank, or `null` when the string does not start with a
 *   digit or the parsed value is not a positive finite number.
 */
function parsePositiveFrequencyString(value: string): number | null {
  // Digit groups separated by single commas; a trailing or doubled comma ends the prefix.
  const numericPrefix = value.trim().match(/^\d+(?:,\d+)*/)?.[0];
  if (!numericPrefix) {
    return null;
  }

  const [leadingGroup = '', ...trailingGroups] = numericPrefix.split(',');

  // A thousands-grouped number never has more than three digits before the
  // first separator; without this check "2847,469" would be read as 2847469.
  const isThousandsGrouped =
    trailingGroups.length > 0 &&
    leadingGroup.length <= 3 &&
    trailingGroups.every((group) => group.length === 3);

  const digits = isThousandsGrouped ? leadingGroup + trailingGroups.join('') : leadingGroup;

  const parsed = Number.parseInt(digits, 10);
  return Number.isFinite(parsed) && parsed > 0 ? parsed : null;
}
function toYomitanTermFrequency(value: unknown): YomitanTermFrequency | null { function toYomitanTermFrequency(value: unknown): YomitanTermFrequency | null {
if (!isObject(value)) { if (!isObject(value)) {
return null; return null;
@@ -47,10 +74,24 @@ function toYomitanTermFrequency(value: unknown): YomitanTermFrequency | null {
const term = typeof value.term === 'string' ? value.term.trim() : ''; const term = typeof value.term === 'string' ? value.term.trim() : '';
const dictionary = typeof value.dictionary === 'string' ? value.dictionary.trim() : ''; const dictionary = typeof value.dictionary === 'string' ? value.dictionary.trim() : '';
const frequency = asPositiveInteger(value.frequency); const rawFrequency = asPositiveInteger(value.frequency);
const displayValueRaw =
value.displayValue === null
? null
: typeof value.displayValue === 'string'
? value.displayValue
: null;
const parsedDisplayFrequency =
displayValueRaw !== null ? parsePositiveFrequencyString(displayValueRaw) : null;
const frequency = parsedDisplayFrequency ?? rawFrequency;
if (!term || !dictionary || frequency === null) { if (!term || !dictionary || frequency === null) {
return null; return null;
} }
const dictionaryPriorityRaw = (value as { dictionaryPriority?: unknown }).dictionaryPriority;
const dictionaryPriority =
typeof dictionaryPriorityRaw === 'number' && Number.isFinite(dictionaryPriorityRaw)
? Math.max(0, Math.floor(dictionaryPriorityRaw))
: Number.MAX_SAFE_INTEGER;
const reading = const reading =
value.reading === null value.reading === null
@@ -58,18 +99,14 @@ function toYomitanTermFrequency(value: unknown): YomitanTermFrequency | null {
: typeof value.reading === 'string' : typeof value.reading === 'string'
? value.reading ? value.reading
: null; : null;
const displayValue = const displayValue = displayValueRaw;
value.displayValue === null
? null
: typeof value.displayValue === 'string'
? value.displayValue
: null;
const displayValueParsed = value.displayValueParsed === true; const displayValueParsed = value.displayValueParsed === true;
return { return {
term, term,
reading, reading,
dictionary, dictionary,
dictionaryPriority,
frequency, frequency,
displayValue, displayValue,
displayValueParsed, displayValueParsed,
@@ -278,20 +315,43 @@ export async function requestYomitanTermFrequencies(
const optionsFull = await invoke("optionsGetFull", undefined); const optionsFull = await invoke("optionsGetFull", undefined);
const profileIndex = optionsFull.profileCurrent; const profileIndex = optionsFull.profileCurrent;
const dictionariesRaw = optionsFull.profiles?.[profileIndex]?.options?.dictionaries ?? []; const dictionariesRaw = optionsFull.profiles?.[profileIndex]?.options?.dictionaries ?? [];
const dictionaries = Array.isArray(dictionariesRaw) const dictionaryEntries = Array.isArray(dictionariesRaw)
? dictionariesRaw ? dictionariesRaw
.filter((entry) => entry && typeof entry === "object" && entry.enabled === true && typeof entry.name === "string") .filter((entry) => entry && typeof entry === "object" && entry.enabled === true && typeof entry.name === "string")
.map((entry) => entry.name) .map((entry, index) => ({
name: entry.name,
id: typeof entry.id === "number" && Number.isFinite(entry.id) ? Math.floor(entry.id) : index
}))
.sort((a, b) => a.id - b.id)
: []; : [];
const dictionaries = dictionaryEntries.map((entry) => entry.name);
const dictionaryPriorityByName = dictionaryEntries.reduce((acc, entry, index) => {
acc[entry.name] = index;
return acc;
}, {});
if (dictionaries.length === 0) { if (dictionaries.length === 0) {
return []; return [];
} }
return await invoke("getTermFrequencies", { const rawFrequencies = await invoke("getTermFrequencies", {
termReadingList: ${JSON.stringify(normalizedTermReadingList)}, termReadingList: ${JSON.stringify(normalizedTermReadingList)},
dictionaries dictionaries
}); });
if (!Array.isArray(rawFrequencies)) {
return [];
}
return rawFrequencies
.filter((entry) => entry && typeof entry === "object")
.map((entry) => ({
...entry,
dictionaryPriority:
typeof entry.dictionary === "string" && dictionaryPriorityByName[entry.dictionary] !== undefined
? dictionaryPriorityByName[entry.dictionary]
: Number.MAX_SAFE_INTEGER
}));
})(); })();
`; `;

View File

@@ -2303,6 +2303,8 @@ const {
getJlptEnabled: () => getResolvedConfig().subtitleStyle.enableJlpt, getJlptEnabled: () => getResolvedConfig().subtitleStyle.enableJlpt,
getFrequencyDictionaryEnabled: () => getFrequencyDictionaryEnabled: () =>
getResolvedConfig().subtitleStyle.frequencyDictionary.enabled, getResolvedConfig().subtitleStyle.frequencyDictionary.enabled,
getFrequencyDictionaryMatchMode: () =>
getResolvedConfig().subtitleStyle.frequencyDictionary.matchMode,
getFrequencyRank: (text) => appState.frequencyRankLookup(text), getFrequencyRank: (text) => appState.frequencyRankLookup(text),
getYomitanGroupDebugEnabled: () => appState.overlayDebugVisualizationEnabled, getYomitanGroupDebugEnabled: () => appState.overlayDebugVisualizationEnabled,
getMecabTokenizer: () => appState.mecabTokenizer, getMecabTokenizer: () => appState.mecabTokenizer,

View File

@@ -128,6 +128,7 @@ test('composeMpvRuntimeHandlers returns callable handlers and forwards to inject
getJlptLevel: () => null, getJlptLevel: () => null,
getJlptEnabled: () => true, getJlptEnabled: () => true,
getFrequencyDictionaryEnabled: () => true, getFrequencyDictionaryEnabled: () => true,
getFrequencyDictionaryMatchMode: () => 'headword',
getFrequencyRank: () => null, getFrequencyRank: () => null,
getYomitanGroupDebugEnabled: () => false, getYomitanGroupDebugEnabled: () => false,
getMecabTokenizer: () => null, getMecabTokenizer: () => null,

View File

@@ -35,6 +35,7 @@ test('tokenizer deps builder records known-word lookups and maps readers', () =>
getJlptLevel: () => 'N2', getJlptLevel: () => 'N2',
getJlptEnabled: () => true, getJlptEnabled: () => true,
getFrequencyDictionaryEnabled: () => true, getFrequencyDictionaryEnabled: () => true,
getFrequencyDictionaryMatchMode: () => 'surface',
getFrequencyRank: () => 5, getFrequencyRank: () => 5,
getYomitanGroupDebugEnabled: () => false, getYomitanGroupDebugEnabled: () => false,
getMecabTokenizer: () => null, getMecabTokenizer: () => null,
@@ -47,6 +48,7 @@ test('tokenizer deps builder records known-word lookups and maps readers', () =>
deps.setYomitanParserInitPromise(null); deps.setYomitanParserInitPromise(null);
assert.equal(deps.getNPlusOneEnabled?.(), true); assert.equal(deps.getNPlusOneEnabled?.(), true);
assert.equal(deps.getMinSentenceWordsForNPlusOne?.(), 3); assert.equal(deps.getMinSentenceWordsForNPlusOne?.(), 3);
assert.equal(deps.getFrequencyDictionaryMatchMode?.(), 'surface');
assert.deepEqual(calls, ['lookup:true', 'lookup:false', 'set-window', 'set-ready', 'set-init']); assert.deepEqual(calls, ['lookup:true', 'lookup:false', 'set-window', 'set-ready', 'set-init']);
}); });

View File

@@ -5,6 +5,9 @@ type TokenizerMainDeps = TokenizerDepsRuntimeOptions & {
getFrequencyDictionaryEnabled: NonNullable< getFrequencyDictionaryEnabled: NonNullable<
TokenizerDepsRuntimeOptions['getFrequencyDictionaryEnabled'] TokenizerDepsRuntimeOptions['getFrequencyDictionaryEnabled']
>; >;
getFrequencyDictionaryMatchMode: NonNullable<
TokenizerDepsRuntimeOptions['getFrequencyDictionaryMatchMode']
>;
getFrequencyRank: NonNullable<TokenizerDepsRuntimeOptions['getFrequencyRank']>; getFrequencyRank: NonNullable<TokenizerDepsRuntimeOptions['getFrequencyRank']>;
getMinSentenceWordsForNPlusOne: NonNullable< getMinSentenceWordsForNPlusOne: NonNullable<
TokenizerDepsRuntimeOptions['getMinSentenceWordsForNPlusOne'] TokenizerDepsRuntimeOptions['getMinSentenceWordsForNPlusOne']
@@ -41,6 +44,7 @@ export function createBuildTokenizerDepsMainHandler(deps: TokenizerMainDeps) {
getJlptLevel: (text: string) => deps.getJlptLevel(text), getJlptLevel: (text: string) => deps.getJlptLevel(text),
getJlptEnabled: () => deps.getJlptEnabled(), getJlptEnabled: () => deps.getJlptEnabled(),
getFrequencyDictionaryEnabled: () => deps.getFrequencyDictionaryEnabled(), getFrequencyDictionaryEnabled: () => deps.getFrequencyDictionaryEnabled(),
getFrequencyDictionaryMatchMode: () => deps.getFrequencyDictionaryMatchMode(),
getFrequencyRank: (text: string) => deps.getFrequencyRank(text), getFrequencyRank: (text: string) => deps.getFrequencyRank(text),
getYomitanGroupDebugEnabled: () => deps.getYomitanGroupDebugEnabled(), getYomitanGroupDebugEnabled: () => deps.getYomitanGroupDebugEnabled(),
getMecabTokenizer: () => deps.getMecabTokenizer(), getMecabTokenizer: () => deps.getMecabTokenizer(),

View File

@@ -79,7 +79,7 @@ test('computeWordClass preserves known and n+1 classes while adding JLPT classes
assert.equal(computeWordClass(nPlusOneJlpt), 'word word-n-plus-one word-jlpt-n2'); assert.equal(computeWordClass(nPlusOneJlpt), 'word word-n-plus-one word-jlpt-n2');
}); });
test('computeWordClass does not add frequency class to known or N+1 terms', () => { test('computeWordClass keeps known/N+1 color classes exclusive over frequency classes', () => {
const known = createToken({ const known = createToken({
isKnown: true, isKnown: true,
frequencyRank: 10, frequencyRank: 10,
@@ -231,7 +231,7 @@ test('getFrequencyRankLabelForToken returns rank only for frequency-colored toke
const outOfRangeToken = createToken({ surface: '圏外', frequencyRank: 1000 }); const outOfRangeToken = createToken({ surface: '圏外', frequencyRank: 1000 });
assert.equal(getFrequencyRankLabelForToken(frequencyToken, settings), '20'); assert.equal(getFrequencyRankLabelForToken(frequencyToken, settings), '20');
assert.equal(getFrequencyRankLabelForToken(knownToken, settings), null); assert.equal(getFrequencyRankLabelForToken(knownToken, settings), '20');
assert.equal(getFrequencyRankLabelForToken(outOfRangeToken, settings), null); assert.equal(getFrequencyRankLabelForToken(outOfRangeToken, settings), null);
}); });

View File

@@ -184,7 +184,7 @@ export function getFrequencyRankLabelForToken(
token: MergedToken, token: MergedToken,
frequencySettings?: Partial<FrequencyRenderSettings>, frequencySettings?: Partial<FrequencyRenderSettings>,
): string | null { ): string | null {
if (token.isKnown || token.isNPlusOneTarget) { if (token.isNPlusOneTarget) {
return null; return null;
} }

View File

@@ -177,6 +177,7 @@ export type RuntimeOptionValueType = 'boolean' | 'enum';
export type RuntimeOptionValue = boolean | string; export type RuntimeOptionValue = boolean | string;
export type NPlusOneMatchMode = 'headword' | 'surface'; export type NPlusOneMatchMode = 'headword' | 'surface';
export type FrequencyDictionaryMatchMode = 'headword' | 'surface';
export interface RuntimeOptionState { export interface RuntimeOptionState {
id: RuntimeOptionId; id: RuntimeOptionId;
@@ -312,6 +313,7 @@ export interface SubtitleStyleConfig {
sourcePath?: string; sourcePath?: string;
topX?: number; topX?: number;
mode?: FrequencyDictionaryMode; mode?: FrequencyDictionaryMode;
matchMode?: FrequencyDictionaryMatchMode;
singleColor?: string; singleColor?: string;
bandedColors?: [string, string, string, string, string]; bandedColors?: [string, string, string, string, string];
}; };
@@ -536,6 +538,7 @@ export interface ResolvedConfig {
sourcePath: string; sourcePath: string;
topX: number; topX: number;
mode: FrequencyDictionaryMode; mode: FrequencyDictionaryMode;
matchMode: FrequencyDictionaryMatchMode;
singleColor: string; singleColor: string;
bandedColors: [string, string, string, string, string]; bandedColors: [string, string, string, string, string];
}; };