mirror of
https://github.com/ksyasuda/SubMiner.git
synced 2026-03-02 06:22:42 -08:00
fix(subtitle): stabilize frequency highlighting with yomitan ranks
This commit is contained in:
@@ -136,6 +136,7 @@
|
||||
"sourcePath": "", // Optional absolute path to a frequency dictionary directory. If empty, SubMiner searches installed/default frequency-dictionary locations.
|
||||
"topX": 1000, // Only color tokens with frequency rank <= topX (default: 1000).
|
||||
"mode": "single", // single: use one color for all matching tokens. banded: use color ramp by frequency band. Values: single | banded
|
||||
"matchMode": "headword", // Frequency lookup text selection mode. Values: headword | surface
|
||||
"singleColor": "#f5a97f", // Color used when frequencyDictionary.mode is `single`.
|
||||
"bandedColors": [
|
||||
"#ed8796",
|
||||
|
||||
@@ -601,6 +601,8 @@ See `config.example.jsonc` for detailed configuration options and more examples.
|
||||
|
||||
**Supported commands:** Any valid mpv JSON IPC command array (`["cycle", "pause"]`, `["seek", 5]`, `["script-binding", "..."]`, etc.)
|
||||
|
||||
For subtitle-position and subtitle-track proxy commands (`sub-pos`, `sid`, `secondary-sid`), SubMiner also shows an mpv OSD notification after the command runs.
|
||||
|
||||
**See `config.example.jsonc`** for more keybinding examples and configuration options.
|
||||
|
||||
### Runtime Option Palette
|
||||
@@ -760,6 +762,7 @@ See `config.example.jsonc` for detailed configuration options.
|
||||
| `frequencyDictionary.sourcePath` | string | Path to a local frequency dictionary root. Leave empty or omit to use installed/default frequency-dictionary search paths. |
|
||||
| `frequencyDictionary.topX` | number | Only color tokens whose frequency rank is `<= topX` (`1000` by default) |
|
||||
| `frequencyDictionary.mode` | string | `"single"` or `"banded"` (`"single"` by default) |
|
||||
| `frequencyDictionary.matchMode` | string | `"headword"` or `"surface"` (`"headword"` by default) |
|
||||
| `frequencyDictionary.singleColor` | string | Color used for all highlighted tokens in single mode |
|
||||
| `frequencyDictionary.bandedColors` | string[] | Array of five hex colors used for ranked bands in banded mode |
|
||||
| `nPlusOneColor` | string | Existing n+1 highlight color (default: `#c6a0f6`) |
|
||||
@@ -776,6 +779,7 @@ Lookup behavior:
|
||||
- Set `frequencyDictionary.sourcePath` to a directory containing `term_meta_bank_*.json` for a fully custom source.
|
||||
- If `sourcePath` is missing or empty, SubMiner searches default install/runtime locations for `frequency-dictionary` directories (for example app resources, user data paths, and current working directory).
|
||||
- In both cases, only terms with a valid `frequencyRank` are used; everything else falls back to no highlighting.
|
||||
- `frequencyDictionary.matchMode` controls which token text is used for frequency lookups: `headword` (dictionary form) or `surface` (visible subtitle text).
|
||||
|
||||
In `single` mode all highlights use `singleColor`; in `banded` mode tokens map to five ascending color bands from most common to least common inside the topX window.
|
||||
|
||||
|
||||
@@ -136,6 +136,7 @@
|
||||
"sourcePath": "", // Optional absolute path to a frequency dictionary directory. If empty, SubMiner searches installed/default frequency-dictionary locations.
|
||||
"topX": 1000, // Only color tokens with frequency rank <= topX (default: 1000).
|
||||
"mode": "single", // single: use one color for all matching tokens. banded: use color ramp by frequency band. Values: single | banded
|
||||
"matchMode": "headword", // Frequency lookup text selection mode. Values: headword | surface
|
||||
"singleColor": "#f5a97f", // Color used when frequencyDictionary.mode is `single`.
|
||||
"bandedColors": [
|
||||
"#ed8796",
|
||||
|
||||
@@ -33,6 +33,7 @@ export const SUBTITLE_DEFAULT_CONFIG: Pick<ResolvedConfig, 'subtitleStyle'> = {
|
||||
sourcePath: '',
|
||||
topX: 1000,
|
||||
mode: 'single',
|
||||
matchMode: 'headword',
|
||||
singleColor: '#f5a97f',
|
||||
bandedColors: ['#ed8796', '#f5a97f', '#f9e2af', '#a6e3a1', '#8aadf4'],
|
||||
},
|
||||
|
||||
@@ -61,6 +61,14 @@ export function buildSubtitleConfigOptionRegistry(
|
||||
description:
|
||||
'single: use one color for all matching tokens. banded: use color ramp by frequency band.',
|
||||
},
|
||||
{
|
||||
path: 'subtitleStyle.frequencyDictionary.matchMode',
|
||||
kind: 'enum',
|
||||
enumValues: ['headword', 'surface'],
|
||||
defaultValue: defaultConfig.subtitleStyle.frequencyDictionary.matchMode,
|
||||
description:
|
||||
'headword: frequency lookup uses dictionary form. surface: lookup uses subtitle-visible token text.',
|
||||
},
|
||||
{
|
||||
path: 'subtitleStyle.frequencyDictionary.singleColor',
|
||||
kind: 'string',
|
||||
|
||||
@@ -102,9 +102,18 @@ export function applySubtitleDomainConfig(context: ResolveContext): void {
|
||||
const fallbackSubtitleStyleHoverTokenColor = resolved.subtitleStyle.hoverTokenColor;
|
||||
const fallbackSubtitleStyleHoverTokenBackgroundColor =
|
||||
resolved.subtitleStyle.hoverTokenBackgroundColor;
|
||||
const fallbackFrequencyDictionary = {
|
||||
...resolved.subtitleStyle.frequencyDictionary,
|
||||
};
|
||||
resolved.subtitleStyle = {
|
||||
...resolved.subtitleStyle,
|
||||
...(src.subtitleStyle as ResolvedConfig['subtitleStyle']),
|
||||
frequencyDictionary: {
|
||||
...resolved.subtitleStyle.frequencyDictionary,
|
||||
...(isObject((src.subtitleStyle as { frequencyDictionary?: unknown }).frequencyDictionary)
|
||||
? ((src.subtitleStyle as { frequencyDictionary?: unknown }).frequencyDictionary as ResolvedConfig['subtitleStyle']['frequencyDictionary'])
|
||||
: {}),
|
||||
},
|
||||
secondary: {
|
||||
...resolved.subtitleStyle.secondary,
|
||||
...(isObject(src.subtitleStyle.secondary)
|
||||
@@ -186,6 +195,7 @@ export function applySubtitleDomainConfig(context: ResolveContext): void {
|
||||
if (frequencyEnabled !== undefined) {
|
||||
resolved.subtitleStyle.frequencyDictionary.enabled = frequencyEnabled;
|
||||
} else if ((frequencyDictionary as { enabled?: unknown }).enabled !== undefined) {
|
||||
resolved.subtitleStyle.frequencyDictionary.enabled = fallbackFrequencyDictionary.enabled;
|
||||
warn(
|
||||
'subtitleStyle.frequencyDictionary.enabled',
|
||||
(frequencyDictionary as { enabled?: unknown }).enabled,
|
||||
@@ -198,6 +208,7 @@ export function applySubtitleDomainConfig(context: ResolveContext): void {
|
||||
if (sourcePath !== undefined) {
|
||||
resolved.subtitleStyle.frequencyDictionary.sourcePath = sourcePath;
|
||||
} else if ((frequencyDictionary as { sourcePath?: unknown }).sourcePath !== undefined) {
|
||||
resolved.subtitleStyle.frequencyDictionary.sourcePath = fallbackFrequencyDictionary.sourcePath;
|
||||
warn(
|
||||
'subtitleStyle.frequencyDictionary.sourcePath',
|
||||
(frequencyDictionary as { sourcePath?: unknown }).sourcePath,
|
||||
@@ -210,6 +221,7 @@ export function applySubtitleDomainConfig(context: ResolveContext): void {
|
||||
if (topX !== undefined && Number.isInteger(topX) && topX > 0) {
|
||||
resolved.subtitleStyle.frequencyDictionary.topX = Math.floor(topX);
|
||||
} else if ((frequencyDictionary as { topX?: unknown }).topX !== undefined) {
|
||||
resolved.subtitleStyle.frequencyDictionary.topX = fallbackFrequencyDictionary.topX;
|
||||
warn(
|
||||
'subtitleStyle.frequencyDictionary.topX',
|
||||
(frequencyDictionary as { topX?: unknown }).topX,
|
||||
@@ -222,6 +234,7 @@ export function applySubtitleDomainConfig(context: ResolveContext): void {
|
||||
if (frequencyMode === 'single' || frequencyMode === 'banded') {
|
||||
resolved.subtitleStyle.frequencyDictionary.mode = frequencyMode;
|
||||
} else if (frequencyMode !== undefined) {
|
||||
resolved.subtitleStyle.frequencyDictionary.mode = fallbackFrequencyDictionary.mode;
|
||||
warn(
|
||||
'subtitleStyle.frequencyDictionary.mode',
|
||||
frequencyDictionary.mode,
|
||||
@@ -230,10 +243,24 @@ export function applySubtitleDomainConfig(context: ResolveContext): void {
|
||||
);
|
||||
}
|
||||
|
||||
const frequencyMatchMode = (frequencyDictionary as { matchMode?: unknown }).matchMode;
|
||||
if (frequencyMatchMode === 'headword' || frequencyMatchMode === 'surface') {
|
||||
resolved.subtitleStyle.frequencyDictionary.matchMode = frequencyMatchMode;
|
||||
} else if (frequencyMatchMode !== undefined) {
|
||||
resolved.subtitleStyle.frequencyDictionary.matchMode = fallbackFrequencyDictionary.matchMode;
|
||||
warn(
|
||||
'subtitleStyle.frequencyDictionary.matchMode',
|
||||
frequencyMatchMode,
|
||||
resolved.subtitleStyle.frequencyDictionary.matchMode,
|
||||
"Expected 'headword' or 'surface'.",
|
||||
);
|
||||
}
|
||||
|
||||
const singleColor = asColor((frequencyDictionary as { singleColor?: unknown }).singleColor);
|
||||
if (singleColor !== undefined) {
|
||||
resolved.subtitleStyle.frequencyDictionary.singleColor = singleColor;
|
||||
} else if ((frequencyDictionary as { singleColor?: unknown }).singleColor !== undefined) {
|
||||
resolved.subtitleStyle.frequencyDictionary.singleColor = fallbackFrequencyDictionary.singleColor;
|
||||
warn(
|
||||
'subtitleStyle.frequencyDictionary.singleColor',
|
||||
(frequencyDictionary as { singleColor?: unknown }).singleColor,
|
||||
@@ -248,6 +275,8 @@ export function applySubtitleDomainConfig(context: ResolveContext): void {
|
||||
if (bandedColors !== undefined) {
|
||||
resolved.subtitleStyle.frequencyDictionary.bandedColors = bandedColors;
|
||||
} else if ((frequencyDictionary as { bandedColors?: unknown }).bandedColors !== undefined) {
|
||||
resolved.subtitleStyle.frequencyDictionary.bandedColors =
|
||||
fallbackFrequencyDictionary.bandedColors;
|
||||
warn(
|
||||
'subtitleStyle.frequencyDictionary.bandedColors',
|
||||
(frequencyDictionary as { bandedColors?: unknown }).bandedColors,
|
||||
|
||||
@@ -27,3 +27,32 @@ test('subtitleStyle preserveLineBreaks falls back while merge is preserved', ()
|
||||
),
|
||||
);
|
||||
});
|
||||
|
||||
test('subtitleStyle frequencyDictionary.matchMode accepts valid values and warns on invalid', () => {
|
||||
const valid = createResolveContext({
|
||||
subtitleStyle: {
|
||||
frequencyDictionary: {
|
||||
matchMode: 'surface',
|
||||
},
|
||||
},
|
||||
});
|
||||
applySubtitleDomainConfig(valid.context);
|
||||
assert.equal(valid.context.resolved.subtitleStyle.frequencyDictionary.matchMode, 'surface');
|
||||
|
||||
const invalid = createResolveContext({
|
||||
subtitleStyle: {
|
||||
frequencyDictionary: {
|
||||
matchMode: 'reading' as unknown as 'headword' | 'surface',
|
||||
},
|
||||
},
|
||||
});
|
||||
applySubtitleDomainConfig(invalid.context);
|
||||
assert.equal(invalid.context.resolved.subtitleStyle.frequencyDictionary.matchMode, 'headword');
|
||||
assert.ok(
|
||||
invalid.warnings.some(
|
||||
(warning) =>
|
||||
warning.path === 'subtitleStyle.frequencyDictionary.matchMode' &&
|
||||
warning.message === "Expected 'headword' or 'surface'.",
|
||||
),
|
||||
);
|
||||
});
|
||||
|
||||
@@ -80,7 +80,7 @@ test('createFrequencyDictionaryLookup aggregates duplicate-term logs into a sing
|
||||
);
|
||||
});
|
||||
|
||||
test('createFrequencyDictionaryLookup prefers frequency.value over displayValue', async () => {
|
||||
test('createFrequencyDictionaryLookup prefers frequency.displayValue over value when both exist', async () => {
|
||||
const logs: string[] = [];
|
||||
const tempDir = fs.mkdtempSync(path.join(os.tmpdir(), 'subminer-frequency-dict-'));
|
||||
const bankPath = path.join(tempDir, 'term_meta_bank_1.json');
|
||||
@@ -88,6 +88,7 @@ test('createFrequencyDictionaryLookup prefers frequency.value over displayValue'
|
||||
bankPath,
|
||||
JSON.stringify([
|
||||
['猫', 1, { frequency: { value: 1234, displayValue: 1200 } }],
|
||||
['鍛える', 2, { frequency: { value: 46961, displayValue: 2847 } }],
|
||||
['犬', 2, { frequency: { displayValue: 88 } }],
|
||||
]),
|
||||
);
|
||||
@@ -99,10 +100,31 @@ test('createFrequencyDictionaryLookup prefers frequency.value over displayValue'
|
||||
},
|
||||
});
|
||||
|
||||
assert.equal(lookup('猫'), 1234);
|
||||
assert.equal(lookup('猫'), 1200);
|
||||
assert.equal(lookup('鍛える'), 2847);
|
||||
assert.equal(lookup('犬'), 88);
|
||||
assert.equal(
|
||||
logs.some((entry) => entry.includes('Frequency dictionary loaded from')),
|
||||
true,
|
||||
);
|
||||
});
|
||||
|
||||
test('createFrequencyDictionaryLookup parses composite displayValue by primary rank', async () => {
|
||||
const tempDir = fs.mkdtempSync(path.join(os.tmpdir(), 'subminer-frequency-dict-'));
|
||||
const bankPath = path.join(tempDir, 'term_meta_bank_1.json');
|
||||
fs.writeFileSync(
|
||||
bankPath,
|
||||
JSON.stringify([
|
||||
['鍛える', 1, { frequency: { displayValue: '3272,52377' } }],
|
||||
['高み', 2, { frequency: { displayValue: '9933,108961' } }],
|
||||
]),
|
||||
);
|
||||
|
||||
const lookup = await createFrequencyDictionaryLookup({
|
||||
searchPaths: [tempDir],
|
||||
log: () => undefined,
|
||||
});
|
||||
|
||||
assert.equal(lookup('鍛える'), 3272);
|
||||
assert.equal(lookup('高み'), 9933);
|
||||
});
|
||||
|
||||
@@ -18,6 +18,32 @@ function normalizeFrequencyTerm(value: string): string {
|
||||
return value.trim().toLowerCase();
|
||||
}
|
||||
|
||||
function parsePositiveFrequencyString(value: string): number | null {
|
||||
const trimmed = value.trim();
|
||||
if (!trimmed) {
|
||||
return null;
|
||||
}
|
||||
|
||||
const numericPrefix = trimmed.match(/^\d[\d,]*/)?.[0];
|
||||
if (!numericPrefix) {
|
||||
return null;
|
||||
}
|
||||
|
||||
const chunks = numericPrefix.split(',');
|
||||
const normalizedNumber =
|
||||
chunks.length <= 1
|
||||
? chunks[0] ?? ''
|
||||
: chunks.slice(1).every((chunk) => /^\d{3}$/.test(chunk))
|
||||
? chunks.join('')
|
||||
: (chunks[0] ?? '');
|
||||
const parsed = Number.parseInt(normalizedNumber, 10);
|
||||
if (!Number.isFinite(parsed) || parsed <= 0) {
|
||||
return null;
|
||||
}
|
||||
|
||||
return parsed;
|
||||
}
|
||||
|
||||
function parsePositiveFrequencyNumber(value: unknown): number | null {
|
||||
if (typeof value === 'number') {
|
||||
if (!Number.isFinite(value) || value <= 0) return null;
|
||||
@@ -25,10 +51,7 @@ function parsePositiveFrequencyNumber(value: unknown): number | null {
|
||||
}
|
||||
|
||||
if (typeof value === 'string') {
|
||||
const normalized = value.trim().replace(/,/g, '');
|
||||
const parsed = Number.parseInt(normalized, 10);
|
||||
if (!Number.isFinite(parsed) || parsed <= 0) return null;
|
||||
return parsed;
|
||||
return parsePositiveFrequencyString(value);
|
||||
}
|
||||
|
||||
return null;
|
||||
@@ -38,14 +61,14 @@ function extractFrequencyDisplayValue(meta: unknown): number | null {
|
||||
if (!meta || typeof meta !== 'object') return null;
|
||||
const frequency = (meta as { frequency?: unknown }).frequency;
|
||||
if (!frequency || typeof frequency !== 'object') return null;
|
||||
const rawValue = (frequency as { value?: unknown }).value;
|
||||
const parsedValue = parsePositiveFrequencyNumber(rawValue);
|
||||
if (parsedValue !== null) {
|
||||
return parsedValue;
|
||||
const displayValue = (frequency as { displayValue?: unknown }).displayValue;
|
||||
const parsedDisplayValue = parsePositiveFrequencyNumber(displayValue);
|
||||
if (parsedDisplayValue !== null) {
|
||||
return parsedDisplayValue;
|
||||
}
|
||||
|
||||
const displayValue = (frequency as { displayValue?: unknown }).displayValue;
|
||||
return parsePositiveFrequencyNumber(displayValue);
|
||||
const rawValue = (frequency as { value?: unknown }).value;
|
||||
return parsePositiveFrequencyNumber(rawValue);
|
||||
}
|
||||
|
||||
function asFrequencyDictionaryEntry(entry: unknown): FrequencyDictionaryEntry | null {
|
||||
|
||||
@@ -218,6 +218,119 @@ test('tokenizeSubtitle loads frequency ranks from Yomitan installed dictionaries
|
||||
assert.equal(result.tokens?.[0]?.frequencyRank, 77);
|
||||
});
|
||||
|
||||
test('tokenizeSubtitle queries headword frequencies without forcing surface reading', async () => {
|
||||
const result = await tokenizeSubtitle(
|
||||
'鍛えた',
|
||||
makeDeps({
|
||||
getFrequencyDictionaryEnabled: () => true,
|
||||
getYomitanExt: () => ({ id: 'dummy-ext' }) as any,
|
||||
getYomitanParserWindow: () =>
|
||||
({
|
||||
isDestroyed: () => false,
|
||||
webContents: {
|
||||
executeJavaScript: async (script: string) => {
|
||||
if (script.includes('getTermFrequencies')) {
|
||||
if (!script.includes('"term":"鍛える","reading":null')) {
|
||||
return [];
|
||||
}
|
||||
return [
|
||||
{
|
||||
term: '鍛える',
|
||||
reading: 'きたえる',
|
||||
dictionary: 'freq-dict',
|
||||
frequency: 46961,
|
||||
displayValue: '2847,46961',
|
||||
displayValueParsed: true,
|
||||
},
|
||||
];
|
||||
}
|
||||
|
||||
return [
|
||||
{
|
||||
source: 'scanning-parser',
|
||||
index: 0,
|
||||
content: [
|
||||
[
|
||||
{
|
||||
text: '鍛えた',
|
||||
reading: 'きた',
|
||||
headwords: [[{ term: '鍛える' }]],
|
||||
},
|
||||
],
|
||||
],
|
||||
},
|
||||
];
|
||||
},
|
||||
},
|
||||
}) as unknown as Electron.BrowserWindow,
|
||||
}),
|
||||
);
|
||||
|
||||
assert.equal(result.tokens?.length, 1);
|
||||
assert.equal(result.tokens?.[0]?.headword, '鍛える');
|
||||
assert.equal(result.tokens?.[0]?.reading, 'きた');
|
||||
assert.equal(result.tokens?.[0]?.frequencyRank, 2847);
|
||||
});
|
||||
|
||||
test('tokenizeSubtitle prefers Yomitan frequency from highest-priority dictionary', async () => {
|
||||
const result = await tokenizeSubtitle(
|
||||
'猫',
|
||||
makeDeps({
|
||||
getFrequencyDictionaryEnabled: () => true,
|
||||
getYomitanExt: () => ({ id: 'dummy-ext' }) as any,
|
||||
getYomitanParserWindow: () =>
|
||||
({
|
||||
isDestroyed: () => false,
|
||||
webContents: {
|
||||
executeJavaScript: async (script: string) => {
|
||||
if (script.includes('getTermFrequencies')) {
|
||||
return [
|
||||
{
|
||||
term: '猫',
|
||||
reading: 'ねこ',
|
||||
dictionary: 'low-priority',
|
||||
dictionaryPriority: 2,
|
||||
frequency: 5,
|
||||
displayValue: '5',
|
||||
displayValueParsed: true,
|
||||
},
|
||||
{
|
||||
term: '猫',
|
||||
reading: 'ねこ',
|
||||
dictionary: 'high-priority',
|
||||
dictionaryPriority: 0,
|
||||
frequency: 100,
|
||||
displayValue: '100',
|
||||
displayValueParsed: true,
|
||||
},
|
||||
];
|
||||
}
|
||||
|
||||
return [
|
||||
{
|
||||
source: 'scanning-parser',
|
||||
index: 0,
|
||||
content: [
|
||||
[
|
||||
{
|
||||
text: '猫',
|
||||
reading: 'ねこ',
|
||||
headwords: [[{ term: '猫' }]],
|
||||
},
|
||||
],
|
||||
],
|
||||
},
|
||||
];
|
||||
},
|
||||
},
|
||||
}) as unknown as Electron.BrowserWindow,
|
||||
}),
|
||||
);
|
||||
|
||||
assert.equal(result.tokens?.length, 1);
|
||||
assert.equal(result.tokens?.[0]?.frequencyRank, 100);
|
||||
});
|
||||
|
||||
test('tokenizeSubtitle uses only selected Yomitan headword for frequency lookup', async () => {
|
||||
const result = await tokenizeSubtitle(
|
||||
'猫です',
|
||||
@@ -1693,6 +1806,20 @@ test('tokenizeSubtitle checks known words by surface when configured', async ()
|
||||
assert.equal(result.tokens?.[0]?.isKnown, true);
|
||||
});
|
||||
|
||||
test('tokenizeSubtitle uses frequency surface match mode when configured', async () => {
|
||||
const result = await tokenizeSubtitle(
|
||||
'鍛えた',
|
||||
makeDepsFromYomitanTokens([{ surface: '鍛えた', reading: 'きたえた', headword: '鍛える' }], {
|
||||
getFrequencyDictionaryEnabled: () => true,
|
||||
getFrequencyDictionaryMatchMode: () => 'surface',
|
||||
getFrequencyRank: (text) => (text === '鍛えた' ? 2847 : null),
|
||||
}),
|
||||
);
|
||||
|
||||
assert.equal(result.text, '鍛えた');
|
||||
assert.equal(result.tokens?.[0]?.frequencyRank, 2847);
|
||||
});
|
||||
|
||||
test('createTokenizerDepsRuntime checks MeCab availability before first tokenizeWithMecab call', async () => {
|
||||
let available = false;
|
||||
let checkCalls = 0;
|
||||
|
||||
@@ -2,6 +2,7 @@ import type { BrowserWindow, Extension } from 'electron';
|
||||
import { mergeTokens } from '../../token-merger';
|
||||
import { createLogger } from '../../logger';
|
||||
import {
|
||||
FrequencyDictionaryMatchMode,
|
||||
MergedToken,
|
||||
NPlusOneMatchMode,
|
||||
SubtitleData,
|
||||
@@ -36,6 +37,7 @@ export interface TokenizerServiceDeps {
|
||||
getNPlusOneEnabled?: () => boolean;
|
||||
getJlptEnabled?: () => boolean;
|
||||
getFrequencyDictionaryEnabled?: () => boolean;
|
||||
getFrequencyDictionaryMatchMode?: () => FrequencyDictionaryMatchMode;
|
||||
getFrequencyRank?: FrequencyDictionaryLookup;
|
||||
getMinSentenceWordsForNPlusOne?: () => number;
|
||||
getYomitanGroupDebugEnabled?: () => boolean;
|
||||
@@ -63,6 +65,7 @@ export interface TokenizerDepsRuntimeOptions {
|
||||
getNPlusOneEnabled?: () => boolean;
|
||||
getJlptEnabled?: () => boolean;
|
||||
getFrequencyDictionaryEnabled?: () => boolean;
|
||||
getFrequencyDictionaryMatchMode?: () => FrequencyDictionaryMatchMode;
|
||||
getFrequencyRank?: FrequencyDictionaryLookup;
|
||||
getMinSentenceWordsForNPlusOne?: () => number;
|
||||
getYomitanGroupDebugEnabled?: () => boolean;
|
||||
@@ -73,6 +76,7 @@ interface TokenizerAnnotationOptions {
|
||||
nPlusOneEnabled: boolean;
|
||||
jlptEnabled: boolean;
|
||||
frequencyEnabled: boolean;
|
||||
frequencyMatchMode: FrequencyDictionaryMatchMode;
|
||||
minSentenceWordsForNPlusOne: number | undefined;
|
||||
}
|
||||
|
||||
@@ -139,7 +143,6 @@ async function applyAnnotationStage(
|
||||
isKnownWord: getKnownWordLookup(deps, options),
|
||||
knownWordMatchMode: deps.getKnownWordMatchMode(),
|
||||
getJlptLevel: deps.getJlptLevel,
|
||||
getFrequencyRank: deps.getFrequencyRank,
|
||||
},
|
||||
options,
|
||||
);
|
||||
@@ -164,6 +167,8 @@ export function createTokenizerDepsRuntime(
|
||||
getNPlusOneEnabled: options.getNPlusOneEnabled,
|
||||
getJlptEnabled: options.getJlptEnabled,
|
||||
getFrequencyDictionaryEnabled: options.getFrequencyDictionaryEnabled,
|
||||
getFrequencyDictionaryMatchMode:
|
||||
options.getFrequencyDictionaryMatchMode ?? (() => 'headword'),
|
||||
getFrequencyRank: options.getFrequencyRank,
|
||||
getMinSentenceWordsForNPlusOne: options.getMinSentenceWordsForNPlusOne ?? (() => 3),
|
||||
getYomitanGroupDebugEnabled: options.getYomitanGroupDebugEnabled ?? (() => false),
|
||||
@@ -224,7 +229,24 @@ function normalizePositiveFrequencyRank(value: unknown): number | null {
|
||||
return Math.max(1, Math.floor(value));
|
||||
}
|
||||
|
||||
function resolveFrequencyLookupText(token: MergedToken): string {
|
||||
function normalizeFrequencyLookupText(rawText: string): string {
|
||||
return rawText.trim().toLowerCase();
|
||||
}
|
||||
|
||||
function resolveFrequencyLookupText(
|
||||
token: MergedToken,
|
||||
matchMode: FrequencyDictionaryMatchMode,
|
||||
): string {
|
||||
if (matchMode === 'surface') {
|
||||
if (token.surface && token.surface.length > 0) {
|
||||
return token.surface;
|
||||
}
|
||||
if (token.headword && token.headword.length > 0) {
|
||||
return token.headword;
|
||||
}
|
||||
return token.reading;
|
||||
}
|
||||
|
||||
if (token.headword && token.headword.length > 0) {
|
||||
return token.headword;
|
||||
}
|
||||
@@ -234,43 +256,128 @@ function resolveFrequencyLookupText(token: MergedToken): string {
|
||||
return token.surface;
|
||||
}
|
||||
|
||||
function applyYomitanFrequencyRanks(
|
||||
function buildYomitanFrequencyTermReadingList(
|
||||
tokens: MergedToken[],
|
||||
frequencies: ReadonlyArray<{ term: string; frequency: number }>,
|
||||
): MergedToken[] {
|
||||
if (tokens.length === 0 || frequencies.length === 0) {
|
||||
return tokens;
|
||||
matchMode: FrequencyDictionaryMatchMode,
|
||||
): Array<{ term: string; reading: string | null }> {
|
||||
return tokens
|
||||
.map((token) => {
|
||||
const term = resolveFrequencyLookupText(token, matchMode).trim();
|
||||
if (!term) {
|
||||
return null;
|
||||
}
|
||||
const readingRaw =
|
||||
token.reading && token.reading.trim().length > 0 ? token.reading.trim() : null;
|
||||
const reading = matchMode === 'headword' ? null : readingRaw;
|
||||
return { term, reading };
|
||||
})
|
||||
.filter((pair): pair is { term: string; reading: string | null } => pair !== null);
|
||||
}
|
||||
|
||||
const rankByTerm = new Map<string, number>();
|
||||
function buildYomitanFrequencyRankMap(
|
||||
frequencies: ReadonlyArray<{ term: string; frequency: number; dictionaryPriority?: number }>,
|
||||
): Map<string, number> {
|
||||
const rankByTerm = new Map<string, { rank: number; dictionaryPriority: number }>();
|
||||
for (const frequency of frequencies) {
|
||||
const normalizedTerm = frequency.term.trim();
|
||||
const rank = normalizePositiveFrequencyRank(frequency.frequency);
|
||||
if (!normalizedTerm || rank === null) {
|
||||
continue;
|
||||
}
|
||||
const dictionaryPriority =
|
||||
typeof frequency.dictionaryPriority === 'number' && Number.isFinite(frequency.dictionaryPriority)
|
||||
? Math.max(0, Math.floor(frequency.dictionaryPriority))
|
||||
: Number.MAX_SAFE_INTEGER;
|
||||
const current = rankByTerm.get(normalizedTerm);
|
||||
if (current === undefined || rank < current) {
|
||||
rankByTerm.set(normalizedTerm, rank);
|
||||
if (
|
||||
current === undefined ||
|
||||
dictionaryPriority < current.dictionaryPriority ||
|
||||
(dictionaryPriority === current.dictionaryPriority && rank < current.rank)
|
||||
) {
|
||||
rankByTerm.set(normalizedTerm, { rank, dictionaryPriority });
|
||||
}
|
||||
}
|
||||
|
||||
if (rankByTerm.size === 0) {
|
||||
const collapsedRankByTerm = new Map<string, number>();
|
||||
for (const [term, entry] of rankByTerm.entries()) {
|
||||
collapsedRankByTerm.set(term, entry.rank);
|
||||
}
|
||||
|
||||
return collapsedRankByTerm;
|
||||
}
|
||||
|
||||
function getLocalFrequencyRank(
|
||||
lookupText: string,
|
||||
getFrequencyRank: FrequencyDictionaryLookup,
|
||||
cache: Map<string, number | null>,
|
||||
): number | null {
|
||||
const normalizedText = normalizeFrequencyLookupText(lookupText);
|
||||
if (!normalizedText) {
|
||||
return null;
|
||||
}
|
||||
|
||||
if (cache.has(normalizedText)) {
|
||||
return cache.get(normalizedText) ?? null;
|
||||
}
|
||||
|
||||
let rank: number | null;
|
||||
try {
|
||||
rank = getFrequencyRank(normalizedText);
|
||||
} catch {
|
||||
rank = null;
|
||||
}
|
||||
rank = normalizePositiveFrequencyRank(rank);
|
||||
cache.set(normalizedText, rank);
|
||||
return rank;
|
||||
}
|
||||
|
||||
function applyFrequencyRanks(
|
||||
tokens: MergedToken[],
|
||||
matchMode: FrequencyDictionaryMatchMode,
|
||||
yomitanRankByTerm: Map<string, number>,
|
||||
getFrequencyRank: FrequencyDictionaryLookup | undefined,
|
||||
): MergedToken[] {
|
||||
if (tokens.length === 0) {
|
||||
return tokens;
|
||||
}
|
||||
|
||||
const localLookupCache = new Map<string, number | null>();
|
||||
return tokens.map((token) => {
|
||||
const lookupText = resolveFrequencyLookupText(token).trim();
|
||||
if (!lookupText) {
|
||||
return token;
|
||||
}
|
||||
const rank = rankByTerm.get(lookupText);
|
||||
if (rank === undefined) {
|
||||
return token;
|
||||
}
|
||||
const existingRank = normalizePositiveFrequencyRank(token.frequencyRank);
|
||||
if (existingRank !== null) {
|
||||
return {
|
||||
...token,
|
||||
frequencyRank: rank,
|
||||
frequencyRank: existingRank,
|
||||
};
|
||||
}
|
||||
|
||||
const lookupText = resolveFrequencyLookupText(token, matchMode).trim();
|
||||
if (!lookupText) {
|
||||
return {
|
||||
...token,
|
||||
frequencyRank: undefined,
|
||||
};
|
||||
}
|
||||
|
||||
const yomitanRank = yomitanRankByTerm.get(lookupText);
|
||||
if (yomitanRank !== undefined) {
|
||||
return {
|
||||
...token,
|
||||
frequencyRank: yomitanRank,
|
||||
};
|
||||
}
|
||||
|
||||
if (!getFrequencyRank) {
|
||||
return {
|
||||
...token,
|
||||
frequencyRank: undefined,
|
||||
};
|
||||
}
|
||||
|
||||
const localRank = getLocalFrequencyRank(lookupText, getFrequencyRank, localLookupCache);
|
||||
return {
|
||||
...token,
|
||||
frequencyRank: localRank ?? undefined,
|
||||
};
|
||||
});
|
||||
}
|
||||
@@ -280,6 +387,7 @@ function getAnnotationOptions(deps: TokenizerServiceDeps): TokenizerAnnotationOp
|
||||
nPlusOneEnabled: deps.getNPlusOneEnabled?.() !== false,
|
||||
jlptEnabled: deps.getJlptEnabled?.() !== false,
|
||||
frequencyEnabled: deps.getFrequencyDictionaryEnabled?.() !== false,
|
||||
frequencyMatchMode: deps.getFrequencyDictionaryMatchMode?.() ?? 'headword',
|
||||
minSentenceWordsForNPlusOne: deps.getMinSentenceWordsForNPlusOne?.(),
|
||||
};
|
||||
}
|
||||
@@ -307,24 +415,23 @@ async function parseWithYomitanInternalParser(
|
||||
logSelectedYomitanGroups(text, selectedTokens);
|
||||
}
|
||||
|
||||
let tokensWithFrequency = selectedTokens;
|
||||
let yomitanRankByTerm = new Map<string, number>();
|
||||
if (options.frequencyEnabled) {
|
||||
const termReadingList = selectedTokens.map((token) => ({
|
||||
term: resolveFrequencyLookupText(token),
|
||||
reading: token.reading && token.reading.trim().length > 0 ? token.reading.trim() : null,
|
||||
}));
|
||||
const frequencyMatchMode = options.frequencyMatchMode;
|
||||
const termReadingList = buildYomitanFrequencyTermReadingList(
|
||||
selectedTokens,
|
||||
frequencyMatchMode,
|
||||
);
|
||||
const yomitanFrequencies = await requestYomitanTermFrequencies(termReadingList, deps, logger);
|
||||
tokensWithFrequency = applyYomitanFrequencyRanks(selectedTokens, yomitanFrequencies);
|
||||
}
|
||||
|
||||
if (!needsMecabPosEnrichment(options)) {
|
||||
return tokensWithFrequency;
|
||||
yomitanRankByTerm = buildYomitanFrequencyRankMap(yomitanFrequencies);
|
||||
}
|
||||
|
||||
let enrichedTokens = selectedTokens;
|
||||
if (needsMecabPosEnrichment(options)) {
|
||||
try {
|
||||
const mecabTokens = await deps.tokenizeWithMecab(text);
|
||||
const enrichTokensWithMecab = deps.enrichTokensWithMecab ?? enrichTokensWithMecabAsync;
|
||||
return await enrichTokensWithMecab(tokensWithFrequency, mecabTokens);
|
||||
enrichedTokens = await enrichTokensWithMecab(enrichedTokens, mecabTokens);
|
||||
} catch (err) {
|
||||
const error = err as Error;
|
||||
logger.warn(
|
||||
@@ -333,10 +440,21 @@ async function parseWithYomitanInternalParser(
|
||||
`tokenCount=${selectedTokens.length}`,
|
||||
`textLength=${text.length}`,
|
||||
);
|
||||
return tokensWithFrequency;
|
||||
}
|
||||
}
|
||||
|
||||
if (options.frequencyEnabled) {
|
||||
return applyFrequencyRanks(
|
||||
enrichedTokens,
|
||||
options.frequencyMatchMode,
|
||||
yomitanRankByTerm,
|
||||
deps.getFrequencyRank,
|
||||
);
|
||||
}
|
||||
|
||||
return enrichedTokens;
|
||||
}
|
||||
|
||||
export async function tokenizeSubtitle(
|
||||
text: string,
|
||||
deps: TokenizerServiceDeps,
|
||||
|
||||
@@ -51,15 +51,15 @@ test('annotateTokens known-word match mode uses headword vs surface', () => {
|
||||
});
|
||||
|
||||
test('annotateTokens excludes frequency for particle/bound_auxiliary and pos1 exclusions', () => {
|
||||
const lookupCalls: string[] = [];
|
||||
const tokens = [
|
||||
makeToken({ surface: 'は', headword: 'は', partOfSpeech: PartOfSpeech.particle }),
|
||||
makeToken({ surface: 'は', headword: 'は', partOfSpeech: PartOfSpeech.particle, frequencyRank: 3 }),
|
||||
makeToken({
|
||||
surface: 'です',
|
||||
headword: 'です',
|
||||
partOfSpeech: PartOfSpeech.bound_auxiliary,
|
||||
startPos: 1,
|
||||
endPos: 3,
|
||||
frequencyRank: 4,
|
||||
}),
|
||||
makeToken({
|
||||
surface: 'の',
|
||||
@@ -68,6 +68,7 @@ test('annotateTokens excludes frequency for particle/bound_auxiliary and pos1 ex
|
||||
pos1: '助詞',
|
||||
startPos: 3,
|
||||
endPos: 4,
|
||||
frequencyRank: 5,
|
||||
}),
|
||||
makeToken({
|
||||
surface: '猫',
|
||||
@@ -75,45 +76,36 @@ test('annotateTokens excludes frequency for particle/bound_auxiliary and pos1 ex
|
||||
partOfSpeech: PartOfSpeech.noun,
|
||||
startPos: 4,
|
||||
endPos: 5,
|
||||
frequencyRank: 11,
|
||||
}),
|
||||
];
|
||||
|
||||
const result = annotateTokens(
|
||||
tokens,
|
||||
makeDeps({
|
||||
getFrequencyRank: (text) => {
|
||||
lookupCalls.push(text);
|
||||
return text === '猫' ? 11 : 999;
|
||||
},
|
||||
}),
|
||||
);
|
||||
const result = annotateTokens(tokens, makeDeps());
|
||||
|
||||
assert.equal(result[0]?.frequencyRank, undefined);
|
||||
assert.equal(result[1]?.frequencyRank, undefined);
|
||||
assert.equal(result[2]?.frequencyRank, undefined);
|
||||
assert.equal(result[3]?.frequencyRank, 11);
|
||||
assert.deepEqual(lookupCalls, ['猫']);
|
||||
});
|
||||
|
||||
test('annotateTokens preserves existing frequency rank when lookup is unavailable', () => {
|
||||
test('annotateTokens preserves existing frequency rank when frequency is enabled', () => {
|
||||
const tokens = [makeToken({ surface: '猫', headword: '猫', frequencyRank: 42 })];
|
||||
|
||||
const result = annotateTokens(tokens, makeDeps({ getFrequencyRank: undefined }));
|
||||
const result = annotateTokens(tokens, makeDeps());
|
||||
|
||||
assert.equal(result[0]?.frequencyRank, 42);
|
||||
});
|
||||
|
||||
test('annotateTokens prefers existing frequency rank over fallback lookup', () => {
|
||||
test('annotateTokens drops invalid frequency rank values', () => {
|
||||
const tokens = [makeToken({ surface: '猫', headword: '猫', frequencyRank: Number.NaN })];
|
||||
const result = annotateTokens(tokens, makeDeps());
|
||||
assert.equal(result[0]?.frequencyRank, undefined);
|
||||
});
|
||||
|
||||
test('annotateTokens clears frequency rank when frequency is disabled', () => {
|
||||
const tokens = [makeToken({ surface: '猫', headword: '猫', frequencyRank: 42 })];
|
||||
|
||||
const result = annotateTokens(
|
||||
tokens,
|
||||
makeDeps({
|
||||
getFrequencyRank: () => 9,
|
||||
}),
|
||||
);
|
||||
|
||||
assert.equal(result[0]?.frequencyRank, 42);
|
||||
const result = annotateTokens(tokens, makeDeps(), { frequencyEnabled: false });
|
||||
assert.equal(result[0]?.frequencyRank, undefined);
|
||||
});
|
||||
|
||||
test('annotateTokens handles JLPT disabled and eligibility exclusion paths', () => {
|
||||
|
||||
@@ -1,6 +1,5 @@
|
||||
import { markNPlusOneTargets } from '../../../token-merger';
|
||||
import {
|
||||
FrequencyDictionaryLookup,
|
||||
JlptLevel,
|
||||
MergedToken,
|
||||
NPlusOneMatchMode,
|
||||
@@ -12,22 +11,16 @@ const KATAKANA_TO_HIRAGANA_OFFSET = 0x60;
|
||||
const KATAKANA_CODEPOINT_START = 0x30a1;
|
||||
const KATAKANA_CODEPOINT_END = 0x30f6;
|
||||
const JLPT_LEVEL_LOOKUP_CACHE_LIMIT = 2048;
|
||||
const FREQUENCY_RANK_LOOKUP_CACHE_LIMIT = 2048;
|
||||
|
||||
const jlptLevelLookupCaches = new WeakMap<
|
||||
(text: string) => JlptLevel | null,
|
||||
Map<string, JlptLevel | null>
|
||||
>();
|
||||
const frequencyRankLookupCaches = new WeakMap<
|
||||
FrequencyDictionaryLookup,
|
||||
Map<string, number | null>
|
||||
>();
|
||||
|
||||
export interface AnnotationStageDeps {
|
||||
isKnownWord: (text: string) => boolean;
|
||||
knownWordMatchMode: NPlusOneMatchMode;
|
||||
getJlptLevel: (text: string) => JlptLevel | null;
|
||||
getFrequencyRank?: FrequencyDictionaryLookup;
|
||||
}
|
||||
|
||||
export interface AnnotationStageOptions {
|
||||
@@ -60,67 +53,6 @@ function applyKnownWordMarking(
|
||||
});
|
||||
}
|
||||
|
||||
function normalizeFrequencyLookupText(rawText: string): string {
|
||||
return rawText.trim().toLowerCase();
|
||||
}
|
||||
|
||||
function getCachedFrequencyRank(
|
||||
lookupText: string,
|
||||
getFrequencyRank: FrequencyDictionaryLookup,
|
||||
): number | null {
|
||||
const normalizedText = normalizeFrequencyLookupText(lookupText);
|
||||
if (!normalizedText) {
|
||||
return null;
|
||||
}
|
||||
|
||||
let cache = frequencyRankLookupCaches.get(getFrequencyRank);
|
||||
if (!cache) {
|
||||
cache = new Map<string, number | null>();
|
||||
frequencyRankLookupCaches.set(getFrequencyRank, cache);
|
||||
}
|
||||
|
||||
if (cache.has(normalizedText)) {
|
||||
return cache.get(normalizedText) ?? null;
|
||||
}
|
||||
|
||||
let rank: number | null;
|
||||
try {
|
||||
rank = getFrequencyRank(normalizedText);
|
||||
} catch {
|
||||
rank = null;
|
||||
}
|
||||
if (rank !== null) {
|
||||
if (!Number.isFinite(rank) || rank <= 0) {
|
||||
rank = null;
|
||||
}
|
||||
}
|
||||
|
||||
cache.set(normalizedText, rank);
|
||||
while (cache.size > FREQUENCY_RANK_LOOKUP_CACHE_LIMIT) {
|
||||
const firstKey = cache.keys().next().value;
|
||||
if (firstKey !== undefined) {
|
||||
cache.delete(firstKey);
|
||||
}
|
||||
}
|
||||
|
||||
return rank;
|
||||
}
|
||||
|
||||
function resolveFrequencyLookupText(token: MergedToken): string {
|
||||
if (token.headword && token.headword.length > 0) {
|
||||
return token.headword;
|
||||
}
|
||||
if (token.reading && token.reading.length > 0) {
|
||||
return token.reading;
|
||||
}
|
||||
return token.surface;
|
||||
}
|
||||
|
||||
function getFrequencyLookupTextCandidates(token: MergedToken): string[] {
|
||||
const lookupText = resolveFrequencyLookupText(token).trim();
|
||||
return lookupText ? [lookupText] : [];
|
||||
}
|
||||
|
||||
function isFrequencyExcludedByPos(token: MergedToken): boolean {
|
||||
if (
|
||||
token.partOfSpeech === PartOfSpeech.particle ||
|
||||
@@ -134,7 +66,6 @@ function isFrequencyExcludedByPos(token: MergedToken): boolean {
|
||||
|
||||
function applyFrequencyMarking(
|
||||
tokens: MergedToken[],
|
||||
getFrequencyRank: FrequencyDictionaryLookup,
|
||||
): MergedToken[] {
|
||||
return tokens.map((token) => {
|
||||
if (isFrequencyExcludedByPos(token)) {
|
||||
@@ -146,25 +77,9 @@ function applyFrequencyMarking(
|
||||
return { ...token, frequencyRank: rank };
|
||||
}
|
||||
|
||||
const lookupTexts = getFrequencyLookupTextCandidates(token);
|
||||
if (lookupTexts.length === 0) {
|
||||
return { ...token, frequencyRank: undefined };
|
||||
}
|
||||
|
||||
let bestRank: number | null = null;
|
||||
for (const lookupText of lookupTexts) {
|
||||
const rank = getCachedFrequencyRank(lookupText, getFrequencyRank);
|
||||
if (rank === null) {
|
||||
continue;
|
||||
}
|
||||
if (bestRank === null || rank < bestRank) {
|
||||
bestRank = rank;
|
||||
}
|
||||
}
|
||||
|
||||
return {
|
||||
...token,
|
||||
frequencyRank: bestRank ?? undefined,
|
||||
frequencyRank: undefined,
|
||||
};
|
||||
});
|
||||
}
|
||||
@@ -357,16 +272,8 @@ export function annotateTokens(
|
||||
|
||||
const frequencyEnabled = options.frequencyEnabled !== false;
|
||||
const frequencyMarkedTokens =
|
||||
frequencyEnabled && deps.getFrequencyRank
|
||||
? applyFrequencyMarking(knownMarkedTokens, deps.getFrequencyRank)
|
||||
: frequencyEnabled
|
||||
? knownMarkedTokens.map((token) => ({
|
||||
...token,
|
||||
frequencyRank:
|
||||
typeof token.frequencyRank === 'number' && Number.isFinite(token.frequencyRank)
|
||||
? Math.max(1, Math.floor(token.frequencyRank))
|
||||
: undefined,
|
||||
}))
|
||||
frequencyEnabled
|
||||
? applyFrequencyMarking(knownMarkedTokens)
|
||||
: knownMarkedTokens.map((token) => ({
|
||||
...token,
|
||||
frequencyRank: undefined,
|
||||
|
||||
@@ -94,10 +94,20 @@ test('requestYomitanTermFrequencies returns normalized frequency entries', async
|
||||
term: '猫',
|
||||
reading: 'ねこ',
|
||||
dictionary: 'freq-dict',
|
||||
dictionaryPriority: 0,
|
||||
frequency: 77,
|
||||
displayValue: '77',
|
||||
displayValueParsed: true,
|
||||
},
|
||||
{
|
||||
term: '鍛える',
|
||||
reading: 'きたえる',
|
||||
dictionary: 'freq-dict',
|
||||
dictionaryPriority: 1,
|
||||
frequency: 46961,
|
||||
displayValue: '2847,46961',
|
||||
displayValueParsed: true,
|
||||
},
|
||||
{
|
||||
term: 'invalid',
|
||||
dictionary: 'freq-dict',
|
||||
@@ -110,9 +120,12 @@ test('requestYomitanTermFrequencies returns normalized frequency entries', async
|
||||
error: () => undefined,
|
||||
});
|
||||
|
||||
assert.equal(result.length, 1);
|
||||
assert.equal(result.length, 2);
|
||||
assert.equal(result[0]?.term, '猫');
|
||||
assert.equal(result[0]?.frequency, 77);
|
||||
assert.equal(result[0]?.dictionaryPriority, 0);
|
||||
assert.equal(result[1]?.term, '鍛える');
|
||||
assert.equal(result[1]?.frequency, 2847);
|
||||
assert.match(scriptValue, /getTermFrequencies/);
|
||||
assert.match(scriptValue, /optionsGetFull/);
|
||||
});
|
||||
|
||||
@@ -19,6 +19,7 @@ export interface YomitanTermFrequency {
|
||||
term: string;
|
||||
reading: string | null;
|
||||
dictionary: string;
|
||||
dictionaryPriority: number;
|
||||
frequency: number;
|
||||
displayValue: string | null;
|
||||
displayValueParsed: boolean;
|
||||
@@ -40,6 +41,32 @@ function asPositiveInteger(value: unknown): number | null {
|
||||
return Math.max(1, Math.floor(value));
|
||||
}
|
||||
|
||||
function parsePositiveFrequencyString(value: string): number | null {
|
||||
const trimmed = value.trim();
|
||||
if (!trimmed) {
|
||||
return null;
|
||||
}
|
||||
|
||||
const numericPrefix = trimmed.match(/^\d[\d,]*/)?.[0];
|
||||
if (!numericPrefix) {
|
||||
return null;
|
||||
}
|
||||
|
||||
const chunks = numericPrefix.split(',');
|
||||
const normalizedNumber =
|
||||
chunks.length <= 1
|
||||
? chunks[0] ?? ''
|
||||
: chunks.slice(1).every((chunk) => /^\d{3}$/.test(chunk))
|
||||
? chunks.join('')
|
||||
: (chunks[0] ?? '');
|
||||
const parsed = Number.parseInt(normalizedNumber, 10);
|
||||
if (!Number.isFinite(parsed) || parsed <= 0) {
|
||||
return null;
|
||||
}
|
||||
|
||||
return parsed;
|
||||
}
|
||||
|
||||
function toYomitanTermFrequency(value: unknown): YomitanTermFrequency | null {
|
||||
if (!isObject(value)) {
|
||||
return null;
|
||||
@@ -47,10 +74,24 @@ function toYomitanTermFrequency(value: unknown): YomitanTermFrequency | null {
|
||||
|
||||
const term = typeof value.term === 'string' ? value.term.trim() : '';
|
||||
const dictionary = typeof value.dictionary === 'string' ? value.dictionary.trim() : '';
|
||||
const frequency = asPositiveInteger(value.frequency);
|
||||
const rawFrequency = asPositiveInteger(value.frequency);
|
||||
const displayValueRaw =
|
||||
value.displayValue === null
|
||||
? null
|
||||
: typeof value.displayValue === 'string'
|
||||
? value.displayValue
|
||||
: null;
|
||||
const parsedDisplayFrequency =
|
||||
displayValueRaw !== null ? parsePositiveFrequencyString(displayValueRaw) : null;
|
||||
const frequency = parsedDisplayFrequency ?? rawFrequency;
|
||||
if (!term || !dictionary || frequency === null) {
|
||||
return null;
|
||||
}
|
||||
const dictionaryPriorityRaw = (value as { dictionaryPriority?: unknown }).dictionaryPriority;
|
||||
const dictionaryPriority =
|
||||
typeof dictionaryPriorityRaw === 'number' && Number.isFinite(dictionaryPriorityRaw)
|
||||
? Math.max(0, Math.floor(dictionaryPriorityRaw))
|
||||
: Number.MAX_SAFE_INTEGER;
|
||||
|
||||
const reading =
|
||||
value.reading === null
|
||||
@@ -58,18 +99,14 @@ function toYomitanTermFrequency(value: unknown): YomitanTermFrequency | null {
|
||||
: typeof value.reading === 'string'
|
||||
? value.reading
|
||||
: null;
|
||||
const displayValue =
|
||||
value.displayValue === null
|
||||
? null
|
||||
: typeof value.displayValue === 'string'
|
||||
? value.displayValue
|
||||
: null;
|
||||
const displayValue = displayValueRaw;
|
||||
const displayValueParsed = value.displayValueParsed === true;
|
||||
|
||||
return {
|
||||
term,
|
||||
reading,
|
||||
dictionary,
|
||||
dictionaryPriority,
|
||||
frequency,
|
||||
displayValue,
|
||||
displayValueParsed,
|
||||
@@ -278,20 +315,43 @@ export async function requestYomitanTermFrequencies(
|
||||
const optionsFull = await invoke("optionsGetFull", undefined);
|
||||
const profileIndex = optionsFull.profileCurrent;
|
||||
const dictionariesRaw = optionsFull.profiles?.[profileIndex]?.options?.dictionaries ?? [];
|
||||
const dictionaries = Array.isArray(dictionariesRaw)
|
||||
const dictionaryEntries = Array.isArray(dictionariesRaw)
|
||||
? dictionariesRaw
|
||||
.filter((entry) => entry && typeof entry === "object" && entry.enabled === true && typeof entry.name === "string")
|
||||
.map((entry) => entry.name)
|
||||
.map((entry, index) => ({
|
||||
name: entry.name,
|
||||
id: typeof entry.id === "number" && Number.isFinite(entry.id) ? Math.floor(entry.id) : index
|
||||
}))
|
||||
.sort((a, b) => a.id - b.id)
|
||||
: [];
|
||||
const dictionaries = dictionaryEntries.map((entry) => entry.name);
|
||||
const dictionaryPriorityByName = dictionaryEntries.reduce((acc, entry, index) => {
|
||||
acc[entry.name] = index;
|
||||
return acc;
|
||||
}, {});
|
||||
|
||||
if (dictionaries.length === 0) {
|
||||
return [];
|
||||
}
|
||||
|
||||
return await invoke("getTermFrequencies", {
|
||||
const rawFrequencies = await invoke("getTermFrequencies", {
|
||||
termReadingList: ${JSON.stringify(normalizedTermReadingList)},
|
||||
dictionaries
|
||||
});
|
||||
|
||||
if (!Array.isArray(rawFrequencies)) {
|
||||
return [];
|
||||
}
|
||||
|
||||
return rawFrequencies
|
||||
.filter((entry) => entry && typeof entry === "object")
|
||||
.map((entry) => ({
|
||||
...entry,
|
||||
dictionaryPriority:
|
||||
typeof entry.dictionary === "string" && dictionaryPriorityByName[entry.dictionary] !== undefined
|
||||
? dictionaryPriorityByName[entry.dictionary]
|
||||
: Number.MAX_SAFE_INTEGER
|
||||
}));
|
||||
})();
|
||||
`;
|
||||
|
||||
|
||||
@@ -2303,6 +2303,8 @@ const {
|
||||
getJlptEnabled: () => getResolvedConfig().subtitleStyle.enableJlpt,
|
||||
getFrequencyDictionaryEnabled: () =>
|
||||
getResolvedConfig().subtitleStyle.frequencyDictionary.enabled,
|
||||
getFrequencyDictionaryMatchMode: () =>
|
||||
getResolvedConfig().subtitleStyle.frequencyDictionary.matchMode,
|
||||
getFrequencyRank: (text) => appState.frequencyRankLookup(text),
|
||||
getYomitanGroupDebugEnabled: () => appState.overlayDebugVisualizationEnabled,
|
||||
getMecabTokenizer: () => appState.mecabTokenizer,
|
||||
|
||||
@@ -128,6 +128,7 @@ test('composeMpvRuntimeHandlers returns callable handlers and forwards to inject
|
||||
getJlptLevel: () => null,
|
||||
getJlptEnabled: () => true,
|
||||
getFrequencyDictionaryEnabled: () => true,
|
||||
getFrequencyDictionaryMatchMode: () => 'headword',
|
||||
getFrequencyRank: () => null,
|
||||
getYomitanGroupDebugEnabled: () => false,
|
||||
getMecabTokenizer: () => null,
|
||||
|
||||
@@ -35,6 +35,7 @@ test('tokenizer deps builder records known-word lookups and maps readers', () =>
|
||||
getJlptLevel: () => 'N2',
|
||||
getJlptEnabled: () => true,
|
||||
getFrequencyDictionaryEnabled: () => true,
|
||||
getFrequencyDictionaryMatchMode: () => 'surface',
|
||||
getFrequencyRank: () => 5,
|
||||
getYomitanGroupDebugEnabled: () => false,
|
||||
getMecabTokenizer: () => null,
|
||||
@@ -47,6 +48,7 @@ test('tokenizer deps builder records known-word lookups and maps readers', () =>
|
||||
deps.setYomitanParserInitPromise(null);
|
||||
assert.equal(deps.getNPlusOneEnabled?.(), true);
|
||||
assert.equal(deps.getMinSentenceWordsForNPlusOne?.(), 3);
|
||||
assert.equal(deps.getFrequencyDictionaryMatchMode?.(), 'surface');
|
||||
assert.deepEqual(calls, ['lookup:true', 'lookup:false', 'set-window', 'set-ready', 'set-init']);
|
||||
});
|
||||
|
||||
|
||||
@@ -5,6 +5,9 @@ type TokenizerMainDeps = TokenizerDepsRuntimeOptions & {
|
||||
getFrequencyDictionaryEnabled: NonNullable<
|
||||
TokenizerDepsRuntimeOptions['getFrequencyDictionaryEnabled']
|
||||
>;
|
||||
getFrequencyDictionaryMatchMode: NonNullable<
|
||||
TokenizerDepsRuntimeOptions['getFrequencyDictionaryMatchMode']
|
||||
>;
|
||||
getFrequencyRank: NonNullable<TokenizerDepsRuntimeOptions['getFrequencyRank']>;
|
||||
getMinSentenceWordsForNPlusOne: NonNullable<
|
||||
TokenizerDepsRuntimeOptions['getMinSentenceWordsForNPlusOne']
|
||||
@@ -41,6 +44,7 @@ export function createBuildTokenizerDepsMainHandler(deps: TokenizerMainDeps) {
|
||||
getJlptLevel: (text: string) => deps.getJlptLevel(text),
|
||||
getJlptEnabled: () => deps.getJlptEnabled(),
|
||||
getFrequencyDictionaryEnabled: () => deps.getFrequencyDictionaryEnabled(),
|
||||
getFrequencyDictionaryMatchMode: () => deps.getFrequencyDictionaryMatchMode(),
|
||||
getFrequencyRank: (text: string) => deps.getFrequencyRank(text),
|
||||
getYomitanGroupDebugEnabled: () => deps.getYomitanGroupDebugEnabled(),
|
||||
getMecabTokenizer: () => deps.getMecabTokenizer(),
|
||||
|
||||
@@ -79,7 +79,7 @@ test('computeWordClass preserves known and n+1 classes while adding JLPT classes
|
||||
assert.equal(computeWordClass(nPlusOneJlpt), 'word word-n-plus-one word-jlpt-n2');
|
||||
});
|
||||
|
||||
test('computeWordClass does not add frequency class to known or N+1 terms', () => {
|
||||
test('computeWordClass keeps known/N+1 color classes exclusive over frequency classes', () => {
|
||||
const known = createToken({
|
||||
isKnown: true,
|
||||
frequencyRank: 10,
|
||||
@@ -231,7 +231,7 @@ test('getFrequencyRankLabelForToken returns rank only for frequency-colored toke
|
||||
const outOfRangeToken = createToken({ surface: '圏外', frequencyRank: 1000 });
|
||||
|
||||
assert.equal(getFrequencyRankLabelForToken(frequencyToken, settings), '20');
|
||||
assert.equal(getFrequencyRankLabelForToken(knownToken, settings), null);
|
||||
assert.equal(getFrequencyRankLabelForToken(knownToken, settings), '20');
|
||||
assert.equal(getFrequencyRankLabelForToken(outOfRangeToken, settings), null);
|
||||
});
|
||||
|
||||
|
||||
@@ -184,7 +184,7 @@ export function getFrequencyRankLabelForToken(
|
||||
token: MergedToken,
|
||||
frequencySettings?: Partial<FrequencyRenderSettings>,
|
||||
): string | null {
|
||||
if (token.isKnown || token.isNPlusOneTarget) {
|
||||
if (token.isNPlusOneTarget) {
|
||||
return null;
|
||||
}
|
||||
|
||||
|
||||
@@ -177,6 +177,7 @@ export type RuntimeOptionValueType = 'boolean' | 'enum';
|
||||
export type RuntimeOptionValue = boolean | string;
|
||||
|
||||
export type NPlusOneMatchMode = 'headword' | 'surface';
|
||||
export type FrequencyDictionaryMatchMode = 'headword' | 'surface';
|
||||
|
||||
export interface RuntimeOptionState {
|
||||
id: RuntimeOptionId;
|
||||
@@ -312,6 +313,7 @@ export interface SubtitleStyleConfig {
|
||||
sourcePath?: string;
|
||||
topX?: number;
|
||||
mode?: FrequencyDictionaryMode;
|
||||
matchMode?: FrequencyDictionaryMatchMode;
|
||||
singleColor?: string;
|
||||
bandedColors?: [string, string, string, string, string];
|
||||
};
|
||||
@@ -536,6 +538,7 @@ export interface ResolvedConfig {
|
||||
sourcePath: string;
|
||||
topX: number;
|
||||
mode: FrequencyDictionaryMode;
|
||||
matchMode: FrequencyDictionaryMatchMode;
|
||||
singleColor: string;
|
||||
bandedColors: [string, string, string, string, string];
|
||||
};
|
||||
|
||||
Reference in New Issue
Block a user