mirror of
https://github.com/ksyasuda/SubMiner.git
synced 2026-03-01 18:22:41 -08:00
feat: source frequency ranks from installed Yomitan dictionaries
This commit is contained in:
@@ -79,3 +79,30 @@ test('createFrequencyDictionaryLookup aggregates duplicate-term logs into a sing
|
||||
false,
|
||||
);
|
||||
});
|
||||
|
||||
test('createFrequencyDictionaryLookup prefers frequency.value over displayValue', async () => {
|
||||
const logs: string[] = [];
|
||||
const tempDir = fs.mkdtempSync(path.join(os.tmpdir(), 'subminer-frequency-dict-'));
|
||||
const bankPath = path.join(tempDir, 'term_meta_bank_1.json');
|
||||
fs.writeFileSync(
|
||||
bankPath,
|
||||
JSON.stringify([
|
||||
['猫', 1, { frequency: { value: 1234, displayValue: 1200 } }],
|
||||
['犬', 2, { frequency: { displayValue: 88 } }],
|
||||
]),
|
||||
);
|
||||
|
||||
const lookup = await createFrequencyDictionaryLookup({
|
||||
searchPaths: [tempDir],
|
||||
log: (message) => {
|
||||
logs.push(message);
|
||||
},
|
||||
});
|
||||
|
||||
assert.equal(lookup('猫'), 1234);
|
||||
assert.equal(lookup('犬'), 88);
|
||||
assert.equal(
|
||||
logs.some((entry) => entry.includes('Frequency dictionary loaded from')),
|
||||
true,
|
||||
);
|
||||
});
|
||||
|
||||
@@ -18,17 +18,14 @@ function normalizeFrequencyTerm(value: string): string {
|
||||
return value.trim().toLowerCase();
|
||||
}
|
||||
|
||||
function extractFrequencyDisplayValue(meta: unknown): number | null {
|
||||
if (!meta || typeof meta !== 'object') return null;
|
||||
const frequency = (meta as { frequency?: unknown }).frequency;
|
||||
if (!frequency || typeof frequency !== 'object') return null;
|
||||
const displayValue = (frequency as { displayValue?: unknown }).displayValue;
|
||||
if (typeof displayValue === 'number') {
|
||||
if (!Number.isFinite(displayValue) || displayValue <= 0) return null;
|
||||
return Math.floor(displayValue);
|
||||
function parsePositiveFrequencyNumber(value: unknown): number | null {
|
||||
if (typeof value === 'number') {
|
||||
if (!Number.isFinite(value) || value <= 0) return null;
|
||||
return Math.floor(value);
|
||||
}
|
||||
if (typeof displayValue === 'string') {
|
||||
const normalized = displayValue.trim().replace(/,/g, '');
|
||||
|
||||
if (typeof value === 'string') {
|
||||
const normalized = value.trim().replace(/,/g, '');
|
||||
const parsed = Number.parseInt(normalized, 10);
|
||||
if (!Number.isFinite(parsed) || parsed <= 0) return null;
|
||||
return parsed;
|
||||
@@ -37,6 +34,20 @@ function extractFrequencyDisplayValue(meta: unknown): number | null {
|
||||
return null;
|
||||
}
|
||||
|
||||
function extractFrequencyDisplayValue(meta: unknown): number | null {
|
||||
if (!meta || typeof meta !== 'object') return null;
|
||||
const frequency = (meta as { frequency?: unknown }).frequency;
|
||||
if (!frequency || typeof frequency !== 'object') return null;
|
||||
const rawValue = (frequency as { value?: unknown }).value;
|
||||
const parsedValue = parsePositiveFrequencyNumber(rawValue);
|
||||
if (parsedValue !== null) {
|
||||
return parsedValue;
|
||||
}
|
||||
|
||||
const displayValue = (frequency as { displayValue?: unknown }).displayValue;
|
||||
return parsePositiveFrequencyNumber(displayValue);
|
||||
}
|
||||
|
||||
function asFrequencyDictionaryEntry(entry: unknown): FrequencyDictionaryEntry | null {
|
||||
if (!Array.isArray(entry) || entry.length < 3) {
|
||||
return null;
|
||||
|
||||
@@ -169,6 +169,55 @@ test('tokenizeSubtitle applies frequency dictionary ranks', async () => {
|
||||
assert.equal(result.tokens?.[1]?.frequencyRank, 1200);
|
||||
});
|
||||
|
||||
test('tokenizeSubtitle loads frequency ranks from Yomitan installed dictionaries', async () => {
|
||||
const result = await tokenizeSubtitle(
|
||||
'猫',
|
||||
makeDeps({
|
||||
getFrequencyDictionaryEnabled: () => true,
|
||||
getYomitanExt: () => ({ id: 'dummy-ext' }) as any,
|
||||
getYomitanParserWindow: () =>
|
||||
({
|
||||
isDestroyed: () => false,
|
||||
webContents: {
|
||||
executeJavaScript: async (script: string) => {
|
||||
if (script.includes('getTermFrequencies')) {
|
||||
return [
|
||||
{
|
||||
term: '猫',
|
||||
reading: 'ねこ',
|
||||
dictionary: 'freq-dict',
|
||||
frequency: 77,
|
||||
displayValue: '77',
|
||||
displayValueParsed: true,
|
||||
},
|
||||
];
|
||||
}
|
||||
|
||||
return [
|
||||
{
|
||||
source: 'scanning-parser',
|
||||
index: 0,
|
||||
content: [
|
||||
[
|
||||
{
|
||||
text: '猫',
|
||||
reading: 'ねこ',
|
||||
headwords: [[{ term: '猫' }]],
|
||||
},
|
||||
],
|
||||
],
|
||||
},
|
||||
];
|
||||
},
|
||||
},
|
||||
}) as unknown as Electron.BrowserWindow,
|
||||
}),
|
||||
);
|
||||
|
||||
assert.equal(result.tokens?.length, 1);
|
||||
assert.equal(result.tokens?.[0]?.frequencyRank, 77);
|
||||
});
|
||||
|
||||
test('tokenizeSubtitle uses only selected Yomitan headword for frequency lookup', async () => {
|
||||
const result = await tokenizeSubtitle(
|
||||
'猫です',
|
||||
|
||||
@@ -10,7 +10,10 @@ import {
|
||||
JlptLevel,
|
||||
} from '../../types';
|
||||
import { selectYomitanParseTokens } from './tokenizer/parser-selection-stage';
|
||||
import { requestYomitanParseResults } from './tokenizer/yomitan-parser-runtime';
|
||||
import {
|
||||
requestYomitanParseResults,
|
||||
requestYomitanTermFrequencies,
|
||||
} from './tokenizer/yomitan-parser-runtime';
|
||||
|
||||
const logger = createLogger('main:tokenizer');
|
||||
|
||||
@@ -214,6 +217,64 @@ function logSelectedYomitanGroups(text: string, tokens: MergedToken[]): void {
|
||||
});
|
||||
}
|
||||
|
||||
function normalizePositiveFrequencyRank(value: unknown): number | null {
|
||||
if (typeof value !== 'number' || !Number.isFinite(value) || value <= 0) {
|
||||
return null;
|
||||
}
|
||||
return Math.max(1, Math.floor(value));
|
||||
}
|
||||
|
||||
function resolveFrequencyLookupText(token: MergedToken): string {
|
||||
if (token.headword && token.headword.length > 0) {
|
||||
return token.headword;
|
||||
}
|
||||
if (token.reading && token.reading.length > 0) {
|
||||
return token.reading;
|
||||
}
|
||||
return token.surface;
|
||||
}
|
||||
|
||||
function applyYomitanFrequencyRanks(
|
||||
tokens: MergedToken[],
|
||||
frequencies: ReadonlyArray<{ term: string; frequency: number }>,
|
||||
): MergedToken[] {
|
||||
if (tokens.length === 0 || frequencies.length === 0) {
|
||||
return tokens;
|
||||
}
|
||||
|
||||
const rankByTerm = new Map<string, number>();
|
||||
for (const frequency of frequencies) {
|
||||
const normalizedTerm = frequency.term.trim();
|
||||
const rank = normalizePositiveFrequencyRank(frequency.frequency);
|
||||
if (!normalizedTerm || rank === null) {
|
||||
continue;
|
||||
}
|
||||
const current = rankByTerm.get(normalizedTerm);
|
||||
if (current === undefined || rank < current) {
|
||||
rankByTerm.set(normalizedTerm, rank);
|
||||
}
|
||||
}
|
||||
|
||||
if (rankByTerm.size === 0) {
|
||||
return tokens;
|
||||
}
|
||||
|
||||
return tokens.map((token) => {
|
||||
const lookupText = resolveFrequencyLookupText(token).trim();
|
||||
if (!lookupText) {
|
||||
return token;
|
||||
}
|
||||
const rank = rankByTerm.get(lookupText);
|
||||
if (rank === undefined) {
|
||||
return token;
|
||||
}
|
||||
return {
|
||||
...token,
|
||||
frequencyRank: rank,
|
||||
};
|
||||
});
|
||||
}
|
||||
|
||||
function getAnnotationOptions(deps: TokenizerServiceDeps): TokenizerAnnotationOptions {
|
||||
return {
|
||||
nPlusOneEnabled: deps.getNPlusOneEnabled?.() !== false,
|
||||
@@ -246,14 +307,24 @@ async function parseWithYomitanInternalParser(
|
||||
logSelectedYomitanGroups(text, selectedTokens);
|
||||
}
|
||||
|
||||
let tokensWithFrequency = selectedTokens;
|
||||
if (options.frequencyEnabled) {
|
||||
const termReadingList = selectedTokens.map((token) => ({
|
||||
term: resolveFrequencyLookupText(token),
|
||||
reading: token.reading && token.reading.trim().length > 0 ? token.reading.trim() : null,
|
||||
}));
|
||||
const yomitanFrequencies = await requestYomitanTermFrequencies(termReadingList, deps, logger);
|
||||
tokensWithFrequency = applyYomitanFrequencyRanks(selectedTokens, yomitanFrequencies);
|
||||
}
|
||||
|
||||
if (!needsMecabPosEnrichment(options)) {
|
||||
return selectedTokens;
|
||||
return tokensWithFrequency;
|
||||
}
|
||||
|
||||
try {
|
||||
const mecabTokens = await deps.tokenizeWithMecab(text);
|
||||
const enrichTokensWithMecab = deps.enrichTokensWithMecab ?? enrichTokensWithMecabAsync;
|
||||
return await enrichTokensWithMecab(selectedTokens, mecabTokens);
|
||||
return await enrichTokensWithMecab(tokensWithFrequency, mecabTokens);
|
||||
} catch (err) {
|
||||
const error = err as Error;
|
||||
logger.warn(
|
||||
@@ -262,7 +333,7 @@ async function parseWithYomitanInternalParser(
|
||||
`tokenCount=${selectedTokens.length}`,
|
||||
`textLength=${text.length}`,
|
||||
);
|
||||
return selectedTokens;
|
||||
return tokensWithFrequency;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -95,6 +95,27 @@ test('annotateTokens excludes frequency for particle/bound_auxiliary and pos1 ex
|
||||
assert.deepEqual(lookupCalls, ['猫']);
|
||||
});
|
||||
|
||||
test('annotateTokens preserves existing frequency rank when lookup is unavailable', () => {
|
||||
const tokens = [makeToken({ surface: '猫', headword: '猫', frequencyRank: 42 })];
|
||||
|
||||
const result = annotateTokens(tokens, makeDeps({ getFrequencyRank: undefined }));
|
||||
|
||||
assert.equal(result[0]?.frequencyRank, 42);
|
||||
});
|
||||
|
||||
test('annotateTokens prefers existing frequency rank over fallback lookup', () => {
|
||||
const tokens = [makeToken({ surface: '猫', headword: '猫', frequencyRank: 42 })];
|
||||
|
||||
const result = annotateTokens(
|
||||
tokens,
|
||||
makeDeps({
|
||||
getFrequencyRank: () => 9,
|
||||
}),
|
||||
);
|
||||
|
||||
assert.equal(result[0]?.frequencyRank, 42);
|
||||
});
|
||||
|
||||
test('annotateTokens handles JLPT disabled and eligibility exclusion paths', () => {
|
||||
let disabledLookupCalls = 0;
|
||||
const disabledResult = annotateTokens(
|
||||
@@ -157,3 +178,38 @@ test('annotateTokens N+1 handoff marks expected target when threshold is satisfi
|
||||
assert.equal(result[1]?.isNPlusOneTarget, true);
|
||||
assert.equal(result[2]?.isNPlusOneTarget, false);
|
||||
});
|
||||
|
||||
test('annotateTokens N+1 minimum sentence words counts only eligible word tokens', () => {
|
||||
const tokens = [
|
||||
makeToken({ surface: '猫', headword: '猫', startPos: 0, endPos: 1 }),
|
||||
makeToken({
|
||||
surface: 'が',
|
||||
headword: 'が',
|
||||
partOfSpeech: PartOfSpeech.particle,
|
||||
pos1: '助詞',
|
||||
startPos: 1,
|
||||
endPos: 2,
|
||||
}),
|
||||
makeToken({
|
||||
surface: 'です',
|
||||
headword: 'です',
|
||||
partOfSpeech: PartOfSpeech.bound_auxiliary,
|
||||
pos1: '助動詞',
|
||||
startPos: 2,
|
||||
endPos: 4,
|
||||
}),
|
||||
];
|
||||
|
||||
const result = annotateTokens(
|
||||
tokens,
|
||||
makeDeps({
|
||||
isKnownWord: (text) => text === 'が' || text === 'です',
|
||||
}),
|
||||
{ minSentenceWordsForNPlusOne: 3 },
|
||||
);
|
||||
|
||||
assert.equal(result[0]?.isKnown, false);
|
||||
assert.equal(result[1]?.isKnown, true);
|
||||
assert.equal(result[2]?.isKnown, true);
|
||||
assert.equal(result[0]?.isNPlusOneTarget, false);
|
||||
});
|
||||
|
||||
@@ -141,6 +141,11 @@ function applyFrequencyMarking(
|
||||
return { ...token, frequencyRank: undefined };
|
||||
}
|
||||
|
||||
if (typeof token.frequencyRank === 'number' && Number.isFinite(token.frequencyRank)) {
|
||||
const rank = Math.max(1, Math.floor(token.frequencyRank));
|
||||
return { ...token, frequencyRank: rank };
|
||||
}
|
||||
|
||||
const lookupTexts = getFrequencyLookupTextCandidates(token);
|
||||
if (lookupTexts.length === 0) {
|
||||
return { ...token, frequencyRank: undefined };
|
||||
@@ -354,6 +359,14 @@ export function annotateTokens(
|
||||
const frequencyMarkedTokens =
|
||||
frequencyEnabled && deps.getFrequencyRank
|
||||
? applyFrequencyMarking(knownMarkedTokens, deps.getFrequencyRank)
|
||||
: frequencyEnabled
|
||||
? knownMarkedTokens.map((token) => ({
|
||||
...token,
|
||||
frequencyRank:
|
||||
typeof token.frequencyRank === 'number' && Number.isFinite(token.frequencyRank)
|
||||
? Math.max(1, Math.floor(token.frequencyRank))
|
||||
: undefined,
|
||||
}))
|
||||
: knownMarkedTokens.map((token) => ({
|
||||
...token,
|
||||
frequencyRank: undefined,
|
||||
|
||||
@@ -1,6 +1,9 @@
|
||||
import assert from 'node:assert/strict';
|
||||
import test from 'node:test';
|
||||
import { syncYomitanDefaultAnkiServer } from './yomitan-parser-runtime';
|
||||
import {
|
||||
requestYomitanTermFrequencies,
|
||||
syncYomitanDefaultAnkiServer,
|
||||
} from './yomitan-parser-runtime';
|
||||
|
||||
function createDeps(executeJavaScript: (script: string) => Promise<unknown>) {
|
||||
const parserWindow = {
|
||||
@@ -81,3 +84,35 @@ test('syncYomitanDefaultAnkiServer no-ops for empty target url', async () => {
|
||||
assert.equal(updated, false);
|
||||
assert.equal(executeCount, 0);
|
||||
});
|
||||
|
||||
test('requestYomitanTermFrequencies returns normalized frequency entries', async () => {
|
||||
let scriptValue = '';
|
||||
const deps = createDeps(async (script) => {
|
||||
scriptValue = script;
|
||||
return [
|
||||
{
|
||||
term: '猫',
|
||||
reading: 'ねこ',
|
||||
dictionary: 'freq-dict',
|
||||
frequency: 77,
|
||||
displayValue: '77',
|
||||
displayValueParsed: true,
|
||||
},
|
||||
{
|
||||
term: 'invalid',
|
||||
dictionary: 'freq-dict',
|
||||
frequency: 0,
|
||||
},
|
||||
];
|
||||
});
|
||||
|
||||
const result = await requestYomitanTermFrequencies([{ term: '猫', reading: 'ねこ' }], deps, {
|
||||
error: () => undefined,
|
||||
});
|
||||
|
||||
assert.equal(result.length, 1);
|
||||
assert.equal(result[0]?.term, '猫');
|
||||
assert.equal(result[0]?.frequency, 77);
|
||||
assert.match(scriptValue, /getTermFrequencies/);
|
||||
assert.match(scriptValue, /optionsGetFull/);
|
||||
});
|
||||
|
||||
@@ -15,6 +15,89 @@ interface YomitanParserRuntimeDeps {
|
||||
setYomitanParserInitPromise: (promise: Promise<boolean> | null) => void;
|
||||
}
|
||||
|
||||
export interface YomitanTermFrequency {
|
||||
term: string;
|
||||
reading: string | null;
|
||||
dictionary: string;
|
||||
frequency: number;
|
||||
displayValue: string | null;
|
||||
displayValueParsed: boolean;
|
||||
}
|
||||
|
||||
export interface YomitanTermReadingPair {
|
||||
term: string;
|
||||
reading: string | null;
|
||||
}
|
||||
|
||||
function isObject(value: unknown): value is Record<string, unknown> {
|
||||
return Boolean(value && typeof value === 'object');
|
||||
}
|
||||
|
||||
function asPositiveInteger(value: unknown): number | null {
|
||||
if (typeof value !== 'number' || !Number.isFinite(value) || value <= 0) {
|
||||
return null;
|
||||
}
|
||||
return Math.max(1, Math.floor(value));
|
||||
}
|
||||
|
||||
function toYomitanTermFrequency(value: unknown): YomitanTermFrequency | null {
|
||||
if (!isObject(value)) {
|
||||
return null;
|
||||
}
|
||||
|
||||
const term = typeof value.term === 'string' ? value.term.trim() : '';
|
||||
const dictionary = typeof value.dictionary === 'string' ? value.dictionary.trim() : '';
|
||||
const frequency = asPositiveInteger(value.frequency);
|
||||
if (!term || !dictionary || frequency === null) {
|
||||
return null;
|
||||
}
|
||||
|
||||
const reading =
|
||||
value.reading === null
|
||||
? null
|
||||
: typeof value.reading === 'string'
|
||||
? value.reading
|
||||
: null;
|
||||
const displayValue =
|
||||
value.displayValue === null
|
||||
? null
|
||||
: typeof value.displayValue === 'string'
|
||||
? value.displayValue
|
||||
: null;
|
||||
const displayValueParsed = value.displayValueParsed === true;
|
||||
|
||||
return {
|
||||
term,
|
||||
reading,
|
||||
dictionary,
|
||||
frequency,
|
||||
displayValue,
|
||||
displayValueParsed,
|
||||
};
|
||||
}
|
||||
|
||||
function normalizeTermReadingList(termReadingList: YomitanTermReadingPair[]): YomitanTermReadingPair[] {
|
||||
const normalized: YomitanTermReadingPair[] = [];
|
||||
const seen = new Set<string>();
|
||||
|
||||
for (const pair of termReadingList) {
|
||||
const term = typeof pair.term === 'string' ? pair.term.trim() : '';
|
||||
if (!term) {
|
||||
continue;
|
||||
}
|
||||
const reading =
|
||||
typeof pair.reading === 'string' && pair.reading.trim().length > 0 ? pair.reading.trim() : null;
|
||||
const key = `${term}\u0000${reading ?? ''}`;
|
||||
if (seen.has(key)) {
|
||||
continue;
|
||||
}
|
||||
seen.add(key);
|
||||
normalized.push({ term, reading });
|
||||
}
|
||||
|
||||
return normalized;
|
||||
}
|
||||
|
||||
async function ensureYomitanParserWindow(
|
||||
deps: YomitanParserRuntimeDeps,
|
||||
logger: LoggerLike,
|
||||
@@ -154,6 +237,79 @@ export async function requestYomitanParseResults(
|
||||
}
|
||||
}
|
||||
|
||||
export async function requestYomitanTermFrequencies(
|
||||
termReadingList: YomitanTermReadingPair[],
|
||||
deps: YomitanParserRuntimeDeps,
|
||||
logger: LoggerLike,
|
||||
): Promise<YomitanTermFrequency[]> {
|
||||
const normalizedTermReadingList = normalizeTermReadingList(termReadingList);
|
||||
const yomitanExt = deps.getYomitanExt();
|
||||
if (normalizedTermReadingList.length === 0 || !yomitanExt) {
|
||||
return [];
|
||||
}
|
||||
|
||||
const isReady = await ensureYomitanParserWindow(deps, logger);
|
||||
const parserWindow = deps.getYomitanParserWindow();
|
||||
if (!isReady || !parserWindow || parserWindow.isDestroyed()) {
|
||||
return [];
|
||||
}
|
||||
|
||||
const script = `
|
||||
(async () => {
|
||||
const invoke = (action, params) =>
|
||||
new Promise((resolve, reject) => {
|
||||
chrome.runtime.sendMessage({ action, params }, (response) => {
|
||||
if (chrome.runtime.lastError) {
|
||||
reject(new Error(chrome.runtime.lastError.message));
|
||||
return;
|
||||
}
|
||||
if (!response || typeof response !== "object") {
|
||||
reject(new Error("Invalid response from Yomitan backend"));
|
||||
return;
|
||||
}
|
||||
if (response.error) {
|
||||
reject(new Error(response.error.message || "Yomitan backend error"));
|
||||
return;
|
||||
}
|
||||
resolve(response.result);
|
||||
});
|
||||
});
|
||||
|
||||
const optionsFull = await invoke("optionsGetFull", undefined);
|
||||
const profileIndex = optionsFull.profileCurrent;
|
||||
const dictionariesRaw = optionsFull.profiles?.[profileIndex]?.options?.dictionaries ?? [];
|
||||
const dictionaries = Array.isArray(dictionariesRaw)
|
||||
? dictionariesRaw
|
||||
.filter((entry) => entry && typeof entry === "object" && entry.enabled === true && typeof entry.name === "string")
|
||||
.map((entry) => entry.name)
|
||||
: [];
|
||||
|
||||
if (dictionaries.length === 0) {
|
||||
return [];
|
||||
}
|
||||
|
||||
return await invoke("getTermFrequencies", {
|
||||
termReadingList: ${JSON.stringify(normalizedTermReadingList)},
|
||||
dictionaries
|
||||
});
|
||||
})();
|
||||
`;
|
||||
|
||||
try {
|
||||
const rawResult = await parserWindow.webContents.executeJavaScript(script, true);
|
||||
if (!Array.isArray(rawResult)) {
|
||||
return [];
|
||||
}
|
||||
|
||||
return rawResult
|
||||
.map((entry) => toYomitanTermFrequency(entry))
|
||||
.filter((entry): entry is YomitanTermFrequency => entry !== null);
|
||||
} catch (err) {
|
||||
logger.error('Yomitan term frequency request failed:', (err as Error).message);
|
||||
return [];
|
||||
}
|
||||
}
|
||||
|
||||
export async function syncYomitanDefaultAnkiServer(
|
||||
serverUrl: string,
|
||||
deps: YomitanParserRuntimeDeps,
|
||||
|
||||
Reference in New Issue
Block a user