mirror of
https://github.com/ksyasuda/SubMiner.git
synced 2026-02-28 06:22:45 -08:00
Add vendor dict fallback logic
This commit is contained in:
189
src/core/services/frequency-dictionary-service.ts
Normal file
189
src/core/services/frequency-dictionary-service.ts
Normal file
@@ -0,0 +1,189 @@
|
||||
import * as fs from "node:fs";
|
||||
import * as path from "node:path";
|
||||
|
||||
/**
 * Inputs accepted by `createFrequencyDictionaryLookupService`.
 */
export interface FrequencyDictionaryLookupOptions {
  /** Candidate dictionary directories; they are tried in the order given. */
  searchPaths: string[];

  /** Receives progress and diagnostic messages produced while loading. */
  log: (message: string) => void;
}
|
||||
|
||||
/**
 * One validated frequency entry extracted from a dictionary bank file.
 */
interface FrequencyDictionaryEntry {
  /** Term after normalization (trimmed and lower-cased). */
  term: string;

  /** Positive frequency rank; when a term repeats, the lowest rank is kept. */
  rank: number;
}
|
||||
|
||||
/** Matches the bank file names (`term_meta_bank_*.json`) scanned for frequency entries. */
const FREQUENCY_BANK_FILE_GLOB = /^term_meta_bank_.*\.json$/;

/** Lookup handed out when no dictionary could be loaded: every term is unranked. */
const NOOP_LOOKUP: () => null = () => null;
|
||||
|
||||
/**
 * Canonicalizes a term for use as a dictionary key.
 *
 * @param value Raw term text.
 * @returns The text with surrounding whitespace removed and letters lower-cased.
 */
function normalizeFrequencyTerm(value: string): string {
  const trimmed = value.trim();
  return trimmed.toLowerCase();
}
|
||||
|
||||
/**
 * Reads a positive integer rank from `meta.frequency.displayValue`.
 *
 * A numeric `displayValue` is rounded down; a string `displayValue` is trimmed,
 * stripped of thousands separators (`,`) and parsed as a base-10 integer.
 *
 * @param meta Untrusted metadata value taken from a dictionary entry.
 * @returns The rank (always an integer >= 1), or `null` when `meta` does not
 *   carry a usable `frequency.displayValue`.
 */
function extractFrequencyDisplayValue(meta: unknown): number | null {
  if (!meta || typeof meta !== "object") return null;

  const frequency = (meta as { frequency?: unknown }).frequency;
  if (!frequency || typeof frequency !== "object") return null;

  const displayValue = (frequency as { displayValue?: unknown }).displayValue;

  let rank: number;
  if (typeof displayValue === "number") {
    rank = Math.floor(displayValue);
  } else if (typeof displayValue === "string") {
    rank = Number.parseInt(displayValue.trim().replace(/,/g, ""), 10);
  } else {
    return null;
  }

  // Validate AFTER rounding: a fractional value such as 0.5 is positive but
  // floors to 0, which is not a valid rank and must be rejected.
  return Number.isFinite(rank) && rank >= 1 ? rank : null;
}
|
||||
|
||||
/**
 * Validates one raw bank-file row and converts it to a dictionary entry.
 *
 * A usable row is an array of at least three elements whose first element is
 * a string term and whose third element is metadata from which
 * `extractFrequencyDisplayValue` can read a rank. The second element is not
 * inspected.
 *
 * @param entry Untrusted row value, typically one element of a parsed JSON array.
 * @returns The normalized term with its rank, or `null` when the row is unusable.
 */
function asFrequencyDictionaryEntry(
  entry: unknown,
): FrequencyDictionaryEntry | null {
  if (!(Array.isArray(entry) && entry.length >= 3)) {
    return null;
  }

  const rawTerm: unknown = entry[0];
  const rawMeta: unknown = entry[2];
  if (typeof rawTerm !== "string") {
    return null;
  }

  const rank = extractFrequencyDisplayValue(rawMeta);
  if (rank === null) {
    return null;
  }

  // A term that is empty after normalization cannot serve as a lookup key.
  const term = normalizeFrequencyTerm(rawTerm);
  return term === "" ? null : { term, rank };
}
|
||||
|
||||
/**
 * Merges the usable rows of one parsed bank file into `terms`.
 *
 * Rows that `asFrequencyDictionaryEntry` rejects are skipped. When a term is
 * already present, the smaller rank wins; a row that does not improve on the
 * stored rank is reported through `log` and otherwise ignored.
 *
 * @param rawEntries Parsed bank-file content; anything other than an array is ignored.
 * @param terms Accumulator mapping normalized term to best rank; mutated in place.
 * @param log Sink for duplicate-term diagnostics.
 */
function addEntriesToMap(
  rawEntries: unknown,
  terms: Map<string, number>,
  log: (message: string) => void,
): void {
  if (!Array.isArray(rawEntries)) return;

  for (const candidate of rawEntries) {
    const parsed = asFrequencyDictionaryEntry(candidate);
    if (parsed === null) continue;

    const { term, rank } = parsed;
    const existingRank = terms.get(term);
    const improvesOnExisting = existingRank === undefined || rank < existingRank;

    if (improvesOnExisting) {
      terms.set(term, rank);
    } else {
      log(
        `Frequency dictionary duplicate term ${term} with weaker rank ${rank}; keeping ${existingRank}.`,
      );
    }
  }
}
|
||||
|
||||
/**
 * Loads every `term_meta_bank_*.json` file found directly inside a directory
 * and merges their entries into a single term-to-rank map.
 *
 * Files are processed in sorted name order. A file that cannot be read or is
 * not valid JSON is logged and skipped; a directory that cannot be listed
 * yields an empty map without logging.
 *
 * @param dictionaryPath Directory to scan.
 * @param log Sink for per-file diagnostics.
 * @returns Map from normalized term to rank; empty when nothing could be loaded.
 */
function collectDictionaryFromPath(
  dictionaryPath: string,
  log: (message: string) => void,
): Map<string, number> {
  const collected = new Map<string, number>();

  let directoryListing: string[];
  try {
    directoryListing = fs.readdirSync(dictionaryPath);
  } catch {
    return collected;
  }

  const bankFileNames = directoryListing.filter((fileName) =>
    FREQUENCY_BANK_FILE_GLOB.test(fileName),
  );
  bankFileNames.sort();

  // With no matching files the loop body never runs and the empty map is returned.
  for (const fileName of bankFileNames) {
    const filePath = path.join(dictionaryPath, fileName);

    let contents: string;
    try {
      contents = fs.readFileSync(filePath, "utf-8");
    } catch {
      log(`Failed to read frequency dictionary file ${filePath}`);
      continue;
    }

    let parsedJson: unknown;
    try {
      parsedJson = JSON.parse(contents) as unknown;
    } catch {
      log(`Failed to parse frequency dictionary file as JSON: ${filePath}`);
      continue;
    }

    // The map growing is the signal that this file contributed new terms.
    const sizeBefore = collected.size;
    addEntriesToMap(parsedJson, collected, log);
    if (collected.size === sizeBefore) {
      log(
        `Frequency dictionary file contained no extractable entries: ${filePath}`,
      );
    }
  }

  return collected;
}
|
||||
|
||||
/**
 * Builds a frequency-rank lookup from the first search path that yields at
 * least one dictionary entry.
 *
 * Search paths are tried in order. A path that is missing, cannot be
 * inspected, or is not a directory is skipped. If no path produces entries,
 * the reason is logged and a lookup that always returns `null` is returned.
 *
 * @param options Search paths and the logger used for diagnostics.
 * @returns A function mapping a term (normalized before lookup) to its rank,
 *   or `null` when the term is unknown or blank.
 */
export async function createFrequencyDictionaryLookupService(
  options: FrequencyDictionaryLookupOptions,
): Promise<(term: string) => number | null> {
  const attemptedPaths: string[] = [];
  let foundDictionaryPathCount = 0;

  for (const dictionaryPath of options.searchPaths) {
    attemptedPaths.push(dictionaryPath);

    // A single guarded stat replaces the exists-then-stat pair: the path can
    // disappear between the two calls, and stat can fail for reasons other
    // than absence (e.g. permissions). Either way this candidate is skipped
    // instead of rejecting the whole factory.
    let isDirectory: boolean;
    try {
      isDirectory = fs.statSync(dictionaryPath).isDirectory();
    } catch {
      continue;
    }
    if (!isDirectory) {
      continue;
    }

    foundDictionaryPathCount += 1;
    const terms = collectDictionaryFromPath(dictionaryPath, options.log);
    if (terms.size > 0) {
      options.log(
        `Frequency dictionary loaded from ${dictionaryPath} (${terms.size} entries)`,
      );
      return (term: string): number | null => {
        const normalized = normalizeFrequencyTerm(term);
        if (!normalized) return null;
        return terms.get(normalized) ?? null;
      };
    }

    options.log(
      `Frequency dictionary directory exists but contains no readable term_meta_bank_*.json files: ${dictionaryPath}`,
    );
  }

  options.log(
    `Frequency dictionary not found. Searched ${attemptedPaths.length} candidate path(s): ${attemptedPaths.join(", ")}`,
  );
  if (foundDictionaryPathCount > 0) {
    options.log(
      "Frequency dictionary directories found, but no usable term_meta_bank_*.json files were loaded.",
    );
  }

  return NOOP_LOOKUP;
}
|
||||
|
||||
@@ -32,6 +32,7 @@ export {
|
||||
} from "./startup-service";
|
||||
export { openYomitanSettingsWindow } from "./yomitan-settings-service";
|
||||
export { createTokenizerDepsRuntimeService, tokenizeSubtitleService } from "./tokenizer-service";
|
||||
export { createFrequencyDictionaryLookupService } from "./frequency-dictionary-service";
|
||||
export { createJlptVocabularyLookupService } from "./jlpt-vocab-service";
|
||||
export {
|
||||
getIgnoredPos1Entries,
|
||||
|
||||
@@ -190,6 +190,75 @@ test("tokenizeSubtitleService skips JLPT lookups when disabled", async () => {
|
||||
assert.equal(lookupCalls, 0);
|
||||
});
|
||||
|
||||
// With the frequency dictionary enabled, each token receives the rank that
// the injected lookup reports for it.
test("tokenizeSubtitleService applies frequency dictionary ranks", async () => {
  // Fresh fixture per call, mirroring a tokenizer that builds new tokens each time.
  const buildMecabTokens = () => [
    {
      headword: "猫",
      surface: "猫",
      reading: "ネコ",
      startPos: 0,
      endPos: 1,
      partOfSpeech: PartOfSpeech.noun,
      isMerged: false,
      isKnown: false,
      isNPlusOneTarget: false,
    },
    {
      headword: "です",
      surface: "です",
      reading: "デス",
      startPos: 1,
      endPos: 2,
      partOfSpeech: PartOfSpeech.bound_auxiliary,
      isMerged: false,
      isKnown: false,
      isNPlusOneTarget: false,
    },
  ];

  const deps = makeDeps({
    getFrequencyDictionaryEnabled: () => true,
    tokenizeWithMecab: async () => buildMecabTokens(),
    getFrequencyRank: (text) => (text === "猫" ? 23 : 1200),
  });

  const result = await tokenizeSubtitleService("猫です", deps);

  assert.equal(result.tokens?.length, 2);
  assert.equal(result.tokens?.[0]?.frequencyRank, 23);
  assert.equal(result.tokens?.[1]?.frequencyRank, 1200);
});
|
||||
|
||||
// With the frequency dictionary disabled, the lookup must never be invoked
// and tokens must carry no frequency rank.
test("tokenizeSubtitleService skips frequency lookups when disabled", async () => {
  let frequencyCalls = 0;

  const deps = makeDeps({
    getFrequencyDictionaryEnabled: () => false,
    tokenizeWithMecab: async () => [
      {
        headword: "猫",
        surface: "猫",
        reading: "ネコ",
        startPos: 0,
        endPos: 1,
        partOfSpeech: PartOfSpeech.noun,
        isMerged: false,
        isKnown: false,
        isNPlusOneTarget: false,
      },
    ],
    // Counts invocations so the test can prove the lookup was bypassed.
    getFrequencyRank: () => {
      frequencyCalls += 1;
      return 10;
    },
  });

  const result = await tokenizeSubtitleService("猫", deps);

  assert.equal(result.tokens?.length, 1);
  assert.equal(result.tokens?.[0]?.frequencyRank, undefined);
  assert.equal(frequencyCalls, 0);
});
|
||||
|
||||
test("tokenizeSubtitleService skips JLPT level for excluded demonstratives", async () => {
|
||||
const result = await tokenizeSubtitleService(
|
||||
"この",
|
||||
|
||||
@@ -7,6 +7,7 @@ import {
|
||||
PartOfSpeech,
|
||||
SubtitleData,
|
||||
Token,
|
||||
FrequencyDictionaryLookup,
|
||||
} from "../../types";
|
||||
import {
|
||||
shouldIgnoreJlptForMecabPos1,
|
||||
@@ -35,11 +36,16 @@ const KATAKANA_TO_HIRAGANA_OFFSET = 0x60;
|
||||
/** Code point U+30A1 (ァ). */
const KATAKANA_CODEPOINT_START = 0x30a1;

/** Code point U+30F6 (ヶ). */
const KATAKANA_CODEPOINT_END = 0x30f6;

/** NOTE(review): presumably the entry cap for each JLPT level lookup cache; confirm against its user. */
const JLPT_LEVEL_LOOKUP_CACHE_LIMIT = 2048;

/** Maximum number of entries kept in each frequency-rank lookup cache. */
const FREQUENCY_RANK_LOOKUP_CACHE_LIMIT = 2048;
|
||||
|
||||
/**
 * Per-lookup-function memo tables for JLPT levels, keyed weakly by the lookup
 * function itself.
 * NOTE(review): the consumer of this cache is outside the visible hunk; confirm usage.
 */
const jlptLevelLookupCaches = new WeakMap<
  (text: string) => JlptLevel | null,
  Map<string, JlptLevel | null>
>();

/**
 * Per-lookup-function memo tables for frequency ranks, keyed weakly by the
 * lookup function itself. Each inner map goes from normalized lookup text to
 * the rank (or `null`) that the lookup produced.
 */
const frequencyRankLookupCaches = new WeakMap<
  FrequencyDictionaryLookup,
  Map<string, number | null>
>();
|
||||
|
||||
function isObject(value: unknown): value is Record<string, unknown> {
|
||||
return Boolean(value && typeof value === "object");
|
||||
@@ -61,6 +67,8 @@ export interface TokenizerServiceDeps {
|
||||
getKnownWordMatchMode: () => NPlusOneMatchMode;
|
||||
getJlptLevel: (text: string) => JlptLevel | null;
|
||||
getJlptEnabled?: () => boolean;
|
||||
getFrequencyDictionaryEnabled?: () => boolean;
|
||||
getFrequencyRank?: FrequencyDictionaryLookup;
|
||||
getMinSentenceWordsForNPlusOne?: () => number;
|
||||
tokenizeWithMecab: (text: string) => Promise<MergedToken[] | null>;
|
||||
}
|
||||
@@ -81,6 +89,8 @@ export interface TokenizerDepsRuntimeOptions {
|
||||
getKnownWordMatchMode: () => NPlusOneMatchMode;
|
||||
getJlptLevel: (text: string) => JlptLevel | null;
|
||||
getJlptEnabled?: () => boolean;
|
||||
getFrequencyDictionaryEnabled?: () => boolean;
|
||||
getFrequencyRank?: FrequencyDictionaryLookup;
|
||||
getMinSentenceWordsForNPlusOne?: () => number;
|
||||
getMecabTokenizer: () => MecabTokenizerLike | null;
|
||||
}
|
||||
@@ -122,6 +132,47 @@ function getCachedJlptLevel(
|
||||
return level;
|
||||
}
|
||||
|
||||
/**
 * Canonicalizes lookup text so that it can be used as a cache key.
 *
 * @param rawText Text taken from a token.
 * @returns The text trimmed of surrounding whitespace and lower-cased.
 */
function normalizeFrequencyLookupText(rawText: string): string {
  const withoutPadding = rawText.trim();
  return withoutPadding.toLowerCase();
}
|
||||
|
||||
/**
 * Resolves the frequency rank for a piece of text, memoizing results per
 * lookup function.
 *
 * The text is normalized first; blank text short-circuits to `null` without
 * touching the cache. A lookup that throws is treated as "no rank" and that
 * `null` is cached like any other result. Once the cache grows past
 * `FREQUENCY_RANK_LOOKUP_CACHE_LIMIT`, the earliest-inserted keys are dropped.
 *
 * @param lookupText Text to rank.
 * @param getFrequencyRank Lookup function; also identifies which cache to use.
 * @returns The rank, or `null` when none is available.
 */
function getCachedFrequencyRank(
  lookupText: string,
  getFrequencyRank: FrequencyDictionaryLookup,
): number | null {
  const cacheKey = normalizeFrequencyLookupText(lookupText);
  if (!cacheKey) return null;

  const existingCache = frequencyRankLookupCaches.get(getFrequencyRank);
  const cache = existingCache ?? new Map<string, number | null>();
  if (!existingCache) {
    frequencyRankLookupCaches.set(getFrequencyRank, cache);
  }

  if (cache.has(cacheKey)) {
    return cache.get(cacheKey) ?? null;
  }

  let resolvedRank: number | null;
  try {
    resolvedRank = getFrequencyRank(cacheKey);
  } catch {
    // A failing lookup is deliberately downgraded to "unranked".
    resolvedRank = null;
  }
  cache.set(cacheKey, resolvedRank);

  // Maps iterate in insertion order, so the first key is the oldest entry.
  while (cache.size > FREQUENCY_RANK_LOOKUP_CACHE_LIMIT) {
    const oldestKey = cache.keys().next().value;
    if (oldestKey !== undefined) {
      cache.delete(oldestKey);
    }
  }

  return resolvedRank;
}
|
||||
|
||||
export function createTokenizerDepsRuntimeService(
|
||||
options: TokenizerDepsRuntimeOptions,
|
||||
): TokenizerServiceDeps {
|
||||
@@ -137,6 +188,8 @@ export function createTokenizerDepsRuntimeService(
|
||||
getKnownWordMatchMode: options.getKnownWordMatchMode,
|
||||
getJlptLevel: options.getJlptLevel,
|
||||
getJlptEnabled: options.getJlptEnabled,
|
||||
getFrequencyDictionaryEnabled: options.getFrequencyDictionaryEnabled,
|
||||
getFrequencyRank: options.getFrequencyRank,
|
||||
getMinSentenceWordsForNPlusOne:
|
||||
options.getMinSentenceWordsForNPlusOne ?? (() => 3),
|
||||
tokenizeWithMecab: async (text) => {
|
||||
@@ -184,6 +237,34 @@ function applyKnownWordMarking(
|
||||
});
|
||||
}
|
||||
|
||||
/**
 * Chooses which text of a token is used for the frequency lookup.
 *
 * Preference order: non-empty headword, then non-empty reading, then the
 * surface form (returned as-is, even if empty).
 *
 * @param token Token to inspect.
 * @returns The text to look up.
 */
function resolveFrequencyLookupText(token: MergedToken): string {
  const { headword, reading, surface } = token;

  const hasHeadword = Boolean(headword && headword.length > 0);
  if (hasHeadword) return headword;

  const hasReading = Boolean(reading && reading.length > 0);
  return hasReading ? reading : surface;
}
|
||||
|
||||
/**
 * Returns copies of the tokens annotated with `frequencyRank`.
 *
 * The rank comes from the cached lookup of each token's resolved lookup
 * text. Tokens with no lookup text, or with no rank available, get
 * `frequencyRank: undefined`. The input tokens are not mutated.
 *
 * @param tokens Tokens to annotate.
 * @param getFrequencyRank Lookup used to rank each token's text.
 * @returns A new array of annotated token copies, in the same order.
 */
function applyFrequencyMarking(
  tokens: MergedToken[],
  getFrequencyRank: FrequencyDictionaryLookup,
): MergedToken[] {
  const rankFor = (token: MergedToken): number | null => {
    const lookupText = resolveFrequencyLookupText(token);
    return lookupText
      ? getCachedFrequencyRank(lookupText, getFrequencyRank)
      : null;
  };

  return tokens.map((token) => ({
    ...token,
    // `null` (no rank) is surfaced to consumers as `undefined`.
    frequencyRank: rankFor(token) ?? undefined,
  }));
}
|
||||
|
||||
function resolveJlptLookupText(token: MergedToken): string {
|
||||
if (token.headword && token.headword.length > 0) {
|
||||
return token.headword;
|
||||
@@ -753,6 +834,8 @@ export async function tokenizeSubtitleService(
|
||||
.replace(/\s+/g, " ")
|
||||
.trim();
|
||||
const jlptEnabled = deps.getJlptEnabled?.() !== false;
|
||||
const frequencyEnabled = deps.getFrequencyDictionaryEnabled?.() !== false;
|
||||
const frequencyLookup = deps.getFrequencyRank;
|
||||
|
||||
const yomitanTokens = await parseWithYomitanInternalParser(tokenizeText, deps);
|
||||
if (yomitanTokens && yomitanTokens.length > 0) {
|
||||
@@ -761,9 +844,16 @@ export async function tokenizeSubtitleService(
|
||||
deps.isKnownWord,
|
||||
deps.getKnownWordMatchMode(),
|
||||
);
|
||||
const jlptMarkedTokens = jlptEnabled
|
||||
? applyJlptMarking(knownMarkedTokens, deps.getJlptLevel)
|
||||
: knownMarkedTokens.map((token) => ({ ...token, jlptLevel: undefined }));
|
||||
const frequencyMarkedTokens =
|
||||
frequencyEnabled && frequencyLookup
|
||||
? applyFrequencyMarking(knownMarkedTokens, frequencyLookup)
|
||||
: knownMarkedTokens.map((token) => ({
|
||||
...token,
|
||||
frequencyRank: undefined,
|
||||
}));
|
||||
const jlptMarkedTokens = jlptEnabled
|
||||
? applyJlptMarking(frequencyMarkedTokens, deps.getJlptLevel)
|
||||
: frequencyMarkedTokens.map((token) => ({ ...token, jlptLevel: undefined }));
|
||||
return {
|
||||
text: displayText,
|
||||
tokens: markNPlusOneTargets(
|
||||
@@ -781,9 +871,16 @@ export async function tokenizeSubtitleService(
|
||||
deps.isKnownWord,
|
||||
deps.getKnownWordMatchMode(),
|
||||
);
|
||||
const frequencyMarkedTokens =
|
||||
frequencyEnabled && frequencyLookup
|
||||
? applyFrequencyMarking(knownMarkedTokens, frequencyLookup)
|
||||
: knownMarkedTokens.map((token) => ({
|
||||
...token,
|
||||
frequencyRank: undefined,
|
||||
}));
|
||||
const jlptMarkedTokens = jlptEnabled
|
||||
? applyJlptMarking(knownMarkedTokens, deps.getJlptLevel)
|
||||
: knownMarkedTokens.map((token) => ({ ...token, jlptLevel: undefined }));
|
||||
? applyJlptMarking(frequencyMarkedTokens, deps.getJlptLevel)
|
||||
: frequencyMarkedTokens.map((token) => ({ ...token, jlptLevel: undefined }));
|
||||
return {
|
||||
text: displayText,
|
||||
tokens: markNPlusOneTargets(
|
||||
|
||||
Reference in New Issue
Block a user