mirror of
https://github.com/ksyasuda/SubMiner.git
synced 2026-03-06 19:57:26 -08:00
Overlay 2.0 (#12)
This commit is contained in:
@@ -2,6 +2,7 @@ import type { BrowserWindow, Extension } from 'electron';
|
||||
import { mergeTokens } from '../../token-merger';
|
||||
import { createLogger } from '../../logger';
|
||||
import {
|
||||
FrequencyDictionaryMatchMode,
|
||||
MergedToken,
|
||||
NPlusOneMatchMode,
|
||||
SubtitleData,
|
||||
@@ -9,13 +10,27 @@ import {
|
||||
FrequencyDictionaryLookup,
|
||||
JlptLevel,
|
||||
} from '../../types';
|
||||
import { annotateTokens } from './tokenizer/annotation-stage';
|
||||
import { enrichTokensWithMecabPos1 } from './tokenizer/parser-enrichment-stage';
|
||||
import {
|
||||
DEFAULT_ANNOTATION_POS1_EXCLUSION_CONFIG,
|
||||
resolveAnnotationPos1ExclusionSet,
|
||||
} from '../../token-pos1-exclusions';
|
||||
import {
|
||||
DEFAULT_ANNOTATION_POS2_EXCLUSION_CONFIG,
|
||||
resolveAnnotationPos2ExclusionSet,
|
||||
} from '../../token-pos2-exclusions';
|
||||
import { selectYomitanParseTokens } from './tokenizer/parser-selection-stage';
|
||||
import { requestYomitanParseResults } from './tokenizer/yomitan-parser-runtime';
|
||||
import {
|
||||
requestYomitanParseResults,
|
||||
requestYomitanTermFrequencies,
|
||||
} from './tokenizer/yomitan-parser-runtime';
|
||||
|
||||
const logger = createLogger('main:tokenizer');
|
||||
|
||||
type MecabTokenEnrichmentFn = (
|
||||
tokens: MergedToken[],
|
||||
mecabTokens: MergedToken[] | null,
|
||||
) => Promise<MergedToken[]>;
|
||||
|
||||
export interface TokenizerServiceDeps {
|
||||
getYomitanExt: () => Extension | null;
|
||||
getYomitanParserWindow: () => BrowserWindow | null;
|
||||
@@ -27,12 +42,15 @@ export interface TokenizerServiceDeps {
|
||||
isKnownWord: (text: string) => boolean;
|
||||
getKnownWordMatchMode: () => NPlusOneMatchMode;
|
||||
getJlptLevel: (text: string) => JlptLevel | null;
|
||||
getNPlusOneEnabled?: () => boolean;
|
||||
getJlptEnabled?: () => boolean;
|
||||
getFrequencyDictionaryEnabled?: () => boolean;
|
||||
getFrequencyDictionaryMatchMode?: () => FrequencyDictionaryMatchMode;
|
||||
getFrequencyRank?: FrequencyDictionaryLookup;
|
||||
getMinSentenceWordsForNPlusOne?: () => number;
|
||||
getYomitanGroupDebugEnabled?: () => boolean;
|
||||
tokenizeWithMecab: (text: string) => Promise<MergedToken[] | null>;
|
||||
enrichTokensWithMecab?: MecabTokenEnrichmentFn;
|
||||
}
|
||||
|
||||
interface MecabTokenizerLike {
|
||||
@@ -52,14 +70,100 @@ export interface TokenizerDepsRuntimeOptions {
|
||||
isKnownWord: (text: string) => boolean;
|
||||
getKnownWordMatchMode: () => NPlusOneMatchMode;
|
||||
getJlptLevel: (text: string) => JlptLevel | null;
|
||||
getNPlusOneEnabled?: () => boolean;
|
||||
getJlptEnabled?: () => boolean;
|
||||
getFrequencyDictionaryEnabled?: () => boolean;
|
||||
getFrequencyDictionaryMatchMode?: () => FrequencyDictionaryMatchMode;
|
||||
getFrequencyRank?: FrequencyDictionaryLookup;
|
||||
getMinSentenceWordsForNPlusOne?: () => number;
|
||||
getYomitanGroupDebugEnabled?: () => boolean;
|
||||
getMecabTokenizer: () => MecabTokenizerLike | null;
|
||||
}
|
||||
|
||||
interface TokenizerAnnotationOptions {
|
||||
nPlusOneEnabled: boolean;
|
||||
jlptEnabled: boolean;
|
||||
frequencyEnabled: boolean;
|
||||
frequencyMatchMode: FrequencyDictionaryMatchMode;
|
||||
minSentenceWordsForNPlusOne: number | undefined;
|
||||
pos1Exclusions: ReadonlySet<string>;
|
||||
pos2Exclusions: ReadonlySet<string>;
|
||||
}
|
||||
|
||||
let parserEnrichmentWorkerRuntimeModulePromise:
|
||||
| Promise<typeof import('./tokenizer/parser-enrichment-worker-runtime')>
|
||||
| null = null;
|
||||
let annotationStageModulePromise: Promise<typeof import('./tokenizer/annotation-stage')> | null = null;
|
||||
let parserEnrichmentFallbackModulePromise:
|
||||
| Promise<typeof import('./tokenizer/parser-enrichment-stage')>
|
||||
| null = null;
|
||||
const DEFAULT_ANNOTATION_POS1_EXCLUSIONS = resolveAnnotationPos1ExclusionSet(
|
||||
DEFAULT_ANNOTATION_POS1_EXCLUSION_CONFIG,
|
||||
);
|
||||
const DEFAULT_ANNOTATION_POS2_EXCLUSIONS = resolveAnnotationPos2ExclusionSet(
|
||||
DEFAULT_ANNOTATION_POS2_EXCLUSION_CONFIG,
|
||||
);
|
||||
|
||||
function getKnownWordLookup(deps: TokenizerServiceDeps, options: TokenizerAnnotationOptions): (text: string) => boolean {
|
||||
if (!options.nPlusOneEnabled) {
|
||||
return () => false;
|
||||
}
|
||||
return deps.isKnownWord;
|
||||
}
|
||||
|
||||
function needsMecabPosEnrichment(options: TokenizerAnnotationOptions): boolean {
|
||||
return options.nPlusOneEnabled || options.jlptEnabled || options.frequencyEnabled;
|
||||
}
|
||||
|
||||
function hasAnyAnnotationEnabled(options: TokenizerAnnotationOptions): boolean {
|
||||
return options.nPlusOneEnabled || options.jlptEnabled || options.frequencyEnabled;
|
||||
}
|
||||
|
||||
async function enrichTokensWithMecabAsync(
|
||||
tokens: MergedToken[],
|
||||
mecabTokens: MergedToken[] | null,
|
||||
): Promise<MergedToken[]> {
|
||||
if (!parserEnrichmentWorkerRuntimeModulePromise) {
|
||||
parserEnrichmentWorkerRuntimeModulePromise = import('./tokenizer/parser-enrichment-worker-runtime');
|
||||
}
|
||||
|
||||
try {
|
||||
const runtime = await parserEnrichmentWorkerRuntimeModulePromise;
|
||||
return await runtime.enrichTokensWithMecabPos1Async(tokens, mecabTokens);
|
||||
} catch {
|
||||
if (!parserEnrichmentFallbackModulePromise) {
|
||||
parserEnrichmentFallbackModulePromise = import('./tokenizer/parser-enrichment-stage');
|
||||
}
|
||||
const fallback = await parserEnrichmentFallbackModulePromise;
|
||||
return fallback.enrichTokensWithMecabPos1(tokens, mecabTokens);
|
||||
}
|
||||
}
|
||||
|
||||
async function applyAnnotationStage(
|
||||
tokens: MergedToken[],
|
||||
deps: TokenizerServiceDeps,
|
||||
options: TokenizerAnnotationOptions,
|
||||
): Promise<MergedToken[]> {
|
||||
if (!hasAnyAnnotationEnabled(options)) {
|
||||
return tokens;
|
||||
}
|
||||
|
||||
if (!annotationStageModulePromise) {
|
||||
annotationStageModulePromise = import('./tokenizer/annotation-stage');
|
||||
}
|
||||
|
||||
const annotationStage = await annotationStageModulePromise;
|
||||
return annotationStage.annotateTokens(
|
||||
tokens,
|
||||
{
|
||||
isKnownWord: getKnownWordLookup(deps, options),
|
||||
knownWordMatchMode: deps.getKnownWordMatchMode(),
|
||||
getJlptLevel: deps.getJlptLevel,
|
||||
},
|
||||
options,
|
||||
);
|
||||
}
|
||||
|
||||
export function createTokenizerDepsRuntime(
|
||||
options: TokenizerDepsRuntimeOptions,
|
||||
): TokenizerServiceDeps {
|
||||
@@ -76,8 +180,11 @@ export function createTokenizerDepsRuntime(
|
||||
isKnownWord: options.isKnownWord,
|
||||
getKnownWordMatchMode: options.getKnownWordMatchMode,
|
||||
getJlptLevel: options.getJlptLevel,
|
||||
getNPlusOneEnabled: options.getNPlusOneEnabled,
|
||||
getJlptEnabled: options.getJlptEnabled,
|
||||
getFrequencyDictionaryEnabled: options.getFrequencyDictionaryEnabled,
|
||||
getFrequencyDictionaryMatchMode:
|
||||
options.getFrequencyDictionaryMatchMode ?? (() => 'headword'),
|
||||
getFrequencyRank: options.getFrequencyRank,
|
||||
getMinSentenceWordsForNPlusOne: options.getMinSentenceWordsForNPlusOne ?? (() => 3),
|
||||
getYomitanGroupDebugEnabled: options.getYomitanGroupDebugEnabled ?? (() => false),
|
||||
@@ -104,8 +211,11 @@ export function createTokenizerDepsRuntime(
|
||||
return null;
|
||||
}
|
||||
|
||||
return mergeTokens(rawTokens, options.isKnownWord, options.getKnownWordMatchMode());
|
||||
const isKnownWordLookup = options.getNPlusOneEnabled?.() === false ? () => false : options.isKnownWord;
|
||||
return mergeTokens(rawTokens, isKnownWordLookup, options.getKnownWordMatchMode());
|
||||
},
|
||||
enrichTokensWithMecab: async (tokens, mecabTokens) =>
|
||||
enrichTokensWithMecabAsync(tokens, mecabTokens),
|
||||
};
|
||||
}
|
||||
|
||||
@@ -128,36 +238,181 @@ function logSelectedYomitanGroups(text: string, tokens: MergedToken[]): void {
|
||||
});
|
||||
}
|
||||
|
||||
function getAnnotationOptions(deps: TokenizerServiceDeps): {
|
||||
jlptEnabled: boolean;
|
||||
frequencyEnabled: boolean;
|
||||
minSentenceWordsForNPlusOne: number | undefined;
|
||||
} {
|
||||
return {
|
||||
jlptEnabled: deps.getJlptEnabled?.() !== false,
|
||||
frequencyEnabled: deps.getFrequencyDictionaryEnabled?.() !== false,
|
||||
minSentenceWordsForNPlusOne: deps.getMinSentenceWordsForNPlusOne?.(),
|
||||
};
|
||||
function normalizePositiveFrequencyRank(value: unknown): number | null {
|
||||
if (typeof value !== 'number' || !Number.isFinite(value) || value <= 0) {
|
||||
return null;
|
||||
}
|
||||
return Math.max(1, Math.floor(value));
|
||||
}
|
||||
|
||||
function applyAnnotationStage(tokens: MergedToken[], deps: TokenizerServiceDeps): MergedToken[] {
|
||||
const options = getAnnotationOptions(deps);
|
||||
function normalizeFrequencyLookupText(rawText: string): string {
|
||||
return rawText.trim().toLowerCase();
|
||||
}
|
||||
|
||||
return annotateTokens(
|
||||
tokens,
|
||||
{
|
||||
isKnownWord: deps.isKnownWord,
|
||||
knownWordMatchMode: deps.getKnownWordMatchMode(),
|
||||
getJlptLevel: deps.getJlptLevel,
|
||||
getFrequencyRank: deps.getFrequencyRank,
|
||||
},
|
||||
options,
|
||||
);
|
||||
function resolveFrequencyLookupText(
|
||||
token: MergedToken,
|
||||
matchMode: FrequencyDictionaryMatchMode,
|
||||
): string {
|
||||
if (matchMode === 'surface') {
|
||||
if (token.surface && token.surface.length > 0) {
|
||||
return token.surface;
|
||||
}
|
||||
if (token.headword && token.headword.length > 0) {
|
||||
return token.headword;
|
||||
}
|
||||
return token.reading;
|
||||
}
|
||||
|
||||
if (token.headword && token.headword.length > 0) {
|
||||
return token.headword;
|
||||
}
|
||||
if (token.reading && token.reading.length > 0) {
|
||||
return token.reading;
|
||||
}
|
||||
return token.surface;
|
||||
}
|
||||
|
||||
function buildYomitanFrequencyTermReadingList(
|
||||
tokens: MergedToken[],
|
||||
matchMode: FrequencyDictionaryMatchMode,
|
||||
): Array<{ term: string; reading: string | null }> {
|
||||
return tokens
|
||||
.map((token) => {
|
||||
const term = resolveFrequencyLookupText(token, matchMode).trim();
|
||||
if (!term) {
|
||||
return null;
|
||||
}
|
||||
const readingRaw =
|
||||
token.reading && token.reading.trim().length > 0 ? token.reading.trim() : null;
|
||||
return { term, reading: readingRaw };
|
||||
})
|
||||
.filter((pair): pair is { term: string; reading: string | null } => pair !== null);
|
||||
}
|
||||
|
||||
function buildYomitanFrequencyRankMap(
|
||||
frequencies: ReadonlyArray<{ term: string; frequency: number; dictionaryPriority?: number }>,
|
||||
): Map<string, number> {
|
||||
const rankByTerm = new Map<string, { rank: number; dictionaryPriority: number }>();
|
||||
for (const frequency of frequencies) {
|
||||
const normalizedTerm = frequency.term.trim();
|
||||
const rank = normalizePositiveFrequencyRank(frequency.frequency);
|
||||
if (!normalizedTerm || rank === null) {
|
||||
continue;
|
||||
}
|
||||
const dictionaryPriority =
|
||||
typeof frequency.dictionaryPriority === 'number' && Number.isFinite(frequency.dictionaryPriority)
|
||||
? Math.max(0, Math.floor(frequency.dictionaryPriority))
|
||||
: Number.MAX_SAFE_INTEGER;
|
||||
const current = rankByTerm.get(normalizedTerm);
|
||||
if (
|
||||
current === undefined ||
|
||||
dictionaryPriority < current.dictionaryPriority ||
|
||||
(dictionaryPriority === current.dictionaryPriority && rank < current.rank)
|
||||
) {
|
||||
rankByTerm.set(normalizedTerm, { rank, dictionaryPriority });
|
||||
}
|
||||
}
|
||||
|
||||
const collapsedRankByTerm = new Map<string, number>();
|
||||
for (const [term, entry] of rankByTerm.entries()) {
|
||||
collapsedRankByTerm.set(term, entry.rank);
|
||||
}
|
||||
|
||||
return collapsedRankByTerm;
|
||||
}
|
||||
|
||||
function getLocalFrequencyRank(
|
||||
lookupText: string,
|
||||
getFrequencyRank: FrequencyDictionaryLookup,
|
||||
cache: Map<string, number | null>,
|
||||
): number | null {
|
||||
const normalizedText = normalizeFrequencyLookupText(lookupText);
|
||||
if (!normalizedText) {
|
||||
return null;
|
||||
}
|
||||
|
||||
if (cache.has(normalizedText)) {
|
||||
return cache.get(normalizedText) ?? null;
|
||||
}
|
||||
|
||||
let rank: number | null;
|
||||
try {
|
||||
rank = getFrequencyRank(normalizedText);
|
||||
} catch {
|
||||
rank = null;
|
||||
}
|
||||
rank = normalizePositiveFrequencyRank(rank);
|
||||
cache.set(normalizedText, rank);
|
||||
return rank;
|
||||
}
|
||||
|
||||
function applyFrequencyRanks(
|
||||
tokens: MergedToken[],
|
||||
matchMode: FrequencyDictionaryMatchMode,
|
||||
yomitanRankByTerm: Map<string, number>,
|
||||
getFrequencyRank: FrequencyDictionaryLookup | undefined,
|
||||
): MergedToken[] {
|
||||
if (tokens.length === 0) {
|
||||
return tokens;
|
||||
}
|
||||
|
||||
const localLookupCache = new Map<string, number | null>();
|
||||
return tokens.map((token) => {
|
||||
const existingRank = normalizePositiveFrequencyRank(token.frequencyRank);
|
||||
if (existingRank !== null) {
|
||||
return {
|
||||
...token,
|
||||
frequencyRank: existingRank,
|
||||
};
|
||||
}
|
||||
|
||||
const lookupText = resolveFrequencyLookupText(token, matchMode).trim();
|
||||
if (!lookupText) {
|
||||
return {
|
||||
...token,
|
||||
frequencyRank: undefined,
|
||||
};
|
||||
}
|
||||
|
||||
const yomitanRank = yomitanRankByTerm.get(lookupText);
|
||||
if (yomitanRank !== undefined) {
|
||||
return {
|
||||
...token,
|
||||
frequencyRank: yomitanRank,
|
||||
};
|
||||
}
|
||||
|
||||
if (!getFrequencyRank) {
|
||||
return {
|
||||
...token,
|
||||
frequencyRank: undefined,
|
||||
};
|
||||
}
|
||||
|
||||
const localRank = getLocalFrequencyRank(lookupText, getFrequencyRank, localLookupCache);
|
||||
return {
|
||||
...token,
|
||||
frequencyRank: localRank ?? undefined,
|
||||
};
|
||||
});
|
||||
}
|
||||
|
||||
function getAnnotationOptions(deps: TokenizerServiceDeps): TokenizerAnnotationOptions {
|
||||
return {
|
||||
nPlusOneEnabled: deps.getNPlusOneEnabled?.() !== false,
|
||||
jlptEnabled: deps.getJlptEnabled?.() !== false,
|
||||
frequencyEnabled: deps.getFrequencyDictionaryEnabled?.() !== false,
|
||||
frequencyMatchMode: deps.getFrequencyDictionaryMatchMode?.() ?? 'headword',
|
||||
minSentenceWordsForNPlusOne: deps.getMinSentenceWordsForNPlusOne?.(),
|
||||
pos1Exclusions: DEFAULT_ANNOTATION_POS1_EXCLUSIONS,
|
||||
pos2Exclusions: DEFAULT_ANNOTATION_POS2_EXCLUSIONS,
|
||||
};
|
||||
}
|
||||
|
||||
async function parseWithYomitanInternalParser(
|
||||
text: string,
|
||||
deps: TokenizerServiceDeps,
|
||||
options: TokenizerAnnotationOptions,
|
||||
): Promise<MergedToken[] | null> {
|
||||
const parseResults = await requestYomitanParseResults(text, deps, logger);
|
||||
if (!parseResults) {
|
||||
@@ -166,7 +421,7 @@ async function parseWithYomitanInternalParser(
|
||||
|
||||
const selectedTokens = selectYomitanParseTokens(
|
||||
parseResults,
|
||||
deps.isKnownWord,
|
||||
getKnownWordLookup(deps, options),
|
||||
deps.getKnownWordMatchMode(),
|
||||
);
|
||||
if (!selectedTokens || selectedTokens.length === 0) {
|
||||
@@ -177,19 +432,52 @@ async function parseWithYomitanInternalParser(
|
||||
logSelectedYomitanGroups(text, selectedTokens);
|
||||
}
|
||||
|
||||
try {
|
||||
const mecabTokens = await deps.tokenizeWithMecab(text);
|
||||
return enrichTokensWithMecabPos1(selectedTokens, mecabTokens);
|
||||
} catch (err) {
|
||||
const error = err as Error;
|
||||
logger.warn(
|
||||
'Failed to enrich Yomitan tokens with MeCab POS:',
|
||||
error.message,
|
||||
`tokenCount=${selectedTokens.length}`,
|
||||
`textLength=${text.length}`,
|
||||
const frequencyRankPromise: Promise<Map<string, number>> = options.frequencyEnabled
|
||||
? (async () => {
|
||||
const frequencyMatchMode = options.frequencyMatchMode;
|
||||
const termReadingList = buildYomitanFrequencyTermReadingList(
|
||||
selectedTokens,
|
||||
frequencyMatchMode,
|
||||
);
|
||||
const yomitanFrequencies = await requestYomitanTermFrequencies(termReadingList, deps, logger);
|
||||
return buildYomitanFrequencyRankMap(yomitanFrequencies);
|
||||
})()
|
||||
: Promise.resolve(new Map<string, number>());
|
||||
|
||||
const mecabEnrichmentPromise: Promise<MergedToken[]> = needsMecabPosEnrichment(options)
|
||||
? (async () => {
|
||||
try {
|
||||
const mecabTokens = await deps.tokenizeWithMecab(text);
|
||||
const enrichTokensWithMecab = deps.enrichTokensWithMecab ?? enrichTokensWithMecabAsync;
|
||||
return await enrichTokensWithMecab(selectedTokens, mecabTokens);
|
||||
} catch (err) {
|
||||
const error = err as Error;
|
||||
logger.warn(
|
||||
'Failed to enrich Yomitan tokens with MeCab POS:',
|
||||
error.message,
|
||||
`tokenCount=${selectedTokens.length}`,
|
||||
`textLength=${text.length}`,
|
||||
);
|
||||
return selectedTokens;
|
||||
}
|
||||
})()
|
||||
: Promise.resolve(selectedTokens);
|
||||
|
||||
const [yomitanRankByTerm, enrichedTokens] = await Promise.all([
|
||||
frequencyRankPromise,
|
||||
mecabEnrichmentPromise,
|
||||
]);
|
||||
|
||||
if (options.frequencyEnabled) {
|
||||
return applyFrequencyRanks(
|
||||
enrichedTokens,
|
||||
options.frequencyMatchMode,
|
||||
yomitanRankByTerm,
|
||||
deps.getFrequencyRank,
|
||||
);
|
||||
return selectedTokens;
|
||||
}
|
||||
|
||||
return enrichedTokens;
|
||||
}
|
||||
|
||||
export async function tokenizeSubtitle(
|
||||
@@ -207,12 +495,13 @@ export async function tokenizeSubtitle(
|
||||
}
|
||||
|
||||
const tokenizeText = displayText.replace(/\n/g, ' ').replace(/\s+/g, ' ').trim();
|
||||
const annotationOptions = getAnnotationOptions(deps);
|
||||
|
||||
const yomitanTokens = await parseWithYomitanInternalParser(tokenizeText, deps);
|
||||
const yomitanTokens = await parseWithYomitanInternalParser(tokenizeText, deps, annotationOptions);
|
||||
if (yomitanTokens && yomitanTokens.length > 0) {
|
||||
return {
|
||||
text: displayText,
|
||||
tokens: applyAnnotationStage(yomitanTokens, deps),
|
||||
tokens: await applyAnnotationStage(yomitanTokens, deps, annotationOptions),
|
||||
};
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user