import type { BrowserWindow, Extension } from 'electron';
import { mergeTokens } from '../../token-merger';
import { createLogger } from '../../logger';
import {
  FrequencyDictionaryMatchMode,
  MergedToken,
  NPlusOneMatchMode,
  SubtitleData,
  Token,
  FrequencyDictionaryLookup,
  JlptLevel,
} from '../../types';
import { selectYomitanParseTokens } from './tokenizer/parser-selection-stage';
import {
  requestYomitanParseResults,
  requestYomitanTermFrequencies,
} from './tokenizer/yomitan-parser-runtime';

const logger = createLogger('main:tokenizer');

/** Combines Yomitan-selected tokens with MeCab tokens (which may be unavailable, i.e. `null`). */
type MecabTokenEnrichmentFn = (
  tokens: MergedToken[],
  mecabTokens: MergedToken[] | null,
) => Promise<MergedToken[]>;

/**
 * Everything the tokenizer needs from its host. The optional feature getters are
 * treated as "enabled" when absent (see `getAnnotationOptions`).
 */
export interface TokenizerServiceDeps {
  getYomitanExt: () => Extension | null;
  getYomitanParserWindow: () => BrowserWindow | null;
  setYomitanParserWindow: (window: BrowserWindow | null) => void;
  // NOTE(review): the payload types of the ready/init promises are not observable in
  // this file (they are only forwarded to the Yomitan parser runtime) — confirm
  // `void` / `boolean` against './tokenizer/yomitan-parser-runtime'.
  getYomitanParserReadyPromise: () => Promise<void> | null;
  setYomitanParserReadyPromise: (promise: Promise<void> | null) => void;
  getYomitanParserInitPromise: () => Promise<boolean> | null;
  setYomitanParserInitPromise: (promise: Promise<boolean> | null) => void;
  isKnownWord: (text: string) => boolean;
  getKnownWordMatchMode: () => NPlusOneMatchMode;
  getJlptLevel: (text: string) => JlptLevel | null;
  getNPlusOneEnabled?: () => boolean;
  getJlptEnabled?: () => boolean;
  getFrequencyDictionaryEnabled?: () => boolean;
  getFrequencyDictionaryMatchMode?: () => FrequencyDictionaryMatchMode;
  getFrequencyRank?: FrequencyDictionaryLookup;
  getMinSentenceWordsForNPlusOne?: () => number;
  getYomitanGroupDebugEnabled?: () => boolean;
  /** Resolves to `null` when MeCab is unavailable or produced no tokens. */
  tokenizeWithMecab: (text: string) => Promise<MergedToken[] | null>;
  enrichTokensWithMecab?: MecabTokenEnrichmentFn;
}

/** Minimal surface of a MeCab tokenizer that `createTokenizerDepsRuntime` relies on. */
interface MecabTokenizerLike {
  tokenize: (text: string) => Promise<Token[] | null>;
  // The resolved value is ignored by this module, so any payload type is accepted.
  checkAvailability?: () => Promise<unknown>;
  getStatus?: () => { available: boolean };
}

/** Host-provided options from which `createTokenizerDepsRuntime` builds a `TokenizerServiceDeps`. */
export interface TokenizerDepsRuntimeOptions {
  getYomitanExt: () => Extension | null;
  getYomitanParserWindow: () => BrowserWindow | null;
  setYomitanParserWindow: (window: BrowserWindow | null) => void;
  // NOTE(review): promise payload types mirror TokenizerServiceDeps — confirm.
  getYomitanParserReadyPromise: () => Promise<void> | null;
  setYomitanParserReadyPromise: (promise: Promise<void> | null) => void;
  getYomitanParserInitPromise: () => Promise<boolean> | null;
  setYomitanParserInitPromise: (promise: Promise<boolean> | null) => void;
  isKnownWord: (text: string) => boolean;
  getKnownWordMatchMode: () => NPlusOneMatchMode;
  getJlptLevel: (text: string) => JlptLevel | null;
  getNPlusOneEnabled?: () => boolean;
  getJlptEnabled?: () => boolean;
  getFrequencyDictionaryEnabled?: () => boolean;
  getFrequencyDictionaryMatchMode?: () => FrequencyDictionaryMatchMode;
  getFrequencyRank?: FrequencyDictionaryLookup;
  getMinSentenceWordsForNPlusOne?: () => number;
  getYomitanGroupDebugEnabled?: () => boolean;
  getMecabTokenizer: () => MecabTokenizerLike | null;
}

/** Snapshot of the annotation feature flags, taken once per `tokenizeSubtitle` call. */
interface TokenizerAnnotationOptions {
  nPlusOneEnabled: boolean;
  jlptEnabled: boolean;
  frequencyEnabled: boolean;
  frequencyMatchMode: FrequencyDictionaryMatchMode;
  minSentenceWordsForNPlusOne: number | undefined;
}

// Dynamic-import caches: each stage module is imported on first use and the
// resulting promise is reused by every later call.
let parserEnrichmentWorkerRuntimeModulePromise:
  | Promise<typeof import('./tokenizer/parser-enrichment-worker-runtime')>
  | null = null;
let annotationStageModulePromise: Promise<typeof import('./tokenizer/annotation-stage')> | null =
  null;
let parserEnrichmentFallbackModulePromise:
  | Promise<typeof import('./tokenizer/parser-enrichment-stage')>
  | null = null;

/**
 * Returns the known-word predicate to use for this run: the host's `isKnownWord`,
 * or a constant `false` predicate when N+1 is disabled.
 */
function getKnownWordLookup(
  deps: TokenizerServiceDeps,
  options: TokenizerAnnotationOptions,
): (text: string) => boolean {
  if (!options.nPlusOneEnabled) {
    return () => false;
  }
  return deps.isKnownWord;
}

/** MeCab enrichment is only requested when JLPT or frequency annotation is enabled. */
function needsMecabPosEnrichment(options: TokenizerAnnotationOptions): boolean {
  return options.jlptEnabled || options.frequencyEnabled;
}

/** True when at least one of the N+1 / JLPT / frequency annotations is enabled. */
function hasAnyAnnotationEnabled(options: TokenizerAnnotationOptions): boolean {
  return options.nPlusOneEnabled || options.jlptEnabled || options.frequencyEnabled;
}

/**
 * Default MeCab enrichment: tries the worker-runtime module first and, if importing
 * or running it throws for any reason, falls back to the in-process enrichment stage.
 *
 * @param tokens Tokens to enrich.
 * @param mecabTokens MeCab tokens for the same text, or `null` when unavailable.
 * @returns The enriched tokens as produced by whichever stage ran.
 */
async function enrichTokensWithMecabAsync(
  tokens: MergedToken[],
  mecabTokens: MergedToken[] | null,
): Promise<MergedToken[]> {
  if (!parserEnrichmentWorkerRuntimeModulePromise) {
    parserEnrichmentWorkerRuntimeModulePromise = import(
      './tokenizer/parser-enrichment-worker-runtime'
    );
  }

  try {
    const runtime = await parserEnrichmentWorkerRuntimeModulePromise;
    return await runtime.enrichTokensWithMecabPos1Async(tokens, mecabTokens);
  } catch {
    // Deliberate best-effort: any failure on the worker path is absorbed and the
    // fallback stage is used instead.
    if (!parserEnrichmentFallbackModulePromise) {
      parserEnrichmentFallbackModulePromise = import('./tokenizer/parser-enrichment-stage');
    }
    const fallback = await parserEnrichmentFallbackModulePromise;
    return fallback.enrichTokensWithMecabPos1(tokens, mecabTokens);
  }
}

/**
 * Runs the lazily imported annotation stage over `tokens`.
 * Returns `tokens` untouched (same array) when every annotation is disabled.
 */
async function applyAnnotationStage(
  tokens: MergedToken[],
  deps: TokenizerServiceDeps,
  options: TokenizerAnnotationOptions,
): Promise<MergedToken[]> {
  if (!hasAnyAnnotationEnabled(options)) {
    return tokens;
  }

  if (!annotationStageModulePromise) {
    annotationStageModulePromise = import('./tokenizer/annotation-stage');
  }

  const annotationStage = await annotationStageModulePromise;
  return annotationStage.annotateTokens(
    tokens,
    {
      isKnownWord: getKnownWordLookup(deps, options),
      knownWordMatchMode: deps.getKnownWordMatchMode(),
      getJlptLevel: deps.getJlptLevel,
    },
    options,
  );
}

/**
 * Builds a `TokenizerServiceDeps` from host options.
 *
 * Besides forwarding the host callbacks it:
 * - supplies defaults for the match mode (`'headword'`), the minimum sentence word
 *   count (`3`) and the group-debug flag (`false`);
 * - implements `tokenizeWithMecab` on top of `options.getMecabTokenizer()`;
 * - wires `enrichTokensWithMecab` to the module's default enrichment.
 */
export function createTokenizerDepsRuntime(
  options: TokenizerDepsRuntimeOptions,
): TokenizerServiceDeps {
  // Tokenizer instances whose availability has already been probed, so the probe
  // runs at most once per instance (barring concurrent first calls).
  const checkedMecabTokenizers = new WeakSet<object>();

  return {
    getYomitanExt: options.getYomitanExt,
    getYomitanParserWindow: options.getYomitanParserWindow,
    setYomitanParserWindow: options.setYomitanParserWindow,
    getYomitanParserReadyPromise: options.getYomitanParserReadyPromise,
    setYomitanParserReadyPromise: options.setYomitanParserReadyPromise,
    getYomitanParserInitPromise: options.getYomitanParserInitPromise,
    setYomitanParserInitPromise: options.setYomitanParserInitPromise,
    isKnownWord: options.isKnownWord,
    getKnownWordMatchMode: options.getKnownWordMatchMode,
    getJlptLevel: options.getJlptLevel,
    getNPlusOneEnabled: options.getNPlusOneEnabled,
    getJlptEnabled: options.getJlptEnabled,
    getFrequencyDictionaryEnabled: options.getFrequencyDictionaryEnabled,
    getFrequencyDictionaryMatchMode: options.getFrequencyDictionaryMatchMode ?? (() => 'headword'),
    getFrequencyRank: options.getFrequencyRank,
    getMinSentenceWordsForNPlusOne: options.getMinSentenceWordsForNPlusOne ?? (() => 3),
    getYomitanGroupDebugEnabled: options.getYomitanGroupDebugEnabled ?? (() => false),
    tokenizeWithMecab: async (text) => {
      const mecabTokenizer = options.getMecabTokenizer();
      if (!mecabTokenizer) {
        return null;
      }

      // Probe availability only for tokenizers exposing both hooks, and only the
      // first time a given instance is seen.
      if (
        typeof mecabTokenizer.checkAvailability === 'function' &&
        typeof mecabTokenizer.getStatus === 'function' &&
        !checkedMecabTokenizers.has(mecabTokenizer)
      ) {
        const status = mecabTokenizer.getStatus();
        if (!status.available) {
          await mecabTokenizer.checkAvailability();
        }
        checkedMecabTokenizers.add(mecabTokenizer);
      }

      const rawTokens = await mecabTokenizer.tokenize(text);
      if (!rawTokens || rawTokens.length === 0) {
        return null;
      }

      // Only an explicit `false` disables the known-word lookup; a missing getter
      // keeps it enabled.
      const isKnownWordLookup =
        options.getNPlusOneEnabled?.() === false ? () => false : options.isKnownWord;
      return mergeTokens(rawTokens, isKnownWordLookup, options.getKnownWordMatchMode());
    },
    enrichTokensWithMecab: async (tokens, mecabTokens) =>
      enrichTokensWithMecabAsync(tokens, mecabTokens),
  };
}

/** Logs the selected token groups (surface/headword/reading/positions); no-op for an empty list. */
function logSelectedYomitanGroups(text: string, tokens: MergedToken[]): void {
  if (tokens.length === 0) {
    return;
  }

  logger.info('Selected Yomitan token groups', {
    text,
    tokenCount: tokens.length,
    groups: tokens.map((token, index) => ({
      index,
      surface: token.surface,
      headword: token.headword,
      reading: token.reading,
      startPos: token.startPos,
      endPos: token.endPos,
    })),
  });
}

/**
 * Normalizes an arbitrary value to a positive integer rank.
 *
 * @returns `null` for non-numbers, non-finite numbers and values `<= 0`; otherwise the
 *   value floored and clamped to at least 1 (so e.g. `0.5` becomes `1`).
 */
function normalizePositiveFrequencyRank(value: unknown): number | null {
  if (typeof value !== 'number' || !Number.isFinite(value) || value <= 0) {
    return null;
  }
  return Math.max(1, Math.floor(value));
}

/** Key normalization for the local frequency lookup: trimmed and lower-cased. */
function normalizeFrequencyLookupText(rawText: string): string {
  return rawText.trim().toLowerCase();
}

/**
 * Picks the token text used for frequency lookups.
 *
 * - `'surface'` mode: surface, else headword, else reading.
 * - any other mode: headword, else reading, else surface.
 *
 * The last fallback in each chain is returned as-is, even when empty.
 */
function resolveFrequencyLookupText(
  token: MergedToken,
  matchMode: FrequencyDictionaryMatchMode,
): string {
  if (matchMode === 'surface') {
    if (token.surface && token.surface.length > 0) {
      return token.surface;
    }
    if (token.headword && token.headword.length > 0) {
      return token.headword;
    }
    return token.reading;
  }

  if (token.headword && token.headword.length > 0) {
    return token.headword;
  }
  if (token.reading && token.reading.length > 0) {
    return token.reading;
  }
  return token.surface;
}

/**
 * Builds the `{ term, reading }` list sent to the Yomitan frequency request.
 * Tokens whose lookup text is blank are dropped; in `'headword'` mode the reading
 * is always sent as `null`.
 */
function buildYomitanFrequencyTermReadingList(
  tokens: MergedToken[],
  matchMode: FrequencyDictionaryMatchMode,
): Array<{ term: string; reading: string | null }> {
  return tokens
    .map((token) => {
      const term = resolveFrequencyLookupText(token, matchMode).trim();
      if (!term) {
        return null;
      }
      const readingRaw =
        token.reading && token.reading.trim().length > 0 ? token.reading.trim() : null;
      const reading = matchMode === 'headword' ? null : readingRaw;
      return { term, reading };
    })
    .filter((pair): pair is { term: string; reading: string | null } => pair !== null);
}

/**
 * Collapses Yomitan frequency entries into one rank per (trimmed) term.
 *
 * When a term has several entries, the one with the lowest `dictionaryPriority`
 * wins; ties are broken by the lowest rank. Entries without a finite numeric
 * priority sort last. Entries with a blank term or a non-positive/invalid
 * frequency are skipped.
 */
function buildYomitanFrequencyRankMap(
  frequencies: ReadonlyArray<{ term: string; frequency: number; dictionaryPriority?: number }>,
): Map<string, number> {
  const rankByTerm = new Map<string, { rank: number; dictionaryPriority: number }>();

  for (const frequency of frequencies) {
    const normalizedTerm = frequency.term.trim();
    const rank = normalizePositiveFrequencyRank(frequency.frequency);
    if (!normalizedTerm || rank === null) {
      continue;
    }

    const dictionaryPriority =
      typeof frequency.dictionaryPriority === 'number' &&
      Number.isFinite(frequency.dictionaryPriority)
        ? Math.max(0, Math.floor(frequency.dictionaryPriority))
        : Number.MAX_SAFE_INTEGER;

    const current = rankByTerm.get(normalizedTerm);
    if (
      current === undefined ||
      dictionaryPriority < current.dictionaryPriority ||
      (dictionaryPriority === current.dictionaryPriority && rank < current.rank)
    ) {
      rankByTerm.set(normalizedTerm, { rank, dictionaryPriority });
    }
  }

  const collapsedRankByTerm = new Map<string, number>();
  for (const [term, entry] of rankByTerm.entries()) {
    collapsedRankByTerm.set(term, entry.rank);
  }
  return collapsedRankByTerm;
}

/**
 * Looks up a rank through the host-provided dictionary, memoizing per normalized
 * text in `cache` (misses are cached as `null`). A throwing lookup counts as a miss.
 */
function getLocalFrequencyRank(
  lookupText: string,
  getFrequencyRank: FrequencyDictionaryLookup,
  cache: Map<string, number | null>,
): number | null {
  const normalizedText = normalizeFrequencyLookupText(lookupText);
  if (!normalizedText) {
    return null;
  }

  if (cache.has(normalizedText)) {
    return cache.get(normalizedText) ?? null;
  }

  let rank: number | null;
  try {
    rank = getFrequencyRank(normalizedText);
  } catch {
    // Best-effort: a failing dictionary lookup must not break tokenization.
    rank = null;
  }

  rank = normalizePositiveFrequencyRank(rank);
  cache.set(normalizedText, rank);
  return rank;
}

/**
 * Returns copies of `tokens` with `frequencyRank` resolved, in order of precedence:
 * a valid rank already on the token, the Yomitan rank for the token's lookup text,
 * then the local dictionary lookup (if provided). Unresolved tokens get `undefined`.
 */
function applyFrequencyRanks(
  tokens: MergedToken[],
  matchMode: FrequencyDictionaryMatchMode,
  yomitanRankByTerm: Map<string, number>,
  getFrequencyRank: FrequencyDictionaryLookup | undefined,
): MergedToken[] {
  if (tokens.length === 0) {
    return tokens;
  }

  const localLookupCache = new Map<string, number | null>();

  return tokens.map((token) => {
    const existingRank = normalizePositiveFrequencyRank(token.frequencyRank);
    if (existingRank !== null) {
      return { ...token, frequencyRank: existingRank };
    }

    const lookupText = resolveFrequencyLookupText(token, matchMode).trim();
    if (!lookupText) {
      return { ...token, frequencyRank: undefined };
    }

    const yomitanRank = yomitanRankByTerm.get(lookupText);
    if (yomitanRank !== undefined) {
      return { ...token, frequencyRank: yomitanRank };
    }

    if (!getFrequencyRank) {
      return { ...token, frequencyRank: undefined };
    }

    const localRank = getLocalFrequencyRank(lookupText, getFrequencyRank, localLookupCache);
    return { ...token, frequencyRank: localRank ?? undefined };
  });
}

/**
 * Reads the feature flags from `deps`. A flag is enabled unless its getter exists
 * and returns exactly `false`; the match mode defaults to `'headword'`.
 */
function getAnnotationOptions(deps: TokenizerServiceDeps): TokenizerAnnotationOptions {
  return {
    nPlusOneEnabled: deps.getNPlusOneEnabled?.() !== false,
    jlptEnabled: deps.getJlptEnabled?.() !== false,
    frequencyEnabled: deps.getFrequencyDictionaryEnabled?.() !== false,
    frequencyMatchMode: deps.getFrequencyDictionaryMatchMode?.() ?? 'headword',
    minSentenceWordsForNPlusOne: deps.getMinSentenceWordsForNPlusOne?.(),
  };
}

/**
 * Tokenizes `text` through the Yomitan parser and post-processes the result:
 * token selection, optional debug logging, optional Yomitan frequency request,
 * optional MeCab enrichment, and finally frequency-rank assignment.
 *
 * @returns The processed tokens, or `null` when the parser returned nothing or no
 *   tokens were selected. A MeCab enrichment failure is logged and the
 *   un-enriched tokens are used instead.
 */
async function parseWithYomitanInternalParser(
  text: string,
  deps: TokenizerServiceDeps,
  options: TokenizerAnnotationOptions,
): Promise<MergedToken[] | null> {
  const parseResults = await requestYomitanParseResults(text, deps, logger);
  if (!parseResults) {
    return null;
  }

  const selectedTokens = selectYomitanParseTokens(
    parseResults,
    getKnownWordLookup(deps, options),
    deps.getKnownWordMatchMode(),
  );
  if (!selectedTokens || selectedTokens.length === 0) {
    return null;
  }

  if (deps.getYomitanGroupDebugEnabled?.() === true) {
    logSelectedYomitanGroups(text, selectedTokens);
  }

  let yomitanRankByTerm = new Map<string, number>();
  if (options.frequencyEnabled) {
    const frequencyMatchMode = options.frequencyMatchMode;
    const termReadingList = buildYomitanFrequencyTermReadingList(
      selectedTokens,
      frequencyMatchMode,
    );
    const yomitanFrequencies = await requestYomitanTermFrequencies(termReadingList, deps, logger);
    yomitanRankByTerm = buildYomitanFrequencyRankMap(yomitanFrequencies);
  }

  let enrichedTokens = selectedTokens;
  if (needsMecabPosEnrichment(options)) {
    try {
      const mecabTokens = await deps.tokenizeWithMecab(text);
      const enrichTokensWithMecab = deps.enrichTokensWithMecab ?? enrichTokensWithMecabAsync;
      enrichedTokens = await enrichTokensWithMecab(enrichedTokens, mecabTokens);
    } catch (err) {
      // Narrow instead of asserting: a non-Error throw would otherwise log `undefined`.
      const message = err instanceof Error ? err.message : String(err);
      logger.warn(
        'Failed to enrich Yomitan tokens with MeCab POS:',
        message,
        `tokenCount=${selectedTokens.length}`,
        `textLength=${text.length}`,
      );
    }
  }

  if (options.frequencyEnabled) {
    return applyFrequencyRanks(
      enrichedTokens,
      options.frequencyMatchMode,
      yomitanRankByTerm,
      deps.getFrequencyRank,
    );
  }

  return enrichedTokens;
}

/**
 * Tokenizes one subtitle.
 *
 * The display text is `text` with CRLF and the literal two-character sequences
 * `\N` and `\n` converted to newlines, then trimmed. The text handed to the
 * tokenizer is the display text with all whitespace runs collapsed to single spaces.
 *
 * @param text Raw subtitle text.
 * @param deps Host dependencies.
 * @returns `{ text: displayText, tokens }` where `tokens` is `null` when nothing
 *   was tokenized. When the display text is empty, the ORIGINAL `text` is returned
 *   with `tokens: null`.
 */
export async function tokenizeSubtitle(
  text: string,
  deps: TokenizerServiceDeps,
): Promise<SubtitleData> {
  const displayText = text
    .replace(/\r\n/g, '\n')
    .replace(/\\N/g, '\n')
    .replace(/\\n/g, '\n')
    .trim();

  if (!displayText) {
    return { text, tokens: null };
  }

  const tokenizeText = displayText.replace(/\n/g, ' ').replace(/\s+/g, ' ').trim();
  const annotationOptions = getAnnotationOptions(deps);

  const yomitanTokens = await parseWithYomitanInternalParser(tokenizeText, deps, annotationOptions);
  if (yomitanTokens && yomitanTokens.length > 0) {
    return {
      text: displayText,
      tokens: await applyAnnotationStage(yomitanTokens, deps, annotationOptions),
    };
  }

  return { text: displayText, tokens: null };
}