import type { BrowserWindow, Extension } from 'electron';
import { markNPlusOneTargets, mergeTokens } from '../../token-merger';
import {
  JlptLevel,
  MergedToken,
  NPlusOneMatchMode,
  PartOfSpeech,
  SubtitleData,
  Token,
  FrequencyDictionaryLookup,
} from '../../types';
import { shouldIgnoreJlptForMecabPos1, shouldIgnoreJlptByTerm } from './jlpt-token-filter';
import { createLogger } from '../../logger';

/** One headword entry attached to a Yomitan parse segment (validated at runtime). */
interface YomitanParseHeadword {
  term?: unknown;
}

/** One segment of a Yomitan parse line; several segments are combined into one token. */
interface YomitanParseSegment {
  text?: string;
  reading?: string;
  headwords?: unknown;
}

/** One entry of the array returned by the injected `parseText` call (untrusted shape). */
interface YomitanParseResultItem {
  source?: unknown;
  index?: unknown;
  content?: unknown;
}

type YomitanParseLine = YomitanParseSegment[];

// Subtracting this offset from a katakana code point yields the matching hiragana.
const KATAKANA_TO_HIRAGANA_OFFSET = 0x60;
const KATAKANA_CODEPOINT_START = 0x30a1;
const KATAKANA_CODEPOINT_END = 0x30f6;
const JLPT_LEVEL_LOOKUP_CACHE_LIMIT = 2048;
const FREQUENCY_RANK_LOOKUP_CACHE_LIMIT = 2048;
// Used whenever no valid (positive integer) minimum sentence length is configured.
const DEFAULT_MIN_SENTENCE_WORDS_FOR_N_PLUS_ONE = 3;

const logger = createLogger('main:tokenizer');

// Result caches are keyed by the lookup function itself, so each distinct
// lookup implementation gets its own memo table.
const jlptLevelLookupCaches = new WeakMap<
  (text: string) => JlptLevel | null,
  Map<string, JlptLevel | null>
>();
const frequencyRankLookupCaches = new WeakMap<
  FrequencyDictionaryLookup,
  Map<string, number | null>
>();

function isObject(value: unknown): value is Record<string, unknown> {
  return Boolean(value && typeof value === 'object');
}

function isString(value: unknown): value is string {
  return typeof value === 'string';
}

/** Extracts a loggable message from an arbitrary thrown value. */
function describeError(err: unknown): string {
  if (err instanceof Error) {
    return err.message;
  }
  if (isObject(err) && isString(err.message)) {
    return err.message;
  }
  return String(err);
}

/** Dependencies consumed by {@link tokenizeSubtitle}. */
export interface TokenizerServiceDeps {
  getYomitanExt: () => Extension | null;
  getYomitanParserWindow: () => BrowserWindow | null;
  setYomitanParserWindow: (window: BrowserWindow | null) => void;
  getYomitanParserReadyPromise: () => Promise<void> | null;
  setYomitanParserReadyPromise: (promise: Promise<void> | null) => void;
  getYomitanParserInitPromise: () => Promise<boolean> | null;
  setYomitanParserInitPromise: (promise: Promise<boolean> | null) => void;
  isKnownWord: (text: string) => boolean;
  getKnownWordMatchMode: () => NPlusOneMatchMode;
  getJlptLevel: (text: string) => JlptLevel | null;
  getJlptEnabled?: () => boolean;
  getFrequencyDictionaryEnabled?: () => boolean;
  getFrequencyRank?: FrequencyDictionaryLookup;
  getMinSentenceWordsForNPlusOne?: () => number;
  getYomitanGroupDebugEnabled?: () => boolean;
  tokenizeWithMecab: (text: string) => Promise<MergedToken[] | null>;
}

/** Minimal surface of a MeCab tokenizer needed by {@link createTokenizerDepsRuntime}. */
interface MecabTokenizerLike {
  tokenize: (text: string) => Promise<Token[] | null>;
  // The resolved value is never read here, so any promise is accepted.
  checkAvailability?: () => Promise<unknown>;
  getStatus?: () => { available: boolean };
}

/**
 * Options for {@link createTokenizerDepsRuntime}: every member of
 * {@link TokenizerServiceDeps} except `tokenizeWithMecab`, which is built
 * from `getMecabTokenizer`.
 */
export interface TokenizerDepsRuntimeOptions
  extends Omit<TokenizerServiceDeps, 'tokenizeWithMecab'> {
  getMecabTokenizer: () => MecabTokenizerLike | null;
}

/** Drops the oldest-inserted entries until the cache holds at most `limit` entries. */
function evictOldestEntries<V>(cache: Map<string, V>, limit: number): void {
  while (cache.size > limit) {
    // Map iterates in insertion order, so the first key is the oldest entry.
    const oldestKey = cache.keys().next().value;
    if (oldestKey === undefined) {
      break;
    }
    cache.delete(oldestKey);
  }
}

/**
 * Memoized JLPT lookup. The text is trimmed before lookup; blank text and
 * lookups that throw both yield `null` (a thrown lookup is cached as `null`).
 */
function getCachedJlptLevel(
  lookupText: string,
  getJlptLevel: (text: string) => JlptLevel | null,
): JlptLevel | null {
  const normalizedText = lookupText.trim();
  if (!normalizedText) {
    return null;
  }

  let cache = jlptLevelLookupCaches.get(getJlptLevel);
  if (!cache) {
    cache = new Map<string, JlptLevel | null>();
    jlptLevelLookupCaches.set(getJlptLevel, cache);
  }

  if (cache.has(normalizedText)) {
    return cache.get(normalizedText) ?? null;
  }

  let level: JlptLevel | null;
  try {
    level = getJlptLevel(normalizedText);
  } catch {
    level = null;
  }

  cache.set(normalizedText, level);
  evictOldestEntries(cache, JLPT_LEVEL_LOOKUP_CACHE_LIMIT);
  return level;
}

function normalizeFrequencyLookupText(rawText: string): string {
  return rawText.trim().toLowerCase();
}

/**
 * Memoized frequency-rank lookup. The text is trimmed and lower-cased first.
 * Ranks that are not finite positive numbers, and lookups that throw, are
 * stored and returned as `null`.
 */
function getCachedFrequencyRank(
  lookupText: string,
  getFrequencyRank: FrequencyDictionaryLookup,
): number | null {
  const normalizedText = normalizeFrequencyLookupText(lookupText);
  if (!normalizedText) {
    return null;
  }

  let cache = frequencyRankLookupCaches.get(getFrequencyRank);
  if (!cache) {
    cache = new Map<string, number | null>();
    frequencyRankLookupCaches.set(getFrequencyRank, cache);
  }

  if (cache.has(normalizedText)) {
    return cache.get(normalizedText) ?? null;
  }

  let rank: number | null;
  try {
    rank = getFrequencyRank(normalizedText);
  } catch {
    rank = null;
  }
  if (rank !== null && (!Number.isFinite(rank) || rank <= 0)) {
    rank = null;
  }

  cache.set(normalizedText, rank);
  evictOldestEntries(cache, FREQUENCY_RANK_LOOKUP_CACHE_LIMIT);
  return rank;
}

/**
 * Builds {@link TokenizerServiceDeps} from runtime options, supplying defaults
 * for the optional N+1 / debug getters and a `tokenizeWithMecab` implementation
 * backed by `options.getMecabTokenizer`.
 *
 * @param options - Runtime accessors; see {@link TokenizerDepsRuntimeOptions}.
 * @returns Dependencies ready to pass to {@link tokenizeSubtitle}.
 */
export function createTokenizerDepsRuntime(
  options: TokenizerDepsRuntimeOptions,
): TokenizerServiceDeps {
  // Tokenizer instances whose availability probe has already completed.
  const checkedMecabTokenizers = new WeakSet<object>();

  return {
    getYomitanExt: options.getYomitanExt,
    getYomitanParserWindow: options.getYomitanParserWindow,
    setYomitanParserWindow: options.setYomitanParserWindow,
    getYomitanParserReadyPromise: options.getYomitanParserReadyPromise,
    setYomitanParserReadyPromise: options.setYomitanParserReadyPromise,
    getYomitanParserInitPromise: options.getYomitanParserInitPromise,
    setYomitanParserInitPromise: options.setYomitanParserInitPromise,
    isKnownWord: options.isKnownWord,
    getKnownWordMatchMode: options.getKnownWordMatchMode,
    getJlptLevel: options.getJlptLevel,
    getJlptEnabled: options.getJlptEnabled,
    getFrequencyDictionaryEnabled: options.getFrequencyDictionaryEnabled,
    getFrequencyRank: options.getFrequencyRank,
    getMinSentenceWordsForNPlusOne:
      options.getMinSentenceWordsForNPlusOne ??
      (() => DEFAULT_MIN_SENTENCE_WORDS_FOR_N_PLUS_ONE),
    getYomitanGroupDebugEnabled: options.getYomitanGroupDebugEnabled ?? (() => false),
    tokenizeWithMecab: async (text) => {
      const mecabTokenizer = options.getMecabTokenizer();
      if (!mecabTokenizer) {
        return null;
      }

      // Probe availability at most once per tokenizer instance, and only when
      // the instance exposes both probing methods.
      if (
        typeof mecabTokenizer.checkAvailability === 'function' &&
        typeof mecabTokenizer.getStatus === 'function' &&
        !checkedMecabTokenizers.has(mecabTokenizer)
      ) {
        const status = mecabTokenizer.getStatus();
        if (!status.available) {
          await mecabTokenizer.checkAvailability();
        }
        checkedMecabTokenizers.add(mecabTokenizer);
      }

      const rawTokens = await mecabTokenizer.tokenize(text);
      if (!rawTokens || rawTokens.length === 0) {
        return null;
      }
      return mergeTokens(rawTokens, options.isKnownWord, options.getKnownWordMatchMode());
    },
  };
}

/** Picks the text compared against the known-word list for the given match mode. */
function resolveKnownWordText(
  surface: string,
  headword: string,
  matchMode: NPlusOneMatchMode,
): string {
  return matchMode === 'surface' ? surface : headword;
}

/** Returns copies of the tokens with `isKnown` set; already-known tokens stay known. */
function applyKnownWordMarking(
  tokens: MergedToken[],
  isKnownWord: (text: string) => boolean,
  knownWordMatchMode: NPlusOneMatchMode,
): MergedToken[] {
  return tokens.map((token) => {
    const matchText = resolveKnownWordText(token.surface, token.headword, knownWordMatchMode);
    return {
      ...token,
      isKnown: token.isKnown || (matchText ? isKnownWord(matchText) : false),
    };
  });
}

/** Preferred dictionary lookup text: headword, then reading, then surface. */
function resolveDictionaryLookupText(token: MergedToken): string {
  if (token.headword && token.headword.length > 0) {
    return token.headword;
  }
  if (token.reading && token.reading.length > 0) {
    return token.reading;
  }
  return token.surface;
}

function resolveFrequencyLookupText(token: MergedToken): string {
  return resolveDictionaryLookupText(token);
}

function getFrequencyLookupTextCandidates(token: MergedToken): string[] {
  const lookupText = resolveFrequencyLookupText(token).trim();
  return lookupText ? [lookupText] : [];
}

/** Particles and auxiliary verbs (by enum or by MeCab pos1) get no frequency rank. */
function isFrequencyExcludedByPos(token: MergedToken): boolean {
  if (
    token.partOfSpeech === PartOfSpeech.particle ||
    token.partOfSpeech === PartOfSpeech.bound_auxiliary
  ) {
    return true;
  }
  return token.pos1 === '助詞' || token.pos1 === '助動詞';
}

/**
 * Returns copies of the tokens with `frequencyRank` set to the lowest rank
 * found among the token's lookup candidates, or `undefined` when the token is
 * excluded by part of speech or no candidate has a rank.
 */
function applyFrequencyMarking(
  tokens: MergedToken[],
  getFrequencyRank: FrequencyDictionaryLookup,
): MergedToken[] {
  return tokens.map((token) => {
    if (isFrequencyExcludedByPos(token)) {
      return { ...token, frequencyRank: undefined };
    }

    const lookupTexts = getFrequencyLookupTextCandidates(token);
    if (lookupTexts.length === 0) {
      return { ...token, frequencyRank: undefined };
    }

    let bestRank: number | null = null;
    for (const lookupText of lookupTexts) {
      const rank = getCachedFrequencyRank(lookupText, getFrequencyRank);
      if (rank === null) {
        continue;
      }
      if (bestRank === null || rank < bestRank) {
        bestRank = rank;
      }
    }

    return { ...token, frequencyRank: bestRank ?? undefined };
  });
}

function resolveJlptLookupText(token: MergedToken): string {
  return resolveDictionaryLookupText(token);
}

/** Trims the text and folds katakana (U+30A1–U+30F6) to hiragana. */
function normalizeJlptTextForExclusion(text: string): string {
  const raw = text.trim();
  if (!raw) {
    return '';
  }

  let normalized = '';
  for (const char of raw) {
    const code = char.codePointAt(0);
    if (code === undefined) {
      continue;
    }
    if (code >= KATAKANA_CODEPOINT_START && code <= KATAKANA_CODEPOINT_END) {
      normalized += String.fromCodePoint(code - KATAKANA_TO_HIRAGANA_OFFSET);
      continue;
    }
    normalized += char;
  }
  return normalized;
}

/**
 * True for code points in U+3041–3096, U+309B–309F, U+30A0–30FA and
 * U+30FD–30FF. U+30FB (・) and U+30FC (ー) are not in these ranges.
 */
function isKanaChar(char: string): boolean {
  const code = char.codePointAt(0);
  if (code === undefined) {
    return false;
  }
  return (
    (code >= 0x3041 && code <= 0x3096) ||
    (code >= 0x309b && code <= 0x309f) ||
    (code >= 0x30a0 && code <= 0x30fa) ||
    (code >= 0x30fd && code <= 0x30ff)
  );
}

/**
 * Detects repeated-kana speech-like tokens (e.g. 「ああああ」, 「ははは」) so they
 * are not JLPT-labeled when they are mostly expressive particles/sfx.
 *
 * A token matches when every character passes `isKanaChar` and either two
 * adjacent characters are equal or, for three or more characters, one
 * character makes up at least half of the token (rounded up). Tokens
 * containing the prolonged sound mark ー (U+30FC), such as 「うーん」, do not
 * match because `isKanaChar` rejects that character.
 */
function isRepeatedKanaSfx(text: string): boolean {
  const normalized = text.trim();
  if (!normalized) {
    return false;
  }

  const chars = [...normalized];
  if (!chars.every(isKanaChar)) {
    return false;
  }

  const counts = new Map<string, number>();
  let hasAdjacentRepeat = false;
  let previousChar: string | undefined;
  for (const char of chars) {
    counts.set(char, (counts.get(char) ?? 0) + 1);
    if (previousChar !== undefined && char === previousChar) {
      hasAdjacentRepeat = true;
    }
    previousChar = char;
  }

  const topCount = Math.max(...counts.values());
  if (chars.length <= 2) {
    return hasAdjacentRepeat || topCount >= 2;
  }
  if (hasAdjacentRepeat) {
    return true;
  }
  return topCount >= Math.ceil(chars.length / 2);
}

/**
 * A token is JLPT-eligible unless its MeCab pos1 is ignored, or any of its
 * lookup text / surface / reading / headword is an ignored term or a
 * repeated-kana sfx (checked both raw and katakana-folded).
 */
function isJlptEligibleToken(token: MergedToken): boolean {
  if (token.pos1 && shouldIgnoreJlptForMecabPos1(token.pos1)) {
    return false;
  }

  const candidates = [
    resolveJlptLookupText(token),
    token.surface,
    token.reading,
    token.headword,
  ].filter(
    (candidate): candidate is string => typeof candidate === 'string' && candidate.length > 0,
  );

  for (const candidate of candidates) {
    const normalizedCandidate = normalizeJlptTextForExclusion(candidate);
    if (!normalizedCandidate) {
      continue;
    }

    const trimmedCandidate = candidate.trim();
    if (shouldIgnoreJlptByTerm(trimmedCandidate) || shouldIgnoreJlptByTerm(normalizedCandidate)) {
      return false;
    }
    if (isRepeatedKanaSfx(candidate) || isRepeatedKanaSfx(normalizedCandidate)) {
      return false;
    }
  }

  return true;
}

/** Accepts objects with a string `source` and an array `content`. */
function isYomitanParseResultItem(value: unknown): value is YomitanParseResultItem {
  return isObject(value) && isString(value.source) && Array.isArray(value.content);
}

/** Accepts arrays whose every element is an object with a string `text`. */
function isYomitanParseLine(value: unknown): value is YomitanParseLine {
  return (
    Array.isArray(value) && value.every((segment) => isObject(segment) && isString(segment.text))
  );
}

function isYomitanHeadwordRows(value: unknown): value is YomitanParseHeadword[][] {
  return (
    Array.isArray(value) &&
    value.every(
      (group) =>
        Array.isArray(group) && group.every((item) => isObject(item) && isString(item.term)),
    )
  );
}

/** Returns the `term` of the first entry of the first non-empty headword group, or ''. */
function extractYomitanHeadword(segment: YomitanParseSegment): string {
  const headwords = segment.headwords;
  if (!isYomitanHeadwordRows(headwords)) {
    return '';
  }

  for (const group of headwords) {
    const term = group[0]?.term;
    if (isString(term)) {
      return term;
    }
  }
  return '';
}

/**
 * Returns copies of the tokens with `jlptLevel` set. Ineligible tokens get
 * `undefined`; otherwise the level of the preferred lookup text is used, then
 * the level of the surface, then whatever level the token already carried.
 */
function applyJlptMarking(
  tokens: MergedToken[],
  getJlptLevel: (text: string) => JlptLevel | null,
): MergedToken[] {
  return tokens.map((token) => {
    if (!isJlptEligibleToken(token)) {
      return { ...token, jlptLevel: undefined };
    }

    const primaryLevel = getCachedJlptLevel(resolveJlptLookupText(token), getJlptLevel);
    const fallbackLevel =
      primaryLevel === null ? getCachedJlptLevel(token.surface, getJlptLevel) : null;

    return {
      ...token,
      jlptLevel: primaryLevel ?? fallbackLevel ?? token.jlptLevel,
    };
  });
}

/** Tokens produced from one Yomitan parse result, tagged with the parser that produced it. */
interface YomitanParseCandidate {
  source: string;
  index: number;
  tokens: MergedToken[];
}

/**
 * Converts one parse result into tokens: each valid line becomes one token
 * whose surface/reading are the concatenation of its segments and whose
 * headword is the first headword found (falling back to the surface).
 * Positions are offsets into the concatenated surfaces. Returns `null` when
 * the result has no valid line or yields no token.
 */
function mapYomitanParseResultItemToMergedTokens(
  parseResult: YomitanParseResultItem,
  isKnownWord: (text: string) => boolean,
  knownWordMatchMode: NPlusOneMatchMode,
): YomitanParseCandidate | null {
  const content = parseResult.content;
  if (!Array.isArray(content) || content.length === 0) {
    return null;
  }

  const source = String(parseResult.source ?? '');
  const index =
    typeof parseResult.index === 'number' && Number.isInteger(parseResult.index)
      ? parseResult.index
      : 0;

  const tokens: MergedToken[] = [];
  let charOffset = 0;
  let validLineCount = 0;

  for (const line of content) {
    if (!isYomitanParseLine(line)) {
      continue;
    }
    validLineCount += 1;

    let combinedSurface = '';
    let combinedReading = '';
    let combinedHeadword = '';
    for (const segment of line) {
      const segmentText = segment.text;
      if (!segmentText || segmentText.length === 0) {
        continue;
      }
      combinedSurface += segmentText;
      if (typeof segment.reading === 'string') {
        combinedReading += segment.reading;
      }
      if (!combinedHeadword) {
        combinedHeadword = extractYomitanHeadword(segment);
      }
    }

    if (!combinedSurface) {
      continue;
    }

    const start = charOffset;
    const end = start + combinedSurface.length;
    charOffset = end;

    const headword = combinedHeadword || combinedSurface;
    const matchText = resolveKnownWordText(combinedSurface, headword, knownWordMatchMode);

    tokens.push({
      surface: combinedSurface,
      reading: combinedReading,
      headword,
      startPos: start,
      endPos: end,
      partOfSpeech: PartOfSpeech.other,
      pos1: '',
      isMerged: true,
      isNPlusOneTarget: false,
      isKnown: matchText ? isKnownWord(matchText) : false,
    });
  }

  if (validLineCount === 0 || tokens.length === 0) {
    return null;
  }
  return { source, index, tokens };
}

/** First candidate with the largest token count, or `null` for an empty list. */
function pickLargestYomitanCandidate(
  items: YomitanParseCandidate[],
): YomitanParseCandidate | null {
  if (items.length === 0) {
    return null;
  }
  return items.reduce((best, current) =>
    current.tokens.length > best.tokens.length ? current : best,
  );
}

/**
 * Heuristic quality score: +100 per token with a reading, -50 per token of
 * two or more kana characters without a reading, -1 per token.
 */
function scoreYomitanCandidate(candidate: YomitanParseCandidate): number {
  const readableTokenCount = candidate.tokens.filter(
    (token) => token.reading.trim().length > 0,
  ).length;
  const suspiciousKanaFragmentCount = candidate.tokens.filter(
    (token) =>
      token.reading.trim().length === 0 &&
      token.surface.length >= 2 &&
      Array.from(token.surface).every((char) => isKanaChar(char)),
  ).length;

  return readableTokenCount * 100 - suspiciousKanaFragmentCount * 50 - candidate.tokens.length;
}

/** Highest-scoring candidate; ties go to fewer tokens, then to the earlier candidate. */
function pickHighestScoringYomitanCandidate(
  items: YomitanParseCandidate[],
): YomitanParseCandidate | null {
  if (items.length === 0) {
    return null;
  }
  return items.reduce((best, current) => {
    const bestScore = scoreYomitanCandidate(best);
    const currentScore = scoreYomitanCandidate(current);
    if (currentScore !== bestScore) {
      return currentScore > bestScore ? current : best;
    }
    if (current.tokens.length !== best.tokens.length) {
      return current.tokens.length < best.tokens.length ? current : best;
    }
    return best;
  });
}

/**
 * Chooses which parse candidate to use. When any 'scanning-parser' candidate
 * exists, the largest one wins if it has more than one token; otherwise the
 * best-scoring 'mecab' candidate wins if it has more tokens, else the
 * scanning candidate is kept. Without scanning candidates, the best-scoring
 * candidate is chosen, preferring candidates with more than one token.
 */
function selectBestYomitanParseCandidate(
  candidates: YomitanParseCandidate[],
): MergedToken[] | null {
  if (candidates.length === 0) {
    return null;
  }

  const scanningCandidates = candidates.filter(
    (candidate) => candidate.source === 'scanning-parser',
  );
  const mecabCandidates = candidates.filter((candidate) => candidate.source === 'mecab');

  if (scanningCandidates.length > 0) {
    const bestScanning = pickLargestYomitanCandidate(scanningCandidates);
    if (bestScanning && bestScanning.tokens.length > 1) {
      return bestScanning.tokens;
    }

    const bestMecab = pickHighestScoringYomitanCandidate(mecabCandidates);
    if (bestMecab && bestMecab.tokens.length > (bestScanning?.tokens.length ?? 0)) {
      return bestMecab.tokens;
    }

    return bestScanning ? bestScanning.tokens : null;
  }

  const multiTokenCandidates = candidates.filter((candidate) => candidate.tokens.length > 1);
  const pool = multiTokenCandidates.length > 0 ? multiTokenCandidates : candidates;
  const bestCandidate = pickHighestScoringYomitanCandidate(pool);
  return bestCandidate ? bestCandidate.tokens : null;
}

/** Validates the raw parse results and returns the tokens of the selected candidate. */
function mapYomitanParseResultsToMergedTokens(
  parseResults: unknown,
  isKnownWord: (text: string) => boolean,
  knownWordMatchMode: NPlusOneMatchMode,
): MergedToken[] | null {
  if (!Array.isArray(parseResults) || parseResults.length === 0) {
    return null;
  }

  const candidates = parseResults
    .filter((item): item is YomitanParseResultItem => isYomitanParseResultItem(item))
    .map((item) => mapYomitanParseResultItemToMergedTokens(item, isKnownWord, knownWordMatchMode))
    .filter((candidate): candidate is YomitanParseCandidate => candidate !== null);

  return selectBestYomitanParseCandidate(candidates);
}

function logSelectedYomitanGroups(text: string, tokens: MergedToken[]): void {
  if (!tokens || tokens.length === 0) {
    return;
  }

  logger.info('Selected Yomitan token groups', {
    text,
    tokenCount: tokens.length,
    groups: tokens.map((token, index) => ({
      index,
      surface: token.surface,
      headword: token.headword,
      reading: token.reading,
      startPos: token.startPos,
      endPos: token.endPos,
    })),
  });
}

/**
 * Finds a pos1 for `token` among the MeCab tokens. A MeCab token with the
 * same surface wins (closest start, then closest end). Otherwise the MeCab
 * token with the largest character overlap wins; ties go to the closest
 * start, then the longer span, then the earlier start.
 */
function pickClosestMecabPos1(token: MergedToken, mecabTokens: MergedToken[]): string | undefined {
  if (mecabTokens.length === 0) {
    return undefined;
  }

  const tokenStart = token.startPos ?? 0;
  const tokenEnd = token.endPos ?? tokenStart + token.surface.length;

  // Pass 1: exact surface matches, ranked by positional distance.
  let bestSurfaceMatchPos1: string | undefined;
  let bestSurfaceMatchDistance = Number.MAX_SAFE_INTEGER;
  let bestSurfaceMatchEndDistance = Number.MAX_SAFE_INTEGER;

  for (const mecabToken of mecabTokens) {
    if (!mecabToken.pos1) {
      continue;
    }
    if (mecabToken.surface !== token.surface) {
      continue;
    }

    const mecabStart = mecabToken.startPos ?? 0;
    const mecabEnd = mecabToken.endPos ?? mecabStart + mecabToken.surface.length;
    const startDistance = Math.abs(mecabStart - tokenStart);
    const endDistance = Math.abs(mecabEnd - tokenEnd);

    if (
      startDistance < bestSurfaceMatchDistance ||
      (startDistance === bestSurfaceMatchDistance && endDistance < bestSurfaceMatchEndDistance)
    ) {
      bestSurfaceMatchDistance = startDistance;
      bestSurfaceMatchEndDistance = endDistance;
      bestSurfaceMatchPos1 = mecabToken.pos1;
    }
  }

  if (bestSurfaceMatchPos1) {
    return bestSurfaceMatchPos1;
  }

  // Pass 2: any overlapping MeCab token, ranked by overlap size.
  let bestPos1: string | undefined;
  let bestOverlap = 0;
  let bestSpan = 0;
  let bestStartDistance = Number.MAX_SAFE_INTEGER;
  let bestStart = Number.MAX_SAFE_INTEGER;

  for (const mecabToken of mecabTokens) {
    if (!mecabToken.pos1) {
      continue;
    }

    const mecabStart = mecabToken.startPos ?? 0;
    const mecabEnd = mecabToken.endPos ?? mecabStart + mecabToken.surface.length;
    const overlapStart = Math.max(tokenStart, mecabStart);
    const overlapEnd = Math.min(tokenEnd, mecabEnd);
    const overlap = Math.max(0, overlapEnd - overlapStart);
    if (overlap === 0) {
      continue;
    }

    const span = mecabEnd - mecabStart;
    const startDistance = Math.abs(mecabStart - tokenStart);

    if (
      overlap > bestOverlap ||
      (overlap === bestOverlap &&
        (startDistance < bestStartDistance ||
          (startDistance === bestStartDistance &&
            (span > bestSpan || (span === bestSpan && mecabStart < bestStart)))))
    ) {
      bestOverlap = overlap;
      bestSpan = span;
      bestStartDistance = startDistance;
      bestStart = mecabStart;
      bestPos1 = mecabToken.pos1;
    }
  }

  return bestOverlap > 0 ? bestPos1 : undefined;
}

/**
 * Fills pos1 for tokens that still lack one by matching surfaces against the
 * MeCab tokens in order: the first same-surface MeCab token at or after the
 * cursor is preferred, otherwise the first same-surface MeCab token anywhere.
 * The cursor then moves just past the matched MeCab token.
 */
function fillMissingPos1BySurfaceSequence(
  tokens: MergedToken[],
  mecabTokens: MergedToken[],
): MergedToken[] {
  const indexedMecabTokens = mecabTokens.flatMap((token, index) =>
    token.pos1 && token.surface.trim().length > 0
      ? [{ surface: token.surface, pos1: token.pos1, index }]
      : [],
  );

  if (indexedMecabTokens.length === 0) {
    return tokens;
  }

  let cursor = 0;
  return tokens.map((token) => {
    if (token.pos1 && token.pos1.trim().length > 0) {
      return token;
    }

    const surface = token.surface.trim();
    if (!surface) {
      return token;
    }

    const match =
      indexedMecabTokens.find(
        (candidate) => candidate.surface === surface && candidate.index >= cursor,
      ) ?? indexedMecabTokens.find((candidate) => candidate.surface === surface);

    if (!match) {
      return token;
    }

    cursor = match.index + 1;
    return { ...token, pos1: match.pos1 };
  });
}

/**
 * Adds MeCab pos1 values to Yomitan tokens that lack one. If MeCab throws or
 * returns no tokens, a warning is logged and the input tokens are returned
 * unchanged.
 */
async function enrichYomitanPos1(
  tokens: MergedToken[],
  deps: TokenizerServiceDeps,
  text: string,
): Promise<MergedToken[]> {
  if (!tokens || tokens.length === 0) {
    return tokens;
  }

  let mecabResult: MergedToken[] | null;
  try {
    mecabResult = await deps.tokenizeWithMecab(text);
  } catch (err) {
    logger.warn(
      'Failed to enrich Yomitan tokens with MeCab POS:',
      describeError(err),
      `tokenCount=${tokens.length}`,
      `textLength=${text.length}`,
    );
    return tokens;
  }

  if (!mecabResult || mecabResult.length === 0) {
    logger.warn(
      'MeCab enrichment returned no tokens; preserving Yomitan token output.',
      `tokenCount=${tokens.length}`,
      `textLength=${text.length}`,
    );
    return tokens;
  }

  const mecabTokens = mecabResult;
  const overlapEnriched = tokens.map((token) => {
    if (token.pos1) {
      return token;
    }
    const pos1 = pickClosestMecabPos1(token, mecabTokens);
    return pos1 ? { ...token, pos1 } : token;
  });

  return fillMissingPos1BySurfaceSequence(overlapEnriched, mecabTokens);
}

/**
 * Ensures a hidden parser window showing the extension's `search.html` exists.
 *
 * @returns `true` once a usable window is available; `false` when there is no
 * extension or initialization failed (the failure is logged).
 */
async function ensureYomitanParserWindow(deps: TokenizerServiceDeps): Promise<boolean> {
  const yomitanExt = deps.getYomitanExt();
  if (!yomitanExt) {
    return false;
  }

  // An in-flight initialization takes precedence over the window check: the
  // window is registered before its page has finished loading, so reporting
  // it as ready here would be premature.
  const pendingInit = deps.getYomitanParserInitPromise();
  if (pendingInit) {
    return pendingInit;
  }

  const currentWindow = deps.getYomitanParserWindow();
  if (currentWindow && !currentWindow.isDestroyed()) {
    return true;
  }

  const initPromise = (async (): Promise<boolean> => {
    let parserWindow: BrowserWindow | null = null;
    try {
      // Awaiting first guarantees the `finally` below runs only after the
      // init promise has been registered by the caller code further down,
      // even when window construction throws synchronously.
      const electron = await import('electron');
      const createdWindow = new electron.BrowserWindow({
        show: false,
        width: 800,
        height: 600,
        webPreferences: {
          contextIsolation: true,
          nodeIntegration: false,
          session: electron.session.defaultSession,
        },
      });
      parserWindow = createdWindow;
      deps.setYomitanParserWindow(createdWindow);

      const readyPromise = new Promise<void>((resolve, reject) => {
        createdWindow.webContents.once('did-finish-load', () => resolve());
        createdWindow.webContents.once(
          'did-fail-load',
          (_event, _errorCode, errorDescription) => {
            reject(new Error(errorDescription));
          },
        );
      });
      // Mark the promise as handled so a load failure does not surface as an
      // unhandled rejection; anyone awaiting it still observes the rejection.
      readyPromise.catch(() => undefined);
      deps.setYomitanParserReadyPromise(readyPromise);

      createdWindow.on('closed', () => {
        if (deps.getYomitanParserWindow() === createdWindow) {
          deps.setYomitanParserWindow(null);
          deps.setYomitanParserReadyPromise(null);
        }
      });

      await createdWindow.loadURL(`chrome-extension://${yomitanExt.id}/search.html`);
      const currentReadyPromise = deps.getYomitanParserReadyPromise();
      if (currentReadyPromise) {
        await currentReadyPromise;
      }
      return true;
    } catch (err) {
      logger.error('Failed to initialize Yomitan parser window:', describeError(err));
      if (parserWindow) {
        if (!parserWindow.isDestroyed()) {
          parserWindow.destroy();
        }
        if (deps.getYomitanParserWindow() === parserWindow) {
          deps.setYomitanParserWindow(null);
          deps.setYomitanParserReadyPromise(null);
        }
      }
      return false;
    } finally {
      deps.setYomitanParserInitPromise(null);
    }
  })();

  deps.setYomitanParserInitPromise(initPromise);
  return initPromise;
}

/**
 * Builds the script executed inside the parser window. It reads the current
 * profile and scan length via `optionsGetFull`, then calls `parseText` with
 * both the internal and the MeCab parser enabled. The text is embedded as a
 * JSON string literal.
 */
function buildYomitanParseScript(text: string): string {
  return `
    (async () => {
      const invoke = (action, params) =>
        new Promise((resolve, reject) => {
          chrome.runtime.sendMessage({ action, params }, (response) => {
            if (chrome.runtime.lastError) {
              reject(new Error(chrome.runtime.lastError.message));
              return;
            }
            if (!response || typeof response !== "object") {
              reject(new Error("Invalid response from Yomitan backend"));
              return;
            }
            if (response.error) {
              reject(new Error(response.error.message || "Yomitan backend error"));
              return;
            }
            resolve(response.result);
          });
        });

      const optionsFull = await invoke("optionsGetFull", undefined);
      const profileIndex = optionsFull.profileCurrent;
      const scanLength =
        optionsFull.profiles?.[profileIndex]?.options?.scanning?.length ?? 40;

      return await invoke("parseText", {
        text: ${JSON.stringify(text)},
        optionsContext: { index: profileIndex },
        scanLength,
        useInternalParser: true,
        useMecabParser: true
      });
    })();
  `;
}

/**
 * Tokenizes `text` through the Yomitan parser window and enriches the result
 * with MeCab pos1 values. Returns `null` when there is no text, no extension,
 * no usable window, no tokens, or when the request fails (logged).
 */
async function parseWithYomitanInternalParser(
  text: string,
  deps: TokenizerServiceDeps,
): Promise<MergedToken[] | null> {
  const yomitanExt = deps.getYomitanExt();
  if (!text || !yomitanExt) {
    return null;
  }

  try {
    const isReady = await ensureYomitanParserWindow(deps);
    const parserWindow = deps.getYomitanParserWindow();
    if (!isReady || !parserWindow || parserWindow.isDestroyed()) {
      return null;
    }

    const parseResults: unknown = await parserWindow.webContents.executeJavaScript(
      buildYomitanParseScript(text),
      true,
    );
    const yomitanTokens = mapYomitanParseResultsToMergedTokens(
      parseResults,
      deps.isKnownWord,
      deps.getKnownWordMatchMode(),
    );
    if (!yomitanTokens || yomitanTokens.length === 0) {
      return null;
    }

    if (deps.getYomitanGroupDebugEnabled?.() === true) {
      logSelectedYomitanGroups(text, yomitanTokens);
    }

    return await enrichYomitanPos1(yomitanTokens, deps, text);
  } catch (err) {
    logger.error('Yomitan parser request failed:', describeError(err));
    return null;
  }
}

/** Feature switches captured once per `tokenizeSubtitle` call. */
interface TokenAnnotationSettings {
  jlptEnabled: boolean;
  frequencyEnabled: boolean;
  frequencyLookup: FrequencyDictionaryLookup | undefined;
}

/**
 * Applies known-word, frequency and JLPT marking in that order. A disabled
 * (or unavailable) frequency/JLPT step clears the corresponding field instead.
 */
function annotateTokens(
  tokens: MergedToken[],
  deps: TokenizerServiceDeps,
  settings: TokenAnnotationSettings,
): MergedToken[] {
  const knownMarkedTokens = applyKnownWordMarking(
    tokens,
    deps.isKnownWord,
    deps.getKnownWordMatchMode(),
  );

  const frequencyMarkedTokens =
    settings.frequencyEnabled && settings.frequencyLookup
      ? applyFrequencyMarking(knownMarkedTokens, settings.frequencyLookup)
      : knownMarkedTokens.map((token) => ({ ...token, frequencyRank: undefined }));

  return settings.jlptEnabled
    ? applyJlptMarking(frequencyMarkedTokens, deps.getJlptLevel)
    : frequencyMarkedTokens.map((token) => ({ ...token, jlptLevel: undefined }));
}

/**
 * Tokenizes one subtitle and annotates the tokens with known-word, frequency,
 * JLPT and N+1 information.
 *
 * The display text has CRLF and the literal `\N` / `\n` escapes converted to
 * newlines; the text sent to the tokenizers has all whitespace runs collapsed
 * to single spaces. Yomitan is tried first and MeCab is the fallback.
 *
 * @param text - Raw subtitle text.
 * @param deps - Tokenizer dependencies; see {@link TokenizerServiceDeps}.
 * @returns The display text with its tokens, or `tokens: null` when the text
 * is blank (the original `text` is echoed back in that case) or neither
 * tokenizer produced tokens. MeCab fallback errors are logged, not thrown.
 */
export async function tokenizeSubtitle(
  text: string,
  deps: TokenizerServiceDeps,
): Promise<SubtitleData> {
  const configuredMinSentenceWords = deps.getMinSentenceWordsForNPlusOne?.();
  const minSentenceWordsForNPlusOne =
    configuredMinSentenceWords !== undefined &&
    Number.isInteger(configuredMinSentenceWords) &&
    configuredMinSentenceWords > 0
      ? configuredMinSentenceWords
      : DEFAULT_MIN_SENTENCE_WORDS_FOR_N_PLUS_ONE;

  const displayText = text
    .replace(/\r\n/g, '\n')
    .replace(/\\N/g, '\n')
    .replace(/\\n/g, '\n')
    .trim();

  if (!displayText) {
    return { text, tokens: null };
  }

  const tokenizeText = displayText.replace(/\n/g, ' ').replace(/\s+/g, ' ').trim();

  // Settings are read before any await so one call uses one consistent
  // snapshot. Both features count as enabled unless their getter returns false.
  const settings: TokenAnnotationSettings = {
    jlptEnabled: deps.getJlptEnabled?.() !== false,
    frequencyEnabled: deps.getFrequencyDictionaryEnabled?.() !== false,
    frequencyLookup: deps.getFrequencyRank,
  };

  const buildResult = (tokens: MergedToken[]): SubtitleData => ({
    text: displayText,
    tokens: markNPlusOneTargets(
      annotateTokens(tokens, deps, settings),
      minSentenceWordsForNPlusOne,
    ),
  });

  const yomitanTokens = await parseWithYomitanInternalParser(tokenizeText, deps);
  if (yomitanTokens && yomitanTokens.length > 0) {
    return buildResult(yomitanTokens);
  }

  try {
    const mecabTokens = await deps.tokenizeWithMecab(tokenizeText);
    if (mecabTokens && mecabTokens.length > 0) {
      return buildResult(mecabTokens);
    }
  } catch (err) {
    logger.error('Tokenization error:', describeError(err));
  }

  return { text: displayText, tokens: null };
}