import { BrowserWindow, Extension, session } from "electron";
import { markNPlusOneTargets, mergeTokens } from "../../token-merger";
import {
  JlptLevel,
  MergedToken,
  NPlusOneMatchMode,
  PartOfSpeech,
  SubtitleData,
  Token,
} from "../../types";
import { shouldIgnoreJlptForMecabPos1 } from "./jlpt-token-filter-config";
import { shouldIgnoreJlptByTerm } from "./jlpt-excluded-terms";

/** Minimal shape of a Yomitan dictionary headword entry (validated at runtime). */
interface YomitanParseHeadword {
  term?: unknown;
}

/** One scanned segment of a parsed line returned by Yomitan's internal parser. */
interface YomitanParseSegment {
  text?: string;
  reading?: string;
  headwords?: unknown;
}

/** One item of the `parseText` response; only "scanning-parser" items are used. */
interface YomitanParseResultItem {
  source?: unknown;
  index?: unknown;
  content?: unknown;
}

type YomitanParseLine = YomitanParseSegment[];

// Unicode: katakana (U+30A1–U+30F6) maps to hiragana by subtracting 0x60.
const KATAKANA_TO_HIRAGANA_OFFSET = 0x60;
const KATAKANA_CODEPOINT_START = 0x30a1;
const KATAKANA_CODEPOINT_END = 0x30f6;

function isObject(value: unknown): value is Record<string, unknown> {
  return Boolean(value && typeof value === "object");
}

function isString(value: unknown): value is string {
  return typeof value === "string";
}

/**
 * Runtime dependencies injected into the tokenizer service. All Yomitan parser
 * state (hidden window, ready/init promises) is owned by the caller and
 * accessed through getters/setters so this module stays stateless.
 */
export interface TokenizerServiceDeps {
  getYomitanExt: () => Extension | null;
  getYomitanParserWindow: () => BrowserWindow | null;
  setYomitanParserWindow: (window: BrowserWindow | null) => void;
  getYomitanParserReadyPromise: () => Promise<void> | null;
  setYomitanParserReadyPromise: (promise: Promise<void> | null) => void;
  getYomitanParserInitPromise: () => Promise<boolean> | null;
  setYomitanParserInitPromise: (promise: Promise<boolean> | null) => void;
  isKnownWord: (text: string) => boolean;
  getKnownWordMatchMode: () => NPlusOneMatchMode;
  getJlptLevel: (text: string) => JlptLevel | null;
  getJlptEnabled?: () => boolean;
  /** Tokenizes with MeCab and merges tokens; null when MeCab is unavailable. */
  tokenizeWithMecab: (text: string) => Promise<MergedToken[] | null>;
}

interface MecabTokenizerLike {
  tokenize: (text: string) => Promise<Token[] | null>;
}

/** Same as {@link TokenizerServiceDeps} but with a raw MeCab tokenizer getter. */
export interface TokenizerDepsRuntimeOptions {
  getYomitanExt: () => Extension | null;
  getYomitanParserWindow: () => BrowserWindow | null;
  setYomitanParserWindow: (window: BrowserWindow | null) => void;
  getYomitanParserReadyPromise: () => Promise<void> | null;
  setYomitanParserReadyPromise: (promise: Promise<void> | null) => void;
  getYomitanParserInitPromise: () => Promise<boolean> | null;
  setYomitanParserInitPromise: (promise: Promise<boolean> | null) => void;
  isKnownWord: (text: string) => boolean;
  getKnownWordMatchMode: () => NPlusOneMatchMode;
  getJlptLevel: (text: string) => JlptLevel | null;
  getJlptEnabled?: () => boolean;
  getMecabTokenizer: () => MecabTokenizerLike | null;
}

/**
 * Adapts runtime options into {@link TokenizerServiceDeps}, wrapping the raw
 * MeCab tokenizer so downstream code always receives merged tokens.
 */
export function createTokenizerDepsRuntimeService(
  options: TokenizerDepsRuntimeOptions,
): TokenizerServiceDeps {
  return {
    getYomitanExt: options.getYomitanExt,
    getYomitanParserWindow: options.getYomitanParserWindow,
    setYomitanParserWindow: options.setYomitanParserWindow,
    getYomitanParserReadyPromise: options.getYomitanParserReadyPromise,
    setYomitanParserReadyPromise: options.setYomitanParserReadyPromise,
    getYomitanParserInitPromise: options.getYomitanParserInitPromise,
    setYomitanParserInitPromise: options.setYomitanParserInitPromise,
    isKnownWord: options.isKnownWord,
    getKnownWordMatchMode: options.getKnownWordMatchMode,
    getJlptLevel: options.getJlptLevel,
    getJlptEnabled: options.getJlptEnabled,
    tokenizeWithMecab: async (text) => {
      const mecabTokenizer = options.getMecabTokenizer();
      if (!mecabTokenizer) {
        return null;
      }
      const rawTokens = await mecabTokenizer.tokenize(text);
      if (!rawTokens || rawTokens.length === 0) {
        return null;
      }
      return mergeTokens(
        rawTokens,
        options.isKnownWord,
        options.getKnownWordMatchMode(),
      );
    },
  };
}

/** Picks the text to look up in the known-word list per the match mode. */
function resolveKnownWordText(
  surface: string,
  headword: string,
  matchMode: NPlusOneMatchMode,
): string {
  return matchMode === "surface" ? surface : headword;
}

/** Marks tokens as known, preserving any existing `isKnown` flag. */
function applyKnownWordMarking(
  tokens: MergedToken[],
  isKnownWord: (text: string) => boolean,
  knownWordMatchMode: NPlusOneMatchMode,
): MergedToken[] {
  return tokens.map((token) => {
    const matchText = resolveKnownWordText(
      token.surface,
      token.headword,
      knownWordMatchMode,
    );
    return {
      ...token,
      isKnown: token.isKnown || (matchText ? isKnownWord(matchText) : false),
    };
  });
}

/** Best text for a JLPT lookup: headword, then reading, then surface form. */
function resolveJlptLookupText(token: MergedToken): string {
  if (token.headword && token.headword.length > 0) {
    return token.headword;
  }
  if (token.reading && token.reading.length > 0) {
    return token.reading;
  }
  return token.surface;
}

/** Trims and folds katakana to hiragana so exclusion lookups are kana-insensitive. */
function normalizeJlptTextForExclusion(text: string): string {
  const raw = text.trim();
  if (!raw) {
    return "";
  }
  let normalized = "";
  for (const char of raw) {
    const code = char.codePointAt(0);
    if (code === undefined) {
      continue;
    }
    if (code >= KATAKANA_CODEPOINT_START && code <= KATAKANA_CODEPOINT_END) {
      normalized += String.fromCodePoint(code - KATAKANA_TO_HIRAGANA_OFFSET);
      continue;
    }
    normalized += char;
  }
  return normalized;
}

/** True when the character is hiragana, katakana, or a kana mark/extension. */
function isKanaChar(char: string): boolean {
  const code = char.codePointAt(0);
  if (code === undefined) {
    return false;
  }
  return (
    (code >= 0x3041 && code <= 0x3096) ||
    (code >= 0x309b && code <= 0x309f) ||
    (code >= 0x30a0 && code <= 0x30fa) ||
    (code >= 0x30fd && code <= 0x30ff)
  );
}

/**
 * Detects repeated-kana speech-like tokens (e.g. 「ああああ」, 「ははは」, 「うーん」 style patterns)
 * so they are not JLPT-labeled when they are mostly expressive particles/sfx.
 */
function isRepeatedKanaSfx(text: string): boolean {
  const normalized = text.trim();
  if (!normalized) {
    return false;
  }
  const chars = [...normalized];
  if (!chars.every(isKanaChar)) {
    return false;
  }
  const counts = new Map<string, number>();
  let hasAdjacentRepeat = false;
  for (let i = 0; i < chars.length; i += 1) {
    const char = chars[i];
    counts.set(char, (counts.get(char) ?? 0) + 1);
    if (i > 0 && chars[i] === chars[i - 1]) {
      hasAdjacentRepeat = true;
    }
  }
  const topCount = Math.max(...counts.values());
  if (chars.length <= 2) {
    return hasAdjacentRepeat || topCount >= 2;
  }
  if (hasAdjacentRepeat) {
    return true;
  }
  // Longer tokens: dominated (>= half) by one character counts as sfx.
  return topCount >= Math.ceil(chars.length / 2);
}

/**
 * Returns false when the token should be excluded from JLPT labeling:
 * filtered MeCab POS, term on an exclusion list, or repeated-kana sfx.
 */
function isJlptEligibleToken(token: MergedToken): boolean {
  if (token.pos1 && shouldIgnoreJlptForMecabPos1(token.pos1)) return false;
  const candidates = [
    resolveJlptLookupText(token),
    token.surface,
    token.reading,
    token.headword,
  ].filter(
    (candidate): candidate is string =>
      typeof candidate === "string" && candidate.length > 0,
  );
  for (const candidate of candidates) {
    const normalizedCandidate = normalizeJlptTextForExclusion(candidate);
    if (!normalizedCandidate) {
      continue;
    }
    const trimmedCandidate = candidate.trim();
    if (
      shouldIgnoreJlptByTerm(trimmedCandidate) ||
      shouldIgnoreJlptByTerm(normalizedCandidate)
    ) {
      return false;
    }
    if (
      isRepeatedKanaSfx(candidate) ||
      isRepeatedKanaSfx(normalizedCandidate)
    ) {
      return false;
    }
  }
  return true;
}

function isYomitanParseResultItem(
  value: unknown,
): value is YomitanParseResultItem {
  if (!isObject(value)) {
    return false;
  }
  if ((value as YomitanParseResultItem).source !== "scanning-parser") {
    return false;
  }
  if (!Array.isArray((value as YomitanParseResultItem).content)) {
    return false;
  }
  return true;
}

function isYomitanParseLine(value: unknown): value is YomitanParseLine {
  if (!Array.isArray(value)) {
    return false;
  }
  return value.every((segment) => {
    if (!isObject(segment)) {
      return false;
    }
    const candidate = segment as YomitanParseSegment;
    return isString(candidate.text);
  });
}

function isYomitanHeadwordRows(
  value: unknown,
): value is YomitanParseHeadword[][] {
  return (
    Array.isArray(value) &&
    value.every(
      (group) =>
        Array.isArray(group) &&
        group.every(
          (item) =>
            isObject(item) && isString((item as YomitanParseHeadword).term),
        ),
    )
  );
}

/** First dictionary term found in a segment's headword rows, or "". */
function extractYomitanHeadword(segment: YomitanParseSegment): string {
  const headwords = segment.headwords;
  if (!isYomitanHeadwordRows(headwords)) {
    return "";
  }
  for (const group of headwords) {
    if (group.length > 0) {
      const firstHeadword = group[0] as YomitanParseHeadword;
      if (isString(firstHeadword?.term)) {
        return firstHeadword.term;
      }
    }
  }
  return "";
}

/**
 * Labels eligible tokens with a JLPT level (headword/reading lookup first,
 * surface as fallback); ineligible tokens get their level cleared.
 */
function applyJlptMarking(
  tokens: MergedToken[],
  getJlptLevel: (text: string) => JlptLevel | null,
): MergedToken[] {
  return tokens.map((token) => {
    if (!isJlptEligibleToken(token)) {
      return { ...token, jlptLevel: undefined };
    }
    const primaryLevel = getJlptLevel(resolveJlptLookupText(token));
    const fallbackLevel = getJlptLevel(token.surface);
    return {
      ...token,
      jlptLevel: primaryLevel ?? fallbackLevel ?? token.jlptLevel,
    };
  });
}

/**
 * Converts a raw Yomitan `parseText` response into merged tokens.
 * Only "scanning-parser" items are considered; the item with index 0 is
 * preferred. Returns null when nothing usable was parsed.
 */
function mapYomitanParseResultsToMergedTokens(
  parseResults: unknown,
  isKnownWord: (text: string) => boolean,
  knownWordMatchMode: NPlusOneMatchMode,
): MergedToken[] | null {
  if (!Array.isArray(parseResults) || parseResults.length === 0) {
    return null;
  }
  const scanningItems = parseResults.filter(
    (item): item is YomitanParseResultItem => isYomitanParseResultItem(item),
  );
  if (scanningItems.length === 0) {
    return null;
  }
  const primaryItem =
    scanningItems.find((item) => item.index === 0) || scanningItems[0];
  const content = primaryItem.content;
  if (!Array.isArray(content)) {
    return null;
  }
  const tokens: MergedToken[] = [];
  let charOffset = 0;
  let validLineCount = 0;
  for (const line of content) {
    if (!isYomitanParseLine(line)) {
      continue;
    }
    validLineCount += 1;
    let surface = "";
    let reading = "";
    let headword = "";
    for (const segment of line) {
      const segmentText = segment.text;
      if (!segmentText || segmentText.length === 0) {
        continue;
      }
      surface += segmentText;
      if (typeof segment.reading === "string") {
        reading += segment.reading;
      }
      if (!headword) {
        headword = extractYomitanHeadword(segment);
      }
    }
    if (!surface) {
      continue;
    }
    const start = charOffset;
    const end = start + surface.length;
    charOffset = end;
    const matchText = resolveKnownWordText(surface, headword, knownWordMatchMode);
    tokens.push({
      surface,
      reading,
      headword: headword || surface,
      startPos: start,
      endPos: end,
      // Yomitan does not expose POS data; enriched later from MeCab.
      partOfSpeech: PartOfSpeech.other,
      pos1: "",
      isMerged: true,
      isNPlusOneTarget: false,
      isKnown: matchText ? isKnownWord(matchText) : false,
    });
  }
  if (validLineCount === 0) {
    return null;
  }
  return tokens.length > 0 ? tokens : null;
}

/**
 * Chooses the pos1 of the MeCab token best aligned with a Yomitan token:
 * largest character overlap, ties broken by wider span, then earlier start.
 */
function pickClosestMecabPos1(
  token: MergedToken,
  mecabTokens: MergedToken[],
): string | undefined {
  if (mecabTokens.length === 0) {
    return undefined;
  }
  const tokenStart = token.startPos ?? 0;
  const tokenEnd = token.endPos ?? tokenStart + token.surface.length;
  let bestPos1: string | undefined;
  let bestOverlap = 0;
  let bestSpan = 0;
  let bestStart = Number.MAX_SAFE_INTEGER;
  for (const mecabToken of mecabTokens) {
    if (!mecabToken.pos1) {
      continue;
    }
    const mecabStart = mecabToken.startPos ?? 0;
    const mecabEnd =
      mecabToken.endPos ?? mecabStart + mecabToken.surface.length;
    const overlapStart = Math.max(tokenStart, mecabStart);
    const overlapEnd = Math.min(tokenEnd, mecabEnd);
    const overlap = Math.max(0, overlapEnd - overlapStart);
    if (overlap === 0) {
      continue;
    }
    const span = mecabEnd - mecabStart;
    if (
      overlap > bestOverlap ||
      (overlap === bestOverlap &&
        (span > bestSpan || (span === bestSpan && mecabStart < bestStart)))
    ) {
      bestOverlap = overlap;
      bestSpan = span;
      bestStart = mecabStart;
      bestPos1 = mecabToken.pos1;
    }
  }
  return bestOverlap > 0 ? bestPos1 : undefined;
}

/**
 * Backfills missing pos1 on Yomitan tokens using a parallel MeCab parse.
 * Best-effort: on any MeCab failure the original tokens are returned.
 */
async function enrichYomitanPos1(
  tokens: MergedToken[],
  deps: TokenizerServiceDeps,
  text: string,
): Promise<MergedToken[]> {
  if (!tokens || tokens.length === 0) {
    return tokens;
  }
  let mecabTokens: MergedToken[] | null = null;
  try {
    mecabTokens = await deps.tokenizeWithMecab(text);
  } catch (err) {
    const error = err as Error;
    console.warn(
      "Failed to enrich Yomitan tokens with MeCab POS:",
      error.message,
      `tokenCount=${tokens.length}`,
      `textLength=${text.length}`,
    );
    return tokens;
  }
  if (!mecabTokens || mecabTokens.length === 0) {
    console.warn(
      "MeCab enrichment returned no tokens; preserving Yomitan token output.",
      `tokenCount=${tokens.length}`,
      `textLength=${text.length}`,
    );
    return tokens;
  }
  return tokens.map((token) => {
    if (token.pos1) {
      return token;
    }
    const pos1 = pickClosestMecabPos1(token, mecabTokens);
    if (!pos1) {
      return token;
    }
    return {
      ...token,
      pos1,
    };
  });
}

/**
 * Lazily creates the hidden BrowserWindow that hosts Yomitan's search page.
 * Concurrent callers share one in-flight init promise. Resolves true when the
 * parser window is ready to accept requests.
 */
async function ensureYomitanParserWindow(
  deps: TokenizerServiceDeps,
): Promise<boolean> {
  const yomitanExt = deps.getYomitanExt();
  if (!yomitanExt) {
    return false;
  }
  const currentWindow = deps.getYomitanParserWindow();
  if (currentWindow && !currentWindow.isDestroyed()) {
    return true;
  }
  const existingInitPromise = deps.getYomitanParserInitPromise();
  if (existingInitPromise) {
    return existingInitPromise;
  }
  const initPromise = (async () => {
    const parserWindow = new BrowserWindow({
      show: false,
      width: 800,
      height: 600,
      webPreferences: {
        contextIsolation: true,
        nodeIntegration: false,
        // Must share the session the extension was loaded into.
        session: session.defaultSession,
      },
    });
    deps.setYomitanParserWindow(parserWindow);
    deps.setYomitanParserReadyPromise(
      new Promise<void>((resolve, reject) => {
        parserWindow.webContents.once("did-finish-load", () => resolve());
        parserWindow.webContents.once(
          "did-fail-load",
          (_event, _errorCode, errorDescription) => {
            reject(new Error(errorDescription));
          },
        );
      }),
    );
    parserWindow.on("closed", () => {
      // Clear shared state only if it still points at this window.
      if (deps.getYomitanParserWindow() === parserWindow) {
        deps.setYomitanParserWindow(null);
        deps.setYomitanParserReadyPromise(null);
      }
    });
    try {
      await parserWindow.loadURL(
        `chrome-extension://${yomitanExt.id}/search.html`,
      );
      const readyPromise = deps.getYomitanParserReadyPromise();
      if (readyPromise) {
        await readyPromise;
      }
      return true;
    } catch (err) {
      console.error(
        "Failed to initialize Yomitan parser window:",
        (err as Error).message,
      );
      if (!parserWindow.isDestroyed()) {
        parserWindow.destroy();
      }
      if (deps.getYomitanParserWindow() === parserWindow) {
        deps.setYomitanParserWindow(null);
        deps.setYomitanParserReadyPromise(null);
      }
      return false;
    } finally {
      deps.setYomitanParserInitPromise(null);
    }
  })();
  deps.setYomitanParserInitPromise(initPromise);
  return initPromise;
}

/**
 * Parses `text` via Yomitan's internal scanning parser by injecting a script
 * into the hidden parser window, then maps and POS-enriches the results.
 * Returns null when Yomitan is unavailable or produced no tokens.
 */
async function parseWithYomitanInternalParser(
  text: string,
  deps: TokenizerServiceDeps,
): Promise<MergedToken[] | null> {
  const yomitanExt = deps.getYomitanExt();
  if (!text || !yomitanExt) {
    return null;
  }
  const isReady = await ensureYomitanParserWindow(deps);
  const parserWindow = deps.getYomitanParserWindow();
  if (!isReady || !parserWindow || parserWindow.isDestroyed()) {
    return null;
  }
  const script = `
    (async () => {
      const invoke = (action, params) => new Promise((resolve, reject) => {
        chrome.runtime.sendMessage({ action, params }, (response) => {
          if (chrome.runtime.lastError) {
            reject(new Error(chrome.runtime.lastError.message));
            return;
          }
          if (!response || typeof response !== "object") {
            reject(new Error("Invalid response from Yomitan backend"));
            return;
          }
          if (response.error) {
            reject(new Error(response.error.message || "Yomitan backend error"));
            return;
          }
          resolve(response.result);
        });
      });
      const optionsFull = await invoke("optionsGetFull", undefined);
      const profileIndex = optionsFull.profileCurrent;
      const scanLength = optionsFull.profiles?.[profileIndex]?.options?.scanning?.length ?? 40;
      return await invoke("parseText", {
        text: ${JSON.stringify(text)},
        optionsContext: { index: profileIndex },
        scanLength,
        useInternalParser: true,
        useMecabParser: false
      });
    })();
  `;
  try {
    const parseResults = await parserWindow.webContents.executeJavaScript(
      script,
      true,
    );
    const yomitanTokens = mapYomitanParseResultsToMergedTokens(
      parseResults,
      deps.isKnownWord,
      deps.getKnownWordMatchMode(),
    );
    if (!yomitanTokens || yomitanTokens.length === 0) {
      return null;
    }
    return enrichYomitanPos1(yomitanTokens, deps, text);
  } catch (err) {
    console.error("Yomitan parser request failed:", (err as Error).message);
    return null;
  }
}

/**
 * Tokenizes one subtitle line. Normalizes line breaks for display, flattens
 * whitespace for tokenization, tries Yomitan first and falls back to MeCab,
 * then applies known-word, JLPT, and N+1 target marking. `tokens` is null
 * when no tokenizer produced output.
 */
export async function tokenizeSubtitleService(
  text: string,
  deps: TokenizerServiceDeps,
): Promise<SubtitleData> {
  const displayText = text
    .replace(/\r\n/g, "\n")
    .replace(/\\N/g, "\n")
    .replace(/\\n/g, "\n")
    .trim();
  if (!displayText) {
    return { text, tokens: null };
  }
  const tokenizeText = displayText
    .replace(/\n/g, " ")
    .replace(/\s+/g, " ")
    .trim();
  // JLPT marking defaults to enabled when the getter is absent.
  const jlptEnabled = deps.getJlptEnabled?.() !== false;
  const yomitanTokens = await parseWithYomitanInternalParser(
    tokenizeText,
    deps,
  );
  if (yomitanTokens && yomitanTokens.length > 0) {
    const knownMarkedTokens = applyKnownWordMarking(
      yomitanTokens,
      deps.isKnownWord,
      deps.getKnownWordMatchMode(),
    );
    const jlptMarkedTokens = jlptEnabled
      ? applyJlptMarking(knownMarkedTokens, deps.getJlptLevel)
      : knownMarkedTokens.map((token) => ({ ...token, jlptLevel: undefined }));
    return { text: displayText, tokens: markNPlusOneTargets(jlptMarkedTokens) };
  }
  try {
    const mecabTokens = await deps.tokenizeWithMecab(tokenizeText);
    if (mecabTokens && mecabTokens.length > 0) {
      const knownMarkedTokens = applyKnownWordMarking(
        mecabTokens,
        deps.isKnownWord,
        deps.getKnownWordMatchMode(),
      );
      const jlptMarkedTokens = jlptEnabled
        ? applyJlptMarking(knownMarkedTokens, deps.getJlptLevel)
        : knownMarkedTokens.map((token) => ({
            ...token,
            jlptLevel: undefined,
          }));
      return {
        text: displayText,
        tokens: markNPlusOneTargets(jlptMarkedTokens),
      };
    }
  } catch (err) {
    console.error("Tokenization error:", (err as Error).message);
  }
  return { text: displayText, tokens: null };
}