diff --git a/package.json b/package.json index e4f2c19..f9dedff 100644 --- a/package.json +++ b/package.json @@ -4,6 +4,8 @@ "description": "All-in-one sentence mining overlay with AnkiConnect and dictionary integration", "main": "dist/main.js", "scripts": { + "get-frequency": "bun run scripts/get_frequency.ts", + "get-frequency:electron": "bun build scripts/get_frequency.ts --format=cjs --target=node --outfile dist/scripts/get_frequency.js --external electron && electron dist/scripts/get_frequency.js", "build": "tsc && pnpm run build:renderer && cp src/renderer/index.html src/renderer/style.css dist/renderer/ && bash scripts/build-macos-helper.sh", "build:renderer": "esbuild src/renderer/renderer.ts --bundle --platform=browser --format=esm --target=es2022 --outfile=dist/renderer/renderer.js --sourcemap", "check:main-lines": "bash scripts/check-main-lines.sh", diff --git a/scripts/get_frequency.ts b/scripts/get_frequency.ts new file mode 100644 index 0000000..ee3166a --- /dev/null +++ b/scripts/get_frequency.ts @@ -0,0 +1,907 @@ +import fs from "node:fs"; +import path from "node:path"; +import process from "node:process"; + +import { createTokenizerDepsRuntimeService, tokenizeSubtitleService } from "../src/core/services/tokenizer-service.js"; +import { createFrequencyDictionaryLookupService } from "../src/core/services/frequency-dictionary-service.js"; +import { MecabTokenizer } from "../src/mecab-tokenizer.js"; +import type { MergedToken, FrequencyDictionaryLookup } from "../src/types.js"; + +interface CliOptions { + input: string; + dictionaryPath: string; + emitPretty: boolean; + emitVerbose: boolean; + mecabCommand?: string; + mecabDictionaryPath?: string; + forceMecabOnly?: boolean; + yomitanExtensionPath?: string; + yomitanUserDataPath?: string; + emitColoredLine: boolean; + colorMode: "single" | "banded"; + colorTopX: number; + colorSingle: string; + colorBand1: string; + colorBand2: string; + colorBand3: string; + colorBand4: string; + colorBand5: string; + colorKnown: string; + colorNPlusOne: string; +} + +function parseCliArgs(argv: string[]): CliOptions { + const args = [...argv]; + let inputParts: string[] = []; + let dictionaryPath = path.join(process.cwd(), "vendor", "jiten_freq_global"); + let emitPretty = false; + let emitVerbose = false; + let mecabCommand: string | undefined; + let mecabDictionaryPath: string | undefined; + let forceMecabOnly = false; + let yomitanExtensionPath: string | undefined; + let yomitanUserDataPath: string | undefined; + let emitColoredLine = false; + let colorMode: "single" | "banded" = "single"; + let colorTopX = 1000; + let colorSingle = "#f5a97f"; + let colorBand1 = "#ed8796"; + let colorBand2 = "#f5a97f"; + let colorBand3 = "#f9e2af"; + let colorBand4 = "#a6e3a1"; + let colorBand5 = "#8aadf4"; + let colorKnown = "#a6da95"; + let colorNPlusOne = "#c6a0f6"; + + while (args.length > 0) { + const arg = args.shift(); + if (!arg) break; + + if (arg === "--help" || arg === "-h") { + printUsage(); + process.exit(0); + } + + if (arg === "--dictionary") { + const next = args.shift(); + if (!next) { + throw new Error("Missing value for --dictionary"); + } + dictionaryPath = path.resolve(next); + continue; + } + + if (arg === "--mecab-command") { + const next = args.shift(); + if (!next) { + throw new Error("Missing value for --mecab-command"); + } + mecabCommand = next; + continue; + } + + if (arg === "--mecab-dictionary") { + const next = args.shift(); + if (!next) { + throw new Error("Missing value for --mecab-dictionary"); + } + mecabDictionaryPath = next; + continue; + } + + if (arg === "--yomitan-extension") { + const next = args.shift(); + if (!next) { + throw new Error("Missing value for --yomitan-extension"); + } + yomitanExtensionPath = path.resolve(next); + continue; + } + + if (arg === "--yomitan-user-data") { + const next = args.shift(); + if (!next) { + throw new Error("Missing value for --yomitan-user-data"); + } + yomitanUserDataPath = path.resolve(next); + continue; + } + + if (arg === "--colorized-line") { + emitColoredLine = true; + continue; + } + + if (arg === "--color-mode") { + const next = args.shift(); + if (!next) { + throw new Error("Missing value for --color-mode"); + } + if (next !== "single" && next !== "banded") { + throw new Error("--color-mode must be 'single' or 'banded'"); + } + colorMode = next; + continue; + } + + if (arg === "--color-top-x") { + const next = args.shift(); + if (!next) { + throw new Error("Missing value for --color-top-x"); + } + const parsed = Number.parseInt(next, 10); + if (!Number.isFinite(parsed) || parsed <= 0) { + throw new Error("--color-top-x must be a positive integer"); + } + colorTopX = parsed; + continue; + } + + if (arg === "--color-single") { + const next = args.shift(); + if (!next) { + throw new Error("Missing value for --color-single"); + } + colorSingle = next; + continue; + } + + if (arg === "--color-band-1") { + const next = args.shift(); + if (!next) { + throw new Error("Missing value for --color-band-1"); + } + colorBand1 = next; + continue; + } + + if (arg === "--color-band-2") { + const next = args.shift(); + if (!next) { + throw new Error("Missing value for --color-band-2"); + } + colorBand2 = next; + continue; + } + + if (arg === "--color-band-3") { + const next = args.shift(); + if (!next) { + throw new Error("Missing value for --color-band-3"); + } + colorBand3 = next; + continue; + } + + if (arg === "--color-band-4") { + const next = args.shift(); + if (!next) { + throw new Error("Missing value for --color-band-4"); + } + colorBand4 = next; + continue; + } + + if (arg === "--color-band-5") { + const next = args.shift(); + if (!next) { + throw new Error("Missing value for --color-band-5"); + } + colorBand5 = next; + continue; + } + + if (arg === "--color-known") { + const next = args.shift(); + if (!next) { + throw new Error("Missing value for --color-known"); + } + colorKnown = next; + continue; + } + + if (arg === "--color-n-plus-one") { + const next = args.shift(); + if (!next) { + throw new Error("Missing value for --color-n-plus-one"); + } + colorNPlusOne = next; + continue; + } + + if (arg.startsWith("--dictionary=")) { + dictionaryPath = path.resolve(arg.slice("--dictionary=".length)); + continue; + } + + if (arg.startsWith("--mecab-command=")) { + mecabCommand = arg.slice("--mecab-command=".length); + continue; + } + + if (arg.startsWith("--mecab-dictionary=")) { + mecabDictionaryPath = arg.slice("--mecab-dictionary=".length); + continue; + } + + if (arg.startsWith("--yomitan-extension=")) { + yomitanExtensionPath = path.resolve( + arg.slice("--yomitan-extension=".length), + ); + continue; + } + + if (arg.startsWith("--yomitan-user-data=")) { + yomitanUserDataPath = path.resolve( + arg.slice("--yomitan-user-data=".length), + ); + continue; + } + + if (arg.startsWith("--colorized-line")) { + emitColoredLine = true; + continue; + } + + if (arg.startsWith("--color-mode=")) { + const value = arg.slice("--color-mode=".length); + if (value !== "single" && value !== "banded") { + throw new Error("--color-mode must be 'single' or 'banded'"); + } + colorMode = value; + continue; + } + + if (arg.startsWith("--color-top-x=")) { + const value = arg.slice("--color-top-x=".length); + const parsed = Number.parseInt(value, 10); + if (!Number.isFinite(parsed) || parsed <= 0) { + throw new Error("--color-top-x must be a positive integer"); + } + colorTopX = parsed; + continue; + } + + if (arg.startsWith("--color-single=")) { + colorSingle = arg.slice("--color-single=".length); + continue; + } + + if (arg.startsWith("--color-band-1=")) { + colorBand1 = arg.slice("--color-band-1=".length); + continue; + } + + if (arg.startsWith("--color-band-2=")) { + colorBand2 = arg.slice("--color-band-2=".length); + continue; + } + + if (arg.startsWith("--color-band-3=")) { + colorBand3 = arg.slice("--color-band-3=".length); + continue; + } + + if (arg.startsWith("--color-band-4=")) { + colorBand4 = arg.slice("--color-band-4=".length); + continue; + } + + if (arg.startsWith("--color-band-5=")) { + colorBand5 = arg.slice("--color-band-5=".length); + continue; + } + + if (arg.startsWith("--color-known=")) { + colorKnown = arg.slice("--color-known=".length); + continue; + } + + if (arg.startsWith("--color-n-plus-one=")) { + colorNPlusOne = arg.slice("--color-n-plus-one=".length); + continue; + } + + if (arg === "--pretty") { + emitPretty = true; + continue; + } + + if (arg === "--verbose") { + emitVerbose = true; + continue; + } + + if (arg === "--force-mecab") { + forceMecabOnly = true; + continue; + } + + if (arg.startsWith("-")) { + throw new Error(`Unknown flag: ${arg}`); + } + + inputParts.push(arg); + } + + const input = inputParts.join(" ").trim(); + if (!input) { + const stdin = fs.readFileSync(0, "utf8").trim(); + if (!stdin) { + throw new Error( + "Please provide input text as arguments or via stdin.", + ); + } + return { + input: stdin, + dictionaryPath, + emitPretty, + emitVerbose, + forceMecabOnly, + yomitanExtensionPath, + yomitanUserDataPath, + emitColoredLine, + colorMode, + colorTopX, + colorSingle, + colorBand1, + colorBand2, + colorBand3, + colorBand4, + colorBand5, + colorKnown, + colorNPlusOne, + mecabCommand, + mecabDictionaryPath, + }; + } + + return { + input, + dictionaryPath, + emitPretty, + emitVerbose, + forceMecabOnly, + yomitanExtensionPath, + yomitanUserDataPath, + emitColoredLine, + colorMode, + colorTopX, + colorSingle, + colorBand1, + colorBand2, + colorBand3, + colorBand4, + colorBand5, + colorKnown, + colorNPlusOne, + mecabCommand, + mecabDictionaryPath, + }; + } + +function printUsage(): void { + process.stdout.write(`Usage: + pnpm run get-frequency [--pretty] [--verbose] [--dictionary ] [--mecab-command ] [--mecab-dictionary ] + + --pretty Pretty-print JSON output. + --verbose Include merged-frequency diagnostics and lookup terms. + --force-mecab Skip Yomitan parser initialization and force MeCab fallback. + --yomitan-extension Optional path to a Yomitan extension directory. + --yomitan-user-data Optional Electron userData directory for Yomitan state. + --colorized-line Output a terminal-colorized line based on token classification. + --color-mode Frequency coloring mode (default: single). + --color-top-x Frequency color applies when rank <= n (default: 1000). + --color-single <#hex> Frequency single-mode color (default: #f5a97f). + --color-band-1 <#hex> Frequency band-1 color. + --color-band-2 <#hex> Frequency band-2 color. + --color-band-3 <#hex> Frequency band-3 color. + --color-band-4 <#hex> Frequency band-4 color. + --color-band-5 <#hex> Frequency band-5 color. + --color-known <#hex> Known-word color (default: #a6da95). + --color-n-plus-one <#hex> N+1 target color (default: #c6a0f6). + --dictionary Frequency dictionary root path (default: ./vendor/jiten_freq_global) + --mecab-command Optional MeCab binary path (default: mecab) + --mecab-dictionary Optional MeCab dictionary directory (default: system default) + -h, --help Show usage. +\n`); +} + +type FrequencyCandidate = { + term: string; + rank: number; +}; + +function getFrequencyLookupTextCandidates(token: MergedToken): string[] { + const tokenWithCandidates = token as MergedToken & { + frequencyLookupTerms?: string[]; + }; + const lookupTextCandidates: string[] = []; + const addLookupText = (text: string | undefined): void => { + if (!text) { + return; + } + const trimmed = text.trim(); + if (!trimmed) { + return; + } + lookupTextCandidates.push(trimmed); + }; + + if (Array.isArray(tokenWithCandidates.frequencyLookupTerms)) { + for (const term of tokenWithCandidates.frequencyLookupTerms) { + addLookupText(term); + } + } + + addLookupText(token.headword); + addLookupText(token.reading); + addLookupText(token.surface); + + const uniqueLookupTerms: string[] = []; + const seen = new Set(); + for (const term of lookupTextCandidates) { + if (seen.has(term)) { + continue; + } + seen.add(term); + uniqueLookupTerms.push(term); + } + return uniqueLookupTerms; +} + +function getBestFrequencyLookupCandidate( + token: MergedToken, + getFrequencyRank: FrequencyDictionaryLookup, +): FrequencyCandidate | null { + const lookupTexts = getFrequencyLookupTextCandidates(token); + let best: FrequencyCandidate | null = null; + for (const term of lookupTexts) { + const rank = getFrequencyRank(term); + if (typeof rank !== "number" || !Number.isFinite(rank) || rank <= 0) { + continue; + } + if (!best || rank < best.rank) { + best = { term, rank }; + } + } + return best; +} + +function simplifyToken(token: MergedToken): Record { + return { + surface: token.surface, + reading: token.reading, + headword: token.headword, + startPos: token.startPos, + endPos: token.endPos, + partOfSpeech: token.partOfSpeech, + isMerged: token.isMerged, + isKnown: token.isKnown, + isNPlusOneTarget: token.isNPlusOneTarget, + frequencyRank: token.frequencyRank, + jlptLevel: token.jlptLevel, + }; +} + +function simplifyTokenWithVerbose( + token: MergedToken, + getFrequencyRank: FrequencyDictionaryLookup, +): Record { + const tokenWithCandidates = token as MergedToken & { + frequencyLookupTerms?: string[]; + }; + const frequencyLookupTerms = tokenWithCandidates.frequencyLookupTerms; + const candidates = getFrequencyLookupTextCandidates(token).map((term) => ({ + term, + rank: getFrequencyRank(term), + })).filter((candidate) => + typeof candidate.rank === "number" && + Number.isFinite(candidate.rank) && + candidate.rank > 0 + ); + + const bestCandidate = getBestFrequencyLookupCandidate( + token, + getFrequencyRank, + ); + + return { + surface: token.surface, + reading: token.reading, + headword: token.headword, + startPos: token.startPos, + endPos: token.endPos, + partOfSpeech: token.partOfSpeech, + isMerged: token.isMerged, + isKnown: token.isKnown, + isNPlusOneTarget: token.isNPlusOneTarget, + frequencyRank: token.frequencyRank, + jlptLevel: token.jlptLevel, + frequencyLookupTerms: + Array.isArray(frequencyLookupTerms) && frequencyLookupTerms.length > 0 + ? frequencyLookupTerms + : undefined, + frequencyCandidates: candidates, + frequencyBestLookupTerm: bestCandidate?.term ?? null, + frequencyBestLookupRank: bestCandidate?.rank ?? null, + }; +} + +interface YomitanRuntimeState { + yomitanExt: unknown | null; + parserWindow: unknown | null; + parserReadyPromise: Promise | null; + parserInitPromise: Promise | null; + available: boolean; + note?: string; +} + +async function createYomitanRuntimeState( + userDataPath: string, +): Promise { + const state: YomitanRuntimeState = { + yomitanExt: null, + parserWindow: null, + parserReadyPromise: null, + parserInitPromise: null, + available: false, + }; + + const electronImport = await import("electron").catch((error) => { + state.note = error instanceof Error ? error.message : "unknown error"; + return null; + }); + if (!electronImport || !electronImport.app || !electronImport.app.whenReady) { + state.note = "electron runtime not available in this process"; + return state; + } + + try { + await electronImport.app.whenReady(); + const loadYomitanExtensionService = ( + await import( + "../src/core/services/yomitan-extension-loader-service.js" + ) + ).loadYomitanExtensionService as ( + options: { + userDataPath: string; + getYomitanParserWindow: () => unknown; + setYomitanParserWindow: (window: unknown) => void; + setYomitanParserReadyPromise: (promise: Promise | null) => void; + setYomitanParserInitPromise: (promise: Promise | null) => void; + setYomitanExtension: (extension: unknown) => void; + }, + ) => Promise; + + const extension = await loadYomitanExtensionService({ + userDataPath, + getYomitanParserWindow: () => state.parserWindow, + setYomitanParserWindow: (window) => { + state.parserWindow = window; + }, + setYomitanParserReadyPromise: (promise) => { + state.parserReadyPromise = promise; + }, + setYomitanParserInitPromise: (promise) => { + state.parserInitPromise = promise; + }, + setYomitanExtension: (extension) => { + state.yomitanExt = extension; + }, + }); + + if (!extension) { + state.note = "yomitan extension is not available"; + return state; + } + + state.yomitanExt = extension; + state.available = true; + return state; + } catch (error) { + state.note = + error instanceof Error + ? error.message + : "failed to initialize yomitan extension"; + return state; + } +} + +async function createYomitanRuntimeStateWithSearch( + userDataPath: string, + extensionPath?: string, +): Promise { + const preferredPath = extensionPath + ? path.resolve(extensionPath) + : undefined; + const defaultVendorPath = path.resolve(process.cwd(), "vendor", "yomitan"); + const candidates = [ + ...(preferredPath ? [preferredPath] : []), + defaultVendorPath, + ]; + + for (const candidate of candidates) { + if (!candidate) { + continue; + } + try { + if (fs.existsSync(path.join(candidate, "manifest.json"))) { + const state = await createYomitanRuntimeState(userDataPath); + if (state.available) { + return state; + } + if (!state.note) { + state.note = `Failed to load yomitan extension at ${candidate}`; + } + return state; + } + } catch { + continue; + } + } + + return createYomitanRuntimeState(userDataPath); +} + +async function getFrequencyLookup(dictionaryPath: string): Promise { + return createFrequencyDictionaryLookupService({ + searchPaths: [dictionaryPath], + log: (message) => { + // Keep script output pure JSON by default + if (process.env.DEBUG_FREQUENCY === "1") { + console.error(message); + } + }, + }); +} + +const ANSI_RESET = "\u001b[0m"; +const ANSI_FG_PREFIX = "\u001b[38;2"; +const HEX_COLOR_PATTERN = /^#(?:[0-9a-fA-F]{3}|[0-9a-fA-F]{6})$/; + +function parseHexRgb(input: string): [number, number, number] | null { + const normalized = input.trim().replace(/^#/, ""); + if (!HEX_COLOR_PATTERN.test(`#${normalized}`)) { + return null; + } + const expanded = normalized.length === 3 + ? normalized.split("").map((char) => `${char}${char}`).join("") + : normalized; + const r = Number.parseInt(expanded.substring(0, 2), 16); + const g = Number.parseInt(expanded.substring(2, 4), 16); + const b = Number.parseInt(expanded.substring(4, 6), 16); + if ( + !Number.isFinite(r) || + !Number.isFinite(g) || + !Number.isFinite(b) + ) { + return null; + } + return [r, g, b]; +} + +function wrapWithForeground(text: string, color: string): string { + const rgb = parseHexRgb(color); + if (!rgb) { + return text; + } + return `${ANSI_FG_PREFIX};${rgb[0]};${rgb[1]};${rgb[2]}m${text}${ANSI_RESET}`; +} + +function getBandColor( + rank: number, + colorTopX: number, + colorMode: "single" | "banded", + colorSingle: string, + bandedColors: [string, string, string, string, string], +): string { + const topX = Math.max(1, Math.floor(colorTopX)); + const safeRank = Math.max(1, Math.floor(rank)); + if (safeRank > topX) { + return ""; + } + if (colorMode === "single") { + return colorSingle; + } + const normalizedBand = Math.ceil((safeRank / topX) * bandedColors.length); + const band = Math.min(bandedColors.length, Math.max(1, normalizedBand)); + return bandedColors[band - 1]; +} + +function getTokenColor(token: MergedToken, args: CliOptions): string { + if (token.isNPlusOneTarget) { + return args.colorNPlusOne; + } + if (token.isKnown) { + return args.colorKnown; + } + if (typeof token.frequencyRank === "number" && Number.isFinite(token.frequencyRank)) { + return getBandColor( + token.frequencyRank, + args.colorTopX, + args.colorMode, + args.colorSingle, + [args.colorBand1, args.colorBand2, args.colorBand3, args.colorBand4, args.colorBand5], + ); + } + return ""; +} + +function renderColoredLine( + text: string, + tokens: MergedToken[], + args: CliOptions, +): string { + if (!args.emitColoredLine) { + return text; + } + if (tokens.length === 0) { + return text; + } + + const ordered = [...tokens].sort((a, b) => { + const aStart = a.startPos ?? 0; + const bStart = b.startPos ?? 0; + if (aStart !== bStart) { + return aStart - bStart; + } + return (a.endPos ?? a.surface.length) - (b.endPos ?? b.surface.length); + }); + + let cursor = 0; + let output = ""; + for (const token of ordered) { + const start = token.startPos ?? 0; + const end = token.endPos ?? (token.startPos ? token.startPos + token.surface.length : token.surface.length); + if (start < 0 || end < 0 || end < start) { + continue; + } + const safeStart = Math.min(Math.max(0, start), text.length); + const safeEnd = Math.min(Math.max(safeStart, end), text.length); + if (safeStart > cursor) { + output += text.slice(cursor, safeStart); + } + const tokenText = text.slice(safeStart, safeEnd); + const color = getTokenColor(token, args); + output += color ? wrapWithForeground(tokenText, color) : tokenText; + cursor = safeEnd; + } + + if (cursor < text.length) { + output += text.slice(cursor); + } + return output; +} + +async function main(): Promise { + const args = parseCliArgs(process.argv.slice(2)); + const getFrequencyRank = await getFrequencyLookup(args.dictionaryPath); + + const mecabTokenizer = new MecabTokenizer({ + mecabCommand: args.mecabCommand, + dictionaryPath: args.mecabDictionaryPath, + }); + const isMecabAvailable = await mecabTokenizer.checkAvailability(); + if (!isMecabAvailable) { + throw new Error( + "MeCab is not available on this system. Install/run environment with MeCab to tokenize input.", + ); + } + + const app = await import("electron").catch(() => null); + if (app && args.yomitanUserDataPath) { + app.app.setPath("userData", args.yomitanUserDataPath); + } + const yomitanState = + !args.forceMecabOnly + ? await createYomitanRuntimeStateWithSearch( + app?.app?.getPath ? app.app.getPath("userData") : process.cwd(), + args.yomitanExtensionPath, + ) + : null; + const hasYomitan = Boolean(yomitanState?.available && yomitanState?.yomitanExt); + + const deps = createTokenizerDepsRuntimeService({ + getYomitanExt: () => + (hasYomitan ? yomitanState!.yomitanExt : null) as never, + getYomitanParserWindow: () => + (hasYomitan ? yomitanState!.parserWindow : null) as never, + setYomitanParserWindow: (window) => { + if (!hasYomitan) { + return; + } + yomitanState!.parserWindow = window; + }, + getYomitanParserReadyPromise: () => + (hasYomitan ? yomitanState!.parserReadyPromise : null) as never, + setYomitanParserReadyPromise: (promise) => { + if (!hasYomitan) { + return; + } + yomitanState!.parserReadyPromise = promise; + }, + getYomitanParserInitPromise: () => + (hasYomitan ? yomitanState!.parserInitPromise : null) as never, + setYomitanParserInitPromise: (promise) => { + if (!hasYomitan) { + return; + } + yomitanState!.parserInitPromise = promise; + }, + isKnownWord: () => false, + getKnownWordMatchMode: () => "headword", + getJlptLevel: () => null, + getFrequencyDictionaryEnabled: () => true, + getFrequencyRank, + getMecabTokenizer: () => ({ + tokenize: (text: string) => mecabTokenizer.tokenize(text), + }), + }); + + const subtitleData = await tokenizeSubtitleService(args.input, deps); + const tokenCount = subtitleData.tokens?.length ?? 0; + const mergedCount = subtitleData.tokens?.filter((token) => token.isMerged).length ?? 0; + const hasYomitanCandidates = Boolean( + subtitleData.tokens?.some((token) => { + const frequencyLookupTerms = ( + token as MergedToken & { frequencyLookupTerms?: string[] } + ).frequencyLookupTerms; + return Array.isArray(frequencyLookupTerms) && frequencyLookupTerms.length > 0; + }) ?? false, + ); + const tokens = + subtitleData.tokens?.map((token) => + args.emitVerbose + ? simplifyTokenWithVerbose(token, getFrequencyRank) + : simplifyToken(token), + ) ?? null; + const diagnostics = { + yomitan: { + available: Boolean(yomitanState?.available), + loaded: hasYomitan, + forceMecabOnly: args.forceMecabOnly, + note: yomitanState?.note ?? null, + }, + mecab: { + command: args.mecabCommand ?? "mecab", + dictionaryPath: args.mecabDictionaryPath ?? null, + available: isMecabAvailable, + }, + tokenizer: { + sourceHint: + tokenCount === 0 + ? "none" + : hasYomitan ? "yomitan-merged" : "mecab-merge", + mergedTokenCount: mergedCount, + totalTokenCount: tokenCount, + }, + }; + if (tokens === null) { + diagnostics.mecab["status"] = "no-tokens"; + diagnostics.mecab["note"] = + "MeCab returned no parseable tokens. This is often caused by a missing/invalid MeCab dictionary path."; + } else { + diagnostics.mecab["status"] = "ok"; + } + + const output = { + input: args.input, + tokenizerText: subtitleData.text, + tokens, + diagnostics, + }; + + const json = JSON.stringify(output, null, args.emitPretty ? 2 : undefined); + process.stdout.write(`${json}\n`); + + if (args.emitColoredLine && subtitleData.tokens) { + const coloredLine = renderColoredLine(subtitleData.text, subtitleData.tokens, args); + process.stdout.write(`${coloredLine}\n`); + } +} + +main().catch((error) => { + console.error(`Error: ${(error as Error).message}`); + process.exit(1); +}); diff --git a/src/core/services/tokenizer-service.test.ts b/src/core/services/tokenizer-service.test.ts index c2747d2..5e87264 100644 --- a/src/core/services/tokenizer-service.test.ts +++ b/src/core/services/tokenizer-service.test.ts @@ -228,6 +228,223 @@ test("tokenizeSubtitleService applies frequency dictionary ranks", async () => { assert.equal(result.tokens?.[1]?.frequencyRank, 1200); }); +test("tokenizeSubtitleService uses all Yomitan headword candidates for frequency lookup", async () => { + const result = await tokenizeSubtitleService( + "猫です", + makeDeps({ + getFrequencyDictionaryEnabled: () => true, + getYomitanExt: () => ({ id: "dummy-ext" } as any), + getYomitanParserWindow: () => ({ + isDestroyed: () => false, + webContents: { + executeJavaScript: async () => [ + { + source: "scanning-parser", + index: 0, + content: [ + [ + { + text: "猫です", + reading: "ねこです", + headwords: [ + [{ term: "猫です" }], + [{ term: "猫" }], + ], + }, + ], + ], + }, + ], + }, + } as unknown as Electron.BrowserWindow), + getFrequencyRank: (text) => (text === "猫" ? 40 : text === "猫です" ? 1200 : null), + }), + ); + + assert.equal(result.tokens?.length, 1); + assert.equal(result.tokens?.[0]?.frequencyRank, 40); +}); + +test("tokenizeSubtitleService prefers exact headword frequency over surface/reading when available", async () => { + const result = await tokenizeSubtitleService( + "猫です", + makeDeps({ + getFrequencyDictionaryEnabled: () => true, + getYomitanExt: () => ({ id: "dummy-ext" } as any), + getYomitanParserWindow: () => ({ + isDestroyed: () => false, + webContents: { + executeJavaScript: async () => [ + { + source: "scanning-parser", + index: 0, + content: [ + [ + { + text: "猫", + reading: "ねこ", + headwords: [[{ term: "ネコ" }]], + }, + ], + ], + }, + ], + }, + } as unknown as Electron.BrowserWindow), + getFrequencyRank: (text) => (text === "猫" ? 1200 : text === "ネコ" ? 8 : null), + }), + ); + + assert.equal(result.tokens?.length, 1); + assert.equal(result.tokens?.[0]?.frequencyRank, 8); +}); + +test("tokenizeSubtitleService keeps no frequency when only reading matches and headword candidates miss", async () => { + const result = await tokenizeSubtitleService( + "猫です", + makeDeps({ + getFrequencyDictionaryEnabled: () => true, + getYomitanExt: () => ({ id: "dummy-ext" } as any), + getYomitanParserWindow: () => ({ + isDestroyed: () => false, + webContents: { + executeJavaScript: async () => [ + { + source: "scanning-parser", + index: 0, + content: [ + [ + { + text: "猫", + reading: "ねこ", + headwords: [[{ term: "猫です" }]], + }, + ], + ], + }, + ], + }, + } as unknown as Electron.BrowserWindow), + getFrequencyRank: (text) => (text === "ねこ" ? 77 : null), + }), + ); + + assert.equal(result.tokens?.length, 1); + assert.equal(result.tokens?.[0]?.frequencyRank, undefined); +}); + +test("tokenizeSubtitleService ignores invalid frequency ranks and takes best valid headword candidate", async () => { + const result = await tokenizeSubtitleService( + "猫です", + makeDeps({ + getFrequencyDictionaryEnabled: () => true, + getYomitanExt: () => ({ id: "dummy-ext" } as any), + getYomitanParserWindow: () => ({ + isDestroyed: () => false, + webContents: { + executeJavaScript: async () => [ + { + source: "scanning-parser", + index: 0, + content: [ + [ + { + text: "猫です", + reading: "ねこです", + headwords: [ + [{ term: "猫" }], + [{ term: "猫です" }], + ], + }, + ], + ], + }, + ], + }, + } as unknown as Electron.BrowserWindow), + getFrequencyRank: (text) => (text === "猫" ? Number.NaN : text === "猫です" ? 500 : null), + }), + ); + + assert.equal(result.tokens?.length, 1); + assert.equal(result.tokens?.[0]?.frequencyRank, 500); +}); + +test("tokenizeSubtitleService handles real-word frequency candidates and prefers most frequent term", async () => { + const result = await tokenizeSubtitleService( + "昨日", + makeDeps({ + getFrequencyDictionaryEnabled: () => true, + getYomitanExt: () => ({ id: "dummy-ext" } as any), + getYomitanParserWindow: () => ({ + isDestroyed: () => false, + webContents: { + executeJavaScript: async () => [ + { + source: "scanning-parser", + index: 0, + content: [ + [ + { + text: "昨日", + reading: "きのう", + headwords: [ + [{ term: "昨日" }], + [{ term: "きのう" }], + ], + }, + ], + ], + }, + ], + }, + } as unknown as Electron.BrowserWindow), + getFrequencyRank: (text) => (text === "きのう" ? 120 : text === "昨日" ? 40 : null), + }), + ); + + assert.equal(result.tokens?.length, 1); + assert.equal(result.tokens?.[0]?.frequencyRank, 40); +}); + +test("tokenizeSubtitleService ignores candidates with no dictionary rank when higher-frequency candidate exists", async () => { + const result = await tokenizeSubtitleService( + "猫です", + makeDeps({ + getFrequencyDictionaryEnabled: () => true, + getYomitanExt: () => ({ id: "dummy-ext" } as any), + getYomitanParserWindow: () => ({ + isDestroyed: () => false, + webContents: { + executeJavaScript: async () => [ + { + source: "scanning-parser", + index: 0, + content: [ + [ + { + text: "猫", + reading: "ねこ", + headwords: [ + [{ term: "猫" }], + [{ term: "猫です" }], + [{ term: "unknown-term" }], + ], + }, + ], + ], + }, + ], + }, + } as unknown as Electron.BrowserWindow), + getFrequencyRank: (text) => (text === "unknown-term" ? -1 : text === "猫" ? 88 : text === "猫です" ? 9000 : null), + }), + ); + + assert.equal(result.tokens?.length, 1); + assert.equal(result.tokens?.[0]?.frequencyRank, 88); +}); + test("tokenizeSubtitleService ignores frequency lookup failures", async () => { const result = await tokenizeSubtitleService( "猫", @@ -557,10 +774,147 @@ test("tokenizeSubtitleService uses Yomitan parser result when available", async ); assert.equal(result.text, "猫です"); - assert.equal(result.tokens?.length, 1); - assert.equal(result.tokens?.[0]?.surface, "猫です"); - assert.equal(result.tokens?.[0]?.reading, "ねこです"); + assert.equal(result.tokens?.length, 2); + assert.equal(result.tokens?.[0]?.surface, "猫"); + assert.equal(result.tokens?.[0]?.reading, "ねこ"); assert.equal(result.tokens?.[0]?.isKnown, false); + assert.equal(result.tokens?.[1]?.surface, "です"); + assert.equal(result.tokens?.[1]?.reading, "です"); + assert.equal(result.tokens?.[1]?.isKnown, false); +}); + +test("tokenizeSubtitleService prefers mecab parser tokens when scanning parser returns one token", async () => { + const result = await tokenizeSubtitleService( + "俺は小園にいきたい", + makeDeps({ + getYomitanExt: () => ({ id: "dummy-ext" } as any), + getYomitanParserWindow: () => ({ + isDestroyed: () => false, + webContents: { + executeJavaScript: async () => [ + { + source: "scanning-parser", + index: 0, + content: [ + [ + { + text: "俺は小園にいきたい", + reading: "おれは小園にいきたい", + headwords: [[{ term: "俺は小園にいきたい" }]], + }, + ], + ], + }, + { + source: "mecab", + index: 0, + content: [ + [{ text: "俺", reading: "おれ", headwords: [[{ term: "俺" }]] }], + [{ text: "は", reading: "は", headwords: [[{ term: "は" }]] }], + [{ text: "小園", reading: "おうえん", headwords: [[{ term: "小園" }]] }], + [{ text: "に", reading: "に", headwords: [[{ term: "に" }]] }], + [{ text: "いきたい", reading: "いきたい", headwords: [[{ term: "いきたい" }]] }], + ], + }, + ], + }, + } as unknown as Electron.BrowserWindow), + getFrequencyDictionaryEnabled: () => true, + tokenizeWithMecab: async () => null, + getFrequencyRank: (text) => + text === "小園" ? 25 : text === "いきたい" ? 1500 : null, + }), + ); + + assert.equal(result.tokens?.length, 5); + assert.equal(result.tokens?.map((token) => token.surface).join(","), "俺,は,小園,に,いきたい"); + assert.equal(result.tokens?.[2]?.surface, "小園"); + assert.equal(result.tokens?.[2]?.frequencyRank, 25); +}); + +test("tokenizeSubtitleService keeps scanning parser tokens when they are already split", async () => { + const result = await tokenizeSubtitleService( + "小園に行きたい", + makeDeps({ + getYomitanExt: () => ({ id: "dummy-ext" } as any), + getYomitanParserWindow: () => ({ + isDestroyed: () => false, + webContents: { + executeJavaScript: async () => [ + { + source: "scanning-parser", + index: 0, + content: [ + [{ text: "小園", reading: "おうえん", headwords: [[{ term: "小園" }]] }], + [{ text: "に", reading: "に", headwords: [[{ term: "に" }]] }], + [{ text: "行きたい", reading: "いきたい", headwords: [[{ term: "行きたい" }]] }], + ], + }, + { + source: "mecab", + index: 0, + content: [ + [{ text: "小", reading: "お", headwords: [[{ term: "小" }]] }], + [{ text: "園", reading: "えん", headwords: [[{ term: "園" }]] }], + [{ text: "に", reading: "に", headwords: [[{ term: "に" }]] }], + [{ text: "行き", reading: "いき", headwords: [[{ term: "行き" }]] }], + [{ text: "たい", reading: "たい", headwords: [[{ term: "たい" }]] }], + ], + }, + ], + }, + } as unknown as Electron.BrowserWindow), + getFrequencyDictionaryEnabled: () => true, + getFrequencyRank: (text) => (text === "小園" ? 20 : null), + tokenizeWithMecab: async () => null, + }), + ); + + assert.equal(result.tokens?.length, 3); + assert.equal( + result.tokens?.map((token) => token.surface).join(","), + "小園,に,行きたい", + ); + assert.equal(result.tokens?.[0]?.frequencyRank, 20); + assert.equal(result.tokens?.[1]?.frequencyRank, undefined); + assert.equal(result.tokens?.[2]?.frequencyRank, undefined); +}); + +test("tokenizeSubtitleService still assigns frequency to non-known Yomitan tokens", async () => { + const result = await tokenizeSubtitleService( + "小園に", + makeDeps({ + getYomitanExt: () => ({ id: "dummy-ext" } as any), + getYomitanParserWindow: () => ({ + isDestroyed: () => false, + webContents: { + executeJavaScript: async () => [ + { + source: "scanning-parser", + index: 0, + content: [ + [ + { text: "小園", reading: "おうえん", headwords: [[{ term: "小園" }]] }, + ], + [ + { text: "に", reading: "に", headwords: [[{ term: "に" }]] }, + ], + ], + }, + ], + }, + } as unknown as Electron.BrowserWindow), + getFrequencyDictionaryEnabled: () => true, + getFrequencyRank: (text) => (text === "小園" ? 75 : text === "に" ? 3000 : null), + isKnownWord: (text) => text === "小園", + }), + ); + + assert.equal(result.tokens?.length, 2); + assert.equal(result.tokens?.[0]?.isKnown, true); + assert.equal(result.tokens?.[0]?.frequencyRank, 75); + assert.equal(result.tokens?.[1]?.isKnown, false); + assert.equal(result.tokens?.[1]?.frequencyRank, undefined); }); test("tokenizeSubtitleService marks tokens as known using callback", async () => { @@ -589,6 +943,63 @@ test("tokenizeSubtitleService marks tokens as known using callback", async () => assert.equal(result.tokens?.[0]?.isKnown, true); }); +test("tokenizeSubtitleService still assigns frequency rank to non-known tokens", async () => { + const result = await tokenizeSubtitleService( + "既知未知", + makeDeps({ + tokenizeWithMecab: async () => [ + { + surface: "既知", + reading: "キチ", + partOfSpeech: PartOfSpeech.noun, + pos1: "", + pos2: "", + pos3: "", + pos4: "", + inflectionType: "", + inflectionForm: "", + headword: "既知", + katakanaReading: "キチ", + pronunciation: "キチ", + startPos: 0, + endPos: 2, + isMerged: false, + isKnown: false, + isNPlusOneTarget: false, + }, + { + surface: "未知", + reading: "ミチ", + partOfSpeech: PartOfSpeech.noun, + pos1: "", + pos2: "", + pos3: "", + pos4: "", + inflectionType: "", + inflectionForm: "", + headword: "未知", + katakanaReading: "ミチ", + pronunciation: "ミチ", + startPos: 2, + endPos: 4, + isMerged: false, + isKnown: false, + isNPlusOneTarget: false, + }, + ], + getFrequencyDictionaryEnabled: () => true, + getFrequencyRank: (text) => (text === "既知" ? 20 : text === "未知" ? 30 : null), + isKnownWord: (text) => text === "既知", + }), + ); + + assert.equal(result.tokens?.length, 2); + assert.equal(result.tokens?.[0]?.isKnown, true); + assert.equal(result.tokens?.[0]?.frequencyRank, 20); + assert.equal(result.tokens?.[1]?.isKnown, false); + assert.equal(result.tokens?.[1]?.frequencyRank, 30); +}); + test("tokenizeSubtitleService selects one N+1 target token", async () => { const result = await tokenizeSubtitleService( "猫です", diff --git a/src/core/services/tokenizer-service.ts b/src/core/services/tokenizer-service.ts index c4d9724..69d99f9 100644 --- a/src/core/services/tokenizer-service.ts +++ b/src/core/services/tokenizer-service.ts @@ -1,4 +1,4 @@ -import { BrowserWindow, Extension, session } from "electron"; +import type { BrowserWindow, Extension } from "electron"; import { markNPlusOneTargets, mergeTokens } from "../../token-merger"; import { JlptLevel, @@ -252,20 +252,67 @@ function resolveFrequencyLookupText(token: MergedToken): string { return token.surface; } +function getFrequencyLookupTextCandidates(token: MergedToken): string[] { + const tokenWithCandidates = token as MergedToken & { + frequencyLookupTerms?: string[]; + }; + const lookupTextCandidates: string[] = []; + const addLookupText = (text: string | undefined): void => { + if (!text) { + return; + } + const trimmed = text.trim(); + if (!trimmed) { + return; + } + lookupTextCandidates.push(trimmed); + }; + + if (Array.isArray(tokenWithCandidates.frequencyLookupTerms)) { + for (const term of tokenWithCandidates.frequencyLookupTerms) { + addLookupText(term); + } + } + + addLookupText(resolveFrequencyLookupText(token)); + + const uniqueLookupTerms: string[] = []; + const seen = new Set(); + for (const term of lookupTextCandidates) { + if (seen.has(term)) { + continue; + } + seen.add(term); + uniqueLookupTerms.push(term); + } + + return uniqueLookupTerms; +} + function applyFrequencyMarking( tokens: MergedToken[], getFrequencyRank: FrequencyDictionaryLookup, ): MergedToken[] { return tokens.map((token) => { - const lookupText = resolveFrequencyLookupText(token); - if (!lookupText) { + const lookupTexts = getFrequencyLookupTextCandidates(token); + if (lookupTexts.length === 0) { return { ...token, frequencyRank: undefined }; } - const rank = getCachedFrequencyRank(lookupText, getFrequencyRank); + let bestRank: number | null = null; + for (const lookupText of lookupTexts) { + const rank = getCachedFrequencyRank(lookupText, getFrequencyRank); + if (rank === null) { + continue; + } + if (bestRank === null || rank < bestRank) { + bestRank = rank; + } + } + return { ...token, - frequencyRank: rank ?? undefined, + frequencyRank: bestRank ?? undefined, }; }); } @@ -397,7 +444,7 @@ function isYomitanParseResultItem( if (!isObject(value)) { return false; } - if ((value as YomitanParseResultItem).source !== "scanning-parser") { + if (!isString((value as YomitanParseResultItem).source)) { return false; } if (!Array.isArray((value as YomitanParseResultItem).content)) { @@ -452,6 +499,27 @@ function extractYomitanHeadword(segment: YomitanParseSegment): string { return ""; } +function extractYomitanHeadwords(segment: YomitanParseSegment): string[] { + const headwords = segment.headwords; + if (!isYomitanHeadwordRows(headwords)) { + return []; + } + + const results: string[] = []; + for (const group of headwords) { + for (const candidate of group) { + if (isString(candidate.term)) { + const term = candidate.term.trim(); + if (term.length > 0) { + results.push(term); + } + } + } + } + + return results; +} + function applyJlptMarking( tokens: MergedToken[], getJlptLevel: (text: string) => JlptLevel | null, @@ -475,29 +543,27 @@ function applyJlptMarking( }); } -function mapYomitanParseResultsToMergedTokens( - parseResults: unknown, +interface YomitanParseCandidate { + source: string; + index: number; + tokens: MergedToken[]; +} + +function mapYomitanParseResultItemToMergedTokens( + parseResult: YomitanParseResultItem, isKnownWord: (text: string) => boolean, knownWordMatchMode: NPlusOneMatchMode, -): MergedToken[] | null { - if (!Array.isArray(parseResults) || parseResults.length === 0) { +): YomitanParseCandidate | null { + const content = parseResult.content; + if (!Array.isArray(content) || content.length === 0) { return null; } - const scanningItems = parseResults.filter( - (item): item is YomitanParseResultItem => isYomitanParseResultItem(item), - ); - - if (scanningItems.length === 0) { - return null; - } - - const primaryItem = - scanningItems.find((item) => item.index === 0) || scanningItems[0]; - const content = primaryItem.content; - if (!Array.isArray(content)) { - return null; - } + const source = String(parseResult.source ?? ""); + const index = + typeof parseResult.index === "number" && Number.isInteger(parseResult.index) + ? parseResult.index + : 0; const tokens: MergedToken[] = []; let charOffset = 0; @@ -509,60 +575,117 @@ function mapYomitanParseResultsToMergedTokens( } validLineCount += 1; - let surface = ""; - let reading = ""; - let headword = ""; - for (const segment of line) { const segmentText = segment.text; if (!segmentText || segmentText.length === 0) { continue; } - surface += segmentText; + const start = charOffset; + const end = start + segmentText.length; + charOffset = end; - if (typeof segment.reading === "string") { - reading += segment.reading; - } + const headword = extractYomitanHeadword(segment) || segmentText; + const frequencyLookupTerms = extractYomitanHeadwords(segment); - if (!headword) { - headword = extractYomitanHeadword(segment); - } + tokens.push({ + surface: segmentText, + reading: typeof segment.reading === "string" ? segment.reading : "", + headword, + startPos: start, + endPos: end, + partOfSpeech: PartOfSpeech.other, + pos1: "", + isMerged: true, + isNPlusOneTarget: false, + isKnown: (() => { + const matchText = resolveKnownWordText( + segmentText, + headword, + knownWordMatchMode, + ); + return matchText ? isKnownWord(matchText) : false; + })(), + frequencyLookupTerms: + frequencyLookupTerms.length > 0 ? frequencyLookupTerms : undefined, + }); } - - if (!surface) { - continue; - } - - const start = charOffset; - const end = start + surface.length; - charOffset = end; - - tokens.push({ - surface, - reading, - headword: headword || surface, - startPos: start, - endPos: end, - partOfSpeech: PartOfSpeech.other, - pos1: "", - isMerged: true, - isNPlusOneTarget: false, - isKnown: (() => { - const matchText = resolveKnownWordText( - surface, - headword, - knownWordMatchMode, - ); - return matchText ? isKnownWord(matchText) : false; - })(), - }); } - if (validLineCount === 0) { + if (validLineCount === 0 || tokens.length === 0) { return null; } - return tokens.length > 0 ? tokens : null; + + return { source, index, tokens }; +} + +function selectBestYomitanParseCandidate( + candidates: YomitanParseCandidate[], +): MergedToken[] | null { + if (candidates.length === 0) { + return null; + } + + const scanningCandidates = candidates.filter( + (candidate) => candidate.source === "scanning-parser", + ); + const mecabCandidates = candidates.filter( + (candidate) => candidate.source === "mecab", + ); + + const getBestByTokenCount = ( + items: YomitanParseCandidate[], + ): YomitanParseCandidate | null => items.length === 0 + ? null + : items.reduce((best, current) => + current.tokens.length > best.tokens.length ? current : best, + ); + + if (scanningCandidates.length > 0) { + const bestScanning = getBestByTokenCount(scanningCandidates); + if (bestScanning && bestScanning.tokens.length > 1) { + return bestScanning.tokens; + } + + const bestMecab = getBestByTokenCount(mecabCandidates); + if ( + bestMecab && + bestMecab.tokens.length > (bestScanning?.tokens.length ?? 0) + ) { + return bestMecab.tokens; + } + + return bestScanning ? bestScanning.tokens : null; + } + + const bestCandidate = getBestByTokenCount(candidates); + return bestCandidate ? bestCandidate.tokens : null; +} + +function mapYomitanParseResultsToMergedTokens( + parseResults: unknown, + isKnownWord: (text: string) => boolean, + knownWordMatchMode: NPlusOneMatchMode, +): MergedToken[] | null { + if (!Array.isArray(parseResults) || parseResults.length === 0) { + return null; + } + + const candidates = parseResults + .filter((item): item is YomitanParseResultItem => + isYomitanParseResultItem(item), + ) + .map((item) => + mapYomitanParseResultItemToMergedTokens( + item, + isKnownWord, + knownWordMatchMode, + ), + ) + .filter((candidate): candidate is YomitanParseCandidate => candidate !== null); + + const bestCandidate = selectBestYomitanParseCandidate(candidates); + return bestCandidate; } function pickClosestMecabPos1( @@ -664,6 +787,7 @@ async function enrichYomitanPos1( async function ensureYomitanParserWindow( deps: TokenizerServiceDeps, ): Promise { + const electron = await import("electron"); const yomitanExt = deps.getYomitanExt(); if (!yomitanExt) { return false; @@ -680,6 +804,7 @@ async function ensureYomitanParserWindow( } const initPromise = (async () => { + const { BrowserWindow, session } = electron; const parserWindow = new BrowserWindow({ show: false, width: 800, @@ -786,7 +911,7 @@ async function parseWithYomitanInternalParser( optionsContext: { index: profileIndex }, scanLength, useInternalParser: true, - useMecabParser: false + useMecabParser: true }); })(); `; diff --git a/src/mecab-tokenizer.ts b/src/mecab-tokenizer.ts index f311710..8afacb4 100644 --- a/src/mecab-tokenizer.ts +++ b/src/mecab-tokenizer.ts @@ -86,14 +86,29 @@ export function parseMecabLine(line: string): Token | null { }; } +export interface MecabTokenizerOptions { + mecabCommand?: string; + dictionaryPath?: string; +} + export class MecabTokenizer { private mecabPath: string | null = null; + private mecabCommand: string; + private dictionaryPath: string | null; private available: boolean = false; private enabled: boolean = true; + constructor(options: MecabTokenizerOptions = {}) { + this.mecabCommand = options.mecabCommand?.trim() || "mecab"; + this.dictionaryPath = options.dictionaryPath?.trim() || null; + } + async checkAvailability(): Promise { try { - const result = execSync("which mecab", { encoding: "utf-8" }).trim(); + const command = this.mecabCommand; + const result = command.includes("/") + ? command + : execSync(`which ${command}`, { encoding: "utf-8" }).trim(); if (result) { this.mecabPath = result; this.available = true; @@ -114,7 +129,11 @@ export class MecabTokenizer { } return new Promise((resolve) => { - const mecab = spawn("mecab", [], { + const mecabArgs: string[] = []; + if (this.dictionaryPath) { + mecabArgs.push("-d", this.dictionaryPath); + } + const mecab = spawn(this.mecabPath ?? this.mecabCommand, mecabArgs, { stdio: ["pipe", "pipe", "pipe"], }); @@ -149,6 +168,21 @@ export class MecabTokenizer { } } + if (tokens.length === 0 && text.trim().length > 0) { + const trimmedStdout = stdout.trim(); + const trimmedStderr = stderr.trim(); + if (trimmedStdout) { + log.warn( + "MeCab returned no parseable tokens.", + `command=${this.mecabPath ?? this.mecabCommand}`, + `stdout=${trimmedStdout.slice(0, 1024)}`, + ); + } + if (trimmedStderr) { + log.warn("MeCab stderr while tokenizing:", trimmedStderr); + } + } + resolve(tokens); }); diff --git a/src/renderer/subtitle-render.test.ts b/src/renderer/subtitle-render.test.ts index 596890a..f70430c 100644 --- a/src/renderer/subtitle-render.test.ts +++ b/src/renderer/subtitle-render.test.ts @@ -23,15 +23,35 @@ function createToken(overrides: Partial): MergedToken { } function extractClassBlock(cssText: string, selector: string): string { - const start = cssText.indexOf(selector); - if (start < 0) return ""; + const ruleRegex = /([^{}]+)\{([^}]*)\}/g; + let match: RegExpExecArray | null = null; + let fallbackBlock = ""; - const openBrace = cssText.indexOf("{", start); - if (openBrace < 0) return ""; - const closeBrace = cssText.indexOf("}", openBrace); - if (closeBrace < 0) return ""; + while ((match = ruleRegex.exec(cssText)) !== null) { + const selectorsBlock = match[1]?.trim() ?? ""; + const selectorBlock = match[2] ?? ""; - return cssText.slice(openBrace + 1, closeBrace); + const selectors = selectorsBlock + .split(",") + .map((entry) => entry.trim()) + .filter((entry) => entry.length > 0); + + if (selectors.includes(selector)) { + if (selectors.length === 1) { + return selectorBlock; + } + + if (!fallbackBlock) { + fallbackBlock = selectorBlock; + } + } + } + + if (fallbackBlock) { + return fallbackBlock; + } + + return ""; } test("computeWordClass preserves known and n+1 classes while adding JLPT classes", () => { @@ -173,10 +193,16 @@ test("computeWordClass uses configured band count for banded mode", () => { topX: 4, mode: "banded", singleColor: "#000000", - bandedColors: ["#111111", "#222222", "#333333"] as any, + bandedColors: [ + "#111111", + "#222222", + "#333333", + "#444444", + "#555555", + ], } as any); - assert.equal(actual, "word word-frequency-band-1"); + assert.equal(actual, "word word-frequency-band-3"); }); test("computeWordClass skips frequency class when rank is out of topX", () => { diff --git a/src/types.ts b/src/types.ts index f602b4c..de4217f 100644 --- a/src/types.ts +++ b/src/types.ts @@ -56,6 +56,7 @@ export interface MergedToken { isNPlusOneTarget: boolean; jlptLevel?: JlptLevel; frequencyRank?: number; + frequencyLookupTerms?: string[]; } export type FrequencyDictionaryLookup = (term: string) => number | null;