import fs from "node:fs";
import path from "node:path";
import process from "node:process";

import {
  createTokenizerDepsRuntimeService,
  tokenizeSubtitleService,
} from "../src/core/services/tokenizer-service.js";
import { createFrequencyDictionaryLookupService } from "../src/core/services/frequency-dictionary-service.js";
import { MecabTokenizer } from "../src/mecab-tokenizer.js";
import type { MergedToken, FrequencyDictionaryLookup } from "../src/types.js";

/** Frequency coloring strategy: one color for every ranked token, or five rank bands. */
type ColorMode = "single" | "banded";

/** Fully-resolved command-line options for this script. */
interface CliOptions {
  input: string;
  dictionaryPath: string;
  emitPretty: boolean;
  emitVerbose: boolean;
  mecabCommand?: string;
  mecabDictionaryPath?: string;
  forceMecabOnly?: boolean;
  yomitanExtensionPath?: string;
  yomitanUserDataPath?: string;
  emitColoredLine: boolean;
  colorMode: ColorMode;
  colorTopX: number;
  colorSingle: string;
  colorBand1: string;
  colorBand2: string;
  colorBand3: string;
  colorBand4: string;
  colorBand5: string;
  colorKnown: string;
  colorNPlusOne: string;
}

/**
 * Validates a `--color-mode` value.
 *
 * @throws Error when the value is neither `single` nor `banded`.
 */
function parseColorMode(value: string): ColorMode {
  if (value !== "single" && value !== "banded") {
    throw new Error("--color-mode must be 'single' or 'banded'");
  }
  return value;
}

/**
 * Parses a strictly positive base-10 integer for `flag`.
 *
 * Unlike a bare `Number.parseInt`, trailing garbage such as `12abc` or `3.7`
 * is rejected instead of being silently truncated.
 *
 * @throws Error when the value is not a positive integer.
 */
function parsePositiveInteger(flag: string, value: string): number {
  const trimmed = value.trim();
  const parsed = /^\+?\d+$/.test(trimmed)
    ? Number.parseInt(trimmed, 10)
    : Number.NaN;
  if (!Number.isSafeInteger(parsed) || parsed <= 0) {
    throw new Error(`${flag} must be a positive integer`);
  }
  return parsed;
}

/**
 * Parses the script's command-line arguments.
 *
 * Value flags are accepted both as `--flag value` and `--flag=value`.
 * Non-flag arguments are joined with single spaces to form the input text;
 * when no input text is given on the command line it is read from stdin.
 *
 * `--help` / `-h` prints the usage text and exits the process with code 0.
 *
 * @param argv Arguments after the script name (e.g. `process.argv.slice(2)`).
 * @returns The resolved options, with defaults filled in.
 * @throws Error on an unknown flag, a missing or invalid flag value, or when
 *   no input text is available from either the arguments or stdin.
 */
function parseCliArgs(argv: string[]): CliOptions {
  const options: Omit<CliOptions, "input"> = {
    dictionaryPath: path.join(process.cwd(), "vendor", "jiten_freq_global"),
    emitPretty: false,
    emitVerbose: false,
    mecabCommand: undefined,
    mecabDictionaryPath: undefined,
    forceMecabOnly: false,
    yomitanExtensionPath: undefined,
    yomitanUserDataPath: undefined,
    emitColoredLine: false,
    colorMode: "single",
    colorTopX: 1000,
    colorSingle: "#f5a97f",
    colorBand1: "#ed8796",
    colorBand2: "#f5a97f",
    colorBand3: "#f9e2af",
    colorBand4: "#a6e3a1",
    colorBand5: "#8aadf4",
    colorKnown: "#a6da95",
    colorNPlusOne: "#c6a0f6",
  };

  // Flags that take a value, mapped to the setter applying that value.
  const valueFlags = new Map<string, (value: string) => void>([
    ["--dictionary", (value) => { options.dictionaryPath = path.resolve(value); }],
    ["--mecab-command", (value) => { options.mecabCommand = value; }],
    ["--mecab-dictionary", (value) => { options.mecabDictionaryPath = value; }],
    ["--yomitan-extension", (value) => { options.yomitanExtensionPath = path.resolve(value); }],
    ["--yomitan-user-data", (value) => { options.yomitanUserDataPath = path.resolve(value); }],
    ["--color-mode", (value) => { options.colorMode = parseColorMode(value); }],
    ["--color-top-x", (value) => { options.colorTopX = parsePositiveInteger("--color-top-x", value); }],
    ["--color-single", (value) => { options.colorSingle = value; }],
    ["--color-band-1", (value) => { options.colorBand1 = value; }],
    ["--color-band-2", (value) => { options.colorBand2 = value; }],
    ["--color-band-3", (value) => { options.colorBand3 = value; }],
    ["--color-band-4", (value) => { options.colorBand4 = value; }],
    ["--color-band-5", (value) => { options.colorBand5 = value; }],
    ["--color-known", (value) => { options.colorKnown = value; }],
    ["--color-n-plus-one", (value) => { options.colorNPlusOne = value; }],
  ]);

  // Flags that take no value.
  const booleanFlags = new Map<string, () => void>([
    ["--pretty", () => { options.emitPretty = true; }],
    ["--verbose", () => { options.emitVerbose = true; }],
    ["--force-mecab", () => { options.forceMecabOnly = true; }],
  ]);

  const inputParts: string[] = [];

  for (let index = 0; index < argv.length; index += 1) {
    const arg = argv[index];
    // Skip empty arguments rather than aborting the whole parse, so flags
    // that follow an empty string are still honoured.
    if (arg === undefined || arg === "") {
      continue;
    }

    if (arg === "--help" || arg === "-h") {
      printUsage();
      process.exit(0);
    }

    // Split `--flag=value` at the first `=`; the value itself may contain `=`.
    const separator = arg.startsWith("--") ? arg.indexOf("=") : -1;
    const flag = separator === -1 ? arg : arg.slice(0, separator);
    const inlineValue = separator === -1 ? undefined : arg.slice(separator + 1);

    const applyValue = valueFlags.get(flag);
    if (applyValue) {
      if (inlineValue !== undefined) {
        // The `=` form is taken verbatim, including an empty value.
        applyValue(inlineValue);
        continue;
      }
      const next = argv[index + 1];
      if (!next) {
        throw new Error(`Missing value for ${flag}`);
      }
      index += 1;
      applyValue(next);
      continue;
    }

    // `--colorized-line=<anything>` is still accepted as "enabled", but a mere
    // prefix match such as `--colorized-lineXYZ` is now an unknown flag.
    if (arg === "--colorized-line" || arg.startsWith("--colorized-line=")) {
      options.emitColoredLine = true;
      continue;
    }

    const toggle = booleanFlags.get(arg);
    if (toggle) {
      toggle();
      continue;
    }

    if (arg.startsWith("-")) {
      throw new Error(`Unknown flag: ${arg}`);
    }

    inputParts.push(arg);
  }

  let input = inputParts.join(" ").trim();
  if (!input) {
    // File descriptor 0 is stdin.
    input = fs.readFileSync(0, "utf8").trim();
    if (!input) {
      throw new Error("Please provide input text as arguments or via stdin.");
    }
  }

  return { input, ...options };
}

/** Writes the command-line usage text to stdout. */
function printUsage(): void {
  process.stdout.write(`Usage: pnpm run get-frequency [--pretty] [--verbose] [--dictionary <path>] [--mecab-command <path>] [--mecab-dictionary <path>] [text]

  --pretty                      Pretty-print JSON output.
  --verbose                     Include merged-frequency diagnostics and lookup terms.
  --force-mecab                 Skip Yomitan parser initialization and force MeCab fallback.
  --yomitan-extension <path>    Optional path to a Yomitan extension directory.
  --yomitan-user-data <path>    Optional Electron userData directory for Yomitan state.
  --colorized-line              Output a terminal-colorized line based on token classification.
  --color-mode <single|banded>  Frequency coloring mode (default: single).
  --color-top-x <n>             Frequency color applies when rank <= n (default: 1000).
  --color-single <#hex>         Frequency single-mode color (default: #f5a97f).
  --color-band-1 <#hex>         Frequency band-1 color.
  --color-band-2 <#hex>         Frequency band-2 color.
  --color-band-3 <#hex>         Frequency band-3 color.
  --color-band-4 <#hex>         Frequency band-4 color.
  --color-band-5 <#hex>         Frequency band-5 color.
  --color-known <#hex>          Known-word color (default: #a6da95).
  --color-n-plus-one <#hex>     N+1 target color (default: #c6a0f6).
  --dictionary <path>           Frequency dictionary root path (default: ./vendor/jiten_freq_global)
  --mecab-command <path>        Optional MeCab binary path (default: mecab)
  --mecab-dictionary <path>     Optional MeCab dictionary directory (default: system default)
  -h, --help                    Show usage.
`);
}

/** A lookup term together with the positive, finite rank found for it. */
type FrequencyCandidate = {
  term: string;
  rank: number;
};

/** A merged token that may carry an extra list of frequency lookup terms. */
type TokenWithLookupTerms = MergedToken & {
  frequencyLookupTerms?: string[];
};

/** Returns the token's `frequencyLookupTerms` when it is an array, else `undefined`. */
function getExplicitLookupTerms(token: MergedToken): string[] | undefined {
  const terms = (token as TokenWithLookupTerms).frequencyLookupTerms;
  return Array.isArray(terms) ? terms : undefined;
}

/** True for a rank that is a finite number greater than zero. */
function isValidRank(rank: unknown): rank is number {
  return typeof rank === "number" && Number.isFinite(rank) && rank > 0;
}

/**
 * Collects the texts to try against the frequency dictionary for a token.
 *
 * Order: the token's `frequencyLookupTerms` (if any), then headword, reading
 * and surface. Entries are trimmed, empty ones dropped, and duplicates removed
 * while keeping the first occurrence.
 */
function getFrequencyLookupTextCandidates(token: MergedToken): string[] {
  const rawTexts: Array<string | undefined> = [
    ...(getExplicitLookupTerms(token) ?? []),
    token.headword,
    token.reading,
    token.surface,
  ];

  const uniqueTerms = new Set<string>();
  for (const text of rawTexts) {
    const trimmed = text ? text.trim() : "";
    if (trimmed) {
      // A Set iterates in insertion order, so the first occurrence wins.
      uniqueTerms.add(trimmed);
    }
  }
  return [...uniqueTerms];
}

/** Looks up every candidate text once and keeps those with a valid rank. */
function collectFrequencyCandidates(
  token: MergedToken,
  getFrequencyRank: FrequencyDictionaryLookup,
): FrequencyCandidate[] {
  const candidates: FrequencyCandidate[] = [];
  for (const term of getFrequencyLookupTextCandidates(token)) {
    const rank = getFrequencyRank(term);
    if (isValidRank(rank)) {
      candidates.push({ term, rank });
    }
  }
  return candidates;
}

/** Picks the lowest-ranked candidate; on ties the earliest candidate wins. */
function pickBestCandidate(
  candidates: readonly FrequencyCandidate[],
): FrequencyCandidate | null {
  let best: FrequencyCandidate | null = null;
  for (const candidate of candidates) {
    if (!best || candidate.rank < best.rank) {
      best = candidate;
    }
  }
  return best;
}

/**
 * Finds the lookup term with the lowest valid rank for a token.
 *
 * @returns The best candidate, or `null` when no term has a valid rank.
 */
function getBestFrequencyLookupCandidate(
  token: MergedToken,
  getFrequencyRank: FrequencyDictionaryLookup,
): FrequencyCandidate | null {
  return pickBestCandidate(collectFrequencyCandidates(token, getFrequencyRank));
}

/** Projects a token onto the plain fields emitted in the JSON output. */
function simplifyToken(token: MergedToken): Record<string, unknown> {
  return {
    surface: token.surface,
    reading: token.reading,
    headword: token.headword,
    startPos: token.startPos,
    endPos: token.endPos,
    partOfSpeech: token.partOfSpeech,
    isMerged: token.isMerged,
    isKnown: token.isKnown,
    isNPlusOneTarget: token.isNPlusOneTarget,
    frequencyRank: token.frequencyRank,
    jlptLevel: token.jlptLevel,
  };
}

/**
 * Like {@link simplifyToken}, plus frequency-lookup diagnostics: the explicit
 * lookup terms (omitted when absent or empty), every candidate with a valid
 * rank, and the best candidate's term and rank (`null` when there is none).
 */
function simplifyTokenWithVerbose(
  token: MergedToken,
  getFrequencyRank: FrequencyDictionaryLookup,
): Record<string, unknown> {
  const explicitTerms = getExplicitLookupTerms(token);
  // One dictionary lookup per term; the best candidate is derived from the same list.
  const candidates = collectFrequencyCandidates(token, getFrequencyRank);
  const bestCandidate = pickBestCandidate(candidates);

  return {
    ...simplifyToken(token),
    frequencyLookupTerms:
      explicitTerms && explicitTerms.length > 0 ? explicitTerms : undefined,
    frequencyCandidates: candidates,
    frequencyBestLookupTerm: bestCandidate?.term ?? null,
    frequencyBestLookupRank: bestCandidate?.rank ?? null,
  };
}

/**
 * Mutable holder for the Yomitan extension and parser-window state.
 *
 * NOTE(review): the payload types of the two promises are not visible in this
 * file, so they are typed as `unknown` — confirm against the tokenizer and
 * extension-loader services.
 */
interface YomitanRuntimeState {
  yomitanExt: unknown | null;
  parserWindow: unknown | null;
  parserReadyPromise: Promise<unknown> | null;
  parserInitPromise: Promise<unknown> | null;
  available: boolean;
  note?: string;
}

/** Call shape this script relies on for the dynamically imported extension loader. */
type LoadYomitanExtension = (options: {
  userDataPath: string;
  getYomitanParserWindow: () => unknown;
  setYomitanParserWindow: (window: unknown) => void;
  setYomitanParserReadyPromise: (promise: Promise<unknown> | null) => void;
  setYomitanParserInitPromise: (promise: Promise<unknown> | null) => void;
  setYomitanExtension: (extension: unknown) => void;
}) => Promise<unknown>;

/**
 * Tries to load the Yomitan extension inside an Electron runtime.
 *
 * Never rejects for an unavailable runtime or a failed load: the returned
 * state has `available: false` and a `note` describing the reason.
 *
 * @param userDataPath Directory passed to the extension loader as `userDataPath`.
 */
async function createYomitanRuntimeState(
  userDataPath: string,
): Promise<YomitanRuntimeState> {
  const state: YomitanRuntimeState = {
    yomitanExt: null,
    parserWindow: null,
    parserReadyPromise: null,
    parserInitPromise: null,
    available: false,
  };
  const runtimeUnavailable = "electron runtime not available in this process";

  const electronImport = await import("electron").catch((error: unknown) => {
    // Keep the import failure detail instead of overwriting it below.
    const detail = error instanceof Error ? error.message : "unknown error";
    state.note = `${runtimeUnavailable}: ${detail}`;
    return null;
  });

  // The import can succeed without providing a usable `app` (checked here
  // rather than assumed), in which case Yomitan cannot be initialized.
  if (!electronImport?.app?.whenReady) {
    state.note = state.note ?? runtimeUnavailable;
    return state;
  }

  try {
    await electronImport.app.whenReady();

    const loaderModule = await import(
      "../src/core/services/yomitan-extension-loader-service.js"
    );
    const loadYomitanExtensionService =
      loaderModule.loadYomitanExtensionService as LoadYomitanExtension;

    const extension = await loadYomitanExtensionService({
      userDataPath,
      getYomitanParserWindow: () => state.parserWindow,
      setYomitanParserWindow: (window) => {
        state.parserWindow = window;
      },
      setYomitanParserReadyPromise: (promise) => {
        state.parserReadyPromise = promise;
      },
      setYomitanParserInitPromise: (promise) => {
        state.parserInitPromise = promise;
      },
      setYomitanExtension: (loadedExtension) => {
        state.yomitanExt = loadedExtension;
      },
    });

    if (!extension) {
      state.note = "yomitan extension is not available";
      return state;
    }

    state.yomitanExt = extension;
    state.available = true;
    return state;
  } catch (error: unknown) {
    state.note =
      error instanceof Error
        ? error.message
        : "failed to initialize yomitan extension";
    return state;
  }
}

/**
 * Creates the Yomitan runtime state after probing for an extension directory.
 *
 * Candidates are the explicit `extensionPath` (if given) followed by
 * `./vendor/yomitan`; the first one containing a `manifest.json` triggers the
 * load. When no candidate matches, the load is attempted anyway.
 *
 * NOTE(review): the matching candidate path is only used for the existence
 * check and the fallback note — it is not forwarded to the loader, which
 * receives `userDataPath` only. Confirm whether the loader should be told
 * which directory to use.
 */
async function createYomitanRuntimeStateWithSearch(
  userDataPath: string,
  extensionPath?: string,
): Promise<YomitanRuntimeState> {
  const candidates = [
    ...(extensionPath ? [path.resolve(extensionPath)] : []),
    path.resolve(process.cwd(), "vendor", "yomitan"),
  ];

  for (const candidate of candidates) {
    try {
      if (!fs.existsSync(path.join(candidate, "manifest.json"))) {
        continue;
      }
      const state = await createYomitanRuntimeState(userDataPath);
      if (!state.available && !state.note) {
        state.note = `Failed to load yomitan extension at ${candidate}`;
      }
      return state;
    } catch {
      // Best effort: fall through to the next candidate.
      continue;
    }
  }

  return createYomitanRuntimeState(userDataPath);
}

/**
 * Builds the frequency-rank lookup from the dictionary at `dictionaryPath`.
 *
 * Loader log messages go to stderr, and only when `DEBUG_FREQUENCY=1`.
 */
async function getFrequencyLookup(
  dictionaryPath: string,
): Promise<FrequencyDictionaryLookup> {
  return createFrequencyDictionaryLookupService({
    searchPaths: [dictionaryPath],
    log: (message) => {
      // Keep script output pure JSON by default
      if (process.env.DEBUG_FREQUENCY === "1") {
        console.error(message);
      }
    },
  });
}

const ANSI_RESET = "\u001b[0m";
const ANSI_FG_PREFIX = "\u001b[38;2";
const HEX_COLOR_PATTERN = /^#(?:[0-9a-fA-F]{3}|[0-9a-fA-F]{6})$/;

/**
 * Parses `#rgb` / `#rrggbb` (the leading `#` is optional) into RGB components.
 *
 * @returns `[r, g, b]`, or `null` when the input is not a valid hex color.
 */
function parseHexRgb(input: string): [number, number, number] | null {
  const digits = input.trim().replace(/^#/, "");
  if (!HEX_COLOR_PATTERN.test(`#${digits}`)) {
    return null;
  }

  // Expand shorthand `abc` to `aabbcc`.
  const expanded =
    digits.length === 3
      ? [...digits].map((char) => `${char}${char}`).join("")
      : digits;

  const r = Number.parseInt(expanded.slice(0, 2), 16);
  const g = Number.parseInt(expanded.slice(2, 4), 16);
  const b = Number.parseInt(expanded.slice(4, 6), 16);
  if (!Number.isFinite(r) || !Number.isFinite(g) || !Number.isFinite(b)) {
    return null;
  }
  return [r, g, b];
}

/**
 * Wraps `text` in a 24-bit ANSI foreground color sequence followed by a reset.
 * Returns `text` unchanged when `color` is not a valid hex color.
 */
function wrapWithForeground(text: string, color: string): string {
  const rgb = parseHexRgb(color);
  if (!rgb) {
    return text;
  }
  const [r, g, b] = rgb;
  return `${ANSI_FG_PREFIX};${r};${g};${b}m${text}${ANSI_RESET}`;
}

/**
 * Chooses the frequency color for a rank.
 *
 * Ranks above `colorTopX` get no color (`""`). In `single` mode every other
 * rank gets `colorSingle`; in `banded` mode the range `1..colorTopX` is split
 * evenly across the entries of `bandedColors`.
 */
function getBandColor(
  rank: number,
  colorTopX: number,
  colorMode: ColorMode,
  colorSingle: string,
  bandedColors: [string, string, string, string, string],
): string {
  const topX = Math.max(1, Math.floor(colorTopX));
  const safeRank = Math.max(1, Math.floor(rank));
  if (safeRank > topX) {
    return "";
  }
  if (colorMode === "single") {
    return colorSingle;
  }

  const bandCount = bandedColors.length;
  const rawBand = Math.ceil((safeRank / topX) * bandCount);
  const band = Math.min(bandCount, Math.max(1, rawBand));
  return bandedColors[band - 1] ?? "";
}

/**
 * Chooses a token's color: N+1 target first, then known word, then the
 * frequency color for its rank. Returns `""` when no color applies.
 */
function getTokenColor(token: MergedToken, args: CliOptions): string {
  if (token.isNPlusOneTarget) {
    return args.colorNPlusOne;
  }
  if (token.isKnown) {
    return args.colorKnown;
  }

  const rank = token.frequencyRank;
  if (typeof rank !== "number" || !Number.isFinite(rank)) {
    return "";
  }
  return getBandColor(rank, args.colorTopX, args.colorMode, args.colorSingle, [
    args.colorBand1,
    args.colorBand2,
    args.colorBand3,
    args.colorBand4,
    args.colorBand5,
  ]);
}

/**
 * Renders `text` with each token's span wrapped in its ANSI color.
 *
 * Tokens are processed in order of start (then end) position; text outside
 * every token is emitted unchanged. Overlapping or nested tokens never emit a
 * character twice: a token only colors the part of its span that has not been
 * written yet, and a token with nothing left to write is skipped.
 *
 * Assumes `startPos` / `endPos` are string indices into `text` — they are
 * passed to `String.prototype.slice`.
 *
 * @returns `text` unchanged when coloring is disabled or there are no tokens.
 */
function renderColoredLine(
  text: string,
  tokens: MergedToken[],
  args: CliOptions,
): string {
  if (!args.emitColoredLine || tokens.length === 0) {
    return text;
  }

  const ordered = [...tokens].sort((a, b) => {
    const startDelta = (a.startPos ?? 0) - (b.startPos ?? 0);
    if (startDelta !== 0) {
      return startDelta;
    }
    return (a.endPos ?? a.surface.length) - (b.endPos ?? b.surface.length);
  });

  let cursor = 0;
  let output = "";

  for (const token of ordered) {
    const start = token.startPos ?? 0;
    const end = token.endPos ?? start + token.surface.length;
    if (start < 0 || end < 0 || end < start) {
      continue;
    }

    // Clamp to the text and never step back before what was already emitted.
    const spanEnd = Math.min(end, text.length);
    const spanStart = Math.max(cursor, Math.min(start, text.length));
    if (spanEnd <= spanStart) {
      continue;
    }

    output += text.slice(cursor, spanStart);

    const tokenText = text.slice(spanStart, spanEnd);
    const color = getTokenColor(token, args);
    output += color ? wrapWithForeground(tokenText, color) : tokenText;
    cursor = spanEnd;
  }

  return output + text.slice(cursor);
}

/**
 * Script entry point: tokenizes the input, then writes a JSON report (tokens
 * plus Yomitan/MeCab/tokenizer diagnostics) to stdout, followed by the
 * colorized line when `--colorized-line` was given.
 *
 * @throws Error when MeCab is not available.
 */
async function main(): Promise<void> {
  const args = parseCliArgs(process.argv.slice(2));
  const getFrequencyRank = await getFrequencyLookup(args.dictionaryPath);

  const mecabTokenizer = new MecabTokenizer({
    mecabCommand: args.mecabCommand,
    dictionaryPath: args.mecabDictionaryPath,
  });
  const isMecabAvailable = await mecabTokenizer.checkAvailability();
  if (!isMecabAvailable) {
    throw new Error(
      "MeCab is not available on this system. Install/run environment with MeCab to tokenize input.",
    );
  }

  // The import may resolve without an `app` export, so guard before using it.
  const electronModule = await import("electron").catch(() => null);
  const electronApp = electronModule?.app ?? null;
  if (electronApp?.setPath && args.yomitanUserDataPath) {
    electronApp.setPath("userData", args.yomitanUserDataPath);
  }
  const userDataPath = electronApp?.getPath
    ? electronApp.getPath("userData")
    : process.cwd();

  const yomitanState = args.forceMecabOnly
    ? null
    : await createYomitanRuntimeStateWithSearch(
        userDataPath,
        args.yomitanExtensionPath,
      );
  const hasYomitan = Boolean(
    yomitanState?.available && yomitanState?.yomitanExt,
  );
  // Non-null only when Yomitan actually loaded; lets the closures below
  // narrow once instead of using non-null assertions.
  const activeYomitan = hasYomitan ? yomitanState : null;

  const deps = createTokenizerDepsRuntimeService({
    getYomitanExt: () => (activeYomitan ? activeYomitan.yomitanExt : null) as never,
    getYomitanParserWindow: () =>
      (activeYomitan ? activeYomitan.parserWindow : null) as never,
    setYomitanParserWindow: (window) => {
      if (activeYomitan) {
        activeYomitan.parserWindow = window;
      }
    },
    getYomitanParserReadyPromise: () =>
      (activeYomitan ? activeYomitan.parserReadyPromise : null) as never,
    setYomitanParserReadyPromise: (promise) => {
      if (activeYomitan) {
        activeYomitan.parserReadyPromise = promise;
      }
    },
    getYomitanParserInitPromise: () =>
      (activeYomitan ? activeYomitan.parserInitPromise : null) as never,
    setYomitanParserInitPromise: (promise) => {
      if (activeYomitan) {
        activeYomitan.parserInitPromise = promise;
      }
    },
    isKnownWord: () => false,
    getKnownWordMatchMode: () => "headword",
    getJlptLevel: () => null,
    getFrequencyDictionaryEnabled: () => true,
    getFrequencyRank,
    getMecabTokenizer: () => ({
      tokenize: (text: string) => mecabTokenizer.tokenize(text),
    }),
  });

  const subtitleData = await tokenizeSubtitleService(args.input, deps);
  const sourceTokens = subtitleData.tokens;
  const tokenCount = sourceTokens?.length ?? 0;
  const mergedCount =
    sourceTokens?.filter((token) => token.isMerged).length ?? 0;

  const tokens =
    sourceTokens?.map((token) =>
      args.emitVerbose
        ? simplifyTokenWithVerbose(token, getFrequencyRank)
        : simplifyToken(token),
    ) ?? null;

  let sourceHint = "none";
  if (tokenCount > 0) {
    sourceHint = hasYomitan ? "yomitan-merged" : "mecab-merge";
  }

  const diagnostics = {
    yomitan: {
      available: Boolean(yomitanState?.available),
      loaded: hasYomitan,
      forceMecabOnly: args.forceMecabOnly,
      note: yomitanState?.note ?? null,
    },
    mecab: {
      command: args.mecabCommand ?? "mecab",
      dictionaryPath: args.mecabDictionaryPath ?? null,
      available: isMecabAvailable,
      // `status` (and `note` when there are no tokens) stay last so the JSON
      // key order is unchanged.
      ...(tokens === null
        ? {
            status: "no-tokens",
            note: "MeCab returned no parseable tokens. This is often caused by a missing/invalid MeCab dictionary path.",
          }
        : { status: "ok" }),
    },
    tokenizer: {
      sourceHint,
      mergedTokenCount: mergedCount,
      totalTokenCount: tokenCount,
    },
  };

  const output = {
    input: args.input,
    tokenizerText: subtitleData.text,
    tokens,
    diagnostics,
  };

  const json = JSON.stringify(output, null, args.emitPretty ? 2 : undefined);
  process.stdout.write(`${json}\n`);

  if (args.emitColoredLine && sourceTokens) {
    const coloredLine = renderColoredLine(subtitleData.text, sourceTokens, args);
    process.stdout.write(`${coloredLine}\n`);
  }
}

main().catch((error: unknown) => {
  // A rejection is not guaranteed to be an Error instance.
  const message = error instanceof Error ? error.message : String(error);
  console.error(`Error: ${message}`);
  process.exit(1);
});