// Mirror of https://github.com/ksyasuda/SubMiner.git (synced 2026-02-27 18:22:41 -08:00).
// Commit: "Fix Yomitan token headword frequency matching and add frequency tests".
// This commit contains: scripts/get_frequency.ts (normal file, 907 lines; hunk @@ -0,0 +1,907 @@).
import fs from "node:fs";
import path from "node:path";
import process from "node:process";

import { createTokenizerDepsRuntimeService, tokenizeSubtitleService } from "../src/core/services/tokenizer-service.js";
import { createFrequencyDictionaryLookupService } from "../src/core/services/frequency-dictionary-service.js";
import { MecabTokenizer } from "../src/mecab-tokenizer.js";
import type { MergedToken, FrequencyDictionaryLookup } from "../src/types.js";

/** Options for the get-frequency script, as produced by `parseCliArgs`. */
interface CliOptions {
  /** Text to tokenize: the positional arguments joined by spaces, or stdin when none are given. */
  input: string;
  /** Frequency dictionary root path (`--dictionary`). */
  dictionaryPath: string;
  /** Pretty-print the JSON output (`--pretty`). */
  emitPretty: boolean;
  /** Include per-token frequency lookup diagnostics (`--verbose`). */
  emitVerbose: boolean;
  /** MeCab binary to run (`--mecab-command`); left undefined when not given. */
  mecabCommand?: string;
  /** MeCab dictionary directory (`--mecab-dictionary`); left undefined when not given. */
  mecabDictionaryPath?: string;
  /** Skip Yomitan initialization and use MeCab only (`--force-mecab`). */
  forceMecabOnly?: boolean;
  /** Yomitan extension directory (`--yomitan-extension`), resolved to an absolute path. */
  yomitanExtensionPath?: string;
  /** Electron userData directory for Yomitan state (`--yomitan-user-data`), resolved to an absolute path. */
  yomitanUserDataPath?: string;
  /** Print a terminal-colorized line after the JSON output (`--colorized-line`). */
  emitColoredLine: boolean;
  /** Frequency coloring mode (`--color-mode`). */
  colorMode: "single" | "banded";
  /** Frequency coloring applies to ranks up to and including this value (`--color-top-x`). */
  colorTopX: number;
  /** Color used for frequency highlighting in "single" mode (`--color-single`). */
  colorSingle: string;
  /** Colors used in "banded" mode, band 1 through band 5 (`--color-band-1` .. `--color-band-5`). */
  colorBand1: string;
  colorBand2: string;
  colorBand3: string;
  colorBand4: string;
  colorBand5: string;
  /** Color for tokens flagged as known (`--color-known`). */
  colorKnown: string;
  /** Color for tokens flagged as the N+1 target (`--color-n-plus-one`). */
  colorNPlusOne: string;
}

/**
 * Parses the script's command-line arguments into a `CliOptions` object.
 *
 * Value flags accept both `--flag value` and `--flag=value`. Arguments that are
 * not flags are joined with single spaces to form the input text; when that text
 * is empty, stdin is read instead. `--help` / `-h` prints the usage text and
 * exits the process with status 0.
 *
 * @param argv - Arguments after the script name (for example `process.argv.slice(2)`).
 * @returns The parsed options, with defaults for every flag that was not given.
 * @throws Error when a flag is unknown, a value flag has a missing or empty value,
 *   `--color-mode` / `--color-top-x` receive an invalid value, or no input text is
 *   available from either the arguments or stdin.
 */
function parseCliArgs(argv: string[]): CliOptions {
  const options: CliOptions = {
    input: "",
    dictionaryPath: path.join(process.cwd(), "vendor", "jiten_freq_global"),
    emitPretty: false,
    emitVerbose: false,
    mecabCommand: undefined,
    mecabDictionaryPath: undefined,
    forceMecabOnly: false,
    yomitanExtensionPath: undefined,
    yomitanUserDataPath: undefined,
    emitColoredLine: false,
    colorMode: "single",
    colorTopX: 1000,
    colorSingle: "#f5a97f",
    colorBand1: "#ed8796",
    colorBand2: "#f5a97f",
    colorBand3: "#f9e2af",
    colorBand4: "#a6e3a1",
    colorBand5: "#8aadf4",
    colorKnown: "#a6da95",
    colorNPlusOne: "#c6a0f6",
  };

  // Flags that take no value. They must match exactly, so `--colorized-line=false`
  // is reported as an unknown flag instead of silently enabling the feature.
  const switches = new Map<string, () => void>([
    ["--pretty", () => { options.emitPretty = true; }],
    ["--verbose", () => { options.emitVerbose = true; }],
    ["--force-mecab", () => { options.forceMecabOnly = true; }],
    ["--colorized-line", () => { options.emitColoredLine = true; }],
  ]);

  // Flags that take a value, given either as the next argument or after `=`.
  const valueFlags = new Map<string, (value: string) => void>([
    ["--dictionary", (value) => { options.dictionaryPath = path.resolve(value); }],
    ["--mecab-command", (value) => { options.mecabCommand = value; }],
    ["--mecab-dictionary", (value) => { options.mecabDictionaryPath = value; }],
    ["--yomitan-extension", (value) => { options.yomitanExtensionPath = path.resolve(value); }],
    ["--yomitan-user-data", (value) => { options.yomitanUserDataPath = path.resolve(value); }],
    ["--color-mode", (value) => {
      if (value !== "single" && value !== "banded") {
        throw new Error("--color-mode must be 'single' or 'banded'");
      }
      options.colorMode = value;
    }],
    ["--color-top-x", (value) => {
      // Require plain digits: parseInt alone would accept "10abc" or "1.5".
      const parsed = /^\d+$/.test(value) ? Number.parseInt(value, 10) : Number.NaN;
      if (!Number.isSafeInteger(parsed) || parsed <= 0) {
        throw new Error("--color-top-x must be a positive integer");
      }
      options.colorTopX = parsed;
    }],
    ["--color-single", (value) => { options.colorSingle = value; }],
    ["--color-band-1", (value) => { options.colorBand1 = value; }],
    ["--color-band-2", (value) => { options.colorBand2 = value; }],
    ["--color-band-3", (value) => { options.colorBand3 = value; }],
    ["--color-band-4", (value) => { options.colorBand4 = value; }],
    ["--color-band-5", (value) => { options.colorBand5 = value; }],
    ["--color-known", (value) => { options.colorKnown = value; }],
    ["--color-n-plus-one", (value) => { options.colorNPlusOne = value; }],
  ]);

  const queue = [...argv];
  const inputParts: string[] = [];

  while (queue.length > 0) {
    const arg = queue.shift();
    if (arg === undefined) {
      break;
    }

    if (arg === "--help" || arg === "-h") {
      printUsage();
      process.exit(0);
    }

    const applySwitch = switches.get(arg);
    if (applySwitch) {
      applySwitch();
      continue;
    }

    // Split `--flag=value` at the first `=` so values may themselves contain `=`.
    const equalsIndex = arg.startsWith("--") ? arg.indexOf("=") : -1;
    const flagName = equalsIndex === -1 ? arg : arg.slice(0, equalsIndex);
    const applyValue = valueFlags.get(flagName);
    if (applyValue) {
      const value = equalsIndex === -1 ? queue.shift() : arg.slice(equalsIndex + 1);
      if (!value) {
        throw new Error(`Missing value for ${flagName}`);
      }
      applyValue(value);
      continue;
    }

    if (arg.startsWith("-")) {
      throw new Error(`Unknown flag: ${arg}`);
    }

    // Positional text. An empty-string argument is harmless here and must not
    // stop the parsing of the arguments that follow it.
    inputParts.push(arg);
  }

  let input = inputParts.join(" ").trim();
  if (!input) {
    // No text on the command line: fall back to whatever is piped on stdin (fd 0).
    input = fs.readFileSync(0, "utf8").trim();
    if (!input) {
      throw new Error(
        "Please provide input text as arguments or via stdin.",
      );
    }
  }

  options.input = input;
  return options;
}

/**
 * Writes the command-line help text to stdout.
 *
 * The defaults shown here mirror the initial values used by `parseCliArgs`.
 */
function printUsage(): void {
  process.stdout.write(`Usage:
  pnpm run get-frequency [options] <text>
  <command> | pnpm run get-frequency [options]    (input text is read from stdin when no <text> is given)

Options:
  --pretty                      Pretty-print JSON output.
  --verbose                     Include merged-frequency diagnostics and lookup terms.
  --force-mecab                 Skip Yomitan parser initialization and force MeCab fallback.
  --yomitan-extension <path>    Optional path to a Yomitan extension directory.
  --yomitan-user-data <path>    Optional Electron userData directory for Yomitan state.
  --colorized-line              Output a terminal-colorized line based on token classification.
  --color-mode <single|banded>  Frequency coloring mode (default: single).
  --color-top-x <n>             Frequency color applies when rank <= n (default: 1000).
  --color-single <#hex>         Frequency single-mode color (default: #f5a97f).
  --color-band-1 <#hex>         Frequency band-1 color (default: #ed8796).
  --color-band-2 <#hex>         Frequency band-2 color (default: #f5a97f).
  --color-band-3 <#hex>         Frequency band-3 color (default: #f9e2af).
  --color-band-4 <#hex>         Frequency band-4 color (default: #a6e3a1).
  --color-band-5 <#hex>         Frequency band-5 color (default: #8aadf4).
  --color-known <#hex>          Known-word color (default: #a6da95).
  --color-n-plus-one <#hex>     N+1 target color (default: #c6a0f6).
  --dictionary <path>           Frequency dictionary root path (default: ./vendor/jiten_freq_global)
  --mecab-command <path>        Optional MeCab binary path (default: mecab)
  --mecab-dictionary <path>     Optional MeCab dictionary directory (default: system default)
  -h, --help                    Show usage.
`);
}

/** A lookup term paired with the frequency rank the dictionary returned for it. */
type FrequencyCandidate = {
  /** The text that was looked up. */
  term: string;
  /** The rank returned for `term`; the candidate with the lowest rank is treated as the best. */
  rank: number;
};

/**
 * Collects the distinct texts to try when looking up a token's frequency rank.
 *
 * Order: the token's `frequencyLookupTerms` (when that optional property is an
 * array), then `headword`, `reading` and `surface`. Each text is trimmed; empty
 * and non-string values are skipped, and duplicates keep their first position.
 *
 * @param token - Token to collect lookup texts for.
 * @returns The de-duplicated lookup texts in priority order.
 */
function getFrequencyLookupTextCandidates(token: MergedToken): string[] {
  // `frequencyLookupTerms` is not declared on MergedToken, hence the widening cast.
  const extraTerms: unknown = (
    token as MergedToken & { frequencyLookupTerms?: string[] }
  ).frequencyLookupTerms;

  const rawTexts: unknown[] = [
    ...(Array.isArray(extraTerms) ? extraTerms : []),
    token.headword,
    token.reading,
    token.surface,
  ];

  // A Set keeps insertion order, so the first occurrence of each text wins.
  const uniqueTexts = new Set<string>();
  for (const text of rawTexts) {
    // The cast above is unchecked at runtime; guard before calling trim().
    if (typeof text !== "string") {
      continue;
    }
    const trimmed = text.trim();
    if (trimmed) {
      uniqueTexts.add(trimmed);
    }
  }
  return [...uniqueTexts];
}

/**
 * Finds the lookup text with the lowest frequency rank for a token.
 *
 * Every text from `getFrequencyLookupTextCandidates` is passed to
 * `getFrequencyRank`; results that are not finite numbers greater than zero are
 * ignored. On a tie the earlier text is kept.
 *
 * @param token - Token whose lookup texts are ranked.
 * @param getFrequencyRank - Lookup function returning the rank for a term.
 * @returns The best term with its rank, or `null` when no text has a usable rank.
 */
function getBestFrequencyLookupCandidate(
  token: MergedToken,
  getFrequencyRank: FrequencyDictionaryLookup,
): FrequencyCandidate | null {
  let best: FrequencyCandidate | null = null;
  for (const term of getFrequencyLookupTextCandidates(token)) {
    const rank = getFrequencyRank(term);
    const isUsableRank =
      typeof rank === "number" && Number.isFinite(rank) && rank > 0;
    if (!isUsableRank) {
      continue;
    }
    // Strict comparison: an equal rank never replaces an earlier candidate.
    if (best === null || rank < best.rank) {
      best = { term, rank };
    }
  }
  return best;
}

/**
 * Projects a token onto the fields that are written to the JSON output.
 *
 * @param token - Token to project.
 * @returns A plain object holding only the reported token fields.
 */
function simplifyToken(token: MergedToken): Record<string, unknown> {
  const {
    surface,
    reading,
    headword,
    startPos,
    endPos,
    partOfSpeech,
    isMerged,
    isKnown,
    isNPlusOneTarget,
    frequencyRank,
    jlptLevel,
  } = token;

  return {
    surface,
    reading,
    headword,
    startPos,
    endPos,
    partOfSpeech,
    isMerged,
    isKnown,
    isNPlusOneTarget,
    frequencyRank,
    jlptLevel,
  };
}

/**
 * Like `simplifyToken`, with extra frequency lookup diagnostics for `--verbose`.
 *
 * Added fields:
 * - `frequencyLookupTerms`: the token's own lookup terms, or `undefined` when
 *   the property is missing or empty.
 * - `frequencyCandidates`: every lookup text with a usable rank (a finite number
 *   greater than zero).
 * - `frequencyBestLookupTerm` / `frequencyBestLookupRank`: the lowest-ranked
 *   candidate, or `null` when there is none.
 *
 * @param token - Token to describe.
 * @param getFrequencyRank - Lookup function returning the rank for a term.
 * @returns The simplified token extended with the diagnostic fields.
 */
function simplifyTokenWithVerbose(
  token: MergedToken,
  getFrequencyRank: FrequencyDictionaryLookup,
): Record<string, unknown> {
  // `frequencyLookupTerms` is not declared on MergedToken, hence the widening cast.
  const { frequencyLookupTerms } = token as MergedToken & {
    frequencyLookupTerms?: string[];
  };
  const hasLookupTerms =
    Array.isArray(frequencyLookupTerms) && frequencyLookupTerms.length > 0;

  const candidates = getFrequencyLookupTextCandidates(token)
    .map((term) => ({ term, rank: getFrequencyRank(term) }))
    .filter(
      ({ rank }) => typeof rank === "number" && Number.isFinite(rank) && rank > 0,
    );

  const bestCandidate = getBestFrequencyLookupCandidate(token, getFrequencyRank);

  return {
    // Reuse the base projection so both output modes always report the same fields.
    ...simplifyToken(token),
    frequencyLookupTerms: hasLookupTerms ? frequencyLookupTerms : undefined,
    frequencyCandidates: candidates,
    frequencyBestLookupTerm: bestCandidate?.term ?? null,
    frequencyBestLookupRank: bestCandidate?.rank ?? null,
  };
}

/** Mutable holder for the Yomitan extension and parser objects used by this script. */
interface YomitanRuntimeState {
  /** Loaded extension handle, or null while none is loaded. */
  yomitanExt: unknown;
  /** Parser window handle stored by the loader/tokenizer callbacks, or null. */
  parserWindow: unknown;
  /** Promise stored through the "parser ready" setter callback, or null. */
  parserReadyPromise: Promise<void> | null;
  /** Promise stored through the "parser init" setter callback, or null. */
  parserInitPromise: Promise<boolean> | null;
  /** True once the extension loader has returned an extension. */
  available: boolean;
  /** Human-readable reason recorded when the extension could not be made available. */
  note?: string;
}

/**
 * Tries to load the Yomitan extension inside an Electron runtime.
 *
 * This function never rejects: every failure is reported through the returned
 * state (`available: false` plus a `note` describing the reason).
 *
 * @param userDataPath - Passed to the extension loader as `userDataPath`.
 * @returns The runtime state; `available` is true only when the loader returned
 *   an extension.
 */
async function createYomitanRuntimeState(
  userDataPath: string,
): Promise<YomitanRuntimeState> {
  // Local description of the loader's signature, as this script uses it.
  type LoadYomitanExtension = (options: {
    userDataPath: string;
    getYomitanParserWindow: () => unknown;
    setYomitanParserWindow: (window: unknown) => void;
    setYomitanParserReadyPromise: (promise: Promise<void> | null) => void;
    setYomitanParserInitPromise: (promise: Promise<boolean> | null) => void;
    setYomitanExtension: (extension: unknown) => void;
  }) => Promise<unknown>;

  const state: YomitanRuntimeState = {
    yomitanExt: null,
    parserWindow: null,
    parserReadyPromise: null,
    parserInitPromise: null,
    available: false,
  };

  const electronImport = await import("electron").catch((error: unknown) => {
    state.note = error instanceof Error ? error.message : "unknown error";
    return null;
  });
  if (!electronImport || !electronImport.app || !electronImport.app.whenReady) {
    // Keep the import failure reason (if any) instead of overwriting it.
    const unavailable = "electron runtime not available in this process";
    state.note = state.note ? `${unavailable}: ${state.note}` : unavailable;
    return state;
  }

  try {
    await electronImport.app.whenReady();

    const loaderModule = await import(
      "../src/core/services/yomitan-extension-loader-service.js"
    );
    const loadYomitanExtensionService =
      loaderModule.loadYomitanExtensionService as LoadYomitanExtension;

    // The loader reports its intermediate objects through these callbacks.
    const extension = await loadYomitanExtensionService({
      userDataPath,
      getYomitanParserWindow: () => state.parserWindow,
      setYomitanParserWindow: (window) => {
        state.parserWindow = window;
      },
      setYomitanParserReadyPromise: (promise) => {
        state.parserReadyPromise = promise;
      },
      setYomitanParserInitPromise: (promise) => {
        state.parserInitPromise = promise;
      },
      setYomitanExtension: (loadedExtension) => {
        state.yomitanExt = loadedExtension;
      },
    });

    if (!extension) {
      state.note = "yomitan extension is not available";
      return state;
    }

    state.yomitanExt = extension;
    state.available = true;
    return state;
  } catch (error: unknown) {
    state.note =
      error instanceof Error
        ? error.message
        : "failed to initialize yomitan extension";
    return state;
  }
}

/**
 * Creates the Yomitan runtime state and, when loading fails without a reason,
 * names the extension directory that was found on disk.
 *
 * Directories are checked in order: `extensionPath` (when given), then
 * `<cwd>/vendor/yomitan`. A directory counts as found when it contains a
 * `manifest.json`.
 *
 * NOTE(review): the located directory is only used for the fallback note; it is
 * not passed to `createYomitanRuntimeState`, so it does not influence which
 * extension the loader picks. Confirm whether the loader should receive it.
 *
 * @param userDataPath - Forwarded to `createYomitanRuntimeState`.
 * @param extensionPath - Optional preferred extension directory.
 * @returns The runtime state produced by `createYomitanRuntimeState`.
 */
async function createYomitanRuntimeStateWithSearch(
  userDataPath: string,
  extensionPath?: string,
): Promise<YomitanRuntimeState> {
  const candidateDirs = [
    ...(extensionPath ? [path.resolve(extensionPath)] : []),
    path.resolve(process.cwd(), "vendor", "yomitan"),
  ];
  const manifestDir = candidateDirs.find((dir) =>
    fs.existsSync(path.join(dir, "manifest.json")),
  );

  const state = await createYomitanRuntimeState(userDataPath);

  // Only fill in a note when a directory was found, loading failed, and the
  // loader gave no reason of its own.
  if (manifestDir !== undefined && !state.available && !state.note) {
    state.note = `Failed to load yomitan extension at ${manifestDir}`;
  }
  return state;
}

/**
 * Creates the frequency rank lookup for the dictionary at `dictionaryPath`.
 *
 * Messages from the dictionary service are discarded unless the environment
 * variable `DEBUG_FREQUENCY` is `"1"`, in which case they go to stderr.
 *
 * @param dictionaryPath - The single search path handed to the dictionary service.
 * @returns The lookup function produced by the dictionary service.
 */
async function getFrequencyLookup(dictionaryPath: string): Promise<FrequencyDictionaryLookup> {
  const debugLoggingEnabled = process.env.DEBUG_FREQUENCY === "1";

  return createFrequencyDictionaryLookupService({
    searchPaths: [dictionaryPath],
    log: (message) => {
      // Keep script output pure JSON by default
      if (debugLoggingEnabled) {
        console.error(message);
      }
    },
  });
}

/** Escape sequence appended after colored text to reset terminal attributes. */
const ANSI_RESET = "\u001b[0m";
/** Start of a foreground color sequence; `;R;G;Bm` is appended by the caller. */
const ANSI_FG_PREFIX = "\u001b[38;2";
/** Matches `#` followed by exactly three or exactly six hexadecimal digits. */
const HEX_COLOR_PATTERN = /^#(?:[0-9a-fA-F]{3}|[0-9a-fA-F]{6})$/;

/**
 * Parses a hex color into its red, green and blue components.
 *
 * Surrounding whitespace is ignored and the leading `#` is optional. Both the
 * three-digit (`#abc`) and six-digit (`#aabbcc`) forms are accepted.
 *
 * @param input - Color text to parse.
 * @returns `[r, g, b]` with each component in 0-255, or `null` when `input` is
 *   not a valid hex color.
 */
function parseHexRgb(input: string): [number, number, number] | null {
  const digits = input.trim().replace(/^#/, "");
  if (!HEX_COLOR_PATTERN.test(`#${digits}`)) {
    return null;
  }

  // Expand the short form by doubling every digit: "abc" -> "aabbcc".
  const sixDigits =
    digits.length === 3
      ? digits
          .split("")
          .map((digit) => `${digit}${digit}`)
          .join("")
      : digits;

  // The pattern above guarantees six hex digits, so every parse succeeds.
  const red = Number.parseInt(sixDigits.substring(0, 2), 16);
  const green = Number.parseInt(sixDigits.substring(2, 4), 16);
  const blue = Number.parseInt(sixDigits.substring(4, 6), 16);
  return [red, green, blue];
}

/**
 * Wraps `text` in an ANSI foreground color sequence followed by a reset.
 *
 * @param text - Text to color.
 * @param color - Hex color, as accepted by `parseHexRgb`.
 * @returns The wrapped text, or `text` unchanged when `color` cannot be parsed.
 */
function wrapWithForeground(text: string, color: string): string {
  const rgb = parseHexRgb(color);
  if (rgb === null) {
    return text;
  }
  const [red, green, blue] = rgb;
  return `${ANSI_FG_PREFIX};${red};${green};${blue}m${text}${ANSI_RESET}`;
}

/**
 * Picks the highlight color for a frequency rank.
 *
 * `rank` and `colorTopX` are floored and clamped to at least 1. Ranks above the
 * threshold get no color. In "single" mode every rank within the threshold gets
 * `colorSingle`; in "banded" mode the range 1..threshold is split into equal
 * bands and the color of the band containing the rank is returned.
 *
 * @param rank - Frequency rank of the token.
 * @param colorTopX - Highest rank that still receives a color.
 * @param colorMode - `"single"` or `"banded"`.
 * @param colorSingle - Color returned in "single" mode.
 * @param bandedColors - Colors for bands 1 through 5, lowest ranks first.
 * @returns The chosen color, or `""` when the rank is above the threshold or
 *   either number is unusable (`rank` not finite, `colorTopX` NaN).
 */
function getBandColor(
  rank: number,
  colorTopX: number,
  colorMode: "single" | "banded",
  colorSingle: string,
  bandedColors: [string, string, string, string, string],
): string {
  // NaN would slip through the comparisons below and yield an undefined color.
  if (!Number.isFinite(rank) || Number.isNaN(colorTopX)) {
    return "";
  }

  const topX = Math.max(1, Math.floor(colorTopX));
  const safeRank = Math.max(1, Math.floor(rank));
  if (safeRank > topX) {
    return "";
  }
  if (colorMode === "single") {
    return colorSingle;
  }

  // Multiply before dividing so exact band boundaries are not shifted by
  // floating-point rounding of the intermediate fraction.
  const bandCount = bandedColors.length;
  const rawBand = Math.ceil((safeRank * bandCount) / topX);
  const band = Math.min(bandCount, Math.max(1, rawBand));
  return bandedColors[band - 1] ?? "";
}

/**
 * Chooses the color for a token when rendering the colorized line.
 *
 * Precedence: N+1 target color, then known-word color, then the frequency
 * color from `getBandColor` when the token has a finite numeric rank.
 *
 * @param token - Token to classify.
 * @param args - Parsed CLI options supplying the colors and the frequency settings.
 * @returns The color to use, or `""` when the token should not be colored.
 */
function getTokenColor(token: MergedToken, args: CliOptions): string {
  if (token.isNPlusOneTarget) {
    return args.colorNPlusOne;
  }
  if (token.isKnown) {
    return args.colorKnown;
  }

  const rank = token.frequencyRank;
  if (typeof rank !== "number" || !Number.isFinite(rank)) {
    return "";
  }
  return getBandColor(
    rank,
    args.colorTopX,
    args.colorMode,
    args.colorSingle,
    [args.colorBand1, args.colorBand2, args.colorBand3, args.colorBand4, args.colorBand5],
  );
}

/**
 * Renders `text` with each token's span wrapped in its terminal color.
 *
 * Tokens are processed in order of start position (ties: shorter end first).
 * Text between tokens is copied unchanged. A token without `startPos` is
 * treated as starting at 0, and a token without `endPos` as ending at its start
 * plus its surface length. Spans are clamped to `text`; any part of a span that
 * an earlier token already covered is skipped so no character is emitted twice.
 *
 * @param text - The tokenized text.
 * @param tokens - Tokens whose positions index into `text`.
 * @param args - Parsed CLI options (colors and the `emitColoredLine` switch).
 * @returns The colorized line, or `text` unchanged when coloring is disabled or
 *   there are no tokens.
 */
function renderColoredLine(
  text: string,
  tokens: MergedToken[],
  args: CliOptions,
): string {
  if (!args.emitColoredLine || tokens.length === 0) {
    return text;
  }

  const ordered = [...tokens].sort((a, b) => {
    const startDelta = (a.startPos ?? 0) - (b.startPos ?? 0);
    if (startDelta !== 0) {
      return startDelta;
    }
    return (a.endPos ?? a.surface.length) - (b.endPos ?? b.surface.length);
  });

  let cursor = 0;
  let output = "";
  for (const token of ordered) {
    const start = token.startPos ?? 0;
    const end = token.endPos ?? start + token.surface.length;
    if (start < 0 || end < 0 || end < start) {
      continue;
    }

    // Never start before the cursor: overlapping tokens must not repeat text
    // that has already been written, and the cursor must never move backwards.
    const safeStart = Math.min(Math.max(cursor, start), text.length);
    const safeEnd = Math.min(end, text.length);
    if (safeEnd <= safeStart) {
      continue;
    }

    // Copy the uncolored gap between the previous token and this one.
    output += text.slice(cursor, safeStart);

    const tokenText = text.slice(safeStart, safeEnd);
    const color = getTokenColor(token, args);
    output += color ? wrapWithForeground(tokenText, color) : tokenText;
    cursor = safeEnd;
  }

  // Copy whatever follows the last token.
  return output + text.slice(cursor);
}

/**
 * Script entry point: tokenizes the input text and prints the result as JSON.
 *
 * Steps:
 * 1. Parse the CLI arguments and build the frequency lookup.
 * 2. Verify MeCab is available (the script fails otherwise).
 * 3. Unless `--force-mecab` is set, try to initialize Yomitan through Electron.
 * 4. Tokenize the input and write `{ input, tokenizerText, tokens, diagnostics }`
 *    to stdout, followed by the colorized line when `--colorized-line` is set.
 *
 * @throws Error when argument parsing fails or MeCab is not available.
 */
async function main(): Promise<void> {
  const args = parseCliArgs(process.argv.slice(2));
  const getFrequencyRank = await getFrequencyLookup(args.dictionaryPath);

  const mecabTokenizer = new MecabTokenizer({
    mecabCommand: args.mecabCommand,
    dictionaryPath: args.mecabDictionaryPath,
  });
  const isMecabAvailable = await mecabTokenizer.checkAvailability();
  if (!isMecabAvailable) {
    throw new Error(
      "MeCab is not available on this system. Install/run environment with MeCab to tokenize input.",
    );
  }

  // The import can fail, or resolve without an `app` export, so every access is guarded.
  const electronModule = await import("electron").catch(() => null);
  const electronApp = electronModule?.app;
  if (electronApp?.setPath && args.yomitanUserDataPath) {
    electronApp.setPath("userData", args.yomitanUserDataPath);
  }

  const yomitanState = args.forceMecabOnly
    ? null
    : await createYomitanRuntimeStateWithSearch(
        electronApp?.getPath ? electronApp.getPath("userData") : process.cwd(),
        args.yomitanExtensionPath,
      );
  const hasYomitan = Boolean(yomitanState?.available && yomitanState?.yomitanExt);
  // Single narrowed reference: non-null only when Yomitan is actually usable.
  const activeYomitan = hasYomitan ? yomitanState : null;

  const deps = createTokenizerDepsRuntimeService({
    getYomitanExt: () =>
      (activeYomitan ? activeYomitan.yomitanExt : null) as never,
    getYomitanParserWindow: () =>
      (activeYomitan ? activeYomitan.parserWindow : null) as never,
    setYomitanParserWindow: (window) => {
      if (activeYomitan) {
        activeYomitan.parserWindow = window;
      }
    },
    getYomitanParserReadyPromise: () =>
      (activeYomitan ? activeYomitan.parserReadyPromise : null) as never,
    setYomitanParserReadyPromise: (promise) => {
      if (activeYomitan) {
        activeYomitan.parserReadyPromise = promise;
      }
    },
    getYomitanParserInitPromise: () =>
      (activeYomitan ? activeYomitan.parserInitPromise : null) as never,
    setYomitanParserInitPromise: (promise) => {
      if (activeYomitan) {
        activeYomitan.parserInitPromise = promise;
      }
    },
    // Known-word and JLPT data are not used by this script.
    isKnownWord: () => false,
    getKnownWordMatchMode: () => "headword",
    getJlptLevel: () => null,
    getFrequencyDictionaryEnabled: () => true,
    getFrequencyRank,
    getMecabTokenizer: () => ({
      tokenize: (text: string) => mecabTokenizer.tokenize(text),
    }),
  });

  const subtitleData = await tokenizeSubtitleService(args.input, deps);
  const sourceTokens = subtitleData.tokens;
  const tokenCount = sourceTokens?.length ?? 0;
  const mergedCount = sourceTokens?.filter((token) => token.isMerged).length ?? 0;
  const tokens =
    sourceTokens?.map((token) =>
      args.emitVerbose
        ? simplifyTokenWithVerbose(token, getFrequencyRank)
        : simplifyToken(token),
    ) ?? null;

  // Built up front so the diagnostics object has its full shape when created.
  const mecabStatus =
    tokens === null
      ? {
          status: "no-tokens",
          note: "MeCab returned no parseable tokens. This is often caused by a missing/invalid MeCab dictionary path.",
        }
      : { status: "ok" };

  const diagnostics = {
    yomitan: {
      available: Boolean(yomitanState?.available),
      loaded: hasYomitan,
      forceMecabOnly: args.forceMecabOnly,
      note: yomitanState?.note ?? null,
    },
    mecab: {
      command: args.mecabCommand ?? "mecab",
      dictionaryPath: args.mecabDictionaryPath ?? null,
      available: isMecabAvailable,
      ...mecabStatus,
    },
    tokenizer: {
      sourceHint:
        tokenCount === 0
          ? "none"
          : hasYomitan ? "yomitan-merged" : "mecab-merge",
      mergedTokenCount: mergedCount,
      totalTokenCount: tokenCount,
    },
  };

  const output = {
    input: args.input,
    tokenizerText: subtitleData.text,
    tokens,
    diagnostics,
  };

  const json = JSON.stringify(output, null, args.emitPretty ? 2 : undefined);
  process.stdout.write(`${json}\n`);

  if (args.emitColoredLine && sourceTokens) {
    const coloredLine = renderColoredLine(subtitleData.text, sourceTokens, args);
    process.stdout.write(`${coloredLine}\n`);
  }
}

// Run the script; on failure print the reason to stderr and exit with status 1.
main().catch((error: unknown) => {
  // A rejection is not guaranteed to be an Error instance.
  const message = error instanceof Error ? error.message : String(error);
  console.error(`Error: ${message}`);
  process.exit(1);
});

// Page footer from the repository web view: "Reference in New Issue" / "Block a user".