Fix Yomitan token headword frequency matching and add frequency tests

This commit is contained in:
kyasuda
2026-02-16 13:21:19 -08:00
parent e142d2dc3b
commit 0eb2868805
7 changed files with 1586 additions and 80 deletions

View File

@@ -86,14 +86,29 @@ export function parseMecabLine(line: string): Token | null {
};
}
/**
 * Optional settings accepted by the `MecabTokenizer` constructor.
 * Every field may be omitted; the constructor supplies the fallbacks.
 */
export interface MecabTokenizerOptions {
/**
 * MeCab executable to run. The constructor trims the value and falls back to
 * "mecab" when it is missing or blank. During the availability check a value
 * containing "/" is used as the executable path as-is; any other value is
 * looked up with `which`.
 */
mecabCommand?: string;
/**
 * Dictionary path handed to MeCab. The constructor trims the value and stores
 * `null` when it is missing or blank; a non-null value is passed to the
 * spawned process as `-d <dictionaryPath>`.
 */
dictionaryPath?: string;
}
export class MecabTokenizer {
private mecabPath: string | null = null;
private mecabCommand: string;
private dictionaryPath: string | null;
private available: boolean = false;
private enabled: boolean = true;
/**
 * Builds a tokenizer from optional settings.
 *
 * @param options - Optional command name/path and dictionary path. Both
 *   values are trimmed; a missing or blank command becomes "mecab" and a
 *   missing or blank dictionary path becomes `null`.
 */
constructor(options: MecabTokenizerOptions = {}) {
  const requestedCommand = options.mecabCommand?.trim();
  const requestedDictionary = options.dictionaryPath?.trim();

  // Truthiness checks (rather than `??`) are intentional: a string that is
  // empty after trimming must fall back to the default as well.
  this.mecabCommand = requestedCommand ? requestedCommand : "mecab";
  this.dictionaryPath = requestedDictionary ? requestedDictionary : null;
}
async checkAvailability(): Promise<boolean> {
try {
const result = execSync("which mecab", { encoding: "utf-8" }).trim();
const command = this.mecabCommand;
const result = command.includes("/")
? command
: execSync(`which ${command}`, { encoding: "utf-8" }).trim();
if (result) {
this.mecabPath = result;
this.available = true;
@@ -114,7 +129,11 @@ export class MecabTokenizer {
}
return new Promise((resolve) => {
const mecab = spawn("mecab", [], {
const mecabArgs: string[] = [];
if (this.dictionaryPath) {
mecabArgs.push("-d", this.dictionaryPath);
}
const mecab = spawn(this.mecabPath ?? this.mecabCommand, mecabArgs, {
stdio: ["pipe", "pipe", "pipe"],
});
@@ -149,6 +168,21 @@ export class MecabTokenizer {
}
}
if (tokens.length === 0 && text.trim().length > 0) {
const trimmedStdout = stdout.trim();
const trimmedStderr = stderr.trim();
if (trimmedStdout) {
log.warn(
"MeCab returned no parseable tokens.",
`command=${this.mecabPath ?? this.mecabCommand}`,
`stdout=${trimmedStdout.slice(0, 1024)}`,
);
}
if (trimmedStderr) {
log.warn("MeCab stderr while tokenizing:", trimmedStderr);
}
}
resolve(tokens);
});