/*
 * SubMiner - All-in-one sentence mining overlay
 * Copyright (C) 2024 sudacode
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program. If not, see <https://www.gnu.org/licenses/>.
 */

import { spawn, execSync } from "child_process";
import { PartOfSpeech, Token, MecabStatus } from "./types";
import { createLogger } from "./logger";

export { PartOfSpeech };

const log = createLogger("mecab");

/**
 * Maps the first-level part-of-speech tag of a MeCab feature string to the
 * application's `PartOfSpeech` value.
 *
 * Two spellings are accepted for na-adjectives ("形状詞", "形容動詞") and for
 * symbols ("記号", "補助記号"); any tag not listed maps to `PartOfSpeech.other`.
 *
 * @param pos1 - First comma-separated field of the feature string.
 * @returns The matching `PartOfSpeech` member.
 */
function mapPartOfSpeech(pos1: string): PartOfSpeech {
  switch (pos1) {
    case "名詞":
      return PartOfSpeech.noun;
    case "動詞":
      return PartOfSpeech.verb;
    case "形容詞":
      return PartOfSpeech.i_adjective;
    case "形状詞":
    case "形容動詞":
      return PartOfSpeech.na_adjective;
    case "助詞":
      return PartOfSpeech.particle;
    case "助動詞":
      return PartOfSpeech.bound_auxiliary;
    case "記号":
    case "補助記号":
      return PartOfSpeech.symbol;
    default:
      return PartOfSpeech.other;
  }
}

/**
 * Parses one line of MeCab output of the form `surface<TAB>f0,f1,...`.
 *
 * Feature positions read by this function: 0-3 part-of-speech levels,
 * 4 inflection type, 5 inflection form, 6 lemma, 7 reading, 8 pronunciation.
 * Missing fields default to an empty string (the lemma defaults to the
 * surface form), and a `*` placeholder in lemma/reading/pronunciation is
 * treated as "absent".
 *
 * @param line - A single output line, without the trailing newline. A
 *   trailing carriage return (CRLF output) is tolerated and ignored.
 * @returns The parsed token, or `null` for empty lines, the `EOS` marker and
 *   lines that contain no tab separator.
 */
export function parseMecabLine(line: string): Token | null {
  if (!line) {
    return null;
  }

  // Drop a trailing "\r" so that the last feature of a CRLF-terminated line
  // still compares equal to "*" and does not leak into the token fields.
  const record = line.endsWith("\r") ? line.slice(0, -1) : line;
  if (record === "EOS" || record.trim() === "") {
    return null;
  }

  const tabIndex = record.indexOf("\t");
  if (tabIndex === -1) {
    return null;
  }

  const surface = record.substring(0, tabIndex);
  const features = record.substring(tabIndex + 1).split(",");

  // `||` (not `??`) is intentional: an empty field falls back as well.
  const pos1 = features[0] || "";
  const pos2 = features[1] || "";
  const pos3 = features[2] || "";
  const pos4 = features[3] || "";
  const inflectionType = features[4] || "";
  const inflectionForm = features[5] || "";
  const lemma = features[6] || surface;
  const reading = features[7] || "";
  const pronunciation = features[8] || "";

  return {
    word: surface,
    partOfSpeech: mapPartOfSpeech(pos1),
    pos1,
    pos2,
    pos3,
    pos4,
    inflectionType,
    inflectionForm,
    headword: lemma !== "*" ? lemma : surface,
    katakanaReading: reading !== "*" ? reading : "",
    pronunciation: pronunciation !== "*" ? pronunciation : "",
  };
}

/**
 * Thin wrapper around the `mecab` command-line tool.
 *
 * `checkAvailability()` must report success before `tokenize()` will do any
 * work; until then (or while disabled via `setEnabled(false)`) `tokenize()`
 * resolves to `null`.
 */
export class MecabTokenizer {
  private mecabPath: string | null = null;
  private available = false;
  private enabled = true;

  /**
   * Looks up the `mecab` executable with `which` and records the result.
   *
   * @returns `true` when an executable path was found, `false` otherwise.
   *   The lookup never throws; a failed lookup is logged and reported as
   *   `false`.
   */
  async checkAvailability(): Promise<boolean> {
    try {
      const result = execSync("which mecab", {
        encoding: "utf-8",
        // Keep `which`'s own "not found" message off the parent's stderr.
        stdio: ["ignore", "pipe", "ignore"],
      }).trim();
      if (result) {
        this.mecabPath = result;
        this.available = true;
        log.info("MeCab found at:", this.mecabPath);
        return true;
      }
    } catch {
      log.info("MeCab not found on system");
    }

    // Clear the path too, so getStatus() never pairs a stale path with
    // `available: false` after a previously successful check.
    this.mecabPath = null;
    this.available = false;
    return false;
  }

  /**
   * Runs `text` through a freshly spawned `mecab` process and parses its
   * output line by line with `parseMecabLine`.
   *
   * @param text - Text to tokenize.
   * @returns The parsed tokens, or `null` when MeCab is unavailable or
   *   disabled, `text` is empty, the process cannot be spawned, writing to
   *   its stdin fails, or it exits with a non-zero code. The returned promise
   *   never rejects.
   */
  async tokenize(text: string): Promise<Token[] | null> {
    if (!this.available || !this.enabled || !text) {
      return null;
    }

    return new Promise<Token[] | null>((resolve) => {
      const mecab = spawn("mecab", [], {
        stdio: ["pipe", "pipe", "pipe"],
      });

      // "error", "close" and a stdin failure may all fire for one process;
      // only the first outcome is logged and reported.
      let settled = false;
      const finish = (result: Token[] | null): void => {
        if (settled) {
          return;
        }
        settled = true;
        resolve(result);
      };

      // Decode at the stream level so that a multi-byte UTF-8 character split
      // across two chunks is reassembled instead of being corrupted.
      mecab.stdout.setEncoding("utf8");
      mecab.stderr.setEncoding("utf8");

      let stdout = "";
      let stderr = "";

      mecab.stdout.on("data", (chunk: string) => {
        stdout += chunk;
      });

      mecab.stderr.on("data", (chunk: string) => {
        stderr += chunk;
      });

      mecab.on("close", (code: number | null) => {
        if (settled) {
          return;
        }

        if (code !== 0) {
          log.error("MeCab process exited with code:", code);
          if (stderr) {
            log.error("MeCab stderr:", stderr);
          }
          finish(null);
          return;
        }

        const tokens: Token[] = [];
        for (const line of stdout.split("\n")) {
          const token = parseMecabLine(line);
          if (token) {
            tokens.push(token);
          }
        }
        finish(tokens);
      });

      mecab.on("error", (err: Error) => {
        if (settled) {
          return;
        }
        log.error("Failed to spawn MeCab:", err.message);
        finish(null);
      });

      // Without this listener a broken pipe (MeCab exiting before it has read
      // its input) would surface as an unhandled stream error.
      mecab.stdin.on("error", (err: Error) => {
        if (settled) {
          return;
        }
        log.error("Failed to write to MeCab stdin:", err.message);
        finish(null);
      });

      mecab.stdin.end(text);
    });
  }

  /**
   * @returns A snapshot of the availability flag, the enabled flag and the
   *   executable path recorded by the last `checkAvailability()` call.
   */
  getStatus(): MecabStatus {
    return {
      available: this.available,
      enabled: this.enabled,
      path: this.mecabPath,
    };
  }

  /**
   * Turns tokenization on or off without affecting the availability state.
   *
   * @param enabled - When `false`, `tokenize()` resolves to `null`.
   */
  setEnabled(enabled: boolean): void {
    this.enabled = enabled;
  }
}

export { mapPartOfSpeech };