/*
 * SubMiner - All-in-one sentence mining overlay
 * Copyright (C) 2024 sudacode
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program.  If not, see <https://www.gnu.org/licenses/>.
 */

import * as childProcess from 'child_process';
import { PartOfSpeech, Token, MecabStatus } from './types';
import { createLogger } from './logger';

export { PartOfSpeech };

const log = createLogger('mecab');

/** Handle returned by the (possibly injected) timer scheduler. */
type TimerHandle = ReturnType<typeof setTimeout>;

/** Child process handle as produced by `child_process.spawn`. */
type MecabProcess = ReturnType<typeof childProcess.spawn>;

/** Extracts a printable message from an arbitrary thrown value. */
function describeError(error: unknown): string {
  return error instanceof Error ? error.message : String(error);
}

/**
 * Maps the first part-of-speech field of a MeCab feature string to the
 * project's `PartOfSpeech` enum.
 *
 * @param pos1 - First comma-separated feature field of a MeCab output line.
 * @returns The matching enum member, or `PartOfSpeech.other` for any tag that
 *   is not explicitly listed.
 */
function mapPartOfSpeech(pos1: string): PartOfSpeech {
  switch (pos1) {
    case '名詞':
      return PartOfSpeech.noun;
    case '動詞':
      return PartOfSpeech.verb;
    case '形容詞':
      return PartOfSpeech.i_adjective;
    case '形状詞':
    case '形容動詞':
      return PartOfSpeech.na_adjective;
    case '助詞':
      return PartOfSpeech.particle;
    case '助動詞':
      return PartOfSpeech.bound_auxiliary;
    case '記号':
    case '補助記号':
      return PartOfSpeech.symbol;
    default:
      return PartOfSpeech.other;
  }
}

/**
 * Parses one line of MeCab output (`surface<TAB>feature,feature,...`).
 *
 * Feature fields are read positionally: indices 0-3 are the part-of-speech
 * levels, 4 the inflection type, 5 the inflection form, 6 the lemma, 7 the
 * reading and 8 the pronunciation. Missing fields default to an empty string,
 * except the lemma which falls back to the surface form. A `*` placeholder in
 * the lemma, reading or pronunciation is treated as "no value".
 *
 * @param line - A single output line without its trailing newline.
 * @returns The parsed token, or `null` for empty/blank lines, the `EOS`
 *   marker, and lines that contain no tab separator.
 */
export function parseMecabLine(line: string): Token | null {
  if (!line || line === 'EOS' || line.trim() === '') {
    return null;
  }

  const tabIndex = line.indexOf('\t');
  if (tabIndex === -1) {
    return null;
  }

  const surface = line.substring(0, tabIndex);
  const featureString = line.substring(tabIndex + 1);
  const features = featureString.split(',');

  const pos1 = features[0] || '';
  const pos2 = features[1] || '';
  const pos3 = features[2] || '';
  const pos4 = features[3] || '';
  const inflectionType = features[4] || '';
  const inflectionForm = features[5] || '';
  const lemma = features[6] || surface;
  const reading = features[7] || '';
  const pronunciation = features[8] || '';

  return {
    word: surface,
    partOfSpeech: mapPartOfSpeech(pos1),
    pos1,
    pos2,
    pos3,
    pos4,
    inflectionType,
    inflectionForm,
    headword: lemma !== '*' ? lemma : surface,
    katakanaReading: reading !== '*' ? reading : '',
    pronunciation: pronunciation !== '*' ? pronunciation : '',
  };
}

/**
 * Construction options for {@link MecabTokenizer}. Every field is optional;
 * the `*Fn` members exist so the process and timer primitives can be replaced.
 */
export interface MecabTokenizerOptions {
  /** Executable name or path; blank/omitted falls back to `mecab`. */
  mecabCommand?: string;
  /** Dictionary directory passed to MeCab via `-d` when non-blank. */
  dictionaryPath?: string;
  /** Idle time before the child process is stopped; `0` or less disables it. */
  idleShutdownMs?: number;
  spawnFn?: typeof childProcess.spawn;
  execSyncFn?: typeof childProcess.execSync;
  setTimeoutFn?: (callback: () => void, delayMs: number) => ReturnType<typeof setTimeout>;
  clearTimeoutFn?: (timer: ReturnType<typeof setTimeout>) => void;
}

/** A tokenize call waiting for its turn on the MeCab process. */
interface MecabQueuedRequest {
  text: string;
  retryCount: number;
  resolve: (tokens: Token[] | null) => void;
}

/** The request currently written to MeCab, plus the output collected so far. */
interface MecabActiveRequest extends MecabQueuedRequest {
  lines: string[];
  stderr: string;
}

/**
 * Tokenizes text through a long-lived MeCab child process.
 *
 * Requests are processed strictly one at a time: a request's text is written
 * to the child's stdin as a single line and the request completes when an
 * `EOS` line is read back from stdout. The child is spawned lazily, stopped
 * after `idleShutdownMs` without work, and respawned on demand. When the child
 * ends unexpectedly, outstanding requests are re-queued up to
 * `MAX_RETRY_COUNT` times before being resolved with `null`.
 */
export class MecabTokenizer {
  private static readonly DEFAULT_IDLE_SHUTDOWN_MS = 30_000;
  private static readonly MAX_RETRY_COUNT = 1;

  private mecabPath: string | null = null;
  private mecabCommand: string;
  private dictionaryPath: string | null;
  private available: boolean = false;
  private enabled: boolean = true;
  private idleShutdownMs: number;
  private readonly spawnFn: typeof childProcess.spawn;
  private readonly execSyncFn: typeof childProcess.execSync;
  private readonly setTimeoutFn: (callback: () => void, delayMs: number) => TimerHandle;
  private readonly clearTimeoutFn: (timer: TimerHandle) => void;
  private mecabProcess: MecabProcess | null = null;
  private idleShutdownTimer: TimerHandle | null = null;
  private stdoutBuffer = '';
  private requestQueue: MecabQueuedRequest[] = [];
  private activeRequest: MecabActiveRequest | null = null;

  constructor(options: MecabTokenizerOptions = {}) {
    this.mecabCommand = options.mecabCommand?.trim() || 'mecab';
    this.dictionaryPath = options.dictionaryPath?.trim() || null;
    this.idleShutdownMs = Math.max(
      0,
      Math.floor(options.idleShutdownMs ?? MecabTokenizer.DEFAULT_IDLE_SHUTDOWN_MS),
    );
    this.spawnFn = options.spawnFn ?? childProcess.spawn;
    this.execSyncFn = options.execSyncFn ?? childProcess.execSync;
    this.setTimeoutFn =
      options.setTimeoutFn ?? ((callback, delayMs) => setTimeout(callback, delayMs));
    this.clearTimeoutFn = options.clearTimeoutFn ?? ((timer) => clearTimeout(timer));
  }

  /**
   * Resolves the MeCab executable and records whether it can be used.
   *
   * A command containing `/` is taken as a path as-is; anything else is looked
   * up with `which`. On failure the tokenizer is marked unavailable, every
   * outstanding request is resolved with `null`, and the child process (if
   * any) is stopped.
   *
   * @returns `true` when a non-empty executable path was resolved.
   */
  async checkAvailability(): Promise<boolean> {
    try {
      const command = this.mecabCommand;
      const result = command.includes('/')
        ? command
        : this.execSyncFn(`which ${command}`, { encoding: 'utf-8' });
      const resolvedPath = String(result).trim();
      if (resolvedPath) {
        this.mecabPath = resolvedPath;
        this.available = true;
        log.info('MeCab found at:', this.mecabPath);
        return true;
      }
    } catch {
      log.info('MeCab not found on system');
    }

    this.available = false;
    // stopPersistentProcess() detaches the child, so its later 'close' event is
    // ignored. Settle outstanding work here; otherwise those promises would
    // never resolve and a stale activeRequest would block the queue for good.
    this.failPendingRequests();
    this.stopPersistentProcess();
    return false;
  }

  /**
   * Tokenizes `text` with MeCab.
   *
   * Line breaks are replaced by spaces so the input occupies exactly one
   * stdin line.
   *
   * @param text - Text to analyse.
   * @returns The parsed tokens, or `null` when the tokenizer is unavailable or
   *   disabled, the normalized text is empty, or the request could not be
   *   completed.
   */
  async tokenize(text: string): Promise<Token[] | null> {
    const normalizedText = text.replace(/\r?\n/g, ' ').trim();
    if (!this.available || !this.enabled || !normalizedText) {
      return null;
    }

    return new Promise((resolve) => {
      this.clearIdleShutdownTimer();
      this.requestQueue.push({
        text: normalizedText,
        retryCount: 0,
        resolve,
      });
      this.processQueue();
    });
  }

  /** Starts the next queued request unless one is already in flight. */
  private processQueue(): void {
    if (this.activeRequest) {
      return;
    }

    const request = this.requestQueue.shift();
    if (!request) {
      this.scheduleIdleShutdown();
      return;
    }

    if (!this.ensurePersistentProcess()) {
      this.retryOrResolveRequest(request);
      this.processQueue();
      return;
    }

    this.activeRequest = {
      ...request,
      lines: [],
      stderr: '',
    };

    try {
      this.mecabProcess?.stdin?.write(`${request.text}\n`);
    } catch (error) {
      log.error('Failed to write to MeCab process:', describeError(error));
      this.retryOrResolveRequest(request);
      this.activeRequest = null;
      this.stopPersistentProcess();
      this.processQueue();
    }
  }

  /**
   * Re-queues `request` with an incremented retry count while retries remain
   * and the tokenizer is usable; otherwise resolves it with `null`.
   */
  private retryOrResolveRequest(request: MecabQueuedRequest): void {
    if (
      request.retryCount < MecabTokenizer.MAX_RETRY_COUNT &&
      this.enabled &&
      this.available
    ) {
      this.requestQueue.push({
        ...request,
        retryCount: request.retryCount + 1,
      });
      return;
    }
    request.resolve(null);
  }

  /** Resolves the active request and every queued request with `null`. */
  private failPendingRequests(): void {
    const pending: MecabQueuedRequest[] = [];
    if (this.activeRequest) {
      pending.push(this.activeRequest);
    }
    pending.push(...this.requestQueue);

    // Clear state before invoking callbacks so the tokenizer is consistent
    // regardless of what a resolver triggers.
    this.activeRequest = null;
    this.requestQueue = [];

    for (const request of pending) {
      request.resolve(null);
    }
  }

  /**
   * Makes sure a MeCab child process is running, spawning one if needed.
   *
   * @returns `false` when spawning threw or the child lacks any stdio pipe.
   */
  private ensurePersistentProcess(): boolean {
    if (this.mecabProcess) {
      return true;
    }

    const mecabArgs: string[] = [];
    if (this.dictionaryPath) {
      mecabArgs.push('-d', this.dictionaryPath);
    }

    let mecab: MecabProcess;
    try {
      mecab = this.spawnFn(this.mecabPath ?? this.mecabCommand, mecabArgs, {
        stdio: ['pipe', 'pipe', 'pipe'],
      });
    } catch (error) {
      log.error('Failed to spawn MeCab:', describeError(error));
      return false;
    }

    const { stdin, stdout, stderr } = mecab;
    if (!stdin || !stdout || !stderr) {
      log.error('Failed to spawn MeCab: missing stdio pipes');
      try {
        mecab.kill();
      } catch {}
      return false;
    }

    this.stdoutBuffer = '';

    stdout.on('data', (data: Buffer | string) => {
      // Output from a process that has since been replaced or stopped must not
      // be attributed to whatever request is active now.
      if (this.mecabProcess !== mecab) {
        return;
      }
      this.handleStdoutChunk(data.toString());
    });

    stderr.on('data', (data: Buffer | string) => {
      if (this.mecabProcess !== mecab || !this.activeRequest) {
        return;
      }
      this.activeRequest.stderr += data.toString();
    });

    // A failed pipe write is reported asynchronously as an 'error' event on
    // stdin; with no listener it would become an uncaught exception.
    // NOTE(review): the typeof guard tolerates injected spawn doubles whose
    // stdin is a minimal object without event support — confirm it is needed.
    if (typeof stdin.on === 'function') {
      stdin.on('error', (error: Error) => {
        if (this.mecabProcess !== mecab) {
          return;
        }
        try {
          mecab.kill();
        } catch {}
        this.handlePersistentProcessEnded(mecab, `stdin error: ${error.message}`);
      });
    }

    mecab.on('error', (error: Error) => {
      this.handlePersistentProcessEnded(mecab, `spawn error: ${error.message}`);
    });

    mecab.on('close', (code: number | null) => {
      this.handlePersistentProcessEnded(mecab, `exit code ${String(code)}`);
    });

    this.mecabProcess = mecab;
    return true;
  }

  /** Buffers stdout data and dispatches every complete line it contains. */
  private handleStdoutChunk(chunk: string): void {
    this.stdoutBuffer += chunk;

    while (true) {
      const newlineIndex = this.stdoutBuffer.indexOf('\n');
      if (newlineIndex === -1) {
        break;
      }

      const line = this.stdoutBuffer.slice(0, newlineIndex).replace(/\r$/, '');
      this.stdoutBuffer = this.stdoutBuffer.slice(newlineIndex + 1);
      this.handleStdoutLine(line);
    }
  }

  /** Collects one output line; an `EOS` line completes the active request. */
  private handleStdoutLine(line: string): void {
    if (!this.activeRequest) {
      return;
    }

    if (line === 'EOS') {
      this.resolveActiveRequest();
      return;
    }

    if (!line.trim()) {
      return;
    }

    this.activeRequest.lines.push(line);
  }

  /**
   * Parses the collected lines of the active request, resolves it with the
   * resulting tokens and moves on to the next queued request.
   */
  private resolveActiveRequest(): void {
    const current = this.activeRequest;
    if (!current) {
      return;
    }

    this.activeRequest = null;

    const tokens: Token[] = [];
    for (const line of current.lines) {
      const token = parseMecabLine(line);
      if (token) {
        tokens.push(token);
      }
    }

    if (tokens.length === 0 && current.text.trim().length > 0) {
      const trimmedStdout = current.lines.join('\n').trim();
      const trimmedStderr = current.stderr.trim();
      if (trimmedStdout) {
        log.warn(
          'MeCab returned no parseable tokens.',
          `command=${this.mecabPath ?? this.mecabCommand}`,
          `stdout=${trimmedStdout.slice(0, 1024)}`,
        );
      }
      if (trimmedStderr) {
        log.warn('MeCab stderr while tokenizing:', trimmedStderr);
      }
    }

    current.resolve(tokens);
    this.processQueue();
  }

  /**
   * Handles termination of the child process.
   *
   * Events for a process that is no longer the current one are ignored.
   * Otherwise process state is reset and every outstanding request goes
   * through {@link retryOrResolveRequest}.
   */
  private handlePersistentProcessEnded(endedProcess: MecabProcess, reason: string): void {
    if (this.mecabProcess !== endedProcess) {
      return;
    }

    this.mecabProcess = null;
    this.stdoutBuffer = '';
    this.clearIdleShutdownTimer();

    const pending: MecabQueuedRequest[] = [];
    if (this.activeRequest) {
      pending.push({
        text: this.activeRequest.text,
        retryCount: this.activeRequest.retryCount,
        resolve: this.activeRequest.resolve,
      });
    }
    this.activeRequest = null;
    if (this.requestQueue.length > 0) {
      pending.push(...this.requestQueue);
    }
    this.requestQueue = [];

    if (pending.length > 0) {
      log.warn(
        `MeCab parser process ended during active work (${reason}); retrying pending request(s).`,
      );
      for (const request of pending) {
        this.retryOrResolveRequest(request);
      }
      this.processQueue();
    }
  }

  /** Arms the idle timer that stops the child once no work remains. */
  private scheduleIdleShutdown(): void {
    this.clearIdleShutdownTimer();
    if (this.idleShutdownMs <= 0 || !this.mecabProcess) {
      return;
    }

    this.idleShutdownTimer = this.setTimeoutFn(() => {
      this.idleShutdownTimer = null;
      if (this.activeRequest || this.requestQueue.length > 0) {
        return;
      }
      this.stopPersistentProcess();
    }, this.idleShutdownMs);

    // Call unref() when the timer handle offers it (Node's Timeout does) so a
    // pending idle timer alone does not keep the event loop alive.
    const timerWithUnref = this.idleShutdownTimer as { unref?: () => void };
    if (typeof timerWithUnref.unref === 'function') {
      timerWithUnref.unref();
    }
  }

  /** Cancels the idle timer, if one is armed. */
  private clearIdleShutdownTimer(): void {
    if (!this.idleShutdownTimer) {
      return;
    }
    this.clearTimeoutFn(this.idleShutdownTimer);
    this.idleShutdownTimer = null;
  }

  /**
   * Detaches and kills the child process. Outstanding requests are NOT
   * settled here; callers that may have work in flight must handle it.
   */
  private stopPersistentProcess(): void {
    const running = this.mecabProcess;
    if (!running) {
      return;
    }

    this.mecabProcess = null;
    this.stdoutBuffer = '';
    this.clearIdleShutdownTimer();

    try {
      running.kill();
    } catch {}
  }

  /** Reports availability, the enabled flag and the resolved executable path. */
  getStatus(): MecabStatus {
    return {
      available: this.available,
      enabled: this.enabled,
      path: this.mecabPath,
    };
  }

  /**
   * Enables or disables tokenization. Disabling resolves every outstanding
   * request with `null` and stops the child process.
   */
  setEnabled(enabled: boolean): void {
    this.enabled = enabled;
    if (!enabled) {
      this.failPendingRequests();
      this.stopPersistentProcess();
    }
  }
}

export { mapPartOfSpeech };