Files
SubMiner/src/mecab-tokenizer.ts
2026-02-17 22:54:09 -08:00

213 lines
5.7 KiB
TypeScript

/*
* SubMiner - All-in-one sentence mining overlay
* Copyright (C) 2024 sudacode
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <https://www.gnu.org/licenses/>.
*/
import { spawn, execSync } from 'child_process';
import { PartOfSpeech, Token, MecabStatus } from './types';
import { createLogger } from './logger';
// Re-export PartOfSpeech so consumers can import it from this module as well
// as from './types'.
export { PartOfSpeech };
// Module-level logger used by the tokenizer below. 'mecab' is presumably the
// logger's scope/tag — confirm against ./logger.
const log = createLogger('mecab');
/**
 * Translates MeCab's top-level part-of-speech tag (the first feature column)
 * into the project's `PartOfSpeech` enum.
 *
 * Recognised tags: 名詞 (noun), 動詞 (verb), 形容詞 (i-adjective),
 * 形状詞 / 形容動詞 (na-adjective), 助詞 (particle), 助動詞 (bound auxiliary),
 * 記号 / 補助記号 (symbol).
 *
 * @param pos1 - Raw tag string as emitted by MeCab.
 * @returns The matching enum member, or `PartOfSpeech.other` for any tag not
 *   listed above (including the empty string).
 */
function mapPartOfSpeech(pos1: string): PartOfSpeech {
  if (pos1 === '名詞') {
    return PartOfSpeech.noun;
  }
  if (pos1 === '動詞') {
    return PartOfSpeech.verb;
  }
  if (pos1 === '形容詞') {
    return PartOfSpeech.i_adjective;
  }
  // Two different tag spellings both map to na-adjective.
  if (pos1 === '形状詞' || pos1 === '形容動詞') {
    return PartOfSpeech.na_adjective;
  }
  if (pos1 === '助詞') {
    return PartOfSpeech.particle;
  }
  if (pos1 === '助動詞') {
    return PartOfSpeech.bound_auxiliary;
  }
  // Likewise, two tag spellings map to symbol.
  if (pos1 === '記号' || pos1 === '補助記号') {
    return PartOfSpeech.symbol;
  }
  return PartOfSpeech.other;
}
/**
 * Parses a single line of MeCab output of the form
 * `surface<TAB>feature0,feature1,...` into a `Token`.
 *
 * Feature columns are read positionally: 0-3 are the part-of-speech levels,
 * 4 the inflection type, 5 the inflection form, 6 the lemma, 7 the reading
 * and 8 the pronunciation. Missing columns are treated as empty strings.
 *
 * @param line - One line of MeCab stdout, without the trailing newline.
 * @returns The parsed token, or `null` when the line is empty, whitespace
 *   only, the `EOS` sentinel, or contains no tab separator.
 */
export function parseMecabLine(line: string): Token | null {
  const isBlank = !line || line.trim() === '';
  if (isBlank || line === 'EOS') {
    return null;
  }

  // Only the first tab separates surface from features; later tabs stay in
  // the feature string.
  const separatorAt = line.indexOf('\t');
  if (separatorAt < 0) {
    return null;
  }

  const surface = line.slice(0, separatorAt);
  const columns = line.slice(separatorAt + 1).split(',');

  // Absent (or empty) columns collapse to ''.
  const column = (index: number): string => columns[index] || '';
  // '*' is the placeholder value in a feature column; map it to ''.
  const unlessStar = (value: string): string => (value === '*' ? '' : value);

  const posTag = column(0);
  const rawLemma = column(6);

  return {
    word: surface,
    partOfSpeech: mapPartOfSpeech(posTag),
    pos1: posTag,
    pos2: column(1),
    pos3: column(2),
    pos4: column(3),
    inflectionType: column(4),
    inflectionForm: column(5),
    // Fall back to the surface form when no usable lemma is present.
    headword: rawLemma === '' || rawLemma === '*' ? surface : rawLemma,
    katakanaReading: unlessStar(column(7)),
    pronunciation: unlessStar(column(8)),
  };
}
/**
 * Optional settings accepted by the `MecabTokenizer` constructor.
 * Both values are trimmed before use; blank strings count as "not set".
 */
export interface MecabTokenizerOptions {
  /** Dictionary directory handed to MeCab via `-d`. Omit to pass no `-d` flag. */
  dictionaryPath?: string;
  /** Executable name or path used to launch MeCab. Defaults to `'mecab'`. */
  mecabCommand?: string;
}
/**
 * Wrapper around the external `mecab` executable.
 *
 * Usage: construct, call `checkAvailability()` once, then call `tokenize()`
 * per piece of text. Every `tokenize()` call spawns a fresh MeCab process,
 * writes the text to its stdin and parses its stdout line by line.
 */
export class MecabTokenizer {
  /** Resolved executable path; set by `checkAvailability()`. */
  private mecabPath: string | null = null;
  /** Command name or path to launch (defaults to `'mecab'`). */
  private mecabCommand: string;
  /** Dictionary directory passed with `-d`, or `null` to pass no `-d` flag. */
  private dictionaryPath: string | null;
  /** True once `checkAvailability()` has located the executable. */
  private available: boolean = false;
  /** Runtime on/off switch controlled through `setEnabled()`. */
  private enabled: boolean = true;

  /**
   * @param options - Optional command / dictionary overrides. Values are
   *   trimmed; blank or missing values fall back to `'mecab'` and "no
   *   dictionary" respectively.
   */
  constructor(options: MecabTokenizerOptions = {}) {
    this.mecabCommand = options.mecabCommand?.trim() || 'mecab';
    this.dictionaryPath = options.dictionaryPath?.trim() || null;
  }

  /**
   * Determines whether MeCab can be launched and records the result.
   *
   * A command containing `/` is taken as a path and accepted as-is (its
   * existence is not verified here); otherwise it is resolved with `which`.
   *
   * @returns `true` when a path was determined, `false` otherwise.
   */
  async checkAvailability(): Promise<boolean> {
    try {
      const command = this.mecabCommand;
      // NOTE(review): `command` is interpolated into a shell string. It comes
      // from configuration, so keep it restricted to trusted values.
      const result = command.includes('/')
        ? command
        : execSync(`which ${command}`, { encoding: 'utf-8' }).trim();
      if (result) {
        this.mecabPath = result;
        this.available = true;
        log.info('MeCab found at:', this.mecabPath);
        return true;
      }
    } catch {
      // execSync throws when `which` exits non-zero, i.e. command not found.
      log.info('MeCab not found on system');
    }
    this.available = false;
    return false;
  }

  /**
   * Runs MeCab over `text` and returns the parsed tokens.
   *
   * @param text - Text to analyse.
   * @returns `null` when the tokenizer is unavailable or disabled, `text` is
   *   empty, the process cannot be spawned, or MeCab exits with a non-zero
   *   code. Otherwise the list of parsed tokens (possibly empty). The
   *   returned promise never rejects.
   */
  async tokenize(text: string): Promise<Token[] | null> {
    if (!this.available || !this.enabled || !text) {
      return null;
    }

    const command = this.mecabPath ?? this.mecabCommand;

    return new Promise((resolve) => {
      const mecabArgs: string[] = [];
      if (this.dictionaryPath) {
        mecabArgs.push('-d', this.dictionaryPath);
      }

      const mecab = spawn(command, mecabArgs, {
        stdio: ['pipe', 'pipe', 'pipe'],
      });

      // Keep raw bytes and decode once at the end: a multi-byte UTF-8
      // character may be split across two 'data' chunks, and decoding each
      // chunk on its own would turn both halves into U+FFFD.
      const stdoutChunks: Buffer[] = [];
      const stderrChunks: Buffer[] = [];

      // A failed spawn emits 'error' followed by 'close'; make sure only the
      // first outcome is reported and resolved.
      let settled = false;
      const finish = (result: Token[] | null): void => {
        if (settled) {
          return;
        }
        settled = true;
        resolve(result);
      };

      mecab.stdout.on('data', (data: Buffer) => {
        stdoutChunks.push(data);
      });
      mecab.stderr.on('data', (data: Buffer) => {
        stderrChunks.push(data);
      });

      mecab.on('close', (code: number | null) => {
        if (settled) {
          return;
        }

        const stdout = Buffer.concat(stdoutChunks).toString('utf8');
        const stderr = Buffer.concat(stderrChunks).toString('utf8');

        if (code !== 0) {
          log.error('MeCab process exited with code:', code);
          if (stderr) {
            log.error('MeCab stderr:', stderr);
          }
          finish(null);
          return;
        }

        const tokens: Token[] = [];
        for (const line of stdout.split('\n')) {
          const token = parseMecabLine(line);
          if (token) {
            tokens.push(token);
          }
        }

        // Non-blank input that yields no tokens usually means the output is
        // not in the expected format; log some diagnostics.
        if (tokens.length === 0 && text.trim().length > 0) {
          const trimmedStdout = stdout.trim();
          const trimmedStderr = stderr.trim();
          if (trimmedStdout) {
            log.warn(
              'MeCab returned no parseable tokens.',
              `command=${command}`,
              `stdout=${trimmedStdout.slice(0, 1024)}`,
            );
          }
          if (trimmedStderr) {
            log.warn('MeCab stderr while tokenizing:', trimmedStderr);
          }
        }

        finish(tokens);
      });

      mecab.on('error', (err: Error) => {
        log.error('Failed to spawn MeCab:', err.message);
        finish(null);
      });

      // If MeCab exits before consuming its input, the write fails (typically
      // EPIPE). Without a listener that stream error would be thrown as an
      // uncaught exception; the 'close' / 'error' handlers above still decide
      // the result.
      mecab.stdin.on('error', (err: Error) => {
        log.warn('Failed to write to MeCab stdin:', err.message);
      });

      mecab.stdin.end(text);
    });
  }

  /** Returns a snapshot of the availability flag, enabled flag and resolved path. */
  getStatus(): MecabStatus {
    return {
      available: this.available,
      enabled: this.enabled,
      path: this.mecabPath,
    };
  }

  /**
   * Turns tokenization on or off; while off, `tokenize()` resolves to `null`.
   *
   * @param enabled - New state of the switch.
   */
  setEnabled(enabled: boolean): void {
    this.enabled = enabled;
  }
}
// mapPartOfSpeech is declared without the `export` keyword; expose it here as
// a named export of this module.
export { mapPartOfSpeech };