mirror of
https://github.com/ksyasuda/SubMiner.git
synced 2026-03-01 18:22:41 -08:00
213 lines
5.7 KiB
TypeScript
213 lines
5.7 KiB
TypeScript
/*
 * SubMiner - All-in-one sentence mining overlay
 * Copyright (C) 2024 sudacode
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program. If not, see <https://www.gnu.org/licenses/>.
 */
|
|
|
|
import { spawn, execSync } from 'child_process';
|
|
import { PartOfSpeech, Token, MecabStatus } from './types';
|
|
import { createLogger } from './logger';
|
|
|
|
// Re-export PartOfSpeech (imported from './types') so it is also available from this module.
export { PartOfSpeech };
|
|
|
|
// Logger used by everything in this module, created with the 'mecab' label.
const log = createLogger('mecab');
|
|
|
|
/**
 * Translates the first part-of-speech feature reported by MeCab into the
 * project's `PartOfSpeech` category.
 *
 * Several tags collapse onto one category (for example 形状詞 and 形容動詞 both
 * become `na_adjective`, 記号 and 補助記号 both become `symbol`) — presumably to
 * cover more than one dictionary tag set; confirm against the dictionaries in use.
 *
 * @param pos1 - Top-level POS tag exactly as MeCab printed it.
 * @returns The matching category, or `PartOfSpeech.other` for any unlisted tag
 *          (including the empty string).
 */
function mapPartOfSpeech(pos1: string): PartOfSpeech {
  switch (pos1) {
    case '名詞':
      return PartOfSpeech.noun;
    case '動詞':
      return PartOfSpeech.verb;
    case '形容詞':
      return PartOfSpeech.i_adjective;
    case '形状詞':
    case '形容動詞':
      return PartOfSpeech.na_adjective;
    case '助詞':
      return PartOfSpeech.particle;
    case '助動詞':
      return PartOfSpeech.bound_auxiliary;
    case '記号':
    case '補助記号':
      return PartOfSpeech.symbol;
    default:
      return PartOfSpeech.other;
  }
}
|
|
|
|
/**
 * Parses one line of MeCab output (`surface<TAB>feature,feature,...`) into a Token.
 *
 * Feature columns are read positionally: 0-3 are the POS levels, 4 the
 * inflection type, 5 the inflection form, 6 the lemma, 7 the reading and
 * 8 the pronunciation. Missing columns become empty strings, and MeCab's `*`
 * placeholder is treated as "no value" for lemma, reading and pronunciation.
 * NOTE(review): this column order is assumed to match the dictionary in use —
 * confirm if a different dictionary layout is configured.
 *
 * A single trailing carriage return is removed first, so CRLF-terminated
 * output that was split on `\n` does not leave `\r` attached to the last
 * feature (which would stop `*` from being recognised as a placeholder).
 *
 * @param line - One line of MeCab output, without its trailing `\n`.
 * @returns The parsed token, or `null` for empty/whitespace-only lines, the
 *          `EOS` marker, and lines that contain no tab separator.
 */
export function parseMecabLine(line: string): Token | null {
  if (!line) {
    return null;
  }

  // Tolerate CRLF line endings: callers split on '\n' only.
  const record = line.endsWith('\r') ? line.slice(0, -1) : line;
  if (record === 'EOS' || record.trim() === '') {
    return null;
  }

  const tabIndex = record.indexOf('\t');
  if (tabIndex === -1) {
    return null;
  }

  const surface = record.substring(0, tabIndex);
  const features = record.substring(tabIndex + 1).split(',');

  // Absent or empty columns read as ''.
  const feature = (index: number): string => features[index] || '';
  // '*' is MeCab's "not applicable" placeholder.
  const orEmpty = (value: string): string => (value !== '*' ? value : '');

  const pos1 = feature(0);
  const lemma = features[6] || surface;

  return {
    word: surface,
    partOfSpeech: mapPartOfSpeech(pos1),
    pos1,
    pos2: feature(1),
    pos3: feature(2),
    pos4: feature(3),
    inflectionType: feature(4),
    inflectionForm: feature(5),
    // Fall back to the surface form when the dictionary has no lemma.
    headword: lemma !== '*' ? lemma : surface,
    katakanaReading: orEmpty(feature(7)),
    pronunciation: orEmpty(feature(8)),
  };
}
|
|
|
|
/** Construction options for {@link MecabTokenizer}; every field is optional. */
export interface MecabTokenizerOptions {
  /**
   * Name or path of the MeCab executable. Surrounding whitespace is trimmed;
   * when omitted or blank, `'mecab'` is used.
   */
  mecabCommand?: string;
  /**
   * Dictionary directory handed to MeCab via its `-d` argument. Surrounding
   * whitespace is trimmed; when omitted or blank, no `-d` argument is passed.
   */
  dictionaryPath?: string;
}
|
|
|
|
/**
 * Thin wrapper that runs the MeCab command-line tool as a child process and
 * converts its output into `Token` objects.
 *
 * Typical use: construct, call {@link MecabTokenizer.checkAvailability} once,
 * then call {@link MecabTokenizer.tokenize} per piece of text. `tokenize`
 * never rejects; failures are logged and reported as `null`.
 */
export class MecabTokenizer {
  /** Resolved executable path; set by `checkAvailability`, `null` until then. */
  private mecabPath: string | null = null;
  private readonly mecabCommand: string;
  private readonly dictionaryPath: string | null;
  private available: boolean = false;
  private enabled: boolean = true;

  /**
   * @param options - Optional command and dictionary overrides. Blank or
   *                  whitespace-only values are treated as not provided.
   */
  constructor(options: MecabTokenizerOptions = {}) {
    this.mecabCommand = options.mecabCommand?.trim() || 'mecab';
    this.dictionaryPath = options.dictionaryPath?.trim() || null;
  }

  /**
   * Determines which executable to run and records whether MeCab is usable.
   *
   * A command containing `/` is taken as a path and accepted as-is (it is not
   * checked for existence); any other command is resolved with `which`.
   *
   * @returns `true` when an executable path was determined, otherwise `false`.
   */
  async checkAvailability(): Promise<boolean> {
    try {
      const command = this.mecabCommand;
      // NOTE(review): `command` is interpolated into a shell string; it comes
      // from constructor options — confirm that source is trusted.
      const resolved = command.includes('/')
        ? command
        : execSync(`which ${command}`, { encoding: 'utf-8' }).trim();

      if (resolved) {
        this.mecabPath = resolved;
        this.available = true;
        log.info('MeCab found at:', this.mecabPath);
        return true;
      }
    } catch {
      // `which` exits non-zero (execSync throws) when the command is missing.
      log.info('MeCab not found on system');
    }

    this.available = false;
    return false;
  }

  /**
   * Runs MeCab on `text` and parses each output line with `parseMecabLine`.
   *
   * @param text - Text written to MeCab's stdin.
   * @returns The parsed tokens (possibly empty), or `null` when the tokenizer
   *          is unavailable or disabled, `text` is empty, the process could
   *          not be spawned, or it exited with a non-zero code.
   */
  async tokenize(text: string): Promise<Token[] | null> {
    if (!this.available || !this.enabled || !text) {
      return null;
    }

    const executable = this.mecabPath ?? this.mecabCommand;
    const mecabArgs: string[] = this.dictionaryPath ? ['-d', this.dictionaryPath] : [];

    return new Promise<Token[] | null>((resolve) => {
      // 'error' and 'close' can both fire for one child; report only the first outcome.
      let settled = false;
      const settle = (result: Token[] | null): void => {
        if (settled) {
          return;
        }
        settled = true;
        resolve(result);
      };

      const mecab = spawn(executable, mecabArgs, {
        stdio: ['pipe', 'pipe', 'pipe'],
      });

      // Decode at the stream level so a multi-byte UTF-8 character that is
      // split across two chunks is reassembled instead of being corrupted.
      mecab.stdout.setEncoding('utf8');
      mecab.stderr.setEncoding('utf8');

      let stdout = '';
      let stderr = '';

      mecab.stdout.on('data', (chunk: string) => {
        stdout += chunk;
      });

      mecab.stderr.on('data', (chunk: string) => {
        stderr += chunk;
      });

      mecab.on('error', (err: Error) => {
        log.error('Failed to spawn MeCab:', err.message);
        settle(null);
      });

      // Without a listener, a failed write (e.g. EPIPE when the child is
      // already gone) would surface as an uncaught exception. The final
      // outcome is still decided by the 'error'/'close' handlers.
      mecab.stdin.on('error', (err: Error) => {
        log.warn('Failed to write to MeCab stdin:', err.message);
      });

      mecab.on('close', (code: number | null) => {
        if (settled) {
          return;
        }

        if (code !== 0) {
          log.error('MeCab process exited with code:', code);
          if (stderr) {
            log.error('MeCab stderr:', stderr);
          }
          settle(null);
          return;
        }

        const tokens: Token[] = [];
        for (const line of stdout.split('\n')) {
          const token = parseMecabLine(line);
          if (token) {
            tokens.push(token);
          }
        }

        // Non-blank input that yields nothing usually indicates unexpected
        // output, so leave a diagnostic trail.
        if (tokens.length === 0 && text.trim().length > 0) {
          const trimmedStdout = stdout.trim();
          const trimmedStderr = stderr.trim();
          if (trimmedStdout) {
            log.warn(
              'MeCab returned no parseable tokens.',
              `command=${this.mecabPath ?? this.mecabCommand}`,
              `stdout=${trimmedStdout.slice(0, 1024)}`,
            );
          }
          if (trimmedStderr) {
            log.warn('MeCab stderr while tokenizing:', trimmedStderr);
          }
        }

        settle(tokens);
      });

      mecab.stdin.write(text);
      mecab.stdin.end();
    });
  }

  /** @returns A snapshot of the availability flag, enabled flag and resolved path. */
  getStatus(): MecabStatus {
    return {
      available: this.available,
      enabled: this.enabled,
      path: this.mecabPath,
    };
  }

  /**
   * Turns tokenization on or off; while off, `tokenize` returns `null`.
   *
   * @param enabled - New value of the enabled flag.
   */
  setEnabled(enabled: boolean): void {
    this.enabled = enabled;
  }
}
|
|
|
|
// Expose the POS-tag mapping helper to other modules.
export { mapPartOfSpeech };
|