mirror of
https://github.com/ksyasuda/SubMiner.git
synced 2026-03-02 06:22:42 -08:00
perf(tokenizer): cut annotation latency with persistent mecab
This commit is contained in:
@@ -0,0 +1,48 @@
|
||||
---
|
||||
id: TASK-81
|
||||
title: 'Tokenization performance: disable Yomitan MeCab parser, gate local MeCab init, and add persistent MeCab process'
|
||||
status: Done
|
||||
assignee: []
|
||||
created_date: '2026-03-02 07:44'
|
||||
updated_date: '2026-03-02 07:46'
|
||||
labels: []
|
||||
dependencies: []
|
||||
priority: high
|
||||
ordinal: 9001
|
||||
---
|
||||
|
||||
## Description
|
||||
|
||||
<!-- SECTION:DESCRIPTION:BEGIN -->
|
||||
|
||||
Reduce subtitle annotation latency by:
|
||||
- disabling Yomitan-side MeCab parser requests (`useMecabParser=false`);
|
||||
- initializing local MeCab only when POS-dependent annotations are enabled (N+1 / JLPT / frequency);
|
||||
- replacing per-line local MeCab process spawning with a persistent parser process that auto-shuts down after idle time and restarts on demand.
|
||||
|
||||
<!-- SECTION:DESCRIPTION:END -->
|
||||
|
||||
## Acceptance Criteria
|
||||
|
||||
<!-- AC:BEGIN -->
|
||||
|
||||
- [x] #1 Yomitan parse requests disable MeCab parser path.
|
||||
- [x] #2 MeCab warmup/init is skipped when all POS-dependent annotation toggles are off.
|
||||
- [x] #3 Local MeCab tokenizer uses persistent process across subtitle lines.
|
||||
- [x] #4 Persistent MeCab process auto-shuts down after idle timeout and restarts on next tokenize activity.
|
||||
- [x] #5 Tests cover parser flag, warmup gating, and persistent MeCab lifecycle behavior.
|
||||
|
||||
<!-- AC:END -->
|
||||
|
||||
## Final Summary
|
||||
|
||||
<!-- SECTION:FINAL_SUMMARY:BEGIN -->
|
||||
|
||||
Implemented tokenizer latency optimizations:
|
||||
- switched Yomitan parse requests to `useMecabParser: false`;
|
||||
- added annotation-aware MeCab initialization gating in runtime warmup flow;
|
||||
- added persistent local MeCab process (default idle shutdown: 30s) with queued requests, retry-on-process-end, idle auto-shutdown, and automatic restart on new work;
|
||||
- added regression tests for Yomitan parse flag, MeCab warmup gating, and persistent/idle lifecycle behavior;
|
||||
- validated with targeted tests and `tsc --noEmit`.
|
||||
|
||||
<!-- SECTION:FINAL_SUMMARY:END -->
|
||||
@@ -1,6 +1,7 @@
|
||||
import assert from 'node:assert/strict';
|
||||
import test from 'node:test';
|
||||
import {
|
||||
requestYomitanParseResults,
|
||||
requestYomitanTermFrequencies,
|
||||
syncYomitanDefaultAnkiServer,
|
||||
} from './yomitan-parser-runtime';
|
||||
@@ -246,3 +247,32 @@ test('requestYomitanTermFrequencies caches repeated term+reading lookups', async
|
||||
const frequencyCalls = scripts.filter((script) => script.includes('getTermFrequencies')).length;
|
||||
assert.equal(frequencyCalls, 1);
|
||||
});
|
||||
|
||||
// Regression test: the parseText script handed to the injected executor must
// carry `useMecabParser: false`.
test('requestYomitanParseResults disables Yomitan MeCab parser path', async () => {
  const executedScripts: string[] = [];

  // Smallest options payload the test supplies: one profile with a scan length.
  const buildOptionsPayload = () => ({
    profileCurrent: 0,
    profiles: [{ options: { scanning: { length: 40 } } }],
  });

  const deps = createDeps(async (script) => {
    executedScripts.push(script);
    return script.includes('optionsGetFull') ? buildOptionsPayload() : [];
  });

  const result = await requestYomitanParseResults('猫です', deps, {
    error: () => undefined,
  });
  assert.deepEqual(result, []);

  const parseScript = executedScripts.find((candidate) => candidate.includes('parseText'));
  assert.ok(parseScript, 'expected parseText request script');
  assert.match(parseScript ?? '', /useMecabParser:\s*false/);
});
|
||||
|
||||
@@ -529,7 +529,7 @@ export async function requestYomitanParseResults(
|
||||
optionsContext: { index: ${metadata.profileIndex} },
|
||||
scanLength: ${metadata.scanLength},
|
||||
useInternalParser: true,
|
||||
useMecabParser: true
|
||||
useMecabParser: false
|
||||
});
|
||||
})();
|
||||
`
|
||||
@@ -564,7 +564,7 @@ export async function requestYomitanParseResults(
|
||||
optionsContext: { index: profileIndex },
|
||||
scanLength,
|
||||
useInternalParser: true,
|
||||
useMecabParser: true
|
||||
useMecabParser: false
|
||||
});
|
||||
})();
|
||||
`;
|
||||
|
||||
@@ -236,3 +236,144 @@ test('composeMpvRuntimeHandlers returns callable handlers and forwards to inject
|
||||
assert.ok(calls.includes('warmup-yomitan'));
|
||||
assert.ok(calls.indexOf('create-mecab') < calls.indexOf('set-started:true'));
|
||||
});
|
||||
|
||||
// With N+1, JLPT and frequency annotations all reporting `false`, running the
// tokenization warmups must not create, store, or probe a MeCab tokenizer.
test('composeMpvRuntimeHandlers skips MeCab warmup when all POS-dependent annotations are disabled', async () => {
  const callLog: string[] = [];
  let currentMecab: { id: string } | null = null;

  // Shared stubs for the many dependencies this test does not care about.
  const noop = (): void => {};
  const noopAsync = async (): Promise<void> => {};
  const disabled = (): boolean => false;
  const absent = (): null => null;

  class FakeMpvClient {
    connected = false;
    constructor(
      public socketPath: string,
      public options: unknown,
    ) {}
    on(): void {}
    connect(): void {
      this.connected = true;
    }
  }

  const handlers = composeMpvRuntimeHandlers<
    FakeMpvClient,
    { isKnownWord: (text: string) => boolean },
    { text: string }
  >({
    bindMpvMainEventHandlersMainDeps: {
      appState: {
        initialArgs: null,
        overlayRuntimeInitialized: true,
        mpvClient: null,
        immersionTracker: null,
        subtitleTimingTracker: null,
        currentSubText: '',
        currentSubAssText: '',
        playbackPaused: null,
        previousSecondarySubVisibility: null,
      },
      getQuitOnDisconnectArmed: disabled,
      scheduleQuitCheck: noop,
      quitApp: noop,
      reportJellyfinRemoteStopped: noop,
      syncOverlayMpvSubtitleSuppression: noop,
      maybeRunAnilistPostWatchUpdate: noopAsync,
      logSubtitleTimingError: noop,
      broadcastToOverlayWindows: noop,
      onSubtitleChange: noop,
      refreshDiscordPresence: noop,
      ensureImmersionTrackerInitialized: noop,
      updateCurrentMediaPath: noop,
      restoreMpvSubVisibility: noop,
      getCurrentAnilistMediaKey: absent,
      resetAnilistMediaTracking: noop,
      maybeProbeAnilistDuration: noop,
      ensureAnilistMediaGuess: noop,
      syncImmersionMediaState: noop,
      updateCurrentMediaTitle: noop,
      resetAnilistMediaGuessState: noop,
      reportJellyfinRemoteProgress: noop,
      updateSubtitleRenderMetrics: noop,
    },
    mpvClientRuntimeServiceFactoryMainDeps: {
      createClient: FakeMpvClient,
      getSocketPath: () => '/tmp/mpv.sock',
      getResolvedConfig: () => ({ auto_start_overlay: false }),
      isAutoStartOverlayEnabled: () => true,
      setOverlayVisible: noop,
      isVisibleOverlayVisible: disabled,
      getReconnectTimer: absent,
      setReconnectTimer: noop,
    },
    updateMpvSubtitleRenderMetricsMainDeps: {
      getCurrentMetrics: () => BASE_METRICS,
      setCurrentMetrics: noop,
      applyPatch: (current, patch) => ({ next: { ...current, ...patch }, changed: true }),
      broadcastMetrics: noop,
    },
    tokenizer: {
      buildTokenizerDepsMainDeps: {
        getYomitanExt: absent,
        getYomitanParserWindow: absent,
        setYomitanParserWindow: noop,
        getYomitanParserReadyPromise: absent,
        setYomitanParserReadyPromise: noop,
        getYomitanParserInitPromise: absent,
        setYomitanParserInitPromise: noop,
        isKnownWord: disabled,
        recordLookup: noop,
        getKnownWordMatchMode: () => 'headword',
        // The three POS-dependent annotation toggles are all off.
        getNPlusOneEnabled: disabled,
        getMinSentenceWordsForNPlusOne: () => 3,
        getJlptLevel: absent,
        getJlptEnabled: disabled,
        getFrequencyDictionaryEnabled: disabled,
        getFrequencyDictionaryMatchMode: () => 'headword',
        getFrequencyRank: absent,
        getYomitanGroupDebugEnabled: disabled,
        getMecabTokenizer: absent,
      },
      createTokenizerRuntimeDeps: () => ({ isKnownWord: () => false }),
      tokenizeSubtitle: async (text) => ({ text }),
      createMecabTokenizerAndCheckMainDeps: {
        // Every MeCab-related hook records itself so the final assertion can
        // prove none of them ran.
        getMecabTokenizer: () => currentMecab,
        setMecabTokenizer: (next) => {
          currentMecab = next as { id: string };
          callLog.push('set-mecab');
        },
        createMecabTokenizer: () => {
          callLog.push('create-mecab');
          return { id: 'mecab' };
        },
        checkAvailability: async () => {
          callLog.push('check-mecab');
        },
      },
      prewarmSubtitleDictionariesMainDeps: {
        ensureJlptDictionaryLookup: noopAsync,
        ensureFrequencyDictionaryLookup: noopAsync,
      },
    },
    warmups: {
      launchBackgroundWarmupTaskMainDeps: {
        now: () => 0,
        logDebug: noop,
        logWarn: noop,
      },
      startBackgroundWarmupsMainDeps: {
        getStarted: disabled,
        setStarted: noop,
        isTexthookerOnlyMode: disabled,
        ensureYomitanExtensionLoaded: noopAsync,
        shouldWarmupMecab: disabled,
        shouldWarmupYomitanExtension: disabled,
        shouldWarmupSubtitleDictionaries: disabled,
        shouldWarmupJellyfinRemoteSession: disabled,
        shouldAutoConnectJellyfinRemote: disabled,
        startJellyfinRemoteSession: noopAsync,
      },
    },
  });

  await handlers.startTokenizationWarmups();

  assert.deepEqual(callLog, []);
});
|
||||
|
||||
@@ -133,12 +133,23 @@ export function composeMpvRuntimeHandlers<
|
||||
const prewarmSubtitleDictionaries = createPrewarmSubtitleDictionariesMainHandler(
|
||||
options.tokenizer.prewarmSubtitleDictionariesMainDeps,
|
||||
);
|
||||
// Decides whether local MeCab should be initialized during warmup.
// A toggle counts as enabled unless its getter returns exactly `false`; the
// N+1 getter is optional, so when it is missing N+1 counts as enabled.
const shouldInitializeMecabForAnnotations = (): boolean => {
  const annotationDeps = options.tokenizer.buildTokenizerDepsMainDeps;
  // All three getters are invoked before the result is combined.
  const toggleStates = [
    annotationDeps.getNPlusOneEnabled?.(),
    annotationDeps.getJlptEnabled(),
    annotationDeps.getFrequencyDictionaryEnabled(),
  ];
  return toggleStates.some((state) => state !== false);
};
|
||||
let tokenizationWarmupInFlight: Promise<void> | null = null;
|
||||
const startTokenizationWarmups = (): Promise<void> => {
|
||||
if (!tokenizationWarmupInFlight) {
|
||||
tokenizationWarmupInFlight = (async () => {
|
||||
await options.warmups.startBackgroundWarmupsMainDeps.ensureYomitanExtensionLoaded();
|
||||
if (!options.tokenizer.createMecabTokenizerAndCheckMainDeps.getMecabTokenizer()) {
|
||||
if (
|
||||
shouldInitializeMecabForAnnotations() &&
|
||||
!options.tokenizer.createMecabTokenizerAndCheckMainDeps.getMecabTokenizer()
|
||||
) {
|
||||
await createMecabTokenizerAndCheck().catch(() => {});
|
||||
}
|
||||
await prewarmSubtitleDictionaries({ showLoadingOsd: true });
|
||||
|
||||
114
src/mecab-tokenizer.test.ts
Normal file
114
src/mecab-tokenizer.test.ts
Normal file
@@ -0,0 +1,114 @@
|
||||
import assert from 'node:assert/strict';
|
||||
import test from 'node:test';
|
||||
import { EventEmitter } from 'node:events';
|
||||
import * as childProcess from 'node:child_process';
|
||||
import { PassThrough, Writable } from 'node:stream';
|
||||
import { MecabTokenizer } from './mecab-tokenizer';
|
||||
|
||||
/**
 * Builds an in-memory stand-in for a spawned MeCab child process.
 *
 * Each chunk written to `stdin` is answered on `stdout` with one synthetic
 * noun line for the trimmed text followed by `EOS`; an empty chunk yields a
 * bare `EOS`. `stderr` is a stream that never receives data.
 *
 * @param onKill Invoked once, the first time `kill()` is called.
 * @returns An object typed as a child process whose `kill()` emits `close`
 *   with exit code 0 on the first call and returns `false` afterwards.
 */
function createFakeMecabProcess(onKill: () => void): ReturnType<typeof childProcess.spawn> {
  const stdout = new PassThrough();
  const stderr = new PassThrough();
  const stdin = new Writable({
    write(chunk, _encoding, callback) {
      const text = String(chunk).replace(/\n+$/, '').trim();
      const reply = text ? `${text}\t名詞,一般,*,*,*,*,${text},${text},${text}\nEOS\n` : 'EOS\n';
      stdout.write(reply);
      callback();
    },
  });

  // Named `fakeProcess` so Node's global `process` is not shadowed.
  const fakeProcess = new EventEmitter() as unknown as ReturnType<typeof childProcess.spawn> & {
    stdin: Writable;
    stdout: PassThrough;
    stderr: PassThrough;
  };
  fakeProcess.stdin = stdin;
  fakeProcess.stdout = stdout;
  fakeProcess.stderr = stderr;

  // Only the first kill has an effect, so `close` is never emitted twice.
  let exited = false;
  fakeProcess.kill = () => {
    if (exited) {
      return false;
    }
    exited = true;
    onKill();
    fakeProcess.emit('close', 0);
    return true;
  };
  return fakeProcess;
}
|
||||
|
||||
// Two consecutive tokenize calls must share one spawned parser process and
// must not kill it in between.
test('MecabTokenizer reuses a persistent parser process across subtitle lines', async () => {
  const counters = { spawned: 0, killed: 0 };
  const scheduledTimers = new Map<number, () => void>();
  let lastTimerId = 0;

  const fakeExecSync = (() => '/usr/bin/mecab') as unknown as typeof childProcess.execSync;
  const fakeSpawn = (() => {
    counters.spawned += 1;
    return createFakeMecabProcess(() => {
      counters.killed += 1;
    });
  }) as unknown as typeof childProcess.spawn;

  const tokenizer = new MecabTokenizer({
    execSyncFn: fakeExecSync,
    spawnFn: fakeSpawn,
    // Timers are captured rather than run, so the idle shutdown never fires here.
    setTimeoutFn: (callback) => {
      lastTimerId += 1;
      scheduledTimers.set(lastTimerId, callback);
      return lastTimerId as unknown as ReturnType<typeof setTimeout>;
    },
    clearTimeoutFn: (handle) => {
      scheduledTimers.delete(handle as unknown as number);
    },
    idleShutdownMs: 60_000,
  });

  assert.equal(await tokenizer.checkAvailability(), true);

  const catTokens = await tokenizer.tokenize('猫');
  const dogTokens = await tokenizer.tokenize('犬');

  assert.equal(catTokens?.[0]?.word, '猫');
  assert.equal(dogTokens?.[0]?.word, '犬');
  assert.equal(counters.spawned, 1);
  assert.equal(counters.killed, 0);
});
|
||||
|
||||
// Firing the captured idle timer must kill the parser process, and the next
// tokenize call must spawn a fresh one.
test('MecabTokenizer shuts down after idle timeout and restarts on new activity', async () => {
  const counters = { spawned: 0, killed: 0 };
  const scheduledTimers = new Map<number, () => void>();
  let lastTimerId = 0;

  const fakeExecSync = (() => '/usr/bin/mecab') as unknown as typeof childProcess.execSync;
  const fakeSpawn = (() => {
    counters.spawned += 1;
    return createFakeMecabProcess(() => {
      counters.killed += 1;
    });
  }) as unknown as typeof childProcess.spawn;

  const tokenizer = new MecabTokenizer({
    execSyncFn: fakeExecSync,
    spawnFn: fakeSpawn,
    setTimeoutFn: (callback) => {
      lastTimerId += 1;
      scheduledTimers.set(lastTimerId, callback);
      return lastTimerId as unknown as ReturnType<typeof setTimeout>;
    },
    clearTimeoutFn: (handle) => {
      scheduledTimers.delete(handle as unknown as number);
    },
    idleShutdownMs: 5_000,
  });

  assert.equal(await tokenizer.checkAvailability(), true);
  await tokenizer.tokenize('猫');
  assert.equal(counters.spawned, 1);

  // Run the idle-shutdown callback by hand instead of waiting for real time.
  const idleCallback = Array.from(scheduledTimers.values())[0];
  assert.ok(idleCallback, 'expected idle shutdown timer');
  idleCallback?.();
  assert.equal(counters.killed, 1);

  await tokenizer.tokenize('犬');
  assert.equal(counters.spawned, 2);
});
|
||||
@@ -16,7 +16,7 @@
|
||||
* along with this program. If not, see <https://www.gnu.org/licenses/>.
|
||||
*/
|
||||
|
||||
import { spawn, execSync } from 'child_process';
|
||||
import * as childProcess from 'child_process';
|
||||
import { PartOfSpeech, Token, MecabStatus } from './types';
|
||||
import { createLogger } from './logger';
|
||||
|
||||
@@ -89,18 +89,58 @@ export function parseMecabLine(line: string): Token | null {
|
||||
/** Construction options for {@link MecabTokenizer}; every field is optional. */
export interface MecabTokenizerOptions {
  /** MeCab executable name or path; the constructor falls back to `mecab`. */
  mecabCommand?: string;
  /** Dictionary directory, passed to MeCab as `-d <path>` when set. */
  dictionaryPath?: string;
  /**
   * Idle time in milliseconds before the persistent parser process is stopped.
   * Values of 0 or below disable the idle shutdown.
   */
  idleShutdownMs?: number;
  /** Replacement for `child_process.spawn`, used to launch the parser process. */
  spawnFn?: typeof childProcess.spawn;
  /** Replacement for `child_process.execSync`, used to resolve the command via `which`. */
  execSyncFn?: typeof childProcess.execSync;
  /** Replacement for `setTimeout`, used to schedule the idle shutdown. */
  setTimeoutFn?: (
    callback: () => void,
    delayMs: number,
  ) => ReturnType<typeof setTimeout>;
  /** Replacement for `clearTimeout`, used to cancel the idle shutdown. */
  clearTimeoutFn?: (timer: ReturnType<typeof setTimeout>) => void;
}
|
||||
|
||||
/** A tokenize call waiting for its turn on the persistent parser process. */
interface MecabQueuedRequest {
  /** Text to send to MeCab as a single line. */
  text: string;
  /** How many times this request has already been re-queued. */
  retryCount: number;
  /** Settles the caller's promise with the parsed tokens, or `null` on failure. */
  resolve: (tokens: Token[] | null) => void;
}
|
||||
|
||||
/** The request currently being answered, plus the output collected for it. */
interface MecabActiveRequest extends MecabQueuedRequest {
  /** Non-blank stdout lines received so far, up to the `EOS` marker. */
  lines: string[];
  /** Stderr text received while this request was active. */
  stderr: string;
}
|
||||
|
||||
export class MecabTokenizer {
|
||||
private static readonly DEFAULT_IDLE_SHUTDOWN_MS = 30_000;
|
||||
private static readonly MAX_RETRY_COUNT = 1;
|
||||
|
||||
private mecabPath: string | null = null;
|
||||
private mecabCommand: string;
|
||||
private dictionaryPath: string | null;
|
||||
private available: boolean = false;
|
||||
private enabled: boolean = true;
|
||||
private idleShutdownMs: number;
|
||||
private readonly spawnFn: typeof childProcess.spawn;
|
||||
private readonly execSyncFn: typeof childProcess.execSync;
|
||||
private readonly setTimeoutFn: (
|
||||
callback: () => void,
|
||||
delayMs: number,
|
||||
) => ReturnType<typeof setTimeout>;
|
||||
private readonly clearTimeoutFn: (timer: ReturnType<typeof setTimeout>) => void;
|
||||
private mecabProcess: ReturnType<typeof childProcess.spawn> | null = null;
|
||||
private idleShutdownTimer: ReturnType<typeof setTimeout> | null = null;
|
||||
private stdoutBuffer = '';
|
||||
private requestQueue: MecabQueuedRequest[] = [];
|
||||
private activeRequest: MecabActiveRequest | null = null;
|
||||
|
||||
/**
 * Stores configuration and injectable dependencies; no process is spawned here.
 *
 * @param options Optional overrides. Blank `mecabCommand` falls back to
 *   `mecab`; blank `dictionaryPath` becomes `null`. `idleShutdownMs` is
 *   floored and clamped to `[0, 2147483647]`; `NaN` falls back to the default.
 */
constructor(options: MecabTokenizerOptions = {}) {
  this.mecabCommand = options.mecabCommand?.trim() || 'mecab';
  this.dictionaryPath = options.dictionaryPath?.trim() || null;

  // Node coerces a timer delay that is NaN or above 2^31-1 to 1 ms, which
  // would stop the parser almost immediately after every request. Reject NaN
  // and cap large values (including Infinity) at the largest delay Node honours.
  const maxTimerDelayMs = 2_147_483_647;
  const requestedIdleMs = options.idleShutdownMs ?? MecabTokenizer.DEFAULT_IDLE_SHUTDOWN_MS;
  this.idleShutdownMs = Number.isNaN(requestedIdleMs)
    ? MecabTokenizer.DEFAULT_IDLE_SHUTDOWN_MS
    : Math.min(maxTimerDelayMs, Math.max(0, Math.floor(requestedIdleMs)));

  this.spawnFn = options.spawnFn ?? childProcess.spawn;
  this.execSyncFn = options.execSyncFn ?? childProcess.execSync;
  this.setTimeoutFn = options.setTimeoutFn ?? ((callback, delayMs) => setTimeout(callback, delayMs));
  this.clearTimeoutFn = options.clearTimeoutFn ?? ((timer) => clearTimeout(timer));
}
|
||||
|
||||
async checkAvailability(): Promise<boolean> {
|
||||
@@ -108,9 +148,10 @@ export class MecabTokenizer {
|
||||
const command = this.mecabCommand;
|
||||
const result = command.includes('/')
|
||||
? command
|
||||
: execSync(`which ${command}`, { encoding: 'utf-8' }).trim();
|
||||
if (result) {
|
||||
this.mecabPath = result;
|
||||
: this.execSyncFn(`which ${command}`, { encoding: 'utf-8' });
|
||||
const resolvedPath = String(result).trim();
|
||||
if (resolvedPath) {
|
||||
this.mecabPath = resolvedPath;
|
||||
this.available = true;
|
||||
log.info('MeCab found at:', this.mecabPath);
|
||||
return true;
|
||||
@@ -119,81 +160,259 @@ export class MecabTokenizer {
|
||||
log.info('MeCab not found on system');
|
||||
}
|
||||
|
||||
this.stopPersistentProcess();
|
||||
this.available = false;
|
||||
return false;
|
||||
}
|
||||
|
||||
async tokenize(text: string): Promise<Token[] | null> {
|
||||
if (!this.available || !this.enabled || !text) {
|
||||
const normalizedText = text.replace(/\r?\n/g, ' ').trim();
|
||||
if (!this.available || !this.enabled || !normalizedText) {
|
||||
return null;
|
||||
}
|
||||
|
||||
return new Promise((resolve) => {
|
||||
const mecabArgs: string[] = [];
|
||||
if (this.dictionaryPath) {
|
||||
mecabArgs.push('-d', this.dictionaryPath);
|
||||
}
|
||||
const mecab = spawn(this.mecabPath ?? this.mecabCommand, mecabArgs, {
|
||||
this.clearIdleShutdownTimer();
|
||||
this.requestQueue.push({
|
||||
text: normalizedText,
|
||||
retryCount: 0,
|
||||
resolve,
|
||||
});
|
||||
this.processQueue();
|
||||
});
|
||||
}
|
||||
|
||||
/**
 * Starts the next queued request if none is in flight.
 *
 * With an empty queue it only schedules the idle shutdown. If the parser
 * process cannot be started, or its stdin cannot be written to, the request is
 * re-queued or resolved with `null` by `retryOrResolveRequest` and the queue
 * is processed again.
 */
private processQueue(): void {
  // Only one request is outstanding at a time.
  if (this.activeRequest) {
    return;
  }

  const request = this.requestQueue.shift();
  if (!request) {
    this.scheduleIdleShutdown();
    return;
  }

  if (!this.ensurePersistentProcess()) {
    this.retryOrResolveRequest(request);
    this.processQueue();
    return;
  }

  this.activeRequest = {
    ...request,
    lines: [],
    stderr: '',
  };

  // A missing or non-writable stdin must count as a failure: a skipped write
  // produces no output, so the active request would otherwise never settle.
  let writeFailure: string | null = null;
  const stdin = this.mecabProcess?.stdin;
  if (!stdin || !stdin.writable) {
    writeFailure = 'stdin is not writable';
  } else {
    try {
      stdin.write(`${request.text}\n`);
    } catch (error) {
      writeFailure = error instanceof Error ? error.message : String(error);
    }
  }
  if (writeFailure === null) {
    return;
  }

  log.error('Failed to write to MeCab process:', writeFailure);
  this.retryOrResolveRequest(request);
  this.activeRequest = null;
  // Drop the unusable process; the recursive call spawns a new one if work remains.
  this.stopPersistentProcess();
  this.processQueue();
}
|
||||
|
||||
/**
 * Gives a failed request another attempt, or gives up on it.
 *
 * The request is appended to the queue with an incremented retry count while
 * it is under `MAX_RETRY_COUNT` and the tokenizer is enabled and available;
 * otherwise its promise is resolved with `null`.
 */
private retryOrResolveRequest(request: MecabQueuedRequest): void {
  const retryAllowed =
    request.retryCount < MecabTokenizer.MAX_RETRY_COUNT && this.enabled && this.available;
  if (!retryAllowed) {
    request.resolve(null);
    return;
  }
  this.requestQueue.push({ ...request, retryCount: request.retryCount + 1 });
}
|
||||
|
||||
private ensurePersistentProcess(): boolean {
|
||||
if (this.mecabProcess) {
|
||||
return true;
|
||||
}
|
||||
|
||||
const mecabArgs: string[] = [];
|
||||
if (this.dictionaryPath) {
|
||||
mecabArgs.push('-d', this.dictionaryPath);
|
||||
}
|
||||
|
||||
let mecab: ReturnType<typeof childProcess.spawn>;
|
||||
try {
|
||||
mecab = this.spawnFn(this.mecabPath ?? this.mecabCommand, mecabArgs, {
|
||||
stdio: ['pipe', 'pipe', 'pipe'],
|
||||
});
|
||||
} catch (error) {
|
||||
log.error('Failed to spawn MeCab:', (error as Error).message);
|
||||
return false;
|
||||
}
|
||||
|
||||
let stdout = '';
|
||||
let stderr = '';
|
||||
if (!mecab.stdin || !mecab.stdout || !mecab.stderr) {
|
||||
log.error('Failed to spawn MeCab: missing stdio pipes');
|
||||
try {
|
||||
mecab.kill();
|
||||
} catch {}
|
||||
return false;
|
||||
}
|
||||
|
||||
mecab.stdout.on('data', (data: Buffer) => {
|
||||
stdout += data.toString();
|
||||
});
|
||||
|
||||
mecab.stderr.on('data', (data: Buffer) => {
|
||||
stderr += data.toString();
|
||||
});
|
||||
|
||||
mecab.on('close', (code: number | null) => {
|
||||
if (code !== 0) {
|
||||
log.error('MeCab process exited with code:', code);
|
||||
if (stderr) {
|
||||
log.error('MeCab stderr:', stderr);
|
||||
}
|
||||
resolve(null);
|
||||
return;
|
||||
}
|
||||
|
||||
const lines = stdout.split('\n');
|
||||
const tokens: Token[] = [];
|
||||
|
||||
for (const line of lines) {
|
||||
const token = parseMecabLine(line);
|
||||
if (token) {
|
||||
tokens.push(token);
|
||||
}
|
||||
}
|
||||
|
||||
if (tokens.length === 0 && text.trim().length > 0) {
|
||||
const trimmedStdout = stdout.trim();
|
||||
const trimmedStderr = stderr.trim();
|
||||
if (trimmedStdout) {
|
||||
log.warn(
|
||||
'MeCab returned no parseable tokens.',
|
||||
`command=${this.mecabPath ?? this.mecabCommand}`,
|
||||
`stdout=${trimmedStdout.slice(0, 1024)}`,
|
||||
);
|
||||
}
|
||||
if (trimmedStderr) {
|
||||
log.warn('MeCab stderr while tokenizing:', trimmedStderr);
|
||||
}
|
||||
}
|
||||
|
||||
resolve(tokens);
|
||||
});
|
||||
|
||||
mecab.on('error', (err: Error) => {
|
||||
log.error('Failed to spawn MeCab:', err.message);
|
||||
resolve(null);
|
||||
});
|
||||
|
||||
mecab.stdin.write(text);
|
||||
mecab.stdin.end();
|
||||
this.stdoutBuffer = '';
|
||||
mecab.stdout.on('data', (data: Buffer | string) => {
|
||||
this.handleStdoutChunk(data.toString());
|
||||
});
|
||||
mecab.stderr.on('data', (data: Buffer | string) => {
|
||||
if (!this.activeRequest) {
|
||||
return;
|
||||
}
|
||||
this.activeRequest.stderr += data.toString();
|
||||
});
|
||||
mecab.on('error', (error: Error) => {
|
||||
this.handlePersistentProcessEnded(mecab, `spawn error: ${error.message}`);
|
||||
});
|
||||
mecab.on('close', (code: number | null) => {
|
||||
this.handlePersistentProcessEnded(mecab, `exit code ${String(code)}`);
|
||||
});
|
||||
|
||||
this.mecabProcess = mecab;
|
||||
return true;
|
||||
}
|
||||
|
||||
/**
 * Appends a stdout chunk to the buffer and dispatches every complete line.
 * A trailing partial line stays buffered until its newline arrives.
 */
private handleStdoutChunk(chunk: string): void {
  this.stdoutBuffer += chunk;
  // The buffer is re-read on every pass because handling a line can reset it.
  for (
    let lineEnd = this.stdoutBuffer.indexOf('\n');
    lineEnd !== -1;
    lineEnd = this.stdoutBuffer.indexOf('\n')
  ) {
    const completeLine = this.stdoutBuffer.slice(0, lineEnd).replace(/\r$/, '');
    this.stdoutBuffer = this.stdoutBuffer.slice(lineEnd + 1);
    this.handleStdoutLine(completeLine);
  }
}
|
||||
|
||||
/**
 * Routes one stdout line to the active request: `EOS` completes it, blank
 * lines are dropped, anything else is collected. Lines arriving with no
 * active request are ignored.
 */
private handleStdoutLine(line: string): void {
  const active = this.activeRequest;
  if (!active) {
    return;
  }
  if (line === 'EOS') {
    this.resolveActiveRequest();
    return;
  }
  if (line.trim()) {
    active.lines.push(line);
  }
}
|
||||
|
||||
/**
 * Completes the active request: parses its collected lines into tokens,
 * warns when non-empty input produced no tokens, resolves the caller's
 * promise, and moves on to the next queued request.
 */
private resolveActiveRequest(): void {
  const finished = this.activeRequest;
  if (!finished) {
    return;
  }
  this.activeRequest = null;

  const tokens = finished.lines
    .map((rawLine) => parseMecabLine(rawLine))
    .filter((parsed): parsed is Token => Boolean(parsed));

  const producedNothing = tokens.length === 0 && finished.text.trim().length > 0;
  if (producedNothing) {
    const stdoutText = finished.lines.join('\n').trim();
    const stderrText = finished.stderr.trim();
    if (stdoutText) {
      log.warn(
        'MeCab returned no parseable tokens.',
        `command=${this.mecabPath ?? this.mecabCommand}`,
        `stdout=${stdoutText.slice(0, 1024)}`,
      );
    }
    if (stderrText) {
      log.warn('MeCab stderr while tokenizing:', stderrText);
    }
  }

  finished.resolve(tokens);
  this.processQueue();
}
|
||||
|
||||
/**
 * Reacts to the parser process erroring or closing.
 *
 * Events from a process that is no longer the current one are ignored.
 * Otherwise the process state is cleared and the in-flight request, followed
 * by all queued requests, goes through `retryOrResolveRequest`.
 *
 * @param process The process that emitted the event.
 * @param reason Short description included in the warning log.
 */
private handlePersistentProcessEnded(
  process: ReturnType<typeof childProcess.spawn>,
  reason: string,
): void {
  if (this.mecabProcess !== process) {
    return;
  }

  this.mecabProcess = null;
  this.stdoutBuffer = '';
  this.clearIdleShutdownTimer();

  // The interrupted request goes first, stripped of its partial output.
  const interrupted = this.activeRequest;
  const pending: MecabQueuedRequest[] = [
    ...(interrupted
      ? [
          {
            text: interrupted.text,
            retryCount: interrupted.retryCount,
            resolve: interrupted.resolve,
          },
        ]
      : []),
    ...this.requestQueue,
  ];
  this.activeRequest = null;
  this.requestQueue = [];

  if (pending.length === 0) {
    return;
  }

  log.warn(`MeCab parser process ended during active work (${reason}); retrying pending request(s).`);
  pending.forEach((request) => this.retryOrResolveRequest(request));
  this.processQueue();
}
|
||||
|
||||
/**
 * (Re)arms the idle-shutdown timer. Nothing is scheduled when idle shutdown
 * is disabled (`idleShutdownMs <= 0`) or no parser process is running.
 */
private scheduleIdleShutdown(): void {
  this.clearIdleShutdownTimer();
  if (this.idleShutdownMs <= 0 || !this.mecabProcess) {
    return;
  }

  const onIdle = (): void => {
    this.idleShutdownTimer = null;
    // Work arrived in the meantime: keep the process alive.
    const busy = this.activeRequest !== null || this.requestQueue.length > 0;
    if (!busy) {
      this.stopPersistentProcess();
    }
  };

  const timer = this.setTimeoutFn(onIdle, this.idleShutdownMs);
  this.idleShutdownTimer = timer;

  // Call `unref` when the handle offers it; injected handles may not.
  const unrefCapable = timer as { unref?: () => void };
  if (typeof unrefCapable.unref === 'function') {
    unrefCapable.unref();
  }
}
|
||||
|
||||
/**
 * Cancels the pending idle-shutdown timer, if any.
 *
 * The handle is compared against `null` rather than tested for truthiness, so
 * a numeric handle of `0` from an injected `setTimeoutFn` is still cancelled.
 */
private clearIdleShutdownTimer(): void {
  const pendingTimer = this.idleShutdownTimer;
  if (pendingTimer === null) {
    return;
  }
  this.clearTimeoutFn(pendingTimer);
  this.idleShutdownTimer = null;
}
|
||||
|
||||
/**
 * Detaches and kills the persistent parser process, if one is running.
 *
 * The process reference is cleared before `kill()`, so the resulting `close`
 * event is treated as stale and triggers no retry. A request still in flight
 * is therefore resolved with `null` here; otherwise its promise would never
 * settle and `activeRequest` would block the queue. Queued requests are left
 * in place for the next `processQueue` call.
 */
private stopPersistentProcess(): void {
  const detached = this.mecabProcess;
  if (!detached) {
    return;
  }
  this.mecabProcess = null;
  this.stdoutBuffer = '';
  this.clearIdleShutdownTimer();

  const orphaned = this.activeRequest;
  if (orphaned) {
    this.activeRequest = null;
    orphaned.resolve(null);
  }

  try {
    detached.kill();
  } catch {
    // Best effort: the process may already have exited.
  }
}
|
||||
|
||||
getStatus(): MecabStatus {
|
||||
@@ -206,6 +425,25 @@ export class MecabTokenizer {
|
||||
|
||||
/**
 * Enables or disables the tokenizer.
 *
 * Disabling resolves the in-flight request and every queued request with
 * `null`, then stops the persistent parser process. Enabling only sets the flag.
 */
setEnabled(enabled: boolean): void {
  this.enabled = enabled;
  if (enabled) {
    return;
  }

  // Detach all outstanding work before settling it, in-flight request first.
  const inFlight = this.activeRequest;
  const queued = this.requestQueue;
  this.activeRequest = null;
  this.requestQueue = [];

  if (inFlight) {
    inFlight.resolve(null);
  }
  queued.forEach((request) => request.resolve(null));

  this.stopPersistentProcess();
}
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user