mirror of
https://github.com/ksyasuda/SubMiner.git
synced 2026-03-02 18:22:42 -08:00
perf(tokenizer): cut annotation latency with persistent mecab
This commit is contained in:
@@ -0,0 +1,48 @@
|
|||||||
|
---
|
||||||
|
id: TASK-81
|
||||||
|
title: 'Tokenization performance: disable Yomitan MeCab parser, gate local MeCab init, and add persistent MeCab process'
|
||||||
|
status: Done
|
||||||
|
assignee: []
|
||||||
|
created_date: '2026-03-02 07:44'
|
||||||
|
updated_date: '2026-03-02 07:46'
|
||||||
|
labels: []
|
||||||
|
dependencies: []
|
||||||
|
priority: high
|
||||||
|
ordinal: 9001
|
||||||
|
---
|
||||||
|
|
||||||
|
## Description
|
||||||
|
|
||||||
|
<!-- SECTION:DESCRIPTION:BEGIN -->
|
||||||
|
|
||||||
|
Reduce subtitle annotation latency by:
|
||||||
|
- disabling Yomitan-side MeCab parser requests (`useMecabParser=false`);
|
||||||
|
- initializing local MeCab only when POS-dependent annotations are enabled (N+1 / JLPT / frequency);
|
||||||
|
- replacing per-line local MeCab process spawning with a persistent parser process that auto-shuts down after idle time and restarts on demand.
|
||||||
|
|
||||||
|
<!-- SECTION:DESCRIPTION:END -->
|
||||||
|
|
||||||
|
## Acceptance Criteria
|
||||||
|
|
||||||
|
<!-- AC:BEGIN -->
|
||||||
|
|
||||||
|
- [x] #1 Yomitan parse requests disable MeCab parser path.
|
||||||
|
- [x] #2 MeCab warmup/init is skipped when all POS-dependent annotation toggles are off.
|
||||||
|
- [x] #3 Local MeCab tokenizer uses persistent process across subtitle lines.
|
||||||
|
- [x] #4 Persistent MeCab process auto-shuts down after idle timeout and restarts on next tokenize activity.
|
||||||
|
- [x] #5 Tests cover parser flag, warmup gating, and persistent MeCab lifecycle behavior.
|
||||||
|
|
||||||
|
<!-- AC:END -->
|
||||||
|
|
||||||
|
## Final Summary
|
||||||
|
|
||||||
|
<!-- SECTION:FINAL_SUMMARY:BEGIN -->
|
||||||
|
|
||||||
|
Implemented tokenizer latency optimizations:
|
||||||
|
- switched Yomitan parse requests to `useMecabParser: false`;
|
||||||
|
- added annotation-aware MeCab initialization gating in runtime warmup flow;
|
||||||
|
- added persistent local MeCab process (default idle shutdown: 30s) with queued requests, retry-on-process-end, idle auto-shutdown, and automatic restart on new work;
|
||||||
|
- added regression tests for Yomitan parse flag, MeCab warmup gating, and persistent/idle lifecycle behavior;
|
||||||
|
- validated with targeted tests and `tsc --noEmit`.
|
||||||
|
|
||||||
|
<!-- SECTION:FINAL_SUMMARY:END -->
|
||||||
@@ -1,6 +1,7 @@
|
|||||||
import assert from 'node:assert/strict';
|
import assert from 'node:assert/strict';
|
||||||
import test from 'node:test';
|
import test from 'node:test';
|
||||||
import {
|
import {
|
||||||
|
requestYomitanParseResults,
|
||||||
requestYomitanTermFrequencies,
|
requestYomitanTermFrequencies,
|
||||||
syncYomitanDefaultAnkiServer,
|
syncYomitanDefaultAnkiServer,
|
||||||
} from './yomitan-parser-runtime';
|
} from './yomitan-parser-runtime';
|
||||||
@@ -246,3 +247,32 @@ test('requestYomitanTermFrequencies caches repeated term+reading lookups', async
|
|||||||
const frequencyCalls = scripts.filter((script) => script.includes('getTermFrequencies')).length;
|
const frequencyCalls = scripts.filter((script) => script.includes('getTermFrequencies')).length;
|
||||||
assert.equal(frequencyCalls, 1);
|
assert.equal(frequencyCalls, 1);
|
||||||
});
|
});
|
||||||
|
|
||||||
|
// Regression test: the `parseText` script sent to Yomitan must carry
// `useMecabParser: false` so the Yomitan-side MeCab parser path is not used.
test('requestYomitanParseResults disables Yomitan MeCab parser path', async () => {
  const executedScripts: string[] = [];

  // Minimal options payload returned for the `optionsGetFull` lookup.
  const buildOptionsResponse = () => ({
    profileCurrent: 0,
    profiles: [{ options: { scanning: { length: 40 } } }],
  });

  const deps = createDeps(async (script) => {
    executedScripts.push(script);
    // Every script other than the options lookup resolves to an empty result.
    return script.includes('optionsGetFull') ? buildOptionsResponse() : [];
  });

  const result = await requestYomitanParseResults('猫です', deps, {
    error: () => undefined,
  });
  assert.deepEqual(result, []);

  const parseScript = executedScripts.find((candidate) => candidate.includes('parseText'));
  assert.ok(parseScript, 'expected parseText request script');
  assert.match(parseScript ?? '', /useMecabParser:\s*false/);
});
|
||||||
|
|||||||
@@ -529,7 +529,7 @@ export async function requestYomitanParseResults(
|
|||||||
optionsContext: { index: ${metadata.profileIndex} },
|
optionsContext: { index: ${metadata.profileIndex} },
|
||||||
scanLength: ${metadata.scanLength},
|
scanLength: ${metadata.scanLength},
|
||||||
useInternalParser: true,
|
useInternalParser: true,
|
||||||
useMecabParser: true
|
useMecabParser: false
|
||||||
});
|
});
|
||||||
})();
|
})();
|
||||||
`
|
`
|
||||||
@@ -564,7 +564,7 @@ export async function requestYomitanParseResults(
|
|||||||
optionsContext: { index: profileIndex },
|
optionsContext: { index: profileIndex },
|
||||||
scanLength,
|
scanLength,
|
||||||
useInternalParser: true,
|
useInternalParser: true,
|
||||||
useMecabParser: true
|
useMecabParser: false
|
||||||
});
|
});
|
||||||
})();
|
})();
|
||||||
`;
|
`;
|
||||||
|
|||||||
@@ -236,3 +236,144 @@ test('composeMpvRuntimeHandlers returns callable handlers and forwards to inject
|
|||||||
assert.ok(calls.includes('warmup-yomitan'));
|
assert.ok(calls.includes('warmup-yomitan'));
|
||||||
assert.ok(calls.indexOf('create-mecab') < calls.indexOf('set-started:true'));
|
assert.ok(calls.indexOf('create-mecab') < calls.indexOf('set-started:true'));
|
||||||
});
|
});
|
||||||
|
|
||||||
|
// With N+1, JLPT and frequency annotations all reporting `false`, running the
// tokenization warmups must not create, store or availability-check a MeCab
// tokenizer. Only the MeCab deps record into `calls`, so it must stay empty.
test('composeMpvRuntimeHandlers skips MeCab warmup when all POS-dependent annotations are disabled', async () => {
  const calls: string[] = [];
  let mecabTokenizer: { id: string } | null = null;

  // Shared do-nothing stubs for dependencies this test never observes.
  const noop = (): void => {};
  const asyncNoop = async (): Promise<void> => {};

  class FakeMpvClient {
    connected = false;
    constructor(
      public socketPath: string,
      public options: unknown,
    ) {}
    on(): void {}
    connect(): void {
      this.connected = true;
    }
  }

  const composed = composeMpvRuntimeHandlers<
    FakeMpvClient,
    { isKnownWord: (text: string) => boolean },
    { text: string }
  >({
    bindMpvMainEventHandlersMainDeps: {
      appState: {
        initialArgs: null,
        overlayRuntimeInitialized: true,
        mpvClient: null,
        immersionTracker: null,
        subtitleTimingTracker: null,
        currentSubText: '',
        currentSubAssText: '',
        playbackPaused: null,
        previousSecondarySubVisibility: null,
      },
      getQuitOnDisconnectArmed: () => false,
      scheduleQuitCheck: noop,
      quitApp: noop,
      reportJellyfinRemoteStopped: noop,
      syncOverlayMpvSubtitleSuppression: noop,
      maybeRunAnilistPostWatchUpdate: asyncNoop,
      logSubtitleTimingError: noop,
      broadcastToOverlayWindows: noop,
      onSubtitleChange: noop,
      refreshDiscordPresence: noop,
      ensureImmersionTrackerInitialized: noop,
      updateCurrentMediaPath: noop,
      restoreMpvSubVisibility: noop,
      getCurrentAnilistMediaKey: () => null,
      resetAnilistMediaTracking: noop,
      maybeProbeAnilistDuration: noop,
      ensureAnilistMediaGuess: noop,
      syncImmersionMediaState: noop,
      updateCurrentMediaTitle: noop,
      resetAnilistMediaGuessState: noop,
      reportJellyfinRemoteProgress: noop,
      updateSubtitleRenderMetrics: noop,
    },
    mpvClientRuntimeServiceFactoryMainDeps: {
      createClient: FakeMpvClient,
      getSocketPath: () => '/tmp/mpv.sock',
      getResolvedConfig: () => ({ auto_start_overlay: false }),
      isAutoStartOverlayEnabled: () => true,
      setOverlayVisible: noop,
      isVisibleOverlayVisible: () => false,
      getReconnectTimer: () => null,
      setReconnectTimer: noop,
    },
    updateMpvSubtitleRenderMetricsMainDeps: {
      getCurrentMetrics: () => BASE_METRICS,
      setCurrentMetrics: noop,
      applyPatch: (current, patch) => ({ next: { ...current, ...patch }, changed: true }),
      broadcastMetrics: noop,
    },
    tokenizer: {
      buildTokenizerDepsMainDeps: {
        getYomitanExt: () => null,
        getYomitanParserWindow: () => null,
        setYomitanParserWindow: noop,
        getYomitanParserReadyPromise: () => null,
        setYomitanParserReadyPromise: noop,
        getYomitanParserInitPromise: () => null,
        setYomitanParserInitPromise: noop,
        isKnownWord: () => false,
        recordLookup: noop,
        getKnownWordMatchMode: () => 'headword',
        // The three annotation toggles under test: all disabled.
        getNPlusOneEnabled: () => false,
        getMinSentenceWordsForNPlusOne: () => 3,
        getJlptLevel: () => null,
        getJlptEnabled: () => false,
        getFrequencyDictionaryEnabled: () => false,
        getFrequencyDictionaryMatchMode: () => 'headword',
        getFrequencyRank: () => null,
        getYomitanGroupDebugEnabled: () => false,
        getMecabTokenizer: () => null,
      },
      createTokenizerRuntimeDeps: () => ({ isKnownWord: () => false }),
      tokenizeSubtitle: async (text) => ({ text }),
      createMecabTokenizerAndCheckMainDeps: {
        getMecabTokenizer: () => mecabTokenizer,
        setMecabTokenizer: (next) => {
          mecabTokenizer = next as { id: string };
          calls.push('set-mecab');
        },
        createMecabTokenizer: () => {
          calls.push('create-mecab');
          return { id: 'mecab' };
        },
        checkAvailability: async () => {
          calls.push('check-mecab');
        },
      },
      prewarmSubtitleDictionariesMainDeps: {
        ensureJlptDictionaryLookup: asyncNoop,
        ensureFrequencyDictionaryLookup: asyncNoop,
      },
    },
    warmups: {
      launchBackgroundWarmupTaskMainDeps: {
        now: () => 0,
        logDebug: noop,
        logWarn: noop,
      },
      startBackgroundWarmupsMainDeps: {
        getStarted: () => false,
        setStarted: noop,
        isTexthookerOnlyMode: () => false,
        ensureYomitanExtensionLoaded: asyncNoop,
        shouldWarmupMecab: () => false,
        shouldWarmupYomitanExtension: () => false,
        shouldWarmupSubtitleDictionaries: () => false,
        shouldWarmupJellyfinRemoteSession: () => false,
        shouldAutoConnectJellyfinRemote: () => false,
        startJellyfinRemoteSession: asyncNoop,
      },
    },
  });

  await composed.startTokenizationWarmups();

  assert.deepEqual(calls, []);
});
|
||||||
|
|||||||
@@ -133,12 +133,23 @@ export function composeMpvRuntimeHandlers<
|
|||||||
const prewarmSubtitleDictionaries = createPrewarmSubtitleDictionariesMainHandler(
|
const prewarmSubtitleDictionaries = createPrewarmSubtitleDictionariesMainHandler(
|
||||||
options.tokenizer.prewarmSubtitleDictionariesMainDeps,
|
options.tokenizer.prewarmSubtitleDictionariesMainDeps,
|
||||||
);
|
);
|
||||||
|
/**
 * Reports whether local MeCab should be initialized during warmup.
 *
 * Returns `true` when at least one of the N+1, JLPT or frequency-dictionary
 * toggles does not report an explicit `false`. A missing `getNPlusOneEnabled`
 * getter therefore counts as enabled.
 */
const shouldInitializeMecabForAnnotations = (): boolean => {
  const tokenizerDeps = options.tokenizer.buildTokenizerDepsMainDeps;
  // All three getters are read up front, in this order, without short-circuiting.
  const annotationToggles = [
    tokenizerDeps.getNPlusOneEnabled?.() !== false,
    tokenizerDeps.getJlptEnabled() !== false,
    tokenizerDeps.getFrequencyDictionaryEnabled() !== false,
  ];
  return annotationToggles.includes(true);
};
|
||||||
let tokenizationWarmupInFlight: Promise<void> | null = null;
|
let tokenizationWarmupInFlight: Promise<void> | null = null;
|
||||||
const startTokenizationWarmups = (): Promise<void> => {
|
const startTokenizationWarmups = (): Promise<void> => {
|
||||||
if (!tokenizationWarmupInFlight) {
|
if (!tokenizationWarmupInFlight) {
|
||||||
tokenizationWarmupInFlight = (async () => {
|
tokenizationWarmupInFlight = (async () => {
|
||||||
await options.warmups.startBackgroundWarmupsMainDeps.ensureYomitanExtensionLoaded();
|
await options.warmups.startBackgroundWarmupsMainDeps.ensureYomitanExtensionLoaded();
|
||||||
if (!options.tokenizer.createMecabTokenizerAndCheckMainDeps.getMecabTokenizer()) {
|
if (
|
||||||
|
shouldInitializeMecabForAnnotations() &&
|
||||||
|
!options.tokenizer.createMecabTokenizerAndCheckMainDeps.getMecabTokenizer()
|
||||||
|
) {
|
||||||
await createMecabTokenizerAndCheck().catch(() => {});
|
await createMecabTokenizerAndCheck().catch(() => {});
|
||||||
}
|
}
|
||||||
await prewarmSubtitleDictionaries({ showLoadingOsd: true });
|
await prewarmSubtitleDictionaries({ showLoadingOsd: true });
|
||||||
|
|||||||
114
src/mecab-tokenizer.test.ts
Normal file
114
src/mecab-tokenizer.test.ts
Normal file
@@ -0,0 +1,114 @@
|
|||||||
|
import assert from 'node:assert/strict';
|
||||||
|
import test from 'node:test';
|
||||||
|
import { EventEmitter } from 'node:events';
|
||||||
|
import * as childProcess from 'node:child_process';
|
||||||
|
import { PassThrough, Writable } from 'node:stream';
|
||||||
|
import { MecabTokenizer } from './mecab-tokenizer';
|
||||||
|
|
||||||
|
/**
 * Builds an in-memory stand-in for a spawned MeCab child process.
 *
 * Each chunk written to `stdin` is answered on `stdout` with one noun token
 * line for the trimmed text followed by `EOS`; blank input is answered with
 * `EOS` alone. `kill()` calls `onKill`, emits `close` with code 0 and returns
 * `true`.
 *
 * @param onKill Callback invoked every time `kill()` is called on the fake.
 * @returns An event emitter cast to the `spawn` return type with piped stdio.
 */
function createFakeMecabProcess(onKill: () => void): ReturnType<typeof childProcess.spawn> {
  type FakeChild = ReturnType<typeof childProcess.spawn> & {
    stdin: Writable;
    stdout: PassThrough;
    stderr: PassThrough;
  };

  const stdout = new PassThrough();
  const stderr = new PassThrough();
  const stdin = new Writable({
    write(chunk, _encoding, done) {
      const surface = String(chunk).replace(/\n+$/, '').trim();
      const reply = surface
        ? `${surface}\t名詞,一般,*,*,*,*,${surface},${surface},${surface}\nEOS\n`
        : 'EOS\n';
      stdout.write(reply);
      done();
    },
  });

  const fakeChild = new EventEmitter() as unknown as FakeChild;
  fakeChild.stdin = stdin;
  fakeChild.stdout = stdout;
  fakeChild.stderr = stderr;
  fakeChild.kill = () => {
    onKill();
    fakeChild.emit('close', 0);
    return true;
  };
  return fakeChild;
}
|
||||||
|
|
||||||
|
// Two consecutive tokenize calls must share one spawned process: a single
// spawn, no kill, and each call resolves with the token for its own input.
test('MecabTokenizer reuses a persistent parser process across subtitle lines', async () => {
  const counters = { spawn: 0, kill: 0 };
  const scheduledTimers = new Map<number, () => void>();
  let lastTimerId = 0;

  const fakeExecSync = (() => '/usr/bin/mecab') as unknown as typeof childProcess.execSync;
  const fakeSpawn = (() => {
    counters.spawn += 1;
    return createFakeMecabProcess(() => {
      counters.kill += 1;
    });
  }) as unknown as typeof childProcess.spawn;

  const tokenizer = new MecabTokenizer({
    execSyncFn: fakeExecSync,
    spawnFn: fakeSpawn,
    // Manual timers: callbacks are recorded and never fire on their own.
    setTimeoutFn: (callback) => {
      lastTimerId += 1;
      scheduledTimers.set(lastTimerId, callback);
      return lastTimerId as unknown as ReturnType<typeof setTimeout>;
    },
    clearTimeoutFn: (handle) => {
      scheduledTimers.delete(handle as unknown as number);
    },
    idleShutdownMs: 60_000,
  });

  assert.equal(await tokenizer.checkAvailability(), true);

  const catTokens = await tokenizer.tokenize('猫');
  const dogTokens = await tokenizer.tokenize('犬');

  assert.equal(catTokens?.[0]?.word, '猫');
  assert.equal(dogTokens?.[0]?.word, '犬');
  assert.equal(counters.spawn, 1);
  assert.equal(counters.kill, 0);
});
|
||||||
|
|
||||||
|
// Firing the recorded idle timer must kill the process; the next tokenize
// call must then spawn a fresh one.
test('MecabTokenizer shuts down after idle timeout and restarts on new activity', async () => {
  const counters = { spawn: 0, kill: 0 };
  const scheduledTimers = new Map<number, () => void>();
  let lastTimerId = 0;

  const fakeExecSync = (() => '/usr/bin/mecab') as unknown as typeof childProcess.execSync;
  const fakeSpawn = (() => {
    counters.spawn += 1;
    return createFakeMecabProcess(() => {
      counters.kill += 1;
    });
  }) as unknown as typeof childProcess.spawn;

  const tokenizer = new MecabTokenizer({
    execSyncFn: fakeExecSync,
    spawnFn: fakeSpawn,
    // Manual timers: the test fires the recorded callback itself.
    setTimeoutFn: (callback) => {
      lastTimerId += 1;
      scheduledTimers.set(lastTimerId, callback);
      return lastTimerId as unknown as ReturnType<typeof setTimeout>;
    },
    clearTimeoutFn: (handle) => {
      scheduledTimers.delete(handle as unknown as number);
    },
    idleShutdownMs: 5_000,
  });

  assert.equal(await tokenizer.checkAvailability(), true);
  await tokenizer.tokenize('猫');
  assert.equal(counters.spawn, 1);

  // Simulate the idle timeout elapsing.
  const [idleCallback] = Array.from(scheduledTimers.values());
  assert.ok(idleCallback, 'expected idle shutdown timer');
  idleCallback?.();
  assert.equal(counters.kill, 1);

  await tokenizer.tokenize('犬');
  assert.equal(counters.spawn, 2);
});
|
||||||
@@ -16,7 +16,7 @@
|
|||||||
* along with this program. If not, see <https://www.gnu.org/licenses/>.
|
* along with this program. If not, see <https://www.gnu.org/licenses/>.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
import { spawn, execSync } from 'child_process';
|
import * as childProcess from 'child_process';
|
||||||
import { PartOfSpeech, Token, MecabStatus } from './types';
|
import { PartOfSpeech, Token, MecabStatus } from './types';
|
||||||
import { createLogger } from './logger';
|
import { createLogger } from './logger';
|
||||||
|
|
||||||
@@ -89,18 +89,58 @@ export function parseMecabLine(line: string): Token | null {
|
|||||||
/** Construction options for `MecabTokenizer`; every field is optional. */
export interface MecabTokenizerOptions {
  /** MeCab executable name or path. */
  mecabCommand?: string;
  /** Dictionary directory handed to MeCab through `-d`. */
  dictionaryPath?: string;
  /** Idle time in milliseconds before the persistent process is stopped. */
  idleShutdownMs?: number;
  /** Replacement for `child_process.spawn` (injected by tests). */
  spawnFn?: typeof childProcess.spawn;
  /** Replacement for `child_process.execSync` (injected by tests). */
  execSyncFn?: typeof childProcess.execSync;
  /** Replacement for `setTimeout`, used to schedule the idle shutdown. */
  setTimeoutFn?: (callback: () => void, delayMs: number) => ReturnType<typeof setTimeout>;
  /** Replacement for `clearTimeout`, used to cancel the idle shutdown. */
  clearTimeoutFn?: (timer: ReturnType<typeof setTimeout>) => void;
}

/** A tokenize call that is waiting for its turn on the persistent process. */
interface MecabQueuedRequest {
  /** Single-line text to send to MeCab. */
  text: string;
  /** Number of times this request has already been re-queued. */
  retryCount: number;
  /** Settles the caller's promise with the tokens, or `null` on failure. */
  resolve: (tokens: Token[] | null) => void;
}

/** The request currently being served, plus the output gathered for it. */
interface MecabActiveRequest extends MecabQueuedRequest {
  /** Non-blank stdout lines received before the `EOS` marker. */
  lines: string[];
  /** Stderr text received while this request was active. */
  stderr: string;
}
|
||||||
|
|
||||||
export class MecabTokenizer {
|
export class MecabTokenizer {
|
||||||
|
private static readonly DEFAULT_IDLE_SHUTDOWN_MS = 30_000;
|
||||||
|
private static readonly MAX_RETRY_COUNT = 1;
|
||||||
|
|
||||||
private mecabPath: string | null = null;
|
private mecabPath: string | null = null;
|
||||||
private mecabCommand: string;
|
private mecabCommand: string;
|
||||||
private dictionaryPath: string | null;
|
private dictionaryPath: string | null;
|
||||||
private available: boolean = false;
|
private available: boolean = false;
|
||||||
private enabled: boolean = true;
|
private enabled: boolean = true;
|
||||||
|
private idleShutdownMs: number;
|
||||||
|
private readonly spawnFn: typeof childProcess.spawn;
|
||||||
|
private readonly execSyncFn: typeof childProcess.execSync;
|
||||||
|
private readonly setTimeoutFn: (
|
||||||
|
callback: () => void,
|
||||||
|
delayMs: number,
|
||||||
|
) => ReturnType<typeof setTimeout>;
|
||||||
|
private readonly clearTimeoutFn: (timer: ReturnType<typeof setTimeout>) => void;
|
||||||
|
private mecabProcess: ReturnType<typeof childProcess.spawn> | null = null;
|
||||||
|
private idleShutdownTimer: ReturnType<typeof setTimeout> | null = null;
|
||||||
|
private stdoutBuffer = '';
|
||||||
|
private requestQueue: MecabQueuedRequest[] = [];
|
||||||
|
private activeRequest: MecabActiveRequest | null = null;
|
||||||
|
|
||||||
/**
 * Creates a tokenizer; no MeCab process is spawned here.
 *
 * @param options Command, dictionary, idle timeout and injectable
 *   process/timer functions. Anything omitted falls back to the real
 *   `child_process` / timer functions, the `mecab` command and the default
 *   idle timeout.
 */
constructor(options: MecabTokenizerOptions = {}) {
  this.mecabCommand = options.mecabCommand?.trim() || 'mecab';
  this.dictionaryPath = options.dictionaryPath?.trim() || null;

  // Node coerces timer delays that are NaN or larger than 2^31-1 to 1 ms, which
  // would make the "idle" shutdown fire almost immediately after every line.
  // NaN falls back to the default; oversized values (including Infinity) are
  // clamped to the largest delay timers can represent.
  const MAX_TIMER_DELAY_MS = 2_147_483_647;
  const requestedIdleMs = options.idleShutdownMs ?? MecabTokenizer.DEFAULT_IDLE_SHUTDOWN_MS;
  const usableIdleMs = Number.isNaN(requestedIdleMs)
    ? MecabTokenizer.DEFAULT_IDLE_SHUTDOWN_MS
    : requestedIdleMs;
  this.idleShutdownMs = Math.min(MAX_TIMER_DELAY_MS, Math.max(0, Math.floor(usableIdleMs)));

  this.spawnFn = options.spawnFn ?? childProcess.spawn;
  this.execSyncFn = options.execSyncFn ?? childProcess.execSync;
  this.setTimeoutFn = options.setTimeoutFn ?? ((callback, delayMs) => setTimeout(callback, delayMs));
  this.clearTimeoutFn = options.clearTimeoutFn ?? ((timer) => clearTimeout(timer));
}
|
||||||
|
|
||||||
async checkAvailability(): Promise<boolean> {
|
async checkAvailability(): Promise<boolean> {
|
||||||
@@ -108,9 +148,10 @@ export class MecabTokenizer {
|
|||||||
const command = this.mecabCommand;
|
const command = this.mecabCommand;
|
||||||
const result = command.includes('/')
|
const result = command.includes('/')
|
||||||
? command
|
? command
|
||||||
: execSync(`which ${command}`, { encoding: 'utf-8' }).trim();
|
: this.execSyncFn(`which ${command}`, { encoding: 'utf-8' });
|
||||||
if (result) {
|
const resolvedPath = String(result).trim();
|
||||||
this.mecabPath = result;
|
if (resolvedPath) {
|
||||||
|
this.mecabPath = resolvedPath;
|
||||||
this.available = true;
|
this.available = true;
|
||||||
log.info('MeCab found at:', this.mecabPath);
|
log.info('MeCab found at:', this.mecabPath);
|
||||||
return true;
|
return true;
|
||||||
@@ -119,81 +160,259 @@ export class MecabTokenizer {
|
|||||||
log.info('MeCab not found on system');
|
log.info('MeCab not found on system');
|
||||||
}
|
}
|
||||||
|
|
||||||
|
this.stopPersistentProcess();
|
||||||
this.available = false;
|
this.available = false;
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
 * Tokenizes one piece of text through the persistent MeCab process.
 *
 * Line breaks inside `text` are replaced with spaces so the request occupies
 * exactly one input line. Requests are queued and served one at a time.
 *
 * @param text Text to tokenize.
 * @returns The parsed tokens, or `null` when the tokenizer is unavailable or
 *   disabled, the text is missing or blank, or the request ultimately fails.
 */
async tokenize(text: string): Promise<Token[] | null> {
  // Guard before touching `text`: the previous implementation returned null for
  // any falsy input, so an untyped caller passing null/undefined must not throw.
  if (!this.available || !this.enabled || typeof text !== 'string') {
    return null;
  }

  const normalizedText = text.replace(/\r?\n/g, ' ').trim();
  if (!normalizedText) {
    return null;
  }

  return new Promise<Token[] | null>((resolve) => {
    // New work cancels any pending idle shutdown.
    this.clearIdleShutdownTimer();
    this.requestQueue.push({
      text: normalizedText,
      retryCount: 0,
      resolve,
    });
    this.processQueue();
  });
}
|
||||||
|
|
||||||
|
/**
 * Starts the next queued request when nothing is in flight.
 *
 * An empty queue schedules the idle shutdown. If the process cannot be
 * started, or the write throws, the request goes through
 * `retryOrResolveRequest` and the queue is processed again.
 */
private processQueue(): void {
  if (this.activeRequest) {
    return;
  }

  const next = this.requestQueue.shift();
  if (!next) {
    this.scheduleIdleShutdown();
    return;
  }

  const processReady = this.ensurePersistentProcess();
  if (!processReady) {
    this.retryOrResolveRequest(next);
    this.processQueue();
    return;
  }

  // Mark the request active before writing so stdout can be attributed to it.
  this.activeRequest = { ...next, lines: [], stderr: '' };

  try {
    this.mecabProcess?.stdin?.write(`${next.text}\n`);
  } catch (error) {
    log.error('Failed to write to MeCab process:', (error as Error).message);
    this.retryOrResolveRequest(next);
    this.activeRequest = null;
    this.stopPersistentProcess();
    this.processQueue();
  }
}
|
||||||
|
|
||||||
|
/**
 * Re-queues a failed request with its retry count incremented, or resolves
 * it with `null` when retries are exhausted or the tokenizer is disabled or
 * unavailable.
 *
 * @param request The request that could not be completed.
 */
private retryOrResolveRequest(request: MecabQueuedRequest): void {
  const hasRetriesLeft = request.retryCount < MecabTokenizer.MAX_RETRY_COUNT;
  const mayRetry = hasRetriesLeft && this.enabled && this.available;

  if (!mayRetry) {
    request.resolve(null);
    return;
  }

  this.requestQueue.push({ ...request, retryCount: request.retryCount + 1 });
}
|
||||||
|
|
||||||
|
/**
 * Makes sure a MeCab child process is running, spawning one if needed.
 *
 * The new process gets stdout/stderr collectors plus `error`/`close`
 * handlers that route into `handlePersistentProcessEnded`.
 *
 * @returns `true` when a process is available, `false` when spawning threw
 *   or the child came back without all three stdio pipes.
 */
private ensurePersistentProcess(): boolean {
  if (this.mecabProcess) {
    return true;
  }

  const mecabArgs: string[] = [];
  if (this.dictionaryPath) {
    mecabArgs.push('-d', this.dictionaryPath);
  }

  let mecab: ReturnType<typeof childProcess.spawn>;
  try {
    mecab = this.spawnFn(this.mecabPath ?? this.mecabCommand, mecabArgs, {
      stdio: ['pipe', 'pipe', 'pipe'],
    });
  } catch (error) {
    log.error('Failed to spawn MeCab:', (error as Error).message);
    return false;
  }

  if (!mecab.stdin || !mecab.stdout || !mecab.stderr) {
    log.error('Failed to spawn MeCab: missing stdio pipes');
    try {
      mecab.kill();
    } catch {}
    return false;
  }

  // Drop any partial line left over from a previous process.
  this.stdoutBuffer = '';
  mecab.stdout.on('data', (data: Buffer | string) => {
    this.handleStdoutChunk(data.toString());
  });
  mecab.stderr.on('data', (data: Buffer | string) => {
    if (!this.activeRequest) {
      return;
    }
    this.activeRequest.stderr += data.toString();
  });
  // Write failures on a pipe (e.g. EPIPE after MeCab died) are reported
  // asynchronously as an 'error' event on stdin, not as a throw from write().
  // Without a listener Node raises that event as an uncaught exception, so
  // treat it like the process ending and make sure the child is gone.
  mecab.stdin.on('error', (error: Error) => {
    this.handlePersistentProcessEnded(mecab, `stdin error: ${error.message}`);
    try {
      mecab.kill();
    } catch {}
  });
  mecab.on('error', (error: Error) => {
    this.handlePersistentProcessEnded(mecab, `spawn error: ${error.message}`);
  });
  mecab.on('close', (code: number | null) => {
    this.handlePersistentProcessEnded(mecab, `exit code ${String(code)}`);
  });

  this.mecabProcess = mecab;
  return true;
}
|
||||||
|
|
||||||
|
/**
 * Appends a stdout chunk to the line buffer and dispatches every complete
 * line (trailing `\r` removed) to `handleStdoutLine`. A trailing partial
 * line stays buffered for the next chunk.
 *
 * @param chunk Decoded stdout text from the MeCab process.
 */
private handleStdoutChunk(chunk: string): void {
  this.stdoutBuffer += chunk;

  // The buffer is re-read from the field on every pass and trimmed before the
  // line is dispatched, because handling a line can itself change the buffer.
  for (
    let lineEnd = this.stdoutBuffer.indexOf('\n');
    lineEnd !== -1;
    lineEnd = this.stdoutBuffer.indexOf('\n')
  ) {
    const rawLine = this.stdoutBuffer.slice(0, lineEnd);
    this.stdoutBuffer = this.stdoutBuffer.slice(lineEnd + 1);
    this.handleStdoutLine(rawLine.replace(/\r$/, ''));
  }
}
|
||||||
|
|
||||||
|
/**
 * Handles one complete stdout line: `EOS` finishes the active request,
 * blank lines are dropped, anything else is stored on the active request.
 * Output that arrives while no request is active is ignored.
 *
 * @param line One stdout line without its line terminator.
 */
private handleStdoutLine(line: string): void {
  const active = this.activeRequest;
  if (!active) {
    return;
  }

  if (line === 'EOS') {
    this.resolveActiveRequest();
    return;
  }

  if (line.trim()) {
    active.lines.push(line);
  }
}
|
||||||
|
|
||||||
|
/**
 * Completes the active request: parses its collected lines into tokens,
 * warns when non-blank input produced no tokens, resolves the caller's
 * promise and then moves on to the next queued request.
 */
private resolveActiveRequest(): void {
  const finished = this.activeRequest;
  if (!finished) {
    return;
  }
  // Cleared first so the processQueue() call below can start the next request.
  this.activeRequest = null;

  const tokens = finished.lines
    .map((line) => parseMecabLine(line))
    .filter((token): token is Token => Boolean(token));

  const nothingParsed = tokens.length === 0 && finished.text.trim().length > 0;
  if (nothingParsed) {
    const trimmedStdout = finished.lines.join('\n').trim();
    const trimmedStderr = finished.stderr.trim();
    if (trimmedStdout) {
      log.warn(
        'MeCab returned no parseable tokens.',
        `command=${this.mecabPath ?? this.mecabCommand}`,
        `stdout=${trimmedStdout.slice(0, 1024)}`,
      );
    }
    if (trimmedStderr) {
      log.warn('MeCab stderr while tokenizing:', trimmedStderr);
    }
  }

  finished.resolve(tokens);
  this.processQueue();
}
|
||||||
|
|
||||||
|
/**
 * Reacts to the persistent process ending (error or close event).
 *
 * Events from a process that is no longer the current one are ignored.
 * Otherwise the process state is reset, and the interrupted request plus
 * everything queued goes through `retryOrResolveRequest` before the queue
 * is processed again.
 *
 * @param process The child process the event belongs to.
 * @param reason Short description used in the warning log line.
 */
private handlePersistentProcessEnded(
  process: ReturnType<typeof childProcess.spawn>,
  reason: string,
): void {
  if (this.mecabProcess !== process) {
    return;
  }

  this.mecabProcess = null;
  this.stdoutBuffer = '';
  this.clearIdleShutdownTimer();

  // Detach all outstanding work; the interrupted request goes first.
  const interrupted = this.activeRequest;
  const queued = this.requestQueue;
  this.activeRequest = null;
  this.requestQueue = [];

  const pending: MecabQueuedRequest[] = interrupted
    ? [
        {
          text: interrupted.text,
          retryCount: interrupted.retryCount,
          resolve: interrupted.resolve,
        },
        ...queued,
      ]
    : [...queued];

  if (pending.length === 0) {
    return;
  }

  log.warn(`MeCab parser process ended during active work (${reason}); retrying pending request(s).`);
  for (const request of pending) {
    this.retryOrResolveRequest(request);
  }
  this.processQueue();
}
|
||||||
|
|
||||||
|
/**
 * (Re)arms the idle shutdown timer. Does nothing beyond cancelling the old
 * timer when the idle timeout is zero or no process is running. When the
 * timer fires with no active or queued work, the process is stopped.
 */
private scheduleIdleShutdown(): void {
  this.clearIdleShutdownTimer();

  const idleShutdownDisabled = this.idleShutdownMs <= 0;
  if (idleShutdownDisabled || !this.mecabProcess) {
    return;
  }

  const onIdleTimeout = (): void => {
    this.idleShutdownTimer = null;
    const hasWork = Boolean(this.activeRequest) || this.requestQueue.length > 0;
    if (!hasWork) {
      this.stopPersistentProcess();
    }
  };
  this.idleShutdownTimer = this.setTimeoutFn(onIdleTimeout, this.idleShutdownMs);

  // Call unref() when the timer handle offers it; injected timers may not.
  const unrefCapable = this.idleShutdownTimer as { unref?: () => void };
  if (typeof unrefCapable.unref === 'function') {
    unrefCapable.unref();
  }
}
|
||||||
|
|
||||||
|
/** Cancels the pending idle shutdown timer, if there is one. */
private clearIdleShutdownTimer(): void {
  const pendingTimer = this.idleShutdownTimer;
  if (pendingTimer) {
    this.clearTimeoutFn(pendingTimer);
    this.idleShutdownTimer = null;
  }
}
|
||||||
|
|
||||||
|
/**
 * Stops the persistent MeCab process, if any. The process reference is
 * dropped before `kill()` so that the resulting close event is ignored by
 * `handlePersistentProcessEnded`.
 */
private stopPersistentProcess(): void {
  const running = this.mecabProcess;
  if (!running) {
    return;
  }

  this.mecabProcess = null;
  this.stdoutBuffer = '';
  this.clearIdleShutdownTimer();

  try {
    running.kill();
  } catch {
    // Best effort: errors thrown by kill() are deliberately ignored.
  }
}
|
||||||
|
|
||||||
getStatus(): MecabStatus {
|
getStatus(): MecabStatus {
|
||||||
@@ -206,6 +425,25 @@ export class MecabTokenizer {
|
|||||||
|
|
||||||
/**
 * Enables or disables the tokenizer. Disabling resolves the active request
 * and every queued request with `null`, then stops the persistent process.
 *
 * @param enabled New enabled state.
 */
setEnabled(enabled: boolean): void {
  this.enabled = enabled;
  if (enabled) {
    return;
  }

  // Detach all outstanding work before settling it.
  const interrupted = this.activeRequest;
  const queued = this.requestQueue;
  this.activeRequest = null;
  this.requestQueue = [];

  interrupted?.resolve(null);
  for (const request of queued) {
    request.resolve(null);
  }

  this.stopPersistentProcess();
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user