diff --git a/backlog/tasks/task-81 - Tokenization-performance-disable-yomitan-mecab-and-persistent-local-mecab.md b/backlog/tasks/task-81 - Tokenization-performance-disable-yomitan-mecab-and-persistent-local-mecab.md index 0c33c89..8a38fa7 100644 --- a/backlog/tasks/task-81 - Tokenization-performance-disable-yomitan-mecab-and-persistent-local-mecab.md +++ b/backlog/tasks/task-81 - Tokenization-performance-disable-yomitan-mecab-and-persistent-local-mecab.md @@ -4,7 +4,7 @@ title: 'Tokenization performance: disable Yomitan MeCab parser, gate local MeCab status: Done assignee: [] created_date: '2026-03-02 07:44' -updated_date: '2026-03-02 20:37' +updated_date: '2026-03-02 20:44' labels: [] dependencies: [] priority: high @@ -47,8 +47,10 @@ Implemented tokenizer latency optimizations: - added regression coverage in `src/main/runtime/composers/mpv-runtime-composer.test.ts` for sequential tokenize calls (`warmup` side effects run once); - post-review critical fix: treat Yomitan default-profile Anki server sync `no-change` as successful check, so `lastSyncedYomitanAnkiServer` is cached and expensive sync checks do not repeat on every subtitle line; - added regression assertion in `src/core/services/tokenizer/yomitan-parser-runtime.test.ts` for `updated: false` path returning sync success; -- post-review performance fix: refactored POS enrichment to pre-index MeCab tokens by surface/start position and use sliding overlap window + binary-search cursor fallback, removing repeated full MeCab scans per token (`O(n*m)` hotspot); -- added regression test in `src/core/services/tokenizer/parser-enrichment-stage.test.ts` that fails on repeated distant-token scan access and passes with indexed lookup; +- post-review performance fix: refactored POS enrichment to pre-index MeCab tokens by surface plus character-position overlap index, replacing repeated active-candidate filtering/full-scan behavior with direct overlap candidate lookup per token; +- added regression tests in `src/core/services/tokenizer/parser-enrichment-stage.test.ts` for repeated distant-token scan access and repeated active-candidate filter scans; both fail on scan-based behavior and pass with indexed lookup; +- post-review startup fix: moved JLPT/frequency dictionary initialization from synchronous FS APIs to async `fs/promises` path inspection/read and cooperative chunked entry processing to reduce main-thread stall risk during cold start; +- post-review first-line latency fix: decoupled tokenization warmup gating so first `tokenizeSubtitle` only waits on Yomitan extension readiness, while MeCab check + dictionary prewarm continue in parallel background warmups; - validated with targeted tests and `tsc --noEmit`. diff --git a/src/core/services/tokenizer/parser-enrichment-stage.test.ts b/src/core/services/tokenizer/parser-enrichment-stage.test.ts index 9b5ea59..c0e9140 100644 --- a/src/core/services/tokenizer/parser-enrichment-stage.test.ts +++ b/src/core/services/tokenizer/parser-enrichment-stage.test.ts @@ -101,11 +101,10 @@ test('enrichTokensWithMecabPos1 avoids repeated active-candidate filter scans', let sentinelFilterCalls = 0; const originalFilter = Array.prototype.filter; - Array.prototype.filter = function filterWithSentinelCheck( - this: T[], - predicate: (value: T, index: number, array: T[]) => unknown, - thisArg?: unknown, - ) { + Array.prototype.filter = (function filterWithSentinelCheck( + this: unknown[], + ...args: any[] + ): any[] { const target = this as Array<{ surface?: string }>; if (target.some((candidate) => candidate?.surface === 'SENTINEL')) { sentinelFilterCalls += 1; @@ -113,8 +112,8 @@ test('enrichTokensWithMecabPos1 avoids repeated active-candidate filter scans', throw new Error('repeated active candidate filter scan detected'); } } - return originalFilter.call(this, predicate, thisArg); - }; + return (originalFilter as (...params: any[]) => any[]).apply(this, args); + }) as typeof Array.prototype.filter; try { const enriched = enrichTokensWithMecabPos1(tokens, mecabTokens);