From fa97472bce84099fe62d66dba3d0cb754bed1a1d Mon Sep 17 00:00:00 2001 From: sudacode Date: Mon, 2 Mar 2026 01:39:44 -0800 Subject: [PATCH] perf(tokenizer): optimize mecab POS enrichment lookups --- ...omitan-mecab-and-persistent-local-mecab.md | 4 +- .../tokenizer/parser-enrichment-stage.test.ts | 75 ++++ .../tokenizer/parser-enrichment-stage.ts | 341 ++++++++++++++---- 3 files changed, 344 insertions(+), 76 deletions(-) diff --git a/backlog/tasks/task-81 - Tokenization-performance-disable-yomitan-mecab-and-persistent-local-mecab.md b/backlog/tasks/task-81 - Tokenization-performance-disable-yomitan-mecab-and-persistent-local-mecab.md index 009622a..0c33c89 100644 --- a/backlog/tasks/task-81 - Tokenization-performance-disable-yomitan-mecab-and-persistent-local-mecab.md +++ b/backlog/tasks/task-81 - Tokenization-performance-disable-yomitan-mecab-and-persistent-local-mecab.md @@ -4,7 +4,7 @@ title: 'Tokenization performance: disable Yomitan MeCab parser, gate local MeCab status: Done assignee: [] created_date: '2026-03-02 07:44' -updated_date: '2026-03-02 20:34' +updated_date: '2026-03-02 20:37' labels: [] dependencies: [] priority: high @@ -47,6 +47,8 @@ Implemented tokenizer latency optimizations: - added regression coverage in `src/main/runtime/composers/mpv-runtime-composer.test.ts` for sequential tokenize calls (`warmup` side effects run once); - post-review critical fix: treat Yomitan default-profile Anki server sync `no-change` as successful check, so `lastSyncedYomitanAnkiServer` is cached and expensive sync checks do not repeat on every subtitle line; - added regression assertion in `src/core/services/tokenizer/yomitan-parser-runtime.test.ts` for `updated: false` path returning sync success; +- post-review performance fix: refactored POS enrichment to pre-index MeCab tokens by surface/start position and use sliding overlap window + binary-search cursor fallback, removing repeated full MeCab scans per token (`O(n*m)` hotspot); +- added regression test in `src/core/services/tokenizer/parser-enrichment-stage.test.ts` that fails on repeated distant-token scan access and passes with indexed lookup; - validated with targeted tests and `tsc --noEmit`. diff --git a/src/core/services/tokenizer/parser-enrichment-stage.test.ts b/src/core/services/tokenizer/parser-enrichment-stage.test.ts index 86178a4..9b5ea59 100644 --- a/src/core/services/tokenizer/parser-enrichment-stage.test.ts +++ b/src/core/services/tokenizer/parser-enrichment-stage.test.ts @@ -48,3 +48,78 @@ test('enrichTokensWithMecabPos1 passes through unchanged when mecab tokens are n const emptyResult = enrichTokensWithMecabPos1(tokens, []); assert.strictEqual(emptyResult, tokens); }); + +test('enrichTokensWithMecabPos1 avoids repeated full scans over distant mecab surfaces', () => { + const tokens = Array.from({ length: 12 }, (_, index) => + makeToken({ surface: `w${index}`, startPos: index, endPos: index + 1, pos1: '' }), + ); + const mecabTokens = tokens.map((token) => + makeToken({ + surface: token.surface, + startPos: token.startPos, + endPos: token.endPos, + pos1: '名詞', + }), + ); + + let distantSurfaceReads = 0; + const distantToken = makeToken({ surface: '遠', startPos: 500, endPos: 501, pos1: '記号' }); + Object.defineProperty(distantToken, 'surface', { + configurable: true, + get() { + distantSurfaceReads += 1; + if (distantSurfaceReads > 3) { + throw new Error('repeated full scan detected'); + } + return '遠'; + }, + }); + mecabTokens.push(distantToken); + + const enriched = enrichTokensWithMecabPos1(tokens, mecabTokens); + assert.equal(enriched.length, tokens.length); + for (const token of enriched) { + assert.equal(token.pos1, '名詞'); + } +}); + +test('enrichTokensWithMecabPos1 avoids repeated active-candidate filter scans', () => { + const tokens = Array.from({ length: 8 }, (_, index) => + makeToken({ surface: `u${index}`, startPos: index, endPos: index + 1, pos1: '' }), + ); + const mecabTokens = [ + makeToken({ surface: 'SENTINEL', startPos: 0, endPos: 100, pos1: '記号' }), + ...tokens.map((token, index) => + makeToken({ + surface: `m${index}`, + startPos: token.startPos, + endPos: token.endPos, + pos1: '名詞', + }), + ), + ]; + + let sentinelFilterCalls = 0; + const originalFilter = Array.prototype.filter; + Array.prototype.filter = function filterWithSentinelCheck( + this: T[], + predicate: (value: T, index: number, array: T[]) => unknown, + thisArg?: unknown, + ) { + const target = this as Array<{ surface?: string }>; + if (target.some((candidate) => candidate?.surface === 'SENTINEL')) { + sentinelFilterCalls += 1; + if (sentinelFilterCalls > 2) { + throw new Error('repeated active candidate filter scan detected'); + } + } + return originalFilter.call(this, predicate, thisArg); + }; + + try { + const enriched = enrichTokensWithMecabPos1(tokens, mecabTokens); + assert.equal(enriched.length, tokens.length); + } finally { + Array.prototype.filter = originalFilter; + } +}); diff --git a/src/core/services/tokenizer/parser-enrichment-stage.ts b/src/core/services/tokenizer/parser-enrichment-stage.ts index 857d255..16ea8fd 100644 --- a/src/core/services/tokenizer/parser-enrichment-stage.ts +++ b/src/core/services/tokenizer/parser-enrichment-stage.ts @@ -6,6 +6,120 @@ type MecabPosMetadata = { pos3?: string; }; +type IndexedMecabToken = { + index: number; + start: number; + end: number; + surface: string; + pos1: string; + pos2?: string; + pos3?: string; +}; + +type MecabLookup = { + indexedTokens: IndexedMecabToken[]; + byExactSurface: Map; + byTrimmedSurface: Map; + byPosition: Map; +}; + +function pushMapValue(map: Map, key: K, value: T): void { + const existing = map.get(key); + if (existing) { + existing.push(value); + return; + } + map.set(key, [value]); +} + +function toDiscreteSpan(start: number, end: number): { start: number; end: number } { + const discreteStart = Math.floor(start); + const discreteEnd = Math.max(discreteStart + 1, Math.ceil(end)); + return { + start: discreteStart, + end: discreteEnd, + }; +} + +function buildMecabLookup(mecabTokens: MergedToken[]): MecabLookup { + const indexedTokens: IndexedMecabToken[] = []; + for (const [index, token] of mecabTokens.entries()) { + const pos1 = token.pos1; + if (!pos1) { + continue; + } + const surface = token.surface; + const start = token.startPos ?? 0; + const end = token.endPos ?? start + surface.length; + indexedTokens.push({ + index, + start, + end, + surface, + pos1, + pos2: token.pos2, + pos3: token.pos3, + }); + } + + const byExactSurface = new Map(); + const byTrimmedSurface = new Map(); + const byPosition = new Map(); + for (const token of indexedTokens) { + pushMapValue(byExactSurface, token.surface, token); + const trimmedSurface = token.surface.trim(); + if (trimmedSurface) { + pushMapValue(byTrimmedSurface, trimmedSurface, token); + } + + const discreteSpan = toDiscreteSpan(token.start, token.end); + for (let position = discreteSpan.start; position < discreteSpan.end; position += 1) { + pushMapValue(byPosition, position, token); + } + } + + const byStartThenIndexSort = (left: IndexedMecabToken, right: IndexedMecabToken) => + left.start - right.start || left.index - right.index; + for (const candidates of byExactSurface.values()) { + candidates.sort(byStartThenIndexSort); + } + + return { + indexedTokens, + byExactSurface, + byTrimmedSurface, + byPosition, + }; +} + +function lowerBoundByStart(candidates: IndexedMecabToken[], targetStart: number): number { + let low = 0; + let high = candidates.length; + while (low < high) { + const mid = Math.floor((low + high) / 2); + if (candidates[mid]!.start < targetStart) { + low = mid + 1; + } else { + high = mid; + } + } + return low; +} + +function lowerBoundByIndex(candidates: IndexedMecabToken[], targetIndex: number): number { + let low = 0; + let high = candidates.length; + while (low < high) { + const mid = Math.floor((low + high) / 2); + if (candidates[mid]!.index < targetIndex) { + low = mid + 1; + } else { + high = mid; + } + } + return low; +} + function joinUniqueTags(values: Array): string | undefined { const unique: string[] = []; for (const value of values) { @@ -29,87 +143,127 @@ function joinUniqueTags(values: Array): string | undefined { return unique.join('|'); } -function pickClosestMecabPosMetadata( +function pickClosestMecabPosMetadataBySurface( token: MergedToken, - mecabTokens: MergedToken[], + candidates: IndexedMecabToken[] | undefined, ): MecabPosMetadata | null { - if (mecabTokens.length === 0) { + if (!candidates || candidates.length === 0) { return null; } const tokenStart = token.startPos ?? 0; const tokenEnd = token.endPos ?? tokenStart + token.surface.length; - let bestSurfaceMatchToken: MergedToken | null = null; + let bestSurfaceMatchToken: IndexedMecabToken | null = null; let bestSurfaceMatchDistance = Number.MAX_SAFE_INTEGER; let bestSurfaceMatchEndDistance = Number.MAX_SAFE_INTEGER; + let bestSurfaceMatchIndex = Number.MAX_SAFE_INTEGER; - for (const mecabToken of mecabTokens) { - if (!mecabToken.pos1) { - continue; + const nearestStartIndex = lowerBoundByStart(candidates, tokenStart); + let left = nearestStartIndex - 1; + let right = nearestStartIndex; + + while (left >= 0 || right < candidates.length) { + const leftDistance = + left >= 0 ? Math.abs(candidates[left]!.start - tokenStart) : Number.MAX_SAFE_INTEGER; + const rightDistance = + right < candidates.length + ? Math.abs(candidates[right]!.start - tokenStart) + : Number.MAX_SAFE_INTEGER; + const nearestDistance = Math.min(leftDistance, rightDistance); + if (nearestDistance > bestSurfaceMatchDistance) { + break; } - if (mecabToken.surface !== token.surface) { - continue; + if (leftDistance === nearestDistance && left >= 0) { + const candidate = candidates[left]!; + const startDistance = Math.abs(candidate.start - tokenStart); + const endDistance = Math.abs(candidate.end - tokenEnd); + if ( + startDistance < bestSurfaceMatchDistance || + (startDistance === bestSurfaceMatchDistance && + (endDistance < bestSurfaceMatchEndDistance || + (endDistance === bestSurfaceMatchEndDistance && candidate.index < bestSurfaceMatchIndex))) + ) { + bestSurfaceMatchDistance = startDistance; + bestSurfaceMatchEndDistance = endDistance; + bestSurfaceMatchIndex = candidate.index; + bestSurfaceMatchToken = candidate; + } + left -= 1; } - - const mecabStart = mecabToken.startPos ?? 0; - const mecabEnd = mecabToken.endPos ?? mecabStart + mecabToken.surface.length; - const startDistance = Math.abs(mecabStart - tokenStart); - const endDistance = Math.abs(mecabEnd - tokenEnd); - - if ( - startDistance < bestSurfaceMatchDistance || - (startDistance === bestSurfaceMatchDistance && endDistance < bestSurfaceMatchEndDistance) - ) { - bestSurfaceMatchDistance = startDistance; - bestSurfaceMatchEndDistance = endDistance; - bestSurfaceMatchToken = mecabToken; + if (rightDistance === nearestDistance && right < candidates.length) { + const candidate = candidates[right]!; + const startDistance = Math.abs(candidate.start - tokenStart); + const endDistance = Math.abs(candidate.end - tokenEnd); + if ( + startDistance < bestSurfaceMatchDistance || + (startDistance === bestSurfaceMatchDistance && + (endDistance < bestSurfaceMatchEndDistance || + (endDistance === bestSurfaceMatchEndDistance && candidate.index < bestSurfaceMatchIndex))) + ) { + bestSurfaceMatchDistance = startDistance; + bestSurfaceMatchEndDistance = endDistance; + bestSurfaceMatchIndex = candidate.index; + bestSurfaceMatchToken = candidate; + } + right += 1; } } - if (bestSurfaceMatchToken) { + if (bestSurfaceMatchToken !== null) { return { - pos1: bestSurfaceMatchToken.pos1 as string, + pos1: bestSurfaceMatchToken.pos1, pos2: bestSurfaceMatchToken.pos2, pos3: bestSurfaceMatchToken.pos3, }; } - let bestToken: MergedToken | null = null; + return null; +} + +function pickClosestMecabPosMetadataByOverlap( + token: MergedToken, + candidates: IndexedMecabToken[], +): MecabPosMetadata | null { + const tokenStart = token.startPos ?? 0; + const tokenEnd = token.endPos ?? tokenStart + token.surface.length; + let bestToken: IndexedMecabToken | null = null; let bestOverlap = 0; let bestSpan = 0; let bestStartDistance = Number.MAX_SAFE_INTEGER; let bestStart = Number.MAX_SAFE_INTEGER; - const overlappingTokens: MergedToken[] = []; + let bestIndex = Number.MAX_SAFE_INTEGER; + const overlappingTokens: IndexedMecabToken[] = []; - for (const mecabToken of mecabTokens) { - if (!mecabToken.pos1) { - continue; - } - - const mecabStart = mecabToken.startPos ?? 0; - const mecabEnd = mecabToken.endPos ?? mecabStart + mecabToken.surface.length; + for (const candidate of candidates) { + const mecabStart = candidate.start; + const mecabEnd = candidate.end; const overlapStart = Math.max(tokenStart, mecabStart); const overlapEnd = Math.min(tokenEnd, mecabEnd); const overlap = Math.max(0, overlapEnd - overlapStart); if (overlap === 0) { continue; } - overlappingTokens.push(mecabToken); + overlappingTokens.push(candidate); const span = mecabEnd - mecabStart; + const startDistance = Math.abs(mecabStart - tokenStart); if ( overlap > bestOverlap || (overlap === bestOverlap && - (Math.abs(mecabStart - tokenStart) < bestStartDistance || - (Math.abs(mecabStart - tokenStart) === bestStartDistance && - (span > bestSpan || (span === bestSpan && mecabStart < bestStart))))) + (startDistance < bestStartDistance || + (startDistance === bestStartDistance && + (span > bestSpan || + (span === bestSpan && + (mecabStart < bestStart || + (mecabStart === bestStart && candidate.index < bestIndex))))))) ) { bestOverlap = overlap; bestSpan = span; - bestStartDistance = Math.abs(mecabStart - tokenStart); + bestStartDistance = startDistance; bestStart = mecabStart; - bestToken = mecabToken; + bestIndex = candidate.index; + bestToken = candidate; } } @@ -117,12 +271,15 @@ function pickClosestMecabPosMetadata( return null; } - const overlapPos1 = joinUniqueTags(overlappingTokens.map((token) => token.pos1)); - const overlapPos2 = joinUniqueTags(overlappingTokens.map((token) => token.pos2)); - const overlapPos3 = joinUniqueTags(overlappingTokens.map((token) => token.pos3)); + const overlappingTokensByMecabOrder = overlappingTokens + .slice() + .sort((left, right) => left.index - right.index); + const overlapPos1 = joinUniqueTags(overlappingTokensByMecabOrder.map((candidate) => candidate.pos1)); + const overlapPos2 = joinUniqueTags(overlappingTokensByMecabOrder.map((candidate) => candidate.pos2)); + const overlapPos3 = joinUniqueTags(overlappingTokensByMecabOrder.map((candidate) => candidate.pos3)); return { - pos1: overlapPos1 ?? (bestToken.pos1 as string), + pos1: overlapPos1 ?? bestToken.pos1, pos2: overlapPos2 ?? bestToken.pos2, pos3: overlapPos3 ?? bestToken.pos3, }; @@ -130,13 +287,9 @@ function pickClosestMecabPosMetadata( function fillMissingPos1BySurfaceSequence( tokens: MergedToken[], - mecabTokens: MergedToken[], + byTrimmedSurface: Map, ): MergedToken[] { - const indexedMecabTokens = mecabTokens - .map((token, index) => ({ token, index })) - .filter(({ token }) => token.pos1 && token.surface.trim().length > 0); - - if (indexedMecabTokens.length === 0) { + if (byTrimmedSurface.size === 0) { return tokens; } @@ -151,27 +304,13 @@ function fillMissingPos1BySurfaceSequence( return token; } - let best: { token: MergedToken; index: number } | null = null; - for (const candidate of indexedMecabTokens) { - if (candidate.token.surface !== surface) { - continue; - } - if (candidate.index < cursor) { - continue; - } - best = { token: candidate.token, index: candidate.index }; - break; + const candidates = byTrimmedSurface.get(surface); + if (!candidates || candidates.length === 0) { + return token; } - if (!best) { - for (const candidate of indexedMecabTokens) { - if (candidate.token.surface !== surface) { - continue; - } - best = { token: candidate.token, index: candidate.index }; - break; - } - } + const atOrAfterCursorIndex = lowerBoundByIndex(candidates, cursor); + const best = candidates[atOrAfterCursorIndex] ?? candidates[0]; if (!best) { return token; @@ -180,13 +319,41 @@ function fillMissingPos1BySurfaceSequence( cursor = best.index + 1; return { ...token, - pos1: best.token.pos1, - pos2: best.token.pos2, - pos3: best.token.pos3, + pos1: best.pos1, + pos2: best.pos2, + pos3: best.pos3, }; }); } +function collectOverlapCandidatesByPosition( + token: MergedToken, + byPosition: Map, +): IndexedMecabToken[] { + const tokenStart = token.startPos ?? 0; + const tokenEnd = token.endPos ?? tokenStart + token.surface.length; + const discreteSpan = toDiscreteSpan(tokenStart, tokenEnd); + const seen = new Set(); + const overlapCandidates: IndexedMecabToken[] = []; + + for (let position = discreteSpan.start; position < discreteSpan.end; position += 1) { + const candidatesAtPosition = byPosition.get(position); + if (!candidatesAtPosition) { + continue; + } + + for (const candidate of candidatesAtPosition) { + if (seen.has(candidate.index)) { + continue; + } + seen.add(candidate.index); + overlapCandidates.push(candidate); + } + } + + return overlapCandidates; +} + export function enrichTokensWithMecabPos1( tokens: MergedToken[], mecabTokens: MergedToken[] | null, @@ -199,12 +366,36 @@ export function enrichTokensWithMecabPos1( return tokens; } - const overlapEnriched = tokens.map((token) => { + const lookup = buildMecabLookup(mecabTokens); + if (lookup.indexedTokens.length === 0) { + return tokens; + } + + const metadataByTokenIndex = new Map(); + + for (const [index, token] of tokens.entries()) { if (token.pos1) { - return token; + continue; } - const metadata = pickClosestMecabPosMetadata(token, mecabTokens); + const surfaceMetadata = pickClosestMecabPosMetadataBySurface( + token, + lookup.byExactSurface.get(token.surface), + ); + if (surfaceMetadata) { + metadataByTokenIndex.set(index, surfaceMetadata); + continue; + } + + const overlapCandidates = collectOverlapCandidatesByPosition(token, lookup.byPosition); + const overlapMetadata = pickClosestMecabPosMetadataByOverlap(token, overlapCandidates); + if (overlapMetadata) { + metadataByTokenIndex.set(index, overlapMetadata); + } + } + + const overlapEnriched = tokens.map((token, index) => { + const metadata = metadataByTokenIndex.get(index); if (!metadata) { return token; } @@ -217,5 +408,5 @@ export function enrichTokensWithMecabPos1( }; }); - return fillMissingPos1BySurfaceSequence(overlapEnriched, mecabTokens); + return fillMissingPos1BySurfaceSequence(overlapEnriched, lookup.byTrimmedSurface); }