perf(tokenizer): optimize mecab POS enrichment lookups

This commit is contained in:
2026-03-02 01:39:44 -08:00
parent 83f13df627
commit fa97472bce
3 changed files with 344 additions and 76 deletions

View File

@@ -4,7 +4,7 @@ title: 'Tokenization performance: disable Yomitan MeCab parser, gate local MeCab
status: Done status: Done
assignee: [] assignee: []
created_date: '2026-03-02 07:44' created_date: '2026-03-02 07:44'
updated_date: '2026-03-02 20:34' updated_date: '2026-03-02 20:37'
labels: [] labels: []
dependencies: [] dependencies: []
priority: high priority: high
@@ -47,6 +47,8 @@ Implemented tokenizer latency optimizations:
- added regression coverage in `src/main/runtime/composers/mpv-runtime-composer.test.ts` for sequential tokenize calls (`warmup` side effects run once); - added regression coverage in `src/main/runtime/composers/mpv-runtime-composer.test.ts` for sequential tokenize calls (`warmup` side effects run once);
- post-review critical fix: treat Yomitan default-profile Anki server sync `no-change` as successful check, so `lastSyncedYomitanAnkiServer` is cached and expensive sync checks do not repeat on every subtitle line; - post-review critical fix: treat Yomitan default-profile Anki server sync `no-change` as successful check, so `lastSyncedYomitanAnkiServer` is cached and expensive sync checks do not repeat on every subtitle line;
- added regression assertion in `src/core/services/tokenizer/yomitan-parser-runtime.test.ts` for `updated: false` path returning sync success; - added regression assertion in `src/core/services/tokenizer/yomitan-parser-runtime.test.ts` for `updated: false` path returning sync success;
- post-review performance fix: refactored POS enrichment to pre-index MeCab tokens by surface/start position and use sliding overlap window + binary-search cursor fallback, removing repeated full MeCab scans per token (`O(n*m)` hotspot);
- added regression test in `src/core/services/tokenizer/parser-enrichment-stage.test.ts` that fails on repeated distant-token scan access and passes with indexed lookup;
- validated with targeted tests and `tsc --noEmit`. - validated with targeted tests and `tsc --noEmit`.
<!-- SECTION:FINAL_SUMMARY:END --> <!-- SECTION:FINAL_SUMMARY:END -->

View File

@@ -48,3 +48,78 @@ test('enrichTokensWithMecabPos1 passes through unchanged when mecab tokens are n
const emptyResult = enrichTokensWithMecabPos1(tokens, []); const emptyResult = enrichTokensWithMecabPos1(tokens, []);
assert.strictEqual(emptyResult, tokens); assert.strictEqual(emptyResult, tokens);
}); });
// Regression guard: a MeCab token far away from every input token must not have its
// surface re-read once per input token.
test('enrichTokensWithMecabPos1 avoids repeated full scans over distant mecab surfaces', () => {
  // Twelve adjacent one-character tokens that still lack pos1.
  const tokens: ReturnType<typeof makeToken>[] = [];
  for (let position = 0; position < 12; position += 1) {
    tokens.push(
      makeToken({ surface: `w${position}`, startPos: position, endPos: position + 1, pos1: '' }),
    );
  }

  // One MeCab token per input token, same surface and span, tagged as a noun.
  const mecabTokens = tokens.map(({ surface, startPos, endPos }) =>
    makeToken({ surface, startPos, endPos, pos1: '名詞' }),
  );

  // The distant token counts every read of its surface and aborts after the third one.
  const distantToken = makeToken({ surface: '遠', startPos: 500, endPos: 501, pos1: '記号' });
  let distantSurfaceReads = 0;
  const countingSurfaceGetter = (): string => {
    distantSurfaceReads += 1;
    if (distantSurfaceReads > 3) {
      throw new Error('repeated full scan detected');
    }
    return '遠';
  };
  Object.defineProperty(distantToken, 'surface', {
    configurable: true,
    get: countingSurfaceGetter,
  });
  mecabTokens.push(distantToken);

  const enriched = enrichTokensWithMecabPos1(tokens, mecabTokens);

  assert.equal(enriched.length, tokens.length);
  enriched.forEach((token) => {
    assert.equal(token.pos1, '名詞');
  });
});
// Regression guard: enrichment must not call Array.prototype.filter repeatedly on an
// array that contains the long-lived SENTINEL candidate.
test('enrichTokensWithMecabPos1 avoids repeated active-candidate filter scans', () => {
// Eight adjacent one-character tokens without pos1.
const tokens = Array.from({ length: 8 }, (_, index) =>
makeToken({ surface: `u${index}`, startPos: index, endPos: index + 1, pos1: '' }),
);
// SENTINEL spans 0..100, so it overlaps every input token. The remaining MeCab tokens
// share the input spans but use `m…` surfaces, so no MeCab surface equals an input surface.
const mecabTokens = [
makeToken({ surface: 'SENTINEL', startPos: 0, endPos: 100, pos1: '記号' }),
...tokens.map((token, index) =>
makeToken({
surface: `m${index}`,
startPos: token.startPos,
endPos: token.endPos,
pos1: '名詞',
}),
),
];
let sentinelFilterCalls = 0;
// Keep the real implementation so it can be delegated to and restored afterwards.
const originalFilter = Array.prototype.filter;
// Temporary global patch: count filter calls whose receiver contains the SENTINEL token.
Array.prototype.filter = function filterWithSentinelCheck<T>(
this: T[],
predicate: (value: T, index: number, array: T[]) => unknown,
thisArg?: unknown,
) {
const target = this as Array<{ surface?: string }>;
if (target.some((candidate) => candidate?.surface === 'SENTINEL')) {
sentinelFilterCalls += 1;
// Two such calls are tolerated; the third fails the test.
if (sentinelFilterCalls > 2) {
throw new Error('repeated active candidate filter scan detected');
}
}
return originalFilter.call(this, predicate, thisArg);
};
try {
const enriched = enrichTokensWithMecabPos1(tokens, mecabTokens);
assert.equal(enriched.length, tokens.length);
} finally {
// Always undo the prototype patch, even when the call above throws.
Array.prototype.filter = originalFilter;
}
});

View File

@@ -6,6 +6,120 @@ type MecabPosMetadata = {
pos3?: string; pos3?: string;
}; };
/**
 * Flattened copy of one MeCab token that carries part-of-speech data, as stored in
 * the lookup indexes built by `buildMecabLookup`.
 */
type IndexedMecabToken = {
// Position of the token in the MeCab token array it was read from.
index: number;
// Start offset: the token's `startPos`, or 0 when that is missing.
start: number;
// End offset: the token's `endPos`, or `start + surface.length` when that is missing.
end: number;
// Surface text exactly as MeCab reported it (not trimmed).
surface: string;
// Primary part-of-speech tag; tokens with an empty pos1 are never indexed.
pos1: string;
pos2?: string;
pos3?: string;
};
/**
 * Pre-built indexes over the MeCab tokens of one enrichment call, produced by
 * `buildMecabLookup`.
 */
type MecabLookup = {
// Every MeCab token that has a pos1, in input order.
indexedTokens: IndexedMecabToken[];
// Keyed by the untrimmed surface; each bucket is sorted by start, then by index.
byExactSurface: Map<string, IndexedMecabToken[]>;
// Keyed by the trimmed surface; whitespace-only surfaces are left out. Buckets keep input order.
byTrimmedSurface: Map<string, IndexedMecabToken[]>;
// Keyed by each integer position inside a token's discrete span.
byPosition: Map<number, IndexedMecabToken[]>;
};
/**
 * Appends `value` to the bucket stored under `key`, creating the bucket on first use.
 *
 * @param map - Multimap to update; it is mutated in place.
 * @param key - Key of the bucket to append to.
 * @param value - Item added at the end of the bucket.
 */
function pushMapValue<K, T>(map: Map<K, T[]>, key: K, value: T): void {
  const bucket = map.get(key);
  if (!bucket) {
    map.set(key, [value]);
  } else {
    bucket.push(value);
  }
}
/**
 * Widens a possibly fractional `[start, end)` range to whole positions.
 *
 * The start is rounded down and the end is rounded up; the result always covers at
 * least one position, even when `end` is not greater than `start`.
 *
 * @param start - Range start.
 * @param end - Range end (exclusive).
 * @returns The integer start and exclusive integer end.
 */
function toDiscreteSpan(start: number, end: number): { start: number; end: number } {
  const flooredStart = Math.floor(start);
  return {
    start: flooredStart,
    end: Math.max(Math.ceil(end), flooredStart + 1),
  };
}
/**
 * Builds the lookup indexes used to enrich tokens with MeCab part-of-speech data.
 *
 * Tokens without a pos1 are ignored. Every remaining token is registered under its
 * exact surface, under its trimmed surface (unless that is empty) and under each
 * integer position of its discrete span.
 *
 * @param mecabTokens - MeCab tokens in their original order.
 * @returns The indexed tokens together with the three lookup maps.
 */
function buildMecabLookup(mecabTokens: MergedToken[]): MecabLookup {
  const indexedTokens: IndexedMecabToken[] = [];
  const byExactSurface = new Map<string, IndexedMecabToken[]>();
  const byTrimmedSurface = new Map<string, IndexedMecabToken[]>();
  const byPosition = new Map<number, IndexedMecabToken[]>();

  for (let index = 0; index < mecabTokens.length; index += 1) {
    const source = mecabTokens[index]!;
    const pos1 = source.pos1;
    if (!pos1) {
      continue;
    }

    // Each source property is read exactly once; later lookups use the copy.
    const surface = source.surface;
    const start = source.startPos ?? 0;
    const end = source.endPos ?? start + surface.length;
    const entry: IndexedMecabToken = {
      index,
      start,
      end,
      surface,
      pos1,
      pos2: source.pos2,
      pos3: source.pos3,
    };
    indexedTokens.push(entry);

    pushMapValue(byExactSurface, surface, entry);

    const trimmed = surface.trim();
    if (trimmed) {
      pushMapValue(byTrimmedSurface, trimmed, entry);
    }

    const span = toDiscreteSpan(start, end);
    for (let position = span.start; position < span.end; position += 1) {
      pushMapValue(byPosition, position, entry);
    }
  }

  // Exact-surface buckets are binary-searched by start, so order them by start and
  // break ties with the original MeCab index.
  byExactSurface.forEach((bucket) => {
    bucket.sort((left, right) => {
      const startDelta = left.start - right.start;
      return startDelta || left.index - right.index;
    });
  });

  return { indexedTokens, byExactSurface, byTrimmedSurface, byPosition };
}
/**
 * Binary search for the first candidate whose `start` is not below `targetStart`.
 *
 * @param candidates - Candidates ordered by ascending `start`.
 * @param targetStart - Start offset to search for.
 * @returns Index of the first matching candidate, or `candidates.length` when none matches.
 */
function lowerBoundByStart(candidates: IndexedMecabToken[], targetStart: number): number {
  let lo = 0;
  let hi = candidates.length;
  while (lo < hi) {
    const probe = lo + Math.floor((hi - lo) / 2);
    if (candidates[probe]!.start < targetStart) {
      lo = probe + 1;
      continue;
    }
    hi = probe;
  }
  return lo;
}
/**
 * Binary search for the first candidate whose `index` is not below `targetIndex`.
 *
 * @param candidates - Candidates ordered by ascending `index`.
 * @param targetIndex - MeCab token index to search for.
 * @returns Index of the first matching candidate, or `candidates.length` when none matches.
 */
function lowerBoundByIndex(candidates: IndexedMecabToken[], targetIndex: number): number {
  let lo = 0;
  let hi = candidates.length;
  while (lo < hi) {
    const probe = lo + Math.floor((hi - lo) / 2);
    if (candidates[probe]!.index < targetIndex) {
      lo = probe + 1;
      continue;
    }
    hi = probe;
  }
  return lo;
}
function joinUniqueTags(values: Array<string | undefined>): string | undefined { function joinUniqueTags(values: Array<string | undefined>): string | undefined {
const unique: string[] = []; const unique: string[] = [];
for (const value of values) { for (const value of values) {
@@ -29,87 +143,127 @@ function joinUniqueTags(values: Array<string | undefined>): string | undefined {
return unique.join('|'); return unique.join('|');
} }
function pickClosestMecabPosMetadata( function pickClosestMecabPosMetadataBySurface(
token: MergedToken, token: MergedToken,
mecabTokens: MergedToken[], candidates: IndexedMecabToken[] | undefined,
): MecabPosMetadata | null { ): MecabPosMetadata | null {
if (mecabTokens.length === 0) { if (!candidates || candidates.length === 0) {
return null; return null;
} }
const tokenStart = token.startPos ?? 0; const tokenStart = token.startPos ?? 0;
const tokenEnd = token.endPos ?? tokenStart + token.surface.length; const tokenEnd = token.endPos ?? tokenStart + token.surface.length;
let bestSurfaceMatchToken: MergedToken | null = null; let bestSurfaceMatchToken: IndexedMecabToken | null = null;
let bestSurfaceMatchDistance = Number.MAX_SAFE_INTEGER; let bestSurfaceMatchDistance = Number.MAX_SAFE_INTEGER;
let bestSurfaceMatchEndDistance = Number.MAX_SAFE_INTEGER; let bestSurfaceMatchEndDistance = Number.MAX_SAFE_INTEGER;
let bestSurfaceMatchIndex = Number.MAX_SAFE_INTEGER;
for (const mecabToken of mecabTokens) { const nearestStartIndex = lowerBoundByStart(candidates, tokenStart);
if (!mecabToken.pos1) { let left = nearestStartIndex - 1;
continue; let right = nearestStartIndex;
while (left >= 0 || right < candidates.length) {
const leftDistance =
left >= 0 ? Math.abs(candidates[left]!.start - tokenStart) : Number.MAX_SAFE_INTEGER;
const rightDistance =
right < candidates.length
? Math.abs(candidates[right]!.start - tokenStart)
: Number.MAX_SAFE_INTEGER;
const nearestDistance = Math.min(leftDistance, rightDistance);
if (nearestDistance > bestSurfaceMatchDistance) {
break;
} }
if (mecabToken.surface !== token.surface) { if (leftDistance === nearestDistance && left >= 0) {
continue; const candidate = candidates[left]!;
const startDistance = Math.abs(candidate.start - tokenStart);
const endDistance = Math.abs(candidate.end - tokenEnd);
if (
startDistance < bestSurfaceMatchDistance ||
(startDistance === bestSurfaceMatchDistance &&
(endDistance < bestSurfaceMatchEndDistance ||
(endDistance === bestSurfaceMatchEndDistance && candidate.index < bestSurfaceMatchIndex)))
) {
bestSurfaceMatchDistance = startDistance;
bestSurfaceMatchEndDistance = endDistance;
bestSurfaceMatchIndex = candidate.index;
bestSurfaceMatchToken = candidate;
}
left -= 1;
} }
if (rightDistance === nearestDistance && right < candidates.length) {
const mecabStart = mecabToken.startPos ?? 0; const candidate = candidates[right]!;
const mecabEnd = mecabToken.endPos ?? mecabStart + mecabToken.surface.length; const startDistance = Math.abs(candidate.start - tokenStart);
const startDistance = Math.abs(mecabStart - tokenStart); const endDistance = Math.abs(candidate.end - tokenEnd);
const endDistance = Math.abs(mecabEnd - tokenEnd); if (
startDistance < bestSurfaceMatchDistance ||
if ( (startDistance === bestSurfaceMatchDistance &&
startDistance < bestSurfaceMatchDistance || (endDistance < bestSurfaceMatchEndDistance ||
(startDistance === bestSurfaceMatchDistance && endDistance < bestSurfaceMatchEndDistance) (endDistance === bestSurfaceMatchEndDistance && candidate.index < bestSurfaceMatchIndex)))
) { ) {
bestSurfaceMatchDistance = startDistance; bestSurfaceMatchDistance = startDistance;
bestSurfaceMatchEndDistance = endDistance; bestSurfaceMatchEndDistance = endDistance;
bestSurfaceMatchToken = mecabToken; bestSurfaceMatchIndex = candidate.index;
bestSurfaceMatchToken = candidate;
}
right += 1;
} }
} }
if (bestSurfaceMatchToken) { if (bestSurfaceMatchToken !== null) {
return { return {
pos1: bestSurfaceMatchToken.pos1 as string, pos1: bestSurfaceMatchToken.pos1,
pos2: bestSurfaceMatchToken.pos2, pos2: bestSurfaceMatchToken.pos2,
pos3: bestSurfaceMatchToken.pos3, pos3: bestSurfaceMatchToken.pos3,
}; };
} }
let bestToken: MergedToken | null = null; return null;
}
function pickClosestMecabPosMetadataByOverlap(
token: MergedToken,
candidates: IndexedMecabToken[],
): MecabPosMetadata | null {
const tokenStart = token.startPos ?? 0;
const tokenEnd = token.endPos ?? tokenStart + token.surface.length;
let bestToken: IndexedMecabToken | null = null;
let bestOverlap = 0; let bestOverlap = 0;
let bestSpan = 0; let bestSpan = 0;
let bestStartDistance = Number.MAX_SAFE_INTEGER; let bestStartDistance = Number.MAX_SAFE_INTEGER;
let bestStart = Number.MAX_SAFE_INTEGER; let bestStart = Number.MAX_SAFE_INTEGER;
const overlappingTokens: MergedToken[] = []; let bestIndex = Number.MAX_SAFE_INTEGER;
const overlappingTokens: IndexedMecabToken[] = [];
for (const mecabToken of mecabTokens) { for (const candidate of candidates) {
if (!mecabToken.pos1) { const mecabStart = candidate.start;
continue; const mecabEnd = candidate.end;
}
const mecabStart = mecabToken.startPos ?? 0;
const mecabEnd = mecabToken.endPos ?? mecabStart + mecabToken.surface.length;
const overlapStart = Math.max(tokenStart, mecabStart); const overlapStart = Math.max(tokenStart, mecabStart);
const overlapEnd = Math.min(tokenEnd, mecabEnd); const overlapEnd = Math.min(tokenEnd, mecabEnd);
const overlap = Math.max(0, overlapEnd - overlapStart); const overlap = Math.max(0, overlapEnd - overlapStart);
if (overlap === 0) { if (overlap === 0) {
continue; continue;
} }
overlappingTokens.push(mecabToken); overlappingTokens.push(candidate);
const span = mecabEnd - mecabStart; const span = mecabEnd - mecabStart;
const startDistance = Math.abs(mecabStart - tokenStart);
if ( if (
overlap > bestOverlap || overlap > bestOverlap ||
(overlap === bestOverlap && (overlap === bestOverlap &&
(Math.abs(mecabStart - tokenStart) < bestStartDistance || (startDistance < bestStartDistance ||
(Math.abs(mecabStart - tokenStart) === bestStartDistance && (startDistance === bestStartDistance &&
(span > bestSpan || (span === bestSpan && mecabStart < bestStart))))) (span > bestSpan ||
(span === bestSpan &&
(mecabStart < bestStart ||
(mecabStart === bestStart && candidate.index < bestIndex)))))))
) { ) {
bestOverlap = overlap; bestOverlap = overlap;
bestSpan = span; bestSpan = span;
bestStartDistance = Math.abs(mecabStart - tokenStart); bestStartDistance = startDistance;
bestStart = mecabStart; bestStart = mecabStart;
bestToken = mecabToken; bestIndex = candidate.index;
bestToken = candidate;
} }
} }
@@ -117,12 +271,15 @@ function pickClosestMecabPosMetadata(
return null; return null;
} }
const overlapPos1 = joinUniqueTags(overlappingTokens.map((token) => token.pos1)); const overlappingTokensByMecabOrder = overlappingTokens
const overlapPos2 = joinUniqueTags(overlappingTokens.map((token) => token.pos2)); .slice()
const overlapPos3 = joinUniqueTags(overlappingTokens.map((token) => token.pos3)); .sort((left, right) => left.index - right.index);
const overlapPos1 = joinUniqueTags(overlappingTokensByMecabOrder.map((candidate) => candidate.pos1));
const overlapPos2 = joinUniqueTags(overlappingTokensByMecabOrder.map((candidate) => candidate.pos2));
const overlapPos3 = joinUniqueTags(overlappingTokensByMecabOrder.map((candidate) => candidate.pos3));
return { return {
pos1: overlapPos1 ?? (bestToken.pos1 as string), pos1: overlapPos1 ?? bestToken.pos1,
pos2: overlapPos2 ?? bestToken.pos2, pos2: overlapPos2 ?? bestToken.pos2,
pos3: overlapPos3 ?? bestToken.pos3, pos3: overlapPos3 ?? bestToken.pos3,
}; };
@@ -130,13 +287,9 @@ function pickClosestMecabPosMetadata(
function fillMissingPos1BySurfaceSequence( function fillMissingPos1BySurfaceSequence(
tokens: MergedToken[], tokens: MergedToken[],
mecabTokens: MergedToken[], byTrimmedSurface: Map<string, IndexedMecabToken[]>,
): MergedToken[] { ): MergedToken[] {
const indexedMecabTokens = mecabTokens if (byTrimmedSurface.size === 0) {
.map((token, index) => ({ token, index }))
.filter(({ token }) => token.pos1 && token.surface.trim().length > 0);
if (indexedMecabTokens.length === 0) {
return tokens; return tokens;
} }
@@ -151,27 +304,13 @@ function fillMissingPos1BySurfaceSequence(
return token; return token;
} }
let best: { token: MergedToken; index: number } | null = null; const candidates = byTrimmedSurface.get(surface);
for (const candidate of indexedMecabTokens) { if (!candidates || candidates.length === 0) {
if (candidate.token.surface !== surface) { return token;
continue;
}
if (candidate.index < cursor) {
continue;
}
best = { token: candidate.token, index: candidate.index };
break;
} }
if (!best) { const atOrAfterCursorIndex = lowerBoundByIndex(candidates, cursor);
for (const candidate of indexedMecabTokens) { const best = candidates[atOrAfterCursorIndex] ?? candidates[0];
if (candidate.token.surface !== surface) {
continue;
}
best = { token: candidate.token, index: candidate.index };
break;
}
}
if (!best) { if (!best) {
return token; return token;
@@ -180,13 +319,41 @@ function fillMissingPos1BySurfaceSequence(
cursor = best.index + 1; cursor = best.index + 1;
return { return {
...token, ...token,
pos1: best.token.pos1, pos1: best.pos1,
pos2: best.token.pos2, pos2: best.pos2,
pos3: best.token.pos3, pos3: best.pos3,
}; };
}); });
} }
/**
 * Gathers the MeCab candidates registered at any integer position covered by `token`.
 *
 * Each candidate appears once, in the order it is first met while walking the
 * positions from the token's start to its end.
 *
 * @param token - Token whose span is looked up.
 * @param byPosition - Position index mapping an integer position to its candidates.
 * @returns The de-duplicated candidates; empty when no position has an entry.
 */
function collectOverlapCandidatesByPosition(
  token: MergedToken,
  byPosition: Map<number, IndexedMecabToken[]>,
): IndexedMecabToken[] {
  const rawStart = token.startPos ?? 0;
  const rawEnd = token.endPos ?? rawStart + token.surface.length;
  const { start: firstPosition, end: stopPosition } = toDiscreteSpan(rawStart, rawEnd);

  const visitedIndexes = new Set<number>();
  const collected: IndexedMecabToken[] = [];

  let position = firstPosition;
  while (position < stopPosition) {
    const bucket = byPosition.get(position);
    position += 1;
    if (!bucket) {
      continue;
    }

    for (const candidate of bucket) {
      // A candidate spanning several positions is listed in each of their buckets.
      if (!visitedIndexes.has(candidate.index)) {
        visitedIndexes.add(candidate.index);
        collected.push(candidate);
      }
    }
  }

  return collected;
}
export function enrichTokensWithMecabPos1( export function enrichTokensWithMecabPos1(
tokens: MergedToken[], tokens: MergedToken[],
mecabTokens: MergedToken[] | null, mecabTokens: MergedToken[] | null,
@@ -199,12 +366,36 @@ export function enrichTokensWithMecabPos1(
return tokens; return tokens;
} }
const overlapEnriched = tokens.map((token) => { const lookup = buildMecabLookup(mecabTokens);
if (lookup.indexedTokens.length === 0) {
return tokens;
}
const metadataByTokenIndex = new Map<number, MecabPosMetadata>();
for (const [index, token] of tokens.entries()) {
if (token.pos1) { if (token.pos1) {
return token; continue;
} }
const metadata = pickClosestMecabPosMetadata(token, mecabTokens); const surfaceMetadata = pickClosestMecabPosMetadataBySurface(
token,
lookup.byExactSurface.get(token.surface),
);
if (surfaceMetadata) {
metadataByTokenIndex.set(index, surfaceMetadata);
continue;
}
const overlapCandidates = collectOverlapCandidatesByPosition(token, lookup.byPosition);
const overlapMetadata = pickClosestMecabPosMetadataByOverlap(token, overlapCandidates);
if (overlapMetadata) {
metadataByTokenIndex.set(index, overlapMetadata);
}
}
const overlapEnriched = tokens.map((token, index) => {
const metadata = metadataByTokenIndex.get(index);
if (!metadata) { if (!metadata) {
return token; return token;
} }
@@ -217,5 +408,5 @@ export function enrichTokensWithMecabPos1(
}; };
}); });
return fillMissingPos1BySurfaceSequence(overlapEnriched, mecabTokens); return fillMissingPos1BySurfaceSequence(overlapEnriched, lookup.byTrimmedSurface);
} }