mirror of
https://github.com/ksyasuda/SubMiner.git
synced 2026-03-02 06:22:42 -08:00
perf(tokenizer): optimize mecab POS enrichment lookups
This commit is contained in:
@@ -4,7 +4,7 @@ title: 'Tokenization performance: disable Yomitan MeCab parser, gate local MeCab
|
||||
status: Done
|
||||
assignee: []
|
||||
created_date: '2026-03-02 07:44'
|
||||
updated_date: '2026-03-02 20:34'
|
||||
updated_date: '2026-03-02 20:37'
|
||||
labels: []
|
||||
dependencies: []
|
||||
priority: high
|
||||
@@ -47,6 +47,8 @@ Implemented tokenizer latency optimizations:
|
||||
- added regression coverage in `src/main/runtime/composers/mpv-runtime-composer.test.ts` for sequential tokenize calls (`warmup` side effects run once);
|
||||
- post-review critical fix: treat Yomitan default-profile Anki server sync `no-change` as successful check, so `lastSyncedYomitanAnkiServer` is cached and expensive sync checks do not repeat on every subtitle line;
|
||||
- added regression assertion in `src/core/services/tokenizer/yomitan-parser-runtime.test.ts` for `updated: false` path returning sync success;
|
||||
- post-review performance fix: refactored POS enrichment to pre-index MeCab tokens by surface/start position and use sliding overlap window + binary-search cursor fallback, removing repeated full MeCab scans per token (`O(n*m)` hotspot);
|
||||
- added regression test in `src/core/services/tokenizer/parser-enrichment-stage.test.ts` that fails on repeated distant-token scan access and passes with indexed lookup;
|
||||
- validated with targeted tests and `tsc --noEmit`.
|
||||
|
||||
<!-- SECTION:FINAL_SUMMARY:END -->
|
||||
|
||||
@@ -48,3 +48,78 @@ test('enrichTokensWithMecabPos1 passes through unchanged when mecab tokens are n
|
||||
const emptyResult = enrichTokensWithMecabPos1(tokens, []);
|
||||
assert.strictEqual(emptyResult, tokens);
|
||||
});
|
||||
|
||||
// Regression test: enrichment must not re-read every MeCab token's surface for
// each input token. A far-away MeCab token is instrumented so that reading its
// `surface` more than three times throws.
test('enrichTokensWithMecabPos1 avoids repeated full scans over distant mecab surfaces', () => {
  // Twelve single-character-wide tokens with empty pos1, so each needs enrichment.
  const tokens = Array.from({ length: 12 }, (_, index) =>
    makeToken({ surface: `w${index}`, startPos: index, endPos: index + 1, pos1: '' }),
  );
  // One MeCab token per input token with the same surface and span, tagged '名詞'.
  const mecabTokens = tokens.map((token) =>
    makeToken({
      surface: token.surface,
      startPos: token.startPos,
      endPos: token.endPos,
      pos1: '名詞',
    }),
  );

  let distantSurfaceReads = 0;
  const distantToken = makeToken({ surface: '遠', startPos: 500, endPos: 501, pos1: '記号' });
  // Replace the plain `surface` property with a counting getter; the fourth
  // read fails the test.
  Object.defineProperty(distantToken, 'surface', {
    configurable: true,
    get() {
      distantSurfaceReads += 1;
      if (distantSurfaceReads > 3) {
        throw new Error('repeated full scan detected');
      }
      return '遠';
    },
  });
  mecabTokens.push(distantToken);

  const enriched = enrichTokensWithMecabPos1(tokens, mecabTokens);
  assert.equal(enriched.length, tokens.length);
  // Every input token should have picked up pos1 from its same-surface MeCab token.
  for (const token of enriched) {
    assert.equal(token.pos1, '名詞');
  }
});
|
||||
|
||||
// Regression test: enrichment must not call `Array.prototype.filter` repeatedly
// on an array that contains the long-spanning SENTINEL MeCab token.
test('enrichTokensWithMecabPos1 avoids repeated active-candidate filter scans', () => {
  const tokens = Array.from({ length: 8 }, (_, index) =>
    makeToken({ surface: `u${index}`, startPos: index, endPos: index + 1, pos1: '' }),
  );
  // SENTINEL spans positions 0..100 and therefore overlaps every input token.
  // The remaining MeCab tokens share spans with the inputs but use different
  // surfaces (`m*` vs `u*`), so no exact-surface match is available.
  const mecabTokens = [
    makeToken({ surface: 'SENTINEL', startPos: 0, endPos: 100, pos1: '記号' }),
    ...tokens.map((token, index) =>
      makeToken({
        surface: `m${index}`,
        startPos: token.startPos,
        endPos: token.endPos,
        pos1: '名詞',
      }),
    ),
  ];

  let sentinelFilterCalls = 0;
  const originalFilter = Array.prototype.filter;
  // Temporarily wrap the global `filter`: each call whose receiver contains the
  // SENTINEL token is counted, and the third such call throws.
  Array.prototype.filter = function filterWithSentinelCheck<T>(
    this: T[],
    predicate: (value: T, index: number, array: T[]) => unknown,
    thisArg?: unknown,
  ) {
    const target = this as Array<{ surface?: string }>;
    if (target.some((candidate) => candidate?.surface === 'SENTINEL')) {
      sentinelFilterCalls += 1;
      if (sentinelFilterCalls > 2) {
        throw new Error('repeated active candidate filter scan detected');
      }
    }
    return originalFilter.call(this, predicate, thisArg);
  };

  try {
    const enriched = enrichTokensWithMecabPos1(tokens, mecabTokens);
    assert.equal(enriched.length, tokens.length);
  } finally {
    // Always restore the real implementation so other tests are unaffected.
    Array.prototype.filter = originalFilter;
  }
});
|
||||
|
||||
@@ -6,6 +6,120 @@ type MecabPosMetadata = {
|
||||
pos3?: string;
|
||||
};
|
||||
|
||||
/**
 * A MeCab token that carries POS data, flattened into the shape used by the
 * lookup tables in this module.
 */
type IndexedMecabToken = {
  /** Position of the token in the original MeCab token array. */
  index: number;
  /** Start offset (the source token's `startPos`, or 0 when absent). */
  start: number;
  /** End offset (the source token's `endPos`, or `start + surface.length` when absent). */
  end: number;
  /** Surface text exactly as reported on the source token (not trimmed). */
  surface: string;
  /** Primary POS tag; always non-empty because tokens without it are not indexed. */
  pos1: string;
  /** Secondary POS tag, if the source token has one. */
  pos2?: string;
  /** Tertiary POS tag, if the source token has one. */
  pos3?: string;
};
|
||||
|
||||
/** Pre-built indexes over the MeCab tokens that carry POS data. */
type MecabLookup = {
  /** All indexed tokens, in original MeCab order. */
  indexedTokens: IndexedMecabToken[];
  /** Tokens grouped by exact surface text. */
  byExactSurface: Map<string, IndexedMecabToken[]>;
  /** Tokens grouped by whitespace-trimmed surface text. */
  byTrimmedSurface: Map<string, IndexedMecabToken[]>;
  /** Tokens grouped by each integer position their span covers. */
  byPosition: Map<number, IndexedMecabToken[]>;
};
|
||||
|
||||
/**
 * Appends `value` to the array stored under `key`, creating the array when the
 * key has no entry yet.
 *
 * @param map - Multi-map being built; mutated in place.
 * @param key - Bucket to append to.
 * @param value - Item to append.
 */
function pushMapValue<K, T>(map: Map<K, T[]>, key: K, value: T): void {
  const existing = map.get(key);
  if (existing) {
    existing.push(value);
    return;
  }
  map.set(key, [value]);
}
|
||||
|
||||
/**
 * Converts a (possibly fractional or empty) `[start, end)` range into an
 * integer range that covers at least one position.
 *
 * The start is rounded down, the end is rounded up, and the end is forced to
 * be at least `start + 1` so zero-width or inverted ranges still occupy one
 * position.
 *
 * @param start - Range start offset.
 * @param end - Range end offset (exclusive).
 * @returns Integer `start` and exclusive integer `end`.
 */
function toDiscreteSpan(start: number, end: number): { start: number; end: number } {
  const discreteStart = Math.floor(start);
  const discreteEnd = Math.max(discreteStart + 1, Math.ceil(end));
  return {
    start: discreteStart,
    end: discreteEnd,
  };
}
|
||||
|
||||
/**
 * Builds the lookup tables used for POS enrichment from the raw MeCab tokens.
 *
 * Tokens without a `pos1` value are left out of every table. For the rest:
 * - `start` falls back to 0 and `end` to `start + surface.length` when the
 *   source token has no `startPos` / `endPos`;
 * - `byExactSurface` buckets are sorted by `start`, then by original index;
 * - `byTrimmedSurface` buckets keep insertion order, which is ascending
 *   original index, and skip surfaces that are empty after trimming;
 * - `byPosition` lists the token under every integer position of its
 *   discrete span.
 *
 * @param mecabTokens - MeCab tokens in their original order; not mutated.
 * @returns The populated lookup tables.
 */
function buildMecabLookup(mecabTokens: readonly MergedToken[]): MecabLookup {
  const indexedTokens: IndexedMecabToken[] = [];
  for (const [index, token] of mecabTokens.entries()) {
    const pos1 = token.pos1;
    if (!pos1) {
      continue;
    }
    // Read `surface` once per token and reuse the local copy below.
    const surface = token.surface;
    const start = token.startPos ?? 0;
    const end = token.endPos ?? start + surface.length;
    indexedTokens.push({
      index,
      start,
      end,
      surface,
      pos1,
      pos2: token.pos2,
      pos3: token.pos3,
    });
  }

  const byExactSurface = new Map<string, IndexedMecabToken[]>();
  const byTrimmedSurface = new Map<string, IndexedMecabToken[]>();
  const byPosition = new Map<number, IndexedMecabToken[]>();
  for (const token of indexedTokens) {
    pushMapValue(byExactSurface, token.surface, token);
    const trimmedSurface = token.surface.trim();
    if (trimmedSurface) {
      pushMapValue(byTrimmedSurface, trimmedSurface, token);
    }

    const discreteSpan = toDiscreteSpan(token.start, token.end);
    for (let position = discreteSpan.start; position < discreteSpan.end; position += 1) {
      pushMapValue(byPosition, position, token);
    }
  }

  // Exact-surface buckets are binary-searched by start offset, so they must be
  // ordered by start; the original index breaks ties deterministically.
  const byStartThenIndexSort = (left: IndexedMecabToken, right: IndexedMecabToken) =>
    left.start - right.start || left.index - right.index;
  for (const candidates of byExactSurface.values()) {
    candidates.sort(byStartThenIndexSort);
  }

  return {
    indexedTokens,
    byExactSurface,
    byTrimmedSurface,
    byPosition,
  };
}
|
||||
|
||||
/**
 * Binary search for the first candidate whose `start` is not less than
 * `targetStart`.
 *
 * @param candidates - Candidates ordered by ascending `start`; not mutated.
 * @param targetStart - Start offset to search for.
 * @returns Index of the first candidate with `start >= targetStart`, or
 *   `candidates.length` when every candidate starts earlier.
 */
function lowerBoundByStart(candidates: readonly IndexedMecabToken[], targetStart: number): number {
  let low = 0;
  let high = candidates.length;
  while (low < high) {
    const mid = Math.floor((low + high) / 2);
    // `mid` is always within [low, high), so the element exists.
    if (candidates[mid]!.start < targetStart) {
      low = mid + 1;
    } else {
      high = mid;
    }
  }
  return low;
}
|
||||
|
||||
/**
 * Binary search for the first candidate whose original MeCab `index` is not
 * less than `targetIndex`.
 *
 * @param candidates - Candidates ordered by ascending `index`; not mutated.
 * @param targetIndex - Original-array index to search for.
 * @returns Index of the first candidate with `index >= targetIndex`, or
 *   `candidates.length` when every candidate comes earlier.
 */
function lowerBoundByIndex(candidates: readonly IndexedMecabToken[], targetIndex: number): number {
  let low = 0;
  let high = candidates.length;
  while (low < high) {
    const mid = Math.floor((low + high) / 2);
    // `mid` is always within [low, high), so the element exists.
    if (candidates[mid]!.index < targetIndex) {
      low = mid + 1;
    } else {
      high = mid;
    }
  }
  return low;
}
|
||||
|
||||
function joinUniqueTags(values: Array<string | undefined>): string | undefined {
|
||||
const unique: string[] = [];
|
||||
for (const value of values) {
|
||||
@@ -29,87 +143,127 @@ function joinUniqueTags(values: Array<string | undefined>): string | undefined {
|
||||
return unique.join('|');
|
||||
}
|
||||
|
||||
function pickClosestMecabPosMetadata(
|
||||
function pickClosestMecabPosMetadataBySurface(
|
||||
token: MergedToken,
|
||||
mecabTokens: MergedToken[],
|
||||
candidates: IndexedMecabToken[] | undefined,
|
||||
): MecabPosMetadata | null {
|
||||
if (mecabTokens.length === 0) {
|
||||
if (!candidates || candidates.length === 0) {
|
||||
return null;
|
||||
}
|
||||
|
||||
const tokenStart = token.startPos ?? 0;
|
||||
const tokenEnd = token.endPos ?? tokenStart + token.surface.length;
|
||||
let bestSurfaceMatchToken: MergedToken | null = null;
|
||||
let bestSurfaceMatchToken: IndexedMecabToken | null = null;
|
||||
let bestSurfaceMatchDistance = Number.MAX_SAFE_INTEGER;
|
||||
let bestSurfaceMatchEndDistance = Number.MAX_SAFE_INTEGER;
|
||||
let bestSurfaceMatchIndex = Number.MAX_SAFE_INTEGER;
|
||||
|
||||
for (const mecabToken of mecabTokens) {
|
||||
if (!mecabToken.pos1) {
|
||||
continue;
|
||||
const nearestStartIndex = lowerBoundByStart(candidates, tokenStart);
|
||||
let left = nearestStartIndex - 1;
|
||||
let right = nearestStartIndex;
|
||||
|
||||
while (left >= 0 || right < candidates.length) {
|
||||
const leftDistance =
|
||||
left >= 0 ? Math.abs(candidates[left]!.start - tokenStart) : Number.MAX_SAFE_INTEGER;
|
||||
const rightDistance =
|
||||
right < candidates.length
|
||||
? Math.abs(candidates[right]!.start - tokenStart)
|
||||
: Number.MAX_SAFE_INTEGER;
|
||||
const nearestDistance = Math.min(leftDistance, rightDistance);
|
||||
if (nearestDistance > bestSurfaceMatchDistance) {
|
||||
break;
|
||||
}
|
||||
|
||||
if (mecabToken.surface !== token.surface) {
|
||||
continue;
|
||||
if (leftDistance === nearestDistance && left >= 0) {
|
||||
const candidate = candidates[left]!;
|
||||
const startDistance = Math.abs(candidate.start - tokenStart);
|
||||
const endDistance = Math.abs(candidate.end - tokenEnd);
|
||||
if (
|
||||
startDistance < bestSurfaceMatchDistance ||
|
||||
(startDistance === bestSurfaceMatchDistance &&
|
||||
(endDistance < bestSurfaceMatchEndDistance ||
|
||||
(endDistance === bestSurfaceMatchEndDistance && candidate.index < bestSurfaceMatchIndex)))
|
||||
) {
|
||||
bestSurfaceMatchDistance = startDistance;
|
||||
bestSurfaceMatchEndDistance = endDistance;
|
||||
bestSurfaceMatchIndex = candidate.index;
|
||||
bestSurfaceMatchToken = candidate;
|
||||
}
|
||||
left -= 1;
|
||||
}
|
||||
|
||||
const mecabStart = mecabToken.startPos ?? 0;
|
||||
const mecabEnd = mecabToken.endPos ?? mecabStart + mecabToken.surface.length;
|
||||
const startDistance = Math.abs(mecabStart - tokenStart);
|
||||
const endDistance = Math.abs(mecabEnd - tokenEnd);
|
||||
|
||||
if (
|
||||
startDistance < bestSurfaceMatchDistance ||
|
||||
(startDistance === bestSurfaceMatchDistance && endDistance < bestSurfaceMatchEndDistance)
|
||||
) {
|
||||
bestSurfaceMatchDistance = startDistance;
|
||||
bestSurfaceMatchEndDistance = endDistance;
|
||||
bestSurfaceMatchToken = mecabToken;
|
||||
if (rightDistance === nearestDistance && right < candidates.length) {
|
||||
const candidate = candidates[right]!;
|
||||
const startDistance = Math.abs(candidate.start - tokenStart);
|
||||
const endDistance = Math.abs(candidate.end - tokenEnd);
|
||||
if (
|
||||
startDistance < bestSurfaceMatchDistance ||
|
||||
(startDistance === bestSurfaceMatchDistance &&
|
||||
(endDistance < bestSurfaceMatchEndDistance ||
|
||||
(endDistance === bestSurfaceMatchEndDistance && candidate.index < bestSurfaceMatchIndex)))
|
||||
) {
|
||||
bestSurfaceMatchDistance = startDistance;
|
||||
bestSurfaceMatchEndDistance = endDistance;
|
||||
bestSurfaceMatchIndex = candidate.index;
|
||||
bestSurfaceMatchToken = candidate;
|
||||
}
|
||||
right += 1;
|
||||
}
|
||||
}
|
||||
|
||||
if (bestSurfaceMatchToken) {
|
||||
if (bestSurfaceMatchToken !== null) {
|
||||
return {
|
||||
pos1: bestSurfaceMatchToken.pos1 as string,
|
||||
pos1: bestSurfaceMatchToken.pos1,
|
||||
pos2: bestSurfaceMatchToken.pos2,
|
||||
pos3: bestSurfaceMatchToken.pos3,
|
||||
};
|
||||
}
|
||||
|
||||
let bestToken: MergedToken | null = null;
|
||||
return null;
|
||||
}
|
||||
|
||||
function pickClosestMecabPosMetadataByOverlap(
|
||||
token: MergedToken,
|
||||
candidates: IndexedMecabToken[],
|
||||
): MecabPosMetadata | null {
|
||||
const tokenStart = token.startPos ?? 0;
|
||||
const tokenEnd = token.endPos ?? tokenStart + token.surface.length;
|
||||
let bestToken: IndexedMecabToken | null = null;
|
||||
let bestOverlap = 0;
|
||||
let bestSpan = 0;
|
||||
let bestStartDistance = Number.MAX_SAFE_INTEGER;
|
||||
let bestStart = Number.MAX_SAFE_INTEGER;
|
||||
const overlappingTokens: MergedToken[] = [];
|
||||
let bestIndex = Number.MAX_SAFE_INTEGER;
|
||||
const overlappingTokens: IndexedMecabToken[] = [];
|
||||
|
||||
for (const mecabToken of mecabTokens) {
|
||||
if (!mecabToken.pos1) {
|
||||
continue;
|
||||
}
|
||||
|
||||
const mecabStart = mecabToken.startPos ?? 0;
|
||||
const mecabEnd = mecabToken.endPos ?? mecabStart + mecabToken.surface.length;
|
||||
for (const candidate of candidates) {
|
||||
const mecabStart = candidate.start;
|
||||
const mecabEnd = candidate.end;
|
||||
const overlapStart = Math.max(tokenStart, mecabStart);
|
||||
const overlapEnd = Math.min(tokenEnd, mecabEnd);
|
||||
const overlap = Math.max(0, overlapEnd - overlapStart);
|
||||
if (overlap === 0) {
|
||||
continue;
|
||||
}
|
||||
overlappingTokens.push(mecabToken);
|
||||
overlappingTokens.push(candidate);
|
||||
|
||||
const span = mecabEnd - mecabStart;
|
||||
const startDistance = Math.abs(mecabStart - tokenStart);
|
||||
if (
|
||||
overlap > bestOverlap ||
|
||||
(overlap === bestOverlap &&
|
||||
(Math.abs(mecabStart - tokenStart) < bestStartDistance ||
|
||||
(Math.abs(mecabStart - tokenStart) === bestStartDistance &&
|
||||
(span > bestSpan || (span === bestSpan && mecabStart < bestStart)))))
|
||||
(startDistance < bestStartDistance ||
|
||||
(startDistance === bestStartDistance &&
|
||||
(span > bestSpan ||
|
||||
(span === bestSpan &&
|
||||
(mecabStart < bestStart ||
|
||||
(mecabStart === bestStart && candidate.index < bestIndex)))))))
|
||||
) {
|
||||
bestOverlap = overlap;
|
||||
bestSpan = span;
|
||||
bestStartDistance = Math.abs(mecabStart - tokenStart);
|
||||
bestStartDistance = startDistance;
|
||||
bestStart = mecabStart;
|
||||
bestToken = mecabToken;
|
||||
bestIndex = candidate.index;
|
||||
bestToken = candidate;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -117,12 +271,15 @@ function pickClosestMecabPosMetadata(
|
||||
return null;
|
||||
}
|
||||
|
||||
const overlapPos1 = joinUniqueTags(overlappingTokens.map((token) => token.pos1));
|
||||
const overlapPos2 = joinUniqueTags(overlappingTokens.map((token) => token.pos2));
|
||||
const overlapPos3 = joinUniqueTags(overlappingTokens.map((token) => token.pos3));
|
||||
const overlappingTokensByMecabOrder = overlappingTokens
|
||||
.slice()
|
||||
.sort((left, right) => left.index - right.index);
|
||||
const overlapPos1 = joinUniqueTags(overlappingTokensByMecabOrder.map((candidate) => candidate.pos1));
|
||||
const overlapPos2 = joinUniqueTags(overlappingTokensByMecabOrder.map((candidate) => candidate.pos2));
|
||||
const overlapPos3 = joinUniqueTags(overlappingTokensByMecabOrder.map((candidate) => candidate.pos3));
|
||||
|
||||
return {
|
||||
pos1: overlapPos1 ?? (bestToken.pos1 as string),
|
||||
pos1: overlapPos1 ?? bestToken.pos1,
|
||||
pos2: overlapPos2 ?? bestToken.pos2,
|
||||
pos3: overlapPos3 ?? bestToken.pos3,
|
||||
};
|
||||
@@ -130,13 +287,9 @@ function pickClosestMecabPosMetadata(
|
||||
|
||||
function fillMissingPos1BySurfaceSequence(
|
||||
tokens: MergedToken[],
|
||||
mecabTokens: MergedToken[],
|
||||
byTrimmedSurface: Map<string, IndexedMecabToken[]>,
|
||||
): MergedToken[] {
|
||||
const indexedMecabTokens = mecabTokens
|
||||
.map((token, index) => ({ token, index }))
|
||||
.filter(({ token }) => token.pos1 && token.surface.trim().length > 0);
|
||||
|
||||
if (indexedMecabTokens.length === 0) {
|
||||
if (byTrimmedSurface.size === 0) {
|
||||
return tokens;
|
||||
}
|
||||
|
||||
@@ -151,27 +304,13 @@ function fillMissingPos1BySurfaceSequence(
|
||||
return token;
|
||||
}
|
||||
|
||||
let best: { token: MergedToken; index: number } | null = null;
|
||||
for (const candidate of indexedMecabTokens) {
|
||||
if (candidate.token.surface !== surface) {
|
||||
continue;
|
||||
}
|
||||
if (candidate.index < cursor) {
|
||||
continue;
|
||||
}
|
||||
best = { token: candidate.token, index: candidate.index };
|
||||
break;
|
||||
const candidates = byTrimmedSurface.get(surface);
|
||||
if (!candidates || candidates.length === 0) {
|
||||
return token;
|
||||
}
|
||||
|
||||
if (!best) {
|
||||
for (const candidate of indexedMecabTokens) {
|
||||
if (candidate.token.surface !== surface) {
|
||||
continue;
|
||||
}
|
||||
best = { token: candidate.token, index: candidate.index };
|
||||
break;
|
||||
}
|
||||
}
|
||||
const atOrAfterCursorIndex = lowerBoundByIndex(candidates, cursor);
|
||||
const best = candidates[atOrAfterCursorIndex] ?? candidates[0];
|
||||
|
||||
if (!best) {
|
||||
return token;
|
||||
@@ -180,13 +319,41 @@ function fillMissingPos1BySurfaceSequence(
|
||||
cursor = best.index + 1;
|
||||
return {
|
||||
...token,
|
||||
pos1: best.token.pos1,
|
||||
pos2: best.token.pos2,
|
||||
pos3: best.token.pos3,
|
||||
pos1: best.pos1,
|
||||
pos2: best.pos2,
|
||||
pos3: best.pos3,
|
||||
};
|
||||
});
|
||||
}
|
||||
|
||||
/**
 * Gathers the MeCab tokens indexed under any integer position covered by
 * `token`.
 *
 * The token's span uses `startPos` (default 0) and `endPos` (default
 * `start + surface.length`), widened to whole positions. Each MeCab token is
 * returned once, identified by its original index, in the order it is first
 * encountered while walking the positions from left to right.
 *
 * @param token - Token whose span is being looked up.
 * @param byPosition - Position index of MeCab tokens.
 * @returns De-duplicated MeCab tokens found at the covered positions; empty
 *   when none of the positions has an entry.
 */
function collectOverlapCandidatesByPosition(
  token: MergedToken,
  byPosition: Map<number, IndexedMecabToken[]>,
): IndexedMecabToken[] {
  const tokenStart = token.startPos ?? 0;
  const tokenEnd = token.endPos ?? tokenStart + token.surface.length;
  const discreteSpan = toDiscreteSpan(tokenStart, tokenEnd);
  // A MeCab token spanning several positions appears in several buckets;
  // track original indexes so it is collected only once.
  const seen = new Set<number>();
  const overlapCandidates: IndexedMecabToken[] = [];

  for (let position = discreteSpan.start; position < discreteSpan.end; position += 1) {
    const candidatesAtPosition = byPosition.get(position);
    if (!candidatesAtPosition) {
      continue;
    }

    for (const candidate of candidatesAtPosition) {
      if (seen.has(candidate.index)) {
        continue;
      }
      seen.add(candidate.index);
      overlapCandidates.push(candidate);
    }
  }

  return overlapCandidates;
}
|
||||
|
||||
export function enrichTokensWithMecabPos1(
|
||||
tokens: MergedToken[],
|
||||
mecabTokens: MergedToken[] | null,
|
||||
@@ -199,12 +366,36 @@ export function enrichTokensWithMecabPos1(
|
||||
return tokens;
|
||||
}
|
||||
|
||||
const overlapEnriched = tokens.map((token) => {
|
||||
const lookup = buildMecabLookup(mecabTokens);
|
||||
if (lookup.indexedTokens.length === 0) {
|
||||
return tokens;
|
||||
}
|
||||
|
||||
const metadataByTokenIndex = new Map<number, MecabPosMetadata>();
|
||||
|
||||
for (const [index, token] of tokens.entries()) {
|
||||
if (token.pos1) {
|
||||
return token;
|
||||
continue;
|
||||
}
|
||||
|
||||
const metadata = pickClosestMecabPosMetadata(token, mecabTokens);
|
||||
const surfaceMetadata = pickClosestMecabPosMetadataBySurface(
|
||||
token,
|
||||
lookup.byExactSurface.get(token.surface),
|
||||
);
|
||||
if (surfaceMetadata) {
|
||||
metadataByTokenIndex.set(index, surfaceMetadata);
|
||||
continue;
|
||||
}
|
||||
|
||||
const overlapCandidates = collectOverlapCandidatesByPosition(token, lookup.byPosition);
|
||||
const overlapMetadata = pickClosestMecabPosMetadataByOverlap(token, overlapCandidates);
|
||||
if (overlapMetadata) {
|
||||
metadataByTokenIndex.set(index, overlapMetadata);
|
||||
}
|
||||
}
|
||||
|
||||
const overlapEnriched = tokens.map((token, index) => {
|
||||
const metadata = metadataByTokenIndex.get(index);
|
||||
if (!metadata) {
|
||||
return token;
|
||||
}
|
||||
@@ -217,5 +408,5 @@ export function enrichTokensWithMecabPos1(
|
||||
};
|
||||
});
|
||||
|
||||
return fillMissingPos1BySurfaceSequence(overlapEnriched, mecabTokens);
|
||||
return fillMissingPos1BySurfaceSequence(overlapEnriched, lookup.byTrimmedSurface);
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user