mirror of
https://github.com/ksyasuda/SubMiner.git
synced 2026-03-01 18:22:41 -08:00
fix(tokenizer): restore n+1 highlighting with mecab pos enrichment
This commit is contained in:
@@ -75,6 +75,8 @@ export interface TokenizerServiceDeps {
|
||||
|
||||
/**
 * Structural view of a MeCab tokenizer as consumed by the tokenizer runtime.
 *
 * Only `tokenize` is required. The two availability hooks are optional;
 * `createTokenizerDepsRuntime` uses them only when both are present.
 */
interface MecabTokenizerLike {
  /** Tokenizes `text`. Resolves to the token list, or `null`. */
  tokenize: (text: string) => Promise<Token[] | null>;

  /**
   * Optional asynchronous availability probe. The runtime awaits it before
   * tokenizing when `getStatus()` reports the tokenizer as not available.
   */
  checkAvailability?: () => Promise<boolean>;

  /** Optional synchronous snapshot of the tokenizer's availability. */
  getStatus?: () => { available: boolean };
}
|
||||
|
||||
export interface TokenizerDepsRuntimeOptions {
|
||||
@@ -182,6 +184,8 @@ function getCachedFrequencyRank(
|
||||
export function createTokenizerDepsRuntime(
|
||||
options: TokenizerDepsRuntimeOptions,
|
||||
): TokenizerServiceDeps {
|
||||
const checkedMecabTokenizers = new WeakSet<object>();
|
||||
|
||||
return {
|
||||
getYomitanExt: options.getYomitanExt,
|
||||
getYomitanParserWindow: options.getYomitanParserWindow,
|
||||
@@ -203,6 +207,19 @@ export function createTokenizerDepsRuntime(
|
||||
if (!mecabTokenizer) {
|
||||
return null;
|
||||
}
|
||||
|
||||
if (
|
||||
typeof mecabTokenizer.checkAvailability === 'function' &&
|
||||
typeof mecabTokenizer.getStatus === 'function' &&
|
||||
!checkedMecabTokenizers.has(mecabTokenizer as object)
|
||||
) {
|
||||
const status = mecabTokenizer.getStatus();
|
||||
if (!status.available) {
|
||||
await mecabTokenizer.checkAvailability();
|
||||
}
|
||||
checkedMecabTokenizers.add(mecabTokenizer as object);
|
||||
}
|
||||
|
||||
const rawTokens = await mecabTokenizer.tokenize(text);
|
||||
if (!rawTokens || rawTokens.length === 0) {
|
||||
return null;
|
||||
@@ -688,10 +705,42 @@ function pickClosestMecabPos1(token: MergedToken, mecabTokens: MergedToken[]): s
|
||||
|
||||
const tokenStart = token.startPos ?? 0;
|
||||
const tokenEnd = token.endPos ?? tokenStart + token.surface.length;
|
||||
let bestSurfaceMatchPos1: string | undefined;
|
||||
let bestSurfaceMatchDistance = Number.MAX_SAFE_INTEGER;
|
||||
let bestSurfaceMatchEndDistance = Number.MAX_SAFE_INTEGER;
|
||||
|
||||
for (const mecabToken of mecabTokens) {
|
||||
if (!mecabToken.pos1) {
|
||||
continue;
|
||||
}
|
||||
|
||||
if (mecabToken.surface !== token.surface) {
|
||||
continue;
|
||||
}
|
||||
|
||||
const mecabStart = mecabToken.startPos ?? 0;
|
||||
const mecabEnd = mecabToken.endPos ?? mecabStart + mecabToken.surface.length;
|
||||
const startDistance = Math.abs(mecabStart - tokenStart);
|
||||
const endDistance = Math.abs(mecabEnd - tokenEnd);
|
||||
|
||||
if (
|
||||
startDistance < bestSurfaceMatchDistance ||
|
||||
(startDistance === bestSurfaceMatchDistance && endDistance < bestSurfaceMatchEndDistance)
|
||||
) {
|
||||
bestSurfaceMatchDistance = startDistance;
|
||||
bestSurfaceMatchEndDistance = endDistance;
|
||||
bestSurfaceMatchPos1 = mecabToken.pos1;
|
||||
}
|
||||
}
|
||||
|
||||
if (bestSurfaceMatchPos1) {
|
||||
return bestSurfaceMatchPos1;
|
||||
}
|
||||
|
||||
let bestPos1: string | undefined;
|
||||
let bestOverlap = 0;
|
||||
let bestSpan = 0;
|
||||
let bestStartDistance = Number.MAX_SAFE_INTEGER;
|
||||
let bestStart = Number.MAX_SAFE_INTEGER;
|
||||
|
||||
for (const mecabToken of mecabTokens) {
|
||||
@@ -712,10 +761,13 @@ function pickClosestMecabPos1(token: MergedToken, mecabTokens: MergedToken[]): s
|
||||
if (
|
||||
overlap > bestOverlap ||
|
||||
(overlap === bestOverlap &&
|
||||
(span > bestSpan || (span === bestSpan && mecabStart < bestStart)))
|
||||
(Math.abs(mecabStart - tokenStart) < bestStartDistance ||
|
||||
(Math.abs(mecabStart - tokenStart) === bestStartDistance &&
|
||||
(span > bestSpan || (span === bestSpan && mecabStart < bestStart)))))
|
||||
) {
|
||||
bestOverlap = overlap;
|
||||
bestSpan = span;
|
||||
bestStartDistance = Math.abs(mecabStart - tokenStart);
|
||||
bestStart = mecabStart;
|
||||
bestPos1 = mecabToken.pos1;
|
||||
}
|
||||
@@ -724,6 +776,63 @@ function pickClosestMecabPos1(token: MergedToken, mecabTokens: MergedToken[]): s
|
||||
return bestOverlap > 0 ? bestPos1 : undefined;
|
||||
}
|
||||
|
||||
/**
 * Fills in `pos1` for tokens that still lack one by matching their surface
 * text against MeCab tokens, consuming MeCab tokens in sequence.
 *
 * Matching rules:
 * - A token whose `pos1` is already non-blank, or whose surface is blank, is
 *   returned unchanged (same object).
 * - Surfaces are compared after trimming on BOTH sides.
 * - Only MeCab tokens with a non-blank `pos1` and non-blank surface can donate
 *   a `pos1`; a whitespace-only `pos1` is treated as missing, mirroring the
 *   check applied to the tokens being filled.
 * - The first same-surface MeCab token at or after the cursor wins. If there
 *   is none, the first same-surface MeCab token anywhere is used instead.
 *   After a match the cursor moves to just past the matched MeCab token.
 *
 * @param tokens Tokens to enrich; never mutated.
 * @param mecabTokens MeCab tokens used as the source of `pos1` values.
 * @returns `tokens` itself when no MeCab token can donate a `pos1`; otherwise
 *   a new array in which matched tokens are shallow copies carrying `pos1`.
 */
function fillMissingPos1BySurfaceSequence(
  tokens: MergedToken[],
  mecabTokens: MergedToken[],
): MergedToken[] {
  // Keep each donor's original MeCab index so the cursor advances in MeCab
  // order even though unusable entries are dropped here.
  const donors: { surface: string; pos1: string; index: number }[] = [];
  mecabTokens.forEach((mecabToken, index) => {
    const surface = mecabToken.surface.trim();
    const pos1 = mecabToken.pos1;
    if (surface.length > 0 && pos1 && pos1.trim().length > 0) {
      donors.push({ surface, pos1, index });
    }
  });

  if (donors.length === 0) {
    return tokens;
  }

  let cursor = 0;
  return tokens.map((token) => {
    if (token.pos1 && token.pos1.trim().length > 0) {
      return token;
    }

    const surface = token.surface.trim();
    if (!surface) {
      return token;
    }

    // Prefer the next occurrence in sequence; otherwise fall back to the
    // earliest occurrence of the same surface.
    const match =
      donors.find((donor) => donor.surface === surface && donor.index >= cursor) ??
      donors.find((donor) => donor.surface === surface);

    if (!match) {
      return token;
    }

    cursor = match.index + 1;
    return {
      ...token,
      pos1: match.pos1,
    };
  });
}
|
||||
|
||||
async function enrichYomitanPos1(
|
||||
tokens: MergedToken[],
|
||||
deps: TokenizerServiceDeps,
|
||||
@@ -756,7 +865,7 @@ async function enrichYomitanPos1(
|
||||
return tokens;
|
||||
}
|
||||
|
||||
return tokens.map((token) => {
|
||||
const overlapEnriched = tokens.map((token) => {
|
||||
if (token.pos1) {
|
||||
return token;
|
||||
}
|
||||
@@ -771,6 +880,8 @@ async function enrichYomitanPos1(
|
||||
pos1,
|
||||
};
|
||||
});
|
||||
|
||||
return fillMissingPos1BySurfaceSequence(overlapEnriched, mecabTokens);
|
||||
}
|
||||
|
||||
async function ensureYomitanParserWindow(deps: TokenizerServiceDeps): Promise<boolean> {
|
||||
|
||||
Reference in New Issue
Block a user