fix(tokenizer): tighten N+1 eligibility using MeCab POS overlaps

This commit is contained in:
2026-02-28 19:07:43 -08:00
parent 498fd2d09a
commit a7d220e182
10 changed files with 736 additions and 43 deletions

View File

@@ -2038,3 +2038,125 @@ test('tokenizeSubtitle keeps frequency enrichment while n+1 is disabled', async
assert.equal(mecabCalls, 1);
assert.equal(frequencyCalls, 1);
});
test('tokenizeSubtitle excludes default non-independent pos2 from N+1 and frequency annotations', async () => {
  // A single merged MeCab token tagged 動詞/非自立 must lose its frequency
  // rank and never be marked as an N+1 target.
  const nonIndependentVerb = {
    headword: 'なる',
    surface: 'になれば',
    reading: 'ニナレバ',
    startPos: 0,
    endPos: 4,
    partOfSpeech: PartOfSpeech.verb,
    pos1: '動詞',
    pos2: '非自立',
    isMerged: true,
    isKnown: false,
    isNPlusOneTarget: false,
  };
  const deps = makeDepsFromYomitanTokens(
    [{ surface: 'になれば', reading: 'になれば', headword: 'なる' }],
    {
      getFrequencyDictionaryEnabled: () => true,
      getFrequencyRank: (text) => (text === 'なる' ? 11 : null),
      tokenizeWithMecab: async () => [nonIndependentVerb],
      getMinSentenceWordsForNPlusOne: () => 1,
    },
  );
  const result = await tokenizeSubtitle('になれば', deps);
  const [token] = result.tokens ?? [];
  assert.equal(result.tokens?.length, 1);
  assert.equal(token?.frequencyRank, undefined);
  assert.equal(token?.isNPlusOneTarget, false);
});
test('tokenizeSubtitle keeps merged token when overlap contains at least one content pos1 tag', async () => {
  // 助詞 + 動詞(自立) + 助詞: the independent-verb component keeps the merged
  // token eligible, producing the composite pos1 '助詞|動詞'.
  const particleNi = {
    headword: 'に',
    surface: 'に',
    reading: 'ニ',
    startPos: 0,
    endPos: 1,
    partOfSpeech: PartOfSpeech.particle,
    pos1: '助詞',
    pos2: '格助詞',
    isMerged: false,
    isKnown: false,
    isNPlusOneTarget: false,
  };
  const independentVerb = {
    headword: 'なる',
    surface: 'なれ',
    reading: 'ナレ',
    startPos: 1,
    endPos: 3,
    partOfSpeech: PartOfSpeech.verb,
    pos1: '動詞',
    pos2: '自立',
    isMerged: false,
    isKnown: false,
    isNPlusOneTarget: false,
  };
  const particleBa = {
    headword: 'ば',
    surface: 'ば',
    reading: 'バ',
    startPos: 3,
    endPos: 4,
    partOfSpeech: PartOfSpeech.particle,
    pos1: '助詞',
    pos2: '接続助詞',
    isMerged: false,
    isKnown: false,
    isNPlusOneTarget: false,
  };
  const deps = makeDepsFromYomitanTokens(
    [{ surface: 'になれば', reading: 'になれば', headword: 'なる' }],
    {
      getFrequencyDictionaryEnabled: () => true,
      getFrequencyRank: (text) => (text === 'なる' ? 13 : null),
      tokenizeWithMecab: async () => [particleNi, independentVerb, particleBa],
      getMinSentenceWordsForNPlusOne: () => 1,
    },
  );
  const result = await tokenizeSubtitle('になれば', deps);
  const [token] = result.tokens ?? [];
  assert.equal(result.tokens?.length, 1);
  assert.equal(token?.pos1, '助詞|動詞');
  assert.equal(token?.frequencyRank, 13);
  assert.equal(token?.isNPlusOneTarget, true);
});
test('tokenizeSubtitle excludes default non-independent pos2 from N+1 when JLPT/frequency are disabled', async () => {
  // Even with JLPT and frequency annotations off, MeCab must still run
  // exactly once so the 非自立 pos2 tag can veto N+1 targeting.
  let mecabCalls = 0;
  const nonIndependentVerb = {
    headword: 'なる',
    surface: 'になれば',
    reading: 'ニナレバ',
    startPos: 0,
    endPos: 4,
    partOfSpeech: PartOfSpeech.verb,
    pos1: '動詞',
    pos2: '非自立',
    isMerged: true,
    isKnown: false,
    isNPlusOneTarget: false,
  };
  const deps = makeDepsFromYomitanTokens(
    [{ surface: 'になれば', reading: 'になれば', headword: 'なる' }],
    {
      getJlptEnabled: () => false,
      getFrequencyDictionaryEnabled: () => false,
      getMinSentenceWordsForNPlusOne: () => 1,
      tokenizeWithMecab: async () => {
        mecabCalls += 1;
        return [nonIndependentVerb];
      },
    },
  );
  const result = await tokenizeSubtitle('になれば', deps);
  assert.equal(mecabCalls, 1);
  assert.equal(result.tokens?.length, 1);
  assert.equal(result.tokens?.[0]?.isNPlusOneTarget, false);
});

View File

@@ -10,6 +10,14 @@ import {
FrequencyDictionaryLookup,
JlptLevel,
} from '../../types';
import {
DEFAULT_ANNOTATION_POS1_EXCLUSION_CONFIG,
resolveAnnotationPos1ExclusionSet,
} from '../../token-pos1-exclusions';
import {
DEFAULT_ANNOTATION_POS2_EXCLUSION_CONFIG,
resolveAnnotationPos2ExclusionSet,
} from '../../token-pos2-exclusions';
import { selectYomitanParseTokens } from './tokenizer/parser-selection-stage';
import {
requestYomitanParseResults,
@@ -78,6 +86,8 @@ interface TokenizerAnnotationOptions {
frequencyEnabled: boolean;
frequencyMatchMode: FrequencyDictionaryMatchMode;
minSentenceWordsForNPlusOne: number | undefined;
pos1Exclusions: ReadonlySet<string>;
pos2Exclusions: ReadonlySet<string>;
}
let parserEnrichmentWorkerRuntimeModulePromise:
@@ -87,6 +97,12 @@ let annotationStageModulePromise: Promise<typeof import('./tokenizer/annotation-
let parserEnrichmentFallbackModulePromise:
| Promise<typeof import('./tokenizer/parser-enrichment-stage')>
| null = null;
// Default pos1/pos2 exclusion sets, resolved once at module load from the
// shared exclusion configs; passed through as the annotation options'
// pos1Exclusions/pos2Exclusions defaults.
const DEFAULT_ANNOTATION_POS1_EXCLUSIONS = resolveAnnotationPos1ExclusionSet(
  DEFAULT_ANNOTATION_POS1_EXCLUSION_CONFIG,
);
const DEFAULT_ANNOTATION_POS2_EXCLUSIONS = resolveAnnotationPos2ExclusionSet(
  DEFAULT_ANNOTATION_POS2_EXCLUSION_CONFIG,
);
function getKnownWordLookup(deps: TokenizerServiceDeps, options: TokenizerAnnotationOptions): (text: string) => boolean {
if (!options.nPlusOneEnabled) {
@@ -96,7 +112,7 @@ function getKnownWordLookup(deps: TokenizerServiceDeps, options: TokenizerAnnota
}
function needsMecabPosEnrichment(options: TokenizerAnnotationOptions): boolean {
return options.jlptEnabled || options.frequencyEnabled;
return options.nPlusOneEnabled || options.jlptEnabled || options.frequencyEnabled;
}
function hasAnyAnnotationEnabled(options: TokenizerAnnotationOptions): boolean {
@@ -389,6 +405,8 @@ function getAnnotationOptions(deps: TokenizerServiceDeps): TokenizerAnnotationOp
frequencyEnabled: deps.getFrequencyDictionaryEnabled?.() !== false,
frequencyMatchMode: deps.getFrequencyDictionaryMatchMode?.() ?? 'headword',
minSentenceWordsForNPlusOne: deps.getMinSentenceWordsForNPlusOne?.(),
pos1Exclusions: DEFAULT_ANNOTATION_POS1_EXCLUSIONS,
pos2Exclusions: DEFAULT_ANNOTATION_POS2_EXCLUSIONS,
};
}

View File

@@ -205,3 +205,171 @@ test('annotateTokens N+1 minimum sentence words counts only eligible word tokens
assert.equal(result[2]?.isKnown, true);
assert.equal(result[0]?.isNPlusOneTarget, false);
});
test('annotateTokens applies configured pos1 exclusions to both frequency and N+1', () => {
  // With 名詞 excluded, the noun loses its frequency rank and cannot be an
  // N+1 target; the known verb keeps its rank.
  const noun = makeToken({
    surface: '猫',
    headword: '猫',
    pos1: '名詞',
    frequencyRank: 21,
    startPos: 0,
    endPos: 1,
  });
  const verb = makeToken({
    surface: '走る',
    headword: '走る',
    pos1: '動詞',
    partOfSpeech: PartOfSpeech.verb,
    startPos: 1,
    endPos: 3,
    frequencyRank: 22,
  });
  const deps = makeDeps({ isKnownWord: (text) => text === '走る' });
  const [annotatedNoun, annotatedVerb] = annotateTokens([noun, verb], deps, {
    minSentenceWordsForNPlusOne: 1,
    pos1Exclusions: new Set(['名詞']),
  });
  assert.equal(annotatedNoun?.frequencyRank, undefined);
  assert.equal(annotatedVerb?.frequencyRank, 22);
  assert.equal(annotatedNoun?.isNPlusOneTarget, false);
  assert.equal(annotatedVerb?.isNPlusOneTarget, false);
});
test('annotateTokens allows previously default-excluded pos1 when removed from effective set', () => {
  // An explicitly empty pos1 exclusion set lets a bare 助詞 token keep its
  // frequency rank and become an N+1 target.
  const particle = makeToken({
    surface: 'は',
    headword: 'は',
    partOfSpeech: PartOfSpeech.other,
    pos1: '助詞',
    startPos: 0,
    endPos: 1,
    frequencyRank: 8,
  });
  const [annotated] = annotateTokens([particle], makeDeps(), {
    minSentenceWordsForNPlusOne: 1,
    pos1Exclusions: new Set(),
  });
  assert.equal(annotated?.frequencyRank, 8);
  assert.equal(annotated?.isNPlusOneTarget, true);
});
test('annotateTokens excludes default non-independent pos2 from frequency and N+1', () => {
  // The default pos2 exclusions cover 非自立, so the token is stripped of its
  // frequency rank and is not an N+1 target.
  const nonIndependent = makeToken({
    surface: 'になれば',
    headword: 'なる',
    partOfSpeech: PartOfSpeech.verb,
    pos1: '動詞',
    pos2: '非自立',
    startPos: 0,
    endPos: 4,
    frequencyRank: 7,
  });
  const [annotated] = annotateTokens([nonIndependent], makeDeps(), {
    minSentenceWordsForNPlusOne: 1,
  });
  assert.equal(annotated?.frequencyRank, undefined);
  assert.equal(annotated?.isNPlusOneTarget, false);
});
test('annotateTokens excludes likely kana SFX tokens from frequency when POS tags are missing', () => {
  // 'ぐわっ' carries no POS tags, so the kana sound-effect heuristic should
  // strip its frequency rank.
  const sfxToken = makeToken({
    surface: 'ぐわっ',
    reading: 'ぐわっ',
    headword: 'ぐわっ',
    pos1: '',
    pos2: '',
    frequencyRank: 12,
    startPos: 0,
    endPos: 3,
  });
  const [annotated] = annotateTokens([sfxToken], makeDeps(), {
    minSentenceWordsForNPlusOne: 1,
  });
  assert.equal(annotated?.frequencyRank, undefined);
});
test('annotateTokens allows previously default-excluded pos2 when removed from effective set', () => {
  // An explicitly empty pos2 exclusion set re-admits a 非自立 token to both
  // frequency display and N+1 targeting.
  const nonIndependent = makeToken({
    surface: 'になれば',
    headword: 'なる',
    partOfSpeech: PartOfSpeech.verb,
    pos1: '動詞',
    pos2: '非自立',
    startPos: 0,
    endPos: 4,
    frequencyRank: 9,
  });
  const [annotated] = annotateTokens([nonIndependent], makeDeps(), {
    minSentenceWordsForNPlusOne: 1,
    pos2Exclusions: new Set(),
  });
  assert.equal(annotated?.frequencyRank, 9);
  assert.equal(annotated?.isNPlusOneTarget, true);
});
test('annotateTokens keeps composite tokens when any component pos tag is content-bearing', () => {
  // The 動詞/自立 components in the '|'-joined tags keep the composite token
  // eligible even though the particle components are excluded by default.
  const compositeToken = makeToken({
    surface: 'になれば',
    headword: 'なる',
    pos1: '助詞|動詞',
    pos2: '格助詞|自立|接続助詞',
    startPos: 0,
    endPos: 4,
    frequencyRank: 5,
  });
  const [annotated] = annotateTokens([compositeToken], makeDeps(), {
    minSentenceWordsForNPlusOne: 1,
  });
  assert.equal(annotated?.frequencyRank, 5);
  assert.equal(annotated?.isNPlusOneTarget, true);
});
test('annotateTokens excludes composite tokens when all component pos tags are excluded', () => {
  // Every component of both '|'-joined tags is on a default exclusion list,
  // so the composite token is stripped of its rank and N+1 eligibility.
  const compositeParticle = makeToken({
    surface: 'けど',
    headword: 'けど',
    pos1: '助詞|助詞',
    pos2: '接続助詞|終助詞',
    startPos: 0,
    endPos: 2,
    frequencyRank: 6,
  });
  const [annotated] = annotateTokens([compositeParticle], makeDeps(), {
    minSentenceWordsForNPlusOne: 1,
  });
  assert.equal(annotated?.frequencyRank, undefined);
  assert.equal(annotated?.isNPlusOneTarget, false);
});

View File

@@ -1,4 +1,12 @@
import { markNPlusOneTargets } from '../../../token-merger';
import {
DEFAULT_ANNOTATION_POS1_EXCLUSION_CONFIG,
resolveAnnotationPos1ExclusionSet,
} from '../../../token-pos1-exclusions';
import {
DEFAULT_ANNOTATION_POS2_EXCLUSION_CONFIG,
resolveAnnotationPos2ExclusionSet,
} from '../../../token-pos2-exclusions';
import {
JlptLevel,
MergedToken,
@@ -28,6 +36,8 @@ export interface AnnotationStageOptions {
jlptEnabled?: boolean;
frequencyEnabled?: boolean;
minSentenceWordsForNPlusOne?: number;
pos1Exclusions?: ReadonlySet<string>;
pos2Exclusions?: ReadonlySet<string>;
}
function resolveKnownWordText(
@@ -53,22 +63,85 @@ function applyKnownWordMarking(
});
}
function isFrequencyExcludedByPos(token: MergedToken): boolean {
if (
token.partOfSpeech === PartOfSpeech.particle ||
token.partOfSpeech === PartOfSpeech.bound_auxiliary
) {
function normalizePos1Tag(pos1: string | undefined): string {
  // Treat a missing tag as the empty string; strip surrounding whitespace.
  return pos1?.trim() ?? '';
}
function isExcludedByTagSet(
  normalizedTag: string,
  exclusions: ReadonlySet<string>,
): boolean {
  // A (possibly composite, '|'-joined) tag is excluded only when every
  // non-empty component is in the exclusion set; empty tags never are.
  if (normalizedTag === '') {
    return false;
  }
  const components: string[] = [];
  for (const rawPart of normalizedTag.split('|')) {
    const part = rawPart.trim();
    if (part !== '') {
      components.push(part);
    }
  }
  if (components.length === 0) {
    return false;
  }
  for (const component of components) {
    if (!exclusions.has(component)) {
      return false;
    }
  }
  return true;
}
function resolvePos1Exclusions(options: AnnotationStageOptions): ReadonlySet<string> {
  // A caller-provided set (even an empty one) wins; otherwise resolve the
  // default pos1 exclusion configuration.
  return (
    options.pos1Exclusions ??
    resolveAnnotationPos1ExclusionSet(DEFAULT_ANNOTATION_POS1_EXCLUSION_CONFIG)
  );
}
function resolvePos2Exclusions(options: AnnotationStageOptions): ReadonlySet<string> {
  // A caller-provided set (even an empty one) wins; otherwise resolve the
  // default pos2 exclusion configuration.
  return (
    options.pos2Exclusions ??
    resolveAnnotationPos2ExclusionSet(DEFAULT_ANNOTATION_POS2_EXCLUSION_CONFIG)
  );
}
function normalizePos2Tag(pos2: string | undefined): string {
  // Treat a missing tag as the empty string; strip surrounding whitespace.
  return pos2?.trim() ?? '';
}
function isFrequencyExcludedByPos(
token: MergedToken,
pos1Exclusions: ReadonlySet<string>,
pos2Exclusions: ReadonlySet<string>,
): boolean {
const normalizedPos1 = normalizePos1Tag(token.pos1);
const hasPos1 = normalizedPos1.length > 0;
if (isExcludedByTagSet(normalizedPos1, pos1Exclusions)) {
return true;
}
return token.pos1 === '助詞' || token.pos1 === '助動詞';
const normalizedPos2 = normalizePos2Tag(token.pos2);
const hasPos2 = normalizedPos2.length > 0;
if (isExcludedByTagSet(normalizedPos2, pos2Exclusions)) {
return true;
}
if (hasPos1 || hasPos2) {
return false;
}
if (isLikelyFrequencyNoiseToken(token)) {
return true;
}
return (
token.partOfSpeech === PartOfSpeech.particle ||
token.partOfSpeech === PartOfSpeech.bound_auxiliary
);
}
function applyFrequencyMarking(
tokens: MergedToken[],
pos1Exclusions: ReadonlySet<string>,
pos2Exclusions: ReadonlySet<string>,
): MergedToken[] {
return tokens.map((token) => {
if (isFrequencyExcludedByPos(token)) {
if (isFrequencyExcludedByPos(token, pos1Exclusions, pos2Exclusions)) {
return { ...token, frequencyRank: undefined };
}
@@ -203,6 +276,101 @@ function isRepeatedKanaSfx(text: string): boolean {
return topCount >= Math.ceil(chars.length / 2);
}
function isTrailingSmallTsuKanaSfx(text: string): boolean {
  // A short all-kana run (2-4 chars) ending in small っ reads like a
  // sound-effect token (e.g. ぐわっ).
  const normalized = normalizeJlptTextForExclusion(text);
  if (!normalized) {
    return false;
  }
  const chars = Array.from(normalized);
  if (chars.length < 2 || chars.length > 4) {
    return false;
  }
  const allKana = chars.every(isKanaChar);
  return allKana && chars[chars.length - 1] === 'っ';
}
function isReduplicatedKanaSfx(text: string): boolean {
  // Detects doubled kana strings (e.g. わくわく): an even-length all-kana run
  // of at least 4 chars whose first half equals its second half.
  const normalized = normalizeJlptTextForExclusion(text);
  if (!normalized) {
    return false;
  }
  const chars = Array.from(normalized);
  const length = chars.length;
  if (length < 4 || length % 2 !== 0 || !chars.every(isKanaChar)) {
    return false;
  }
  const half = length / 2;
  for (let index = 0; index < half; index += 1) {
    if (chars[index] !== chars[index + half]) {
      return false;
    }
  }
  return true;
}
function hasAdjacentKanaRepeat(text: string): boolean {
  // True when an all-kana string contains the same character twice in a row.
  const normalized = normalizeJlptTextForExclusion(text);
  if (!normalized) {
    return false;
  }
  const chars = Array.from(normalized);
  if (!chars.every(isKanaChar)) {
    return false;
  }
  return chars.some((char, index) => index > 0 && char === chars[index - 1]);
}
function isLikelyFrequencyNoiseToken(token: MergedToken): boolean {
  // Heuristic noise filter for tokens with no usable POS tags: check both
  // the headword and the surface (raw and normalized forms) against the JLPT
  // ignore list and the kana sound-effect patterns.
  const looksLikeKanaSfx = (value: string): boolean =>
    hasAdjacentKanaRepeat(value) ||
    isReduplicatedKanaSfx(value) ||
    isTrailingSmallTsuKanaSfx(value);
  const candidates: string[] = [];
  if (typeof token.headword === 'string' && token.headword.length > 0) {
    candidates.push(token.headword);
  }
  if (typeof token.surface === 'string' && token.surface.length > 0) {
    candidates.push(token.surface);
  }
  for (const candidate of candidates) {
    const trimmedCandidate = candidate.trim();
    if (!trimmedCandidate) {
      continue;
    }
    const normalizedCandidate = normalizeJlptTextForExclusion(trimmedCandidate);
    if (!normalizedCandidate) {
      continue;
    }
    if (
      shouldIgnoreJlptByTerm(trimmedCandidate) ||
      shouldIgnoreJlptByTerm(normalizedCandidate)
    ) {
      return true;
    }
    if (looksLikeKanaSfx(trimmedCandidate) || looksLikeKanaSfx(normalizedCandidate)) {
      return true;
    }
  }
  return false;
}
function isJlptEligibleToken(token: MergedToken): boolean {
if (token.pos1 && shouldIgnoreJlptForMecabPos1(token.pos1)) {
return false;
@@ -261,6 +429,8 @@ export function annotateTokens(
deps: AnnotationStageDeps,
options: AnnotationStageOptions = {},
): MergedToken[] {
const pos1Exclusions = resolvePos1Exclusions(options);
const pos2Exclusions = resolvePos2Exclusions(options);
const nPlusOneEnabled = options.nPlusOneEnabled !== false;
const knownMarkedTokens = nPlusOneEnabled
? applyKnownWordMarking(tokens, deps.isKnownWord, deps.knownWordMatchMode)
@@ -273,7 +443,7 @@ export function annotateTokens(
const frequencyEnabled = options.frequencyEnabled !== false;
const frequencyMarkedTokens =
frequencyEnabled
? applyFrequencyMarking(knownMarkedTokens)
? applyFrequencyMarking(knownMarkedTokens, pos1Exclusions, pos2Exclusions)
: knownMarkedTokens.map((token) => ({
...token,
frequencyRank: undefined,
@@ -303,5 +473,10 @@ export function annotateTokens(
? minSentenceWordsForNPlusOne
: 3;
return markNPlusOneTargets(jlptMarkedTokens, sanitizedMinSentenceWordsForNPlusOne);
return markNPlusOneTargets(
jlptMarkedTokens,
sanitizedMinSentenceWordsForNPlusOne,
pos1Exclusions,
pos2Exclusions,
);
}

View File

@@ -22,12 +22,13 @@ function makeToken(overrides: Partial<MergedToken>): MergedToken {
test('enrichTokensWithMecabPos1 picks pos1 by best overlap when no surface match exists', () => {
const tokens = [makeToken({ surface: 'grouped', startPos: 2, endPos: 7 })];
const mecabTokens = [
makeToken({ surface: 'left', startPos: 0, endPos: 4, pos1: 'A' }),
makeToken({ surface: 'right', startPos: 2, endPos: 6, pos1: 'B' }),
makeToken({ surface: 'left', startPos: 0, endPos: 4, pos1: 'A', pos2: 'L2' }),
makeToken({ surface: 'right', startPos: 2, endPos: 6, pos1: 'B', pos2: '非自立' }),
];
const enriched = enrichTokensWithMecabPos1(tokens, mecabTokens);
assert.equal(enriched[0]?.pos1, 'B');
assert.equal(enriched[0]?.pos1, 'A|B');
assert.equal(enriched[0]?.pos2, 'L2|非自立');
});
test('enrichTokensWithMecabPos1 fills missing pos1 using surface-sequence fallback', () => {

View File

@@ -1,13 +1,45 @@
import { MergedToken } from '../../../types';
function pickClosestMecabPos1(token: MergedToken, mecabTokens: MergedToken[]): string | undefined {
if (mecabTokens.length === 0) {
type MecabPosMetadata = {
pos1: string;
pos2?: string;
pos3?: string;
};
/**
 * Trims each candidate tag, drops empty/undefined entries, and joins the
 * unique survivors with '|', preserving first-seen order.
 *
 * @param values - Raw POS tags collected from overlapping mecab tokens.
 * @returns The '|'-joined composite tag, a single tag when only one unique
 *   value remains, or undefined when no non-empty tag was provided.
 */
function joinUniqueTags(values: Array<string | undefined>): string | undefined {
  // Set keeps insertion order, replacing the original O(n^2) includes() scan.
  const unique = new Set<string>();
  for (const value of values) {
    const trimmed = value?.trim();
    if (trimmed) {
      unique.add(trimmed);
    }
  }
  if (unique.size === 0) {
    return undefined;
  }
  return [...unique].join('|');
}
function pickClosestMecabPosMetadata(
token: MergedToken,
mecabTokens: MergedToken[],
): MecabPosMetadata | null {
if (mecabTokens.length === 0) {
return null;
}
const tokenStart = token.startPos ?? 0;
const tokenEnd = token.endPos ?? tokenStart + token.surface.length;
let bestSurfaceMatchPos1: string | undefined;
let bestSurfaceMatchToken: MergedToken | null = null;
let bestSurfaceMatchDistance = Number.MAX_SAFE_INTEGER;
let bestSurfaceMatchEndDistance = Number.MAX_SAFE_INTEGER;
@@ -31,19 +63,24 @@ function pickClosestMecabPos1(token: MergedToken, mecabTokens: MergedToken[]): s
) {
bestSurfaceMatchDistance = startDistance;
bestSurfaceMatchEndDistance = endDistance;
bestSurfaceMatchPos1 = mecabToken.pos1;
bestSurfaceMatchToken = mecabToken;
}
}
if (bestSurfaceMatchPos1) {
return bestSurfaceMatchPos1;
if (bestSurfaceMatchToken) {
return {
pos1: bestSurfaceMatchToken.pos1 as string,
pos2: bestSurfaceMatchToken.pos2,
pos3: bestSurfaceMatchToken.pos3,
};
}
let bestPos1: string | undefined;
let bestToken: MergedToken | null = null;
let bestOverlap = 0;
let bestSpan = 0;
let bestStartDistance = Number.MAX_SAFE_INTEGER;
let bestStart = Number.MAX_SAFE_INTEGER;
const overlappingTokens: MergedToken[] = [];
for (const mecabToken of mecabTokens) {
if (!mecabToken.pos1) {
@@ -58,6 +95,7 @@ function pickClosestMecabPos1(token: MergedToken, mecabTokens: MergedToken[]): s
if (overlap === 0) {
continue;
}
overlappingTokens.push(mecabToken);
const span = mecabEnd - mecabStart;
if (
@@ -71,11 +109,23 @@ function pickClosestMecabPos1(token: MergedToken, mecabTokens: MergedToken[]): s
bestSpan = span;
bestStartDistance = Math.abs(mecabStart - tokenStart);
bestStart = mecabStart;
bestPos1 = mecabToken.pos1;
bestToken = mecabToken;
}
}
return bestOverlap > 0 ? bestPos1 : undefined;
if (bestOverlap === 0 || !bestToken) {
return null;
}
const overlapPos1 = joinUniqueTags(overlappingTokens.map((token) => token.pos1));
const overlapPos2 = joinUniqueTags(overlappingTokens.map((token) => token.pos2));
const overlapPos3 = joinUniqueTags(overlappingTokens.map((token) => token.pos3));
return {
pos1: overlapPos1 ?? (bestToken.pos1 as string),
pos2: overlapPos2 ?? bestToken.pos2,
pos3: overlapPos3 ?? bestToken.pos3,
};
}
function fillMissingPos1BySurfaceSequence(
@@ -101,7 +151,7 @@ function fillMissingPos1BySurfaceSequence(
return token;
}
let best: { pos1: string; index: number } | null = null;
let best: { token: MergedToken; index: number } | null = null;
for (const candidate of indexedMecabTokens) {
if (candidate.token.surface !== surface) {
continue;
@@ -109,7 +159,7 @@ function fillMissingPos1BySurfaceSequence(
if (candidate.index < cursor) {
continue;
}
best = { pos1: candidate.token.pos1 as string, index: candidate.index };
best = { token: candidate.token, index: candidate.index };
break;
}
@@ -118,7 +168,7 @@ function fillMissingPos1BySurfaceSequence(
if (candidate.token.surface !== surface) {
continue;
}
best = { pos1: candidate.token.pos1 as string, index: candidate.index };
best = { token: candidate.token, index: candidate.index };
break;
}
}
@@ -130,7 +180,9 @@ function fillMissingPos1BySurfaceSequence(
cursor = best.index + 1;
return {
...token,
pos1: best.pos1,
pos1: best.token.pos1,
pos2: best.token.pos2,
pos3: best.token.pos3,
};
});
}
@@ -152,14 +204,16 @@ export function enrichTokensWithMecabPos1(
return token;
}
const pos1 = pickClosestMecabPos1(token, mecabTokens);
if (!pos1) {
const metadata = pickClosestMecabPosMetadata(token, mecabTokens);
if (!metadata) {
return token;
}
return {
...token,
pos1,
pos1: metadata.pos1,
pos2: metadata.pos2,
pos3: metadata.pos3,
};
});