fix(tokenizer): tighten N+1 eligibility using MeCab POS overlaps

This commit is contained in:
2026-02-28 19:07:43 -08:00
parent 498fd2d09a
commit a7d220e182
10 changed files with 736 additions and 43 deletions

View File

@@ -2038,3 +2038,125 @@ test('tokenizeSubtitle keeps frequency enrichment while n+1 is disabled', async
assert.equal(mecabCalls, 1);
assert.equal(frequencyCalls, 1);
});
test('tokenizeSubtitle excludes default non-independent pos2 from N+1 and frequency annotations', async () => {
  // A single merged MeCab token tagged 動詞/非自立 must lose its frequency
  // rank and never be marked as an N+1 target.
  const nonIndependentVerb = {
    headword: 'なる',
    surface: 'になれば',
    reading: 'ニナレバ',
    startPos: 0,
    endPos: 4,
    partOfSpeech: PartOfSpeech.verb,
    pos1: '動詞',
    pos2: '非自立',
    isMerged: true,
    isKnown: false,
    isNPlusOneTarget: false,
  };
  const deps = makeDepsFromYomitanTokens(
    [{ surface: 'になれば', reading: 'になれば', headword: 'なる' }],
    {
      getFrequencyDictionaryEnabled: () => true,
      getFrequencyRank: (text) => (text === 'なる' ? 11 : null),
      tokenizeWithMecab: async () => [nonIndependentVerb],
      getMinSentenceWordsForNPlusOne: () => 1,
    },
  );
  const result = await tokenizeSubtitle('になれば', deps);
  const [token] = result.tokens ?? [];
  assert.equal(result.tokens?.length, 1);
  assert.equal(token?.frequencyRank, undefined);
  assert.equal(token?.isNPlusOneTarget, false);
});
test('tokenizeSubtitle keeps merged token when overlap contains at least one content pos1 tag', async () => {
  // 助詞 + 動詞(自立) + 助詞: the independent-verb component keeps the merged
  // token eligible, producing the composite pos1 '助詞|動詞'.
  const particleNi = {
    headword: 'に',
    surface: 'に',
    reading: 'ニ',
    startPos: 0,
    endPos: 1,
    partOfSpeech: PartOfSpeech.particle,
    pos1: '助詞',
    pos2: '格助詞',
    isMerged: false,
    isKnown: false,
    isNPlusOneTarget: false,
  };
  const independentVerb = {
    headword: 'なる',
    surface: 'なれ',
    reading: 'ナレ',
    startPos: 1,
    endPos: 3,
    partOfSpeech: PartOfSpeech.verb,
    pos1: '動詞',
    pos2: '自立',
    isMerged: false,
    isKnown: false,
    isNPlusOneTarget: false,
  };
  const particleBa = {
    headword: 'ば',
    surface: 'ば',
    reading: 'バ',
    startPos: 3,
    endPos: 4,
    partOfSpeech: PartOfSpeech.particle,
    pos1: '助詞',
    pos2: '接続助詞',
    isMerged: false,
    isKnown: false,
    isNPlusOneTarget: false,
  };
  const deps = makeDepsFromYomitanTokens(
    [{ surface: 'になれば', reading: 'になれば', headword: 'なる' }],
    {
      getFrequencyDictionaryEnabled: () => true,
      getFrequencyRank: (text) => (text === 'なる' ? 13 : null),
      tokenizeWithMecab: async () => [particleNi, independentVerb, particleBa],
      getMinSentenceWordsForNPlusOne: () => 1,
    },
  );
  const result = await tokenizeSubtitle('になれば', deps);
  const [token] = result.tokens ?? [];
  assert.equal(result.tokens?.length, 1);
  assert.equal(token?.pos1, '助詞|動詞');
  assert.equal(token?.frequencyRank, 13);
  assert.equal(token?.isNPlusOneTarget, true);
});
test('tokenizeSubtitle excludes default non-independent pos2 from N+1 when JLPT/frequency are disabled', async () => {
  // Even with JLPT and frequency annotations off, MeCab must still run
  // exactly once so the 非自立 pos2 tag can veto N+1 targeting.
  let mecabCalls = 0;
  const nonIndependentVerb = {
    headword: 'なる',
    surface: 'になれば',
    reading: 'ニナレバ',
    startPos: 0,
    endPos: 4,
    partOfSpeech: PartOfSpeech.verb,
    pos1: '動詞',
    pos2: '非自立',
    isMerged: true,
    isKnown: false,
    isNPlusOneTarget: false,
  };
  const deps = makeDepsFromYomitanTokens(
    [{ surface: 'になれば', reading: 'になれば', headword: 'なる' }],
    {
      getJlptEnabled: () => false,
      getFrequencyDictionaryEnabled: () => false,
      getMinSentenceWordsForNPlusOne: () => 1,
      tokenizeWithMecab: async () => {
        mecabCalls += 1;
        return [nonIndependentVerb];
      },
    },
  );
  const result = await tokenizeSubtitle('になれば', deps);
  assert.equal(mecabCalls, 1);
  assert.equal(result.tokens?.length, 1);
  assert.equal(result.tokens?.[0]?.isNPlusOneTarget, false);
});

View File

@@ -10,6 +10,14 @@ import {
FrequencyDictionaryLookup,
JlptLevel,
} from '../../types';
import {
DEFAULT_ANNOTATION_POS1_EXCLUSION_CONFIG,
resolveAnnotationPos1ExclusionSet,
} from '../../token-pos1-exclusions';
import {
DEFAULT_ANNOTATION_POS2_EXCLUSION_CONFIG,
resolveAnnotationPos2ExclusionSet,
} from '../../token-pos2-exclusions';
import { selectYomitanParseTokens } from './tokenizer/parser-selection-stage';
import {
requestYomitanParseResults,
@@ -78,6 +86,8 @@ interface TokenizerAnnotationOptions {
frequencyEnabled: boolean;
frequencyMatchMode: FrequencyDictionaryMatchMode;
minSentenceWordsForNPlusOne: number | undefined;
pos1Exclusions: ReadonlySet<string>;
pos2Exclusions: ReadonlySet<string>;
}
let parserEnrichmentWorkerRuntimeModulePromise:
@@ -87,6 +97,12 @@ let annotationStageModulePromise: Promise<typeof import('./tokenizer/annotation-
let parserEnrichmentFallbackModulePromise:
| Promise<typeof import('./tokenizer/parser-enrichment-stage')>
| null = null;
// Default pos1/pos2 exclusion sets, resolved once at module load from the
// shared exclusion configs; passed through as the annotation options'
// pos1Exclusions/pos2Exclusions defaults.
const DEFAULT_ANNOTATION_POS1_EXCLUSIONS = resolveAnnotationPos1ExclusionSet(
  DEFAULT_ANNOTATION_POS1_EXCLUSION_CONFIG,
);
const DEFAULT_ANNOTATION_POS2_EXCLUSIONS = resolveAnnotationPos2ExclusionSet(
  DEFAULT_ANNOTATION_POS2_EXCLUSION_CONFIG,
);
function getKnownWordLookup(deps: TokenizerServiceDeps, options: TokenizerAnnotationOptions): (text: string) => boolean {
if (!options.nPlusOneEnabled) {
@@ -96,7 +112,7 @@ function getKnownWordLookup(deps: TokenizerServiceDeps, options: TokenizerAnnota
}
function needsMecabPosEnrichment(options: TokenizerAnnotationOptions): boolean {
return options.jlptEnabled || options.frequencyEnabled;
return options.nPlusOneEnabled || options.jlptEnabled || options.frequencyEnabled;
}
function hasAnyAnnotationEnabled(options: TokenizerAnnotationOptions): boolean {
@@ -389,6 +405,8 @@ function getAnnotationOptions(deps: TokenizerServiceDeps): TokenizerAnnotationOp
frequencyEnabled: deps.getFrequencyDictionaryEnabled?.() !== false,
frequencyMatchMode: deps.getFrequencyDictionaryMatchMode?.() ?? 'headword',
minSentenceWordsForNPlusOne: deps.getMinSentenceWordsForNPlusOne?.(),
pos1Exclusions: DEFAULT_ANNOTATION_POS1_EXCLUSIONS,
pos2Exclusions: DEFAULT_ANNOTATION_POS2_EXCLUSIONS,
};
}

View File

@@ -205,3 +205,171 @@ test('annotateTokens N+1 minimum sentence words counts only eligible word tokens
assert.equal(result[2]?.isKnown, true);
assert.equal(result[0]?.isNPlusOneTarget, false);
});
test('annotateTokens applies configured pos1 exclusions to both frequency and N+1', () => {
  // With 名詞 excluded, the noun loses its frequency rank and cannot be an
  // N+1 target; the known verb keeps its rank.
  const noun = makeToken({
    surface: '猫',
    headword: '猫',
    pos1: '名詞',
    frequencyRank: 21,
    startPos: 0,
    endPos: 1,
  });
  const verb = makeToken({
    surface: '走る',
    headword: '走る',
    pos1: '動詞',
    partOfSpeech: PartOfSpeech.verb,
    startPos: 1,
    endPos: 3,
    frequencyRank: 22,
  });
  const deps = makeDeps({ isKnownWord: (text) => text === '走る' });
  const [annotatedNoun, annotatedVerb] = annotateTokens([noun, verb], deps, {
    minSentenceWordsForNPlusOne: 1,
    pos1Exclusions: new Set(['名詞']),
  });
  assert.equal(annotatedNoun?.frequencyRank, undefined);
  assert.equal(annotatedVerb?.frequencyRank, 22);
  assert.equal(annotatedNoun?.isNPlusOneTarget, false);
  assert.equal(annotatedVerb?.isNPlusOneTarget, false);
});
test('annotateTokens allows previously default-excluded pos1 when removed from effective set', () => {
  // An explicitly empty pos1 exclusion set lets a bare 助詞 token keep its
  // frequency rank and become an N+1 target.
  const particle = makeToken({
    surface: 'は',
    headword: 'は',
    partOfSpeech: PartOfSpeech.other,
    pos1: '助詞',
    startPos: 0,
    endPos: 1,
    frequencyRank: 8,
  });
  const [annotated] = annotateTokens([particle], makeDeps(), {
    minSentenceWordsForNPlusOne: 1,
    pos1Exclusions: new Set(),
  });
  assert.equal(annotated?.frequencyRank, 8);
  assert.equal(annotated?.isNPlusOneTarget, true);
});
test('annotateTokens excludes default non-independent pos2 from frequency and N+1', () => {
  // The default pos2 exclusions cover 非自立, so the token is stripped of its
  // frequency rank and is not an N+1 target.
  const nonIndependent = makeToken({
    surface: 'になれば',
    headword: 'なる',
    partOfSpeech: PartOfSpeech.verb,
    pos1: '動詞',
    pos2: '非自立',
    startPos: 0,
    endPos: 4,
    frequencyRank: 7,
  });
  const [annotated] = annotateTokens([nonIndependent], makeDeps(), {
    minSentenceWordsForNPlusOne: 1,
  });
  assert.equal(annotated?.frequencyRank, undefined);
  assert.equal(annotated?.isNPlusOneTarget, false);
});
test('annotateTokens excludes likely kana SFX tokens from frequency when POS tags are missing', () => {
  // 'ぐわっ' carries no POS tags, so the kana sound-effect heuristic should
  // strip its frequency rank.
  const sfxToken = makeToken({
    surface: 'ぐわっ',
    reading: 'ぐわっ',
    headword: 'ぐわっ',
    pos1: '',
    pos2: '',
    frequencyRank: 12,
    startPos: 0,
    endPos: 3,
  });
  const [annotated] = annotateTokens([sfxToken], makeDeps(), {
    minSentenceWordsForNPlusOne: 1,
  });
  assert.equal(annotated?.frequencyRank, undefined);
});
test('annotateTokens allows previously default-excluded pos2 when removed from effective set', () => {
  // An explicitly empty pos2 exclusion set re-admits a 非自立 token to both
  // frequency display and N+1 targeting.
  const nonIndependent = makeToken({
    surface: 'になれば',
    headword: 'なる',
    partOfSpeech: PartOfSpeech.verb,
    pos1: '動詞',
    pos2: '非自立',
    startPos: 0,
    endPos: 4,
    frequencyRank: 9,
  });
  const [annotated] = annotateTokens([nonIndependent], makeDeps(), {
    minSentenceWordsForNPlusOne: 1,
    pos2Exclusions: new Set(),
  });
  assert.equal(annotated?.frequencyRank, 9);
  assert.equal(annotated?.isNPlusOneTarget, true);
});
test('annotateTokens keeps composite tokens when any component pos tag is content-bearing', () => {
  // The 動詞/自立 components in the '|'-joined tags keep the composite token
  // eligible even though the particle components are excluded by default.
  const compositeToken = makeToken({
    surface: 'になれば',
    headword: 'なる',
    pos1: '助詞|動詞',
    pos2: '格助詞|自立|接続助詞',
    startPos: 0,
    endPos: 4,
    frequencyRank: 5,
  });
  const [annotated] = annotateTokens([compositeToken], makeDeps(), {
    minSentenceWordsForNPlusOne: 1,
  });
  assert.equal(annotated?.frequencyRank, 5);
  assert.equal(annotated?.isNPlusOneTarget, true);
});
test('annotateTokens excludes composite tokens when all component pos tags are excluded', () => {
  // Every component of both '|'-joined tags is on a default exclusion list,
  // so the composite token is stripped of its rank and N+1 eligibility.
  const compositeParticle = makeToken({
    surface: 'けど',
    headword: 'けど',
    pos1: '助詞|助詞',
    pos2: '接続助詞|終助詞',
    startPos: 0,
    endPos: 2,
    frequencyRank: 6,
  });
  const [annotated] = annotateTokens([compositeParticle], makeDeps(), {
    minSentenceWordsForNPlusOne: 1,
  });
  assert.equal(annotated?.frequencyRank, undefined);
  assert.equal(annotated?.isNPlusOneTarget, false);
});

View File

@@ -1,4 +1,12 @@
import { markNPlusOneTargets } from '../../../token-merger';
import {
DEFAULT_ANNOTATION_POS1_EXCLUSION_CONFIG,
resolveAnnotationPos1ExclusionSet,
} from '../../../token-pos1-exclusions';
import {
DEFAULT_ANNOTATION_POS2_EXCLUSION_CONFIG,
resolveAnnotationPos2ExclusionSet,
} from '../../../token-pos2-exclusions';
import {
JlptLevel,
MergedToken,
@@ -28,6 +36,8 @@ export interface AnnotationStageOptions {
jlptEnabled?: boolean;
frequencyEnabled?: boolean;
minSentenceWordsForNPlusOne?: number;
pos1Exclusions?: ReadonlySet<string>;
pos2Exclusions?: ReadonlySet<string>;
}
function resolveKnownWordText(
@@ -53,22 +63,85 @@ function applyKnownWordMarking(
});
}
function isFrequencyExcludedByPos(token: MergedToken): boolean {
if (
token.partOfSpeech === PartOfSpeech.particle ||
token.partOfSpeech === PartOfSpeech.bound_auxiliary
) {
function normalizePos1Tag(pos1: string | undefined): string {
  // Treat a missing tag as the empty string; strip surrounding whitespace.
  return pos1?.trim() ?? '';
}
function isExcludedByTagSet(
  normalizedTag: string,
  exclusions: ReadonlySet<string>,
): boolean {
  // A (possibly composite, '|'-joined) tag is excluded only when every
  // non-empty component is in the exclusion set; empty tags never are.
  if (normalizedTag === '') {
    return false;
  }
  const components: string[] = [];
  for (const rawPart of normalizedTag.split('|')) {
    const part = rawPart.trim();
    if (part !== '') {
      components.push(part);
    }
  }
  if (components.length === 0) {
    return false;
  }
  for (const component of components) {
    if (!exclusions.has(component)) {
      return false;
    }
  }
  return true;
}
function resolvePos1Exclusions(options: AnnotationStageOptions): ReadonlySet<string> {
  // A caller-provided set (even an empty one) wins; otherwise resolve the
  // default pos1 exclusion configuration.
  return (
    options.pos1Exclusions ??
    resolveAnnotationPos1ExclusionSet(DEFAULT_ANNOTATION_POS1_EXCLUSION_CONFIG)
  );
}
function resolvePos2Exclusions(options: AnnotationStageOptions): ReadonlySet<string> {
  // A caller-provided set (even an empty one) wins; otherwise resolve the
  // default pos2 exclusion configuration.
  return (
    options.pos2Exclusions ??
    resolveAnnotationPos2ExclusionSet(DEFAULT_ANNOTATION_POS2_EXCLUSION_CONFIG)
  );
}
function normalizePos2Tag(pos2: string | undefined): string {
  // Treat a missing tag as the empty string; strip surrounding whitespace.
  return pos2?.trim() ?? '';
}
function isFrequencyExcludedByPos(
token: MergedToken,
pos1Exclusions: ReadonlySet<string>,
pos2Exclusions: ReadonlySet<string>,
): boolean {
const normalizedPos1 = normalizePos1Tag(token.pos1);
const hasPos1 = normalizedPos1.length > 0;
if (isExcludedByTagSet(normalizedPos1, pos1Exclusions)) {
return true;
}
return token.pos1 === '助詞' || token.pos1 === '助動詞';
const normalizedPos2 = normalizePos2Tag(token.pos2);
const hasPos2 = normalizedPos2.length > 0;
if (isExcludedByTagSet(normalizedPos2, pos2Exclusions)) {
return true;
}
if (hasPos1 || hasPos2) {
return false;
}
if (isLikelyFrequencyNoiseToken(token)) {
return true;
}
return (
token.partOfSpeech === PartOfSpeech.particle ||
token.partOfSpeech === PartOfSpeech.bound_auxiliary
);
}
function applyFrequencyMarking(
tokens: MergedToken[],
pos1Exclusions: ReadonlySet<string>,
pos2Exclusions: ReadonlySet<string>,
): MergedToken[] {
return tokens.map((token) => {
if (isFrequencyExcludedByPos(token)) {
if (isFrequencyExcludedByPos(token, pos1Exclusions, pos2Exclusions)) {
return { ...token, frequencyRank: undefined };
}
@@ -203,6 +276,101 @@ function isRepeatedKanaSfx(text: string): boolean {
return topCount >= Math.ceil(chars.length / 2);
}
function isTrailingSmallTsuKanaSfx(text: string): boolean {
  // A short all-kana run (2-4 chars) ending in small っ reads like a
  // sound-effect token (e.g. ぐわっ).
  const normalized = normalizeJlptTextForExclusion(text);
  if (!normalized) {
    return false;
  }
  const chars = Array.from(normalized);
  if (chars.length < 2 || chars.length > 4) {
    return false;
  }
  const allKana = chars.every(isKanaChar);
  return allKana && chars[chars.length - 1] === 'っ';
}
function isReduplicatedKanaSfx(text: string): boolean {
  // Detects doubled kana strings (e.g. わくわく): an even-length all-kana run
  // of at least 4 chars whose first half equals its second half.
  const normalized = normalizeJlptTextForExclusion(text);
  if (!normalized) {
    return false;
  }
  const chars = Array.from(normalized);
  const length = chars.length;
  if (length < 4 || length % 2 !== 0 || !chars.every(isKanaChar)) {
    return false;
  }
  const half = length / 2;
  for (let index = 0; index < half; index += 1) {
    if (chars[index] !== chars[index + half]) {
      return false;
    }
  }
  return true;
}
function hasAdjacentKanaRepeat(text: string): boolean {
  // True when an all-kana string contains the same character twice in a row.
  const normalized = normalizeJlptTextForExclusion(text);
  if (!normalized) {
    return false;
  }
  const chars = Array.from(normalized);
  if (!chars.every(isKanaChar)) {
    return false;
  }
  return chars.some((char, index) => index > 0 && char === chars[index - 1]);
}
function isLikelyFrequencyNoiseToken(token: MergedToken): boolean {
  // Heuristic noise filter for tokens with no usable POS tags: check both
  // the headword and the surface (raw and normalized forms) against the JLPT
  // ignore list and the kana sound-effect patterns.
  const looksLikeKanaSfx = (value: string): boolean =>
    hasAdjacentKanaRepeat(value) ||
    isReduplicatedKanaSfx(value) ||
    isTrailingSmallTsuKanaSfx(value);
  const candidates: string[] = [];
  if (typeof token.headword === 'string' && token.headword.length > 0) {
    candidates.push(token.headword);
  }
  if (typeof token.surface === 'string' && token.surface.length > 0) {
    candidates.push(token.surface);
  }
  for (const candidate of candidates) {
    const trimmedCandidate = candidate.trim();
    if (!trimmedCandidate) {
      continue;
    }
    const normalizedCandidate = normalizeJlptTextForExclusion(trimmedCandidate);
    if (!normalizedCandidate) {
      continue;
    }
    if (
      shouldIgnoreJlptByTerm(trimmedCandidate) ||
      shouldIgnoreJlptByTerm(normalizedCandidate)
    ) {
      return true;
    }
    if (looksLikeKanaSfx(trimmedCandidate) || looksLikeKanaSfx(normalizedCandidate)) {
      return true;
    }
  }
  return false;
}
function isJlptEligibleToken(token: MergedToken): boolean {
if (token.pos1 && shouldIgnoreJlptForMecabPos1(token.pos1)) {
return false;
@@ -261,6 +429,8 @@ export function annotateTokens(
deps: AnnotationStageDeps,
options: AnnotationStageOptions = {},
): MergedToken[] {
const pos1Exclusions = resolvePos1Exclusions(options);
const pos2Exclusions = resolvePos2Exclusions(options);
const nPlusOneEnabled = options.nPlusOneEnabled !== false;
const knownMarkedTokens = nPlusOneEnabled
? applyKnownWordMarking(tokens, deps.isKnownWord, deps.knownWordMatchMode)
@@ -273,7 +443,7 @@ export function annotateTokens(
const frequencyEnabled = options.frequencyEnabled !== false;
const frequencyMarkedTokens =
frequencyEnabled
? applyFrequencyMarking(knownMarkedTokens)
? applyFrequencyMarking(knownMarkedTokens, pos1Exclusions, pos2Exclusions)
: knownMarkedTokens.map((token) => ({
...token,
frequencyRank: undefined,
@@ -303,5 +473,10 @@ export function annotateTokens(
? minSentenceWordsForNPlusOne
: 3;
return markNPlusOneTargets(jlptMarkedTokens, sanitizedMinSentenceWordsForNPlusOne);
return markNPlusOneTargets(
jlptMarkedTokens,
sanitizedMinSentenceWordsForNPlusOne,
pos1Exclusions,
pos2Exclusions,
);
}

View File

@@ -22,12 +22,13 @@ function makeToken(overrides: Partial<MergedToken>): MergedToken {
test('enrichTokensWithMecabPos1 picks pos1 by best overlap when no surface match exists', () => {
const tokens = [makeToken({ surface: 'grouped', startPos: 2, endPos: 7 })];
const mecabTokens = [
makeToken({ surface: 'left', startPos: 0, endPos: 4, pos1: 'A' }),
makeToken({ surface: 'right', startPos: 2, endPos: 6, pos1: 'B' }),
makeToken({ surface: 'left', startPos: 0, endPos: 4, pos1: 'A', pos2: 'L2' }),
makeToken({ surface: 'right', startPos: 2, endPos: 6, pos1: 'B', pos2: '非自立' }),
];
const enriched = enrichTokensWithMecabPos1(tokens, mecabTokens);
assert.equal(enriched[0]?.pos1, 'B');
assert.equal(enriched[0]?.pos1, 'A|B');
assert.equal(enriched[0]?.pos2, 'L2|非自立');
});
test('enrichTokensWithMecabPos1 fills missing pos1 using surface-sequence fallback', () => {

View File

@@ -1,13 +1,45 @@
import { MergedToken } from '../../../types';
function pickClosestMecabPos1(token: MergedToken, mecabTokens: MergedToken[]): string | undefined {
if (mecabTokens.length === 0) {
type MecabPosMetadata = {
pos1: string;
pos2?: string;
pos3?: string;
};
/**
 * Trims each candidate tag, drops empty/undefined entries, and joins the
 * unique survivors with '|', preserving first-seen order.
 *
 * @param values - Raw POS tags collected from overlapping mecab tokens.
 * @returns The '|'-joined composite tag, a single tag when only one unique
 *   value remains, or undefined when no non-empty tag was provided.
 */
function joinUniqueTags(values: Array<string | undefined>): string | undefined {
  // Set keeps insertion order, replacing the original O(n^2) includes() scan.
  const unique = new Set<string>();
  for (const value of values) {
    const trimmed = value?.trim();
    if (trimmed) {
      unique.add(trimmed);
    }
  }
  if (unique.size === 0) {
    return undefined;
  }
  return [...unique].join('|');
}
function pickClosestMecabPosMetadata(
token: MergedToken,
mecabTokens: MergedToken[],
): MecabPosMetadata | null {
if (mecabTokens.length === 0) {
return null;
}
const tokenStart = token.startPos ?? 0;
const tokenEnd = token.endPos ?? tokenStart + token.surface.length;
let bestSurfaceMatchPos1: string | undefined;
let bestSurfaceMatchToken: MergedToken | null = null;
let bestSurfaceMatchDistance = Number.MAX_SAFE_INTEGER;
let bestSurfaceMatchEndDistance = Number.MAX_SAFE_INTEGER;
@@ -31,19 +63,24 @@ function pickClosestMecabPos1(token: MergedToken, mecabTokens: MergedToken[]): s
) {
bestSurfaceMatchDistance = startDistance;
bestSurfaceMatchEndDistance = endDistance;
bestSurfaceMatchPos1 = mecabToken.pos1;
bestSurfaceMatchToken = mecabToken;
}
}
if (bestSurfaceMatchPos1) {
return bestSurfaceMatchPos1;
if (bestSurfaceMatchToken) {
return {
pos1: bestSurfaceMatchToken.pos1 as string,
pos2: bestSurfaceMatchToken.pos2,
pos3: bestSurfaceMatchToken.pos3,
};
}
let bestPos1: string | undefined;
let bestToken: MergedToken | null = null;
let bestOverlap = 0;
let bestSpan = 0;
let bestStartDistance = Number.MAX_SAFE_INTEGER;
let bestStart = Number.MAX_SAFE_INTEGER;
const overlappingTokens: MergedToken[] = [];
for (const mecabToken of mecabTokens) {
if (!mecabToken.pos1) {
@@ -58,6 +95,7 @@ function pickClosestMecabPos1(token: MergedToken, mecabTokens: MergedToken[]): s
if (overlap === 0) {
continue;
}
overlappingTokens.push(mecabToken);
const span = mecabEnd - mecabStart;
if (
@@ -71,11 +109,23 @@ function pickClosestMecabPos1(token: MergedToken, mecabTokens: MergedToken[]): s
bestSpan = span;
bestStartDistance = Math.abs(mecabStart - tokenStart);
bestStart = mecabStart;
bestPos1 = mecabToken.pos1;
bestToken = mecabToken;
}
}
return bestOverlap > 0 ? bestPos1 : undefined;
if (bestOverlap === 0 || !bestToken) {
return null;
}
const overlapPos1 = joinUniqueTags(overlappingTokens.map((token) => token.pos1));
const overlapPos2 = joinUniqueTags(overlappingTokens.map((token) => token.pos2));
const overlapPos3 = joinUniqueTags(overlappingTokens.map((token) => token.pos3));
return {
pos1: overlapPos1 ?? (bestToken.pos1 as string),
pos2: overlapPos2 ?? bestToken.pos2,
pos3: overlapPos3 ?? bestToken.pos3,
};
}
function fillMissingPos1BySurfaceSequence(
@@ -101,7 +151,7 @@ function fillMissingPos1BySurfaceSequence(
return token;
}
let best: { pos1: string; index: number } | null = null;
let best: { token: MergedToken; index: number } | null = null;
for (const candidate of indexedMecabTokens) {
if (candidate.token.surface !== surface) {
continue;
@@ -109,7 +159,7 @@ function fillMissingPos1BySurfaceSequence(
if (candidate.index < cursor) {
continue;
}
best = { pos1: candidate.token.pos1 as string, index: candidate.index };
best = { token: candidate.token, index: candidate.index };
break;
}
@@ -118,7 +168,7 @@ function fillMissingPos1BySurfaceSequence(
if (candidate.token.surface !== surface) {
continue;
}
best = { pos1: candidate.token.pos1 as string, index: candidate.index };
best = { token: candidate.token, index: candidate.index };
break;
}
}
@@ -130,7 +180,9 @@ function fillMissingPos1BySurfaceSequence(
cursor = best.index + 1;
return {
...token,
pos1: best.pos1,
pos1: best.token.pos1,
pos2: best.token.pos2,
pos3: best.token.pos3,
};
});
}
@@ -152,14 +204,16 @@ export function enrichTokensWithMecabPos1(
return token;
}
const pos1 = pickClosestMecabPos1(token, mecabTokens);
if (!pos1) {
const metadata = pickClosestMecabPosMetadata(token, mecabTokens);
if (!metadata) {
return token;
}
return {
...token,
pos1,
pos1: metadata.pos1,
pos2: metadata.pos2,
pos3: metadata.pos3,
};
});