fix(tokenizer): tighten n+1 eligibility using mecab pos overlaps

2026-03-01 18:22:41 -08:00 · 2026-02-28 19:07:43 -08:00
parent 498fd2d09a
commit a7d220e182
10 changed files with 736 additions and 43 deletions
--- a/src/core/services/tokenizer.test.ts
+++ b/src/core/services/tokenizer.test.ts
@@ -2038,3 +2038,125 @@ test('tokenizeSubtitle keeps frequency enrichment while n+1 is disabled', async
  assert.equal(mecabCalls, 1);
  assert.equal(frequencyCalls, 1);
 });
 test('tokenizeSubtitle excludes default non-independent pos2 from N+1 and frequency annotations', async () => {
  // MeCab reports the whole surface as a single verb token tagged 非自立,
  // which is in the default pos2 exclusion list.
  const deps = makeDepsFromYomitanTokens(
    [{ surface: 'になれば', reading: 'になれば', headword: 'なる' }],
    {
      getFrequencyDictionaryEnabled: () => true,
      getFrequencyRank: (text) => (text === 'なる' ? 11 : null),
      tokenizeWithMecab: async () => [
        {
          headword: 'なる',
          surface: 'になれば',
          reading: 'ニナレバ',
          startPos: 0,
          endPos: 4,
          partOfSpeech: PartOfSpeech.verb,
          pos1: '動詞',
          pos2: '非自立',
          isMerged: true,
          isKnown: false,
          isNPlusOneTarget: false,
        },
      ],
      getMinSentenceWordsForNPlusOne: () => 1,
    },
  );

  const outcome = await tokenizeSubtitle('になれば', deps);
  const annotated = outcome.tokens;

  assert.equal(annotated?.length, 1);
  const only = annotated?.[0];
  // Neither a frequency rank nor an N+1 mark may survive the exclusion.
  assert.equal(only?.frequencyRank, undefined);
  assert.equal(only?.isNPlusOneTarget, false);
 });
 test('tokenizeSubtitle keeps merged token when overlap contains at least one content pos1 tag', async () => {
  // Three MeCab tokens (particle + verb + particle) overlap the single
  // Yomitan token; the verb component keeps the merged token eligible.
  const deps = makeDepsFromYomitanTokens(
    [{ surface: 'になれば', reading: 'になれば', headword: 'なる' }],
    {
      getFrequencyDictionaryEnabled: () => true,
      getFrequencyRank: (text) => (text === 'なる' ? 13 : null),
      tokenizeWithMecab: async () => [
        {
          headword: 'に',
          surface: 'に',
          reading: 'ニ',
          startPos: 0,
          endPos: 1,
          partOfSpeech: PartOfSpeech.particle,
          pos1: '助詞',
          pos2: '格助詞',
          isMerged: false,
          isKnown: false,
          isNPlusOneTarget: false,
        },
        {
          headword: 'なる',
          surface: 'なれ',
          reading: 'ナレ',
          startPos: 1,
          endPos: 3,
          partOfSpeech: PartOfSpeech.verb,
          pos1: '動詞',
          pos2: '自立',
          isMerged: false,
          isKnown: false,
          isNPlusOneTarget: false,
        },
        {
          headword: 'ば',
          surface: 'ば',
          reading: 'バ',
          startPos: 3,
          endPos: 4,
          partOfSpeech: PartOfSpeech.particle,
          pos1: '助詞',
          pos2: '接続助詞',
          isMerged: false,
          isKnown: false,
          isNPlusOneTarget: false,
        },
      ],
      getMinSentenceWordsForNPlusOne: () => 1,
    },
  );

  const outcome = await tokenizeSubtitle('になれば', deps);
  const annotated = outcome.tokens;

  assert.equal(annotated?.length, 1);
  const only = annotated?.[0];
  // The overlapping pos1 tags are joined with '|' and de-duplicated.
  assert.equal(only?.pos1, '助詞|動詞');
  assert.equal(only?.frequencyRank, 13);
  assert.equal(only?.isNPlusOneTarget, true);
 });
 test('tokenizeSubtitle excludes default non-independent pos2 from N+1 when JLPT/frequency are disabled', async () => {
  // Counts MeCab invocations: POS enrichment must still run for N+1 even
  // though JLPT and frequency annotations are both switched off.
  let mecabInvocations = 0;
  const deps = makeDepsFromYomitanTokens(
    [{ surface: 'になれば', reading: 'になれば', headword: 'なる' }],
    {
      getJlptEnabled: () => false,
      getFrequencyDictionaryEnabled: () => false,
      getMinSentenceWordsForNPlusOne: () => 1,
      tokenizeWithMecab: async () => {
        mecabInvocations += 1;
        return [
          {
            headword: 'なる',
            surface: 'になれば',
            reading: 'ニナレバ',
            startPos: 0,
            endPos: 4,
            partOfSpeech: PartOfSpeech.verb,
            pos1: '動詞',
            pos2: '非自立',
            isMerged: true,
            isKnown: false,
            isNPlusOneTarget: false,
          },
        ];
      },
    },
  );

  const outcome = await tokenizeSubtitle('になれば', deps);
  const annotated = outcome.tokens;

  assert.equal(mecabInvocations, 1);
  assert.equal(annotated?.length, 1);
  assert.equal(annotated?.[0]?.isNPlusOneTarget, false);
 });
--- a/src/core/services/tokenizer.ts
+++ b/src/core/services/tokenizer.ts
@@ -10,6 +10,14 @@ import {
  FrequencyDictionaryLookup,
  JlptLevel,
 } from '../../types';
 import {
  DEFAULT_ANNOTATION_POS1_EXCLUSION_CONFIG,
  resolveAnnotationPos1ExclusionSet,
 } from '../../token-pos1-exclusions';
 import {
  DEFAULT_ANNOTATION_POS2_EXCLUSION_CONFIG,
  resolveAnnotationPos2ExclusionSet,
 } from '../../token-pos2-exclusions';
 import { selectYomitanParseTokens } from './tokenizer/parser-selection-stage';
 import {
  requestYomitanParseResults,
@@ -78,6 +86,8 @@ interface TokenizerAnnotationOptions {
  frequencyEnabled: boolean;
  frequencyMatchMode: FrequencyDictionaryMatchMode;
  minSentenceWordsForNPlusOne: number | undefined;
  pos1Exclusions: ReadonlySet<string>;
  pos2Exclusions: ReadonlySet<string>;
 }
 let parserEnrichmentWorkerRuntimeModulePromise:
@@ -87,6 +97,12 @@ let annotationStageModulePromise: Promise<typeof import('./tokenizer/annotation-
 let parserEnrichmentFallbackModulePromise:
  | Promise<typeof import('./tokenizer/parser-enrichment-stage')>
  | null = null;
 // Default pos1 exclusion set, resolved once when this module is evaluated.
 const DEFAULT_ANNOTATION_POS1_EXCLUSIONS: ReadonlySet<string> =
  resolveAnnotationPos1ExclusionSet(DEFAULT_ANNOTATION_POS1_EXCLUSION_CONFIG);
 // Default pos2 exclusion set, resolved once when this module is evaluated.
 const DEFAULT_ANNOTATION_POS2_EXCLUSIONS: ReadonlySet<string> =
  resolveAnnotationPos2ExclusionSet(DEFAULT_ANNOTATION_POS2_EXCLUSION_CONFIG);
 function getKnownWordLookup(deps: TokenizerServiceDeps, options: TokenizerAnnotationOptions): (text: string) => boolean {
  if (!options.nPlusOneEnabled) {
@@ -96,7 +112,7 @@ function getKnownWordLookup(deps: TokenizerServiceDeps, options: TokenizerAnnota
 }
 function needsMecabPosEnrichment(options: TokenizerAnnotationOptions): boolean {
-  return options.jlptEnabled || options.frequencyEnabled;
+  return options.nPlusOneEnabled || options.jlptEnabled || options.frequencyEnabled;
 }
 function hasAnyAnnotationEnabled(options: TokenizerAnnotationOptions): boolean {
@@ -389,6 +405,8 @@ function getAnnotationOptions(deps: TokenizerServiceDeps): TokenizerAnnotationOp
    frequencyEnabled: deps.getFrequencyDictionaryEnabled?.() !== false,
    frequencyMatchMode: deps.getFrequencyDictionaryMatchMode?.() ?? 'headword',
    minSentenceWordsForNPlusOne: deps.getMinSentenceWordsForNPlusOne?.(),
    pos1Exclusions: DEFAULT_ANNOTATION_POS1_EXCLUSIONS,
    pos2Exclusions: DEFAULT_ANNOTATION_POS2_EXCLUSIONS,
  };
 }
--- a/src/core/services/tokenizer/annotation-stage.test.ts
+++ b/src/core/services/tokenizer/annotation-stage.test.ts
@@ -205,3 +205,171 @@ test('annotateTokens N+1 minimum sentence words counts only eligible word tokens
  assert.equal(result[2]?.isKnown, true);
  assert.equal(result[0]?.isNPlusOneTarget, false);
 });
 test('annotateTokens applies configured pos1 exclusions to both frequency and N+1', () => {
  const nounToken = makeToken({
    surface: '猫',
    headword: '猫',
    pos1: '名詞',
    frequencyRank: 21,
    startPos: 0,
    endPos: 1,
  });
  const verbToken = makeToken({
    surface: '走る',
    headword: '走る',
    pos1: '動詞',
    partOfSpeech: PartOfSpeech.verb,
    startPos: 1,
    endPos: 3,
    frequencyRank: 22,
  });

  // Exclude nouns explicitly; the verb is already known, so nothing in the
  // sentence may end up as an N+1 target.
  const annotated = annotateTokens(
    [nounToken, verbToken],
    makeDeps({
      isKnownWord: (text) => text === '走る',
    }),
    {
      minSentenceWordsForNPlusOne: 1,
      pos1Exclusions: new Set(['名詞']),
    },
  );

  const noun = annotated[0];
  const verb = annotated[1];
  assert.equal(noun?.frequencyRank, undefined);
  assert.equal(verb?.frequencyRank, 22);
  assert.equal(noun?.isNPlusOneTarget, false);
  assert.equal(verb?.isNPlusOneTarget, false);
 });
 test('annotateTokens allows previously default-excluded pos1 when removed from effective set', () => {
  const particle = makeToken({
    surface: 'は',
    headword: 'は',
    partOfSpeech: PartOfSpeech.other,
    pos1: '助詞',
    startPos: 0,
    endPos: 1,
    frequencyRank: 8,
  });

  // An empty pos1 exclusion set overrides the defaults entirely.
  const annotated = annotateTokens([particle], makeDeps(), {
    minSentenceWordsForNPlusOne: 1,
    pos1Exclusions: new Set(),
  });

  const only = annotated[0];
  assert.equal(only?.frequencyRank, 8);
  assert.equal(only?.isNPlusOneTarget, true);
 });
 test('annotateTokens excludes default non-independent pos2 from frequency and N+1', () => {
  const nonIndependentVerb = makeToken({
    surface: 'になれば',
    headword: 'なる',
    partOfSpeech: PartOfSpeech.verb,
    pos1: '動詞',
    pos2: '非自立',
    startPos: 0,
    endPos: 4,
    frequencyRank: 7,
  });

  // No explicit exclusion sets: the defaults apply.
  const annotated = annotateTokens([nonIndependentVerb], makeDeps(), {
    minSentenceWordsForNPlusOne: 1,
  });

  const only = annotated[0];
  assert.equal(only?.frequencyRank, undefined);
  assert.equal(only?.isNPlusOneTarget, false);
 });
 test('annotateTokens excludes likely kana SFX tokens from frequency when POS tags are missing', () => {
  // Both POS tags are empty, so only the surface/headword heuristics can
  // reject this token.
  const sfxToken = makeToken({
    surface: 'ぐわっ',
    reading: 'ぐわっ',
    headword: 'ぐわっ',
    pos1: '',
    pos2: '',
    frequencyRank: 12,
    startPos: 0,
    endPos: 3,
  });

  const annotated = annotateTokens([sfxToken], makeDeps(), {
    minSentenceWordsForNPlusOne: 1,
  });

  assert.equal(annotated[0]?.frequencyRank, undefined);
 });
 test('annotateTokens allows previously default-excluded pos2 when removed from effective set', () => {
  const nonIndependentVerb = makeToken({
    surface: 'になれば',
    headword: 'なる',
    partOfSpeech: PartOfSpeech.verb,
    pos1: '動詞',
    pos2: '非自立',
    startPos: 0,
    endPos: 4,
    frequencyRank: 9,
  });

  // An empty pos2 exclusion set overrides the defaults entirely.
  const annotated = annotateTokens([nonIndependentVerb], makeDeps(), {
    minSentenceWordsForNPlusOne: 1,
    pos2Exclusions: new Set(),
  });

  const only = annotated[0];
  assert.equal(only?.frequencyRank, 9);
  assert.equal(only?.isNPlusOneTarget, true);
 });
 test('annotateTokens keeps composite tokens when any component pos tag is content-bearing', () => {
  // Composite tags are '|'-joined; 動詞 (pos1) and 自立 (pos2) are not in the
  // default exclusion lists, so the token stays eligible.
  const composite = makeToken({
    surface: 'になれば',
    headword: 'なる',
    pos1: '助詞|動詞',
    pos2: '格助詞|自立|接続助詞',
    startPos: 0,
    endPos: 4,
    frequencyRank: 5,
  });

  const annotated = annotateTokens([composite], makeDeps(), {
    minSentenceWordsForNPlusOne: 1,
  });

  const only = annotated[0];
  assert.equal(only?.frequencyRank, 5);
  assert.equal(only?.isNPlusOneTarget, true);
 });
 test('annotateTokens excludes composite tokens when all component pos tags are excluded', () => {
  // Every pos1 component is 助詞, which is in the default pos1 exclusion list.
  const composite = makeToken({
    surface: 'けど',
    headword: 'けど',
    pos1: '助詞|助詞',
    pos2: '接続助詞|終助詞',
    startPos: 0,
    endPos: 2,
    frequencyRank: 6,
  });

  const annotated = annotateTokens([composite], makeDeps(), {
    minSentenceWordsForNPlusOne: 1,
  });

  const only = annotated[0];
  assert.equal(only?.frequencyRank, undefined);
  assert.equal(only?.isNPlusOneTarget, false);
 });
--- a/src/core/services/tokenizer/annotation-stage.ts
+++ b/src/core/services/tokenizer/annotation-stage.ts
@@ -1,4 +1,12 @@
 import { markNPlusOneTargets } from '../../../token-merger';
 import {
  DEFAULT_ANNOTATION_POS1_EXCLUSION_CONFIG,
  resolveAnnotationPos1ExclusionSet,
 } from '../../../token-pos1-exclusions';
 import {
  DEFAULT_ANNOTATION_POS2_EXCLUSION_CONFIG,
  resolveAnnotationPos2ExclusionSet,
 } from '../../../token-pos2-exclusions';
 import {
  JlptLevel,
  MergedToken,
@@ -28,6 +36,8 @@ export interface AnnotationStageOptions {
  jlptEnabled?: boolean;
  frequencyEnabled?: boolean;
  minSentenceWordsForNPlusOne?: number;
  pos1Exclusions?: ReadonlySet<string>;
  pos2Exclusions?: ReadonlySet<string>;
 }
 function resolveKnownWordText(
@@ -53,22 +63,85 @@ function applyKnownWordMarking(
  });
 }
-function isFrequencyExcludedByPos(token: MergedToken): boolean {
+function normalizePos1Tag(pos1: string | undefined): string {
-  if (
+  return typeof pos1 === 'string' ? pos1.trim() : '';
-    token.partOfSpeech === PartOfSpeech.particle ||
+}
-    token.partOfSpeech === PartOfSpeech.bound_auxiliary
+
-  ) {
+function isExcludedByTagSet(
  normalizedTag: string,
  exclusions: ReadonlySet<string>,
 ): boolean {
  if (!normalizedTag) {
    return false;
  }
  const parts = normalizedTag
    .split('|')
    .map((part) => part.trim())
    .filter((part) => part.length > 0);
  if (parts.length === 0) {
    return false;
  }
  return parts.every((part) => exclusions.has(part));
 }
 /**
  * Picks the pos1 exclusion set for an annotation run.
  *
  * @param options Stage options; `pos1Exclusions` wins when supplied.
  * @returns The caller's set, or a set freshly resolved from the default
  *   pos1 exclusion config when none was supplied.
  */
 function resolvePos1Exclusions(options: AnnotationStageOptions): ReadonlySet<string> {
  return (
    options.pos1Exclusions ||
    resolveAnnotationPos1ExclusionSet(DEFAULT_ANNOTATION_POS1_EXCLUSION_CONFIG)
  );
 }
 /**
  * Picks the pos2 exclusion set for an annotation run.
  *
  * @param options Stage options; `pos2Exclusions` wins when supplied.
  * @returns The caller's set, or a set freshly resolved from the default
  *   pos2 exclusion config when none was supplied.
  */
 function resolvePos2Exclusions(options: AnnotationStageOptions): ReadonlySet<string> {
  return (
    options.pos2Exclusions ||
    resolveAnnotationPos2ExclusionSet(DEFAULT_ANNOTATION_POS2_EXCLUSION_CONFIG)
  );
 }
 /**
  * Normalizes a token's pos2 value for exclusion checks.
  *
  * @param pos2 Raw pos2 value; may be absent.
  * @returns The trimmed tag, or an empty string when `pos2` is not a string.
  */
 function normalizePos2Tag(pos2: string | undefined): string {
  if (typeof pos2 !== 'string') {
    return '';
  }
  return pos2.trim();
 }
 function isFrequencyExcludedByPos(
  token: MergedToken,
  pos1Exclusions: ReadonlySet<string>,
  pos2Exclusions: ReadonlySet<string>,
 ): boolean {
  const normalizedPos1 = normalizePos1Tag(token.pos1);
  const hasPos1 = normalizedPos1.length > 0;
  if (isExcludedByTagSet(normalizedPos1, pos1Exclusions)) {
    return true;
  }
-  return token.pos1 === '助詞' || token.pos1 === '助動詞';
+  const normalizedPos2 = normalizePos2Tag(token.pos2);
  const hasPos2 = normalizedPos2.length > 0;
  if (isExcludedByTagSet(normalizedPos2, pos2Exclusions)) {
    return true;
  }
  if (hasPos1 || hasPos2) {
    return false;
  }
  if (isLikelyFrequencyNoiseToken(token)) {
    return true;
  }
  return (
    token.partOfSpeech === PartOfSpeech.particle ||
    token.partOfSpeech === PartOfSpeech.bound_auxiliary
  );
 }
 function applyFrequencyMarking(
  tokens: MergedToken[],
  pos1Exclusions: ReadonlySet<string>,
  pos2Exclusions: ReadonlySet<string>,
 ): MergedToken[] {
  return tokens.map((token) => {
-    if (isFrequencyExcludedByPos(token)) {
+    if (isFrequencyExcludedByPos(token, pos1Exclusions, pos2Exclusions)) {
      return { ...token, frequencyRank: undefined };
    }
@@ -203,6 +276,101 @@ function isRepeatedKanaSfx(text: string): boolean {
  return topCount >= Math.ceil(chars.length / 2);
 }
 /**
  * Detects short all-kana strings that end in a small hiragana tsu ('っ').
  *
  * The text is first passed through `normalizeJlptTextForExclusion`; the
  * normalized form must be 2 to 4 characters long, consist only of characters
  * accepted by `isKanaChar`, and end with 'っ'.
  *
  * @param text Candidate surface or headword.
  * @returns `true` when the normalized text matches the pattern.
  */
 function isTrailingSmallTsuKanaSfx(text: string): boolean {
  const normalized = normalizeJlptTextForExclusion(text);
  if (!normalized) {
    return false;
  }

  // Spread iterates by code point, so the length check counts characters.
  const chars = [...normalized];
  const lengthInRange = chars.length >= 2 && chars.length <= 4;
  if (!lengthInRange) {
    return false;
  }

  // NOTE(review): only hiragana 'っ' is matched here; presumably the
  // normalizer folds katakana 'ッ' to hiragana — confirm.
  return chars.every(isKanaChar) && chars[chars.length - 1] === 'っ';
 }
 /**
  * Detects all-kana strings whose second half repeats the first half exactly.
  *
  * The normalized text must have an even length of at least four characters
  * and consist only of characters accepted by `isKanaChar`.
  *
  * @param text Candidate surface or headword.
  * @returns `true` when the normalized text is an exact two-fold repetition.
  */
 function isReduplicatedKanaSfx(text: string): boolean {
  const normalized = normalizeJlptTextForExclusion(text);
  if (!normalized) {
    return false;
  }

  const chars = [...normalized];
  const hasEvenLength = chars.length % 2 === 0;
  if (chars.length < 4 || !hasEvenLength) {
    return false;
  }
  if (!chars.every(isKanaChar)) {
    return false;
  }

  // Compare the halves position by position.
  const half = chars.length / 2;
  for (let offset = 0; offset < half; offset += 1) {
    if (chars[offset] !== chars[offset + half]) {
      return false;
    }
  }
  return true;
 }
 /**
  * Detects all-kana strings that contain the same character twice in a row.
  *
  * NOTE(review): any kana-only word with a doubled character matches, not
  * only sound effects — confirm this is acceptable for the callers.
  *
  * @param text Candidate surface or headword.
  * @returns `true` when the normalized text is all kana (per `isKanaChar`)
  *   and has at least one pair of identical neighbouring characters.
  */
 function hasAdjacentKanaRepeat(text: string): boolean {
  const normalized = normalizeJlptTextForExclusion(text);
  if (!normalized) {
    return false;
  }

  const chars = [...normalized];
  if (!chars.every(isKanaChar)) {
    return false;
  }

  return chars.some((current, index) => index > 0 && current === chars[index - 1]);
 }
 /**
  * Heuristic check for tokens that should not receive a frequency rank.
  *
  * The headword and then the surface are examined, each in trimmed and in
  * normalized form. A candidate is noise when `shouldIgnoreJlptByTerm`
  * accepts either form, or when either form matches one of the kana
  * sound-effect patterns (adjacent repeat, reduplication, trailing 'っ').
  *
  * @param token Token whose headword and surface are inspected.
  * @returns `true` as soon as any candidate form is classified as noise.
  */
 function isLikelyFrequencyNoiseToken(token: MergedToken): boolean {
  // Checked in this order for each text form.
  const sfxChecks = [hasAdjacentKanaRepeat, isReduplicatedKanaSfx, isTrailingSmallTsuKanaSfx];

  for (const rawCandidate of [token.headword, token.surface]) {
    if (typeof rawCandidate !== 'string' || rawCandidate.length === 0) {
      continue;
    }

    const trimmed = rawCandidate.trim();
    if (!trimmed) {
      continue;
    }

    const normalized = normalizeJlptTextForExclusion(trimmed);
    if (!normalized) {
      continue;
    }

    const forms = [trimmed, normalized];
    if (forms.some((form) => shouldIgnoreJlptByTerm(form))) {
      return true;
    }
    if (sfxChecks.some((check) => forms.some((form) => check(form)))) {
      return true;
    }
  }

  return false;
 }
 function isJlptEligibleToken(token: MergedToken): boolean {
  if (token.pos1 && shouldIgnoreJlptForMecabPos1(token.pos1)) {
    return false;
@@ -261,6 +429,8 @@ export function annotateTokens(
  deps: AnnotationStageDeps,
  options: AnnotationStageOptions = {},
 ): MergedToken[] {
  const pos1Exclusions = resolvePos1Exclusions(options);
  const pos2Exclusions = resolvePos2Exclusions(options);
  const nPlusOneEnabled = options.nPlusOneEnabled !== false;
  const knownMarkedTokens = nPlusOneEnabled
    ? applyKnownWordMarking(tokens, deps.isKnownWord, deps.knownWordMatchMode)
@@ -273,7 +443,7 @@ export function annotateTokens(
  const frequencyEnabled = options.frequencyEnabled !== false;
  const frequencyMarkedTokens =
    frequencyEnabled
-      ? applyFrequencyMarking(knownMarkedTokens)
+      ? applyFrequencyMarking(knownMarkedTokens, pos1Exclusions, pos2Exclusions)
      : knownMarkedTokens.map((token) => ({
          ...token,
          frequencyRank: undefined,
@@ -303,5 +473,10 @@ export function annotateTokens(
      ? minSentenceWordsForNPlusOne
      : 3;
-  return markNPlusOneTargets(jlptMarkedTokens, sanitizedMinSentenceWordsForNPlusOne);
+  return markNPlusOneTargets(
    jlptMarkedTokens,
    sanitizedMinSentenceWordsForNPlusOne,
    pos1Exclusions,
    pos2Exclusions,
  );
 }
--- a/src/core/services/tokenizer/parser-enrichment-stage.test.ts
+++ b/src/core/services/tokenizer/parser-enrichment-stage.test.ts
@@ -22,12 +22,13 @@ function makeToken(overrides: Partial<MergedToken>): MergedToken {
 test('enrichTokensWithMecabPos1 picks pos1 by best overlap when no surface match exists', () => {
  const tokens = [makeToken({ surface: 'grouped', startPos: 2, endPos: 7 })];
  const mecabTokens = [
-    makeToken({ surface: 'left', startPos: 0, endPos: 4, pos1: 'A' }),
+    makeToken({ surface: 'left', startPos: 0, endPos: 4, pos1: 'A', pos2: 'L2' }),
-    makeToken({ surface: 'right', startPos: 2, endPos: 6, pos1: 'B' }),
+    makeToken({ surface: 'right', startPos: 2, endPos: 6, pos1: 'B', pos2: '非自立' }),
  ];
  const enriched = enrichTokensWithMecabPos1(tokens, mecabTokens);
-  assert.equal(enriched[0]?.pos1, 'B');
+  assert.equal(enriched[0]?.pos1, 'A|B');
  assert.equal(enriched[0]?.pos2, 'L2|非自立');
 });
 test('enrichTokensWithMecabPos1 fills missing pos1 using surface-sequence fallback', () => {
--- a/src/core/services/tokenizer/parser-enrichment-stage.ts
+++ b/src/core/services/tokenizer/parser-enrichment-stage.ts
@@ -1,13 +1,45 @@
 import { MergedToken } from '../../../types';
-function pickClosestMecabPos1(token: MergedToken, mecabTokens: MergedToken[]): string | undefined {
+type MecabPosMetadata = {
-  if (mecabTokens.length === 0) {
+  pos1: string;
  pos2?: string;
  pos3?: string;
 };
 /**
  * Combines POS tags from several tokens into one '|'-separated tag string.
  *
  * Each value is split on '|' first, so an input that is already a composite
  * (for example 'A|B') contributes its components individually and no
  * component is repeated in the result. Components are trimmed, empty ones
  * are dropped, and first-seen order is preserved.
  *
  * @param values Tag values to merge; `undefined` and blank entries are skipped.
  * @returns The joined tag string, the single tag when only one remains, or
  *   `undefined` when no usable tag was supplied.
  */
 function joinUniqueTags(values: Array<string | undefined>): string | undefined {
  // A Set keeps insertion order and makes the duplicate check constant-time.
  const seen = new Set<string>();
  for (const value of values) {
    if (!value) {
      continue;
    }
    for (const part of value.split('|')) {
      const tag = part.trim();
      if (tag) {
        seen.add(tag);
      }
    }
  }
  if (seen.size === 0) {
    return undefined;
  }
  return [...seen].join('|');
 }
 function pickClosestMecabPosMetadata(
  token: MergedToken,
  mecabTokens: MergedToken[],
 ): MecabPosMetadata | null {
  if (mecabTokens.length === 0) {
    return null;
  }
  const tokenStart = token.startPos ?? 0;
  const tokenEnd = token.endPos ?? tokenStart + token.surface.length;
-  let bestSurfaceMatchPos1: string | undefined;
+  let bestSurfaceMatchToken: MergedToken | null = null;
  let bestSurfaceMatchDistance = Number.MAX_SAFE_INTEGER;
  let bestSurfaceMatchEndDistance = Number.MAX_SAFE_INTEGER;
@@ -31,19 +63,24 @@ function pickClosestMecabPos1(token: MergedToken, mecabTokens: MergedToken[]): s
    ) {
      bestSurfaceMatchDistance = startDistance;
      bestSurfaceMatchEndDistance = endDistance;
-      bestSurfaceMatchPos1 = mecabToken.pos1;
+      bestSurfaceMatchToken = mecabToken;
    }
  }
-  if (bestSurfaceMatchPos1) {
+  if (bestSurfaceMatchToken) {
-    return bestSurfaceMatchPos1;
+    return {
      pos1: bestSurfaceMatchToken.pos1 as string,
      pos2: bestSurfaceMatchToken.pos2,
      pos3: bestSurfaceMatchToken.pos3,
    };
  }
-  let bestPos1: string | undefined;
+  let bestToken: MergedToken | null = null;
  let bestOverlap = 0;
  let bestSpan = 0;
  let bestStartDistance = Number.MAX_SAFE_INTEGER;
  let bestStart = Number.MAX_SAFE_INTEGER;
  const overlappingTokens: MergedToken[] = [];
  for (const mecabToken of mecabTokens) {
    if (!mecabToken.pos1) {
@@ -58,6 +95,7 @@ function pickClosestMecabPos1(token: MergedToken, mecabTokens: MergedToken[]): s
    if (overlap === 0) {
      continue;
    }
    overlappingTokens.push(mecabToken);
    const span = mecabEnd - mecabStart;
    if (
@@ -71,11 +109,23 @@ function pickClosestMecabPos1(token: MergedToken, mecabTokens: MergedToken[]): s
      bestSpan = span;
      bestStartDistance = Math.abs(mecabStart - tokenStart);
      bestStart = mecabStart;
-      bestPos1 = mecabToken.pos1;
+      bestToken = mecabToken;
    }
  }
-  return bestOverlap > 0 ? bestPos1 : undefined;
+  if (bestOverlap === 0 || !bestToken) {
    return null;
  }
  const overlapPos1 = joinUniqueTags(overlappingTokens.map((token) => token.pos1));
  const overlapPos2 = joinUniqueTags(overlappingTokens.map((token) => token.pos2));
  const overlapPos3 = joinUniqueTags(overlappingTokens.map((token) => token.pos3));
  return {
    pos1: overlapPos1 ?? (bestToken.pos1 as string),
    pos2: overlapPos2 ?? bestToken.pos2,
    pos3: overlapPos3 ?? bestToken.pos3,
  };
 }
 function fillMissingPos1BySurfaceSequence(
@@ -101,7 +151,7 @@ function fillMissingPos1BySurfaceSequence(
      return token;
    }
-    let best: { pos1: string; index: number } | null = null;
+    let best: { token: MergedToken; index: number } | null = null;
    for (const candidate of indexedMecabTokens) {
      if (candidate.token.surface !== surface) {
        continue;
@@ -109,7 +159,7 @@ function fillMissingPos1BySurfaceSequence(
      if (candidate.index < cursor) {
        continue;
      }
-      best = { pos1: candidate.token.pos1 as string, index: candidate.index };
+      best = { token: candidate.token, index: candidate.index };
      break;
    }
@@ -118,7 +168,7 @@ function fillMissingPos1BySurfaceSequence(
        if (candidate.token.surface !== surface) {
          continue;
        }
-        best = { pos1: candidate.token.pos1 as string, index: candidate.index };
+        best = { token: candidate.token, index: candidate.index };
        break;
      }
    }
@@ -130,7 +180,9 @@ function fillMissingPos1BySurfaceSequence(
    cursor = best.index + 1;
    return {
      ...token,
-      pos1: best.pos1,
+      pos1: best.token.pos1,
      pos2: best.token.pos2,
      pos3: best.token.pos3,
    };
  });
 }
@@ -152,14 +204,16 @@ export function enrichTokensWithMecabPos1(
      return token;
    }
-    const pos1 = pickClosestMecabPos1(token, mecabTokens);
+    const metadata = pickClosestMecabPosMetadata(token, mecabTokens);
-    if (!pos1) {
+    if (!metadata) {
      return token;
    }
    return {
      ...token,
-      pos1,
+      pos1: metadata.pos1,
      pos2: metadata.pos2,
      pos3: metadata.pos3,
    };
  });
--- a/src/token-merger.ts
+++ b/src/token-merger.ts
@@ -17,6 +17,8 @@
 */
 import { PartOfSpeech, Token, MergedToken } from './types';
 import { DEFAULT_ANNOTATION_POS1_EXCLUSION_CONFIG } from './token-pos1-exclusions';
 import { DEFAULT_ANNOTATION_POS2_EXCLUSION_CONFIG } from './token-pos2-exclusions';
 export function isNoun(tok: Token): boolean {
  return tok.partOfSpeech === PartOfSpeech.noun;
@@ -241,25 +243,71 @@ export function mergeTokens(
 }
 // Token surfaces treated as sentence boundaries. '…' is U+2026, so it needs
 // to be listed only once.
 const SENTENCE_BOUNDARY_SURFACES = new Set(['。', '？', '！', '?', '!', '…']);
-const N_PLUS_ONE_IGNORED_POS1 = new Set(['助詞', '助動詞', '記号', '補助記号']);
+const N_PLUS_ONE_IGNORED_POS1 = new Set(
  DEFAULT_ANNOTATION_POS1_EXCLUSION_CONFIG.defaults,
 );
 // pos2 tags ignored for N+1 when the caller passes no explicit set; seeded
 // from the `defaults` list of the shared pos2 exclusion config.
 const N_PLUS_ONE_IGNORED_POS2 = new Set(DEFAULT_ANNOTATION_POS2_EXCLUSION_CONFIG.defaults);
-export function isNPlusOneCandidateToken(token: MergedToken): boolean {
+function normalizePos1Tag(pos1: string | undefined): string {
  return typeof pos1 === 'string' ? pos1.trim() : '';
 }
 /**
  * Normalizes a token's pos2 value for exclusion checks.
  *
  * @param pos2 Raw pos2 value; may be absent.
  * @returns The trimmed tag, or an empty string when `pos2` is not a string.
  */
 function normalizePos2Tag(pos2: string | undefined): string {
  if (typeof pos2 === 'string') {
    return pos2.trim();
  }
  return '';
 }
 /**
  * Decides whether a (possibly composite) POS tag is fully excluded.
  *
  * The tag is split on '|'; blank components are ignored. The tag counts as
  * excluded only when at least one component remains and every remaining
  * component is a member of `exclusions`.
  *
  * @param normalizedTag Trimmed tag string, e.g. '助詞' or '助詞|動詞'.
  * @param exclusions Set of excluded tag names.
  * @returns `true` when all non-blank components are excluded; `false` for
  *   an empty tag or when any component is not excluded.
  */
 function isExcludedByTagSet(
  normalizedTag: string,
  exclusions: ReadonlySet<string>,
 ): boolean {
  let sawComponent = false;
  for (const rawPart of normalizedTag.split('|')) {
    const part = rawPart.trim();
    if (part.length === 0) {
      continue;
    }
    // One content-bearing component is enough to keep the token.
    if (!exclusions.has(part)) {
      return false;
    }
    sawComponent = true;
  }
  return sawComponent;
 }
 export function isNPlusOneCandidateToken(
  token: MergedToken,
  pos1Exclusions: ReadonlySet<string> = N_PLUS_ONE_IGNORED_POS1,
  pos2Exclusions: ReadonlySet<string> = N_PLUS_ONE_IGNORED_POS2,
 ): boolean {
  if (token.isKnown) {
    return false;
  }
-  return isNPlusOneWordCountToken(token);
+  return isNPlusOneWordCountToken(token, pos1Exclusions, pos2Exclusions);
 }
-function isNPlusOneWordCountToken(token: MergedToken): boolean {
+function isNPlusOneWordCountToken(
-  if (token.partOfSpeech === PartOfSpeech.particle) {
+  token: MergedToken,
  pos1Exclusions: ReadonlySet<string> = N_PLUS_ONE_IGNORED_POS1,
  pos2Exclusions: ReadonlySet<string> = N_PLUS_ONE_IGNORED_POS2,
 ): boolean {
  const normalizedPos1 = normalizePos1Tag(token.pos1);
  const hasPos1 = normalizedPos1.length > 0;
  if (isExcludedByTagSet(normalizedPos1, pos1Exclusions)) {
    return false;
  }
-  if (token.partOfSpeech === PartOfSpeech.bound_auxiliary) {
+  const normalizedPos2 = normalizePos2Tag(token.pos2);
  const hasPos2 = normalizedPos2.length > 0;
  if (isExcludedByTagSet(normalizedPos2, pos2Exclusions)) {
    return false;
  }
-  if (token.partOfSpeech === PartOfSpeech.symbol) {
+  if (!hasPos1 && !hasPos2 && (
    token.partOfSpeech === PartOfSpeech.particle ||
    token.partOfSpeech === PartOfSpeech.bound_auxiliary ||
    token.partOfSpeech === PartOfSpeech.symbol
  )) {
    return false;
  }
@@ -271,10 +319,6 @@ function isNPlusOneWordCountToken(token: MergedToken): boolean {
    return false;
  }
  if (token.pos1 && N_PLUS_ONE_IGNORED_POS1.has(token.pos1)) {
    return false;
  }
  if (token.surface.trim().length === 0) {
    return false;
  }
@@ -290,7 +334,12 @@ function isSentenceBoundaryToken(token: MergedToken): boolean {
  return SENTENCE_BOUNDARY_SURFACES.has(token.surface);
 }
-export function markNPlusOneTargets(tokens: MergedToken[], minSentenceWords = 3): MergedToken[] {
+export function markNPlusOneTargets(
  tokens: MergedToken[],
  minSentenceWords = 3,
  pos1Exclusions: ReadonlySet<string> = N_PLUS_ONE_IGNORED_POS1,
  pos2Exclusions: ReadonlySet<string> = N_PLUS_ONE_IGNORED_POS2,
 ): MergedToken[] {
  if (tokens.length === 0) {
    return [];
  }
@@ -311,11 +360,11 @@ export function markNPlusOneTargets(tokens: MergedToken[], minSentenceWords = 3)
    for (let i = start; i < endExclusive; i++) {
      const token = markedTokens[i];
      if (!token) continue;
-      if (isNPlusOneWordCountToken(token)) {
+      if (isNPlusOneWordCountToken(token, pos1Exclusions, pos2Exclusions)) {
        sentenceWordCount += 1;
      }
-      if (isNPlusOneCandidateToken(token)) {
+      if (isNPlusOneCandidateToken(token, pos1Exclusions, pos2Exclusions)) {
        sentenceCandidates.push(i);
      }
    }
--- a/src/token-pos1-exclusions.ts
+++ b/src/token-pos1-exclusions.ts
@@ -0,0 +1,53 @@
 import type { ResolvedTokenPos1ExclusionConfig, TokenPos1ExclusionConfig } from './types';
 /**
  * Frozen list of pos1 tags excluded from annotation by default: particle,
  * auxiliary verb, symbol, supplementary symbol, adnominal, interjection,
  * conjunction and prefix.
  */
 export const DEFAULT_ANNOTATION_POS1_EXCLUSION_DEFAULTS: readonly string[] = Object.freeze([
  '助詞',
  '助動詞',
  '記号',
  '補助記号',
  '連体詞',
  '感動詞',
  '接続詞',
  '接頭詞',
 ]);
 /**
  * Default pos1 exclusion config: a mutable copy of the built-in defaults
  * with nothing added and nothing removed.
  */
 export const DEFAULT_ANNOTATION_POS1_EXCLUSION_CONFIG: ResolvedTokenPos1ExclusionConfig = {
  // Copied so the frozen defaults array is never shared with config consumers.
  defaults: DEFAULT_ANNOTATION_POS1_EXCLUSION_DEFAULTS.slice(),
  add: [],
  remove: [],
 };
 /** Strips surrounding whitespace from a single POS tag. */
 function normalizePosTag(value: string): string {
  const trimmed = value.trim();
  return trimmed;
 }

 /**
  * Cleans a list of POS tags for use in an exclusion set.
  *
  * @param values Raw tag names.
  * @returns The trimmed tags with blank entries dropped and duplicates
  *   removed, in first-seen order.
  */
 export function normalizePos1ExclusionList(values: readonly string[]): string[] {
  const cleaned = values
    .map((entry) => normalizePosTag(entry))
    .filter((entry) => entry.length > 0);
  // A Set preserves insertion order, so the first occurrence of each tag wins.
  return Array.from(new Set(cleaned));
 }
 /**
  * Builds the effective pos1 exclusion set from a config.
  *
  * The result is `defaults` plus `add`, minus `remove`; each list is
  * normalized first and a missing list is treated as empty. Removal is
  * applied last, so a tag present in both `add` and `remove` is excluded
  * from the result.
  *
  * @param config Partial or fully resolved exclusion config.
  * @returns A new set of tag names to exclude.
  */
 export function resolveAnnotationPos1ExclusionSet(
  config: TokenPos1ExclusionConfig | ResolvedTokenPos1ExclusionConfig,
 ): ReadonlySet<string> {
  const baseTags = normalizePos1ExclusionList(config.defaults ?? []);
  const extraTags = normalizePos1ExclusionList(config.add ?? []);
  const droppedTags = normalizePos1ExclusionList(config.remove ?? []);

  const effective = new Set<string>([...baseTags, ...extraTags]);
  droppedTags.forEach((tag) => {
    effective.delete(tag);
  });
  return effective;
 }
--- a/src/token-pos2-exclusions.ts
+++ b/src/token-pos2-exclusions.ts
@@ -0,0 +1,29 @@
 import type { ResolvedTokenPos2ExclusionConfig, TokenPos2ExclusionConfig } from './types';
 import { normalizePos1ExclusionList } from './token-pos1-exclusions';
 /** Frozen list of pos2 tags excluded from annotation by default (非自立, "non-independent"). */
 export const DEFAULT_ANNOTATION_POS2_EXCLUSION_DEFAULTS: readonly string[] = Object.freeze([
  '非自立',
 ]);
 /**
  * Default pos2 exclusion config: a mutable copy of the built-in defaults
  * with nothing added and nothing removed.
  */
 export const DEFAULT_ANNOTATION_POS2_EXCLUSION_CONFIG: ResolvedTokenPos2ExclusionConfig = {
  // Copied so the frozen defaults array is never shared with config consumers.
  defaults: DEFAULT_ANNOTATION_POS2_EXCLUSION_DEFAULTS.slice(),
  add: [],
  remove: [],
 };
 /**
  * Builds the effective pos2 exclusion set from a config.
  *
  * The result is `defaults` plus `add`, minus `remove`; each list is
  * normalized with `normalizePos1ExclusionList` and a missing list is
  * treated as empty. Removal is applied last.
  *
  * @param config Partial or fully resolved exclusion config.
  * @returns A new set of tag names to exclude.
  */
 export function resolveAnnotationPos2ExclusionSet(
  config: TokenPos2ExclusionConfig | ResolvedTokenPos2ExclusionConfig,
 ): ReadonlySet<string> {
  const baseTags = normalizePos1ExclusionList(config.defaults ?? []);
  const extraTags = normalizePos1ExclusionList(config.add ?? []);
  const droppedTags = normalizePos1ExclusionList(config.remove ?? []);

  const effective = new Set<string>([...baseTags, ...extraTags]);
  droppedTags.forEach((tag) => {
    effective.delete(tag);
  });
  return effective;
 }
--- a/src/types.ts
+++ b/src/types.ts
@@ -334,6 +334,30 @@ export interface SubtitleStyleConfig {
  };
 }
 /**
  * Partial pos1 exclusion config. Every list is optional;
  * `resolveAnnotationPos1ExclusionSet` treats a missing list as empty.
  */
 export interface TokenPos1ExclusionConfig {
  /** Base list of excluded pos1 tags. */
  defaults?: string[];
  /** Tags to exclude in addition to `defaults`. */
  add?: string[];
  /** Tags to drop from the combined `defaults` + `add` list. */
  remove?: string[];
 }
 /** Pos1 exclusion config in which all three lists are required. */
 export interface ResolvedTokenPos1ExclusionConfig {
  /** Base list of excluded pos1 tags. */
  defaults: string[];
  /** Tags to exclude in addition to `defaults`. */
  add: string[];
  /** Tags to drop from the combined `defaults` + `add` list. */
  remove: string[];
 }
 /**
  * Partial pos2 exclusion config. Every list is optional;
  * `resolveAnnotationPos2ExclusionSet` treats a missing list as empty.
  */
 export interface TokenPos2ExclusionConfig {
  /** Base list of excluded pos2 tags. */
  defaults?: string[];
  /** Tags to exclude in addition to `defaults`. */
  add?: string[];
  /** Tags to drop from the combined `defaults` + `add` list. */
  remove?: string[];
 }
 /** Pos2 exclusion config in which all three lists are required. */
 export interface ResolvedTokenPos2ExclusionConfig {
  /** Base list of excluded pos2 tags. */
  defaults: string[];
  /** Tags to exclude in addition to `defaults`. */
  add: string[];
  /** Tags to drop from the combined `defaults` + `add` list. */
  remove: string[];
 }
 export type FrequencyDictionaryMode = 'single' | 'banded';
 export interface ShortcutsConfig {