From a7d220e1820384addd06be08ce4243397402bbb2 Mon Sep 17 00:00:00 2001
From: sudacode <suda@sudacode.com>
Date: Sat, 28 Feb 2026 19:07:43 -0800
Subject: [PATCH] fix(tokenizer): tighten n+1 eligibility using mecab pos
 overlaps

---
 src/core/services/tokenizer.test.ts           | 122 +++++++++++
 src/core/services/tokenizer.ts                |  20 +-
 .../tokenizer/annotation-stage.test.ts        | 168 +++++++++++++++
 .../services/tokenizer/annotation-stage.ts    | 193 +++++++++++++++++-
 .../tokenizer/parser-enrichment-stage.test.ts |   7 +-
 .../tokenizer/parser-enrichment-stage.ts      |  86 ++++++--
 src/token-merger.ts                           |  77 +++++--
 src/token-pos1-exclusions.ts                  |  53 +++++
 src/token-pos2-exclusions.ts                  |  29 +++
 src/types.ts                                  |  24 +++
 10 files changed, 736 insertions(+), 43 deletions(-)
 create mode 100644 src/token-pos1-exclusions.ts
 create mode 100644 src/token-pos2-exclusions.ts

diff --git a/src/core/services/tokenizer.test.ts b/src/core/services/tokenizer.test.ts
index c3a0989..1561bf1 100644
--- a/src/core/services/tokenizer.test.ts
+++ b/src/core/services/tokenizer.test.ts
@@ -2038,3 +2038,125 @@ test('tokenizeSubtitle keeps frequency enrichment while n+1 is disabled', async
   assert.equal(mecabCalls, 1);
   assert.equal(frequencyCalls, 1);
 });
+
+
+test('tokenizeSubtitle excludes default non-independent pos2 from N+1 and frequency annotations', async () => {
+  const result = await tokenizeSubtitle(
+    'になれば',
+    makeDepsFromYomitanTokens([{ surface: 'になれば', reading: 'になれば', headword: 'なる' }], {
+      getFrequencyDictionaryEnabled: () => true,
+      getFrequencyRank: (text) => (text === 'なる' ? 11 : null),
+      tokenizeWithMecab: async () => [
+        {
+          headword: 'なる',
+          surface: 'になれば',
+          reading: 'ニナレバ',
+          startPos: 0,
+          endPos: 4,
+          partOfSpeech: PartOfSpeech.verb,
+          pos1: '動詞',
+          pos2: '非自立',
+          isMerged: true,
+          isKnown: false,
+          isNPlusOneTarget: false,
+        },
+      ],
+      getMinSentenceWordsForNPlusOne: () => 1,
+    }),
+  );
+  // The MeCab token is tagged 非自立 (a default pos2 exclusion): no rank, no N+1 flag.
+  assert.equal(result.tokens?.length, 1);
+  assert.equal(result.tokens?.[0]?.frequencyRank, undefined);
+  assert.equal(result.tokens?.[0]?.isNPlusOneTarget, false);
+});
+
+test('tokenizeSubtitle keeps merged token when overlap contains at least one content pos1 tag', async () => {
+  const result = await tokenizeSubtitle(
+    'になれば',
+    makeDepsFromYomitanTokens([{ surface: 'になれば', reading: 'になれば', headword: 'なる' }], {
+      getFrequencyDictionaryEnabled: () => true,
+      getFrequencyRank: (text) => (text === 'なる' ? 13 : null),
+      tokenizeWithMecab: async () => [
+        {
+          headword: 'に',
+          surface: 'に',
+          reading: 'ニ',
+          startPos: 0,
+          endPos: 1,
+          partOfSpeech: PartOfSpeech.particle,
+          pos1: '助詞',
+          pos2: '格助詞',
+          isMerged: false,
+          isKnown: false,
+          isNPlusOneTarget: false,
+        },
+        {
+          headword: 'なる',
+          surface: 'なれ',
+          reading: 'ナレ',
+          startPos: 1,
+          endPos: 3,
+          partOfSpeech: PartOfSpeech.verb,
+          pos1: '動詞',
+          pos2: '自立',
+          isMerged: false,
+          isKnown: false,
+          isNPlusOneTarget: false,
+        },
+        {
+          headword: 'ば',
+          surface: 'ば',
+          reading: 'バ',
+          startPos: 3,
+          endPos: 4,
+          partOfSpeech: PartOfSpeech.particle,
+          pos1: '助詞',
+          pos2: '接続助詞',
+          isMerged: false,
+          isKnown: false,
+          isNPlusOneTarget: false,
+        },
+      ],
+      getMinSentenceWordsForNPlusOne: () => 1,
+    }),
+  );
+  // The three MeCab tokens overlap the one parsed token; the duplicate 助詞 tag appears once.
+  assert.equal(result.tokens?.length, 1);
+  assert.equal(result.tokens?.[0]?.pos1, '助詞|動詞');
+  assert.equal(result.tokens?.[0]?.frequencyRank, 13);
+  assert.equal(result.tokens?.[0]?.isNPlusOneTarget, true);
+});
+
+test('tokenizeSubtitle excludes default non-independent pos2 from N+1 when JLPT/frequency are disabled', async () => {
+  let mecabCalls = 0;
+  const result = await tokenizeSubtitle(
+    'になれば',
+    makeDepsFromYomitanTokens([{ surface: 'になれば', reading: 'になれば', headword: 'なる' }], {
+      getJlptEnabled: () => false,
+      getFrequencyDictionaryEnabled: () => false,
+      getMinSentenceWordsForNPlusOne: () => 1,
+      tokenizeWithMecab: async () => {
+        mecabCalls += 1;
+        return [
+          {
+            headword: 'なる',
+            surface: 'になれば',
+            reading: 'ニナレバ',
+            startPos: 0,
+            endPos: 4,
+            partOfSpeech: PartOfSpeech.verb,
+            pos1: '動詞',
+            pos2: '非自立',
+            isMerged: true,
+            isKnown: false,
+            isNPlusOneTarget: false,
+          },
+        ];
+      },
+    }),
+  );
+  // MeCab is expected to run once even though JLPT and frequency are both off.
+  assert.equal(mecabCalls, 1);
+  assert.equal(result.tokens?.length, 1);
+  assert.equal(result.tokens?.[0]?.isNPlusOneTarget, false);
+});
diff --git a/src/core/services/tokenizer.ts b/src/core/services/tokenizer.ts
index 764cbaa..ee218b1 100644
--- a/src/core/services/tokenizer.ts
+++ b/src/core/services/tokenizer.ts
@@ -10,6 +10,14 @@ import {
   FrequencyDictionaryLookup,
   JlptLevel,
 } from '../../types';
+import {
+  DEFAULT_ANNOTATION_POS1_EXCLUSION_CONFIG,
+  resolveAnnotationPos1ExclusionSet,
+} from '../../token-pos1-exclusions';
+import {
+  DEFAULT_ANNOTATION_POS2_EXCLUSION_CONFIG,
+  resolveAnnotationPos2ExclusionSet,
+} from '../../token-pos2-exclusions';
 import { selectYomitanParseTokens } from './tokenizer/parser-selection-stage';
 import {
   requestYomitanParseResults,
@@ -78,6 +86,8 @@ interface TokenizerAnnotationOptions {
   frequencyEnabled: boolean;
   frequencyMatchMode: FrequencyDictionaryMatchMode;
   minSentenceWordsForNPlusOne: number | undefined;
+  pos1Exclusions: ReadonlySet<string>;
+  pos2Exclusions: ReadonlySet<string>;
 }
 
 let parserEnrichmentWorkerRuntimeModulePromise:
@@ -87,6 +97,12 @@ let annotationStageModulePromise: Promise<typeof import('./tokenizer/annotation-
 let parserEnrichmentFallbackModulePromise:
   | Promise<typeof import('./tokenizer/parser-enrichment-stage')>
   | null = null;
+const DEFAULT_ANNOTATION_POS1_EXCLUSIONS = resolveAnnotationPos1ExclusionSet(
+  DEFAULT_ANNOTATION_POS1_EXCLUSION_CONFIG,
+); // default pos1 exclusion set, resolved once at module load
+const DEFAULT_ANNOTATION_POS2_EXCLUSIONS = resolveAnnotationPos2ExclusionSet(
+  DEFAULT_ANNOTATION_POS2_EXCLUSION_CONFIG,
+); // default pos2 exclusion set, resolved once at module load
 
 function getKnownWordLookup(deps: TokenizerServiceDeps, options: TokenizerAnnotationOptions): (text: string) => boolean {
   if (!options.nPlusOneEnabled) {
@@ -96,7 +112,7 @@ function getKnownWordLookup(deps: TokenizerServiceDeps, options: TokenizerAnnota
 }
 
 function needsMecabPosEnrichment(options: TokenizerAnnotationOptions): boolean {
-  return options.jlptEnabled || options.frequencyEnabled;
+  return options.nPlusOneEnabled || options.jlptEnabled || options.frequencyEnabled; // N+1 filtering reads POS tags too
 }
 
 function hasAnyAnnotationEnabled(options: TokenizerAnnotationOptions): boolean {
@@ -389,6 +405,8 @@ function getAnnotationOptions(deps: TokenizerServiceDeps): TokenizerAnnotationOp
     frequencyEnabled: deps.getFrequencyDictionaryEnabled?.() !== false,
     frequencyMatchMode: deps.getFrequencyDictionaryMatchMode?.() ?? 'headword',
     minSentenceWordsForNPlusOne: deps.getMinSentenceWordsForNPlusOne?.(),
+    pos1Exclusions: DEFAULT_ANNOTATION_POS1_EXCLUSIONS,
+    pos2Exclusions: DEFAULT_ANNOTATION_POS2_EXCLUSIONS,
   };
 }
 
diff --git a/src/core/services/tokenizer/annotation-stage.test.ts b/src/core/services/tokenizer/annotation-stage.test.ts
index fd4541b..50d2cbd 100644
--- a/src/core/services/tokenizer/annotation-stage.test.ts
+++ b/src/core/services/tokenizer/annotation-stage.test.ts
@@ -205,3 +205,171 @@ test('annotateTokens N+1 minimum sentence words counts only eligible word tokens
   assert.equal(result[2]?.isKnown, true);
   assert.equal(result[0]?.isNPlusOneTarget, false);
 });
+
+test('annotateTokens applies configured pos1 exclusions to both frequency and N+1', () => {
+  const tokens = [
+    makeToken({
+      surface: '猫',
+      headword: '猫',
+      pos1: '名詞',
+      frequencyRank: 21,
+      startPos: 0,
+      endPos: 1,
+    }),
+    makeToken({
+      surface: '走る',
+      headword: '走る',
+      pos1: '動詞',
+      partOfSpeech: PartOfSpeech.verb,
+      startPos: 1,
+      endPos: 3,
+      frequencyRank: 22,
+    }),
+  ];
+
+  const result = annotateTokens(
+    tokens,
+    makeDeps({
+      isKnownWord: (text) => text === '走る',
+    }),
+    {
+      minSentenceWordsForNPlusOne: 1,
+      pos1Exclusions: new Set(['名詞']),
+    },
+  );
+  // 猫 (名詞) is excluded by the custom set; 走る keeps its rank but is a known word.
+  assert.equal(result[0]?.frequencyRank, undefined);
+  assert.equal(result[1]?.frequencyRank, 22);
+  assert.equal(result[0]?.isNPlusOneTarget, false);
+  assert.equal(result[1]?.isNPlusOneTarget, false);
+});
+
+test('annotateTokens allows previously default-excluded pos1 when removed from effective set', () => {
+  const tokens = [
+    makeToken({
+      surface: 'は',
+      headword: 'は',
+      partOfSpeech: PartOfSpeech.other,
+      pos1: '助詞',
+      startPos: 0,
+      endPos: 1,
+      frequencyRank: 8,
+    }),
+  ];
+
+  const result = annotateTokens(tokens, makeDeps(), {
+    minSentenceWordsForNPlusOne: 1,
+    pos1Exclusions: new Set(),
+  });
+  // With an empty pos1 set the 助詞 token keeps its rank and may be the N+1 target.
+  assert.equal(result[0]?.frequencyRank, 8);
+  assert.equal(result[0]?.isNPlusOneTarget, true);
+});
+
+test('annotateTokens excludes default non-independent pos2 from frequency and N+1', () => {
+  const tokens = [
+    makeToken({
+      surface: 'になれば',
+      headword: 'なる',
+      partOfSpeech: PartOfSpeech.verb,
+      pos1: '動詞',
+      pos2: '非自立',
+      startPos: 0,
+      endPos: 4,
+      frequencyRank: 7,
+    }),
+  ];
+
+  const result = annotateTokens(tokens, makeDeps(), {
+    minSentenceWordsForNPlusOne: 1,
+  });
+  // No exclusion sets are passed, so the defaults apply and 非自立 is filtered out.
+  assert.equal(result[0]?.frequencyRank, undefined);
+  assert.equal(result[0]?.isNPlusOneTarget, false);
+});
+
+test('annotateTokens excludes likely kana SFX tokens from frequency when POS tags are missing', () => {
+  const tokens = [
+    makeToken({
+      surface: 'ぐわっ',
+      reading: 'ぐわっ',
+      headword: 'ぐわっ',
+      pos1: '',
+      pos2: '',
+      frequencyRank: 12,
+      startPos: 0,
+      endPos: 3,
+    }),
+  ];
+
+  const result = annotateTokens(tokens, makeDeps(), {
+    minSentenceWordsForNPlusOne: 1,
+  });
+  // pos1 and pos2 are empty, so the text heuristics decide; the rank must be cleared.
+  assert.equal(result[0]?.frequencyRank, undefined);
+});
+
+test('annotateTokens allows previously default-excluded pos2 when removed from effective set', () => {
+  const tokens = [
+    makeToken({
+      surface: 'になれば',
+      headword: 'なる',
+      partOfSpeech: PartOfSpeech.verb,
+      pos1: '動詞',
+      pos2: '非自立',
+      startPos: 0,
+      endPos: 4,
+      frequencyRank: 9,
+    }),
+  ];
+
+  const result = annotateTokens(tokens, makeDeps(), {
+    minSentenceWordsForNPlusOne: 1,
+    pos2Exclusions: new Set(),
+  });
+  // An empty pos2 set lifts the default 非自立 exclusion.
+  assert.equal(result[0]?.frequencyRank, 9);
+  assert.equal(result[0]?.isNPlusOneTarget, true);
+});
+
+test('annotateTokens keeps composite tokens when any component pos tag is content-bearing', () => {
+  const tokens = [
+    makeToken({
+      surface: 'になれば',
+      headword: 'なる',
+      pos1: '助詞|動詞',
+      pos2: '格助詞|自立|接続助詞',
+      startPos: 0,
+      endPos: 4,
+      frequencyRank: 5,
+    }),
+  ];
+
+  const result = annotateTokens(tokens, makeDeps(), {
+    minSentenceWordsForNPlusOne: 1,
+  });
+  // 動詞 is not a default pos1 exclusion, so the composite tag does not exclude the token.
+  assert.equal(result[0]?.frequencyRank, 5);
+  assert.equal(result[0]?.isNPlusOneTarget, true);
+});
+
+test('annotateTokens excludes composite tokens when all component pos tags are excluded', () => {
+  const tokens = [
+    makeToken({
+      surface: 'けど',
+      headword: 'けど',
+      pos1: '助詞|助詞',
+      pos2: '接続助詞|終助詞',
+      startPos: 0,
+      endPos: 2,
+      frequencyRank: 6,
+    }),
+  ];
+
+  const result = annotateTokens(tokens, makeDeps(), {
+    minSentenceWordsForNPlusOne: 1,
+  });
+  // Every pos1 component is 助詞 (excluded by default), so the token is filtered.
+  assert.equal(result[0]?.frequencyRank, undefined);
+  assert.equal(result[0]?.isNPlusOneTarget, false);
+});
diff --git a/src/core/services/tokenizer/annotation-stage.ts b/src/core/services/tokenizer/annotation-stage.ts
index af409e4..922f873 100644
--- a/src/core/services/tokenizer/annotation-stage.ts
+++ b/src/core/services/tokenizer/annotation-stage.ts
@@ -1,4 +1,12 @@
 import { markNPlusOneTargets } from '../../../token-merger';
+import {
+  DEFAULT_ANNOTATION_POS1_EXCLUSION_CONFIG,
+  resolveAnnotationPos1ExclusionSet,
+} from '../../../token-pos1-exclusions';
+import {
+  DEFAULT_ANNOTATION_POS2_EXCLUSION_CONFIG,
+  resolveAnnotationPos2ExclusionSet,
+} from '../../../token-pos2-exclusions';
 import {
   JlptLevel,
   MergedToken,
@@ -28,6 +36,8 @@ export interface AnnotationStageOptions {
   jlptEnabled?: boolean;
   frequencyEnabled?: boolean;
   minSentenceWordsForNPlusOne?: number;
+  pos1Exclusions?: ReadonlySet<string>;
+  pos2Exclusions?: ReadonlySet<string>;
 }
 
 function resolveKnownWordText(
@@ -53,22 +63,85 @@ function applyKnownWordMarking(
   });
 }
 
-function isFrequencyExcludedByPos(token: MergedToken): boolean {
-  if (
-    token.partOfSpeech === PartOfSpeech.particle ||
-    token.partOfSpeech === PartOfSpeech.bound_auxiliary
-  ) {
+function normalizePos1Tag(pos1: string | undefined): string {
+  return typeof pos1 !== 'string' ? '' : pos1.trim(); // a missing tag becomes ''
+}
+
+// Reports whether a tag is excluded. Composite tags ('a|b') count as excluded
+// only when every non-blank component is in `exclusions`; an empty tag, or one
+// with no non-blank components, is never excluded.
+function isExcludedByTagSet(
+  normalizedTag: string,
+  exclusions: ReadonlySet<string>,
+): boolean {
+  let componentCount = 0;
+  for (const rawComponent of normalizedTag.split('|')) {
+    const component = rawComponent.trim();
+    if (component.length === 0) continue;
+    if (!exclusions.has(component)) return false;
+    componentCount += 1;
+  }
+  return componentCount > 0;
+}
+
+// Resolved once at module load so annotateTokens does not rebuild the default set per call.
+const SHARED_DEFAULT_POS1_EXCLUSIONS = resolveAnnotationPos1ExclusionSet(
+  DEFAULT_ANNOTATION_POS1_EXCLUSION_CONFIG,
+);
+function resolvePos1Exclusions(options: AnnotationStageOptions): ReadonlySet<string> {
+  return options.pos1Exclusions ?? SHARED_DEFAULT_POS1_EXCLUSIONS; // caller override wins
+}
+
+// Resolved once at module load so annotateTokens does not rebuild the default set per call.
+const SHARED_DEFAULT_POS2_EXCLUSIONS = resolveAnnotationPos2ExclusionSet(
+  DEFAULT_ANNOTATION_POS2_EXCLUSION_CONFIG,
+);
+function resolvePos2Exclusions(options: AnnotationStageOptions): ReadonlySet<string> {
+  return options.pos2Exclusions ?? SHARED_DEFAULT_POS2_EXCLUSIONS; // caller override wins
+}
+
+function normalizePos2Tag(pos2: string | undefined): string {
+  return typeof pos2 !== 'string' ? '' : pos2.trim(); // a missing tag becomes ''
+}
+
+function isFrequencyExcludedByPos(
+  token: MergedToken,
+  pos1Exclusions: ReadonlySet<string>,
+  pos2Exclusions: ReadonlySet<string>,
+): boolean {
+  const normalizedPos1 = normalizePos1Tag(token.pos1);
+  const hasPos1 = normalizedPos1.length > 0;
+  if (isExcludedByTagSet(normalizedPos1, pos1Exclusions)) {
     return true;
   }
 
-  return token.pos1 === '助詞' || token.pos1 === '助動詞';
+  const normalizedPos2 = normalizePos2Tag(token.pos2);
+  const hasPos2 = normalizedPos2.length > 0;
+  if (isExcludedByTagSet(normalizedPos2, pos2Exclusions)) {
+    return true;
+  }
+  // A tag is present and neither tag set excluded it: the frequency rank is kept.
+  if (hasPos1 || hasPos2) {
+    return false;
+  }
+  // No pos1/pos2 at all: fall back to text heuristics, then to the coarse partOfSpeech.
+  if (isLikelyFrequencyNoiseToken(token)) {
+    return true;
+  }
+
+  return (
+    token.partOfSpeech === PartOfSpeech.particle ||
+    token.partOfSpeech === PartOfSpeech.bound_auxiliary
+  );
 }
 
 function applyFrequencyMarking(
   tokens: MergedToken[],
+  pos1Exclusions: ReadonlySet<string>,
+  pos2Exclusions: ReadonlySet<string>,
 ): MergedToken[] {
   return tokens.map((token) => {
-    if (isFrequencyExcludedByPos(token)) {
+    if (isFrequencyExcludedByPos(token, pos1Exclusions, pos2Exclusions)) {
       return { ...token, frequencyRank: undefined };
     }
 
@@ -203,6 +276,101 @@ function isRepeatedKanaSfx(text: string): boolean {
   return topCount >= Math.ceil(chars.length / 2);
 }
 
+// True when the normalized text is 2-4 characters long, consists solely of
+// characters accepted by isKanaChar, and ends in a small 'っ'. Text that
+// normalizes to an empty value never matches.
+function isTrailingSmallTsuKanaSfx(text: string): boolean {
+  const normalizedText = normalizeJlptTextForExclusion(text);
+  if (!normalizedText) {
+    return false;
+  }
+
+  const glyphs = [...normalizedText];
+  const lengthInRange = glyphs.length >= 2 && glyphs.length <= 4;
+  if (!lengthInRange || !glyphs.every(isKanaChar)) {
+    return false;
+  }
+
+  return glyphs[glyphs.length - 1] === 'っ';
+}
+
+// True when the normalized text is an even-length run of at least four
+// isKanaChar characters whose first half is repeated verbatim as the second
+// half. Text that normalizes to an empty value never matches.
+function isReduplicatedKanaSfx(text: string): boolean {
+  const normalizedText = normalizeJlptTextForExclusion(text);
+  if (!normalizedText) {
+    return false;
+  }
+
+  const glyphs = [...normalizedText];
+  const isEvenAndLongEnough = glyphs.length >= 4 && glyphs.length % 2 === 0;
+  if (!isEvenAndLongEnough || !glyphs.every(isKanaChar)) {
+    return false;
+  }
+
+  const midpoint = glyphs.length / 2;
+  return glyphs.slice(0, midpoint).every((glyph, i) => glyph === glyphs[midpoint + i]);
+}
+
+// True when the normalized text consists solely of isKanaChar characters and
+// some character is immediately followed by an identical one. Text that
+// normalizes to an empty value never matches.
+function hasAdjacentKanaRepeat(text: string): boolean {
+  const normalizedText = normalizeJlptTextForExclusion(text);
+  if (!normalizedText) {
+    return false;
+  }
+
+  const glyphs = [...normalizedText];
+  if (!glyphs.every(isKanaChar)) {
+    return false;
+  }
+
+  // Compare each character with its predecessor; the first one has none.
+  const repeatsPrevious = (glyph: string, index: number): boolean =>
+    index > 0 && glyph === glyphs[index - 1];
+  return glyphs.some(repeatsPrevious);
+}
+
+// Text-only noise heuristic over a token's headword and surface, tried in
+// that order. Each non-blank candidate is checked in both its trimmed and its
+// normalized form: first by shouldIgnoreJlptByTerm, then by the kana pattern
+// checks (adjacent repeat, reduplication, trailing small tsu). The first hit
+// returns true; when nothing matches the result is false.
+function isLikelyFrequencyNoiseToken(token: MergedToken): boolean {
+  for (const rawText of [token.headword, token.surface]) {
+    if (typeof rawText !== 'string' || rawText.length === 0) {
+      continue;
+    }
+
+    const trimmed = rawText.trim();
+    if (!trimmed) {
+      continue;
+    }
+
+    // Candidates that normalize to nothing are skipped before any check runs.
+    const normalized = normalizeJlptTextForExclusion(trimmed);
+    if (!normalized) {
+      continue;
+    }
+
+    // Each check sees the trimmed form first, then the normalized form.
+    const forms = [trimmed, normalized];
+    if (
+      forms.some((form) => shouldIgnoreJlptByTerm(form)) ||
+      forms.some((form) => hasAdjacentKanaRepeat(form)) ||
+      forms.some((form) => isReduplicatedKanaSfx(form)) ||
+      forms.some((form) => isTrailingSmallTsuKanaSfx(form))
+    ) {
+      return true;
+    }
+  }
+
+  // Neither the headword nor the surface matched any heuristic.
+  return false;
+}
+
 function isJlptEligibleToken(token: MergedToken): boolean {
   if (token.pos1 && shouldIgnoreJlptForMecabPos1(token.pos1)) {
     return false;
@@ -261,6 +429,8 @@ export function annotateTokens(
   deps: AnnotationStageDeps,
   options: AnnotationStageOptions = {},
 ): MergedToken[] {
+  const pos1Exclusions = resolvePos1Exclusions(options);
+  const pos2Exclusions = resolvePos2Exclusions(options);
   const nPlusOneEnabled = options.nPlusOneEnabled !== false;
   const knownMarkedTokens = nPlusOneEnabled
     ? applyKnownWordMarking(tokens, deps.isKnownWord, deps.knownWordMatchMode)
@@ -273,7 +443,7 @@ export function annotateTokens(
   const frequencyEnabled = options.frequencyEnabled !== false;
   const frequencyMarkedTokens =
     frequencyEnabled
-      ? applyFrequencyMarking(knownMarkedTokens)
+      ? applyFrequencyMarking(knownMarkedTokens, pos1Exclusions, pos2Exclusions)
       : knownMarkedTokens.map((token) => ({
           ...token,
           frequencyRank: undefined,
@@ -303,5 +473,10 @@ export function annotateTokens(
       ? minSentenceWordsForNPlusOne
       : 3;
 
-  return markNPlusOneTargets(jlptMarkedTokens, sanitizedMinSentenceWordsForNPlusOne);
+  return markNPlusOneTargets(
+    jlptMarkedTokens,
+    sanitizedMinSentenceWordsForNPlusOne,
+    pos1Exclusions,
+    pos2Exclusions,
+  );
 }
diff --git a/src/core/services/tokenizer/parser-enrichment-stage.test.ts b/src/core/services/tokenizer/parser-enrichment-stage.test.ts
index a00f82c..86178a4 100644
--- a/src/core/services/tokenizer/parser-enrichment-stage.test.ts
+++ b/src/core/services/tokenizer/parser-enrichment-stage.test.ts
@@ -22,12 +22,13 @@ function makeToken(overrides: Partial<MergedToken>): MergedToken {
 test('enrichTokensWithMecabPos1 picks pos1 by best overlap when no surface match exists', () => {
   const tokens = [makeToken({ surface: 'grouped', startPos: 2, endPos: 7 })];
   const mecabTokens = [
-    makeToken({ surface: 'left', startPos: 0, endPos: 4, pos1: 'A' }),
-    makeToken({ surface: 'right', startPos: 2, endPos: 6, pos1: 'B' }),
+    makeToken({ surface: 'left', startPos: 0, endPos: 4, pos1: 'A', pos2: 'L2' }),
+    makeToken({ surface: 'right', startPos: 2, endPos: 6, pos1: 'B', pos2: '非自立' }),
   ];
 
   const enriched = enrichTokensWithMecabPos1(tokens, mecabTokens);
-  assert.equal(enriched[0]?.pos1, 'B');
+  assert.equal(enriched[0]?.pos1, 'A|B'); // both overlapping tokens contribute their pos1
+  assert.equal(enriched[0]?.pos2, 'L2|非自立'); // pos2 tags are joined the same way
 });
 
 test('enrichTokensWithMecabPos1 fills missing pos1 using surface-sequence fallback', () => {
diff --git a/src/core/services/tokenizer/parser-enrichment-stage.ts b/src/core/services/tokenizer/parser-enrichment-stage.ts
index 3c3aeb2..857d255 100644
--- a/src/core/services/tokenizer/parser-enrichment-stage.ts
+++ b/src/core/services/tokenizer/parser-enrichment-stage.ts
@@ -1,13 +1,45 @@
 import { MergedToken } from '../../../types';
 
-function pickClosestMecabPos1(token: MergedToken, mecabTokens: MergedToken[]): string | undefined {
-  if (mecabTokens.length === 0) {
+interface MecabPosMetadata {
+  pos1: string; // always set; tags of several tokens may be joined with '|'
+  pos2?: string; // optional; joined with '|' in the same way
+  pos3?: string; // optional; joined with '|' in the same way
+}
+
+// Merges POS tags into one '|'-separated string: values are trimmed, blank or
+// missing ones are dropped, and duplicates keep only their first occurrence.
+// Returns undefined when nothing usable remains.
+function joinUniqueTags(values: Array<string | undefined>): string | undefined {
+  const seen = new Set<string>();
+  for (const raw of values) {
+    const tag = raw?.trim();
+    if (tag) {
+      seen.add(tag);
+    }
+  }
+
+  // A Set iterates in insertion order, so first-seen order is preserved.
+  const distinct = [...seen];
+  if (distinct.length === 0) {
     return undefined;
   }
+
+  // Joining a single tag yields that tag unchanged, so one return covers both
+  // the single-tag and the composite case.
+  return distinct.join('|');
+}
+
+function pickClosestMecabPosMetadata(
+  token: MergedToken,
+  mecabTokens: MergedToken[],
+): MecabPosMetadata | null {
+  if (mecabTokens.length === 0) {
+    return null;
+  }
 
   const tokenStart = token.startPos ?? 0;
   const tokenEnd = token.endPos ?? tokenStart + token.surface.length;
-  let bestSurfaceMatchPos1: string | undefined;
+  let bestSurfaceMatchToken: MergedToken | null = null;
   let bestSurfaceMatchDistance = Number.MAX_SAFE_INTEGER;
   let bestSurfaceMatchEndDistance = Number.MAX_SAFE_INTEGER;
 
@@ -31,19 +63,24 @@ function pickClosestMecabPos1(token: MergedToken, mecabTokens: MergedToken[]): s
     ) {
       bestSurfaceMatchDistance = startDistance;
       bestSurfaceMatchEndDistance = endDistance;
-      bestSurfaceMatchPos1 = mecabToken.pos1;
+      bestSurfaceMatchToken = mecabToken;
     }
   }
 
-  if (bestSurfaceMatchPos1) {
-    return bestSurfaceMatchPos1;
+  if (bestSurfaceMatchToken) {
+    return {
+      pos1: bestSurfaceMatchToken.pos1 as string,
+      pos2: bestSurfaceMatchToken.pos2,
+      pos3: bestSurfaceMatchToken.pos3,
+    };
   }
 
-  let bestPos1: string | undefined;
+  let bestToken: MergedToken | null = null;
   let bestOverlap = 0;
   let bestSpan = 0;
   let bestStartDistance = Number.MAX_SAFE_INTEGER;
   let bestStart = Number.MAX_SAFE_INTEGER;
+  const overlappingTokens: MergedToken[] = [];
 
   for (const mecabToken of mecabTokens) {
     if (!mecabToken.pos1) {
@@ -58,6 +95,7 @@ function pickClosestMecabPos1(token: MergedToken, mecabTokens: MergedToken[]): s
     if (overlap === 0) {
       continue;
     }
+    overlappingTokens.push(mecabToken);
 
     const span = mecabEnd - mecabStart;
     if (
@@ -71,11 +109,23 @@ function pickClosestMecabPos1(token: MergedToken, mecabTokens: MergedToken[]): s
       bestSpan = span;
       bestStartDistance = Math.abs(mecabStart - tokenStart);
       bestStart = mecabStart;
-      bestPos1 = mecabToken.pos1;
+      bestToken = mecabToken;
     }
   }
 
-  return bestOverlap > 0 ? bestPos1 : undefined;
+  if (bestOverlap === 0 || !bestToken) {
+    return null;
+  }
+
+  const overlapPos1 = joinUniqueTags(overlappingTokens.map((token) => token.pos1));
+  const overlapPos2 = joinUniqueTags(overlappingTokens.map((token) => token.pos2));
+  const overlapPos3 = joinUniqueTags(overlappingTokens.map((token) => token.pos3));
+
+  return {
+    pos1: overlapPos1 ?? (bestToken.pos1 as string),
+    pos2: overlapPos2 ?? bestToken.pos2,
+    pos3: overlapPos3 ?? bestToken.pos3,
+  };
 }
 
 function fillMissingPos1BySurfaceSequence(
@@ -101,7 +151,7 @@ function fillMissingPos1BySurfaceSequence(
       return token;
     }
 
-    let best: { pos1: string; index: number } | null = null;
+    let best: { token: MergedToken; index: number } | null = null;
     for (const candidate of indexedMecabTokens) {
       if (candidate.token.surface !== surface) {
         continue;
@@ -109,7 +159,7 @@ function fillMissingPos1BySurfaceSequence(
       if (candidate.index < cursor) {
         continue;
       }
-      best = { pos1: candidate.token.pos1 as string, index: candidate.index };
+      best = { token: candidate.token, index: candidate.index };
       break;
     }
 
@@ -118,7 +168,7 @@ function fillMissingPos1BySurfaceSequence(
         if (candidate.token.surface !== surface) {
           continue;
         }
-        best = { pos1: candidate.token.pos1 as string, index: candidate.index };
+        best = { token: candidate.token, index: candidate.index };
         break;
       }
     }
@@ -130,7 +180,9 @@ function fillMissingPos1BySurfaceSequence(
     cursor = best.index + 1;
     return {
       ...token,
-      pos1: best.pos1,
+      pos1: best.token.pos1,
+      pos2: best.token.pos2,
+      pos3: best.token.pos3,
     };
   });
 }
@@ -152,14 +204,16 @@ export function enrichTokensWithMecabPos1(
       return token;
     }
 
-    const pos1 = pickClosestMecabPos1(token, mecabTokens);
-    if (!pos1) {
+    const metadata = pickClosestMecabPosMetadata(token, mecabTokens);
+    if (!metadata) {
       return token;
     }
 
     return {
       ...token,
-      pos1,
+      pos1: metadata.pos1,
+      pos2: metadata.pos2,
+      pos3: metadata.pos3,
     };
   });
 
diff --git a/src/token-merger.ts b/src/token-merger.ts
index 55dfb66..c30d986 100644
--- a/src/token-merger.ts
+++ b/src/token-merger.ts
@@ -17,6 +17,8 @@
  */
 
 import { PartOfSpeech, Token, MergedToken } from './types';
+import { DEFAULT_ANNOTATION_POS1_EXCLUSION_CONFIG } from './token-pos1-exclusions';
+import { DEFAULT_ANNOTATION_POS2_EXCLUSION_CONFIG } from './token-pos2-exclusions';
 
 export function isNoun(tok: Token): boolean {
   return tok.partOfSpeech === PartOfSpeech.noun;
@@ -241,25 +243,71 @@ export function mergeTokens(
 }
 
 const SENTENCE_BOUNDARY_SURFACES = new Set(['。', '？', '！', '?', '!', '…', '\u2026']);
-const N_PLUS_ONE_IGNORED_POS1 = new Set(['助詞', '助動詞', '記号', '補助記号']);
+// Built-in pos1/pos2 exclusion sets for N+1 filtering; they serve as the
+// default arguments of the N+1 helpers in this module. Each Set is a copy of
+// the shared default list, built once at module load, so later changes to the
+// config arrays do not affect it.
+const N_PLUS_ONE_IGNORED_POS1 = new Set(DEFAULT_ANNOTATION_POS1_EXCLUSION_CONFIG.defaults);
+const N_PLUS_ONE_IGNORED_POS2 = new Set(DEFAULT_ANNOTATION_POS2_EXCLUSION_CONFIG.defaults);
 
-export function isNPlusOneCandidateToken(token: MergedToken): boolean {
+function normalizePos1Tag(pos1: string | undefined): string {
+  return typeof pos1 !== 'string' ? '' : pos1.trim(); // a missing tag becomes ''
+}
+
+function normalizePos2Tag(pos2: string | undefined): string {
+  return typeof pos2 !== 'string' ? '' : pos2.trim(); // a missing tag becomes ''
+}
+
+// A tag (possibly several '|'-separated components) is excluded only when it
+// has at least one non-blank component and none of them falls outside
+// `exclusions`.
+function isExcludedByTagSet(
+  normalizedTag: string,
+  exclusions: ReadonlySet<string>,
+): boolean {
+  const components: string[] = [];
+  for (const piece of normalizedTag.split('|')) {
+    const component = piece.trim();
+    if (component) {
+      components.push(component);
+    }
+  }
+  return components.length > 0 && !components.some((c) => !exclusions.has(c));
+}
+
+export function isNPlusOneCandidateToken(
+  token: MergedToken,
+  pos1Exclusions: ReadonlySet<string> = N_PLUS_ONE_IGNORED_POS1, // omitted -> built-in pos1 set
+  pos2Exclusions: ReadonlySet<string> = N_PLUS_ONE_IGNORED_POS2, // omitted -> built-in pos2 set
+): boolean {
   if (token.isKnown) {
     return false;
   }
-  return isNPlusOneWordCountToken(token);
+  return isNPlusOneWordCountToken(token, pos1Exclusions, pos2Exclusions); // unknown token: candidate iff it counts as a word
 }
 
-function isNPlusOneWordCountToken(token: MergedToken): boolean {
-  if (token.partOfSpeech === PartOfSpeech.particle) {
+function isNPlusOneWordCountToken(
+  token: MergedToken,
+  pos1Exclusions: ReadonlySet<string> = N_PLUS_ONE_IGNORED_POS1,
+  pos2Exclusions: ReadonlySet<string> = N_PLUS_ONE_IGNORED_POS2,
+): boolean {
+  const normalizedPos1 = normalizePos1Tag(token.pos1);
+  const hasPos1 = normalizedPos1.length > 0;
+  if (isExcludedByTagSet(normalizedPos1, pos1Exclusions)) {
     return false;
   }
 
-  if (token.partOfSpeech === PartOfSpeech.bound_auxiliary) {
+  const normalizedPos2 = normalizePos2Tag(token.pos2);
+  const hasPos2 = normalizedPos2.length > 0;
+  if (isExcludedByTagSet(normalizedPos2, pos2Exclusions)) {
     return false;
   }
 
-  if (token.partOfSpeech === PartOfSpeech.symbol) {
+  if (!hasPos1 && !hasPos2 && (
+    token.partOfSpeech === PartOfSpeech.particle ||
+    token.partOfSpeech === PartOfSpeech.bound_auxiliary ||
+    token.partOfSpeech === PartOfSpeech.symbol
+  )) {
     return false;
   }
 
@@ -271,10 +319,6 @@ function isNPlusOneWordCountToken(token: MergedToken): boolean {
     return false;
   }
 
-  if (token.pos1 && N_PLUS_ONE_IGNORED_POS1.has(token.pos1)) {
-    return false;
-  }
-
   if (token.surface.trim().length === 0) {
     return false;
   }
@@ -290,7 +334,12 @@ function isSentenceBoundaryToken(token: MergedToken): boolean {
   return SENTENCE_BOUNDARY_SURFACES.has(token.surface);
 }
 
-export function markNPlusOneTargets(tokens: MergedToken[], minSentenceWords = 3): MergedToken[] {
+export function markNPlusOneTargets(
+  tokens: MergedToken[],
+  minSentenceWords = 3,
+  pos1Exclusions: ReadonlySet<string> = N_PLUS_ONE_IGNORED_POS1,
+  pos2Exclusions: ReadonlySet<string> = N_PLUS_ONE_IGNORED_POS2,
+): MergedToken[] {
   if (tokens.length === 0) {
     return [];
   }
@@ -311,11 +360,11 @@ export function markNPlusOneTargets(tokens: MergedToken[], minSentenceWords = 3)
     for (let i = start; i < endExclusive; i++) {
       const token = markedTokens[i];
       if (!token) continue;
-      if (isNPlusOneWordCountToken(token)) {
+      if (isNPlusOneWordCountToken(token, pos1Exclusions, pos2Exclusions)) {
         sentenceWordCount += 1;
       }
 
-      if (isNPlusOneCandidateToken(token)) {
+      if (isNPlusOneCandidateToken(token, pos1Exclusions, pos2Exclusions)) {
         sentenceCandidates.push(i);
       }
     }
diff --git a/src/token-pos1-exclusions.ts b/src/token-pos1-exclusions.ts
new file mode 100644
index 0000000..f0fb258
--- /dev/null
+++ b/src/token-pos1-exclusions.ts
@@ -0,0 +1,53 @@
+import type { ResolvedTokenPos1ExclusionConfig, TokenPos1ExclusionConfig } from './types';
+
+export const DEFAULT_ANNOTATION_POS1_EXCLUSION_DEFAULTS = Object.freeze([
+  '助詞', // particle
+  '助動詞', // auxiliary verb
+  '記号', // symbol
+  '補助記号', // supplementary symbol
+  '連体詞', // adnominal
+  '感動詞', // interjection
+  '接続詞', // conjunction
+  '接頭詞', // prefix
+]) as readonly string[];
+
+export const DEFAULT_ANNOTATION_POS1_EXCLUSION_CONFIG: ResolvedTokenPos1ExclusionConfig = {
+  defaults: [...DEFAULT_ANNOTATION_POS1_EXCLUSION_DEFAULTS], // mutable copy of the frozen list
+  add: [], // no extra tags by default
+  remove: [], // nothing removed by default
+};
+
+function normalizePosTag(value: string): string {
+  return value.trim(); // normalization is whitespace trimming only
+}
+
+/**
+ * Normalizes a list of POS tags: each one is trimmed, entries left blank by
+ * the trim are dropped, and duplicates are removed keeping first-seen order.
+ *
+ * @returns a new array; `values` itself is not modified.
+ */
+export function normalizePos1ExclusionList(values: readonly string[]): string[] {
+  const trimmedTags = values.map((tag) => normalizePosTag(tag));
+  const nonBlankTags = trimmedTags.filter((tag) => tag.length > 0);
+  return Array.from(new Set(nonBlankTags));
+}
+
+/**
+ * Builds the effective pos1 exclusion set from a config: the normalized
+ * `defaults` and `add` tags, minus every normalized `remove` tag. A missing
+ * list is treated as empty, and `remove` wins over both other lists.
+ *
+ * @param config - exclusion lists; tags are trimmed and deduplicated first.
+ * @returns a newly built set on every call, in defaults-then-add insertion
+ *   order.
+ */
+export function resolveAnnotationPos1ExclusionSet(
+  config: TokenPos1ExclusionConfig | ResolvedTokenPos1ExclusionConfig,
+): ReadonlySet<string> {
+  const baseTags = normalizePos1ExclusionList(config.defaults ?? []);
+  const extraTags = normalizePos1ExclusionList(config.add ?? []);
+  const removedTags = new Set(normalizePos1ExclusionList(config.remove ?? []));
+  const keptTags = [...baseTags, ...extraTags].filter((tag) => !removedTags.has(tag));
+  return new Set(keptTags);
+}
diff --git a/src/token-pos2-exclusions.ts b/src/token-pos2-exclusions.ts
new file mode 100644
index 0000000..a6eeef2
--- /dev/null
+++ b/src/token-pos2-exclusions.ts
@@ -0,0 +1,29 @@
+import type { ResolvedTokenPos2ExclusionConfig, TokenPos2ExclusionConfig } from './types';
+import { normalizePos1ExclusionList } from './token-pos1-exclusions';
+
+export const DEFAULT_ANNOTATION_POS2_EXCLUSION_DEFAULTS = Object.freeze(['非自立']) as readonly string[]; // 非自立 = "non-independent"
+
+export const DEFAULT_ANNOTATION_POS2_EXCLUSION_CONFIG: ResolvedTokenPos2ExclusionConfig = {
+  defaults: [...DEFAULT_ANNOTATION_POS2_EXCLUSION_DEFAULTS], // mutable copy of the frozen list
+  add: [], // no extra tags by default
+  remove: [], // nothing removed by default
+};
+
+/**
+ * Builds the effective pos2 exclusion set from a config: the normalized
+ * `defaults` and `add` tags, minus every normalized `remove` tag. A missing
+ * list is treated as empty, and `remove` wins over both other lists.
+ *
+ * @param config - exclusion lists; tags are trimmed and deduplicated first.
+ * @returns a newly built set on every call, in defaults-then-add insertion
+ *   order.
+ */
+export function resolveAnnotationPos2ExclusionSet(
+  config: TokenPos2ExclusionConfig | ResolvedTokenPos2ExclusionConfig,
+): ReadonlySet<string> {
+  const baseTags = normalizePos1ExclusionList(config.defaults ?? []);
+  const extraTags = normalizePos1ExclusionList(config.add ?? []);
+  const droppedTags = new Set(normalizePos1ExclusionList(config.remove ?? []));
+  const survivingTags = [...baseTags, ...extraTags].filter((tag) => !droppedTags.has(tag));
+  return new Set(survivingTags);
+}
diff --git a/src/types.ts b/src/types.ts
index 4915f2c..8830a27 100644
--- a/src/types.ts
+++ b/src/types.ts
@@ -334,6 +334,30 @@ export interface SubtitleStyleConfig {
   };
 }
 
+export interface TokenPos1ExclusionConfig {
+  defaults?: string[]; // base pos1 tags to exclude
+  add?: string[]; // extra tags merged into the base list
+  remove?: string[]; // tags dropped from the merged result
+}
+
+export interface ResolvedTokenPos1ExclusionConfig {
+  defaults: string[]; // same fields as TokenPos1ExclusionConfig, but all required
+  add: string[];
+  remove: string[];
+}
+
+export interface TokenPos2ExclusionConfig {
+  defaults?: string[]; // base pos2 tags to exclude
+  add?: string[]; // extra tags merged into the base list
+  remove?: string[]; // tags dropped from the merged result
+}
+
+export interface ResolvedTokenPos2ExclusionConfig {
+  defaults: string[]; // same fields as TokenPos2ExclusionConfig, but all required
+  add: string[];
+  remove: string[];
+}
+
 export type FrequencyDictionaryMode = 'single' | 'banded';
 
 export interface ShortcutsConfig {