feat(tokenizer): use Yomitan word classes for subtitle POS filtering

- Carry matched headword wordClasses from termsFind into YomitanScanToken
- Map recognized Yomitan wordClasses to SubMiner coarse POS before annotation
- MeCab enrichment now fills only missing POS fields, preserving existing coarse pos1
- Exclude standalone grammar particles, して helper fragments, and single-kana surfaces from annotations
- Respect source-text punctuation gaps when counting N+1 sentence words
- Preserve known-word highlight on excluded kanji-containing tokens
- Add backlog tasks 304 (N+1 boundary bug) and 305 (wordClasses POS, done)
This commit is contained in:
2026-04-25 23:08:33 -07:00
parent 53aa58d044
commit 96894ff85c
11 changed files with 926 additions and 39 deletions

View File

@@ -25,6 +25,7 @@ interface YomitanTokenInput {
reading?: string;
headword?: string;
isNameMatch?: boolean;
wordClasses?: string[];
}
function makeDepsFromYomitanTokens(
@@ -55,6 +56,7 @@ function makeDepsFromYomitanTokens(
startPos,
endPos,
isNameMatch: token.isNameMatch ?? false,
wordClasses: token.wordClasses,
};
});
},
@@ -1552,7 +1554,7 @@ test('tokenizeSubtitle assigns JLPT level to Yomitan tokens', async () => {
assert.equal(result.tokens?.[0]?.jlptLevel, 'N4');
});
test('tokenizeSubtitle can assign JLPT level to Yomitan particle token', async () => {
test('tokenizeSubtitle clears JLPT level from standalone Yomitan particle token', async () => {
const result = await tokenizeSubtitle(
'は',
makeDepsFromYomitanTokens([{ surface: 'は', reading: 'は', headword: 'は' }], {
@@ -1561,7 +1563,7 @@ test('tokenizeSubtitle can assign JLPT level to Yomitan particle token', async (
);
assert.equal(result.tokens?.length, 1);
assert.equal(result.tokens?.[0]?.jlptLevel, 'N5');
assert.equal(result.tokens?.[0]?.jlptLevel, undefined);
});
test('tokenizeSubtitle returns null tokens for empty normalized text', async () => {
@@ -3034,6 +3036,58 @@ test('tokenizeSubtitle skips all enrichment stages when disabled', async () => {
assert.equal(frequencyCalls, 0);
});
test('tokenizeSubtitle uses Yomitan word classes to classify standalone particles', async () => {
let mecabCalls = 0;
const result = await tokenizeSubtitle(
'は',
makeDepsFromYomitanTokens([{ surface: 'は', reading: 'は', headword: 'は', wordClasses: ['prt'] }], {
getFrequencyDictionaryEnabled: () => true,
getFrequencyRank: (text) => (text === 'は' ? 10 : null),
getJlptLevel: (text) => (text === 'は' ? 'N5' : null),
tokenizeWithMecab: async () => {
mecabCalls += 1;
return null;
},
}),
);
assert.equal(mecabCalls, 1);
assert.equal(result.tokens?.length, 1);
assert.equal(result.tokens?.[0]?.partOfSpeech, PartOfSpeech.particle);
assert.equal(result.tokens?.[0]?.pos1, '助詞');
assert.equal(result.tokens?.[0]?.isNPlusOneTarget, false);
assert.equal(result.tokens?.[0]?.frequencyRank, undefined);
assert.equal(result.tokens?.[0]?.jlptLevel, undefined);
});
test('tokenizeSubtitle fills detailed MeCab POS when Yomitan word class supplies coarse POS', async () => {
const result = await tokenizeSubtitle(
'は',
makeDepsFromYomitanTokens([{ surface: 'は', reading: 'は', headword: 'は', wordClasses: ['prt'] }], {
tokenizeWithMecab: async () => [
{
headword: 'は',
surface: 'は',
reading: 'ハ',
startPos: 0,
endPos: 1,
partOfSpeech: PartOfSpeech.particle,
pos1: '助詞',
pos2: '係助詞',
pos3: '*',
isMerged: false,
isKnown: false,
isNPlusOneTarget: false,
},
],
}),
);
assert.equal(result.tokens?.[0]?.partOfSpeech, PartOfSpeech.particle);
assert.equal(result.tokens?.[0]?.pos1, '助詞');
assert.equal(result.tokens?.[0]?.pos2, '係助詞');
});
test('tokenizeSubtitle keeps frequency enrichment while n+1 is disabled', async () => {
let knownCalls = 0;
let mecabCalls = 0;
@@ -3110,6 +3164,60 @@ test('tokenizeSubtitle excludes default non-independent pos2 from N+1 and freque
assert.equal(result.tokens?.[0]?.isNPlusOneTarget, false);
});
test('tokenizeSubtitle preserves known-word highlight for exact non-independent kanji noun tokens', async () => {
const result = await tokenizeSubtitle(
'その点',
makeDepsFromYomitanTokens(
[
{ surface: 'その', reading: 'その', headword: 'その' },
{ surface: '点', reading: 'てん', headword: '点' },
],
{
isKnownWord: (text) => text === '点' || text === 'てん',
getFrequencyDictionaryEnabled: () => true,
getFrequencyRank: (text) => (text === '点' ? 1384 : null),
getJlptLevel: (text) => (text === '点' ? 'N3' : null),
tokenizeWithMecab: async () => [
{
headword: 'その',
surface: 'その',
reading: 'ソノ',
startPos: 0,
endPos: 2,
partOfSpeech: PartOfSpeech.other,
pos1: '連体詞',
isMerged: false,
isKnown: false,
isNPlusOneTarget: false,
},
{
headword: '点',
surface: '点',
reading: 'テン',
startPos: 2,
endPos: 3,
partOfSpeech: PartOfSpeech.noun,
pos1: '名詞',
pos2: '非自立',
pos3: '一般',
isMerged: false,
isKnown: false,
isNPlusOneTarget: false,
},
],
},
),
);
assert.equal(result.tokens?.length, 2);
assert.equal(result.tokens?.[0]?.isKnown, false);
assert.equal(result.tokens?.[1]?.surface, '点');
assert.equal(result.tokens?.[1]?.isKnown, true);
assert.equal(result.tokens?.[1]?.isNPlusOneTarget, false);
assert.equal(result.tokens?.[1]?.frequencyRank, undefined);
assert.equal(result.tokens?.[1]?.jlptLevel, undefined);
});
test('tokenizeSubtitle keeps mecab-tagged interjections tokenized while clearing annotation metadata', async () => {
const result = await tokenizeSubtitle(
'ぐはっ',

View File

@@ -96,6 +96,7 @@ interface TokenizerAnnotationOptions {
minSentenceWordsForNPlusOne: number | undefined;
pos1Exclusions: ReadonlySet<string>;
pos2Exclusions: ReadonlySet<string>;
sourceText?: string;
}
let parserEnrichmentWorkerRuntimeModulePromise: Promise<
@@ -333,6 +334,66 @@ function normalizeSelectedYomitanTokens(tokens: MergedToken[]): MergedToken[] {
}));
}
/**
 * Sanitizes a raw `wordClasses` value carried over from a Yomitan scan token.
 * Non-arrays yield an empty list; array entries are kept only when they are
 * non-empty strings after trimming, de-duplicated in first-seen order.
 */
function normalizeYomitanWordClasses(wordClasses: unknown): string[] {
  if (!Array.isArray(wordClasses)) {
    return [];
  }
  // A Set preserves insertion order, so this matches the original
  // "first occurrence wins" de-duplication.
  const seen = new Set<string>();
  for (const entry of wordClasses) {
    if (typeof entry !== 'string') {
      continue;
    }
    const tag = entry.trim();
    if (tag.length > 0) {
      seen.add(tag);
    }
  }
  return [...seen];
}
/**
 * Maps normalized Yomitan word-class tags to SubMiner's coarse PartOfSpeech,
 * optionally with a MeCab-style pos1 label. Order matters: particle and
 * auxiliary tags take priority over verb/adjective/noun tags when several
 * classes are present.
 */
function resolvePartOfSpeechFromYomitanWordClasses(wordClasses: string[]): {
  partOfSpeech: PartOfSpeech;
  pos1?: string;
} {
  const has = (tag: string): boolean => wordClasses.includes(tag);
  if (has('prt')) {
    return { partOfSpeech: PartOfSpeech.particle, pos1: '助詞' };
  }
  // NOTE(review): only the exact 'aux' tag is handled here; 'aux-v'/'aux-adj'
  // fall through to the later branches — confirm that is intended.
  if (has('aux')) {
    return { partOfSpeech: PartOfSpeech.bound_auxiliary, pos1: '助動詞' };
  }
  // Any tag starting with 'v' (v1, v5r, vs, vt, …) is classified as a verb.
  if (wordClasses.some((tag) => tag.startsWith('v'))) {
    return { partOfSpeech: PartOfSpeech.verb, pos1: '動詞' };
  }
  if (has('adj-i') || has('adj-ix')) {
    return { partOfSpeech: PartOfSpeech.i_adjective, pos1: '形容詞' };
  }
  // na-adjectives deliberately carry 名詞 as pos1 — presumably to line up
  // with MeCab's noun/形容動詞語幹 tagging; confirm against the annotator.
  if (has('adj-na')) {
    return { partOfSpeech: PartOfSpeech.na_adjective, pos1: '名詞' };
  }
  const isNounTag = (tag: string): boolean =>
    tag === 'n' || tag === 'num' || tag === 'ctr' || tag === 'pn' || tag.startsWith('n-');
  if (wordClasses.some(isNounTag)) {
    return { partOfSpeech: PartOfSpeech.noun, pos1: '名詞' };
  }
  return { partOfSpeech: PartOfSpeech.other };
}
/**
 * Convenience wrapper: sanitizes a raw wordClasses payload and resolves it to
 * coarse POS metadata in one step.
 */
function getYomitanWordClassPosMetadata(wordClasses: unknown): {
  partOfSpeech: PartOfSpeech;
  pos1?: string;
} {
  const normalizedClasses = normalizeYomitanWordClasses(wordClasses);
  return resolvePartOfSpeechFromYomitanWordClasses(normalizedClasses);
}
function resolveFrequencyLookupText(
token: MergedToken,
matchMode: FrequencyDictionaryMatchMode,
@@ -623,19 +684,23 @@ async function parseWithYomitanInternalParser(
}
const normalizedSelectedTokens = normalizeSelectedYomitanTokens(
selectedTokens.map(
(token): MergedToken => ({
surface: token.surface,
reading: token.reading,
headword: token.headword,
startPos: token.startPos,
endPos: token.endPos,
partOfSpeech: PartOfSpeech.other,
isMerged: true,
isKnown: false,
isNPlusOneTarget: false,
isNameMatch: token.isNameMatch ?? false,
frequencyRank: token.frequencyRank,
}),
(token): MergedToken => {
const posMetadata = getYomitanWordClassPosMetadata(token.wordClasses);
return {
surface: token.surface,
reading: token.reading,
headword: token.headword,
startPos: token.startPos,
endPos: token.endPos,
partOfSpeech: posMetadata.partOfSpeech,
pos1: posMetadata.pos1,
isMerged: true,
isKnown: false,
isNPlusOneTarget: false,
isNameMatch: token.isNameMatch ?? false,
frequencyRank: token.frequencyRank,
};
},
),
);
@@ -716,12 +781,11 @@ export async function tokenizeSubtitle(
.replace(/\s+/g, ' ')
.trim();
const annotationOptions = getAnnotationOptions(deps);
annotationOptions.sourceText = tokenizeText;
const yomitanTokens = await parseWithYomitanInternalParser(tokenizeText, deps, annotationOptions);
if (yomitanTokens && yomitanTokens.length > 0) {
const annotatedTokens = await stripSubtitleAnnotationMetadata(
await applyAnnotationStage(yomitanTokens, deps, annotationOptions),
);
const annotatedTokens = await applyAnnotationStage(yomitanTokens, deps, annotationOptions);
return {
text: displayText,
tokens: annotatedTokens.length > 0 ? annotatedTokens : null,

View File

@@ -366,6 +366,132 @@ test('shouldExcludeTokenFromSubtitleAnnotations excludes kana-only non-independe
assert.equal(shouldExcludeTokenFromSubtitleAnnotations(token), true);
});
// The pipe-joined pos1/pos2 values ('動詞|助詞', '自立|接続助詞') encode a merged
// token's per-part POS tags — presumably split downstream by
// splitNormalizedTagParts; confirm the delimiter convention against makeToken.
// Expectation: a standalone して fragment (dictionary form する) is excluded
// from subtitle annotations.
test('shouldExcludeTokenFromSubtitleAnnotations excludes standalone して grammar helper fragments', () => {
const token = makeToken({
surface: 'して',
headword: 'する',
reading: 'シテ',
partOfSpeech: PartOfSpeech.verb,
pos1: '動詞|助詞',
pos2: '自立|接続助詞',
});
assert.equal(shouldExcludeTokenFromSubtitleAnnotations(token), true);
});
test('shouldExcludeTokenFromSubtitleAnnotations excludes inflected standalone して grammar helper fragments', () => {
const token = makeToken({
surface: 'してる',
headword: 'する',
reading: 'シテル',
partOfSpeech: PartOfSpeech.verb,
pos1: '動詞|助動詞',
pos2: '自立|非自立',
});
assert.equal(shouldExcludeTokenFromSubtitleAnnotations(token), true);
});
test('shouldExcludeTokenFromSubtitleAnnotations excludes standalone particle fragments without POS tags', () => {
const token = makeToken({
surface: 'と',
headword: 'と',
reading: 'ト',
partOfSpeech: PartOfSpeech.other,
pos1: '',
pos2: '',
});
assert.equal(shouldExcludeTokenFromSubtitleAnnotations(token), true);
});
test('shouldExcludeTokenFromSubtitleAnnotations excludes standalone connective particle fragments without POS tags', () => {
const token = makeToken({
surface: 'たって',
headword: 'たって',
reading: 'タッテ',
partOfSpeech: PartOfSpeech.other,
pos1: '',
pos2: '',
});
assert.equal(shouldExcludeTokenFromSubtitleAnnotations(token), true);
});
test('shouldExcludeTokenFromSubtitleAnnotations excludes rhetorical もんか grammar particle phrases', () => {
for (const surface of ['もんか', 'ものか']) {
const token = makeToken({
surface,
headword: surface,
reading: surface === 'もんか' ? 'モンカ' : 'モノカ',
partOfSpeech: PartOfSpeech.noun,
pos1: '名詞|助詞',
pos2: '非自立|副助詞/並立助詞/終助詞',
});
assert.equal(shouldExcludeTokenFromSubtitleAnnotations(token), true, surface);
}
});
test('shouldExcludeTokenFromSubtitleAnnotations excludes bare くれ auxiliary fragments', () => {
const token = makeToken({
surface: 'くれ',
headword: '暮れ',
reading: 'クレ',
partOfSpeech: PartOfSpeech.noun,
pos1: '名詞',
pos2: '一般',
});
assert.equal(shouldExcludeTokenFromSubtitleAnnotations(token), true);
});
test('shouldExcludeTokenFromSubtitleAnnotations excludes standalone quote particle and auxiliary grammar terms', () => {
for (const token of [
makeToken({
surface: 'って',
headword: 'って',
reading: 'ッテ',
partOfSpeech: PartOfSpeech.other,
pos1: '',
pos2: '',
}),
makeToken({
surface: 'べき',
headword: 'べき',
reading: 'ベキ',
partOfSpeech: PartOfSpeech.other,
pos1: '',
pos2: '',
}),
]) {
assert.equal(shouldExcludeTokenFromSubtitleAnnotations(token), true, token.surface);
}
});
test('shouldExcludeTokenFromSubtitleAnnotations excludes single-kana surface fragments', () => {
for (const token of [
makeToken({
surface: 'ふ',
headword: '不',
reading: 'フ',
partOfSpeech: PartOfSpeech.other,
pos1: '接頭詞',
pos2: '',
}),
makeToken({
surface: 'フ',
headword: '負',
reading: 'フ',
partOfSpeech: PartOfSpeech.noun,
pos1: '名詞',
pos2: '一般',
}),
]) {
assert.equal(shouldExcludeTokenFromSubtitleAnnotations(token), true, token.surface);
}
});
test('stripSubtitleAnnotationMetadata keeps token hover data while clearing annotation fields', () => {
const token = makeToken({
surface: 'は',
@@ -536,6 +662,57 @@ test('annotateTokens N+1 minimum sentence words counts only eligible word tokens
assert.equal(result[0]?.isNPlusOneTarget, false);
});
test('annotateTokens N+1 sentence word count respects source punctuation gaps omitted by Yomitan', () => {
const tokens = [
makeToken({
surface: '私',
headword: '私',
pos1: '名詞',
startPos: 0,
endPos: 1,
}),
makeToken({
surface: '猫',
headword: '猫',
pos1: '名詞',
startPos: 1,
endPos: 2,
}),
makeToken({
surface: '犬',
headword: '犬',
pos1: '名詞',
startPos: 2,
endPos: 3,
}),
makeToken({
surface: 'ふざけん',
headword: 'ふざける',
partOfSpeech: PartOfSpeech.verb,
pos1: '動詞',
pos2: '自立',
startPos: 4,
endPos: 8,
}),
];
const result = annotateTokens(
tokens,
makeDeps({
isKnownWord: (text) => text === '私' || text === '猫' || text === '犬',
}),
{
minSentenceWordsForNPlusOne: 3,
sourceText: '私猫犬!ふざけんなよ!',
},
);
assert.equal(result[0]?.isNPlusOneTarget, false);
assert.equal(result[1]?.isNPlusOneTarget, false);
assert.equal(result[2]?.isNPlusOneTarget, false);
assert.equal(result[3]?.isNPlusOneTarget, false);
});
test('annotateTokens applies configured pos1 exclusions to both frequency and N+1', () => {
const tokens = [
makeToken({
@@ -610,14 +787,52 @@ test('annotateTokens excludes default non-independent pos2 from frequency and N+
}),
];
const result = annotateTokens(tokens, makeDeps(), {
minSentenceWordsForNPlusOne: 1,
});
const result = annotateTokens(
tokens,
makeDeps({
isKnownWord: (text) => text === 'た' || text === '負',
getJlptLevel: (text) => (text === 'た' || text === '負' ? 'N3' : null),
}),
{
minSentenceWordsForNPlusOne: 1,
},
);
assert.equal(result[0]?.frequencyRank, undefined);
assert.equal(result[0]?.isNPlusOneTarget, false);
});
test('annotateTokens preserves exact known-word status for non-independent kanji noun tokens', () => {
const tokens = [
makeToken({
surface: '点',
reading: 'てん',
headword: '点',
partOfSpeech: PartOfSpeech.other,
pos1: '名詞',
pos2: '非自立',
pos3: '一般',
startPos: 2,
endPos: 3,
frequencyRank: 1384,
}),
];
const result = annotateTokens(
tokens,
makeDeps({
isKnownWord: (text) => text === '点' || text === 'てん',
getJlptLevel: (text) => (text === '点' ? 'N3' : null),
}),
{ minSentenceWordsForNPlusOne: 1 },
);
assert.equal(result[0]?.isKnown, true);
assert.equal(result[0]?.isNPlusOneTarget, false);
assert.equal(result[0]?.frequencyRank, undefined);
assert.equal(result[0]?.jlptLevel, undefined);
});
test('annotateTokens clears all annotations for non-independent kanji noun tokens under unified gate', () => {
const tokens = [
makeToken({
@@ -665,7 +880,7 @@ test('annotateTokens excludes likely kana SFX tokens from frequency when POS tag
assert.equal(result[0]?.frequencyRank, undefined);
});
test('annotateTokens excludes single hiragana and katakana tokens from frequency when POS tags are missing', () => {
test('annotateTokens clears all annotations from single hiragana and katakana surface fragments', () => {
const tokens = [
makeToken({
surface: 'た',
@@ -679,12 +894,12 @@ test('annotateTokens excludes single hiragana and katakana tokens from frequency
endPos: 1,
}),
makeToken({
surface: '',
reading: '',
headword: '',
pos1: '',
surface: '',
reading: '',
headword: '',
pos1: '名詞',
pos2: '',
partOfSpeech: PartOfSpeech.other,
partOfSpeech: PartOfSpeech.noun,
frequencyRank: 22,
startPos: 1,
endPos: 2,
@@ -706,8 +921,14 @@ test('annotateTokens excludes single hiragana and katakana tokens from frequency
minSentenceWordsForNPlusOne: 1,
});
assert.equal(result[0]?.isKnown, false);
assert.equal(result[0]?.isNPlusOneTarget, false);
assert.equal(result[0]?.frequencyRank, undefined);
assert.equal(result[0]?.jlptLevel, undefined);
assert.equal(result[1]?.isKnown, false);
assert.equal(result[1]?.isNPlusOneTarget, false);
assert.equal(result[1]?.frequencyRank, undefined);
assert.equal(result[1]?.jlptLevel, undefined);
assert.equal(result[2]?.frequencyRank, 23);
});
@@ -856,6 +1077,219 @@ test('annotateTokens clears all annotations for kana-only non-independent noun h
assert.equal(result[0]?.jlptLevel, undefined);
});
test('annotateTokens clears all annotations for standalone して helper fragments', () => {
const tokens = [
makeToken({
surface: 'してる',
headword: 'する',
reading: 'シテル',
partOfSpeech: PartOfSpeech.verb,
pos1: '動詞|助動詞',
pos2: '自立|非自立',
startPos: 0,
endPos: 3,
frequencyRank: 22,
}),
];
const result = annotateTokens(
tokens,
makeDeps({
isKnownWord: (text) => text === 'する',
getJlptLevel: (text) => (text === 'する' ? 'N5' : null),
}),
{ minSentenceWordsForNPlusOne: 1 },
);
assert.equal(result[0]?.isKnown, false);
assert.equal(result[0]?.isNPlusOneTarget, false);
assert.equal(result[0]?.frequencyRank, undefined);
assert.equal(result[0]?.jlptLevel, undefined);
});
test('annotateTokens clears all annotations for standalone particle fragments without POS tags', () => {
const tokens = [
makeToken({
surface: 'と',
headword: 'と',
reading: 'ト',
partOfSpeech: PartOfSpeech.other,
pos1: '',
pos2: '',
startPos: 0,
endPos: 1,
frequencyRank: 4,
}),
];
const result = annotateTokens(
tokens,
makeDeps({
isKnownWord: (text) => text === 'と',
getJlptLevel: (text) => (text === 'と' ? 'N5' : null),
}),
{ minSentenceWordsForNPlusOne: 1 },
);
assert.equal(result[0]?.isKnown, false);
assert.equal(result[0]?.isNPlusOneTarget, false);
assert.equal(result[0]?.frequencyRank, undefined);
assert.equal(result[0]?.jlptLevel, undefined);
});
test('annotateTokens does not mark standalone connective particles as N+1', () => {
const tokens = [
makeToken({
surface: '逃げる',
headword: '逃げる',
reading: 'ニゲル',
partOfSpeech: PartOfSpeech.verb,
pos1: '動詞',
pos2: '自立',
startPos: 0,
endPos: 3,
}),
makeToken({
surface: 'たって',
headword: 'たって',
reading: 'タッテ',
partOfSpeech: PartOfSpeech.other,
pos1: '',
pos2: '',
startPos: 3,
endPos: 6,
frequencyRank: 28,
}),
makeToken({
surface: '無駄',
headword: '無駄',
reading: 'ムダ',
partOfSpeech: PartOfSpeech.noun,
pos1: '名詞',
pos2: '形容動詞語幹',
startPos: 6,
endPos: 8,
}),
];
const result = annotateTokens(
tokens,
makeDeps({
isKnownWord: (text) => text === '逃げる' || text === '無駄',
getJlptLevel: (text) => (text === 'たって' ? 'N3' : null),
}),
{ minSentenceWordsForNPlusOne: 1 },
);
assert.equal(result[1]?.isKnown, false);
assert.equal(result[1]?.isNPlusOneTarget, false);
assert.equal(result[1]?.frequencyRank, undefined);
assert.equal(result[1]?.jlptLevel, undefined);
});
test('annotateTokens clears all annotations for rhetorical もんか grammar particle phrases', () => {
const tokens = [
makeToken({
surface: 'もんか',
headword: 'もんか',
reading: 'モンカ',
partOfSpeech: PartOfSpeech.noun,
pos1: '名詞|助詞',
pos2: '非自立|副助詞/並立助詞/終助詞',
startPos: 0,
endPos: 3,
frequencyRank: 69629,
}),
];
const result = annotateTokens(
tokens,
makeDeps({
isKnownWord: (text) => text === 'もんか',
getJlptLevel: (text) => (text === 'もんか' ? 'N2' : null),
}),
{ minSentenceWordsForNPlusOne: 1 },
);
assert.equal(result[0]?.isKnown, false);
assert.equal(result[0]?.isNPlusOneTarget, false);
assert.equal(result[0]?.frequencyRank, undefined);
assert.equal(result[0]?.jlptLevel, undefined);
});
test('annotateTokens clears all annotations for bare くれ auxiliary fragments', () => {
const tokens = [
makeToken({
surface: 'くれ',
headword: '暮れ',
reading: 'クレ',
partOfSpeech: PartOfSpeech.noun,
pos1: '名詞',
pos2: '一般',
startPos: 0,
endPos: 2,
frequencyRank: 12877,
}),
];
const result = annotateTokens(
tokens,
makeDeps({
isKnownWord: (text) => text === '暮れ',
getJlptLevel: (text) => (text === '暮れ' ? 'N3' : null),
}),
{ minSentenceWordsForNPlusOne: 1 },
);
assert.equal(result[0]?.isKnown, false);
assert.equal(result[0]?.isNPlusOneTarget, false);
assert.equal(result[0]?.frequencyRank, undefined);
assert.equal(result[0]?.jlptLevel, undefined);
});
test('annotateTokens clears all annotations for standalone quote particle and auxiliary grammar terms', () => {
const tokens = [
makeToken({
surface: 'って',
headword: 'って',
reading: 'ッテ',
partOfSpeech: PartOfSpeech.other,
pos1: '',
pos2: '',
startPos: 0,
endPos: 2,
frequencyRank: 28,
}),
makeToken({
surface: 'べき',
headword: 'べき',
reading: 'ベキ',
partOfSpeech: PartOfSpeech.other,
pos1: '',
pos2: '',
startPos: 2,
endPos: 4,
frequencyRank: 268,
}),
];
const result = annotateTokens(
tokens,
makeDeps({
isKnownWord: (text) => text === 'って' || text === 'べき',
getJlptLevel: (text) => (text === 'って' || text === 'べき' ? 'N3' : null),
}),
{ minSentenceWordsForNPlusOne: 1 },
);
for (const token of result) {
assert.equal(token.isKnown, false, token.surface);
assert.equal(token.isNPlusOneTarget, false, token.surface);
assert.equal(token.frequencyRank, undefined, token.surface);
assert.equal(token.jlptLevel, undefined, token.surface);
}
});
test('annotateTokens clears all annotations from standalone あ interjections without POS tags', () => {
const tokens = [
makeToken({

View File

@@ -89,6 +89,7 @@ export interface AnnotationStageOptions {
minSentenceWordsForNPlusOne?: number;
pos1Exclusions?: ReadonlySet<string>;
pos2Exclusions?: ReadonlySet<string>;
sourceText?: string;
}
function resolveKnownWordText(
@@ -670,6 +671,36 @@ function computeTokenKnownStatus(
return normalizedReading !== matchText.trim() && isKnownWord(normalizedReading);
}
/**
 * Decides whether a token excluded from subtitle annotations should still keep
 * its known-word highlight. Only kanji-containing surfaces qualify; kana-only
 * fragments always return false. The surface is checked first, then a reading
 * that differs from the surface, then a headword that differs from the surface.
 */
function computeExcludedTokenKnownStatus(
  token: MergedToken,
  isKnownWord: (text: string) => boolean,
): boolean {
  const normalizedSurface = token.surface.trim();
  // Kana-only fragments never keep the highlight.
  if (!hasKanjiChar(normalizedSurface)) {
    return false;
  }
  if (normalizedSurface && isKnownWord(normalizedSurface)) {
    return true;
  }
  const normalizedReading = token.reading.trim();
  if (
    normalizedReading &&
    normalizedReading !== normalizedSurface &&
    isKnownWord(normalizedReading)
  ) {
    return true;
  }
  // Fix: the original required normalizedHeadword === normalizedSurface here,
  // which made this branch dead code — isKnownWord had already been called
  // with that exact string in the surface check above. Requiring a *different*
  // headword (e.g. a dictionary form of an inflected surface) mirrors the
  // reading fallback. NOTE(review): confirm this matches the intended
  // "exact known-word" semantics described by the surrounding tests.
  const normalizedHeadword = token.headword.trim();
  return (
    normalizedHeadword.length > 0 &&
    normalizedHeadword !== normalizedSurface &&
    isKnownWord(normalizedHeadword)
  );
}
function filterTokenFrequencyRank(
token: MergedToken,
pos1Exclusions: ReadonlySet<string>,
@@ -732,10 +763,16 @@ export function annotateTokens(
pos2Exclusions,
})
) {
return sharedStripSubtitleAnnotationMetadata(token, {
const strippedToken = sharedStripSubtitleAnnotationMetadata(token, {
pos1Exclusions,
pos2Exclusions,
});
return {
...strippedToken,
isKnown:
nPlusOneEnabled &&
computeExcludedTokenKnownStatus(token, deps.isKnownWord),
};
}
const prioritizedNameMatch = nameMatchEnabled && token.isNameMatch === true;
@@ -779,6 +816,7 @@ export function annotateTokens(
sanitizedMinSentenceWordsForNPlusOne,
pos1Exclusions,
pos2Exclusions,
options.sourceText,
);
if (!nameMatchEnabled) {

View File

@@ -303,7 +303,9 @@ function fillMissingPos1BySurfaceSequence(
let cursor = 0;
return tokens.map((token) => {
if (token.pos1 && token.pos1.trim().length > 0) {
const hasCompletePosMetadata =
token.pos1?.trim() && token.pos2?.trim() && token.pos3?.trim();
if (hasCompletePosMetadata) {
return token;
}
@@ -327,9 +329,9 @@ function fillMissingPos1BySurfaceSequence(
cursor = best.index + 1;
return {
...token,
pos1: best.pos1,
pos2: best.pos2,
pos3: best.pos3,
pos1: token.pos1 ?? best.pos1,
pos2: token.pos2 ?? best.pos2,
pos3: token.pos3 ?? best.pos3,
};
});
}
@@ -382,7 +384,7 @@ export function enrichTokensWithMecabPos1(
const metadataByTokenIndex = new Map<number, MecabPosMetadata>();
for (const [index, token] of tokens.entries()) {
if (token.pos1) {
if (token.pos1?.trim() && token.pos2?.trim() && token.pos3?.trim()) {
continue;
}
@@ -410,9 +412,9 @@ export function enrichTokensWithMecabPos1(
return {
...token,
pos1: metadata.pos1,
pos2: metadata.pos2,
pos3: metadata.pos3,
pos1: token.pos1 ?? metadata.pos1,
pos2: token.pos2 ?? metadata.pos2,
pos3: token.pos3 ?? metadata.pos3,
};
});

View File

@@ -19,11 +19,18 @@ const SUBTITLE_ANNOTATION_EXCLUDED_TERMS = new Set([
'ええ',
'うう',
'おお',
'くれ',
'たって',
'って',
'だって',
'はあ',
'はは',
'べき',
'へえ',
'ふう',
'ほう',
'もんか',
'ものか',
]);
const SUBTITLE_ANNOTATION_EXCLUDED_EXPLANATORY_ENDING_PREFIXES = ['ん', 'の', 'なん', 'なの'];
const SUBTITLE_ANNOTATION_EXCLUDED_EXPLANATORY_ENDING_CORES = [
@@ -72,6 +79,26 @@ const SUBTITLE_ANNOTATION_EXCLUDED_TRAILING_PARTICLE_SUFFIXES = new Set([
]);
const AUXILIARY_STEM_GRAMMAR_TAIL_POS1 = new Set(['名詞', '助動詞', '助詞']);
const NON_INDEPENDENT_NOUN_HELPER_TAIL_POS1 = new Set(['助詞', '助動詞']);
const STANDALONE_GRAMMAR_PARTICLE_SURFACES = new Set([
'か',
'が',
'さ',
'し',
'ぞ',
'ぜ',
'と',
'な',
'に',
'ね',
'の',
'は',
'へ',
'も',
'や',
'よ',
'を',
]);
const STANDALONE_GRAMMAR_PARTICLE_PHRASES = new Set(['たって', 'だって']);
export interface SubtitleAnnotationFilterOptions {
pos1Exclusions?: ReadonlySet<string>;
@@ -278,6 +305,38 @@ function isKanaOnlyNonIndependentNounHelperMerge(token: MergedToken): boolean {
return pos1Parts.slice(1).every((part) => NON_INDEPENDENT_NOUN_HELPER_TAIL_POS1.has(part));
}
// True when the kana-normalized text is non-empty and every code point is kana.
function isKanaOnlyText(text: string): boolean {
  const normalized = normalizeKana(text);
  if (normalized.length === 0) {
    return false;
  }
  for (const char of normalized) {
    if (!isKanaChar(char)) {
      return false;
    }
  }
  return true;
}
/**
 * Detects standalone して-family helper fragments: a kana-only surface that
 * starts with して, whose dictionary form is する, and whose pos1 tag parts
 * (when present) include 動詞.
 */
function isStandaloneSuruTeGrammarHelper(token: MergedToken): boolean {
  const surface = normalizeKana(token.surface);
  if (normalizeKana(token.headword) !== 'する' || !surface.startsWith('して')) {
    return false;
  }
  if (!isKanaOnlyText(surface)) {
    return false;
  }
  const pos1Parts = splitNormalizedTagParts(normalizePosTag(token.pos1));
  // Missing POS metadata still counts; otherwise 動詞 must be one of the parts.
  return pos1Parts.length === 0 || pos1Parts.includes('動詞');
}
/**
 * Detects standalone grammar particles: surface and headword normalize to the
 * same kana string, and that string is in either the single-character particle
 * set or the multi-character particle-phrase set.
 */
function isStandaloneGrammarParticle(token: MergedToken): boolean {
  const surface = normalizeKana(token.surface);
  if (surface !== normalizeKana(token.headword)) {
    return false;
  }
  return (
    STANDALONE_GRAMMAR_PARTICLE_SURFACES.has(surface) ||
    STANDALONE_GRAMMAR_PARTICLE_PHRASES.has(surface)
  );
}
// True when the kana-normalized surface is exactly one code point and that
// code point is kana. Array.from splits by code point, so surrogate-paired
// characters are counted correctly.
function isSingleKanaSurfaceFragment(token: MergedToken): boolean {
  const chars = Array.from(normalizeKana(token.surface));
  if (chars.length !== 1) {
    return false;
  }
  const [only] = chars;
  return only !== undefined && isKanaChar(only);
}
function isExcludedByTerm(token: MergedToken): boolean {
const candidates = [token.surface, token.reading, token.headword].filter(
(candidate): candidate is string => typeof candidate === 'string' && candidate.length > 0,
@@ -365,6 +424,18 @@ export function shouldExcludeTokenFromSubtitleAnnotations(
return true;
}
if (isStandaloneSuruTeGrammarHelper(token)) {
return true;
}
if (isStandaloneGrammarParticle(token)) {
return true;
}
if (isSingleKanaSurfaceFragment(token)) {
return true;
}
if (isExcludedTrailingParticleMergedToken(token)) {
return true;
}

View File

@@ -1049,6 +1049,60 @@ test('requestYomitanScanTokens marks grouped entries when SubMiner dictionary al
assert.equal((result as Array<{ isNameMatch?: boolean }>)[0]?.isNameMatch, true);
});
test('requestYomitanScanTokens preserves matched headword word classes', async () => {
let scannerScript = '';
const deps = createDeps(async (script) => {
if (script.includes('termsFind')) {
scannerScript = script;
return [];
}
if (script.includes('optionsGetFull')) {
return {
profileCurrent: 0,
profiles: [
{
options: {
scanning: { length: 40 },
},
},
],
};
}
return null;
});
await requestYomitanScanTokens('は', deps, { error: () => undefined });
const result = await runInjectedYomitanScript(scannerScript, (action, params) => {
if (action !== 'termsFind') {
throw new Error(`unexpected action: ${action}`);
}
const text = (params as { text?: string } | undefined)?.text;
if (text !== 'は') {
return { originalTextLength: 0, dictionaryEntries: [] };
}
return {
originalTextLength: 1,
dictionaryEntries: [
{
headwords: [
{
term: 'は',
reading: 'は',
wordClasses: ['prt'],
sources: [{ originalText: 'は', isPrimary: true, matchType: 'exact' }],
},
],
},
],
};
});
assert.deepEqual((result as Array<{ wordClasses?: string[] }>)[0]?.wordClasses, ['prt']);
});
test('requestYomitanScanTokens skips fallback fragments without exact primary source matches', async () => {
const deps = createDeps(async (script) => {
if (script.includes('optionsGetFull')) {

View File

@@ -53,6 +53,7 @@ export interface YomitanScanToken {
endPos: number;
isNameMatch?: boolean;
frequencyRank?: number;
wordClasses?: string[];
}
interface YomitanProfileMetadata {
@@ -91,7 +92,10 @@ function isScanTokenArray(value: unknown): value is YomitanScanToken[] {
typeof entry.startPos === 'number' &&
typeof entry.endPos === 'number' &&
(entry.isNameMatch === undefined || typeof entry.isNameMatch === 'boolean') &&
(entry.frequencyRank === undefined || typeof entry.frequencyRank === 'number'),
(entry.frequencyRank === undefined || typeof entry.frequencyRank === 'number') &&
(entry.wordClasses === undefined ||
(Array.isArray(entry.wordClasses) &&
entry.wordClasses.every((wordClass) => typeof wordClass === 'string'))),
)
);
}
@@ -975,6 +979,11 @@ const YOMITAN_SCANNING_HELPERS = String.raw`
return best;
}
function getPreferredHeadword(dictionaryEntries, token, dictionaryPriorityByName, dictionaryFrequencyModeByName) {
function normalizeWordClasses(headword) {
if (!Array.isArray(headword?.wordClasses)) { return undefined; }
const classes = headword.wordClasses.filter((wordClass) => typeof wordClass === "string" && wordClass.trim().length > 0);
return classes.length > 0 ? classes : undefined;
}
function appendDictionaryNames(target, value) {
if (!value || typeof value !== 'object') {
return;
@@ -1033,6 +1042,7 @@ const YOMITAN_SCANNING_HELPERS = String.raw`
return {
term: preferredMatch.headword.term,
reading: preferredMatch.headword.reading,
wordClasses: normalizeWordClasses(preferredMatch.headword),
isNameMatch: matchedNameDictionary || isNameDictionaryEntry(preferredMatch.dictionaryEntry),
frequencyRank: getBestFrequencyRankForMatches(
exactFrequencyMatches.length > 0 ? exactFrequencyMatches : exactPrimaryMatches,
@@ -1099,7 +1109,7 @@ ${YOMITAN_SCANNING_HELPERS}
if (preferredHeadword && typeof preferredHeadword.term === "string") {
const reading = typeof preferredHeadword.reading === "string" ? preferredHeadword.reading : "";
const segments = distributeFuriganaInflected(preferredHeadword.term, reading, source);
tokens.push({
const tokenPayload = {
surface: segments.map((segment) => segment.text).join("") || source,
reading: segments.map((segment) => typeof segment.reading === "string" ? segment.reading : "").join(""),
headword: preferredHeadword.term,
@@ -1110,7 +1120,11 @@ ${YOMITAN_SCANNING_HELPERS}
typeof preferredHeadword.frequencyRank === "number" && Number.isFinite(preferredHeadword.frequencyRank)
? Math.max(1, Math.floor(preferredHeadword.frequencyRank))
: undefined,
});
};
if (Array.isArray(preferredHeadword.wordClasses) && preferredHeadword.wordClasses.length > 0) {
tokenPayload.wordClasses = preferredHeadword.wordClasses;
}
tokens.push(tokenPayload);
i += originalTextLength;
continue;
}

View File

@@ -347,11 +347,25 @@ function isSentenceBoundaryToken(token: MergedToken): boolean {
return SENTENCE_BOUNDARY_SURFACES.has(token.surface);
}
/**
 * Reports whether the source text between the previous token's end offset and
 * the next token's start offset contains any sentence-boundary character
 * (punctuation that Yomitan omits from its token stream). Returns false when
 * there is no source text, no previous token, or no positive-width gap.
 */
function hasSentenceBoundaryInSourceGap(
  sourceText: string | undefined,
  previousEnd: number | null,
  nextStart: number,
): boolean {
  if (typeof sourceText !== 'string') {
    return false;
  }
  if (previousEnd === null || nextStart <= previousEnd) {
    return false;
  }
  // Iterate by code point, matching the original spread-based scan.
  for (const char of sourceText.slice(previousEnd, nextStart)) {
    if (SENTENCE_BOUNDARY_SURFACES.has(char)) {
      return true;
    }
  }
  return false;
}
export function markNPlusOneTargets(
tokens: MergedToken[],
minSentenceWords = 3,
pos1Exclusions: ReadonlySet<string> = N_PLUS_ONE_IGNORED_POS1,
pos2Exclusions: ReadonlySet<string> = N_PLUS_ONE_IGNORED_POS2,
sourceText?: string,
): MergedToken[] {
if (tokens.length === 0) {
return [];
@@ -363,6 +377,7 @@ export function markNPlusOneTargets(
}));
let sentenceStart = 0;
let previousTokenEnd: number | null = null;
const minimumSentenceWords = Number.isInteger(minSentenceWords)
? Math.max(1, minSentenceWords)
: 3;
@@ -393,10 +408,15 @@ export function markNPlusOneTargets(
for (let i = 0; i < markedTokens.length; i++) {
const token = markedTokens[i];
if (!token) continue;
if (hasSentenceBoundaryInSourceGap(sourceText, previousTokenEnd, token.startPos)) {
markSentence(sentenceStart, i);
sentenceStart = i;
}
if (isSentenceBoundaryToken(token)) {
markSentence(sentenceStart, i);
sentenceStart = i + 1;
}
previousTokenEnd = token.endPos;
}
if (sentenceStart < markedTokens.length) {