fix(subtitle): unify annotation token filtering

2026-06-16 15:13:31 -07:00 · 2026-03-19 23:48:38 -07:00
parent 4a01cebca6
commit 42028d0a4d
5 changed files with 527 additions and 29 deletions
@@ -3628,6 +3628,119 @@ test('tokenizeSubtitle excludes merged function/content token from frequency hig
  assert.equal(result.tokens?.[0]?.isNPlusOneTarget, true);
 });

+// End-to-end check of the shared annotation gate through tokenizeSubtitle: the Yomitan token
+// 'これで' (headword 'これ') and the trailing 'か' must come back with every annotation field
+// cleared, while the middle token '実力どおり' keeps its frequency, JLPT level and N+1 target flag.
+test('tokenizeSubtitle clears all annotations for kana-only demonstrative helper merges', async () => {
+  const result = await tokenizeSubtitle(
+    'これで実力どおりか',
+    makeDepsFromYomitanTokens(
+      [
+        { surface: 'これで', reading: 'これで', headword: 'これ' },
+        { surface: '実力どおり', reading: 'じつりょくどおり', headword: '実力どおり' },
+        { surface: 'か', reading: 'か', headword: 'か' },
+      ],
+      {
+        // Every lookup below has a hit for 'これ' and 'か', so the expected undefined/false
+        // values can only come from those tokens being excluded, not from missing data.
+        getFrequencyDictionaryEnabled: () => true,
+        getFrequencyRank: (text) =>
+          text === 'これ' ? 9 : text === '実力どおり' ? 2500 : text === 'か' ? 800 : null,
+        getJlptLevel: (text) =>
+          text === 'これ' ? 'N5' : text === '実力どおり' ? 'N1' : text === 'か' ? 'N5' : null,
+        isKnownWord: (text) => text === 'これ',
+        getMinSentenceWordsForNPlusOne: () => 1,
+        // MeCab output splits the first Yomitan token into 'これ' (名詞/代名詞) and 'で' (助詞/格助詞);
+        // the offsets cover the same nine characters as the Yomitan tokens above.
+        tokenizeWithMecab: async () => [
+          {
+            headword: 'これ',
+            surface: 'これ',
+            reading: 'コレ',
+            startPos: 0,
+            endPos: 2,
+            partOfSpeech: PartOfSpeech.noun,
+            pos1: '名詞',
+            pos2: '代名詞',
+            isMerged: false,
+            isKnown: false,
+            isNPlusOneTarget: false,
+          },
+          {
+            headword: 'で',
+            surface: 'で',
+            reading: 'デ',
+            startPos: 2,
+            endPos: 3,
+            partOfSpeech: PartOfSpeech.particle,
+            pos1: '助詞',
+            pos2: '格助詞',
+            isMerged: false,
+            isKnown: false,
+            isNPlusOneTarget: false,
+          },
+          {
+            headword: '実力どおり',
+            surface: '実力どおり',
+            reading: 'ジツリョクドオリ',
+            startPos: 3,
+            endPos: 8,
+            partOfSpeech: PartOfSpeech.noun,
+            pos1: '名詞',
+            pos2: '一般',
+            isMerged: false,
+            isKnown: false,
+            isNPlusOneTarget: false,
+          },
+          {
+            headword: 'か',
+            surface: 'か',
+            reading: 'カ',
+            startPos: 8,
+            endPos: 9,
+            partOfSpeech: PartOfSpeech.particle,
+            pos1: '助詞',
+            pos2: '終助詞',
+            isMerged: false,
+            isKnown: false,
+            isNPlusOneTarget: false,
+          },
+        ],
+      },
+    ),
+  );
+
+  // Compare only the annotation-related fields so unrelated token metadata cannot mask a failure.
+  assert.deepEqual(
+    result.tokens?.map((token) => ({
+      surface: token.surface,
+      headword: token.headword,
+      isKnown: token.isKnown,
+      isNPlusOneTarget: token.isNPlusOneTarget,
+      frequencyRank: token.frequencyRank,
+      jlptLevel: token.jlptLevel,
+    })),
+    [
+      {
+        // isKnownWord('これ') is true above, yet the excluded token must report isKnown: false.
+        surface: 'これで',
+        headword: 'これ',
+        isKnown: false,
+        isNPlusOneTarget: false,
+        frequencyRank: undefined,
+        jlptLevel: undefined,
+      },
+      {
+        // The only token left eligible, so it is the N+1 target.
+        surface: '実力どおり',
+        headword: '実力どおり',
+        isKnown: false,
+        isNPlusOneTarget: true,
+        frequencyRank: 2500,
+        jlptLevel: 'N1',
+      },
+      {
+        surface: 'か',
+        headword: 'か',
+        isKnown: false,
+        isNPlusOneTarget: false,
+        frequencyRank: undefined,
+        jlptLevel: undefined,
+      },
+    ],
+  );
+});
+
 test('tokenizeSubtitle keeps frequency for content-led merged token with trailing colloquial suffixes', async () => {
  const result = await tokenizeSubtitle(
    '張り切ってんじゃ',
@@ -316,6 +316,19 @@ test('shouldExcludeTokenFromSubtitleAnnotations excludes merged lexical tokens w
  assert.equal(shouldExcludeTokenFromSubtitleAnnotations(token), true);
 });

+test('shouldExcludeTokenFromSubtitleAnnotations excludes kana-only demonstrative helper merges', () => {
+  // Merged token: headword 'これ' followed by 'で', tagged 名詞|助詞 and 代名詞|格助詞.
+  const mergedDemonstrative = makeToken({
+    surface: 'これで',
+    headword: 'これ',
+    reading: 'コレデ',
+    partOfSpeech: PartOfSpeech.noun,
+    pos1: '名詞|助詞',
+    pos2: '代名詞|格助詞',
+  });
+
+  const excluded = shouldExcludeTokenFromSubtitleAnnotations(mergedDemonstrative);
+
+  assert.equal(excluded, true);
+});
+
 test('stripSubtitleAnnotationMetadata keeps token hover data while clearing annotation fields', () => {
  const token = makeToken({
    surface: 'は',
@@ -481,8 +494,8 @@ test('annotateTokens N+1 minimum sentence words counts only eligible word tokens
  );

  assert.equal(result[0]?.isKnown, false);
-  assert.equal(result[1]?.isKnown, true);
-  assert.equal(result[2]?.isKnown, true);
+  assert.equal(result[1]?.isKnown, false);
+  assert.equal(result[2]?.isKnown, false);
  assert.equal(result[0]?.isNPlusOneTarget, false);
 });

@@ -568,7 +581,7 @@ test('annotateTokens excludes default non-independent pos2 from frequency and N+
  assert.equal(result[0]?.isNPlusOneTarget, false);
 });

-test('annotateTokens keeps frequency for kanji noun tokens even when mecab marks them non-independent', () => {
+test('annotateTokens clears all annotations for non-independent kanji noun tokens under unified gate', () => {
  const tokens = [
    makeToken({
      surface: '者',
@@ -588,7 +601,10 @@ test('annotateTokens keeps frequency for kanji noun tokens even when mecab marks
    minSentenceWordsForNPlusOne: 1,
  });

-  assert.equal(result[0]?.frequencyRank, 475);
+  assert.equal(result[0]?.isKnown, false);
+  assert.equal(result[0]?.isNPlusOneTarget, false);
+  assert.equal(result[0]?.frequencyRank, undefined);
+  assert.equal(result[0]?.jlptLevel, undefined);
 });

 test('annotateTokens excludes likely kana SFX tokens from frequency when POS tags are missing', () => {
@@ -742,3 +758,33 @@ test('annotateTokens excludes composite tokens when all component pos tags are e
  assert.equal(result[0]?.frequencyRank, undefined);
  assert.equal(result[0]?.isNPlusOneTarget, false);
 });
+
+test('annotateTokens applies one shared exclusion gate across known N+1 frequency and JLPT', () => {
+  // The token arrives with a frequency rank, and the deps below would mark 'これ' as known and
+  // N5, so every cleared field asserted at the end is the result of the token being excluded.
+  const mergedDemonstrative = makeToken({
+    surface: 'これで',
+    headword: 'これ',
+    reading: 'コレデ',
+    partOfSpeech: PartOfSpeech.noun,
+    pos1: '名詞|助詞',
+    pos2: '代名詞|格助詞',
+    startPos: 0,
+    endPos: 3,
+    frequencyRank: 9,
+  });
+  const deps = makeDeps({
+    isKnownWord: (text) => text === 'これ',
+    getJlptLevel: (text) => (text === 'これ' ? 'N5' : null),
+  });
+
+  const [annotated] = annotateTokens([mergedDemonstrative], deps, {
+    minSentenceWordsForNPlusOne: 1,
+  });
+
+  assert.equal(annotated?.isKnown, false);
+  assert.equal(annotated?.isNPlusOneTarget, false);
+  assert.equal(annotated?.frequencyRank, undefined);
+  assert.equal(annotated?.jlptLevel, undefined);
+});
@@ -9,6 +9,10 @@ import {
 } from '../../../token-pos2-exclusions';
 import { JlptLevel, MergedToken, NPlusOneMatchMode, PartOfSpeech } from '../../../types';
 import { shouldIgnoreJlptByTerm, shouldIgnoreJlptForMecabPos1 } from '../jlpt-token-filter';
+import {
+  shouldExcludeTokenFromSubtitleAnnotations as sharedShouldExcludeTokenFromSubtitleAnnotations,
+  stripSubtitleAnnotationMetadata as sharedStripSubtitleAnnotationMetadata,
+} from './subtitle-annotation-filter';

 const KATAKANA_TO_HIRAGANA_OFFSET = 0x60;
 const KATAKANA_CODEPOINT_START = 0x30a1;
@@ -633,34 +637,11 @@ function isExcludedFromSubtitleAnnotationsByTerm(token: MergedToken): boolean {
 }

+/**
+ * Thin wrapper around the shared gate imported from './subtitle-annotation-filter'.
+ *
+ * No options are forwarded, so the shared gate falls back to its default pos1/pos2
+ * exclusion sets.
+ *
+ * @param token - Merged token to test.
+ * @returns true when the shared gate excludes the token from subtitle annotations.
+ */
 export function shouldExcludeTokenFromSubtitleAnnotations(token: MergedToken): boolean {
-  if (isExcludedFromSubtitleAnnotationsByPos1(normalizePos1Tag(token.pos1))) {
-    return true;
-  }
-
-  if (isAuxiliaryStemGrammarTailToken(token)) {
-    return true;
-  }
-
-  if (isExcludedTrailingParticleMergedToken(token)) {
-    return true;
-  }
-
-  return isExcludedFromSubtitleAnnotationsByTerm(token);
+  return sharedShouldExcludeTokenFromSubtitleAnnotations(token);
 }

+/**
+ * Thin wrapper around the shared implementation imported from './subtitle-annotation-filter'.
+ *
+ * No options are forwarded, so the shared implementation decides exclusion with its default
+ * pos1/pos2 exclusion sets.
+ *
+ * @param token - Merged token whose annotation fields may need clearing.
+ * @returns whatever the shared implementation returns for the token.
+ */
 export function stripSubtitleAnnotationMetadata(token: MergedToken): MergedToken {
-  if (!shouldExcludeTokenFromSubtitleAnnotations(token)) {
-    return token;
-  }
-
-  return {
-    ...token,
-    isKnown: false,
-    isNPlusOneTarget: false,
-    isNameMatch: false,
-    jlptLevel: undefined,
-    frequencyRank: undefined,
-  };
+  return sharedStripSubtitleAnnotationMetadata(token);
 }

 function computeTokenKnownStatus(
@@ -737,6 +718,18 @@ export function annotateTokens(

  // Single pass: compute known word status, frequency filtering, and JLPT level together
  const annotated = tokens.map((token) => {
+    if (
+      sharedShouldExcludeTokenFromSubtitleAnnotations(token, {
+        pos1Exclusions,
+        pos2Exclusions,
+      })
+    ) {
+      return sharedStripSubtitleAnnotationMetadata(token, {
+        pos1Exclusions,
+        pos2Exclusions,
+      });
+    }
+
    const prioritizedNameMatch = nameMatchEnabled && token.isNameMatch === true;
    const isKnown = nPlusOneEnabled
      ? computeTokenKnownStatus(token, deps.isKnownWord, deps.knownWordMatchMode)
@@ -0,0 +1,341 @@
+import {
+  DEFAULT_ANNOTATION_POS1_EXCLUSION_CONFIG,
+  resolveAnnotationPos1ExclusionSet,
+} from '../../../token-pos1-exclusions';
+import {
+  DEFAULT_ANNOTATION_POS2_EXCLUSION_CONFIG,
+  resolveAnnotationPos2ExclusionSet,
+} from '../../../token-pos2-exclusions';
+import { MergedToken, PartOfSpeech } from '../../../types';
+import { shouldIgnoreJlptByTerm } from '../jlpt-token-filter';
+
+// Katakana -> hiragana folding used by normalizeKana: code points inside
+// [KATAKANA_CODEPOINT_START, KATAKANA_CODEPOINT_END] are shifted down by this offset.
+const KATAKANA_TO_HIRAGANA_OFFSET = 0x60;
+const KATAKANA_CODEPOINT_START = 0x30a1;
+const KATAKANA_CODEPOINT_END = 0x30f6;
+
+// Exact-match terms rejected by isExcludedByTerm, which compares them against both the trimmed
+// and the kana-normalized surface, reading and headword of a token.
+const SUBTITLE_ANNOTATION_EXCLUDED_TERMS = new Set([
+  'ああ',
+  'ええ',
+  'うう',
+  'おお',
+  'はあ',
+  'はは',
+  'へえ',
+  'ふう',
+  'ほう',
+]);
+// Building blocks for SUBTITLE_ANNOTATION_EXCLUDED_EXPLANATORY_ENDINGS below: an ending is
+// one prefix, followed by one core, followed by one trailing particle.
+const SUBTITLE_ANNOTATION_EXCLUDED_EXPLANATORY_ENDING_PREFIXES = ['ん', 'の', 'なん', 'なの'];
+const SUBTITLE_ANNOTATION_EXCLUDED_EXPLANATORY_ENDING_CORES = [
+  'だ',
+  'です',
+  'でした',
+  'だった',
+  'では',
+  'じゃ',
+  'でしょう',
+  'だろう',
+] as const;
+// The empty string stands for "no trailing particle".
+const SUBTITLE_ANNOTATION_EXCLUDED_EXPLANATORY_ENDING_TRAILING_PARTICLES = [
+  '',
+  'か',
+  'ね',
+  'よ',
+  'な',
+  'よね',
+  'かな',
+  'かね',
+] as const;
+// Full cross product of the three lists above (4 prefixes x 8 cores x 8 particles),
+// e.g. 'んだ', 'のです', 'なんだろうね'. Looked up by isExcludedByTerm.
+const SUBTITLE_ANNOTATION_EXCLUDED_EXPLANATORY_ENDINGS = new Set(
+  SUBTITLE_ANNOTATION_EXCLUDED_EXPLANATORY_ENDING_PREFIXES.flatMap((prefix) =>
+    SUBTITLE_ANNOTATION_EXCLUDED_EXPLANATORY_ENDING_CORES.flatMap((core) =>
+      SUBTITLE_ANNOTATION_EXCLUDED_EXPLANATORY_ENDING_TRAILING_PARTICLES.map(
+        (particle) => `${prefix}${core}${particle}`,
+      ),
+    ),
+  ),
+);
+// Suffixes that isExcludedTrailingParticleMergedToken accepts as the part of a token's surface
+// that follows its headword (both compared after kana normalization).
+const SUBTITLE_ANNOTATION_EXCLUDED_TRAILING_PARTICLE_SUFFIXES = new Set([
+  'って',
+  'ってよ',
+  'ってね',
+  'ってな',
+  'ってさ',
+  'ってか',
+  'ってば',
+]);
+// isAuxiliaryStemGrammarTailToken requires every pos1 part of a token to be one of these.
+const AUXILIARY_STEM_GRAMMAR_TAIL_POS1 = new Set(['名詞', '助動詞', '助詞']);
+
+/**
+ * Optional overrides for the exclusion sets consulted by the subtitle annotation gate.
+ *
+ * When a set is omitted, the gate resolves it from the corresponding
+ * DEFAULT_ANNOTATION_POS*_EXCLUSION_CONFIG instead.
+ */
+export interface SubtitleAnnotationFilterOptions {
+  /** pos1 tags to exclude; a token is excluded when every '|'-separated pos1 part is in the set. */
+  pos1Exclusions?: ReadonlySet<string>;
+  /** pos2 tags to exclude; a token is excluded when every '|'-separated pos2 part is in the set. */
+  pos2Exclusions?: ReadonlySet<string>;
+}
+
+/** Returns the POS tag with surrounding whitespace removed, or '' when it is not a string. */
+function normalizePosTag(pos: string | undefined): string {
+  if (typeof pos !== 'string') {
+    return '';
+  }
+
+  return pos.trim();
+}
+
+/**
+ * Splits a '|'-separated POS tag into its trimmed, non-empty parts.
+ *
+ * @param normalizedTag - Tag string, possibly empty.
+ * @returns the parts in their original order; an empty array for an empty tag.
+ */
+function splitNormalizedTagParts(normalizedTag: string): string[] {
+  const parts: string[] = [];
+  if (!normalizedTag) {
+    return parts;
+  }
+
+  for (const rawPart of normalizedTag.split('|')) {
+    const part = rawPart.trim();
+    if (part.length > 0) {
+      parts.push(part);
+    }
+  }
+
+  return parts;
+}
+
+/**
+ * True when the tag has at least one part and every part is a member of `exclusions`.
+ * A tag with no parts is never excluded.
+ */
+function isExcludedByTagSet(normalizedTag: string, exclusions: ReadonlySet<string>): boolean {
+  const tagParts = splitNormalizedTagParts(normalizedTag);
+
+  return tagParts.length > 0 && tagParts.every((tagPart) => exclusions.has(tagPart));
+}
+
+/**
+ * Picks the pos1 exclusion set to use: the caller-supplied one when present, otherwise the set
+ * resolved from DEFAULT_ANNOTATION_POS1_EXCLUSION_CONFIG.
+ */
+function resolvePos1Exclusions(
+  options: SubtitleAnnotationFilterOptions = {},
+): ReadonlySet<string> {
+  return (
+    options.pos1Exclusions ??
+    resolveAnnotationPos1ExclusionSet(DEFAULT_ANNOTATION_POS1_EXCLUSION_CONFIG)
+  );
+}
+
+/**
+ * Picks the pos2 exclusion set to use: the caller-supplied one when present, otherwise the set
+ * resolved from DEFAULT_ANNOTATION_POS2_EXCLUSION_CONFIG.
+ */
+function resolvePos2Exclusions(
+  options: SubtitleAnnotationFilterOptions = {},
+): ReadonlySet<string> {
+  return (
+    options.pos2Exclusions ??
+    resolveAnnotationPos2ExclusionSet(DEFAULT_ANNOTATION_POS2_EXCLUSION_CONFIG)
+  );
+}
+
+/**
+ * Trims the text and folds katakana code points in
+ * [KATAKANA_CODEPOINT_START, KATAKANA_CODEPOINT_END] down to hiragana; every other
+ * character is kept as-is.
+ *
+ * @returns the folded text, or '' when the trimmed input is empty.
+ */
+function normalizeKana(text: string): string {
+  const foldChar = (char: string): string => {
+    const codePoint = char.codePointAt(0);
+    if (codePoint === undefined) {
+      return '';
+    }
+
+    const isFoldableKatakana =
+      codePoint >= KATAKANA_CODEPOINT_START && codePoint <= KATAKANA_CODEPOINT_END;
+    return isFoldableKatakana
+      ? String.fromCodePoint(codePoint - KATAKANA_TO_HIRAGANA_OFFSET)
+      : char;
+  };
+
+  // Array.from walks the string by code point, exactly like for...of.
+  return Array.from(text.trim(), foldChar).join('');
+}
+
+// Inclusive code point ranges accepted by isKanaChar. U+30FB is deliberately left out:
+// the katakana ranges stop at U+30FA and resume at U+30FC.
+const KANA_CODEPOINT_RANGES: ReadonlyArray<readonly [number, number]> = [
+  [0x3041, 0x3096],
+  [0x309b, 0x309f],
+  [0x30fc, 0x30fc],
+  [0x30a0, 0x30fa],
+  [0x30fd, 0x30ff],
+];
+
+/** True when the first code point of `char` falls inside one of KANA_CODEPOINT_RANGES. */
+function isKanaChar(char: string): boolean {
+  const code = char.codePointAt(0);
+  if (code === undefined) {
+    return false;
+  }
+
+  return KANA_CODEPOINT_RANGES.some(([low, high]) => code >= low && code <= high);
+}
+
+/**
+ * True when the kana-normalized text is two to four kana characters long and ends in 'っ'.
+ */
+function isTrailingSmallTsuKanaSfx(text: string): boolean {
+  const chars = [...normalizeKana(text)];
+  const hasAllowedLength = chars.length >= 2 && chars.length <= 4;
+
+  return (
+    hasAllowedLength &&
+    chars[chars.length - 1] === 'っ' &&
+    chars.every((char) => isKanaChar(char))
+  );
+}
+
+/**
+ * True when the kana-normalized text is all kana, has an even length of at least four
+ * characters, and its second half repeats its first half.
+ */
+function isReduplicatedKanaSfx(text: string): boolean {
+  const chars = [...normalizeKana(text)];
+  const count = chars.length;
+  if (count < 4 || count % 2 !== 0) {
+    return false;
+  }
+
+  if (chars.some((char) => !isKanaChar(char))) {
+    return false;
+  }
+
+  // Compare the halves position by position.
+  const half = count / 2;
+  for (let index = 0; index < half; index += 1) {
+    if (chars[index] !== chars[index + half]) {
+      return false;
+    }
+  }
+
+  return true;
+}
+
+/**
+ * True when the kana-normalized text is a reduplicated kana sequence, either on its own or
+ * followed by a single trailing 'と'.
+ */
+function isReduplicatedKanaSfxWithOptionalTrailingTo(text: string): boolean {
+  const normalized = normalizeKana(text);
+  if (isReduplicatedKanaSfx(normalized)) {
+    return true;
+  }
+
+  const hasTrailingTo = normalized.length > 1 && normalized.endsWith('と');
+  return hasTrailingTo && isReduplicatedKanaSfx(normalized.slice(0, -1));
+}
+
+/**
+ * Detects a merged token whose surface is its headword followed by one of the
+ * SUBTITLE_ANNOTATION_EXCLUDED_TRAILING_PARTICLE_SUFFIXES (both compared after kana
+ * normalization).
+ *
+ * The token only qualifies when its pos1 has at least two parts, the leading part is not in
+ * `pos1Exclusions`, and every remaining part is '助詞'.
+ *
+ * @param token - Merged token to inspect.
+ * @param pos1Exclusions - Active pos1 exclusion set. Defaults to the set resolved from the
+ *   default config, so single-argument calls behave as before.
+ */
+function isExcludedTrailingParticleMergedToken(
+  token: MergedToken,
+  pos1Exclusions: ReadonlySet<string> = resolvePos1Exclusions(),
+): boolean {
+  const normalizedSurface = normalizeKana(token.surface);
+  const normalizedHeadword = normalizeKana(token.headword);
+  if (!normalizedSurface || !normalizedHeadword || !normalizedSurface.startsWith(normalizedHeadword)) {
+    return false;
+  }
+
+  const suffix = normalizedSurface.slice(normalizedHeadword.length);
+  if (!SUBTITLE_ANNOTATION_EXCLUDED_TRAILING_PARTICLE_SUFFIXES.has(suffix)) {
+    return false;
+  }
+
+  const [leadingPos1, ...trailingPos1] = splitNormalizedTagParts(normalizePosTag(token.pos1));
+  // The leading part must exist and must not itself be an excluded POS under the active set.
+  if (!leadingPos1 || pos1Exclusions.has(leadingPos1)) {
+    return false;
+  }
+
+  // An empty tail means pos1 had fewer than two parts, which never qualifies.
+  return trailingPos1.length > 0 && trailingPos1.every((part) => part === '助詞');
+}
+
+/**
+ * True when every pos1 part of the token is in AUXILIARY_STEM_GRAMMAR_TAIL_POS1 and at least
+ * one pos3 part is '助動詞語幹'. A token without pos1 parts never matches.
+ */
+function isAuxiliaryStemGrammarTailToken(token: MergedToken): boolean {
+  const pos1Parts = splitNormalizedTagParts(normalizePosTag(token.pos1));
+  if (pos1Parts.length === 0 || !pos1Parts.every((part) => AUXILIARY_STEM_GRAMMAR_TAIL_POS1.has(part))) {
+    return false;
+  }
+
+  const pos3Parts = splitNormalizedTagParts(normalizePosTag(token.pos3));
+  return pos3Parts.includes('助動詞語幹');
+}
+
+/**
+ * Term-based exclusion: inspects the token's surface, reading and headword (each in trimmed
+ * and kana-normalized form) and returns true as soon as one of them
+ *   - is in SUBTITLE_ANNOTATION_EXCLUDED_TERMS or SUBTITLE_ANNOTATION_EXCLUDED_EXPLANATORY_ENDINGS,
+ *   - is rejected by shouldIgnoreJlptByTerm, or
+ *   - has the shape checked by isTrailingSmallTsuKanaSfx or
+ *     isReduplicatedKanaSfxWithOptionalTrailingTo.
+ */
+function isExcludedByTerm(token: MergedToken): boolean {
+  const candidates = [token.surface, token.reading, token.headword].filter(
+    (candidate): candidate is string => typeof candidate === 'string' && candidate.length > 0,
+  );
+
+  for (const candidate of candidates) {
+    const trimmed = candidate.trim();
+    if (!trimmed) {
+      continue;
+    }
+
+    const normalized = normalizeKana(trimmed);
+    if (!normalized) {
+      continue;
+    }
+
+    if (
+      SUBTITLE_ANNOTATION_EXCLUDED_TERMS.has(trimmed) ||
+      SUBTITLE_ANNOTATION_EXCLUDED_TERMS.has(normalized) ||
+      SUBTITLE_ANNOTATION_EXCLUDED_EXPLANATORY_ENDINGS.has(trimmed) ||
+      SUBTITLE_ANNOTATION_EXCLUDED_EXPLANATORY_ENDINGS.has(normalized) ||
+      shouldIgnoreJlptByTerm(trimmed) ||
+      shouldIgnoreJlptByTerm(normalized)
+    ) {
+      return true;
+    }
+
+    if (
+      isTrailingSmallTsuKanaSfx(trimmed) ||
+      isTrailingSmallTsuKanaSfx(normalized) ||
+      isReduplicatedKanaSfxWithOptionalTrailingTo(trimmed) ||
+      isReduplicatedKanaSfxWithOptionalTrailingTo(normalized)
+    ) {
+      return true;
+    }
+  }
+
+  return false;
+}
+
+/**
+ * Shared gate deciding whether a token must carry no subtitle annotations.
+ *
+ * Checks run in this order and the first hit wins:
+ *   1. every pos1 part is in the pos1 exclusion set;
+ *   2. every pos2 part is in the pos2 exclusion set;
+ *   3. pos1 and pos2 are both empty and partOfSpeech is particle, bound_auxiliary or symbol;
+ *   4. isAuxiliaryStemGrammarTailToken;
+ *   5. isExcludedTrailingParticleMergedToken, evaluated against the same pos1 exclusion set
+ *      as step 1 so a caller-supplied set governs the whole gate;
+ *   6. isExcludedByTerm.
+ *
+ * @param token - Merged token to test.
+ * @param options - Optional exclusion-set overrides; omitted sets fall back to the defaults.
+ * @returns true when the token should be excluded from annotations.
+ */
+export function shouldExcludeTokenFromSubtitleAnnotations(
+  token: MergedToken,
+  options: SubtitleAnnotationFilterOptions = {},
+): boolean {
+  const pos1Exclusions = resolvePos1Exclusions(options);
+  const pos2Exclusions = resolvePos2Exclusions(options);
+  const normalizedPos1 = normalizePosTag(token.pos1);
+  const normalizedPos2 = normalizePosTag(token.pos2);
+  const hasPos1 = normalizedPos1.length > 0;
+  const hasPos2 = normalizedPos2.length > 0;
+
+  if (isExcludedByTagSet(normalizedPos1, pos1Exclusions)) {
+    return true;
+  }
+
+  if (isExcludedByTagSet(normalizedPos2, pos2Exclusions)) {
+    return true;
+  }
+
+  // Without any POS tags, fall back to the coarse partOfSpeech classification.
+  if (
+    !hasPos1 &&
+    !hasPos2 &&
+    (token.partOfSpeech === PartOfSpeech.particle ||
+      token.partOfSpeech === PartOfSpeech.bound_auxiliary ||
+      token.partOfSpeech === PartOfSpeech.symbol)
+  ) {
+    return true;
+  }
+
+  if (isAuxiliaryStemGrammarTailToken(token)) {
+    return true;
+  }
+
+  // Reuse the already-resolved set: this honours caller overrides and avoids rebuilding the
+  // default set for every token.
+  if (isExcludedTrailingParticleMergedToken(token, pos1Exclusions)) {
+    return true;
+  }
+
+  return isExcludedByTerm(token);
+}
+
+/**
+ * Clears the annotation fields of a token that the shared gate excludes.
+ *
+ * @param token - Merged token to process.
+ * @param options - Optional exclusion-set overrides forwarded to the gate.
+ * @returns the same token object when it is not excluded; otherwise a copy with isKnown,
+ *   isNPlusOneTarget and isNameMatch set to false and jlptLevel and frequencyRank set to
+ *   undefined. All other fields are copied unchanged.
+ */
+export function stripSubtitleAnnotationMetadata(
+  token: MergedToken,
+  options: SubtitleAnnotationFilterOptions = {},
+): MergedToken {
+  const excluded = shouldExcludeTokenFromSubtitleAnnotations(token, options);
+
+  return excluded
+    ? {
+        ...token,
+        isKnown: false,
+        isNPlusOneTarget: false,
+        isNameMatch: false,
+        jlptLevel: undefined,
+        frequencyRank: undefined,
+      }
+    : token;
+}
@@ -19,6 +19,7 @@
 import { PartOfSpeech, Token, MergedToken } from './types';
 import { DEFAULT_ANNOTATION_POS1_EXCLUSION_CONFIG } from './token-pos1-exclusions';
 import { DEFAULT_ANNOTATION_POS2_EXCLUSION_CONFIG } from './token-pos2-exclusions';
+import { shouldExcludeTokenFromSubtitleAnnotations } from './core/services/tokenizer/subtitle-annotation-filter';

 export function isNoun(tok: Token): boolean {
  return tok.partOfSpeech === PartOfSpeech.noun;
@@ -297,6 +298,10 @@ function isNPlusOneWordCountToken(
  pos1Exclusions: ReadonlySet<string> = N_PLUS_ONE_IGNORED_POS1,
  pos2Exclusions: ReadonlySet<string> = N_PLUS_ONE_IGNORED_POS2,
 ): boolean {
+  if (shouldExcludeTokenFromSubtitleAnnotations(token, { pos1Exclusions, pos2Exclusions })) {
+    return false;
+  }
+
  const normalizedPos1 = normalizePos1Tag(token.pos1);
  const hasPos1 = normalizedPos1.length > 0;
  if (isExcludedByTagSet(normalizedPos1, pos1Exclusions)) {