fix(subtitle): unify annotation token filtering

2026-05-27 12:55:20 -07:00 · 2026-03-19 23:48:38 -07:00
parent 4a01cebca6
commit 42028d0a4d
5 changed files with 527 additions and 29 deletions
@@ -3628,6 +3628,119 @@ test('tokenizeSubtitle excludes merged function/content token from frequency hig
  assert.equal(result.tokens?.[0]?.isNPlusOneTarget, true);
 });
 test('tokenizeSubtitle clears all annotations for kana-only demonstrative helper merges', async () => {
  // Builds an unmerged, unannotated MeCab fixture token whose surface equals its headword.
  const mecabToken = (
    headword: string,
    reading: string,
    startPos: number,
    endPos: number,
    partOfSpeech: PartOfSpeech,
    pos1: string,
    pos2: string,
  ) => ({
    headword,
    surface: headword,
    reading,
    startPos,
    endPos,
    partOfSpeech,
    pos1,
    pos2,
    isMerged: false,
    isKnown: false,
    isNPlusOneTarget: false,
  });

  const result = await tokenizeSubtitle(
    'これで実力どおりか',
    makeDepsFromYomitanTokens(
      [
        { surface: 'これで', reading: 'これで', headword: 'これ' },
        { surface: '実力どおり', reading: 'じつりょくどおり', headword: '実力どおり' },
        { surface: 'か', reading: 'か', headword: 'か' },
      ],
      {
        getFrequencyDictionaryEnabled: () => true,
        getFrequencyRank: (text) => {
          switch (text) {
            case 'これ':
              return 9;
            case '実力どおり':
              return 2500;
            case 'か':
              return 800;
            default:
              return null;
          }
        },
        getJlptLevel: (text) => {
          switch (text) {
            case 'これ':
            case 'か':
              return 'N5';
            case '実力どおり':
              return 'N1';
            default:
              return null;
          }
        },
        isKnownWord: (text) => text === 'これ',
        getMinSentenceWordsForNPlusOne: () => 1,
        tokenizeWithMecab: async () => [
          mecabToken('これ', 'コレ', 0, 2, PartOfSpeech.noun, '名詞', '代名詞'),
          mecabToken('で', 'デ', 2, 3, PartOfSpeech.particle, '助詞', '格助詞'),
          mecabToken('実力どおり', 'ジツリョクドオリ', 3, 8, PartOfSpeech.noun, '名詞', '一般'),
          mecabToken('か', 'カ', 8, 9, PartOfSpeech.particle, '助詞', '終助詞'),
        ],
      },
    ),
  );

  // Only the annotation-related fields are compared; everything else on the token is ignored.
  const actual = result.tokens?.map((token) => ({
    surface: token.surface,
    headword: token.headword,
    isKnown: token.isKnown,
    isNPlusOneTarget: token.isNPlusOneTarget,
    frequencyRank: token.frequencyRank,
    jlptLevel: token.jlptLevel,
  }));

  // Field values expected on the tokens whose annotations must all be cleared.
  const clearedAnnotations = {
    isKnown: false,
    isNPlusOneTarget: false,
    frequencyRank: undefined,
    jlptLevel: undefined,
  };

  assert.deepEqual(actual, [
    { surface: 'これで', headword: 'これ', ...clearedAnnotations },
    {
      surface: '実力どおり',
      headword: '実力どおり',
      isKnown: false,
      isNPlusOneTarget: true,
      frequencyRank: 2500,
      jlptLevel: 'N1',
    },
    { surface: 'か', headword: 'か', ...clearedAnnotations },
  ]);
 });
 test('tokenizeSubtitle keeps frequency for content-led merged token with trailing colloquial suffixes', async () => {
  const result = await tokenizeSubtitle(
    '張り切ってんじゃ',
@@ -316,6 +316,19 @@ test('shouldExcludeTokenFromSubtitleAnnotations excludes merged lexical tokens w
  assert.equal(shouldExcludeTokenFromSubtitleAnnotations(token), true);
 });
 test('shouldExcludeTokenFromSubtitleAnnotations excludes kana-only demonstrative helper merges', () => {
  // Merged token: pronoun "これ" plus case particle "で", tagged as 名詞|助詞.
  const mergedDemonstrative = makeToken({
    surface: 'これで',
    headword: 'これ',
    reading: 'コレデ',
    partOfSpeech: PartOfSpeech.noun,
    pos1: '名詞|助詞',
    pos2: '代名詞|格助詞',
  });

  const isExcluded = shouldExcludeTokenFromSubtitleAnnotations(mergedDemonstrative);

  assert.equal(isExcluded, true);
 });
 test('stripSubtitleAnnotationMetadata keeps token hover data while clearing annotation fields', () => {
  const token = makeToken({
    surface: 'は',
@@ -481,8 +494,8 @@ test('annotateTokens N+1 minimum sentence words counts only eligible word tokens
  );
  assert.equal(result[0]?.isKnown, false);
-  assert.equal(result[1]?.isKnown, true);
+  assert.equal(result[1]?.isKnown, false);
-  assert.equal(result[2]?.isKnown, true);
+  assert.equal(result[2]?.isKnown, false);
  assert.equal(result[0]?.isNPlusOneTarget, false);
 });
@@ -568,7 +581,7 @@ test('annotateTokens excludes default non-independent pos2 from frequency and N+
  assert.equal(result[0]?.isNPlusOneTarget, false);
 });
-test('annotateTokens keeps frequency for kanji noun tokens even when mecab marks them non-independent', () => {
+test('annotateTokens clears all annotations for non-independent kanji noun tokens under unified gate', () => {
  const tokens = [
    makeToken({
      surface: '者',
@@ -588,7 +601,10 @@ test('annotateTokens keeps frequency for kanji noun tokens even when mecab marks
    minSentenceWordsForNPlusOne: 1,
  });
-  assert.equal(result[0]?.frequencyRank, 475);
+  assert.equal(result[0]?.isKnown, false);
  assert.equal(result[0]?.isNPlusOneTarget, false);
  assert.equal(result[0]?.frequencyRank, undefined);
  assert.equal(result[0]?.jlptLevel, undefined);
 });
 test('annotateTokens excludes likely kana SFX tokens from frequency when POS tags are missing', () => {
@@ -742,3 +758,33 @@ test('annotateTokens excludes composite tokens when all component pos tags are e
  assert.equal(result[0]?.frequencyRank, undefined);
  assert.equal(result[0]?.isNPlusOneTarget, false);
 });
 test('annotateTokens applies one shared exclusion gate across known N+1 frequency and JLPT', () => {
  // Merged pronoun + case-particle token that arrives with a frequency rank already set.
  const mergedDemonstrative = makeToken({
    surface: 'これで',
    headword: 'これ',
    reading: 'コレデ',
    partOfSpeech: PartOfSpeech.noun,
    pos1: '名詞|助詞',
    pos2: '代名詞|格助詞',
    startPos: 0,
    endPos: 3,
    frequencyRank: 9,
  });

  // The headword is both "known" and has a JLPT level, so every annotation path has data to offer.
  const deps = makeDeps({
    isKnownWord: (text) => text === 'これ',
    getJlptLevel: (text) => (text === 'これ' ? 'N5' : null),
  });

  const [annotated] = annotateTokens([mergedDemonstrative], deps, {
    minSentenceWordsForNPlusOne: 1,
  });

  assert.equal(annotated?.isKnown, false);
  assert.equal(annotated?.isNPlusOneTarget, false);
  assert.equal(annotated?.frequencyRank, undefined);
  assert.equal(annotated?.jlptLevel, undefined);
 });
@@ -9,6 +9,10 @@ import {
 } from '../../../token-pos2-exclusions';
 import { JlptLevel, MergedToken, NPlusOneMatchMode, PartOfSpeech } from '../../../types';
 import { shouldIgnoreJlptByTerm, shouldIgnoreJlptForMecabPos1 } from '../jlpt-token-filter';
 import {
  shouldExcludeTokenFromSubtitleAnnotations as sharedShouldExcludeTokenFromSubtitleAnnotations,
  stripSubtitleAnnotationMetadata as sharedStripSubtitleAnnotationMetadata,
 } from './subtitle-annotation-filter';
 const KATAKANA_TO_HIRAGANA_OFFSET = 0x60;
 const KATAKANA_CODEPOINT_START = 0x30a1;
@@ -633,34 +637,11 @@ function isExcludedFromSubtitleAnnotationsByTerm(token: MergedToken): boolean {
 }
 export function shouldExcludeTokenFromSubtitleAnnotations(token: MergedToken): boolean {
-  if (isExcludedFromSubtitleAnnotationsByPos1(normalizePos1Tag(token.pos1))) {
+  return sharedShouldExcludeTokenFromSubtitleAnnotations(token);
    return true;
  }
  if (isAuxiliaryStemGrammarTailToken(token)) {
    return true;
  }
  if (isExcludedTrailingParticleMergedToken(token)) {
    return true;
  }
  return isExcludedFromSubtitleAnnotationsByTerm(token);
 }
 export function stripSubtitleAnnotationMetadata(token: MergedToken): MergedToken {
-  if (!shouldExcludeTokenFromSubtitleAnnotations(token)) {
+  return sharedStripSubtitleAnnotationMetadata(token);
    return token;
  }
  return {
    ...token,
    isKnown: false,
    isNPlusOneTarget: false,
    isNameMatch: false,
    jlptLevel: undefined,
    frequencyRank: undefined,
  };
 }
 function computeTokenKnownStatus(
@@ -737,6 +718,18 @@ export function annotateTokens(
  // Single pass: compute known word status, frequency filtering, and JLPT level together
  const annotated = tokens.map((token) => {
    if (
      sharedShouldExcludeTokenFromSubtitleAnnotations(token, {
        pos1Exclusions,
        pos2Exclusions,
      })
    ) {
      return sharedStripSubtitleAnnotationMetadata(token, {
        pos1Exclusions,
        pos2Exclusions,
      });
    }
    const prioritizedNameMatch = nameMatchEnabled && token.isNameMatch === true;
    const isKnown = nPlusOneEnabled
      ? computeTokenKnownStatus(token, deps.isKnownWord, deps.knownWordMatchMode)
@@ -0,0 +1,341 @@
 import {
  DEFAULT_ANNOTATION_POS1_EXCLUSION_CONFIG,
  resolveAnnotationPos1ExclusionSet,
 } from '../../../token-pos1-exclusions';
 import {
  DEFAULT_ANNOTATION_POS2_EXCLUSION_CONFIG,
  resolveAnnotationPos2ExclusionSet,
 } from '../../../token-pos2-exclusions';
 import { MergedToken, PartOfSpeech } from '../../../types';
 import { shouldIgnoreJlptByTerm } from '../jlpt-token-filter';
 // Distance subtracted from a katakana code point to obtain the matching hiragana code point.
 const KATAKANA_TO_HIRAGANA_OFFSET = 0x60;
 // Inclusive code point range that normalizeKana converts to hiragana.
 const KATAKANA_CODEPOINT_START = 0x30a1;
 const KATAKANA_CODEPOINT_END = 0x30f6;

 // Exact terms that never receive subtitle annotations.
 const SUBTITLE_ANNOTATION_EXCLUDED_TERMS = new Set([
  'ああ',
  'ええ',
  'うう',
  'おお',
  'はあ',
  'はは',
  'へえ',
  'ふう',
  'ほう',
 ]);

 // Building blocks for the excluded explanatory endings: prefix + core + trailing particle.
 const SUBTITLE_ANNOTATION_EXCLUDED_EXPLANATORY_ENDING_PREFIXES = ['ん', 'の', 'なん', 'なの'];
 const SUBTITLE_ANNOTATION_EXCLUDED_EXPLANATORY_ENDING_CORES = [
  'だ',
  'です',
  'でした',
  'だった',
  'では',
  'じゃ',
  'でしょう',
  'だろう',
 ] as const;
 const SUBTITLE_ANNOTATION_EXCLUDED_EXPLANATORY_ENDING_TRAILING_PARTICLES = [
  '',
  'か',
  'ね',
  'よ',
  'な',
  'よね',
  'かな',
  'かね',
 ] as const;

 // Every prefix/core/particle combination, inserted prefix-major, then core, then particle.
 const SUBTITLE_ANNOTATION_EXCLUDED_EXPLANATORY_ENDINGS = (() => {
  const endings = new Set<string>();
  for (const prefix of SUBTITLE_ANNOTATION_EXCLUDED_EXPLANATORY_ENDING_PREFIXES) {
    for (const core of SUBTITLE_ANNOTATION_EXCLUDED_EXPLANATORY_ENDING_CORES) {
      for (const particle of SUBTITLE_ANNOTATION_EXCLUDED_EXPLANATORY_ENDING_TRAILING_PARTICLES) {
        endings.add(`${prefix}${core}${particle}`);
      }
    }
  }
  return endings;
 })();

 // Surface suffixes (what follows the headword) that mark an excluded trailing-particle merge.
 const SUBTITLE_ANNOTATION_EXCLUDED_TRAILING_PARTICLE_SUFFIXES = new Set([
  'って',
  'ってよ',
  'ってね',
  'ってな',
  'ってさ',
  'ってか',
  'ってば',
 ]);

 // pos1 tags permitted on a token that is treated as an auxiliary-stem grammar tail.
 const AUXILIARY_STEM_GRAMMAR_TAIL_POS1 = new Set(['名詞', '助動詞', '助詞']);
 /**
  * Optional overrides for the part-of-speech exclusion sets used by the subtitle
  * annotation filter. An omitted field makes the corresponding resolver fall back
  * to the set built from the default exclusion config.
  */
 export interface SubtitleAnnotationFilterOptions {
  /** pos1 tags to exclude; a token is excluded when every `|`-separated pos1 part is in this set. */
  pos1Exclusions?: ReadonlySet<string>;
  /** pos2 tags to exclude; a token is excluded when every `|`-separated pos2 part is in this set. */
  pos2Exclusions?: ReadonlySet<string>;
 }
 /**
  * Returns the POS tag with surrounding whitespace removed.
  *
  * @param pos Raw POS tag, possibly missing.
  * @returns The trimmed tag, or an empty string when `pos` is not a string.
  */
 function normalizePosTag(pos: string | undefined): string {
  if (typeof pos !== 'string') {
    return '';
  }
  return pos.trim();
 }
 /**
  * Splits a `|`-joined POS tag into its individual parts.
  *
  * @param normalizedTag Tag string, e.g. `名詞|助詞`.
  * @returns The trimmed, non-empty parts in their original order; empty for an empty tag.
  */
 function splitNormalizedTagParts(normalizedTag: string): string[] {
  const parts: string[] = [];
  if (!normalizedTag) {
    return parts;
  }
  for (const rawPart of normalizedTag.split('|')) {
    const part = rawPart.trim();
    if (part.length > 0) {
      parts.push(part);
    }
  }
  return parts;
 }
 /**
  * Tells whether every part of a `|`-joined POS tag is in the exclusion set.
  *
  * @param normalizedTag Tag string to inspect.
  * @param exclusions Tags considered excluded.
  * @returns `true` only when the tag has at least one part and all parts are excluded.
  */
 function isExcludedByTagSet(normalizedTag: string, exclusions: ReadonlySet<string>): boolean {
  let sawPart = false;
  for (const part of splitNormalizedTagParts(normalizedTag)) {
    if (!exclusions.has(part)) {
      return false;
    }
    sawPart = true;
  }
  // A tag with no parts is never excluded.
  return sawPart;
 }
 /**
  * Picks the pos1 exclusion set to use.
  *
  * @param options Filter options; `pos1Exclusions` wins when provided.
  * @returns The caller-supplied set, otherwise the set resolved from the default pos1 config.
  */
 function resolvePos1Exclusions(
  options: SubtitleAnnotationFilterOptions = {},
 ): ReadonlySet<string> {
  const { pos1Exclusions: override } = options;
  return override
    ? override
    : resolveAnnotationPos1ExclusionSet(DEFAULT_ANNOTATION_POS1_EXCLUSION_CONFIG);
 }
 /**
  * Picks the pos2 exclusion set to use.
  *
  * @param options Filter options; `pos2Exclusions` wins when provided.
  * @returns The caller-supplied set, otherwise the set resolved from the default pos2 config.
  */
 function resolvePos2Exclusions(
  options: SubtitleAnnotationFilterOptions = {},
 ): ReadonlySet<string> {
  const { pos2Exclusions: override } = options;
  return override
    ? override
    : resolveAnnotationPos2ExclusionSet(DEFAULT_ANNOTATION_POS2_EXCLUSION_CONFIG);
 }
 /**
  * Trims the text and shifts every code point in the katakana range down to hiragana.
  *
  * @param text Text to normalize.
  * @returns The trimmed text with katakana replaced by hiragana; other characters are kept as-is.
  */
 function normalizeKana(text: string): string {
  const trimmed = text.trim();
  if (trimmed.length === 0) {
    return '';
  }
  // Array.from walks the string by code point, like for...of.
  const converted = Array.from(trimmed, (char) => {
    const code = char.codePointAt(0);
    if (code === undefined) {
      return '';
    }
    const isKatakana = code >= KATAKANA_CODEPOINT_START && code <= KATAKANA_CODEPOINT_END;
    return isKatakana ? String.fromCodePoint(code - KATAKANA_TO_HIRAGANA_OFFSET) : char;
  });
  return converted.join('');
 }
 /**
  * Checks whether the first code point of `char` is treated as kana.
  *
  * @param char String whose first code point is inspected.
  * @returns `true` for U+3041–U+3096, U+309B–U+309F and U+30A0–U+30FF except U+30FB;
  *   `false` otherwise, including for an empty string.
  */
 function isKanaChar(char: string): boolean {
  const code = char.codePointAt(0);
  if (code === undefined) {
    return false;
  }
  // Hiragana letters.
  if (code >= 0x3041 && code <= 0x3096) {
    return true;
  }
  // Hiragana-block sound and iteration marks.
  if (code >= 0x309b && code <= 0x309f) {
    return true;
  }
  // Katakana block, leaving out the middle dot at U+30FB.
  return code >= 0x30a0 && code <= 0x30ff && code !== 0x30fb;
 }
 /**
  * Detects short kana-only strings that end in a small っ.
  *
  * @param text Candidate text; it is kana-normalized before checking.
  * @returns `true` when the normalized text is 2–4 kana characters long and its last one is `っ`.
  */
 function isTrailingSmallTsuKanaSfx(text: string): boolean {
  const chars = Array.from(normalizeKana(text));
  const count = chars.length;
  const hasSfxLength = count >= 2 && count <= 4;
  return hasSfxLength && chars.every(isKanaChar) && chars[count - 1] === 'っ';
 }
 /**
  * Detects kana-only strings made of one unit repeated twice.
  *
  * @param text Candidate text; it is kana-normalized before checking.
  * @returns `true` when the normalized text is all kana, has an even length of at least 4,
  *   and its first half equals its second half.
  */
 function isReduplicatedKanaSfx(text: string): boolean {
  const chars = Array.from(normalizeKana(text));
  const count = chars.length;
  if (count < 4 || count % 2 !== 0 || !chars.every(isKanaChar)) {
    return false;
  }
  const half = count / 2;
  // Compare the two halves position by position.
  for (let index = 0; index < half; index += 1) {
    if (chars[index] !== chars[index + half]) {
      return false;
    }
  }
  return true;
 }
 /**
  * Like `isReduplicatedKanaSfx`, but also accepts the reduplicated form followed by a final `と`.
  *
  * @param text Candidate text; it is kana-normalized before checking.
  * @returns `true` when the normalized text, or the normalized text minus a trailing `と`,
  *   is a reduplicated kana string.
  */
 function isReduplicatedKanaSfxWithOptionalTrailingTo(text: string): boolean {
  const normalized = normalizeKana(text);
  if (normalized.length === 0) {
    return false;
  }
  // Try the text as-is first, then without its trailing と.
  const candidates = [normalized];
  if (normalized.length > 1 && normalized.endsWith('と')) {
    candidates.push(normalized.slice(0, -1));
  }
  return candidates.some((candidate) => isReduplicatedKanaSfx(candidate));
 }
 /**
  * Detects merged tokens of the form "headword + excluded って-style suffix" whose
  * trailing pos1 parts are all particles.
  *
  * @param token Merged token to inspect.
  * @returns `true` when the kana-normalized surface is the kana-normalized headword followed
  *   by one of the excluded trailing-particle suffixes, pos1 has at least two parts, the first
  *   part is not in the default pos1 exclusions, and every later part is `助詞`.
  */
 function isExcludedTrailingParticleMergedToken(token: MergedToken): boolean {
  const surface = normalizeKana(token.surface);
  const headword = normalizeKana(token.headword);
  const headwordLeadsSurface =
    surface.length > 0 && headword.length > 0 && surface.startsWith(headword);
  if (!headwordLeadsSurface) {
    return false;
  }

  const tail = surface.slice(headword.length);
  if (!SUBTITLE_ANNOTATION_EXCLUDED_TRAILING_PARTICLE_SUFFIXES.has(tail)) {
    return false;
  }

  const pos1Parts = splitNormalizedTagParts(normalizePosTag(token.pos1));
  if (pos1Parts.length < 2) {
    return false;
  }

  const head = pos1Parts[0];
  const rest = pos1Parts.slice(1);
  // NOTE(review): this consults the default pos1 exclusions, not any caller-supplied
  // override — confirm that is intended.
  if (!head || resolvePos1Exclusions().has(head)) {
    return false;
  }

  // `rest` is never empty here because pos1Parts has at least two entries.
  return rest.every((part) => part === '助詞');
 }
 /**
  * Detects tokens whose pos1 parts are all grammar-tail tags and whose pos3 contains `助動詞語幹`.
  *
  * @param token Merged token to inspect.
  * @returns `true` when pos1 is non-empty, every pos1 part is in
  *   `AUXILIARY_STEM_GRAMMAR_TAIL_POS1`, and one pos3 part is `助動詞語幹`.
  */
 function isAuxiliaryStemGrammarTailToken(token: MergedToken): boolean {
  const pos1Parts = splitNormalizedTagParts(normalizePosTag(token.pos1));
  const hasOnlyTailPos1 =
    pos1Parts.length > 0 && pos1Parts.every((part) => AUXILIARY_STEM_GRAMMAR_TAIL_POS1.has(part));
  if (!hasOnlyTailPos1) {
    return false;
  }
  return splitNormalizedTagParts(normalizePosTag(token.pos3)).some(
    (part) => part === '助動詞語幹',
  );
 }
 /**
  * Checks the token's surface, reading and headword against the term-based exclusions.
  *
  * Each non-empty field is tested both trimmed and kana-normalized against the excluded
  * terms, the excluded explanatory endings, `shouldIgnoreJlptByTerm`, and the kana SFX
  * heuristics (trailing small っ, reduplication with optional trailing と).
  *
  * @param token Merged token to inspect.
  * @returns `true` as soon as any field matches any of those checks; otherwise `false`.
  */
 function isExcludedByTerm(token: MergedToken): boolean {
  for (const field of [token.surface, token.reading, token.headword]) {
    if (typeof field !== 'string' || field.length === 0) {
      continue;
    }
    const trimmed = field.trim();
    if (!trimmed) {
      continue;
    }
    const normalized = normalizeKana(trimmed);
    if (!normalized) {
      continue;
    }

    // Every check runs on the trimmed form first, then on the kana-normalized form.
    const forms = [trimmed, normalized];
    const matchesExcludedTerm =
      forms.some((form) => SUBTITLE_ANNOTATION_EXCLUDED_TERMS.has(form)) ||
      forms.some((form) => SUBTITLE_ANNOTATION_EXCLUDED_EXPLANATORY_ENDINGS.has(form)) ||
      forms.some((form) => shouldIgnoreJlptByTerm(form));
    if (matchesExcludedTerm) {
      return true;
    }

    const looksLikeKanaSfx =
      forms.some((form) => isTrailingSmallTsuKanaSfx(form)) ||
      forms.some((form) => isReduplicatedKanaSfxWithOptionalTrailingTo(form));
    if (looksLikeKanaSfx) {
      return true;
    }
  }
  return false;
 }
 /**
  * Decides whether a token should receive no subtitle annotations.
  *
  * A token is excluded when any of the following holds, checked in this order:
  * 1. every pos1 part is in the pos1 exclusion set;
  * 2. every pos2 part is in the pos2 exclusion set;
  * 3. it has neither pos1 nor pos2 and its `partOfSpeech` is particle, bound auxiliary or symbol;
  * 4. it is an auxiliary-stem grammar tail;
  * 5. it is an excluded trailing-particle merge;
  * 6. it matches a term-based exclusion.
  *
  * @param token Merged token to inspect.
  * @param options Optional exclusion-set overrides; defaults are resolved when omitted.
  * @returns `true` when the token must be excluded from annotations.
  */
 export function shouldExcludeTokenFromSubtitleAnnotations(
  token: MergedToken,
  options: SubtitleAnnotationFilterOptions = {},
 ): boolean {
  const excludedPos1 = resolvePos1Exclusions(options);
  const excludedPos2 = resolvePos2Exclusions(options);
  const pos1 = normalizePosTag(token.pos1);
  const pos2 = normalizePosTag(token.pos2);

  if (isExcludedByTagSet(pos1, excludedPos1) || isExcludedByTagSet(pos2, excludedPos2)) {
    return true;
  }

  // Without any POS tags, fall back to the coarse partOfSpeech classification.
  if (pos1.length === 0 && pos2.length === 0) {
    switch (token.partOfSpeech) {
      case PartOfSpeech.particle:
      case PartOfSpeech.bound_auxiliary:
      case PartOfSpeech.symbol:
        return true;
      default:
        break;
    }
  }

  return (
    isAuxiliaryStemGrammarTailToken(token) ||
    isExcludedTrailingParticleMergedToken(token) ||
    isExcludedByTerm(token)
  );
 }
 /**
  * Clears the annotation fields of a token that is excluded from subtitle annotations.
  *
  * @param token Merged token to process.
  * @param options Optional exclusion-set overrides forwarded to the exclusion check.
  * @returns The same token object when it is not excluded; otherwise a shallow copy with
  *   `isKnown`, `isNPlusOneTarget` and `isNameMatch` set to `false` and `jlptLevel` and
  *   `frequencyRank` set to `undefined`. All other fields are carried over unchanged.
  */
 export function stripSubtitleAnnotationMetadata(
  token: MergedToken,
  options: SubtitleAnnotationFilterOptions = {},
 ): MergedToken {
  const isExcluded = shouldExcludeTokenFromSubtitleAnnotations(token, options);
  if (isExcluded) {
    return {
      ...token,
      isKnown: false,
      isNPlusOneTarget: false,
      isNameMatch: false,
      jlptLevel: undefined,
      frequencyRank: undefined,
    };
  }
  return token;
 }
@@ -19,6 +19,7 @@
 import { PartOfSpeech, Token, MergedToken } from './types';
 import { DEFAULT_ANNOTATION_POS1_EXCLUSION_CONFIG } from './token-pos1-exclusions';
 import { DEFAULT_ANNOTATION_POS2_EXCLUSION_CONFIG } from './token-pos2-exclusions';
 import { shouldExcludeTokenFromSubtitleAnnotations } from './core/services/tokenizer/subtitle-annotation-filter';
 export function isNoun(tok: Token): boolean {
  return tok.partOfSpeech === PartOfSpeech.noun;
@@ -297,6 +298,10 @@ function isNPlusOneWordCountToken(
  pos1Exclusions: ReadonlySet<string> = N_PLUS_ONE_IGNORED_POS1,
  pos2Exclusions: ReadonlySet<string> = N_PLUS_ONE_IGNORED_POS2,
 ): boolean {
  if (shouldExcludeTokenFromSubtitleAnnotations(token, { pos1Exclusions, pos2Exclusions })) {
    return false;
  }
  const normalizedPos1 = normalizePos1Tag(token.pos1);
  const hasPos1 = normalizedPos1.length > 0;
  if (isExcludedByTagSet(normalizedPos1, pos1Exclusions)) {