fix(subtitle): exclude explanatory pondering endings

2026-05-28 00:55:16 -07:00 · 2026-03-20 00:30:41 -07:00
parent 3e5671270e
commit 0ee150ed91
3 changed files with 115 additions and 0 deletions
@@ -3741,6 +3741,98 @@ test('tokenizeSubtitle clears all annotations for kana-only demonstrative helper
  );
 });
 // Regression case: a sentence that ends in the explanatory pondering tail のかな.
 // The tail token is expected to come back with no known/N+1/frequency/JLPT
 // annotations, while the two tokens in front of it keep theirs.
 test('tokenizeSubtitle clears all annotations for explanatory pondering endings', async () => {
  const subtitleText = '俺どうかしちゃったのかな';

  // Flags every stubbed MeCab token starts with; spread last so the property
  // order of each token literal is unchanged.
  const unannotated = { isMerged: false, isKnown: false, isNPlusOneTarget: false };

  const deps = makeDepsFromYomitanTokens(
    [
      { surface: '俺', reading: 'おれ', headword: '俺' },
      { surface: 'どうかしちゃった', reading: 'どうかしちゃった', headword: 'どうかしちゃう' },
      { surface: 'のかな', reading: 'のかな', headword: 'の' },
    ],
    {
      getFrequencyDictionaryEnabled: () => true,
      // Any text other than the two content headwords gets rank 77, so the
      // stub would supply a rank for the tail as well.
      getFrequencyRank: (text) => {
        if (text === '俺') {
          return 19;
        }
        if (text === 'どうかしちゃう') {
          return 3200;
        }
        return 77;
      },
      // The stub also offers a JLPT level for the tail's headword の.
      getJlptLevel: (text) => {
        switch (text) {
          case '俺':
            return 'N5';
          case 'どうかしちゃう':
            return 'N3';
          case 'の':
            return 'N5';
          default:
            return null;
        }
      },
      // の is reported as known, yet the expected tail token below is not known.
      isKnownWord: (text) => text === '俺' || text === 'の',
      getMinSentenceWordsForNPlusOne: () => 1,
      tokenizeWithMecab: async () => [
        {
          headword: '俺',
          surface: '俺',
          reading: 'オレ',
          startPos: 0,
          endPos: 1,
          partOfSpeech: PartOfSpeech.noun,
          pos1: '名詞',
          pos2: '代名詞',
          ...unannotated,
        },
        {
          headword: 'どうかしちゃう',
          surface: 'どうかしちゃった',
          reading: 'ドウカシチャッタ',
          startPos: 1,
          endPos: 8,
          partOfSpeech: PartOfSpeech.verb,
          pos1: '動詞',
          pos2: '自立',
          ...unannotated,
        },
        {
          headword: 'の',
          surface: 'のかな',
          reading: 'ノカナ',
          startPos: 8,
          endPos: 11,
          partOfSpeech: PartOfSpeech.other,
          pos1: '名詞|助動詞',
          pos2: '非自立',
          ...unannotated,
        },
      ],
    },
  );

  const { tokens } = await tokenizeSubtitle(subtitleText, deps);

  // Compare only the annotation-related fields of each returned token.
  const annotationSummary = tokens?.map(
    ({ surface, headword, isKnown, isNPlusOneTarget, frequencyRank, jlptLevel }) => ({
      surface,
      headword,
      isKnown,
      isNPlusOneTarget,
      frequencyRank,
      jlptLevel,
    }),
  );

  const expectedSummary = [
    {
      surface: '俺',
      headword: '俺',
      isKnown: true,
      isNPlusOneTarget: false,
      frequencyRank: 19,
      jlptLevel: 'N5',
    },
    {
      surface: 'どうかしちゃった',
      headword: 'どうかしちゃう',
      isKnown: false,
      isNPlusOneTarget: true,
      frequencyRank: 3200,
      jlptLevel: 'N3',
    },
    {
      // The pondering tail: every annotation is expected to be cleared.
      surface: 'のかな',
      headword: 'の',
      isKnown: false,
      isNPlusOneTarget: false,
      frequencyRank: undefined,
      jlptLevel: undefined,
    },
  ];

  assert.deepEqual(annotationSummary, expectedSummary);
 });
 test('tokenizeSubtitle keeps frequency for content-led merged token with trailing colloquial suffixes', async () => {
  const result = await tokenizeSubtitle(
    '張り切ってんじゃ',
@@ -234,6 +234,18 @@ test('shouldExcludeTokenFromSubtitleAnnotations excludes explanatory ending vari
  }
 });
 // A merged のかna-style tail (surface のかな, headword の) must be reported as
 // excluded from subtitle annotations.
 test('shouldExcludeTokenFromSubtitleAnnotations excludes explanatory pondering endings', () => {
  const isExcluded = shouldExcludeTokenFromSubtitleAnnotations(
    makeToken({
      surface: 'のかな',
      headword: 'の',
      reading: 'ノカナ',
      pos1: '名詞|助動詞',
      pos2: '非自立',
    }),
  );

  assert.equal(isExcluded, true);
 });
 test('shouldExcludeTokenFromSubtitleAnnotations excludes auxiliary-stem そうだ grammar tails', () => {
  const token = makeToken({
    surface: 'そうだ',
@@ -45,6 +45,7 @@ const SUBTITLE_ANNOTATION_EXCLUDED_EXPLANATORY_ENDING_TRAILING_PARTICLES = [
  'かな',
  'かね',
 ] as const;
 /**
  * Suffixes that are appended directly to an explanatory-ending prefix to form
  * a "pondering" ending such as のかな; `isExcludedByTerm` treats an exact
  * prefix + suffix match as an excluded term.
  */
 const SUBTITLE_ANNOTATION_EXCLUDED_EXPLANATORY_ENDING_THOUGHT_SUFFIXES = [
  'か',
  'かな',
  'かね',
 ] as const;
 const SUBTITLE_ANNOTATION_EXCLUDED_EXPLANATORY_ENDINGS = new Set(
  SUBTITLE_ANNOTATION_EXCLUDED_EXPLANATORY_ENDING_PREFIXES.flatMap((prefix) =>
    SUBTITLE_ANNOTATION_EXCLUDED_EXPLANATORY_ENDING_CORES.flatMap((core) =>
@@ -258,6 +259,16 @@ function isExcludedByTerm(token: MergedToken): boolean {
      continue;
    }
    if (
      SUBTITLE_ANNOTATION_EXCLUDED_EXPLANATORY_ENDING_PREFIXES.some((prefix) =>
        SUBTITLE_ANNOTATION_EXCLUDED_EXPLANATORY_ENDING_THOUGHT_SUFFIXES.some(
          (suffix) => normalized === `${prefix}${suffix}`,
        ),
      )
    ) {
      return true;
    }
    if (
      SUBTITLE_ANNOTATION_EXCLUDED_TERMS.has(trimmed) ||
      SUBTITLE_ANNOTATION_EXCLUDED_TERMS.has(normalized) ||