fix: exclude auxiliary grammar tails from subtitle annotations

2026-05-28 00:55:16 -07:00 · 2026-03-19 21:40:20 -07:00
parent ff95934f07
commit 59fa3b427d
4 changed files with 160 additions and 0 deletions
@@ -3483,6 +3483,79 @@ test('tokenizeSubtitle keeps trailing quote-particle merged tokens hoverable whi
  );
 });

+test('tokenizeSubtitle keeps auxiliary-stem そうだ grammar tails hoverable while clearing annotation metadata', async () => {
+  const result = await tokenizeSubtitle(
+    '与えるそうだ',
+    makeDepsFromYomitanTokens(
+      [
+        { surface: '与える', reading: 'あたえる', headword: '与える' },
+        { surface: 'そうだ', reading: 'そうだ', headword: 'そうだ' }, // そうだ arrives here as a single token
+      ],
+      {
+        getFrequencyDictionaryEnabled: () => true,
+        getFrequencyRank: (text) => (text === '与える' ? 100 : text === 'そうだ' ? 12 : null), // a rank exists for そうだ, so an undefined rank below means it was cleared
+        getJlptLevel: (text) => (text === '与える' ? 'N3' : text === 'そうだ' ? 'N5' : null), // likewise a JLPT level exists for そうだ
+        tokenizeWithMecab: async () => [
+          {
+            headword: '与える',
+            surface: '与える',
+            reading: 'アタエル',
+            startPos: 0,
+            endPos: 3,
+            partOfSpeech: PartOfSpeech.verb,
+            pos1: '動詞',
+            pos2: '自立',
+            isMerged: false,
+            isKnown: false,
+            isNPlusOneTarget: false,
+          },
+          {
+            headword: 'そう', // the MeCab fixture splits そうだ into そう + だ
+            surface: 'そう',
+            reading: 'ソウ',
+            startPos: 3,
+            endPos: 5,
+            partOfSpeech: PartOfSpeech.noun,
+            pos1: '名詞',
+            pos2: '特殊',
+            pos3: '助動詞語幹', // auxiliary-stem tag this test is about
+            isMerged: false,
+            isKnown: false,
+            isNPlusOneTarget: false,
+          },
+          {
+            headword: 'だ',
+            surface: 'だ',
+            reading: 'ダ',
+            startPos: 5,
+            endPos: 6,
+            partOfSpeech: PartOfSpeech.bound_auxiliary,
+            pos1: '助動詞',
+            isMerged: false,
+            isKnown: false,
+            isNPlusOneTarget: false,
+          },
+        ],
+        getMinSentenceWordsForNPlusOne: () => 1,
+      },
+    ),
+  );
+
+  assert.equal(result.text, '与えるそうだ');
+  assert.deepEqual(
+    result.tokens?.map((token) => ({
+      surface: token.surface,
+      headword: token.headword,
+      frequencyRank: token.frequencyRank,
+      jlptLevel: token.jlptLevel,
+    })),
+    [
+      { surface: '与える', headword: '与える', frequencyRank: 100, jlptLevel: 'N3' }, // lexical verb keeps its annotations
+      { surface: 'そうだ', headword: 'そうだ', frequencyRank: undefined, jlptLevel: undefined }, // token is still returned, but without frequency/JLPT metadata
+    ],
+  );
+});
+
 test('tokenizeSubtitle excludes single-kana merged tokens from frequency highlighting', async () => {
  const result = await tokenizeSubtitle(
    'た',
@@ -234,6 +234,19 @@ test('shouldExcludeTokenFromSubtitleAnnotations excludes explanatory ending vari
  }
 });

+test('shouldExcludeTokenFromSubtitleAnnotations excludes auxiliary-stem そうだ grammar tails', () => {
+  const token = makeToken({
+    surface: 'そうだ',
+    headword: 'そうだ',
+    reading: 'ソウダ',
+    pos1: '名詞|助動詞', // presumably split on '|' into 名詞 and 助動詞 — confirm against splitNormalizedTagParts
+    pos2: '特殊',
+    pos3: '助動詞語幹', // tag that isAuxiliaryStemGrammarTailToken looks for in pos3
+  });
+
+  assert.equal(shouldExcludeTokenFromSubtitleAnnotations(token), true);
+});
+
 test('shouldExcludeTokenFromSubtitleAnnotations keeps lexical tokens outside explanatory ending family', () => {
  const token = makeToken({
    surface: '問題',
@@ -100,6 +100,7 @@ function normalizePos1Tag(pos1: string | undefined): string {

 const SUBTITLE_ANNOTATION_EXCLUDED_POS1 = new Set(['感動詞']);
 const SUBTITLE_ANNOTATION_GRAMMAR_ONLY_POS1 = new Set(['助詞', '助動詞', '連体詞']);
+const AUXILIARY_STEM_GRAMMAR_TAIL_POS1 = new Set(['名詞', '助動詞', '助詞']); // every pos1 part of a token must be in this set for isAuxiliaryStemGrammarTailToken to match

 function splitNormalizedTagParts(normalizedTag: string): string[] {
  if (!normalizedTag) {
@@ -156,6 +157,16 @@ function isExcludedTrailingParticleMergedToken(token: MergedToken): boolean {
  return trailingPos1.length > 0 && trailingPos1.every((part) => part === '助詞');
 }

+function isAuxiliaryStemGrammarTailToken(token: MergedToken): boolean { // true for grammar tails tagged as auxiliary stems, e.g. そうだ
+  const pos1Parts = splitNormalizedTagParts(normalizePos1Tag(token.pos1));
+  if (pos1Parts.length === 0 || !pos1Parts.every((part) => AUXILIARY_STEM_GRAMMAR_TAIL_POS1.has(part))) { // no pos1 parts, or at least one part outside the allowed set
+    return false;
+  }
+
+  const pos3Parts = splitNormalizedTagParts(normalizePos2Tag(token.pos3)); // NOTE(review): pos3 is normalized with normalizePos2Tag — confirm this is intended
+  return pos3Parts.includes('助動詞語幹'); // match only when pos3 carries the auxiliary-stem tag
+}
+
 function resolvePos1Exclusions(options: AnnotationStageOptions): ReadonlySet<string> {
  if (options.pos1Exclusions) {
    return options.pos1Exclusions;
@@ -626,6 +637,10 @@ export function shouldExcludeTokenFromSubtitleAnnotations(token: MergedToken): b
    return true;
  }

+  if (isAuxiliaryStemGrammarTailToken(token)) {
+    return true;
+  }
+
  if (isExcludedTrailingParticleMergedToken(token)) {
    return true;
  }