feat(tokenizer): exclude interjections and sound effects from subtitle annotations

- Filter out 感動詞 (interjection) POS1 tokens from annotation payloads
- Exclude common interjection terms (ああ, ええ, はあ, etc.)
- Exclude reduplicated kana SFX with optional trailing と
- shouldExcludeTokenFromSubtitleAnnotations checks both POS1 and term patterns
- filterSubtitleAnnotationTokens applied after annotation stage
This commit is contained in:
2026-03-16 01:45:58 -07:00
parent 5767667d51
commit a317019bb9
4 changed files with 253 additions and 4 deletions

View File

@@ -1460,7 +1460,7 @@ test('tokenizeSubtitle skips JLPT level for excluded demonstratives', async () =
assert.equal(result.tokens?.[0]?.jlptLevel, undefined);
});
test('tokenizeSubtitle skips JLPT level for repeated kana SFX', async () => {
test('tokenizeSubtitle excludes repeated kana interjections from annotation payloads entirely', async () => {
const result = await tokenizeSubtitle(
'ああ',
makeDeps({
@@ -1491,8 +1491,7 @@ test('tokenizeSubtitle skips JLPT level for repeated kana SFX', async () => {
}),
);
assert.equal(result.tokens?.length, 1);
assert.equal(result.tokens?.[0]?.jlptLevel, undefined);
assert.deepEqual(result, { text: 'ああ', tokens: null });
});
test('tokenizeSubtitle assigns JLPT level to Yomitan tokens', async () => {
@@ -3057,6 +3056,102 @@ test('tokenizeSubtitle excludes default non-independent pos2 from N+1 and freque
assert.equal(result.tokens?.[0]?.isNPlusOneTarget, false);
});
// Regression: a token whose MeCab POS1 is 感動詞 (interjection) must be removed
// from the subtitle annotation payload entirely — even when frequency and JLPT
// data are available for it — leaving tokens as null rather than an annotated list.
test('tokenizeSubtitle excludes mecab-tagged interjections from annotation payloads entirely', async () => {
  const result = await tokenizeSubtitle(
    'ぐはっ',
    makeDepsFromYomitanTokens([{ surface: 'ぐはっ', reading: 'ぐはっ', headword: 'ぐはっ' }], {
      // Frequency/JLPT lookups would normally annotate this token…
      getFrequencyDictionaryEnabled: () => true,
      getFrequencyRank: () => 17,
      getJlptLevel: () => 'N5',
      // …but the MeCab pass tags it 感動詞 (interjection), which should force
      // full exclusion from the annotation payload.
      tokenizeWithMecab: async () => [
        {
          headword: 'ぐはっ',
          surface: 'ぐはっ',
          reading: 'グハッ',
          startPos: 0,
          endPos: 3,
          partOfSpeech: PartOfSpeech.other,
          pos1: '感動詞',
          isMerged: true,
          isKnown: false,
          isNPlusOneTarget: false,
        },
      ],
    }),
  );
  // Subtitle text is preserved; no annotation tokens survive.
  assert.deepEqual(result, { text: 'ぐはっ', tokens: null });
});
// Mixed-content case: the displayed subtitle keeps the interjection, but the
// annotation payload must contain only the non-interjection token (猫).
test('tokenizeSubtitle keeps visible text while excluding interjections from mixed annotation payloads', async () => {
  const result = await tokenizeSubtitle(
    'ぐはっ 猫',
    makeDeps({
      getFrequencyDictionaryEnabled: () => true,
      // Only 猫 carries frequency/JLPT data worth annotating.
      getFrequencyRank: (text) => (text === '猫' ? 11 : 17),
      getJlptLevel: (text) => (text === '猫' ? 'N5' : null),
      getYomitanExt: () => ({ id: 'dummy-ext' }) as any,
      // Stubbed Yomitan parser window: the scanning parser segments the line
      // into the interjection and the noun.
      getYomitanParserWindow: () =>
        ({
          isDestroyed: () => false,
          webContents: {
            executeJavaScript: async (script: string) => {
              if (script.includes('getTermFrequencies')) {
                return [];
              }
              return [
                {
                  source: 'scanning-parser',
                  index: 0,
                  content: [
                    [{ text: 'ぐはっ', reading: 'ぐはっ', headwords: [[{ term: 'ぐはっ' }]] }],
                    [{ text: '猫', reading: 'ねこ', headwords: [[{ term: '猫' }]] }],
                  ],
                },
              ];
            },
          },
        }) as unknown as Electron.BrowserWindow,
      // MeCab tags ぐはっ as 感動詞 (interjection) and 猫 as 名詞 (noun);
      // only the interjection should be filtered downstream.
      tokenizeWithMecab: async () => [
        {
          headword: 'ぐはっ',
          surface: 'ぐはっ',
          reading: 'グハッ',
          startPos: 0,
          endPos: 3,
          partOfSpeech: PartOfSpeech.other,
          pos1: '感動詞',
          isMerged: true,
          isKnown: false,
          isNPlusOneTarget: false,
        },
        {
          headword: '猫',
          surface: '猫',
          reading: 'ネコ',
          startPos: 4,
          endPos: 5,
          partOfSpeech: PartOfSpeech.noun,
          pos1: '名詞',
          isMerged: true,
          isKnown: false,
          isNPlusOneTarget: false,
        },
      ],
    }),
  );
  // Display text is untouched…
  assert.equal(result.text, 'ぐはっ 猫');
  // …while the annotation payload contains only the noun.
  assert.deepEqual(
    result.tokens?.map((token) => ({
      surface: token.surface,
      headword: token.headword,
    })),
    [{ surface: '猫', headword: '猫' }],
  );
});
test('tokenizeSubtitle excludes single-kana merged tokens from frequency highlighting', async () => {
const result = await tokenizeSubtitle(
'た',

View File

@@ -178,6 +178,19 @@ async function applyAnnotationStage(
);
}
/**
 * Drops tokens the annotation stage marks as excluded from subtitle
 * annotations (interjections / sound effects). Lazily loads the
 * annotation-stage module on first use and memoizes the import promise.
 */
async function filterSubtitleAnnotationTokens(tokens: MergedToken[]): Promise<MergedToken[]> {
  // Nothing to filter — avoid the lazy module load entirely.
  if (tokens.length === 0) {
    return tokens;
  }
  if (!annotationStageModulePromise) {
    annotationStageModulePromise = import('./tokenizer/annotation-stage');
  }
  const { shouldExcludeTokenFromSubtitleAnnotations } = await annotationStageModulePromise;
  const kept: MergedToken[] = [];
  for (const token of tokens) {
    if (!shouldExcludeTokenFromSubtitleAnnotations(token)) {
      kept.push(token);
    }
  }
  return kept;
}
export function createTokenizerDepsRuntime(
options: TokenizerDepsRuntimeOptions,
): TokenizerServiceDeps {
@@ -698,9 +711,12 @@ export async function tokenizeSubtitle(
const yomitanTokens = await parseWithYomitanInternalParser(tokenizeText, deps, annotationOptions);
if (yomitanTokens && yomitanTokens.length > 0) {
const filteredTokens = await filterSubtitleAnnotationTokens(
await applyAnnotationStage(yomitanTokens, deps, annotationOptions),
);
return {
text: displayText,
tokens: await applyAnnotationStage(yomitanTokens, deps, annotationOptions),
tokens: filteredTokens.length > 0 ? filteredTokens : null,
};
}

View File

@@ -14,6 +14,17 @@ const KATAKANA_TO_HIRAGANA_OFFSET = 0x60;
const KATAKANA_CODEPOINT_START = 0x30a1;
const KATAKANA_CODEPOINT_END = 0x30f6;
const JLPT_LEVEL_LOOKUP_CACHE_LIMIT = 2048;
// Common interjection / filler terms excluded from subtitle annotations by
// exact match (checked against both raw and normalized candidate text).
const SUBTITLE_ANNOTATION_EXCLUDED_TERMS = new Set([
  'ああ',
  'ええ',
  'うう',
  'おお',
  'はあ',
  'はは',
  'へえ',
  'ふう',
  'ほう',
]);
const jlptLevelLookupCaches = new WeakMap<
(text: string) => JlptLevel | null,
@@ -48,6 +59,8 @@ function normalizePos1Tag(pos1: string | undefined): string {
return typeof pos1 === 'string' ? pos1.trim() : '';
}
const SUBTITLE_ANNOTATION_EXCLUDED_POS1 = new Set(['感動詞']);
function splitNormalizedTagParts(normalizedTag: string): string[] {
if (!normalizedTag) {
return [];
@@ -69,6 +82,11 @@ function isExcludedByTagSet(normalizedTag: string, exclusions: ReadonlySet<strin
return parts.some((part) => exclusions.has(part));
}
/**
 * True when any component of the normalized POS1 tag names an excluded
 * part of speech (e.g. 感動詞).
 */
function isExcludedFromSubtitleAnnotationsByPos1(normalizedPos1: string): boolean {
  for (const part of splitNormalizedTagParts(normalizedPos1)) {
    if (SUBTITLE_ANNOTATION_EXCLUDED_POS1.has(part)) {
      return true;
    }
  }
  return false;
}
function resolvePos1Exclusions(options: AnnotationStageOptions): ReadonlySet<string> {
if (options.pos1Exclusions) {
return options.pos1Exclusions;
@@ -383,6 +401,23 @@ function isReduplicatedKanaSfx(text: string): boolean {
return chars.slice(0, half).join('') === chars.slice(half).join('');
}
/**
 * Detects reduplicated kana sound effects, optionally followed by a trailing
 * と (e.g. ドンドンと). Returns false for empty/non-normalizable text.
 */
function isReduplicatedKanaSfxWithOptionalTrailingTo(text: string): boolean {
  const normalized = normalizeJlptTextForExclusion(text);
  if (!normalized) {
    return false;
  }
  // Plain reduplication (e.g. ああ, ドキドキ).
  if (isReduplicatedKanaSfx(normalized)) {
    return true;
  }
  // Otherwise only a trailing と can rescue the match — strip it and re-check,
  // provided something remains before it.
  const hasTrailingTo = normalized.length > 1 && normalized.endsWith('と');
  return hasTrailingTo && isReduplicatedKanaSfx(normalized.slice(0, -1));
}
function hasAdjacentKanaRepeat(text: string): boolean {
const normalized = normalizeJlptTextForExclusion(text);
if (!normalized) {
@@ -485,6 +520,55 @@ function isJlptEligibleToken(token: MergedToken): boolean {
return true;
}
/**
 * True when any textual representation of the token (JLPT lookup text,
 * surface, headword, or reading) matches an excluded interjection term or a
 * kana sound-effect pattern. Each candidate is checked both raw (trimmed) and
 * in its normalized form.
 */
function isExcludedFromSubtitleAnnotationsByTerm(token: MergedToken): boolean {
  const rawCandidates = [
    resolveJlptLookupText(token),
    token.surface,
    token.headword,
    token.reading,
  ];
  for (const raw of rawCandidates) {
    if (typeof raw !== 'string' || raw.length === 0) {
      continue;
    }
    const trimmed = raw.trim();
    if (!trimmed) {
      continue;
    }
    const normalized = normalizeJlptTextForExclusion(trimmed);
    if (!normalized) {
      continue;
    }
    // Exact match against the curated interjection term list.
    if (
      SUBTITLE_ANNOTATION_EXCLUDED_TERMS.has(trimmed) ||
      SUBTITLE_ANNOTATION_EXCLUDED_TERMS.has(normalized)
    ) {
      return true;
    }
    // Pattern-based SFX detection: trailing small-tsu kana (ぐはっ) or
    // reduplicated kana with an optional trailing と.
    if (
      isTrailingSmallTsuKanaSfx(trimmed) ||
      isTrailingSmallTsuKanaSfx(normalized) ||
      isReduplicatedKanaSfxWithOptionalTrailingTo(trimmed) ||
      isReduplicatedKanaSfxWithOptionalTrailingTo(normalized)
    ) {
      return true;
    }
  }
  return false;
}
/**
 * A token is excluded from subtitle annotations when either its POS1 tag
 * marks an excluded part of speech or one of its textual forms matches an
 * excluded term / sound-effect pattern.
 */
export function shouldExcludeTokenFromSubtitleAnnotations(token: MergedToken): boolean {
  const pos1Excluded = isExcludedFromSubtitleAnnotationsByPos1(normalizePos1Tag(token.pos1));
  return pos1Excluded || isExcludedFromSubtitleAnnotationsByTerm(token);
}
function computeTokenKnownStatus(
token: MergedToken,
isKnownWord: (text: string) => boolean,

View File

@@ -212,3 +212,57 @@ test('merges trailing katakana continuation without headword into previous token
],
);
});
// Regression: merged content+function token candidate must not beat a multi-token split
// candidate that preserves the content token as a standalone frequency-eligible unit.
// Background: Yomitan scanning can produce a single-token candidate where a content word
// is merged with trailing function particles (e.g. かかってこいよ → headword かかってくる).
// When a competing multi-token candidate splits content and function separately, the
// multi-token candidate should win so the content token remains frequency-highlightable.
test('multi-token candidate beats single merged content+function token candidate (frequency regression)', () => {
  // Candidate A: single merged token — content verb fused with trailing sentence-final particle
  // This is the "bad" candidate: downstream annotation would exclude frequency for the whole
  // token because the merged pos1 would contain a function-word component.
  const mergedCandidate = makeParseItem('scanning-parser', [
    [{ text: 'かかってこいよ', reading: 'かかってこいよ', headword: 'かかってくる' }],
  ]);
  // Candidate B: two tokens — content verb surface + particle separately.
  // The content token is frequency-eligible on its own.
  const splitCandidate = makeParseItem('scanning-parser', [
    [{ text: 'かかってこい', reading: 'かかってこい', headword: 'かかってくる' }],
    [{ text: 'よ', reading: 'よ', headword: 'よ' }],
  ]);
  // When merged candidate comes first in the array, multi-token split still wins.
  const tokens = selectYomitanParseTokens(
    [mergedCandidate, splitCandidate],
    () => false,
    'headword',
  );
  // The split candidate's two tokens are selected, keeping the content verb a
  // standalone unit with its dictionary headword intact.
  assert.equal(tokens?.length, 2);
  assert.equal(tokens?.[0]?.surface, 'かかってこい');
  assert.equal(tokens?.[0]?.headword, 'かかってくる');
  assert.equal(tokens?.[1]?.surface, 'よ');
});
// Order-independence companion to the regression above: the multi-token split
// must win no matter which candidate appears first in the input array.
test('multi-token candidate beats single merged content+function token regardless of input order', () => {
  // Same two candidates as the previous test, supplied in the opposite order.
  const mergedCandidate = makeParseItem('scanning-parser', [
    [{ text: 'かかってこいよ', reading: 'かかってこいよ', headword: 'かかってくる' }],
  ]);
  const splitCandidate = makeParseItem('scanning-parser', [
    [{ text: 'かかってこい', reading: 'かかってこい', headword: 'かかってくる' }],
    [{ text: 'よ', reading: 'よ', headword: 'よ' }],
  ]);
  // Split candidate comes first — should still win over merged.
  const tokens = selectYomitanParseTokens(
    [splitCandidate, mergedCandidate],
    () => false,
    'headword',
  );
  assert.equal(tokens?.length, 2);
  assert.equal(tokens?.[0]?.surface, 'かかってこい');
  assert.equal(tokens?.[1]?.surface, 'よ');
});