From a317019bb96007161409a80dcbccbf02ffed2f0d Mon Sep 17 00:00:00 2001 From: sudacode Date: Mon, 16 Mar 2026 01:45:58 -0700 Subject: [PATCH] feat(tokenizer): exclude interjections and sound effects from subtitle annotations MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Filter out 感動詞 (interjection) POS1 tokens from annotation payloads - Exclude common interjection terms (ああ, ええ, はあ, etc.) - Exclude reduplicated kana SFX with optional trailing と - shouldExcludeTokenFromSubtitleAnnotations checks both POS1 and term patterns - filterSubtitleAnnotationTokens applied after annotation stage --- src/core/services/tokenizer.test.ts | 101 +++++++++++++++++- src/core/services/tokenizer.ts | 18 +++- .../services/tokenizer/annotation-stage.ts | 84 +++++++++++++++ .../tokenizer/parser-selection-stage.test.ts | 54 ++++++++++ 4 files changed, 253 insertions(+), 4 deletions(-) diff --git a/src/core/services/tokenizer.test.ts b/src/core/services/tokenizer.test.ts index 801d7c2..98e652d 100644 --- a/src/core/services/tokenizer.test.ts +++ b/src/core/services/tokenizer.test.ts @@ -1460,7 +1460,7 @@ test('tokenizeSubtitle skips JLPT level for excluded demonstratives', async () = assert.equal(result.tokens?.[0]?.jlptLevel, undefined); }); -test('tokenizeSubtitle skips JLPT level for repeated kana SFX', async () => { +test('tokenizeSubtitle excludes repeated kana interjections from annotation payloads entirely', async () => { const result = await tokenizeSubtitle( 'ああ', makeDeps({ @@ -1491,8 +1491,7 @@ test('tokenizeSubtitle skips JLPT level for repeated kana SFX', async () => { }), ); - assert.equal(result.tokens?.length, 1); - assert.equal(result.tokens?.[0]?.jlptLevel, undefined); + assert.deepEqual(result, { text: 'ああ', tokens: null }); }); test('tokenizeSubtitle assigns JLPT level to Yomitan tokens', async () => { @@ -3057,6 +3056,102 @@ test('tokenizeSubtitle excludes default non-independent pos2 from N+1 and freque 
assert.equal(result.tokens?.[0]?.isNPlusOneTarget, false); }); +test('tokenizeSubtitle excludes mecab-tagged interjections from annotation payloads entirely', async () => { + const result = await tokenizeSubtitle( + 'ぐはっ', + makeDepsFromYomitanTokens([{ surface: 'ぐはっ', reading: 'ぐはっ', headword: 'ぐはっ' }], { + getFrequencyDictionaryEnabled: () => true, + getFrequencyRank: () => 17, + getJlptLevel: () => 'N5', + tokenizeWithMecab: async () => [ + { + headword: 'ぐはっ', + surface: 'ぐはっ', + reading: 'グハッ', + startPos: 0, + endPos: 3, + partOfSpeech: PartOfSpeech.other, + pos1: '感動詞', + isMerged: true, + isKnown: false, + isNPlusOneTarget: false, + }, + ], + }), + ); + + assert.deepEqual(result, { text: 'ぐはっ', tokens: null }); +}); + +test('tokenizeSubtitle keeps visible text while excluding interjections from mixed annotation payloads', async () => { + const result = await tokenizeSubtitle( + 'ぐはっ 猫', + makeDeps({ + getFrequencyDictionaryEnabled: () => true, + getFrequencyRank: (text) => (text === '猫' ? 11 : 17), + getJlptLevel: (text) => (text === '猫' ? 
'N5' : null), + getYomitanExt: () => ({ id: 'dummy-ext' }) as any, + getYomitanParserWindow: () => + ({ + isDestroyed: () => false, + webContents: { + executeJavaScript: async (script: string) => { + if (script.includes('getTermFrequencies')) { + return []; + } + + return [ + { + source: 'scanning-parser', + index: 0, + content: [ + [{ text: 'ぐはっ', reading: 'ぐはっ', headwords: [[{ term: 'ぐはっ' }]] }], + [{ text: '猫', reading: 'ねこ', headwords: [[{ term: '猫' }]] }], + ], + }, + ]; + }, + }, + }) as unknown as Electron.BrowserWindow, + tokenizeWithMecab: async () => [ + { + headword: 'ぐはっ', + surface: 'ぐはっ', + reading: 'グハッ', + startPos: 0, + endPos: 3, + partOfSpeech: PartOfSpeech.other, + pos1: '感動詞', + isMerged: true, + isKnown: false, + isNPlusOneTarget: false, + }, + { + headword: '猫', + surface: '猫', + reading: 'ネコ', + startPos: 4, + endPos: 5, + partOfSpeech: PartOfSpeech.noun, + pos1: '名詞', + isMerged: true, + isKnown: false, + isNPlusOneTarget: false, + }, + ], + }), + ); + + assert.equal(result.text, 'ぐはっ 猫'); + assert.deepEqual( + result.tokens?.map((token) => ({ + surface: token.surface, + headword: token.headword, + })), + [{ surface: '猫', headword: '猫' }], + ); +}); + test('tokenizeSubtitle excludes single-kana merged tokens from frequency highlighting', async () => { const result = await tokenizeSubtitle( 'た', diff --git a/src/core/services/tokenizer.ts b/src/core/services/tokenizer.ts index 7b48d7a..88993e5 100644 --- a/src/core/services/tokenizer.ts +++ b/src/core/services/tokenizer.ts @@ -178,6 +178,19 @@ async function applyAnnotationStage( ); } +async function filterSubtitleAnnotationTokens(tokens: MergedToken[]): Promise<MergedToken[]> { + if (tokens.length === 0) { + return tokens; + } + + if (!annotationStageModulePromise) { + annotationStageModulePromise = import('./tokenizer/annotation-stage'); + } + + const annotationStage = await annotationStageModulePromise; + return tokens.filter((token) => !annotationStage.shouldExcludeTokenFromSubtitleAnnotations(token)); 
+} + export function createTokenizerDepsRuntime( options: TokenizerDepsRuntimeOptions, ): TokenizerServiceDeps { @@ -698,9 +711,12 @@ export async function tokenizeSubtitle( const yomitanTokens = await parseWithYomitanInternalParser(tokenizeText, deps, annotationOptions); if (yomitanTokens && yomitanTokens.length > 0) { + const filteredTokens = await filterSubtitleAnnotationTokens( + await applyAnnotationStage(yomitanTokens, deps, annotationOptions), + ); return { text: displayText, - tokens: await applyAnnotationStage(yomitanTokens, deps, annotationOptions), + tokens: filteredTokens.length > 0 ? filteredTokens : null, }; } diff --git a/src/core/services/tokenizer/annotation-stage.ts b/src/core/services/tokenizer/annotation-stage.ts index fcd1449..e3df7de 100644 --- a/src/core/services/tokenizer/annotation-stage.ts +++ b/src/core/services/tokenizer/annotation-stage.ts @@ -14,6 +14,17 @@ const KATAKANA_TO_HIRAGANA_OFFSET = 0x60; const KATAKANA_CODEPOINT_START = 0x30a1; const KATAKANA_CODEPOINT_END = 0x30f6; const JLPT_LEVEL_LOOKUP_CACHE_LIMIT = 2048; +const SUBTITLE_ANNOTATION_EXCLUDED_TERMS = new Set([ + 'ああ', + 'ええ', + 'うう', + 'おお', + 'はあ', + 'はは', + 'へえ', + 'ふう', + 'ほう', +]); const jlptLevelLookupCaches = new WeakMap< (text: string) => JlptLevel | null, @@ -48,6 +59,8 @@ function normalizePos1Tag(pos1: string | undefined): string { return typeof pos1 === 'string' ? 
pos1.trim() : ''; } +const SUBTITLE_ANNOTATION_EXCLUDED_POS1 = new Set(['感動詞']); + function splitNormalizedTagParts(normalizedTag: string): string[] { if (!normalizedTag) { return []; @@ -69,6 +82,11 @@ function isExcludedByTagSet(normalizedTag: string, exclusions: ReadonlySet<strin return parts.some((part) => exclusions.has(part)); } +function isExcludedFromSubtitleAnnotationsByPos1(normalizedPos1: string): boolean { + const parts = splitNormalizedTagParts(normalizedPos1); + return parts.some((part) => SUBTITLE_ANNOTATION_EXCLUDED_POS1.has(part)); +} + function resolvePos1Exclusions(options: AnnotationStageOptions): ReadonlySet<string> { if (options.pos1Exclusions) { return options.pos1Exclusions; @@ -383,6 +401,23 @@ function isReduplicatedKanaSfx(text: string): boolean { return chars.slice(0, half).join('') === chars.slice(half).join(''); } +function isReduplicatedKanaSfxWithOptionalTrailingTo(text: string): boolean { + const normalized = normalizeJlptTextForExclusion(text); + if (!normalized) { + return false; + } + + if (isReduplicatedKanaSfx(normalized)) { + return true; + } + + if (normalized.length <= 1 || !normalized.endsWith('と')) { + return false; + } + + return isReduplicatedKanaSfx(normalized.slice(0, -1)); +} + function hasAdjacentKanaRepeat(text: string): boolean { const normalized = normalizeJlptTextForExclusion(text); if (!normalized) { @@ -485,6 +520,55 @@ function isJlptEligibleToken(token: MergedToken): boolean { return true; } +function isExcludedFromSubtitleAnnotationsByTerm(token: MergedToken): boolean { + const candidates = [ + resolveJlptLookupText(token), + token.surface, + token.headword, + token.reading, + ].filter( + (candidate): candidate is string => typeof candidate === 'string' && candidate.length > 0, + ); + + for (const candidate of candidates) { + const trimmedCandidate = candidate.trim(); + if (!trimmedCandidate) { + continue; + } + + const normalizedCandidate = normalizeJlptTextForExclusion(trimmedCandidate); + if (!normalizedCandidate) { + continue; + } + + if ( + 
SUBTITLE_ANNOTATION_EXCLUDED_TERMS.has(trimmedCandidate) || + SUBTITLE_ANNOTATION_EXCLUDED_TERMS.has(normalizedCandidate) + ) { + return true; + } + + if ( + isTrailingSmallTsuKanaSfx(trimmedCandidate) || + isTrailingSmallTsuKanaSfx(normalizedCandidate) || + isReduplicatedKanaSfxWithOptionalTrailingTo(trimmedCandidate) || + isReduplicatedKanaSfxWithOptionalTrailingTo(normalizedCandidate) + ) { + return true; + } + } + + return false; +} + +export function shouldExcludeTokenFromSubtitleAnnotations(token: MergedToken): boolean { + if (isExcludedFromSubtitleAnnotationsByPos1(normalizePos1Tag(token.pos1))) { + return true; + } + + return isExcludedFromSubtitleAnnotationsByTerm(token); +} + function computeTokenKnownStatus( token: MergedToken, isKnownWord: (text: string) => boolean, diff --git a/src/core/services/tokenizer/parser-selection-stage.test.ts b/src/core/services/tokenizer/parser-selection-stage.test.ts index 607f3b4..8ffb8f6 100644 --- a/src/core/services/tokenizer/parser-selection-stage.test.ts +++ b/src/core/services/tokenizer/parser-selection-stage.test.ts @@ -212,3 +212,57 @@ test('merges trailing katakana continuation without headword into previous token ], ); }); + +// Regression: merged content+function token candidate must not beat a multi-token split +// candidate that preserves the content token as a standalone frequency-eligible unit. +// Background: Yomitan scanning can produce a single-token candidate where a content word +// is merged with trailing function particles (e.g. かかってこいよ → headword かかってくる). +// When a competing multi-token candidate splits content and function separately, the +// multi-token candidate should win so the content token remains frequency-highlightable. 
+test('multi-token candidate beats single merged content+function token candidate (frequency regression)', () => { + // Candidate A: single merged token — content verb fused with trailing sentence-final particle + // This is the "bad" candidate: downstream annotation would exclude frequency for the whole + // token because the merged pos1 would contain a function-word component. + const mergedCandidate = makeParseItem('scanning-parser', [ + [{ text: 'かかってこいよ', reading: 'かかってこいよ', headword: 'かかってくる' }], + ]); + + // Candidate B: two tokens — content verb surface + particle separately. + // The content token is frequency-eligible on its own. + const splitCandidate = makeParseItem('scanning-parser', [ + [{ text: 'かかってこい', reading: 'かかってこい', headword: 'かかってくる' }], + [{ text: 'よ', reading: 'よ', headword: 'よ' }], + ]); + + // When merged candidate comes first in the array, multi-token split still wins. + const tokens = selectYomitanParseTokens( + [mergedCandidate, splitCandidate], + () => false, + 'headword', + ); + assert.equal(tokens?.length, 2); + assert.equal(tokens?.[0]?.surface, 'かかってこい'); + assert.equal(tokens?.[0]?.headword, 'かかってくる'); + assert.equal(tokens?.[1]?.surface, 'よ'); +}); + +test('multi-token candidate beats single merged content+function token regardless of input order', () => { + const mergedCandidate = makeParseItem('scanning-parser', [ + [{ text: 'かかってこいよ', reading: 'かかってこいよ', headword: 'かかってくる' }], + ]); + + const splitCandidate = makeParseItem('scanning-parser', [ + [{ text: 'かかってこい', reading: 'かかってこい', headword: 'かかってくる' }], + [{ text: 'よ', reading: 'よ', headword: 'よ' }], + ]); + + // Split candidate comes first — should still win over merged. + const tokens = selectYomitanParseTokens( + [splitCandidate, mergedCandidate], + () => false, + 'headword', + ); + assert.equal(tokens?.length, 2); + assert.equal(tokens?.[0]?.surface, 'かかってこい'); + assert.equal(tokens?.[1]?.surface, 'よ'); +});