From 42028d0a4dee8d07fb9a631a120518cfb2534809 Mon Sep 17 00:00:00 2001 From: sudacode Date: Thu, 19 Mar 2026 23:48:38 -0700 Subject: [PATCH] fix(subtitle): unify annotation token filtering --- src/core/services/tokenizer.test.ts | 113 ++++++ .../tokenizer/annotation-stage.test.ts | 54 ++- .../services/tokenizer/annotation-stage.ts | 43 +-- .../tokenizer/subtitle-annotation-filter.ts | 341 ++++++++++++++++++ src/token-merger.ts | 5 + 5 files changed, 527 insertions(+), 29 deletions(-) create mode 100644 src/core/services/tokenizer/subtitle-annotation-filter.ts diff --git a/src/core/services/tokenizer.test.ts b/src/core/services/tokenizer.test.ts index ee933a9..a824ca9 100644 --- a/src/core/services/tokenizer.test.ts +++ b/src/core/services/tokenizer.test.ts @@ -3628,6 +3628,119 @@ test('tokenizeSubtitle excludes merged function/content token from frequency hig assert.equal(result.tokens?.[0]?.isNPlusOneTarget, true); }); +test('tokenizeSubtitle clears all annotations for kana-only demonstrative helper merges', async () => { + const result = await tokenizeSubtitle( + 'これで実力どおりか', + makeDepsFromYomitanTokens( + [ + { surface: 'これで', reading: 'これで', headword: 'これ' }, + { surface: '実力どおり', reading: 'じつりょくどおり', headword: '実力どおり' }, + { surface: 'か', reading: 'か', headword: 'か' }, + ], + { + getFrequencyDictionaryEnabled: () => true, + getFrequencyRank: (text) => + text === 'これ' ? 9 : text === '実力どおり' ? 2500 : text === 'か' ? 800 : null, + getJlptLevel: (text) => + text === 'これ' ? 'N5' : text === '実力どおり' ? 'N1' : text === 'か' ? 'N5' : null, + isKnownWord: (text) => text === 'これ', + getMinSentenceWordsForNPlusOne: () => 1, + tokenizeWithMecab: async () => [ + { + headword: 'これ', + surface: 'これ', + reading: 'コレ', + startPos: 0, + endPos: 2, + partOfSpeech: PartOfSpeech.noun, + pos1: '名詞', + pos2: '代名詞', + isMerged: false, + isKnown: false, + isNPlusOneTarget: false, + }, + { + headword: 'で', + surface: 'で', + reading: 'デ', + startPos: 2, + endPos: 3, + partOfSpeech: PartOfSpeech.particle, + pos1: '助詞', + pos2: '格助詞', + isMerged: false, + isKnown: false, + isNPlusOneTarget: false, + }, + { + headword: '実力どおり', + surface: '実力どおり', + reading: 'ジツリョクドオリ', + startPos: 3, + endPos: 8, + partOfSpeech: PartOfSpeech.noun, + pos1: '名詞', + pos2: '一般', + isMerged: false, + isKnown: false, + isNPlusOneTarget: false, + }, + { + headword: 'か', + surface: 'か', + reading: 'カ', + startPos: 8, + endPos: 9, + partOfSpeech: PartOfSpeech.particle, + pos1: '助詞', + pos2: '終助詞', + isMerged: false, + isKnown: false, + isNPlusOneTarget: false, + }, + ], + }, + ), + ); + + assert.deepEqual( + result.tokens?.map((token) => ({ + surface: token.surface, + headword: token.headword, + isKnown: token.isKnown, + isNPlusOneTarget: token.isNPlusOneTarget, + frequencyRank: token.frequencyRank, + jlptLevel: token.jlptLevel, + })), + [ + { + surface: 'これで', + headword: 'これ', + isKnown: false, + isNPlusOneTarget: false, + frequencyRank: undefined, + jlptLevel: undefined, + }, + { + surface: '実力どおり', + headword: '実力どおり', + isKnown: false, + isNPlusOneTarget: true, + frequencyRank: 2500, + jlptLevel: 'N1', + }, + { + surface: 'か', + headword: 'か', + isKnown: false, + isNPlusOneTarget: false, + frequencyRank: undefined, + jlptLevel: undefined, + }, + ], + ); +}); + test('tokenizeSubtitle keeps frequency for content-led merged token with trailing colloquial suffixes', async () => { const result = await tokenizeSubtitle( '張り切ってんじゃ', diff --git a/src/core/services/tokenizer/annotation-stage.test.ts b/src/core/services/tokenizer/annotation-stage.test.ts index 1ea49f1..d9a6727 100644 --- a/src/core/services/tokenizer/annotation-stage.test.ts +++ b/src/core/services/tokenizer/annotation-stage.test.ts @@ -316,6 +316,19 @@ test('shouldExcludeTokenFromSubtitleAnnotations excludes merged lexical tokens w assert.equal(shouldExcludeTokenFromSubtitleAnnotations(token), true); }); +test('shouldExcludeTokenFromSubtitleAnnotations excludes kana-only demonstrative helper merges', () => { + const token = makeToken({ + surface: 'これで', + headword: 'これ', + reading: 'コレデ', + partOfSpeech: PartOfSpeech.noun, + pos1: '名詞|助詞', + pos2: '代名詞|格助詞', + }); + + assert.equal(shouldExcludeTokenFromSubtitleAnnotations(token), true); +}); + test('stripSubtitleAnnotationMetadata keeps token hover data while clearing annotation fields', () => { const token = makeToken({ surface: 'は', @@ -481,8 +494,8 @@ test('annotateTokens N+1 minimum sentence words counts only eligible word tokens ); assert.equal(result[0]?.isKnown, false); - assert.equal(result[1]?.isKnown, true); - assert.equal(result[2]?.isKnown, true); + assert.equal(result[1]?.isKnown, false); + assert.equal(result[2]?.isKnown, false); assert.equal(result[0]?.isNPlusOneTarget, false); }); @@ -568,7 +581,7 @@ test('annotateTokens excludes default non-independent pos2 from frequency and N+ assert.equal(result[0]?.isNPlusOneTarget, false); }); -test('annotateTokens keeps frequency for kanji noun tokens even when mecab marks them non-independent', () => { +test('annotateTokens clears all annotations for non-independent kanji noun tokens under unified gate', () => { const tokens = [ makeToken({ surface: '者', @@ -588,7 +601,10 @@ test('annotateTokens keeps frequency for kanji noun tokens even when mecab marks minSentenceWordsForNPlusOne: 1, }); - assert.equal(result[0]?.frequencyRank, 475); + assert.equal(result[0]?.isKnown, false); + assert.equal(result[0]?.isNPlusOneTarget, false); + assert.equal(result[0]?.frequencyRank, undefined); + assert.equal(result[0]?.jlptLevel, undefined); }); test('annotateTokens excludes likely kana SFX tokens from frequency when POS tags are missing', () => { @@ -742,3 +758,33 @@ test('annotateTokens excludes composite tokens when all component pos tags are e assert.equal(result[0]?.frequencyRank, undefined); assert.equal(result[0]?.isNPlusOneTarget, false); }); + +test('annotateTokens applies one shared exclusion gate across known N+1 frequency and JLPT', () => { + const tokens = [ + makeToken({ + surface: 'これで', + headword: 'これ', + reading: 'コレデ', + partOfSpeech: PartOfSpeech.noun, + pos1: '名詞|助詞', + pos2: '代名詞|格助詞', + startPos: 0, + endPos: 3, + frequencyRank: 9, + }), + ]; + + const result = annotateTokens( + tokens, + makeDeps({ + isKnownWord: (text) => text === 'これ', + getJlptLevel: (text) => (text === 'これ' ? 'N5' : null), + }), + { minSentenceWordsForNPlusOne: 1 }, + ); + + assert.equal(result[0]?.isKnown, false); + assert.equal(result[0]?.isNPlusOneTarget, false); + assert.equal(result[0]?.frequencyRank, undefined); + assert.equal(result[0]?.jlptLevel, undefined); +}); diff --git a/src/core/services/tokenizer/annotation-stage.ts b/src/core/services/tokenizer/annotation-stage.ts index e9f9dda..2931b03 100644 --- a/src/core/services/tokenizer/annotation-stage.ts +++ b/src/core/services/tokenizer/annotation-stage.ts @@ -9,6 +9,10 @@ import { } from '../../../token-pos2-exclusions'; import { JlptLevel, MergedToken, NPlusOneMatchMode, PartOfSpeech } from '../../../types'; import { shouldIgnoreJlptByTerm, shouldIgnoreJlptForMecabPos1 } from '../jlpt-token-filter'; +import { + shouldExcludeTokenFromSubtitleAnnotations as sharedShouldExcludeTokenFromSubtitleAnnotations, + stripSubtitleAnnotationMetadata as sharedStripSubtitleAnnotationMetadata, +} from './subtitle-annotation-filter'; const KATAKANA_TO_HIRAGANA_OFFSET = 0x60; const KATAKANA_CODEPOINT_START = 0x30a1; @@ -633,34 +637,11 @@ function isExcludedFromSubtitleAnnotationsByTerm(token: MergedToken): boolean { } export function shouldExcludeTokenFromSubtitleAnnotations(token: MergedToken): boolean { - if (isExcludedFromSubtitleAnnotationsByPos1(normalizePos1Tag(token.pos1))) { - return true; - } - - if (isAuxiliaryStemGrammarTailToken(token)) { - return true; - } - - if (isExcludedTrailingParticleMergedToken(token)) { - return true; - } - - return isExcludedFromSubtitleAnnotationsByTerm(token); + return sharedShouldExcludeTokenFromSubtitleAnnotations(token); } export function stripSubtitleAnnotationMetadata(token: MergedToken): MergedToken { - if (!shouldExcludeTokenFromSubtitleAnnotations(token)) { - return token; - } - - return { - ...token, - isKnown: false, - isNPlusOneTarget: false, - isNameMatch: false, - jlptLevel: undefined, - frequencyRank: undefined, - }; + return sharedStripSubtitleAnnotationMetadata(token); } function computeTokenKnownStatus( @@ -737,6 +718,18 @@ export function annotateTokens( // Single pass: compute known word status, frequency filtering, and JLPT level together const annotated = tokens.map((token) => { + if ( + sharedShouldExcludeTokenFromSubtitleAnnotations(token, { + pos1Exclusions, + pos2Exclusions, + }) + ) { + return sharedStripSubtitleAnnotationMetadata(token, { + pos1Exclusions, + pos2Exclusions, + }); + } + const prioritizedNameMatch = nameMatchEnabled && token.isNameMatch === true; const isKnown = nPlusOneEnabled ? computeTokenKnownStatus(token, deps.isKnownWord, deps.knownWordMatchMode) diff --git a/src/core/services/tokenizer/subtitle-annotation-filter.ts b/src/core/services/tokenizer/subtitle-annotation-filter.ts new file mode 100644 index 0000000..03ea473 --- /dev/null +++ b/src/core/services/tokenizer/subtitle-annotation-filter.ts @@ -0,0 +1,341 @@ +import { + DEFAULT_ANNOTATION_POS1_EXCLUSION_CONFIG, + resolveAnnotationPos1ExclusionSet, +} from '../../../token-pos1-exclusions'; +import { + DEFAULT_ANNOTATION_POS2_EXCLUSION_CONFIG, + resolveAnnotationPos2ExclusionSet, +} from '../../../token-pos2-exclusions'; +import { MergedToken, PartOfSpeech } from '../../../types'; +import { shouldIgnoreJlptByTerm } from '../jlpt-token-filter'; + +const KATAKANA_TO_HIRAGANA_OFFSET = 0x60; +const KATAKANA_CODEPOINT_START = 0x30a1; +const KATAKANA_CODEPOINT_END = 0x30f6; + +const SUBTITLE_ANNOTATION_EXCLUDED_TERMS = new Set([ + 'ああ', + 'ええ', + 'うう', + 'おお', + 'はあ', + 'はは', + 'へえ', + 'ふう', + 'ほう', +]); +const SUBTITLE_ANNOTATION_EXCLUDED_EXPLANATORY_ENDING_PREFIXES = ['ん', 'の', 'なん', 'なの']; +const SUBTITLE_ANNOTATION_EXCLUDED_EXPLANATORY_ENDING_CORES = [ + 'だ', + 'です', + 'でした', + 'だった', + 'では', + 'じゃ', + 'でしょう', + 'だろう', +] as const; +const SUBTITLE_ANNOTATION_EXCLUDED_EXPLANATORY_ENDING_TRAILING_PARTICLES = [ + '', + 'か', + 'ね', + 'よ', + 'な', + 'よね', + 'かな', + 'かね', +] as const; +const SUBTITLE_ANNOTATION_EXCLUDED_EXPLANATORY_ENDINGS = new Set( + SUBTITLE_ANNOTATION_EXCLUDED_EXPLANATORY_ENDING_PREFIXES.flatMap((prefix) => + SUBTITLE_ANNOTATION_EXCLUDED_EXPLANATORY_ENDING_CORES.flatMap((core) => + SUBTITLE_ANNOTATION_EXCLUDED_EXPLANATORY_ENDING_TRAILING_PARTICLES.map( + (particle) => `${prefix}${core}${particle}`, + ), + ), + ), +); +const SUBTITLE_ANNOTATION_EXCLUDED_TRAILING_PARTICLE_SUFFIXES = new Set([ + 'って', + 'ってよ', + 'ってね', + 'ってな', + 'ってさ', + 'ってか', + 'ってば', +]); +const AUXILIARY_STEM_GRAMMAR_TAIL_POS1 = new Set(['名詞', '助動詞', '助詞']); + +export interface SubtitleAnnotationFilterOptions { + pos1Exclusions?: ReadonlySet; + pos2Exclusions?: ReadonlySet; +} + +function normalizePosTag(pos: string | undefined): string { + return typeof pos === 'string' ? pos.trim() : ''; +} + +function splitNormalizedTagParts(normalizedTag: string): string[] { + if (!normalizedTag) { + return []; + } + + return normalizedTag + .split('|') + .map((part) => part.trim()) + .filter((part) => part.length > 0); +} + +function isExcludedByTagSet(normalizedTag: string, exclusions: ReadonlySet): boolean { + const parts = splitNormalizedTagParts(normalizedTag); + if (parts.length === 0) { + return false; + } + + return parts.every((part) => exclusions.has(part)); +} + +function resolvePos1Exclusions( + options: SubtitleAnnotationFilterOptions = {}, +): ReadonlySet { + if (options.pos1Exclusions) { + return options.pos1Exclusions; + } + + return resolveAnnotationPos1ExclusionSet(DEFAULT_ANNOTATION_POS1_EXCLUSION_CONFIG); +} + +function resolvePos2Exclusions( + options: SubtitleAnnotationFilterOptions = {}, +): ReadonlySet { + if (options.pos2Exclusions) { + return options.pos2Exclusions; + } + + return resolveAnnotationPos2ExclusionSet(DEFAULT_ANNOTATION_POS2_EXCLUSION_CONFIG); +} + +function normalizeKana(text: string): string { + const raw = text.trim(); + if (!raw) { + return ''; + } + + let normalized = ''; + for (const char of raw) { + const code = char.codePointAt(0); + if (code === undefined) { + continue; + } + + if (code >= KATAKANA_CODEPOINT_START && code <= KATAKANA_CODEPOINT_END) { + normalized += String.fromCodePoint(code - KATAKANA_TO_HIRAGANA_OFFSET); + continue; + } + + normalized += char; + } + + return normalized; +} + +function isKanaChar(char: string): boolean { + const code = char.codePointAt(0); + if (code === undefined) { + return false; + } + + return ( + (code >= 0x3041 && code <= 0x3096) || + (code >= 0x309b && code <= 0x309f) || + code === 0x30fc || + (code >= 0x30a0 && code <= 0x30fa) || + (code >= 0x30fd && code <= 0x30ff) + ); +} + +function isTrailingSmallTsuKanaSfx(text: string): boolean { + const normalized = normalizeKana(text); + if (!normalized) { + return false; + } + + const chars = [...normalized]; + if (chars.length < 2 || chars.length > 4) { + return false; + } + + if (!chars.every(isKanaChar)) { + return false; + } + + return chars[chars.length - 1] === 'っ'; +} + +function isReduplicatedKanaSfx(text: string): boolean { + const normalized = normalizeKana(text); + if (!normalized) { + return false; + } + + const chars = [...normalized]; + if (chars.length < 4 || chars.length % 2 !== 0) { + return false; + } + + if (!chars.every(isKanaChar)) { + return false; + } + + const half = chars.length / 2; + return chars.slice(0, half).join('') === chars.slice(half).join(''); +} + +function isReduplicatedKanaSfxWithOptionalTrailingTo(text: string): boolean { + const normalized = normalizeKana(text); + if (!normalized) { + return false; + } + + if (isReduplicatedKanaSfx(normalized)) { + return true; + } + + if (normalized.length <= 1 || !normalized.endsWith('と')) { + return false; + } + + return isReduplicatedKanaSfx(normalized.slice(0, -1)); +} + +function isExcludedTrailingParticleMergedToken(token: MergedToken): boolean { + const normalizedSurface = normalizeKana(token.surface); + const normalizedHeadword = normalizeKana(token.headword); + if (!normalizedSurface || !normalizedHeadword || !normalizedSurface.startsWith(normalizedHeadword)) { + return false; + } + + const suffix = normalizedSurface.slice(normalizedHeadword.length); + if (!SUBTITLE_ANNOTATION_EXCLUDED_TRAILING_PARTICLE_SUFFIXES.has(suffix)) { + return false; + } + + const pos1Parts = splitNormalizedTagParts(normalizePosTag(token.pos1)); + if (pos1Parts.length < 2) { + return false; + } + + const [leadingPos1, ...trailingPos1] = pos1Parts; + if (!leadingPos1 || resolvePos1Exclusions().has(leadingPos1)) { + return false; + } + + return trailingPos1.length > 0 && trailingPos1.every((part) => part === '助詞'); +} + +function isAuxiliaryStemGrammarTailToken(token: MergedToken): boolean { + const pos1Parts = splitNormalizedTagParts(normalizePosTag(token.pos1)); + if (pos1Parts.length === 0 || !pos1Parts.every((part) => AUXILIARY_STEM_GRAMMAR_TAIL_POS1.has(part))) { + return false; + } + + const pos3Parts = splitNormalizedTagParts(normalizePosTag(token.pos3)); + return pos3Parts.includes('助動詞語幹'); +} + +function isExcludedByTerm(token: MergedToken): boolean { + const candidates = [token.surface, token.reading, token.headword].filter( + (candidate): candidate is string => typeof candidate === 'string' && candidate.length > 0, + ); + + for (const candidate of candidates) { + const trimmed = candidate.trim(); + if (!trimmed) { + continue; + } + + const normalized = normalizeKana(trimmed); + if (!normalized) { + continue; + } + + if ( + SUBTITLE_ANNOTATION_EXCLUDED_TERMS.has(trimmed) || + SUBTITLE_ANNOTATION_EXCLUDED_TERMS.has(normalized) || + SUBTITLE_ANNOTATION_EXCLUDED_EXPLANATORY_ENDINGS.has(trimmed) || + SUBTITLE_ANNOTATION_EXCLUDED_EXPLANATORY_ENDINGS.has(normalized) || + shouldIgnoreJlptByTerm(trimmed) || + shouldIgnoreJlptByTerm(normalized) + ) { + return true; + } + + if ( + isTrailingSmallTsuKanaSfx(trimmed) || + isTrailingSmallTsuKanaSfx(normalized) || + isReduplicatedKanaSfxWithOptionalTrailingTo(trimmed) || + isReduplicatedKanaSfxWithOptionalTrailingTo(normalized) + ) { + return true; + } + } + + return false; +} + +export function shouldExcludeTokenFromSubtitleAnnotations( + token: MergedToken, + options: SubtitleAnnotationFilterOptions = {}, +): boolean { + const pos1Exclusions = resolvePos1Exclusions(options); + const pos2Exclusions = resolvePos2Exclusions(options); + const normalizedPos1 = normalizePosTag(token.pos1); + const normalizedPos2 = normalizePosTag(token.pos2); + const hasPos1 = normalizedPos1.length > 0; + const hasPos2 = normalizedPos2.length > 0; + + if (isExcludedByTagSet(normalizedPos1, pos1Exclusions)) { + return true; + } + + if (isExcludedByTagSet(normalizedPos2, pos2Exclusions)) { + return true; + } + + if ( + !hasPos1 && + !hasPos2 && + (token.partOfSpeech === PartOfSpeech.particle || + token.partOfSpeech === PartOfSpeech.bound_auxiliary || + token.partOfSpeech === PartOfSpeech.symbol) + ) { + return true; + } + + if (isAuxiliaryStemGrammarTailToken(token)) { + return true; + } + + if (isExcludedTrailingParticleMergedToken(token)) { + return true; + } + + return isExcludedByTerm(token); +} + +export function stripSubtitleAnnotationMetadata( + token: MergedToken, + options: SubtitleAnnotationFilterOptions = {}, +): MergedToken { + if (!shouldExcludeTokenFromSubtitleAnnotations(token, options)) { + return token; + } + + return { + ...token, + isKnown: false, + isNPlusOneTarget: false, + isNameMatch: false, + jlptLevel: undefined, + frequencyRank: undefined, + }; +} diff --git a/src/token-merger.ts b/src/token-merger.ts index fbec420..493725c 100644 --- a/src/token-merger.ts +++ b/src/token-merger.ts @@ -19,6 +19,7 @@ import { PartOfSpeech, Token, MergedToken } from './types'; import { DEFAULT_ANNOTATION_POS1_EXCLUSION_CONFIG } from './token-pos1-exclusions'; import { DEFAULT_ANNOTATION_POS2_EXCLUSION_CONFIG } from './token-pos2-exclusions'; +import { shouldExcludeTokenFromSubtitleAnnotations } from './core/services/tokenizer/subtitle-annotation-filter'; export function isNoun(tok: Token): boolean { return tok.partOfSpeech === PartOfSpeech.noun; @@ -297,6 +298,10 @@ function isNPlusOneWordCountToken( pos1Exclusions: ReadonlySet = N_PLUS_ONE_IGNORED_POS1, pos2Exclusions: ReadonlySet = N_PLUS_ONE_IGNORED_POS2, ): boolean { + if (shouldExcludeTokenFromSubtitleAnnotations(token, { pos1Exclusions, pos2Exclusions })) { + return false; + } + const normalizedPos1 = normalizePos1Tag(token.pos1); const hasPos1 = normalizedPos1.length > 0; if (isExcludedByTagSet(normalizedPos1, pos1Exclusions)) {