diff --git a/src/core/services/tokenizer.test.ts b/src/core/services/tokenizer.test.ts index a824ca9..f15063c 100644 --- a/src/core/services/tokenizer.test.ts +++ b/src/core/services/tokenizer.test.ts @@ -3741,6 +3741,98 @@ test('tokenizeSubtitle clears all annotations for kana-only demonstrative helper ); }); +test('tokenizeSubtitle clears all annotations for explanatory pondering endings', async () => { + const result = await tokenizeSubtitle( + '俺どうかしちゃったのかな', + makeDepsFromYomitanTokens( + [ + { surface: '俺', reading: 'おれ', headword: '俺' }, + { surface: 'どうかしちゃった', reading: 'どうかしちゃった', headword: 'どうかしちゃう' }, + { surface: 'のかな', reading: 'のかな', headword: 'の' }, + ], + { + getFrequencyDictionaryEnabled: () => true, + getFrequencyRank: (text) => (text === '俺' ? 19 : text === 'どうかしちゃう' ? 3200 : 77), + getJlptLevel: (text) => + text === '俺' ? 'N5' : text === 'どうかしちゃう' ? 'N3' : text === 'の' ? 'N5' : null, + isKnownWord: (text) => text === '俺' || text === 'の', + getMinSentenceWordsForNPlusOne: () => 1, + tokenizeWithMecab: async () => [ + { + headword: '俺', + surface: '俺', + reading: 'オレ', + startPos: 0, + endPos: 1, + partOfSpeech: PartOfSpeech.noun, + pos1: '名詞', + pos2: '代名詞', + isMerged: false, + isKnown: false, + isNPlusOneTarget: false, + }, + { + headword: 'どうかしちゃう', + surface: 'どうかしちゃった', + reading: 'ドウカシチャッタ', + startPos: 1, + endPos: 8, + partOfSpeech: PartOfSpeech.verb, + pos1: '動詞', + pos2: '自立', + isMerged: false, + isKnown: false, + isNPlusOneTarget: false, + }, + { + headword: 'の', + surface: 'のかな', + reading: 'ノカナ', + startPos: 8, + endPos: 11, + partOfSpeech: PartOfSpeech.other, + pos1: '名詞|助動詞', + pos2: '非自立', + isMerged: false, + isKnown: false, + isNPlusOneTarget: false, + }, + ], + }, + ), + ); + + assert.deepEqual( + result.tokens?.map((token) => ({ + surface: token.surface, + headword: token.headword, + isKnown: token.isKnown, + isNPlusOneTarget: token.isNPlusOneTarget, + frequencyRank: token.frequencyRank, + jlptLevel: token.jlptLevel, + })), + [ + { surface: '俺', headword: '俺', isKnown: true, isNPlusOneTarget: false, frequencyRank: 19, jlptLevel: 'N5' }, + { + surface: 'どうかしちゃった', + headword: 'どうかしちゃう', + isKnown: false, + isNPlusOneTarget: true, + frequencyRank: 3200, + jlptLevel: 'N3', + }, + { + surface: 'のかな', + headword: 'の', + isKnown: false, + isNPlusOneTarget: false, + frequencyRank: undefined, + jlptLevel: undefined, + }, + ], + ); +}); + test('tokenizeSubtitle keeps frequency for content-led merged token with trailing colloquial suffixes', async () => { const result = await tokenizeSubtitle( '張り切ってんじゃ', diff --git a/src/core/services/tokenizer/annotation-stage.test.ts b/src/core/services/tokenizer/annotation-stage.test.ts index d9a6727..7093a72 100644 --- a/src/core/services/tokenizer/annotation-stage.test.ts +++ b/src/core/services/tokenizer/annotation-stage.test.ts @@ -234,6 +234,18 @@ test('shouldExcludeTokenFromSubtitleAnnotations excludes explanatory ending vari } }); +test('shouldExcludeTokenFromSubtitleAnnotations excludes explanatory pondering endings', () => { + const token = makeToken({ + surface: 'のかな', + headword: 'の', + reading: 'ノカナ', + pos1: '名詞|助動詞', + pos2: '非自立', + }); + + assert.equal(shouldExcludeTokenFromSubtitleAnnotations(token), true); +}); + test('shouldExcludeTokenFromSubtitleAnnotations excludes auxiliary-stem そうだ grammar tails', () => { const token = makeToken({ surface: 'そうだ', diff --git a/src/core/services/tokenizer/subtitle-annotation-filter.ts b/src/core/services/tokenizer/subtitle-annotation-filter.ts index 03ea473..b0464fe 100644 --- a/src/core/services/tokenizer/subtitle-annotation-filter.ts +++ b/src/core/services/tokenizer/subtitle-annotation-filter.ts @@ -45,6 +45,7 @@ const SUBTITLE_ANNOTATION_EXCLUDED_EXPLANATORY_ENDING_TRAILING_PARTICLES = [ 'かな', 'かね', ] as const; +const SUBTITLE_ANNOTATION_EXCLUDED_EXPLANATORY_ENDING_THOUGHT_SUFFIXES = ['か', 'かな', 'かね'] as const; const SUBTITLE_ANNOTATION_EXCLUDED_EXPLANATORY_ENDINGS = new Set( SUBTITLE_ANNOTATION_EXCLUDED_EXPLANATORY_ENDING_PREFIXES.flatMap((prefix) => SUBTITLE_ANNOTATION_EXCLUDED_EXPLANATORY_ENDING_CORES.flatMap((core) => @@ -258,6 +259,16 @@ function isExcludedByTerm(token: MergedToken): boolean { continue; } + if ( + SUBTITLE_ANNOTATION_EXCLUDED_EXPLANATORY_ENDING_PREFIXES.some((prefix) => + SUBTITLE_ANNOTATION_EXCLUDED_EXPLANATORY_ENDING_THOUGHT_SUFFIXES.some( + (suffix) => normalized === `${prefix}${suffix}`, + ), + ) + ) { + return true; + } + if ( SUBTITLE_ANNOTATION_EXCLUDED_TERMS.has(trimmed) || SUBTITLE_ANNOTATION_EXCLUDED_TERMS.has(normalized) ||