diff --git a/changes/2026-03-23-tokenizer-explanatory-contrast-ending.md b/changes/2026-03-23-tokenizer-explanatory-contrast-ending.md new file mode 100644 index 0000000..1f9fe85 --- /dev/null +++ b/changes/2026-03-23-tokenizer-explanatory-contrast-ending.md @@ -0,0 +1,4 @@ +type: fixed +area: tokenizer + +- Fixed subtitle annotation clearing so explanatory contrast endings like `んですけど` are excluded consistently across the shared tokenizer filter and annotation stage. diff --git a/src/core/services/tokenizer.test.ts b/src/core/services/tokenizer.test.ts index f15063c..2fd57c0 100644 --- a/src/core/services/tokenizer.test.ts +++ b/src/core/services/tokenizer.test.ts @@ -3893,6 +3893,172 @@ test('tokenizeSubtitle keeps frequency for content-led merged token with trailin assert.equal(result.tokens?.[0]?.frequencyRank, 5468); }); +test('tokenizeSubtitle clears all annotations for explanatory contrast endings', async () => { + const result = await tokenizeSubtitle( + '最近辛いものが続いとるんですけど', + makeDepsFromYomitanTokens( + [ + { surface: '最近', reading: 'さいきん', headword: '最近' }, + { surface: '辛い', reading: 'つらい', headword: '辛い' }, + { surface: 'もの', reading: 'もの', headword: 'もの' }, + { surface: 'が', reading: 'が', headword: 'が' }, + { surface: '続いとる', reading: 'つづいとる', headword: '続く' }, + { surface: 'んですけど', reading: 'んですけど', headword: 'ん' }, + ], + { + getFrequencyDictionaryEnabled: () => true, + getFrequencyRank: (text) => + text === '最近' ? 120 : text === '辛い' ? 800 : text === '続く' ? 240 : 77, + getJlptLevel: (text) => + text === '最近' ? 'N4' : text === '辛い' ? 'N2' : text === '続く' ? 'N4' : null, + isKnownWord: (text) => text === '最近', + getMinSentenceWordsForNPlusOne: () => 1, + tokenizeWithMecab: async () => [ + { + headword: '最近', + surface: '最近', + reading: 'サイキン', + startPos: 0, + endPos: 2, + partOfSpeech: PartOfSpeech.noun, + pos1: '名詞', + pos2: '副詞可能', + isMerged: false, + isKnown: false, + isNPlusOneTarget: false, + }, + { + headword: '辛い', + surface: '辛い', + reading: 'ツライ', + startPos: 2, + endPos: 4, + partOfSpeech: PartOfSpeech.i_adjective, + pos1: '形容詞', + pos2: '自立', + isMerged: false, + isKnown: false, + isNPlusOneTarget: false, + }, + { + headword: 'もの', + surface: 'もの', + reading: 'モノ', + startPos: 4, + endPos: 6, + partOfSpeech: PartOfSpeech.noun, + pos1: '名詞', + pos2: '一般', + isMerged: false, + isKnown: false, + isNPlusOneTarget: false, + }, + { + headword: 'が', + surface: 'が', + reading: 'ガ', + startPos: 6, + endPos: 7, + partOfSpeech: PartOfSpeech.particle, + pos1: '助詞', + pos2: '格助詞', + isMerged: false, + isKnown: false, + isNPlusOneTarget: false, + }, + { + headword: '続く', + surface: '続いとる', + reading: 'ツヅイトル', + startPos: 7, + endPos: 11, + partOfSpeech: PartOfSpeech.verb, + pos1: '動詞', + pos2: '自立', + isMerged: false, + isKnown: false, + isNPlusOneTarget: false, + }, + { + headword: 'ん', + surface: 'んですけど', + reading: 'ンデスケド', + startPos: 11, + endPos: 16, + partOfSpeech: PartOfSpeech.other, + pos1: '名詞|助動詞|助詞', + pos2: '非自立', + isMerged: false, + isKnown: false, + isNPlusOneTarget: false, + }, + ], + }, + ), + ); + + assert.deepEqual( + result.tokens?.map((token) => ({ + surface: token.surface, + headword: token.headword, + isKnown: token.isKnown, + isNPlusOneTarget: token.isNPlusOneTarget, + frequencyRank: token.frequencyRank, + jlptLevel: token.jlptLevel, + })), + [ + { + surface: '最近', + headword: '最近', + isKnown: true, + isNPlusOneTarget: false, + frequencyRank: 120, + jlptLevel: 'N4', + }, + { + surface: '辛い', + headword: '辛い', + isKnown: false, + isNPlusOneTarget: false, + frequencyRank: 800, + jlptLevel: 'N2', + }, + { + surface: 'もの', + headword: 'もの', + isKnown: false, + isNPlusOneTarget: false, + frequencyRank: 77, + jlptLevel: undefined, + }, + { + surface: 'が', + headword: 'が', + isKnown: false, + isNPlusOneTarget: false, + frequencyRank: undefined, + jlptLevel: undefined, + }, + { + surface: '続いとる', + headword: '続く', + isKnown: false, + isNPlusOneTarget: false, + frequencyRank: 240, + jlptLevel: 'N4', + }, + { + surface: 'んですけど', + headword: 'ん', + isKnown: false, + isNPlusOneTarget: false, + frequencyRank: undefined, + jlptLevel: undefined, + }, + ], + ); +}); + test('tokenizeSubtitle excludes default non-independent pos2 from N+1 when JLPT/frequency are disabled', async () => { let mecabCalls = 0; const result = await tokenizeSubtitle( diff --git a/src/core/services/tokenizer/annotation-stage.test.ts b/src/core/services/tokenizer/annotation-stage.test.ts index 7093a72..7683ea6 100644 --- a/src/core/services/tokenizer/annotation-stage.test.ts +++ b/src/core/services/tokenizer/annotation-stage.test.ts @@ -246,6 +246,18 @@ test('shouldExcludeTokenFromSubtitleAnnotations excludes explanatory pondering e assert.equal(shouldExcludeTokenFromSubtitleAnnotations(token), true); }); +test('shouldExcludeTokenFromSubtitleAnnotations excludes explanatory contrast endings', () => { + const token = makeToken({ + surface: 'んですけど', + headword: 'ん', + reading: 'ンデスケド', + pos1: '名詞|助動詞|助詞', + pos2: '非自立', + }); + + assert.equal(shouldExcludeTokenFromSubtitleAnnotations(token), true); +}); + test('shouldExcludeTokenFromSubtitleAnnotations excludes auxiliary-stem そうだ grammar tails', () => { const token = makeToken({ surface: 'そうだ', diff --git a/src/core/services/tokenizer/annotation-stage.ts b/src/core/services/tokenizer/annotation-stage.ts index 2931b03..99fc5c1 100644 --- a/src/core/services/tokenizer/annotation-stage.ts +++ b/src/core/services/tokenizer/annotation-stage.ts @@ -46,6 +46,7 @@ const SUBTITLE_ANNOTATION_EXCLUDED_EXPLANATORY_ENDING_TRAILING_PARTICLES = [ 'ね', 'よ', 'な', + 'けど', 'よね', 'かな', 'かね', diff --git a/src/core/services/tokenizer/subtitle-annotation-filter.ts b/src/core/services/tokenizer/subtitle-annotation-filter.ts index b0464fe..f64edde 100644 --- a/src/core/services/tokenizer/subtitle-annotation-filter.ts +++ b/src/core/services/tokenizer/subtitle-annotation-filter.ts @@ -41,6 +41,7 @@ const SUBTITLE_ANNOTATION_EXCLUDED_EXPLANATORY_ENDING_TRAILING_PARTICLES = [ 'ね', 'よ', 'な', + 'けど', 'よね', 'かな', 'かね',