From 9a30419a23d648b3f203a50b47129a5b647fa842 Mon Sep 17 00:00:00 2001
From: sudacode
Date: Wed, 4 Mar 2026 11:19:24 -0800
Subject: [PATCH] fix(tokenizer): tighten frequency highlighting exclusions

---
 src/core/services/tokenizer.test.ts           | 32 +++++++++++++++++--
 .../tokenizer/annotation-stage.test.ts        | 24 ++++++++++++--
 .../services/tokenizer/annotation-stage.ts    |  5 +--
 .../tokenizer/parser-enrichment-stage.test.ts | 24 ++++++++++++++
 4 files changed, 79 insertions(+), 6 deletions(-)

diff --git a/src/core/services/tokenizer.test.ts b/src/core/services/tokenizer.test.ts
index c18cdff..5a9e42a 100644
--- a/src/core/services/tokenizer.test.ts
+++ b/src/core/services/tokenizer.test.ts
@@ -972,6 +972,34 @@ test('tokenizeSubtitle skips frequency rank when Yomitan token is enriched as pa
   assert.equal(result.tokens?.[0]?.frequencyRank, undefined);
 });
 
+test('tokenizeSubtitle keeps frequency rank when mecab tags classify token as content-bearing', async () => {
+  const result = await tokenizeSubtitle(
+    'ふふ',
+    makeDepsFromYomitanTokens([{ surface: 'ふふ', reading: '', headword: 'ふふ' }], {
+      getFrequencyDictionaryEnabled: () => true,
+      getFrequencyRank: (text) => (text === 'ふふ' ? 3014 : null),
+      tokenizeWithMecab: async () => [
+        {
+          headword: 'ふふ',
+          surface: 'ふふ',
+          reading: 'フフ',
+          startPos: 0,
+          endPos: 2,
+          partOfSpeech: PartOfSpeech.verb,
+          pos1: '動詞',
+          pos2: '自立',
+          isMerged: false,
+          isKnown: false,
+          isNPlusOneTarget: false,
+        },
+      ],
+    }),
+  );
+
+  assert.equal(result.tokens?.length, 1);
+  assert.equal(result.tokens?.[0]?.frequencyRank, 3014);
+});
+
 test('tokenizeSubtitle ignores invalid frequency ranks', async () => {
   const result = await tokenizeSubtitle(
     '猫',
@@ -2400,7 +2428,7 @@ test('tokenizeSubtitle excludes default non-independent pos2 from N+1 and freque
   assert.equal(result.tokens?.[0]?.isNPlusOneTarget, false);
 });
 
-test('tokenizeSubtitle keeps merged token when overlap contains at least one content pos1 tag', async () => {
+test('tokenizeSubtitle excludes merged function/content token from frequency highlighting but keeps N+1', async () => {
   const result = await tokenizeSubtitle(
     'になれば',
     makeDepsFromYomitanTokens([{ surface: 'になれば', reading: 'になれば', headword: 'なる' }], {
@@ -2453,7 +2481,7 @@
 
   assert.equal(result.tokens?.length, 1);
   assert.equal(result.tokens?.[0]?.pos1, '助詞|動詞');
-  assert.equal(result.tokens?.[0]?.frequencyRank, 13);
+  assert.equal(result.tokens?.[0]?.frequencyRank, undefined);
   assert.equal(result.tokens?.[0]?.isNPlusOneTarget, true);
 });
 
diff --git a/src/core/services/tokenizer/annotation-stage.test.ts b/src/core/services/tokenizer/annotation-stage.test.ts
index a09400a..1123a5d 100644
--- a/src/core/services/tokenizer/annotation-stage.test.ts
+++ b/src/core/services/tokenizer/annotation-stage.test.ts
@@ -314,6 +314,26 @@ test('annotateTokens excludes likely kana SFX tokens from frequency when POS tag
   assert.equal(result[0]?.frequencyRank, undefined);
 });
 
+test('annotateTokens keeps frequency when mecab tags classify token as content-bearing', () => {
+  const tokens = [
+    makeToken({
+      surface: 'ふふ',
+      headword: 'ふふ',
+      pos1: '動詞',
+      pos2: '自立',
+      frequencyRank: 3014,
+      startPos: 0,
+      endPos: 2,
+    }),
+  ];
+
+  const result = annotateTokens(tokens, makeDeps(), {
+    minSentenceWordsForNPlusOne: 1,
+  });
+
+  assert.equal(result[0]?.frequencyRank, 3014);
+});
+
 test('annotateTokens allows previously default-excluded pos2 when removed from effective set', () => {
   const tokens = [
     makeToken({
@@ -337,7 +357,7 @@ test('annotateTokens allows previously default-excluded pos2 when removed from e
   assert.equal(result[0]?.isNPlusOneTarget, true);
 });
 
-test('annotateTokens keeps composite tokens when any component pos tag is content-bearing', () => {
+test('annotateTokens excludes composite function/content tokens from frequency but keeps N+1 eligible', () => {
   const tokens = [
     makeToken({
       surface: 'になれば',
@@ -354,7 +374,7 @@ test('annotateTokens keeps composite tokens when any component pos tag is conten
     minSentenceWordsForNPlusOne: 1,
   });
 
-  assert.equal(result[0]?.frequencyRank, 5);
+  assert.equal(result[0]?.frequencyRank, undefined);
   assert.equal(result[0]?.isNPlusOneTarget, true);
 });
 
diff --git a/src/core/services/tokenizer/annotation-stage.ts b/src/core/services/tokenizer/annotation-stage.ts
index ab68274..1015c42 100644
--- a/src/core/services/tokenizer/annotation-stage.ts
+++ b/src/core/services/tokenizer/annotation-stage.ts
@@ -73,8 +73,9 @@ function isExcludedByTagSet(normalizedTag: string, exclusions: ReadonlySet<strin
-  return parts.every((part) => exclusions.has(part));
+  // Frequency highlighting should be conservative: if any merged component is excluded,
+  // skip highlighting the whole token to avoid noisy merged fragments.
+  return parts.some((part) => exclusions.has(part));
 }
 
 function resolvePos1Exclusions(options: AnnotationStageOptions): ReadonlySet<string> {
diff --git a/src/core/services/tokenizer/parser-enrichment-stage.test.ts b/src/core/services/tokenizer/parser-enrichment-stage.test.ts
index be45e06..5fc4723 100644
--- a/src/core/services/tokenizer/parser-enrichment-stage.test.ts
+++ b/src/core/services/tokenizer/parser-enrichment-stage.test.ts
@@ -39,6 +39,30 @@ test('enrichTokensWithMecabPos1 fills missing pos1 using surface-sequence fallba
   assert.equal(enriched[0]?.pos1, '助詞');
 });
 
+test('enrichTokensWithMecabPos1 keeps partOfSpeech unchanged and only enriches POS tags', () => {
+  const tokens = [makeToken({ surface: 'これは', startPos: 0, endPos: 3 })];
+  const mecabTokens = [
+    makeToken({
+      surface: 'これ',
+      startPos: 0,
+      endPos: 2,
+      pos1: '名詞',
+      partOfSpeech: PartOfSpeech.noun,
+    }),
+    makeToken({
+      surface: 'は',
+      startPos: 2,
+      endPos: 3,
+      pos1: '助詞',
+      partOfSpeech: PartOfSpeech.particle,
+    }),
+  ];
+
+  const enriched = enrichTokensWithMecabPos1(tokens, mecabTokens);
+  assert.equal(enriched[0]?.pos1, '名詞|助詞');
+  assert.equal(enriched[0]?.partOfSpeech, PartOfSpeech.other);
+});
+
 test('enrichTokensWithMecabPos1 passes through unchanged when mecab tokens are null or empty', () => {
   const tokens = [makeToken({ surface: '猫', startPos: 0, endPos: 1 })];
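Note for reviewers: the new rule in isExcludedByTagSet can be read in isolation as the sketch below. The helper name and the '|' split are illustrative assumptions, not the project's actual implementation; only the final parts.some(...) check is taken from the hunk above, and the '|' delimiter is inferred from merged pos1 values such as '助詞|動詞' in the tests.

// Minimal sketch (not part of the patch) of the conservative exclusion rule
// this change switches to. Hypothetical standalone helper.
function isExcludedByMergedTagSketch(
  normalizedTag: string,
  exclusions: ReadonlySet<string>,
): boolean {
  // Assumption: merged tokens carry a '|'-delimited pos1, e.g. '助詞|動詞'.
  const parts = normalizedTag.split('|');
  // Old behaviour: excluded only when every component was excluded (parts.every).
  // New behaviour: one excluded component is enough to skip frequency highlighting.
  return parts.some((part) => exclusions.has(part));
}

// e.g. isExcludedByMergedTagSketch('助詞|動詞', new Set(['助詞'])) === true
//      isExcludedByMergedTagSketch('動詞', new Set(['助詞'])) === false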