fix(tokenizer): tighten frequency highlighting exclusions

This commit is contained in:
2026-03-04 11:19:24 -08:00
parent 092c56f98f
commit 9a30419a23
4 changed files with 79 additions and 6 deletions

View File

@@ -314,6 +314,26 @@ test('annotateTokens excludes likely kana SFX tokens from frequency when POS tag
assert.equal(result[0]?.frequencyRank, undefined);
});
test('annotateTokens keeps frequency when mecab tags classify token as content-bearing', () => {
  // A token tagged 動詞/自立 is content-bearing, so annotation must leave its
  // frequency rank intact rather than stripping it as an excluded fragment.
  const input = [
    makeToken({
      surface: 'ふふ',
      headword: 'ふふ',
      pos1: '動詞',
      pos2: '自立',
      frequencyRank: 3014,
      startPos: 0,
      endPos: 2,
    }),
  ];
  const annotated = annotateTokens(input, makeDeps(), { minSentenceWordsForNPlusOne: 1 });
  assert.equal(annotated[0]?.frequencyRank, 3014);
});
test('annotateTokens allows previously default-excluded pos2 when removed from effective set', () => {
const tokens = [
makeToken({
@@ -337,7 +357,7 @@ test('annotateTokens allows previously default-excluded pos2 when removed from e
assert.equal(result[0]?.isNPlusOneTarget, true);
});
test('annotateTokens keeps composite tokens when any component pos tag is content-bearing', () => {
test('annotateTokens excludes composite function/content tokens from frequency but keeps N+1 eligible', () => {
const tokens = [
makeToken({
surface: 'になれば',
@@ -354,7 +374,7 @@ test('annotateTokens keeps composite tokens when any component pos tag is conten
minSentenceWordsForNPlusOne: 1,
});
assert.equal(result[0]?.frequencyRank, 5);
assert.equal(result[0]?.frequencyRank, undefined);
assert.equal(result[0]?.isNPlusOneTarget, true);
});

View File

@@ -73,8 +73,9 @@ function isExcludedByTagSet(normalizedTag: string, exclusions: ReadonlySet<strin
if (parts.length === 0) {
return false;
}
// Composite tags like "助詞|名詞" stay eligible unless every component is excluded.
return parts.every((part) => exclusions.has(part));
// Frequency highlighting should be conservative: if any merged component is excluded,
// skip highlighting the whole token to avoid noisy merged fragments.
return parts.some((part) => exclusions.has(part));
}
function resolvePos1Exclusions(options: AnnotationStageOptions): ReadonlySet<string> {

View File

@@ -39,6 +39,30 @@ test('enrichTokensWithMecabPos1 fills missing pos1 using surface-sequence fallba
assert.equal(enriched[0]?.pos1, '助詞');
});
test('enrichTokensWithMecabPos1 keeps partOfSpeech unchanged and only enriches POS tags', () => {
  // One merged token spanning これ+は; the mecab components each carry their own pos1.
  const merged = [makeToken({ surface: 'これは', startPos: 0, endPos: 3 })];
  const mecabComponents = [
    makeToken({
      surface: 'これ',
      startPos: 0,
      endPos: 2,
      pos1: '名詞',
      partOfSpeech: PartOfSpeech.noun,
    }),
    makeToken({
      surface: 'は',
      startPos: 2,
      endPos: 3,
      pos1: '助詞',
      partOfSpeech: PartOfSpeech.particle,
    }),
  ];
  const out = enrichTokensWithMecabPos1(merged, mecabComponents);
  // pos1 is joined from both covering components, while the token's own
  // partOfSpeech is asserted to remain `other` (not overwritten from mecab).
  assert.equal(out[0]?.pos1, '名詞|助詞');
  assert.equal(out[0]?.partOfSpeech, PartOfSpeech.other);
});
test('enrichTokensWithMecabPos1 passes through unchanged when mecab tokens are null or empty', () => {
const tokens = [makeToken({ surface: '猫', startPos: 0, endPos: 1 })];