fix(tokenizer): tighten frequency highlighting exclusions

This commit is contained in:
2026-03-04 11:19:24 -08:00
parent 092c56f98f
commit 9a30419a23
4 changed files with 79 additions and 6 deletions

View File

@@ -314,6 +314,26 @@ test('annotateTokens excludes likely kana SFX tokens from frequency when POS tag
assert.equal(result[0]?.frequencyRank, undefined);
});
test('annotateTokens keeps frequency when mecab tags classify token as content-bearing', () => {
  // A token tagged 動詞/自立 is content-bearing, so annotation must leave its
  // frequency rank intact rather than stripping it as an excluded fragment.
  const input = [
    makeToken({
      surface: 'ふふ',
      headword: 'ふふ',
      pos1: '動詞',
      pos2: '自立',
      frequencyRank: 3014,
      startPos: 0,
      endPos: 2,
    }),
  ];
  const annotated = annotateTokens(input, makeDeps(), { minSentenceWordsForNPlusOne: 1 });
  assert.equal(annotated[0]?.frequencyRank, 3014);
});
test('annotateTokens allows previously default-excluded pos2 when removed from effective set', () => {
const tokens = [
makeToken({
@@ -337,7 +357,7 @@ test('annotateTokens allows previously default-excluded pos2 when removed from e
assert.equal(result[0]?.isNPlusOneTarget, true);
});
test('annotateTokens keeps composite tokens when any component pos tag is content-bearing', () => {
test('annotateTokens excludes composite function/content tokens from frequency but keeps N+1 eligible', () => {
const tokens = [
makeToken({
surface: 'になれば',
@@ -354,7 +374,7 @@ test('annotateTokens keeps composite tokens when any component pos tag is conten
minSentenceWordsForNPlusOne: 1,
});
assert.equal(result[0]?.frequencyRank, 5);
assert.equal(result[0]?.frequencyRank, undefined);
assert.equal(result[0]?.isNPlusOneTarget, true);
});

View File

@@ -73,8 +73,9 @@ function isExcludedByTagSet(normalizedTag: string, exclusions: ReadonlySet<strin
if (parts.length === 0) {
return false;
}
// Composite tags like "助詞|名詞" stay eligible unless every component is excluded.
return parts.every((part) => exclusions.has(part));
// Frequency highlighting should be conservative: if any merged component is excluded,
// skip highlighting the whole token to avoid noisy merged fragments.
return parts.some((part) => exclusions.has(part));
}
function resolvePos1Exclusions(options: AnnotationStageOptions): ReadonlySet<string> {

View File

@@ -39,6 +39,30 @@ test('enrichTokensWithMecabPos1 fills missing pos1 using surface-sequence fallba
assert.equal(enriched[0]?.pos1, '助詞');
});
test('enrichTokensWithMecabPos1 keeps partOfSpeech unchanged and only enriches POS tags', () => {
  // One merged token spanning これ+は; the mecab components each carry their own pos1.
  const merged = [makeToken({ surface: 'これは', startPos: 0, endPos: 3 })];
  const mecabComponents = [
    makeToken({
      surface: 'これ',
      startPos: 0,
      endPos: 2,
      pos1: '名詞',
      partOfSpeech: PartOfSpeech.noun,
    }),
    makeToken({
      surface: 'は',
      startPos: 2,
      endPos: 3,
      pos1: '助詞',
      partOfSpeech: PartOfSpeech.particle,
    }),
  ];
  const out = enrichTokensWithMecabPos1(merged, mecabComponents);
  // pos1 is joined from both covering components, while the token's own
  // partOfSpeech is asserted to remain `other` (not overwritten from mecab).
  assert.equal(out[0]?.pos1, '名詞|助詞');
  assert.equal(out[0]?.partOfSpeech, PartOfSpeech.other);
});
test('enrichTokensWithMecabPos1 passes through unchanged when mecab tokens are null or empty', () => {
const tokens = [makeToken({ surface: '猫', startPos: 0, endPos: 1 })];