fix(tokenizer): preserve annotation and enrichment behavior

This commit is contained in:
2026-04-26 17:57:39 -07:00
parent 96894ff85c
commit b10a7b3e98
6 changed files with 55 additions and 16 deletions

View File

@@ -0,0 +1,6 @@
type: fixed
area: tokenizer
- Use Yomitan `wordClasses` metadata for subtitle POS filtering.
- Backfill blank MeCab POS detail fields during parser enrichment.
- Keep subtitle annotation metadata stripped from token results.

View File

@@ -160,7 +160,7 @@ async function applyAnnotationStage(
options: TokenizerAnnotationOptions,
): Promise<MergedToken[]> {
if (!hasAnyAnnotationEnabled(options)) {
return tokens;
return stripSubtitleAnnotationMetadata(tokens);
}
if (!annotationStageModulePromise) {

View File

@@ -789,10 +789,7 @@ test('annotateTokens excludes default non-independent pos2 from frequency and N+
const result = annotateTokens(
tokens,
makeDeps({
isKnownWord: (text) => text === 'た' || text === '負',
getJlptLevel: (text) => (text === 'た' || text === '負' ? 'N3' : null),
}),
makeDeps(),
{
minSentenceWordsForNPlusOne: 1,
},

View File

@@ -39,6 +39,33 @@ test('enrichTokensWithMecabPos1 fills missing pos1 using surface-sequence fallba
assert.equal(enriched[0]?.pos1, '助詞');
});
test('enrichTokensWithMecabPos1 backfills blank pos2 and pos3 fields', () => {
  // Token whose pos2 is an empty string and pos3 is whitespace-only;
  // both should be treated as missing and eligible for backfill.
  const sparseTokens = [
    makeToken({
      surface: 'は',
      startPos: 0,
      endPos: 1,
      pos1: '助詞',
      pos2: '',
      pos3: ' ',
    }),
  ];
  // MeCab token for the same span carrying the full POS detail fields.
  const mecabDetail = [
    makeToken({
      surface: 'は',
      startPos: 0,
      endPos: 1,
      pos1: '助詞',
      pos2: '係助詞',
      pos3: '一般',
    }),
  ];
  const [first] = enrichTokensWithMecabPos1(sparseTokens, mecabDetail);
  // Blank fields are replaced by the MeCab values.
  assert.equal(first?.pos2, '係助詞');
  assert.equal(first?.pos3, '一般');
});
test('enrichTokensWithMecabPos1 keeps partOfSpeech unchanged and only enriches POS tags', () => {
const tokens = [makeToken({ surface: 'これは', startPos: 0, endPos: 3 })];
const mecabTokens = [

View File

@@ -120,6 +120,13 @@ function lowerBoundByIndex(candidates: IndexedMecabToken[], targetIndex: number)
return low;
}
/**
 * Picks `current` when it is a non-blank string; otherwise returns `fallback`.
 * Whitespace-only strings count as missing, so blank POS detail fields can be
 * backfilled from a fallback value.
 */
function coalesceMissingPosField(
  current: string | undefined,
  fallback: string | undefined,
): string | undefined {
  if (typeof current !== 'string') {
    return fallback;
  }
  const hasContent = current.trim().length > 0;
  return hasContent ? current : fallback;
}
function joinUniqueTags(values: Array<string | undefined>): string | undefined {
const unique: string[] = [];
for (const value of values) {
@@ -329,9 +336,9 @@ function fillMissingPos1BySurfaceSequence(
cursor = best.index + 1;
return {
...token,
pos1: token.pos1 ?? best.pos1,
pos2: token.pos2 ?? best.pos2,
pos3: token.pos3 ?? best.pos3,
pos1: coalesceMissingPosField(token.pos1, best.pos1),
pos2: coalesceMissingPosField(token.pos2, best.pos2),
pos3: coalesceMissingPosField(token.pos3, best.pos3),
};
});
}
@@ -412,9 +419,9 @@ export function enrichTokensWithMecabPos1(
return {
...token,
pos1: token.pos1 ?? metadata.pos1,
pos2: token.pos2 ?? metadata.pos2,
pos3: token.pos3 ?? metadata.pos3,
pos1: coalesceMissingPosField(token.pos1, metadata.pos1),
pos2: coalesceMissingPosField(token.pos2, metadata.pos2),
pos3: coalesceMissingPosField(token.pos3, metadata.pos3),
};
});

View File

@@ -13,6 +13,11 @@ const KATAKANA_TO_HIRAGANA_OFFSET = 0x60;
const KATAKANA_CODEPOINT_START = 0x30a1;
const KATAKANA_CODEPOINT_END = 0x30f6;
// Multi-character surfaces treated as standalone grammar particles.
// Kept as a readonly tuple so the literals can also be spread into other
// exclusion sets while preserving their literal types.
const STANDALONE_GRAMMAR_PARTICLE_PHRASES = ['たって', 'だって'] as const;
// Set view of the same phrases for O(1) membership checks.
const STANDALONE_GRAMMAR_PARTICLE_PHRASES_SET: ReadonlySet<string> = new Set(
STANDALONE_GRAMMAR_PARTICLE_PHRASES,
);
const SUBTITLE_ANNOTATION_EXCLUDED_TERMS = new Set([
'あ',
'ああ',
@@ -20,9 +25,7 @@ const SUBTITLE_ANNOTATION_EXCLUDED_TERMS = new Set([
'うう',
'おお',
'くれ',
'たって',
'って',
'だって',
'はあ',
'はは',
'べき',
@@ -31,6 +34,7 @@ const SUBTITLE_ANNOTATION_EXCLUDED_TERMS = new Set([
'ほう',
'もんか',
'ものか',
...STANDALONE_GRAMMAR_PARTICLE_PHRASES,
]);
const SUBTITLE_ANNOTATION_EXCLUDED_EXPLANATORY_ENDING_PREFIXES = ['ん', 'の', 'なん', 'なの'];
const SUBTITLE_ANNOTATION_EXCLUDED_EXPLANATORY_ENDING_CORES = [
@@ -98,8 +102,6 @@ const STANDALONE_GRAMMAR_PARTICLE_SURFACES = new Set([
'よ',
'を',
]);
const STANDALONE_GRAMMAR_PARTICLE_PHRASES = new Set(['たって', 'だって']);
export interface SubtitleAnnotationFilterOptions {
pos1Exclusions?: ReadonlySet<string>;
pos2Exclusions?: ReadonlySet<string>;
@@ -327,7 +329,7 @@ function isStandaloneGrammarParticle(token: MergedToken): boolean {
return (
normalizedSurface === normalizedHeadword &&
(STANDALONE_GRAMMAR_PARTICLE_SURFACES.has(normalizedSurface) ||
STANDALONE_GRAMMAR_PARTICLE_PHRASES.has(normalizedSurface))
STANDALONE_GRAMMAR_PARTICLE_PHRASES_SET.has(normalizedSurface))
);
}