fix(tokenizer): preserve annotation and enrichment behavior

2026-04-28 04:19:27 -07:00 · 2026-04-26 17:57:39 -07:00
parent 96894ff85c
commit b10a7b3e98
6 changed files with 55 additions and 16 deletions
--- a/changes/305-tokenizer-word-class-pos-filtering.md
+++ b/changes/305-tokenizer-word-class-pos-filtering.md
@@ -0,0 +1,6 @@
 type: fixed
 area: tokenizer
 - Use Yomitan `wordClasses` metadata for subtitle POS filtering.
 - Backfill blank MeCab POS detail fields during parser enrichment.
 - Strip subtitle annotation metadata from token results even when no annotation option is enabled.
--- a/src/core/services/tokenizer.ts
+++ b/src/core/services/tokenizer.ts
@@ -160,7 +160,7 @@ async function applyAnnotationStage(
  options: TokenizerAnnotationOptions,
 ): Promise<MergedToken[]> {
  if (!hasAnyAnnotationEnabled(options)) {
-    return tokens;
+    return stripSubtitleAnnotationMetadata(tokens);
  }
  if (!annotationStageModulePromise) {
--- a/src/core/services/tokenizer/annotation-stage.test.ts
+++ b/src/core/services/tokenizer/annotation-stage.test.ts
@@ -789,10 +789,7 @@ test('annotateTokens excludes default non-independent pos2 from frequency and N+
  const result = annotateTokens(
    tokens,
-    makeDeps({
+    makeDeps(),
      isKnownWord: (text) => text === 'た' || text === '負',
      getJlptLevel: (text) => (text === 'た' || text === '負' ? 'N3' : null),
    }),
    {
      minSentenceWordsForNPlusOne: 1,
    },
--- a/src/core/services/tokenizer/parser-enrichment-stage.test.ts
+++ b/src/core/services/tokenizer/parser-enrichment-stage.test.ts
@@ -39,6 +39,33 @@ test('enrichTokensWithMecabPos1 fills missing pos1 using surface-sequence fallba
  assert.equal(enriched[0]?.pos1, '助詞');
 });
 test('enrichTokensWithMecabPos1 backfills blank pos2 and pos3 fields', () => {
  // Parser-side token: pos1 is set, but the detail tags are blank
  // (empty string and whitespace-only respectively).
  const blankDetailToken = makeToken({
    surface: 'は',
    startPos: 0,
    endPos: 1,
    pos1: '助詞',
    pos2: '',
    pos3: ' ',
  });
  // MeCab-side token over the same surface/span, carrying real detail tags.
  const mecabToken = makeToken({
    surface: 'は',
    startPos: 0,
    endPos: 1,
    pos1: '助詞',
    pos2: '係助詞',
    pos3: '一般',
  });
  const [backfilled] = enrichTokensWithMecabPos1([blankDetailToken], [mecabToken]);
  // Both blank fields must now hold the MeCab values.
  assert.equal(backfilled?.pos2, '係助詞');
  assert.equal(backfilled?.pos3, '一般');
 });
 test('enrichTokensWithMecabPos1 keeps partOfSpeech unchanged and only enriches POS tags', () => {
  const tokens = [makeToken({ surface: 'これは', startPos: 0, endPos: 3 })];
  const mecabTokens = [
--- a/src/core/services/tokenizer/parser-enrichment-stage.ts
+++ b/src/core/services/tokenizer/parser-enrichment-stage.ts
@@ -120,6 +120,13 @@ function lowerBoundByIndex(candidates: IndexedMecabToken[], targetIndex: number)
  return low;
 }
 /**
  * Chooses between an existing POS field value and a fallback.
  *
  * @param current - The value already present on the token, if any.
  * @param fallback - The value to use when `current` is missing or blank.
  * @returns `current` when it is a string containing at least one
  *   non-whitespace character; otherwise `fallback` unchanged (which may
  *   itself be blank or `undefined`).
  */
 function coalesceMissingPosField(
  current: string | undefined,
  fallback: string | undefined,
 ): string | undefined {
  if (typeof current !== 'string') {
    return fallback;
  }
  // Empty and whitespace-only strings count as "missing".
  const hasContent = current.trim() !== '';
  return hasContent ? current : fallback;
 }
 function joinUniqueTags(values: Array<string | undefined>): string | undefined {
  const unique: string[] = [];
  for (const value of values) {
@@ -329,9 +336,9 @@ function fillMissingPos1BySurfaceSequence(
    cursor = best.index + 1;
    return {
      ...token,
-      pos1: token.pos1 ?? best.pos1,
+      pos1: coalesceMissingPosField(token.pos1, best.pos1),
-      pos2: token.pos2 ?? best.pos2,
+      pos2: coalesceMissingPosField(token.pos2, best.pos2),
-      pos3: token.pos3 ?? best.pos3,
+      pos3: coalesceMissingPosField(token.pos3, best.pos3),
    };
  });
 }
@@ -412,9 +419,9 @@ export function enrichTokensWithMecabPos1(
    return {
      ...token,
-      pos1: token.pos1 ?? metadata.pos1,
+      pos1: coalesceMissingPosField(token.pos1, metadata.pos1),
-      pos2: token.pos2 ?? metadata.pos2,
+      pos2: coalesceMissingPosField(token.pos2, metadata.pos2),
-      pos3: token.pos3 ?? metadata.pos3,
+      pos3: coalesceMissingPosField(token.pos3, metadata.pos3),
    };
  });
--- a/src/core/services/tokenizer/subtitle-annotation-filter.ts
+++ b/src/core/services/tokenizer/subtitle-annotation-filter.ts
@@ -13,6 +13,11 @@ const KATAKANA_TO_HIRAGANA_OFFSET = 0x60;
 // Code points U+30A1 and U+30F6, i.e. two positions in the Unicode Katakana block.
 // NOTE(review): presumably the bounds of the range shifted by
 // KATAKANA_TO_HIRAGANA_OFFSET during kana normalization; the usage is outside
 // this hunk, so confirm inclusivity against the consuming code.
 const KATAKANA_CODEPOINT_START = 0x30a1;
 const KATAKANA_CODEPOINT_END = 0x30f6;
 // Phrases listed once as a readonly tuple so the same literals can be spread
 // into SUBTITLE_ANNOTATION_EXCLUDED_TERMS and also checked by membership.
 const STANDALONE_GRAMMAR_PARTICLE_PHRASES = ['たって', 'だって'] as const;
 // Set form of the tuple above, used for `.has(...)` lookups on a normalized surface.
 const STANDALONE_GRAMMAR_PARTICLE_PHRASES_SET: ReadonlySet<string> = new Set<string>(
  STANDALONE_GRAMMAR_PARTICLE_PHRASES,
 );
 const SUBTITLE_ANNOTATION_EXCLUDED_TERMS = new Set([
  'あ',
  'ああ',
@@ -20,9 +25,7 @@ const SUBTITLE_ANNOTATION_EXCLUDED_TERMS = new Set([
  'うう',
  'おお',
  'くれ',
  'たって',
  'って',
  'だって',
  'はあ',
  'はは',
  'べき',
@@ -31,6 +34,7 @@ const SUBTITLE_ANNOTATION_EXCLUDED_TERMS = new Set([
  'ほう',
  'もんか',
  'ものか',
  ...STANDALONE_GRAMMAR_PARTICLE_PHRASES,
 ]);
 const SUBTITLE_ANNOTATION_EXCLUDED_EXPLANATORY_ENDING_PREFIXES = ['ん', 'の', 'なん', 'なの'];
 const SUBTITLE_ANNOTATION_EXCLUDED_EXPLANATORY_ENDING_CORES = [
@@ -98,8 +102,6 @@ const STANDALONE_GRAMMAR_PARTICLE_SURFACES = new Set([
  'よ',
  'を',
 ]);
 const STANDALONE_GRAMMAR_PARTICLE_PHRASES = new Set(['たって', 'だって']);
 export interface SubtitleAnnotationFilterOptions {
  pos1Exclusions?: ReadonlySet<string>;
  pos2Exclusions?: ReadonlySet<string>;
@@ -327,7 +329,7 @@ function isStandaloneGrammarParticle(token: MergedToken): boolean {
  return (
    normalizedSurface === normalizedHeadword &&
    (STANDALONE_GRAMMAR_PARTICLE_SURFACES.has(normalizedSurface) ||
-      STANDALONE_GRAMMAR_PARTICLE_PHRASES.has(normalizedSurface))
+      STANDALONE_GRAMMAR_PARTICLE_PHRASES_SET.has(normalizedSurface))
  );
 }