diff --git a/changes/305-tokenizer-word-class-pos-filtering.md b/changes/305-tokenizer-word-class-pos-filtering.md
new file mode 100644
index 00000000..cb3a3ab9
--- /dev/null
+++ b/changes/305-tokenizer-word-class-pos-filtering.md
@@ -0,0 +1,6 @@
+type: fixed
+area: tokenizer
+
+- Use Yomitan `wordClasses` metadata for subtitle POS filtering.
+- Backfill blank MeCab POS detail fields during parser enrichment.
+- Keep subtitle annotation metadata stripped from token results.
diff --git a/src/core/services/tokenizer.ts b/src/core/services/tokenizer.ts
index 0e227697..dda46d8c 100644
--- a/src/core/services/tokenizer.ts
+++ b/src/core/services/tokenizer.ts
@@ -160,7 +160,7 @@ async function applyAnnotationStage(
   options: TokenizerAnnotationOptions,
 ): Promise {
   if (!hasAnyAnnotationEnabled(options)) {
-    return tokens;
+    return stripSubtitleAnnotationMetadata(tokens);
   }
 
   if (!annotationStageModulePromise) {
diff --git a/src/core/services/tokenizer/annotation-stage.test.ts b/src/core/services/tokenizer/annotation-stage.test.ts
index 4324acb0..70ec7ba0 100644
--- a/src/core/services/tokenizer/annotation-stage.test.ts
+++ b/src/core/services/tokenizer/annotation-stage.test.ts
@@ -789,10 +789,7 @@ test('annotateTokens excludes default non-independent pos2 from frequency and N+
 
   const result = annotateTokens(
     tokens,
-    makeDeps({
-      isKnownWord: (text) => text === 'た' || text === '負',
-      getJlptLevel: (text) => (text === 'た' || text === '負' ? 'N3' : null),
-    }),
+    makeDeps(),
     {
       minSentenceWordsForNPlusOne: 1,
     },
diff --git a/src/core/services/tokenizer/parser-enrichment-stage.test.ts b/src/core/services/tokenizer/parser-enrichment-stage.test.ts
index 5fc47233..b4df9cd7 100644
--- a/src/core/services/tokenizer/parser-enrichment-stage.test.ts
+++ b/src/core/services/tokenizer/parser-enrichment-stage.test.ts
@@ -39,6 +39,33 @@ test('enrichTokensWithMecabPos1 fills missing pos1 using surface-sequence fallba
   assert.equal(enriched[0]?.pos1, '助詞');
 });
 
+test('enrichTokensWithMecabPos1 backfills blank pos2 and pos3 fields', () => {
+  const tokens = [
+    makeToken({
+      surface: 'は',
+      startPos: 0,
+      endPos: 1,
+      pos1: '助詞',
+      pos2: '',
+      pos3: ' ',
+    }),
+  ];
+  const mecabTokens = [
+    makeToken({
+      surface: 'は',
+      startPos: 0,
+      endPos: 1,
+      pos1: '助詞',
+      pos2: '係助詞',
+      pos3: '一般',
+    }),
+  ];
+
+  const enriched = enrichTokensWithMecabPos1(tokens, mecabTokens);
+  assert.equal(enriched[0]?.pos2, '係助詞');
+  assert.equal(enriched[0]?.pos3, '一般');
+});
+
 test('enrichTokensWithMecabPos1 keeps partOfSpeech unchanged and only enriches POS tags', () => {
   const tokens = [makeToken({ surface: 'これは', startPos: 0, endPos: 3 })];
   const mecabTokens = [
diff --git a/src/core/services/tokenizer/parser-enrichment-stage.ts b/src/core/services/tokenizer/parser-enrichment-stage.ts
index 3a9e6590..86f6af4d 100644
--- a/src/core/services/tokenizer/parser-enrichment-stage.ts
+++ b/src/core/services/tokenizer/parser-enrichment-stage.ts
@@ -120,6 +120,13 @@ function lowerBoundByIndex(candidates: IndexedMecabToken[], targetIndex: number)
   return low;
 }
 
+function coalesceMissingPosField(
+  current: string | undefined,
+  fallback: string | undefined,
+): string | undefined {
+  return typeof current === 'string' && current.trim().length > 0 ? current : fallback;
+}
+
 function joinUniqueTags(values: Array<string | undefined>): string | undefined {
   const unique: string[] = [];
   for (const value of values) {
@@ -329,9 +336,9 @@ function fillMissingPos1BySurfaceSequence(
       cursor = best.index + 1;
       return {
         ...token,
-        pos1: token.pos1 ?? best.pos1,
-        pos2: token.pos2 ?? best.pos2,
-        pos3: token.pos3 ?? best.pos3,
+        pos1: coalesceMissingPosField(token.pos1, best.pos1),
+        pos2: coalesceMissingPosField(token.pos2, best.pos2),
+        pos3: coalesceMissingPosField(token.pos3, best.pos3),
       };
     });
 }
@@ -412,9 +419,9 @@ export function enrichTokensWithMecabPos1(
 
     return {
       ...token,
-      pos1: token.pos1 ?? metadata.pos1,
-      pos2: token.pos2 ?? metadata.pos2,
-      pos3: token.pos3 ?? metadata.pos3,
+      pos1: coalesceMissingPosField(token.pos1, metadata.pos1),
+      pos2: coalesceMissingPosField(token.pos2, metadata.pos2),
+      pos3: coalesceMissingPosField(token.pos3, metadata.pos3),
     };
   });
 
diff --git a/src/core/services/tokenizer/subtitle-annotation-filter.ts b/src/core/services/tokenizer/subtitle-annotation-filter.ts
index f4434d7b..2070f024 100644
--- a/src/core/services/tokenizer/subtitle-annotation-filter.ts
+++ b/src/core/services/tokenizer/subtitle-annotation-filter.ts
@@ -13,6 +13,11 @@
 const KATAKANA_TO_HIRAGANA_OFFSET = 0x60;
 const KATAKANA_CODEPOINT_START = 0x30a1;
 const KATAKANA_CODEPOINT_END = 0x30f6;
+const STANDALONE_GRAMMAR_PARTICLE_PHRASES = ['たって', 'だって'] as const;
+const STANDALONE_GRAMMAR_PARTICLE_PHRASES_SET: ReadonlySet<string> = new Set(
+  STANDALONE_GRAMMAR_PARTICLE_PHRASES,
+);
+
 const SUBTITLE_ANNOTATION_EXCLUDED_TERMS = new Set([
   'あ',
   'ああ',
@@ -20,9 +25,7 @@ const SUBTITLE_ANNOTATION_EXCLUDED_TERMS = new Set([
   'うう',
   'おお',
   'くれ',
-  'たって',
   'って',
-  'だって',
   'はあ',
   'はは',
   'べき',
@@ -31,6 +34,7 @@ const SUBTITLE_ANNOTATION_EXCLUDED_TERMS = new Set([
   'ほう',
   'もんか',
   'ものか',
+  ...STANDALONE_GRAMMAR_PARTICLE_PHRASES,
 ]);
 const SUBTITLE_ANNOTATION_EXCLUDED_EXPLANATORY_ENDING_PREFIXES = ['ん', 'の', 'なん', 'なの'];
 const SUBTITLE_ANNOTATION_EXCLUDED_EXPLANATORY_ENDING_CORES = [
@@ -98,8 +102,6 @@ const STANDALONE_GRAMMAR_PARTICLE_SURFACES = new Set([
   'よ',
   'を',
 ]);
-const STANDALONE_GRAMMAR_PARTICLE_PHRASES = new Set(['たって', 'だって']);
-
 export interface SubtitleAnnotationFilterOptions {
   pos1Exclusions?: ReadonlySet<string>;
   pos2Exclusions?: ReadonlySet<string>;
@@ -327,7 +329,7 @@ function isStandaloneGrammarParticle(token: MergedToken): boolean {
   return (
     normalizedSurface === normalizedHeadword &&
     (STANDALONE_GRAMMAR_PARTICLE_SURFACES.has(normalizedSurface) ||
-      STANDALONE_GRAMMAR_PARTICLE_PHRASES.has(normalizedSurface))
+      STANDALONE_GRAMMAR_PARTICLE_PHRASES_SET.has(normalizedSurface))
   );
 }
 
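
For reviewers, here is the backfill rule in isolation. `coalesceMissingPosField` is copied verbatim from the diff; the `PosToken` interface is a hypothetical stand-in for the repo's token type, trimmed to just the POS fields this change touches:

```ts
// Hypothetical stand-in for the repo's token type; only the POS detail
// fields relevant to this change are modeled here.
interface PosToken {
  surface: string;
  pos1?: string;
  pos2?: string;
  pos3?: string;
}

// Verbatim from parser-enrichment-stage.ts: keep `current` only when it is
// a non-blank string; otherwise fall back to the MeCab-provided value.
function coalesceMissingPosField(
  current: string | undefined,
  fallback: string | undefined,
): string | undefined {
  return typeof current === 'string' && current.trim().length > 0 ? current : fallback;
}

// '' and ' ' are not nullish, so the old `token.pos2 ?? metadata.pos2`
// fallback let them survive; trim-based coalescing backfills them instead.
const token: PosToken = { surface: 'は', pos1: '助詞', pos2: '', pos3: ' ' };
const mecab: PosToken = { surface: 'は', pos1: '助詞', pos2: '係助詞', pos3: '一般' };

console.log(coalesceMissingPosField(token.pos2, mecab.pos2)); // '係助詞'
console.log(coalesceMissingPosField(token.pos3, mecab.pos3)); // '一般'
console.log(token.pos2 ?? mecab.pos2); // '' — the behavior the diff fixes
```

This is also why the new parser-enrichment test seeds `pos2: ''` and `pos3: ' '` rather than `undefined`: it pins down the blank and whitespace-only cases that `??` alone could not backfill.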