fix(tokenizer): preserve annotation and enrichment behavior

This commit is contained in:
2026-04-26 17:57:39 -07:00
parent 96894ff85c
commit b10a7b3e98
6 changed files with 55 additions and 16 deletions

View File

@@ -0,0 +1,6 @@
type: fixed
area: tokenizer
- Use Yomitan `wordClasses` metadata for subtitle POS filtering.
- Backfill blank MeCab POS detail fields during parser enrichment.
- Keep subtitle annotation metadata stripped from token results.

View File

@@ -160,7 +160,7 @@ async function applyAnnotationStage(
options: TokenizerAnnotationOptions, options: TokenizerAnnotationOptions,
): Promise<MergedToken[]> { ): Promise<MergedToken[]> {
if (!hasAnyAnnotationEnabled(options)) { if (!hasAnyAnnotationEnabled(options)) {
return tokens; return stripSubtitleAnnotationMetadata(tokens);
} }
if (!annotationStageModulePromise) { if (!annotationStageModulePromise) {

View File

@@ -789,10 +789,7 @@ test('annotateTokens excludes default non-independent pos2 from frequency and N+
const result = annotateTokens( const result = annotateTokens(
tokens, tokens,
makeDeps({ makeDeps(),
isKnownWord: (text) => text === 'た' || text === '負',
getJlptLevel: (text) => (text === 'た' || text === '負' ? 'N3' : null),
}),
{ {
minSentenceWordsForNPlusOne: 1, minSentenceWordsForNPlusOne: 1,
}, },

View File

@@ -39,6 +39,33 @@ test('enrichTokensWithMecabPos1 fills missing pos1 using surface-sequence fallba
assert.equal(enriched[0]?.pos1, '助詞'); assert.equal(enriched[0]?.pos1, '助詞');
}); });
test('enrichTokensWithMecabPos1 backfills blank pos2 and pos3 fields', () => {
	// Subtitle-side token whose pos2/pos3 are present but blank
	// (empty string and whitespace-only string respectively).
	const subtitleTokens = [
		makeToken({ surface: 'は', startPos: 0, endPos: 1, pos1: '助詞', pos2: '', pos3: ' ' }),
	];
	// Aligned MeCab token carrying the detailed POS fields to backfill from.
	const mecabTokens = [
		makeToken({ surface: 'は', startPos: 0, endPos: 1, pos1: '助詞', pos2: '係助詞', pos3: '一般' }),
	];
	const [first] = enrichTokensWithMecabPos1(subtitleTokens, mecabTokens);
	// Blank fields should be replaced by the MeCab values.
	assert.equal(first?.pos2, '係助詞');
	assert.equal(first?.pos3, '一般');
});
test('enrichTokensWithMecabPos1 keeps partOfSpeech unchanged and only enriches POS tags', () => { test('enrichTokensWithMecabPos1 keeps partOfSpeech unchanged and only enriches POS tags', () => {
const tokens = [makeToken({ surface: 'これは', startPos: 0, endPos: 3 })]; const tokens = [makeToken({ surface: 'これは', startPos: 0, endPos: 3 })];
const mecabTokens = [ const mecabTokens = [

View File

@@ -120,6 +120,13 @@ function lowerBoundByIndex(candidates: IndexedMecabToken[], targetIndex: number)
return low; return low;
} }
/**
 * Picks `current` when it is a non-blank string; otherwise returns `fallback`.
 * A value consisting only of whitespace counts as missing.
 */
function coalesceMissingPosField(
	current: string | undefined,
	fallback: string | undefined,
): string | undefined {
	if (typeof current !== 'string') {
		return fallback;
	}
	return current.trim() === '' ? fallback : current;
}
function joinUniqueTags(values: Array<string | undefined>): string | undefined { function joinUniqueTags(values: Array<string | undefined>): string | undefined {
const unique: string[] = []; const unique: string[] = [];
for (const value of values) { for (const value of values) {
@@ -329,9 +336,9 @@ function fillMissingPos1BySurfaceSequence(
cursor = best.index + 1; cursor = best.index + 1;
return { return {
...token, ...token,
pos1: token.pos1 ?? best.pos1, pos1: coalesceMissingPosField(token.pos1, best.pos1),
pos2: token.pos2 ?? best.pos2, pos2: coalesceMissingPosField(token.pos2, best.pos2),
pos3: token.pos3 ?? best.pos3, pos3: coalesceMissingPosField(token.pos3, best.pos3),
}; };
}); });
} }
@@ -412,9 +419,9 @@ export function enrichTokensWithMecabPos1(
return { return {
...token, ...token,
pos1: token.pos1 ?? metadata.pos1, pos1: coalesceMissingPosField(token.pos1, metadata.pos1),
pos2: token.pos2 ?? metadata.pos2, pos2: coalesceMissingPosField(token.pos2, metadata.pos2),
pos3: token.pos3 ?? metadata.pos3, pos3: coalesceMissingPosField(token.pos3, metadata.pos3),
}; };
}); });

View File

@@ -13,6 +13,11 @@ const KATAKANA_TO_HIRAGANA_OFFSET = 0x60;
const KATAKANA_CODEPOINT_START = 0x30a1; const KATAKANA_CODEPOINT_START = 0x30a1;
const KATAKANA_CODEPOINT_END = 0x30f6; const KATAKANA_CODEPOINT_END = 0x30f6;
// Multi-character grammar-particle phrases that may appear as standalone tokens.
const STANDALONE_GRAMMAR_PARTICLE_PHRASES = ['たって', 'だって'] as const;
// Set form of the same phrases for O(1) membership checks.
const STANDALONE_GRAMMAR_PARTICLE_PHRASES_SET: ReadonlySet<string> =
	new Set<string>([...STANDALONE_GRAMMAR_PARTICLE_PHRASES]);
const SUBTITLE_ANNOTATION_EXCLUDED_TERMS = new Set([ const SUBTITLE_ANNOTATION_EXCLUDED_TERMS = new Set([
'あ', 'あ',
'ああ', 'ああ',
@@ -20,9 +25,7 @@ const SUBTITLE_ANNOTATION_EXCLUDED_TERMS = new Set([
'うう', 'うう',
'おお', 'おお',
'くれ', 'くれ',
'たって',
'って', 'って',
'だって',
'はあ', 'はあ',
'はは', 'はは',
'べき', 'べき',
@@ -31,6 +34,7 @@ const SUBTITLE_ANNOTATION_EXCLUDED_TERMS = new Set([
'ほう', 'ほう',
'もんか', 'もんか',
'ものか', 'ものか',
...STANDALONE_GRAMMAR_PARTICLE_PHRASES,
]); ]);
const SUBTITLE_ANNOTATION_EXCLUDED_EXPLANATORY_ENDING_PREFIXES = ['ん', 'の', 'なん', 'なの']; const SUBTITLE_ANNOTATION_EXCLUDED_EXPLANATORY_ENDING_PREFIXES = ['ん', 'の', 'なん', 'なの'];
const SUBTITLE_ANNOTATION_EXCLUDED_EXPLANATORY_ENDING_CORES = [ const SUBTITLE_ANNOTATION_EXCLUDED_EXPLANATORY_ENDING_CORES = [
@@ -98,8 +102,6 @@ const STANDALONE_GRAMMAR_PARTICLE_SURFACES = new Set([
'よ', 'よ',
'を', 'を',
]); ]);
const STANDALONE_GRAMMAR_PARTICLE_PHRASES = new Set(['たって', 'だって']);
export interface SubtitleAnnotationFilterOptions { export interface SubtitleAnnotationFilterOptions {
pos1Exclusions?: ReadonlySet<string>; pos1Exclusions?: ReadonlySet<string>;
pos2Exclusions?: ReadonlySet<string>; pos2Exclusions?: ReadonlySet<string>;
@@ -327,7 +329,7 @@ function isStandaloneGrammarParticle(token: MergedToken): boolean {
return ( return (
normalizedSurface === normalizedHeadword && normalizedSurface === normalizedHeadword &&
(STANDALONE_GRAMMAR_PARTICLE_SURFACES.has(normalizedSurface) || (STANDALONE_GRAMMAR_PARTICLE_SURFACES.has(normalizedSurface) ||
STANDALONE_GRAMMAR_PARTICLE_PHRASES.has(normalizedSurface)) STANDALONE_GRAMMAR_PARTICLE_PHRASES_SET.has(normalizedSurface))
); );
} }