fix(tokenizer): preserve annotation and enrichment behavior

This commit is contained in:
2026-04-26 17:57:39 -07:00
parent 96894ff85c
commit b10a7b3e98
6 changed files with 55 additions and 16 deletions

View File

@@ -0,0 +1,6 @@
type: fixed
area: tokenizer
- Use Yomitan `wordClasses` metadata for subtitle POS filtering.
- Backfill blank MeCab POS detail fields during parser enrichment.
- Keep subtitle annotation metadata stripped from token results.

View File

@@ -160,7 +160,7 @@ async function applyAnnotationStage(
options: TokenizerAnnotationOptions,
): Promise<MergedToken[]> {
if (!hasAnyAnnotationEnabled(options)) {
return tokens;
return stripSubtitleAnnotationMetadata(tokens);
}
if (!annotationStageModulePromise) {

View File

@@ -789,10 +789,7 @@ test('annotateTokens excludes default non-independent pos2 from frequency and N+
const result = annotateTokens(
tokens,
makeDeps({
isKnownWord: (text) => text === 'た' || text === '負',
getJlptLevel: (text) => (text === 'た' || text === '負' ? 'N3' : null),
}),
makeDeps(),
{
minSentenceWordsForNPlusOne: 1,
},

View File

@@ -39,6 +39,33 @@ test('enrichTokensWithMecabPos1 fills missing pos1 using surface-sequence fallba
assert.equal(enriched[0]?.pos1, '助詞');
});
test('enrichTokensWithMecabPos1 backfills blank pos2 and pos3 fields', () => {
  // Token whose pos2 is an empty string and pos3 is whitespace-only;
  // both should be treated as missing and eligible for backfill.
  const sparseTokens = [
    makeToken({
      surface: 'は',
      startPos: 0,
      endPos: 1,
      pos1: '助詞',
      pos2: '',
      pos3: ' ',
    }),
  ];
  // MeCab token for the same span carrying the full POS detail fields.
  const mecabDetail = [
    makeToken({
      surface: 'は',
      startPos: 0,
      endPos: 1,
      pos1: '助詞',
      pos2: '係助詞',
      pos3: '一般',
    }),
  ];
  const [first] = enrichTokensWithMecabPos1(sparseTokens, mecabDetail);
  // Blank fields are replaced by the MeCab values.
  assert.equal(first?.pos2, '係助詞');
  assert.equal(first?.pos3, '一般');
});
test('enrichTokensWithMecabPos1 keeps partOfSpeech unchanged and only enriches POS tags', () => {
const tokens = [makeToken({ surface: 'これは', startPos: 0, endPos: 3 })];
const mecabTokens = [

View File

@@ -120,6 +120,13 @@ function lowerBoundByIndex(candidates: IndexedMecabToken[], targetIndex: number)
return low;
}
/**
 * Picks `current` when it is a non-blank string; otherwise returns `fallback`.
 * Whitespace-only strings count as missing, so blank POS detail fields can be
 * backfilled from a fallback value.
 */
function coalesceMissingPosField(
  current: string | undefined,
  fallback: string | undefined,
): string | undefined {
  if (typeof current !== 'string') {
    return fallback;
  }
  const hasContent = current.trim().length > 0;
  return hasContent ? current : fallback;
}
function joinUniqueTags(values: Array<string | undefined>): string | undefined {
const unique: string[] = [];
for (const value of values) {
@@ -329,9 +336,9 @@ function fillMissingPos1BySurfaceSequence(
cursor = best.index + 1;
return {
...token,
pos1: token.pos1 ?? best.pos1,
pos2: token.pos2 ?? best.pos2,
pos3: token.pos3 ?? best.pos3,
pos1: coalesceMissingPosField(token.pos1, best.pos1),
pos2: coalesceMissingPosField(token.pos2, best.pos2),
pos3: coalesceMissingPosField(token.pos3, best.pos3),
};
});
}
@@ -412,9 +419,9 @@ export function enrichTokensWithMecabPos1(
return {
...token,
pos1: token.pos1 ?? metadata.pos1,
pos2: token.pos2 ?? metadata.pos2,
pos3: token.pos3 ?? metadata.pos3,
pos1: coalesceMissingPosField(token.pos1, metadata.pos1),
pos2: coalesceMissingPosField(token.pos2, metadata.pos2),
pos3: coalesceMissingPosField(token.pos3, metadata.pos3),
};
});

View File

@@ -13,6 +13,11 @@ const KATAKANA_TO_HIRAGANA_OFFSET = 0x60;
const KATAKANA_CODEPOINT_START = 0x30a1;
const KATAKANA_CODEPOINT_END = 0x30f6;
// Multi-character surfaces treated as standalone grammar particles.
// Kept as a readonly tuple so the literals can also be spread into other
// exclusion sets while preserving their literal types.
const STANDALONE_GRAMMAR_PARTICLE_PHRASES = ['たって', 'だって'] as const;
// Set view of the same phrases for O(1) membership checks.
const STANDALONE_GRAMMAR_PARTICLE_PHRASES_SET: ReadonlySet<string> = new Set(
STANDALONE_GRAMMAR_PARTICLE_PHRASES,
);
const SUBTITLE_ANNOTATION_EXCLUDED_TERMS = new Set([
'あ',
'ああ',
@@ -20,9 +25,7 @@ const SUBTITLE_ANNOTATION_EXCLUDED_TERMS = new Set([
'うう',
'おお',
'くれ',
'たって',
'って',
'だって',
'はあ',
'はは',
'べき',
@@ -31,6 +34,7 @@ const SUBTITLE_ANNOTATION_EXCLUDED_TERMS = new Set([
'ほう',
'もんか',
'ものか',
...STANDALONE_GRAMMAR_PARTICLE_PHRASES,
]);
const SUBTITLE_ANNOTATION_EXCLUDED_EXPLANATORY_ENDING_PREFIXES = ['ん', 'の', 'なん', 'なの'];
const SUBTITLE_ANNOTATION_EXCLUDED_EXPLANATORY_ENDING_CORES = [
@@ -98,8 +102,6 @@ const STANDALONE_GRAMMAR_PARTICLE_SURFACES = new Set([
'よ',
'を',
]);
const STANDALONE_GRAMMAR_PARTICLE_PHRASES = new Set(['たって', 'だって']);
export interface SubtitleAnnotationFilterOptions {
pos1Exclusions?: ReadonlySet<string>;
pos2Exclusions?: ReadonlySet<string>;
@@ -327,7 +329,7 @@ function isStandaloneGrammarParticle(token: MergedToken): boolean {
return (
normalizedSurface === normalizedHeadword &&
(STANDALONE_GRAMMAR_PARTICLE_SURFACES.has(normalizedSurface) ||
STANDALONE_GRAMMAR_PARTICLE_PHRASES.has(normalizedSurface))
STANDALONE_GRAMMAR_PARTICLE_PHRASES_SET.has(normalizedSurface))
);
}