mirror of https://github.com/ksyasuda/SubMiner.git
synced 2026-04-27 16:19:35 -07:00
fix(tokenizer): preserve annotation and enrichment behavior
changes/305-tokenizer-word-class-pos-filtering.md (new file, 6 lines)
@@ -0,0 +1,6 @@
+type: fixed
+area: tokenizer
+
+- Use Yomitan `wordClasses` metadata for subtitle POS filtering.
+- Backfill blank MeCab POS detail fields during parser enrichment.
+- Keep subtitle annotation metadata stripped from token results.
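The first bullet refers to code that does not appear in the hunks below. As a rough sketch of the idea only, with a hypothetical token shape, tag set, and function name (none of these are SubMiner's actual API):

type AnnotatedToken = { surface: string; wordClasses?: string[] };

// Word-class tags in a Yomitan/JMdict style ('prt' = particle, 'int' = interjection).
const FILTERED_WORD_CLASSES: ReadonlySet<string> = new Set(['prt', 'int']);

function keepForSubtitleAnnotation(token: AnnotatedToken): boolean {
  // With no word-class metadata, keep the token rather than guess.
  if (!token.wordClasses || token.wordClasses.length === 0) return true;
  // Drop the token only when every reported word class is filtered.
  return !token.wordClasses.every((cls) => FILTERED_WORD_CLASSES.has(cls));
}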
@@ -160,7 +160,7 @@ async function applyAnnotationStage(
   options: TokenizerAnnotationOptions,
 ): Promise<MergedToken[]> {
   if (!hasAnyAnnotationEnabled(options)) {
-    return tokens;
+    return stripSubtitleAnnotationMetadata(tokens);
   }
 
   if (!annotationStageModulePromise) {
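Previously the disabled path returned the tokens untouched; the fix routes them through the stripping helper so annotation metadata never leaks into results, matching the third changelog bullet. A minimal sketch of what such a helper could look like, assuming a single optional metadata field (the real MergedToken shape is not shown in this diff):

interface TokenWithMetadata {
  surface: string;
  subtitleAnnotation?: unknown; // hypothetical field name
}

function stripAnnotationMetadata(tokens: TokenWithMetadata[]): TokenWithMetadata[] {
  // Return shallow copies without the annotation payload.
  return tokens.map(({ subtitleAnnotation: _dropped, ...rest }) => rest);
}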
@@ -789,10 +789,7 @@ test('annotateTokens excludes default non-independent pos2 from frequency and N+
 
   const result = annotateTokens(
     tokens,
-    makeDeps({
-      isKnownWord: (text) => text === 'た' || text === '負',
-      getJlptLevel: (text) => (text === 'た' || text === '負' ? 'N3' : null),
-    }),
+    makeDeps(),
     {
       minSentenceWordsForNPlusOne: 1,
     },
@@ -39,6 +39,33 @@ test('enrichTokensWithMecabPos1 fills missing pos1 using surface-sequence fallba
   assert.equal(enriched[0]?.pos1, '助詞');
 });
 
+test('enrichTokensWithMecabPos1 backfills blank pos2 and pos3 fields', () => {
+  const tokens = [
+    makeToken({
+      surface: 'は',
+      startPos: 0,
+      endPos: 1,
+      pos1: '助詞',
+      pos2: '',
+      pos3: ' ',
+    }),
+  ];
+  const mecabTokens = [
+    makeToken({
+      surface: 'は',
+      startPos: 0,
+      endPos: 1,
+      pos1: '助詞',
+      pos2: '係助詞',
+      pos3: '一般',
+    }),
+  ];
+
+  const enriched = enrichTokensWithMecabPos1(tokens, mecabTokens);
+  assert.equal(enriched[0]?.pos2, '係助詞');
+  assert.equal(enriched[0]?.pos3, '一般');
+});
+
 test('enrichTokensWithMecabPos1 keeps partOfSpeech unchanged and only enriches POS tags', () => {
   const tokens = [makeToken({ surface: 'これは', startPos: 0, endPos: 3 })];
   const mecabTokens = [
@@ -120,6 +120,13 @@ function lowerBoundByIndex(candidates: IndexedMecabToken[], targetIndex: number)
   return low;
 }
 
+function coalesceMissingPosField(
+  current: string | undefined,
+  fallback: string | undefined,
+): string | undefined {
+  return typeof current === 'string' && current.trim().length > 0 ? current : fallback;
+}
+
 function joinUniqueTags(values: Array<string | undefined>): string | undefined {
   const unique: string[] = [];
   for (const value of values) {
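This helper replaces the `??` fallbacks at the call sites below. `??` treats only null and undefined as missing, so a blank or whitespace-only POS field from the primary tokenizer would have blocked the MeCab backfill. Given the definition above:

coalesceMissingPosField(undefined, '係助詞'); // '係助詞' (same result as ??)
coalesceMissingPosField('', '係助詞');        // '係助詞' (blank string is backfilled)
coalesceMissingPosField(' ', '一般');         // '一般' (whitespace-only counts as missing)
coalesceMissingPosField('助詞', '名詞');      // '助詞' (a real value is kept)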
@@ -329,9 +336,9 @@ function fillMissingPos1BySurfaceSequence(
     cursor = best.index + 1;
     return {
       ...token,
-      pos1: token.pos1 ?? best.pos1,
-      pos2: token.pos2 ?? best.pos2,
-      pos3: token.pos3 ?? best.pos3,
+      pos1: coalesceMissingPosField(token.pos1, best.pos1),
+      pos2: coalesceMissingPosField(token.pos2, best.pos2),
+      pos3: coalesceMissingPosField(token.pos3, best.pos3),
     };
   });
 }
@@ -412,9 +419,9 @@ export function enrichTokensWithMecabPos1(
 
     return {
       ...token,
-      pos1: token.pos1 ?? metadata.pos1,
-      pos2: token.pos2 ?? metadata.pos2,
-      pos3: token.pos3 ?? metadata.pos3,
+      pos1: coalesceMissingPosField(token.pos1, metadata.pos1),
+      pos2: coalesceMissingPosField(token.pos2, metadata.pos2),
+      pos3: coalesceMissingPosField(token.pos3, metadata.pos3),
     };
   });
 
@@ -13,6 +13,11 @@ const KATAKANA_TO_HIRAGANA_OFFSET = 0x60;
 const KATAKANA_CODEPOINT_START = 0x30a1;
 const KATAKANA_CODEPOINT_END = 0x30f6;
 
+const STANDALONE_GRAMMAR_PARTICLE_PHRASES = ['たって', 'だって'] as const;
+const STANDALONE_GRAMMAR_PARTICLE_PHRASES_SET: ReadonlySet<string> = new Set(
+  STANDALONE_GRAMMAR_PARTICLE_PHRASES,
+);
+
 const SUBTITLE_ANNOTATION_EXCLUDED_TERMS = new Set([
   'あ',
   'ああ',
@@ -20,9 +25,7 @@ const SUBTITLE_ANNOTATION_EXCLUDED_TERMS = new Set([
   'うう',
   'おお',
   'くれ',
-  'たって',
   'って',
-  'だって',
   'はあ',
   'はは',
   'べき',
@@ -31,6 +34,7 @@ const SUBTITLE_ANNOTATION_EXCLUDED_TERMS = new Set([
   'ほう',
   'もんか',
   'ものか',
+  ...STANDALONE_GRAMMAR_PARTICLE_PHRASES,
 ]);
 const SUBTITLE_ANNOTATION_EXCLUDED_EXPLANATORY_ENDING_PREFIXES = ['ん', 'の', 'なん', 'なの'];
 const SUBTITLE_ANNOTATION_EXCLUDED_EXPLANATORY_ENDING_CORES = [
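Hoisting the phrase list and deriving both consumers from it keeps the excluded-terms set and the membership check in sync. The pattern in miniature (a standalone sketch, not the module's real data):

const PHRASES = ['たって', 'だって'] as const;              // single source of truth
const PHRASES_SET: ReadonlySet<string> = new Set(PHRASES);  // O(1) membership checks
const EXCLUDED_TERMS = new Set(['べき', ...PHRASES]);       // reused via spread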
@@ -98,8 +102,6 @@ const STANDALONE_GRAMMAR_PARTICLE_SURFACES = new Set([
   'よ',
   'を',
 ]);
-const STANDALONE_GRAMMAR_PARTICLE_PHRASES = new Set(['たって', 'だって']);
-
 export interface SubtitleAnnotationFilterOptions {
   pos1Exclusions?: ReadonlySet<string>;
   pos2Exclusions?: ReadonlySet<string>;
@@ -327,7 +329,7 @@ function isStandaloneGrammarParticle(token: MergedToken): boolean {
   return (
     normalizedSurface === normalizedHeadword &&
     (STANDALONE_GRAMMAR_PARTICLE_SURFACES.has(normalizedSurface) ||
-      STANDALONE_GRAMMAR_PARTICLE_PHRASES.has(normalizedSurface))
+      STANDALONE_GRAMMAR_PARTICLE_PHRASES_SET.has(normalizedSurface))
   );
 }