mirror of
https://github.com/ksyasuda/SubMiner.git
synced 2026-04-28 04:19:27 -07:00
fix(tokenizer): preserve annotation and enrichment behavior
This commit is contained in:
6
changes/305-tokenizer-word-class-pos-filtering.md
Normal file
6
changes/305-tokenizer-word-class-pos-filtering.md
Normal file
@@ -0,0 +1,6 @@
type: fixed
area: tokenizer

- Use Yomitan `wordClasses` metadata for subtitle POS filtering.
- Backfill blank MeCab POS detail fields during parser enrichment.
- Keep subtitle annotation metadata stripped from token results.
@@ -160,7 +160,7 @@ async function applyAnnotationStage(
|
|||||||
options: TokenizerAnnotationOptions,
|
options: TokenizerAnnotationOptions,
|
||||||
): Promise<MergedToken[]> {
|
): Promise<MergedToken[]> {
|
||||||
if (!hasAnyAnnotationEnabled(options)) {
|
if (!hasAnyAnnotationEnabled(options)) {
|
||||||
return tokens;
|
return stripSubtitleAnnotationMetadata(tokens);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (!annotationStageModulePromise) {
|
if (!annotationStageModulePromise) {
|
||||||
|
|||||||
@@ -789,10 +789,7 @@ test('annotateTokens excludes default non-independent pos2 from frequency and N+
|
|||||||
|
|
||||||
const result = annotateTokens(
|
const result = annotateTokens(
|
||||||
tokens,
|
tokens,
|
||||||
makeDeps({
|
makeDeps(),
|
||||||
isKnownWord: (text) => text === 'た' || text === '負',
|
|
||||||
getJlptLevel: (text) => (text === 'た' || text === '負' ? 'N3' : null),
|
|
||||||
}),
|
|
||||||
{
|
{
|
||||||
minSentenceWordsForNPlusOne: 1,
|
minSentenceWordsForNPlusOne: 1,
|
||||||
},
|
},
|
||||||
|
|||||||
@@ -39,6 +39,33 @@ test('enrichTokensWithMecabPos1 fills missing pos1 using surface-sequence fallba
|
|||||||
assert.equal(enriched[0]?.pos1, '助詞');
|
assert.equal(enriched[0]?.pos1, '助詞');
|
||||||
});
|
});
|
||||||
|
|
||||||
|
// Verifies that POS detail fields which are present but blank (empty string or
// whitespace-only) on the input token are replaced by the values carried on the
// MeCab token covering the same surface span.
test('enrichTokensWithMecabPos1 backfills blank pos2 and pos3 fields', () => {
  // Input token: pos1 is populated, pos2 is empty and pos3 is a single space.
  const tokens = [
    makeToken({
      surface: 'は',
      startPos: 0,
      endPos: 1,
      pos1: '助詞',
      pos2: '',
      pos3: ' ',
    }),
  ];
  // MeCab token with the same surface and span, carrying full POS detail.
  const mecabTokens = [
    makeToken({
      surface: 'は',
      startPos: 0,
      endPos: 1,
      pos1: '助詞',
      pos2: '係助詞',
      pos3: '一般',
    }),
  ];

  const enriched = enrichTokensWithMecabPos1(tokens, mecabTokens);
  assert.equal(enriched[0]?.pos2, '係助詞');
  assert.equal(enriched[0]?.pos3, '一般');
});
|
||||||
|
|
||||||
test('enrichTokensWithMecabPos1 keeps partOfSpeech unchanged and only enriches POS tags', () => {
|
test('enrichTokensWithMecabPos1 keeps partOfSpeech unchanged and only enriches POS tags', () => {
|
||||||
const tokens = [makeToken({ surface: 'これは', startPos: 0, endPos: 3 })];
|
const tokens = [makeToken({ surface: 'これは', startPos: 0, endPos: 3 })];
|
||||||
const mecabTokens = [
|
const mecabTokens = [
|
||||||
|
|||||||
@@ -120,6 +120,13 @@ function lowerBoundByIndex(candidates: IndexedMecabToken[], targetIndex: number)
|
|||||||
return low;
|
return low;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Picks the POS field value to keep when merging a token with fallback metadata.
 *
 * Unlike a plain `??`, a value that is an empty or whitespace-only string is
 * treated the same as a missing one.
 *
 * @param current - The value already on the token, if any.
 * @param fallback - The value to use when `current` is missing or blank.
 * @returns `current` when it is a string with at least one non-whitespace
 *   character; otherwise `fallback` (which may itself be `undefined`).
 */
function coalesceMissingPosField(
  current: string | undefined,
  fallback: string | undefined,
): string | undefined {
  return typeof current === 'string' && current.trim().length > 0 ? current : fallback;
}
|
||||||
|
|
||||||
function joinUniqueTags(values: Array<string | undefined>): string | undefined {
|
function joinUniqueTags(values: Array<string | undefined>): string | undefined {
|
||||||
const unique: string[] = [];
|
const unique: string[] = [];
|
||||||
for (const value of values) {
|
for (const value of values) {
|
||||||
@@ -329,9 +336,9 @@ function fillMissingPos1BySurfaceSequence(
|
|||||||
cursor = best.index + 1;
|
cursor = best.index + 1;
|
||||||
return {
|
return {
|
||||||
...token,
|
...token,
|
||||||
pos1: token.pos1 ?? best.pos1,
|
pos1: coalesceMissingPosField(token.pos1, best.pos1),
|
||||||
pos2: token.pos2 ?? best.pos2,
|
pos2: coalesceMissingPosField(token.pos2, best.pos2),
|
||||||
pos3: token.pos3 ?? best.pos3,
|
pos3: coalesceMissingPosField(token.pos3, best.pos3),
|
||||||
};
|
};
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
@@ -412,9 +419,9 @@ export function enrichTokensWithMecabPos1(
|
|||||||
|
|
||||||
return {
|
return {
|
||||||
...token,
|
...token,
|
||||||
pos1: token.pos1 ?? metadata.pos1,
|
pos1: coalesceMissingPosField(token.pos1, metadata.pos1),
|
||||||
pos2: token.pos2 ?? metadata.pos2,
|
pos2: coalesceMissingPosField(token.pos2, metadata.pos2),
|
||||||
pos3: token.pos3 ?? metadata.pos3,
|
pos3: coalesceMissingPosField(token.pos3, metadata.pos3),
|
||||||
};
|
};
|
||||||
});
|
});
|
||||||
|
|
||||||
|
|||||||
@@ -13,6 +13,11 @@ const KATAKANA_TO_HIRAGANA_OFFSET = 0x60;
|
|||||||
const KATAKANA_CODEPOINT_START = 0x30a1;
|
const KATAKANA_CODEPOINT_START = 0x30a1;
|
||||||
const KATAKANA_CODEPOINT_END = 0x30f6;
|
const KATAKANA_CODEPOINT_END = 0x30f6;
|
||||||
|
|
||||||
|
/**
 * Multi-character phrases listed once as a readonly tuple so the same entries
 * can be spread into other collections and also queried through the set below.
 */
const STANDALONE_GRAMMAR_PARTICLE_PHRASES = ['たって', 'だって'] as const;

/** Set view of {@link STANDALONE_GRAMMAR_PARTICLE_PHRASES} for membership checks. */
const STANDALONE_GRAMMAR_PARTICLE_PHRASES_SET: ReadonlySet<string> = new Set<string>(
  STANDALONE_GRAMMAR_PARTICLE_PHRASES,
);
|
||||||
|
|
||||||
const SUBTITLE_ANNOTATION_EXCLUDED_TERMS = new Set([
|
const SUBTITLE_ANNOTATION_EXCLUDED_TERMS = new Set([
|
||||||
'あ',
|
'あ',
|
||||||
'ああ',
|
'ああ',
|
||||||
@@ -20,9 +25,7 @@ const SUBTITLE_ANNOTATION_EXCLUDED_TERMS = new Set([
|
|||||||
'うう',
|
'うう',
|
||||||
'おお',
|
'おお',
|
||||||
'くれ',
|
'くれ',
|
||||||
'たって',
|
|
||||||
'って',
|
'って',
|
||||||
'だって',
|
|
||||||
'はあ',
|
'はあ',
|
||||||
'はは',
|
'はは',
|
||||||
'べき',
|
'べき',
|
||||||
@@ -31,6 +34,7 @@ const SUBTITLE_ANNOTATION_EXCLUDED_TERMS = new Set([
|
|||||||
'ほう',
|
'ほう',
|
||||||
'もんか',
|
'もんか',
|
||||||
'ものか',
|
'ものか',
|
||||||
|
...STANDALONE_GRAMMAR_PARTICLE_PHRASES,
|
||||||
]);
|
]);
|
||||||
const SUBTITLE_ANNOTATION_EXCLUDED_EXPLANATORY_ENDING_PREFIXES = ['ん', 'の', 'なん', 'なの'];
|
const SUBTITLE_ANNOTATION_EXCLUDED_EXPLANATORY_ENDING_PREFIXES = ['ん', 'の', 'なん', 'なの'];
|
||||||
const SUBTITLE_ANNOTATION_EXCLUDED_EXPLANATORY_ENDING_CORES = [
|
const SUBTITLE_ANNOTATION_EXCLUDED_EXPLANATORY_ENDING_CORES = [
|
||||||
@@ -98,8 +102,6 @@ const STANDALONE_GRAMMAR_PARTICLE_SURFACES = new Set([
|
|||||||
'よ',
|
'よ',
|
||||||
'を',
|
'を',
|
||||||
]);
|
]);
|
||||||
const STANDALONE_GRAMMAR_PARTICLE_PHRASES = new Set(['たって', 'だって']);
|
|
||||||
|
|
||||||
export interface SubtitleAnnotationFilterOptions {
|
export interface SubtitleAnnotationFilterOptions {
|
||||||
pos1Exclusions?: ReadonlySet<string>;
|
pos1Exclusions?: ReadonlySet<string>;
|
||||||
pos2Exclusions?: ReadonlySet<string>;
|
pos2Exclusions?: ReadonlySet<string>;
|
||||||
@@ -327,7 +329,7 @@ function isStandaloneGrammarParticle(token: MergedToken): boolean {
|
|||||||
return (
|
return (
|
||||||
normalizedSurface === normalizedHeadword &&
|
normalizedSurface === normalizedHeadword &&
|
||||||
(STANDALONE_GRAMMAR_PARTICLE_SURFACES.has(normalizedSurface) ||
|
(STANDALONE_GRAMMAR_PARTICLE_SURFACES.has(normalizedSurface) ||
|
||||||
STANDALONE_GRAMMAR_PARTICLE_PHRASES.has(normalizedSurface))
|
STANDALONE_GRAMMAR_PARTICLE_PHRASES_SET.has(normalizedSurface))
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user