fix: exclude auxiliary grammar tails from subtitle annotations

This commit is contained in:
2026-03-19 21:40:20 -07:00
parent ff95934f07
commit 59fa3b427d
4 changed files with 160 additions and 0 deletions

View File

@@ -3483,6 +3483,79 @@ test('tokenizeSubtitle keeps trailing quote-particle merged tokens hoverable whi
);
});
test('tokenizeSubtitle keeps auxiliary-stem そうだ grammar tails hoverable while clearing annotation metadata', async () => {
  // Flags shared by every raw MeCab token in this fixture.
  const plainTokenFlags = { isMerged: false, isKnown: false, isNPlusOneTarget: false };

  // The Yomitan fixture reports そうだ as one token, while the MeCab fixture splits
  // it into そう (名詞 / 特殊 / 助動詞語幹) followed by だ (助動詞).
  const deps = makeDepsFromYomitanTokens(
    [
      { surface: '与える', reading: 'あたえる', headword: '与える' },
      { surface: 'そうだ', reading: 'そうだ', headword: 'そうだ' },
    ],
    {
      getFrequencyDictionaryEnabled: () => true,
      // Both lookups deliberately know そうだ, so an undefined value in the
      // result can only come from the annotation exclusion.
      getFrequencyRank: (text) => {
        if (text === '与える') return 100;
        if (text === 'そうだ') return 12;
        return null;
      },
      getJlptLevel: (text) => {
        if (text === '与える') return 'N3';
        if (text === 'そうだ') return 'N5';
        return null;
      },
      tokenizeWithMecab: async () => [
        {
          headword: '与える',
          surface: '与える',
          reading: 'アタエル',
          startPos: 0,
          endPos: 3,
          partOfSpeech: PartOfSpeech.verb,
          pos1: '動詞',
          pos2: '自立',
          ...plainTokenFlags,
        },
        {
          headword: 'そう',
          surface: 'そう',
          reading: 'ソウ',
          startPos: 3,
          endPos: 5,
          partOfSpeech: PartOfSpeech.noun,
          pos1: '名詞',
          pos2: '特殊',
          pos3: '助動詞語幹',
          ...plainTokenFlags,
        },
        {
          headword: 'だ',
          surface: 'だ',
          reading: 'ダ',
          startPos: 5,
          endPos: 6,
          partOfSpeech: PartOfSpeech.bound_auxiliary,
          pos1: '助動詞',
          ...plainTokenFlags,
        },
      ],
      getMinSentenceWordsForNPlusOne: () => 1,
    },
  );

  const result = await tokenizeSubtitle('与えるそうだ', deps);

  assert.equal(result.text, '与えるそうだ');

  // Project each token down to the fields under test: identity plus annotations.
  const annotationView = result.tokens?.map(
    ({ surface, headword, frequencyRank, jlptLevel }) => ({
      surface,
      headword,
      frequencyRank,
      jlptLevel,
    }),
  );

  // The lexical verb keeps its annotations; the grammar tail is still emitted as
  // a token but carries neither a frequency rank nor a JLPT level.
  assert.deepEqual(annotationView, [
    { surface: '与える', headword: '与える', frequencyRank: 100, jlptLevel: 'N3' },
    { surface: 'そうだ', headword: 'そうだ', frequencyRank: undefined, jlptLevel: undefined },
  ]);
});
test('tokenizeSubtitle excludes single-kana merged tokens from frequency highlighting', async () => {
const result = await tokenizeSubtitle(
'た',

View File

@@ -234,6 +234,19 @@ test('shouldExcludeTokenFromSubtitleAnnotations excludes explanatory ending vari
}
});
test('shouldExcludeTokenFromSubtitleAnnotations excludes auxiliary-stem そうだ grammar tails', () => {
  // A merged そうだ token: pos1 carries two tags joined by '|' (名詞 and 助動詞),
  // and pos3 marks the auxiliary stem (助動詞語幹).
  const isExcluded = shouldExcludeTokenFromSubtitleAnnotations(
    makeToken({
      surface: 'そうだ',
      headword: 'そうだ',
      reading: 'ソウダ',
      pos1: '名詞|助動詞',
      pos2: '特殊',
      pos3: '助動詞語幹',
    }),
  );

  assert.equal(isExcluded, true);
});
test('shouldExcludeTokenFromSubtitleAnnotations keeps lexical tokens outside explanatory ending family', () => {
const token = makeToken({
surface: '問題',

View File

@@ -100,6 +100,7 @@ function normalizePos1Tag(pos1: string | undefined): string {
// pos1 tag set containing 感動詞 (interjection).
// NOTE(review): presumably tokens with this pos1 are excluded from subtitle
// annotations outright — usage is not visible here; confirm against callers.
const SUBTITLE_ANNOTATION_EXCLUDED_POS1 = new Set(['感動詞']);
// pos1 tag set containing 助詞 (particle), 助動詞 (auxiliary verb) and 連体詞 (adnominal).
// NOTE(review): presumably identifies grammar-only tokens — confirm against callers.
const SUBTITLE_ANNOTATION_GRAMMAR_ONLY_POS1 = new Set(['助詞', '助動詞', '連体詞']);
// pos1 tags permitted in an auxiliary-stem grammar tail: 名詞 (noun), 助動詞 (auxiliary verb)
// and 助詞 (particle). isAuxiliaryStemGrammarTailToken requires every pos1 part of a token
// to be a member of this set.
const AUXILIARY_STEM_GRAMMAR_TAIL_POS1 = new Set(['名詞', '助動詞', '助詞']);
function splitNormalizedTagParts(normalizedTag: string): string[] {
if (!normalizedTag) {
@@ -156,6 +157,16 @@ function isExcludedTrailingParticleMergedToken(token: MergedToken): boolean {
return trailingPos1.length > 0 && trailingPos1.every((part) => part === '助詞');
}
/**
 * Reports whether a merged token is an auxiliary-stem grammar tail (for example
 * そうだ built from そう + だ).
 *
 * A token qualifies when both conditions hold:
 *  - its normalized pos1 has at least one part, and every part is a member of
 *    AUXILIARY_STEM_GRAMMAR_TAIL_POS1;
 *  - its normalized pos3 parts include 助動詞語幹 (auxiliary stem).
 *
 * @param token Merged token whose pos1 and pos3 tags are inspected.
 * @returns true when the token matches both conditions, otherwise false.
 */
function isAuxiliaryStemGrammarTailToken(token: MergedToken): boolean {
  const pos1Tags = splitNormalizedTagParts(normalizePos1Tag(token.pos1));
  const hasOnlyGrammarTailPos1 =
    pos1Tags.length > 0 && pos1Tags.every((tag) => AUXILIARY_STEM_GRAMMAR_TAIL_POS1.has(tag));

  // pos3 is only normalized and inspected once pos1 has qualified.
  // NOTE(review): pos3 goes through normalizePos2Tag — presumably that normalizer
  // is not specific to the pos2 level; confirm.
  return (
    hasOnlyGrammarTailPos1 &&
    splitNormalizedTagParts(normalizePos2Tag(token.pos3)).includes('助動詞語幹')
  );
}
function resolvePos1Exclusions(options: AnnotationStageOptions): ReadonlySet<string> {
if (options.pos1Exclusions) {
return options.pos1Exclusions;
@@ -626,6 +637,10 @@ export function shouldExcludeTokenFromSubtitleAnnotations(token: MergedToken): b
return true;
}
if (isAuxiliaryStemGrammarTailToken(token)) {
return true;
}
if (isExcludedTrailingParticleMergedToken(token)) {
return true;
}