Mirror of https://github.com/ksyasuda/SubMiner.git (synced 2026-03-20 12:11:28 -07:00)
fix: exclude auxiliary grammar tails from subtitle annotations
This commit is contained in:
@@ -3483,6 +3483,79 @@ test('tokenizeSubtitle keeps trailing quote-particle merged tokens hoverable whi
|
||||
);
|
||||
});
|
||||
|
||||
// Regression test for the そうだ grammar tail: the Yomitan fixture yields そうだ as one
// token, while the MeCab fixture splits it into the 助動詞語幹 stem そう plus the auxiliary だ.
// The tail must still appear in `result.tokens`, but without a frequency rank or JLPT level,
// even though both lookups would return a value for it.
test('tokenizeSubtitle keeps auxiliary-stem そうだ grammar tails hoverable while clearing annotation metadata', async () => {
  const subtitleText = '与えるそうだ';

  // Flags that are identical for every MeCab token in this fixture.
  const plainTokenFlags = { isMerged: false, isKnown: false, isNPlusOneTarget: false };

  const deps = makeDepsFromYomitanTokens(
    [
      { surface: '与える', reading: 'あたえる', headword: '与える' },
      { surface: 'そうだ', reading: 'そうだ', headword: 'そうだ' },
    ],
    {
      getFrequencyDictionaryEnabled: () => true,
      // Both words have dictionary data, so any missing metadata in the result is
      // caused by the annotation exclusion rather than by a failed lookup.
      getFrequencyRank: (text) => {
        if (text === '与える') return 100;
        if (text === 'そうだ') return 12;
        return null;
      },
      getJlptLevel: (text) => {
        if (text === '与える') return 'N3';
        if (text === 'そうだ') return 'N5';
        return null;
      },
      tokenizeWithMecab: async () => [
        {
          headword: '与える',
          surface: '与える',
          reading: 'アタエル',
          startPos: 0,
          endPos: 3,
          partOfSpeech: PartOfSpeech.verb,
          pos1: '動詞',
          pos2: '自立',
          ...plainTokenFlags,
        },
        {
          headword: 'そう',
          surface: 'そう',
          reading: 'ソウ',
          startPos: 3,
          endPos: 5,
          partOfSpeech: PartOfSpeech.noun,
          pos1: '名詞',
          pos2: '特殊',
          pos3: '助動詞語幹',
          ...plainTokenFlags,
        },
        {
          headword: 'だ',
          surface: 'だ',
          reading: 'ダ',
          startPos: 5,
          endPos: 6,
          partOfSpeech: PartOfSpeech.bound_auxiliary,
          pos1: '助動詞',
          ...plainTokenFlags,
        },
      ],
      getMinSentenceWordsForNPlusOne: () => 1,
    },
  );

  const result = await tokenizeSubtitle(subtitleText, deps);

  assert.equal(result.text, '与えるそうだ');

  // Compare only the fields relevant to annotation rendering.
  const annotationView = result.tokens?.map(({ surface, headword, frequencyRank, jlptLevel }) => ({
    surface,
    headword,
    frequencyRank,
    jlptLevel,
  }));
  const expectedView = [
    { surface: '与える', headword: '与える', frequencyRank: 100, jlptLevel: 'N3' },
    { surface: 'そうだ', headword: 'そうだ', frequencyRank: undefined, jlptLevel: undefined },
  ];

  assert.deepEqual(annotationView, expectedView);
});
|
||||
|
||||
test('tokenizeSubtitle excludes single-kana merged tokens from frequency highlighting', async () => {
|
||||
const result = await tokenizeSubtitle(
|
||||
'た',
|
||||
|
||||
@@ -234,6 +234,19 @@ test('shouldExcludeTokenFromSubtitleAnnotations excludes explanatory ending vari
|
||||
}
|
||||
});
|
||||
|
||||
// A merged そうだ token (pos1 `名詞|助動詞`, pos3 `助動詞語幹`) must be reported as excluded
// from subtitle annotations.
test('shouldExcludeTokenFromSubtitleAnnotations excludes auxiliary-stem そうだ grammar tails', () => {
  const isExcluded = shouldExcludeTokenFromSubtitleAnnotations(
    makeToken({
      surface: 'そうだ',
      headword: 'そうだ',
      reading: 'ソウダ',
      pos1: '名詞|助動詞',
      pos2: '特殊',
      pos3: '助動詞語幹',
    }),
  );

  assert.equal(isExcluded, true);
});
|
||||
|
||||
test('shouldExcludeTokenFromSubtitleAnnotations keeps lexical tokens outside explanatory ending family', () => {
|
||||
const token = makeToken({
|
||||
surface: '問題',
|
||||
|
||||
@@ -100,6 +100,7 @@ function normalizePos1Tag(pos1: string | undefined): string {
|
||||
|
||||
// POS1 tag 感動詞 (interjection).
// NOTE(review): presumably tokens with this tag are always excluded from subtitle
// annotations — the consumer of this set is outside this view; confirm.
const SUBTITLE_ANNOTATION_EXCLUDED_POS1 = new Set<string>(['感動詞']);

// POS1 tags 助詞 (particle), 助動詞 (auxiliary verb) and 連体詞 (adnominal).
// NOTE(review): presumably marks tokens made up purely of grammar — the consumer of
// this set is outside this view; confirm.
const SUBTITLE_ANNOTATION_GRAMMAR_ONLY_POS1 = new Set<string>(['助詞', '助動詞', '連体詞']);

// POS1 tags a token may carry and still be treated as an auxiliary-stem grammar tail
// (see isAuxiliaryStemGrammarTailToken): 名詞 (noun), 助動詞 (auxiliary verb), 助詞 (particle).
const AUXILIARY_STEM_GRAMMAR_TAIL_POS1 = new Set<string>(['名詞', '助動詞', '助詞']);
|
||||
|
||||
function splitNormalizedTagParts(normalizedTag: string): string[] {
|
||||
if (!normalizedTag) {
|
||||
@@ -156,6 +157,16 @@ function isExcludedTrailingParticleMergedToken(token: MergedToken): boolean {
|
||||
return trailingPos1.length > 0 && trailingPos1.every((part) => part === '助詞');
|
||||
}
|
||||
|
||||
/**
 * Detects auxiliary-stem grammar tails such as a merged そうだ token
 * (pos1 `名詞|助動詞`, pos3 `助動詞語幹`).
 *
 * A token matches when both conditions hold:
 *  - it has at least one normalized pos1 part, and every part is listed in
 *    `AUXILIARY_STEM_GRAMMAR_TAIL_POS1`;
 *  - its normalized pos3 parts include `助動詞語幹`.
 *
 * @param token Merged token whose `pos1` and `pos3` tags are inspected.
 * @returns `true` when the token is an auxiliary-stem grammar tail, otherwise `false`.
 */
function isAuxiliaryStemGrammarTailToken(token: MergedToken): boolean {
  const pos1Tags = splitNormalizedTagParts(normalizePos1Tag(token.pos1));
  if (pos1Tags.length === 0) {
    return false;
  }

  // Any pos1 part outside the allowed set disqualifies the token.
  for (const tag of pos1Tags) {
    if (!AUXILIARY_STEM_GRAMMAR_TAIL_POS1.has(tag)) {
      return false;
    }
  }

  // NOTE(review): pos3 is normalized with the pos2 normalizer — presumably the two
  // tags share the same normalization rules; confirm.
  const pos3Tags = splitNormalizedTagParts(normalizePos2Tag(token.pos3));
  return pos3Tags.includes('助動詞語幹');
}
|
||||
|
||||
function resolvePos1Exclusions(options: AnnotationStageOptions): ReadonlySet<string> {
|
||||
if (options.pos1Exclusions) {
|
||||
return options.pos1Exclusions;
|
||||
@@ -626,6 +637,10 @@ export function shouldExcludeTokenFromSubtitleAnnotations(token: MergedToken): b
|
||||
return true;
|
||||
}
|
||||
|
||||
if (isAuxiliaryStemGrammarTailToken(token)) {
|
||||
return true;
|
||||
}
|
||||
|
||||
if (isExcludedTrailingParticleMergedToken(token)) {
|
||||
return true;
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user