Suppress subtitle annotations for grammar fragments

- Hide annotation metadata for auxiliary inflection and ja-nai endings
- Preserve lexical `くれる` forms and add regression coverage
This commit is contained in:
2026-05-02 14:59:01 -07:00
parent 0c051c988c
commit 55ec191db5
7 changed files with 595 additions and 0 deletions
+313
View File
@@ -4227,6 +4227,211 @@ test('tokenizeSubtitle clears all annotations for explanatory contrast endings',
);
});
test('tokenizeSubtitle clears annotations for ja-nai explanatory endings while preserving lexical content', async () => {
  // The Yomitan fixture carries the ending as one token ('じゃないですか'), while the
  // MeCab stub reports it as three pieces: じゃない + です + か.
  const tokenized = await tokenizeSubtitle(
    'みたいなのあるじゃないですか',
    makeDepsFromYomitanTokens(
      [
        { surface: 'みたいな', reading: 'みたいな', headword: 'みたい' },
        { surface: 'の', reading: 'の', headword: 'の' },
        { surface: 'ある', reading: 'ある', headword: 'ある' },
        { surface: 'じゃないですか', reading: 'じゃないですか', headword: 'じゃない' },
      ],
      {
        getFrequencyDictionaryEnabled: () => true,
        getFrequencyRank: (text) => {
          switch (text) {
            case 'みたい':
              return 320;
            case 'ある':
              return 240;
            case 'じゃない':
              return 80;
            default:
              return null;
          }
        },
        getJlptLevel: (text) => {
          switch (text) {
            case 'みたい':
              return 'N4';
            case 'ある':
              return 'N5';
            case 'じゃない':
              return 'N5';
            default:
              return null;
          }
        },
        isKnownWord: (text) => text === 'みたい' || text === 'の',
        getMinSentenceWordsForNPlusOne: () => 1,
        tokenizeWithMecab: async () => [
          {
            headword: 'みたい',
            surface: 'みたい',
            reading: 'ミタイ',
            startPos: 0,
            endPos: 3,
            partOfSpeech: PartOfSpeech.noun,
            pos1: '名詞',
            pos2: '非自立',
            pos3: '形容動詞語幹',
            isMerged: false,
            isKnown: false,
            isNPlusOneTarget: false,
          },
          {
            headword: 'だ',
            surface: 'な',
            reading: 'ナ',
            startPos: 3,
            endPos: 4,
            partOfSpeech: PartOfSpeech.bound_auxiliary,
            pos1: '助動詞',
            pos2: '*',
            isMerged: false,
            isKnown: false,
            isNPlusOneTarget: false,
          },
          {
            headword: 'の',
            surface: 'の',
            reading: '',
            startPos: 4,
            endPos: 5,
            partOfSpeech: PartOfSpeech.noun,
            pos1: '名詞',
            pos2: '非自立',
            isMerged: false,
            isKnown: false,
            isNPlusOneTarget: false,
          },
          {
            headword: 'ある',
            surface: 'ある',
            reading: 'アル',
            startPos: 5,
            endPos: 7,
            partOfSpeech: PartOfSpeech.verb,
            pos1: '動詞',
            pos2: '自立',
            isMerged: false,
            isKnown: false,
            isNPlusOneTarget: false,
          },
          {
            headword: 'じゃない',
            surface: 'じゃない',
            reading: 'ジャナイ',
            startPos: 7,
            endPos: 11,
            partOfSpeech: PartOfSpeech.i_adjective,
            pos1: '接続詞|形容詞',
            pos2: '*|自立',
            isMerged: false,
            isKnown: false,
            isNPlusOneTarget: false,
          },
          {
            headword: 'です',
            surface: 'です',
            reading: 'デス',
            startPos: 11,
            endPos: 13,
            partOfSpeech: PartOfSpeech.bound_auxiliary,
            pos1: '助動詞',
            pos2: '*',
            isMerged: false,
            isKnown: false,
            isNPlusOneTarget: false,
          },
          {
            headword: 'か',
            surface: 'か',
            reading: 'カ',
            startPos: 13,
            endPos: 14,
            partOfSpeech: PartOfSpeech.particle,
            pos1: '助詞',
            pos2: '副助詞/並立助詞/終助詞',
            isMerged: false,
            isKnown: false,
            isNPlusOneTarget: false,
          },
        ],
      },
    ),
  );

  // Projects the first output token with the given surface onto the annotation
  // fields under test; yields undefined when no such token exists.
  const annotationSummary = (surface: string) => {
    const match = tokenized.tokens?.find((token) => token.surface === surface);
    if (match === undefined) {
      return undefined;
    }
    return {
      surface: match.surface,
      headword: match.headword,
      isKnown: match.isKnown,
      isNPlusOneTarget: match.isNPlusOneTarget,
      frequencyRank: match.frequencyRank,
      jlptLevel: match.jlptLevel,
    };
  };

  // The grammar ending must carry no annotation metadata at all.
  assert.deepEqual(annotationSummary('じゃないですか'), {
    surface: 'じゃないですか',
    headword: 'じゃない',
    isKnown: false,
    isNPlusOneTarget: false,
    frequencyRank: undefined,
    jlptLevel: undefined,
  });

  // The lexical verb keeps its frequency rank and JLPT level.
  assert.deepEqual(annotationSummary('ある'), {
    surface: 'ある',
    headword: 'ある',
    isKnown: false,
    isNPlusOneTarget: false,
    frequencyRank: 240,
    jlptLevel: 'N5',
  });
});
test('tokenizeSubtitle clears annotations for standalone polite copula endings without POS metadata', async () => {
  // The MeCab stub resolves to null, so no POS metadata is supplied for any token.
  const tokenized = await tokenizeSubtitle(
    '現実は感じですよ',
    makeDepsFromYomitanTokens(
      [
        { surface: '現実', reading: 'げんじつ', headword: '現実' },
        { surface: 'は', reading: 'は', headword: 'は' },
        { surface: '感じ', reading: 'かんじ', headword: '感じ' },
        { surface: 'ですよ', reading: 'ですよ', headword: 'です' },
      ],
      {
        getFrequencyDictionaryEnabled: () => true,
        getFrequencyRank: (text) => {
          switch (text) {
            case '現実':
              return 600;
            case '感じ':
              return 240;
            case 'です':
              return 50;
            default:
              return null;
          }
        },
        getJlptLevel: (text) => {
          switch (text) {
            case '現実':
              return 'N3';
            case '感じ':
              return 'N4';
            case 'です':
              return 'N5';
            default:
              return null;
          }
        },
        isKnownWord: (text) => text === '現実' || text === 'は' || text === 'です',
        getMinSentenceWordsForNPlusOne: () => 1,
        tokenizeWithMecab: async () => null,
      },
    ),
  );

  // Projects the first output token with the given surface onto the annotation
  // fields under test; yields undefined when no such token exists.
  const annotationSummary = (surface: string) => {
    const match = tokenized.tokens?.find((token) => token.surface === surface);
    if (match === undefined) {
      return undefined;
    }
    return {
      surface: match.surface,
      headword: match.headword,
      isKnown: match.isKnown,
      isNPlusOneTarget: match.isNPlusOneTarget,
      frequencyRank: match.frequencyRank,
      jlptLevel: match.jlptLevel,
    };
  };

  // 'です' is reported as known by the stub, yet the ending must come out unannotated.
  assert.deepEqual(annotationSummary('ですよ'), {
    surface: 'ですよ',
    headword: 'です',
    isKnown: false,
    isNPlusOneTarget: false,
    frequencyRank: undefined,
    jlptLevel: undefined,
  });

  // The unknown lexical noun remains the N+1 target and keeps its metadata.
  assert.deepEqual(annotationSummary('感じ'), {
    surface: '感じ',
    headword: '感じ',
    isKnown: false,
    isNPlusOneTarget: true,
    frequencyRank: 240,
    jlptLevel: 'N4',
  });
});
test('tokenizeSubtitle clears annotations for ことに while preserving lexical N+1 target', async () => {
const result = await tokenizeSubtitle(
'さっきの俺と違うことに気付かないのかい?',
@@ -4446,6 +4651,114 @@ test('tokenizeSubtitle clears annotations for ことに while preserving lexical
);
});
test('tokenizeSubtitle clears annotations for auxiliary inflection fragments while preserving lexical N+1 target', async () => {
  // Yomitan reports 'れた' as one token; the MeCab stub splits it into
  // れ (動詞/接尾) followed by た (助動詞).
  const tokenized = await tokenizeSubtitle(
    '私れた猫',
    makeDepsFromYomitanTokens(
      [
        { surface: '私', reading: 'わたし', headword: '私' },
        { surface: 'れた', reading: 'れた', headword: 'れる' },
        { surface: '猫', reading: 'ねこ', headword: '猫' },
      ],
      {
        getFrequencyDictionaryEnabled: () => true,
        getFrequencyRank: (text) => {
          switch (text) {
            case '私':
              return 50;
            case 'れる':
              return 18;
            case '猫':
              return 900;
            default:
              return null;
          }
        },
        getJlptLevel: (text) => {
          switch (text) {
            case '私':
              return 'N5';
            case 'れる':
              return 'N4';
            case '猫':
              return 'N5';
            default:
              return null;
          }
        },
        isKnownWord: (text) => text === '私' || text === 'れる',
        getMinSentenceWordsForNPlusOne: () => 1,
        tokenizeWithMecab: async () => [
          {
            headword: '私',
            surface: '私',
            reading: 'ワタシ',
            startPos: 0,
            endPos: 1,
            partOfSpeech: PartOfSpeech.noun,
            pos1: '名詞',
            pos2: '代名詞',
            isMerged: false,
            isKnown: false,
            isNPlusOneTarget: false,
          },
          {
            headword: 'れる',
            surface: 'れ',
            reading: 'レ',
            startPos: 1,
            endPos: 2,
            partOfSpeech: PartOfSpeech.verb,
            pos1: '動詞',
            pos2: '接尾',
            isMerged: false,
            isKnown: false,
            isNPlusOneTarget: false,
          },
          {
            headword: 'た',
            surface: 'た',
            reading: 'タ',
            startPos: 2,
            endPos: 3,
            partOfSpeech: PartOfSpeech.bound_auxiliary,
            pos1: '助動詞',
            pos2: '*',
            isMerged: false,
            isKnown: false,
            isNPlusOneTarget: false,
          },
          {
            headword: '猫',
            surface: '猫',
            reading: 'ネコ',
            startPos: 3,
            endPos: 4,
            partOfSpeech: PartOfSpeech.noun,
            pos1: '名詞',
            pos2: '一般',
            isMerged: false,
            isKnown: false,
            isNPlusOneTarget: false,
          },
        ],
      },
    ),
  );

  // Projects the first output token with the given surface onto the annotation
  // fields under test; yields undefined when no such token exists.
  const annotationSummary = (surface: string) => {
    const match = tokenized.tokens?.find((token) => token.surface === surface);
    if (match === undefined) {
      return undefined;
    }
    return {
      surface: match.surface,
      headword: match.headword,
      isKnown: match.isKnown,
      isNPlusOneTarget: match.isNPlusOneTarget,
      frequencyRank: match.frequencyRank,
      jlptLevel: match.jlptLevel,
    };
  };

  // 'れる' is known and ranked in the stubs, but the fragment must stay unannotated.
  assert.deepEqual(annotationSummary('れた'), {
    surface: 'れた',
    headword: 'れる',
    isKnown: false,
    isNPlusOneTarget: false,
    frequencyRank: undefined,
    jlptLevel: undefined,
  });

  // The only unknown lexical word is still picked as the N+1 target.
  assert.deepEqual(annotationSummary('猫'), {
    surface: '猫',
    headword: '猫',
    isKnown: false,
    isNPlusOneTarget: true,
    frequencyRank: 900,
    jlptLevel: 'N5',
  });
});
test('tokenizeSubtitle excludes default non-independent pos2 from N+1 when JLPT/frequency are disabled', async () => {
let mecabCalls = 0;
const result = await tokenizeSubtitle(
@@ -259,6 +259,48 @@ test('shouldExcludeTokenFromSubtitleAnnotations excludes explanatory contrast en
assert.equal(shouldExcludeTokenFromSubtitleAnnotations(token), true);
});
test('shouldExcludeTokenFromSubtitleAnnotations excludes ja-nai explanatory endings', () => {
  // Covers both the bare form and the merged polite-question form.
  const jaNaiEndings = [
    makeToken({
      surface: 'じゃない',
      headword: 'じゃない',
      reading: 'ジャナイ',
      partOfSpeech: PartOfSpeech.i_adjective,
      pos1: '接続詞|形容詞',
      pos2: '*|自立',
    }),
    makeToken({
      surface: 'じゃないですか',
      headword: 'じゃない',
      reading: 'ジャナイデスカ',
      partOfSpeech: PartOfSpeech.i_adjective,
      pos1: '接続詞|形容詞|助動詞|助詞',
      pos2: '*|自立|*|副助詞/並立助詞/終助詞',
    }),
  ];

  // The surface is passed as the assertion message so a failure names the offending token.
  jaNaiEndings.forEach((ending) => {
    assert.equal(shouldExcludeTokenFromSubtitleAnnotations(ending), true, ending.surface);
  });
});
test('shouldExcludeTokenFromSubtitleAnnotations excludes standalone polite copula suffix endings without POS tags', () => {
  // pos1/pos2 are empty strings here, so the token carries no POS tags.
  const untaggedCopulaEndings = [
    makeToken({
      surface: 'ですよ',
      headword: 'です',
      reading: 'デスヨ',
      partOfSpeech: PartOfSpeech.other,
      pos1: '',
      pos2: '',
    }),
  ];

  // The surface is passed as the assertion message so a failure names the offending token.
  untaggedCopulaEndings.forEach((ending) => {
    assert.equal(shouldExcludeTokenFromSubtitleAnnotations(ending), true, ending.surface);
  });
});
test('shouldExcludeTokenFromSubtitleAnnotations excludes auxiliary-stem そうだ grammar tails', () => {
const token = makeToken({
surface: 'そうだ',
@@ -1286,6 +1328,78 @@ test('annotateTokens clears all annotations for kana-only non-independent noun h
assert.equal(result[0]?.jlptLevel, undefined);
});
test('annotateTokens clears all annotations for standalone auxiliary inflection fragments', () => {
  // Two 接尾 verb fragments: the plain form and one with a trailing 助動詞 part.
  // Both arrive with a frequency rank already set.
  const fragments = [
    makeToken({
      surface: 'れる',
      headword: 'れる',
      reading: 'レル',
      partOfSpeech: PartOfSpeech.verb,
      pos1: '動詞',
      pos2: '接尾',
      startPos: 0,
      endPos: 2,
      frequencyRank: 18,
    }),
    makeToken({
      surface: 'れた',
      headword: 'れる',
      reading: 'レタ',
      partOfSpeech: PartOfSpeech.verb,
      pos1: '動詞|助動詞',
      pos2: '接尾|*',
      startPos: 2,
      endPos: 4,
      frequencyRank: 19,
    }),
  ];

  const annotated = annotateTokens(
    fragments,
    makeDeps({
      isKnownWord: (text) => text === 'れる',
      getJlptLevel: (text) => {
        if (text === 'れる') {
          return 'N4';
        }
        return null;
      },
    }),
    { minSentenceWordsForNPlusOne: 1 },
  );

  // Every annotation field must be cleared, even though the deps would supply values.
  annotated.forEach((fragment) => {
    assert.equal(fragment.isKnown, false, fragment.surface);
    assert.equal(fragment.isNPlusOneTarget, false, fragment.surface);
    assert.equal(fragment.frequencyRank, undefined, fragment.surface);
    assert.equal(fragment.jlptLevel, undefined, fragment.surface);
  });
});
test('annotateTokens keeps lexical くれる forms eligible for annotation', () => {
  // An independent (自立) verb whose surface is the kana-only stem 'くれ'.
  const kureStem = makeToken({
    surface: 'くれ',
    headword: 'くれる',
    reading: 'クレ',
    partOfSpeech: PartOfSpeech.verb,
    pos1: '動詞',
    pos2: '自立',
    startPos: 0,
    endPos: 2,
    frequencyRank: 20,
  });

  const [annotated] = annotateTokens(
    [kureStem],
    makeDeps({
      getJlptLevel: (text) => {
        if (text === 'くれる') {
          return 'N4';
        }
        return null;
      },
    }),
    { minSentenceWordsForNPlusOne: 1 },
  );

  // The frequency rank and JLPT level must survive annotation.
  assert.equal(annotated?.isKnown, false);
  assert.equal(annotated?.isNPlusOneTarget, false);
  assert.equal(annotated?.frequencyRank, 20);
  assert.equal(annotated?.jlptLevel, 'N4');
});
test('annotateTokens clears all annotations for standalone して helper fragments', () => {
const tokens = [
makeToken({
@@ -84,6 +84,24 @@ const SUBTITLE_ANNOTATION_EXCLUDED_EXPLANATORY_ENDING_THOUGHT_SUFFIXES = [
'かな',
'かね',
] as const;
/**
 * Particles that are appended to `です` to build
 * SUBTITLE_ANNOTATION_EXCLUDED_POLITE_COPULA_ENDINGS (ですか, ですね, ですよ, ですな).
 */
const SUBTITLE_ANNOTATION_EXCLUDED_POLITE_COPULA_SUFFIXES = ['か', 'ね', 'よ', 'な'] as const;
/**
 * Tails that are appended to `じゃない` to build
 * SUBTITLE_ANNOTATION_EXCLUDED_JA_NAI_ENDINGS. The empty string keeps the bare
 * `じゃない` form in the resulting set.
 */
const SUBTITLE_ANNOTATION_EXCLUDED_JA_NAI_SUFFIXES = [
  // Bare form and plain sentence-final particles.
  '', 'か', 'ね', 'よ', 'な',
  // Polite copula, alone and with the same particles.
  'です', 'ですか', 'ですよ', 'ですね', 'ですな',
] as const;
const SUBTITLE_ANNOTATION_EXCLUDED_EXPLANATORY_ENDINGS = new Set(
SUBTITLE_ANNOTATION_EXCLUDED_EXPLANATORY_ENDING_PREFIXES.flatMap((prefix) =>
SUBTITLE_ANNOTATION_EXCLUDED_EXPLANATORY_ENDING_CORES.flatMap((core) =>
@@ -93,6 +111,12 @@ const SUBTITLE_ANNOTATION_EXCLUDED_EXPLANATORY_ENDINGS = new Set(
),
),
);
/** Full `です` + suffix surfaces, one per entry in the polite-copula suffix list. */
const SUBTITLE_ANNOTATION_EXCLUDED_POLITE_COPULA_ENDINGS = new Set<string>(
  Array.from(
    SUBTITLE_ANNOTATION_EXCLUDED_POLITE_COPULA_SUFFIXES,
    (particle) => 'です' + particle,
  ),
);
/** Full `じゃない` + suffix surfaces, one per entry in the ja-nai suffix list. */
const SUBTITLE_ANNOTATION_EXCLUDED_JA_NAI_ENDINGS = new Set<string>(
  Array.from(SUBTITLE_ANNOTATION_EXCLUDED_JA_NAI_SUFFIXES, (tail) => 'じゃない' + tail),
);
const SUBTITLE_ANNOTATION_EXCLUDED_TRAILING_PARTICLE_SUFFIXES = new Set([
'って',
'ってよ',
@@ -104,6 +128,7 @@ const SUBTITLE_ANNOTATION_EXCLUDED_TRAILING_PARTICLE_SUFFIXES = new Set([
]);
// pos1 tag set; presumably the tags allowed in an auxiliary-stem grammar tail
// (its consumer is not visible in this view; verify against the caller).
const AUXILIARY_STEM_GRAMMAR_TAIL_POS1 = new Set<string>(['名詞', '助動詞', '助詞']);

// pos1 tag set; presumably the tags allowed after a non-independent noun helper
// (its consumer is not visible in this view; verify against the caller).
const NON_INDEPENDENT_NOUN_HELPER_TAIL_POS1 = new Set<string>(['助詞', '助動詞']);

// pos1 tags permitted after the leading 動詞 part when
// isStandaloneAuxiliaryInflectionFragment inspects a merged token.
const AUXILIARY_INFLECTION_TRAILING_POS1 = new Set<string>(['助動詞']);
const STANDALONE_GRAMMAR_PARTICLE_SURFACES = new Set([
'か',
'が',
@@ -333,6 +358,44 @@ function isKanaOnlyText(text: string): boolean {
return normalized.length > 0 && [...normalized].every(isKanaChar);
}
/**
 * Recognises the lexical verb stem `くれ` (headword `くれる`).
 *
 * A token qualifies only when, after normalisation, its surface is `くれ`, its
 * headword is `くれる`, and its POS tags consist of exactly one pos1 part
 * (`動詞`) and exactly one pos2 part (`自立`).
 *
 * @param token - Merged token to inspect.
 * @returns `true` when every condition above holds, otherwise `false`.
 */
function isLexicalKureruVerb(token: MergedToken): boolean {
  const surface = normalizeKana(token.surface);
  const headword = normalizeKana(token.headword);
  const pos1Tags = splitNormalizedTagParts(normalizePosTag(token.pos1));
  const pos2Tags = splitNormalizedTagParts(normalizePosTag(token.pos2));

  if (surface !== 'くれ' || headword !== 'くれる') {
    return false;
  }

  // Merged tokens carry several tag parts; only a single plain verb part counts.
  if (pos1Tags.length !== 1 || pos1Tags[0] !== '動詞') {
    return false;
  }

  return pos2Tags.length === 1 && pos2Tags[0] === '自立';
}
/**
 * Detects kana-only tokens that consist purely of auxiliary inflection material.
 *
 * A token matches when its normalised surface is kana-only, it has at least one
 * pos1 part, and either:
 *  - every pos1 part is `助動詞`, or
 *  - the first pos1 part is `動詞`, the first pos2 part is `接尾`, and all
 *    remaining pos1 parts are in AUXILIARY_INFLECTION_TRAILING_POS1.
 *
 * @param token - Merged token to inspect.
 * @returns `true` when the token matches one of the two shapes above.
 */
function isStandaloneAuxiliaryInflectionFragment(token: MergedToken): boolean {
  const surface = normalizeKana(token.surface);
  if (!isKanaOnlyText(surface)) {
    return false;
  }

  const pos1Tags = splitNormalizedTagParts(normalizePosTag(token.pos1));
  // Without any pos1 part there is nothing to classify.
  if (pos1Tags.length === 0) {
    return false;
  }

  const isAuxiliaryOnly = !pos1Tags.some((tag) => tag !== '助動詞');
  if (isAuxiliaryOnly) {
    return true;
  }

  const pos2Tags = splitNormalizedTagParts(normalizePosTag(token.pos2));
  const [leadingPos1, ...trailingPos1] = pos1Tags;
  if (leadingPos1 !== '動詞' || pos2Tags[0] !== '接尾') {
    return false;
  }

  // A suffix verb may only be followed by the permitted trailing tags.
  for (const tag of trailingPos1) {
    if (!AUXILIARY_INFLECTION_TRAILING_POS1.has(tag)) {
      return false;
    }
  }
  return true;
}
function isStandaloneSuruTeGrammarHelper(token: MergedToken): boolean {
const normalizedSurface = normalizeKana(token.surface);
const normalizedHeadword = normalizeKana(token.headword);
@@ -391,6 +454,10 @@ function isExcludedByTerm(token: MergedToken): boolean {
SUBTITLE_ANNOTATION_EXCLUDED_TERMS.has(normalized) ||
SUBTITLE_ANNOTATION_EXCLUDED_EXPLANATORY_ENDINGS.has(trimmed) ||
SUBTITLE_ANNOTATION_EXCLUDED_EXPLANATORY_ENDINGS.has(normalized) ||
SUBTITLE_ANNOTATION_EXCLUDED_POLITE_COPULA_ENDINGS.has(trimmed) ||
SUBTITLE_ANNOTATION_EXCLUDED_POLITE_COPULA_ENDINGS.has(normalized) ||
SUBTITLE_ANNOTATION_EXCLUDED_JA_NAI_ENDINGS.has(trimmed) ||
SUBTITLE_ANNOTATION_EXCLUDED_JA_NAI_ENDINGS.has(normalized) ||
shouldIgnoreJlptByTerm(trimmed) ||
shouldIgnoreJlptByTerm(normalized)
) {
@@ -447,6 +514,10 @@ export function shouldExcludeTokenFromSubtitleAnnotations(
return true;
}
if (isStandaloneAuxiliaryInflectionFragment(token)) {
return true;
}
if (isStandaloneSuruTeGrammarHelper(token)) {
return true;
}
@@ -463,6 +534,10 @@ export function shouldExcludeTokenFromSubtitleAnnotations(
return true;
}
if (isLexicalKureruVerb(token)) {
return false;
}
return isExcludedByTerm(token);
}