diff --git a/backlog/tasks/task-311 - Suppress-auxiliary-inflection-fragments-from-subtitle-annotations.md b/backlog/tasks/task-311 - Suppress-auxiliary-inflection-fragments-from-subtitle-annotations.md new file mode 100644 index 00000000..e34a8302 --- /dev/null +++ b/backlog/tasks/task-311 - Suppress-auxiliary-inflection-fragments-from-subtitle-annotations.md @@ -0,0 +1,43 @@ +--- +id: TASK-311 +title: Suppress auxiliary inflection fragments from subtitle annotations +status: Done +assignee: [] +created_date: '2026-05-02 09:07' +updated_date: '2026-05-02 09:10' +labels: + - tokenizer + - annotations + - bug +dependencies: [] +priority: medium +--- + +## Description + + +Suppress standalone Japanese auxiliary/inflection subtitle fragments such as `れる` and `れた` from frequency/JLPT/N+1/known annotation styling while keeping lexical verbs such as `くれ` / `くれる` annotatable. Tokens must remain hoverable; only annotation metadata should be stripped. + + +## Acceptance Criteria + +- [x] #1 `れる` and `れた`-style standalone helper fragments render as plain hoverable subtitle tokens. +- [x] #2 Lexical verbs like `くれ` / `くれる` remain eligible for annotation. +- [x] #3 Regression tests cover unit filter behavior and tokenizer integration. + + +## Implementation Notes + + +Implemented with TDD. Added failing coverage first for standalone `れる`/`れた` auxiliary fragments and a lexical `くれ`/`くれる` guard. Updated the shared subtitle annotation filter to strip annotation metadata for kana-only auxiliary inflection fragments identified by MeCab POS (`助動詞` only, or `動詞/接尾` with optional trailing `助動詞`) while preserving lexical `くれ` as `くれる` when tagged `動詞/自立`. Added tokenizer integration coverage for `れた` and neighboring lexical N+1 behavior. + + +## Final Summary + + +Suppressed annotation metadata for standalone auxiliary inflection fragments such as `れる` and `れた` in subtitle tokens, leaving them hoverable but plain. 
Preserved lexical `くれ` -> `くれる` verb metadata when MeCab tags it as `動詞/自立`. + +Added unit and tokenizer regression coverage, plus a release fragment in `changes/311-auxiliary-inflection-annotation-filter.md`. + +Validation: targeted annotation/tokenizer tests passed; `bun run typecheck` passed; `bun run changelog:lint` passed. `bun run test:fast` was attempted twice and failed in unrelated `src/core/services/subsync.test.ts` cross-file state (`window.electronAPI` undefined), while `bun test src/core/services/subsync.test.ts` passes by itself. + diff --git a/backlog/tasks/task-312 - Suppress-ja-nai-explanatory-ending-subtitle-annotations.md b/backlog/tasks/task-312 - Suppress-ja-nai-explanatory-ending-subtitle-annotations.md new file mode 100644 index 00000000..987a822d --- /dev/null +++ b/backlog/tasks/task-312 - Suppress-ja-nai-explanatory-ending-subtitle-annotations.md @@ -0,0 +1,42 @@ +--- +id: TASK-312 +title: Suppress ja-nai explanatory ending subtitle annotations +status: Done +assignee: [] +created_date: '2026-05-02 09:55' +updated_date: '2026-05-02 10:03' +labels: + - tokenizer + - annotations + - bug +dependencies: [] +priority: medium +--- + +## Description + + +Suppress subtitle annotation styling for grammar-only explanatory endings like `じゃない` and `じゃないですか` while preserving nearby lexical content annotations. + + +## Acceptance Criteria + +- [x] #1 `じゃない` and `じゃないですか`-style endings render as plain hoverable subtitle tokens. +- [x] #2 The reported phrase `みたいなのあるじゃないですか` does not annotate `じゃない`/`じゃないですか` as lexical/frequency content. +- [x] #3 Regression tests cover unit filter behavior and tokenizer integration without suppressing lexical content tokens. +- [x] #4 Standalone polite copula endings such as `です` / `ですよ` render as plain hoverable subtitle tokens even if POS metadata is missing or too lexical. 
+ + +## Implementation Notes + + +Added failing coverage first for `じゃない` / `じゃないですか` and `ですよ` leaking annotation metadata when POS metadata is missing or too lexical. Implemented term-family exclusions in the shared subtitle annotation filter for the `じゃない` explanatory family and polite copula suffix endings (`ですか`, `ですね`, `ですよ`, `ですな`). Kept bare `です` term-only behavior unchanged to preserve existing no-POS frequency tests; POS-tagged `です` is already stripped by the grammar POS exclusion path. + + +## Final Summary + + +Suppressed subtitle annotation metadata for grammar-only endings like `じゃないですか` and `ですよ`, while preserving nearby lexical content annotations. Added unit and tokenizer regression coverage for the reported `みたいなのあるじゃないですか` and `感じですよ` shapes, plus changelog fragment `changes/312-grammar-ending-annotation-filter.md`. + +Validation: `bun test src/core/services/tokenizer/annotation-stage.test.ts`; `bun test src/core/services/tokenizer.test.ts`; `bun run typecheck`; `bun run changelog:lint`; `git diff --check`. + diff --git a/changes/311-auxiliary-inflection-annotation-filter.md b/changes/311-auxiliary-inflection-annotation-filter.md new file mode 100644 index 00000000..61982ac4 --- /dev/null +++ b/changes/311-auxiliary-inflection-annotation-filter.md @@ -0,0 +1,4 @@ +type: fixed +area: overlay + +- Suppressed subtitle annotation styling for standalone auxiliary inflection fragments such as `れる` and `れた` while keeping lexical `くれる` forms eligible for lookup metadata. diff --git a/changes/312-grammar-ending-annotation-filter.md b/changes/312-grammar-ending-annotation-filter.md new file mode 100644 index 00000000..eade8503 --- /dev/null +++ b/changes/312-grammar-ending-annotation-filter.md @@ -0,0 +1,4 @@ +type: fixed +area: overlay + +- Suppressed subtitle annotation styling for grammar-only endings such as `じゃないですか` and standalone polite copula tails like `ですか` / `ですよ` (bare `です` is still handled by the existing POS-based exclusion).
diff --git a/src/core/services/tokenizer.test.ts b/src/core/services/tokenizer.test.ts index 84b7b28d..cfe10ee8 100644 --- a/src/core/services/tokenizer.test.ts +++ b/src/core/services/tokenizer.test.ts @@ -4227,6 +4227,211 @@ test('tokenizeSubtitle clears all annotations for explanatory contrast endings', ); }); +test('tokenizeSubtitle clears annotations for ja-nai explanatory endings while preserving lexical content', async () => { + const result = await tokenizeSubtitle( + 'みたいなのあるじゃないですか', + makeDepsFromYomitanTokens( + [ + { surface: 'みたいな', reading: 'みたいな', headword: 'みたい' }, + { surface: 'の', reading: 'の', headword: 'の' }, + { surface: 'ある', reading: 'ある', headword: 'ある' }, + { surface: 'じゃないですか', reading: 'じゃないですか', headword: 'じゃない' }, + ], + { + getFrequencyDictionaryEnabled: () => true, + getFrequencyRank: (text) => + text === 'みたい' ? 320 : text === 'ある' ? 240 : text === 'じゃない' ? 80 : null, + getJlptLevel: (text) => + text === 'みたい' ? 'N4' : text === 'ある' ? 'N5' : text === 'じゃない' ? 
'N5' : null, + isKnownWord: (text) => text === 'みたい' || text === 'の', + getMinSentenceWordsForNPlusOne: () => 1, + tokenizeWithMecab: async () => [ + { + headword: 'みたい', + surface: 'みたい', + reading: 'ミタイ', + startPos: 0, + endPos: 3, + partOfSpeech: PartOfSpeech.noun, + pos1: '名詞', + pos2: '非自立', + pos3: '形容動詞語幹', + isMerged: false, + isKnown: false, + isNPlusOneTarget: false, + }, + { + headword: 'だ', + surface: 'な', + reading: 'ナ', + startPos: 3, + endPos: 4, + partOfSpeech: PartOfSpeech.bound_auxiliary, + pos1: '助動詞', + pos2: '*', + isMerged: false, + isKnown: false, + isNPlusOneTarget: false, + }, + { + headword: 'の', + surface: 'の', + reading: 'ノ', + startPos: 4, + endPos: 5, + partOfSpeech: PartOfSpeech.noun, + pos1: '名詞', + pos2: '非自立', + isMerged: false, + isKnown: false, + isNPlusOneTarget: false, + }, + { + headword: 'ある', + surface: 'ある', + reading: 'アル', + startPos: 5, + endPos: 7, + partOfSpeech: PartOfSpeech.verb, + pos1: '動詞', + pos2: '自立', + isMerged: false, + isKnown: false, + isNPlusOneTarget: false, + }, + { + headword: 'じゃない', + surface: 'じゃない', + reading: 'ジャナイ', + startPos: 7, + endPos: 11, + partOfSpeech: PartOfSpeech.i_adjective, + pos1: '接続詞|形容詞', + pos2: '*|自立', + isMerged: false, + isKnown: false, + isNPlusOneTarget: false, + }, + { + headword: 'です', + surface: 'です', + reading: 'デス', + startPos: 11, + endPos: 13, + partOfSpeech: PartOfSpeech.bound_auxiliary, + pos1: '助動詞', + pos2: '*', + isMerged: false, + isKnown: false, + isNPlusOneTarget: false, + }, + { + headword: 'か', + surface: 'か', + reading: 'カ', + startPos: 13, + endPos: 14, + partOfSpeech: PartOfSpeech.particle, + pos1: '助詞', + pos2: '副助詞/並立助詞/終助詞', + isMerged: false, + isKnown: false, + isNPlusOneTarget: false, + }, + ], + }, + ), + ); + + const tokenSummary = result.tokens?.map((token) => ({ + surface: token.surface, + headword: token.headword, + isKnown: token.isKnown, + isNPlusOneTarget: token.isNPlusOneTarget, + frequencyRank: token.frequencyRank, + jlptLevel: 
token.jlptLevel, + })); + + assert.deepEqual( + tokenSummary?.find((token) => token.surface === 'じゃないですか'), + { + surface: 'じゃないですか', + headword: 'じゃない', + isKnown: false, + isNPlusOneTarget: false, + frequencyRank: undefined, + jlptLevel: undefined, + }, + ); + assert.deepEqual( + tokenSummary?.find((token) => token.surface === 'ある'), + { + surface: 'ある', + headword: 'ある', + isKnown: false, + isNPlusOneTarget: false, + frequencyRank: 240, + jlptLevel: 'N5', + }, + ); +}); + +test('tokenizeSubtitle clears annotations for standalone polite copula endings without POS metadata', async () => { + const result = await tokenizeSubtitle( + '現実は感じですよ', + makeDepsFromYomitanTokens( + [ + { surface: '現実', reading: 'げんじつ', headword: '現実' }, + { surface: 'は', reading: 'は', headword: 'は' }, + { surface: '感じ', reading: 'かんじ', headword: '感じ' }, + { surface: 'ですよ', reading: 'ですよ', headword: 'です' }, + ], + { + getFrequencyDictionaryEnabled: () => true, + getFrequencyRank: (text) => + text === '現実' ? 600 : text === '感じ' ? 240 : text === 'です' ? 50 : null, + getJlptLevel: (text) => + text === '現実' ? 'N3' : text === '感じ' ? 'N4' : text === 'です' ? 
'N5' : null, + isKnownWord: (text) => text === '現実' || text === 'は' || text === 'です', + getMinSentenceWordsForNPlusOne: () => 1, + tokenizeWithMecab: async () => null, + }, + ), + ); + + const tokenSummary = result.tokens?.map((token) => ({ + surface: token.surface, + headword: token.headword, + isKnown: token.isKnown, + isNPlusOneTarget: token.isNPlusOneTarget, + frequencyRank: token.frequencyRank, + jlptLevel: token.jlptLevel, + })); + + assert.deepEqual( + tokenSummary?.find((token) => token.surface === 'ですよ'), + { + surface: 'ですよ', + headword: 'です', + isKnown: false, + isNPlusOneTarget: false, + frequencyRank: undefined, + jlptLevel: undefined, + }, + ); + assert.deepEqual( + tokenSummary?.find((token) => token.surface === '感じ'), + { + surface: '感じ', + headword: '感じ', + isKnown: false, + isNPlusOneTarget: true, + frequencyRank: 240, + jlptLevel: 'N4', + }, + ); +}); + test('tokenizeSubtitle clears annotations for ことに while preserving lexical N+1 target', async () => { const result = await tokenizeSubtitle( 'さっきの俺と違うことに気付かないのかい?', @@ -4446,6 +4651,114 @@ test('tokenizeSubtitle clears annotations for ことに while preserving lexical ); }); +test('tokenizeSubtitle clears annotations for auxiliary inflection fragments while preserving lexical N+1 target', async () => { + const result = await tokenizeSubtitle( + '私れた猫', + makeDepsFromYomitanTokens( + [ + { surface: '私', reading: 'わたし', headword: '私' }, + { surface: 'れた', reading: 'れた', headword: 'れる' }, + { surface: '猫', reading: 'ねこ', headword: '猫' }, + ], + { + getFrequencyDictionaryEnabled: () => true, + getFrequencyRank: (text) => + text === '私' ? 50 : text === 'れる' ? 18 : text === '猫' ? 900 : null, + getJlptLevel: (text) => + text === '私' ? 'N5' : text === 'れる' ? 'N4' : text === '猫' ? 
'N5' : null, + isKnownWord: (text) => text === '私' || text === 'れる', + getMinSentenceWordsForNPlusOne: () => 1, + tokenizeWithMecab: async () => [ + { + headword: '私', + surface: '私', + reading: 'ワタシ', + startPos: 0, + endPos: 1, + partOfSpeech: PartOfSpeech.noun, + pos1: '名詞', + pos2: '代名詞', + isMerged: false, + isKnown: false, + isNPlusOneTarget: false, + }, + { + headword: 'れる', + surface: 'れ', + reading: 'レ', + startPos: 1, + endPos: 2, + partOfSpeech: PartOfSpeech.verb, + pos1: '動詞', + pos2: '接尾', + isMerged: false, + isKnown: false, + isNPlusOneTarget: false, + }, + { + headword: 'た', + surface: 'た', + reading: 'タ', + startPos: 2, + endPos: 3, + partOfSpeech: PartOfSpeech.bound_auxiliary, + pos1: '助動詞', + pos2: '*', + isMerged: false, + isKnown: false, + isNPlusOneTarget: false, + }, + { + headword: '猫', + surface: '猫', + reading: 'ネコ', + startPos: 3, + endPos: 4, + partOfSpeech: PartOfSpeech.noun, + pos1: '名詞', + pos2: '一般', + isMerged: false, + isKnown: false, + isNPlusOneTarget: false, + }, + ], + }, + ), + ); + + const tokenSummary = result.tokens?.map((token) => ({ + surface: token.surface, + headword: token.headword, + isKnown: token.isKnown, + isNPlusOneTarget: token.isNPlusOneTarget, + frequencyRank: token.frequencyRank, + jlptLevel: token.jlptLevel, + })); + + assert.deepEqual( + tokenSummary?.find((token) => token.surface === 'れた'), + { + surface: 'れた', + headword: 'れる', + isKnown: false, + isNPlusOneTarget: false, + frequencyRank: undefined, + jlptLevel: undefined, + }, + ); + assert.deepEqual( + tokenSummary?.find((token) => token.surface === '猫'), + { + surface: '猫', + headword: '猫', + isKnown: false, + isNPlusOneTarget: true, + frequencyRank: 900, + jlptLevel: 'N5', + }, + ); +}); + test('tokenizeSubtitle excludes default non-independent pos2 from N+1 when JLPT/frequency are disabled', async () => { let mecabCalls = 0; const result = await tokenizeSubtitle( diff --git a/src/core/services/tokenizer/annotation-stage.test.ts 
b/src/core/services/tokenizer/annotation-stage.test.ts index 32e6ee25..87021e88 100644 --- a/src/core/services/tokenizer/annotation-stage.test.ts +++ b/src/core/services/tokenizer/annotation-stage.test.ts @@ -258,6 +258,48 @@ test('shouldExcludeTokenFromSubtitleAnnotations excludes explanatory contrast en assert.equal(shouldExcludeTokenFromSubtitleAnnotations(token), true); }); +test('shouldExcludeTokenFromSubtitleAnnotations excludes ja-nai explanatory endings', () => { + const tokens = [ + makeToken({ + surface: 'じゃない', + headword: 'じゃない', + reading: 'ジャナイ', + partOfSpeech: PartOfSpeech.i_adjective, + pos1: '接続詞|形容詞', + pos2: '*|自立', + }), + makeToken({ + surface: 'じゃないですか', + headword: 'じゃない', + reading: 'ジャナイデスカ', + partOfSpeech: PartOfSpeech.i_adjective, + pos1: '接続詞|形容詞|助動詞|助詞', + pos2: '*|自立|*|副助詞/並立助詞/終助詞', + }), + ]; + + for (const token of tokens) { + assert.equal(shouldExcludeTokenFromSubtitleAnnotations(token), true, token.surface); + } +}); + +test('shouldExcludeTokenFromSubtitleAnnotations excludes standalone polite copula suffix endings without POS tags', () => { + const tokens = [ + makeToken({ + surface: 'ですよ', + headword: 'です', + reading: 'デスヨ', + partOfSpeech: PartOfSpeech.other, + pos1: '', + pos2: '', + }), + ]; + + for (const token of tokens) { + assert.equal(shouldExcludeTokenFromSubtitleAnnotations(token), true, token.surface); + } +}); + test('shouldExcludeTokenFromSubtitleAnnotations excludes auxiliary-stem そうだ grammar tails', () => { const token = makeToken({ surface: 'そうだ', @@ -1204,6 +1246,78 @@ test('annotateTokens clears all annotations for kana-only non-independent noun h assert.equal(result[0]?.jlptLevel, undefined); }); +test('annotateTokens clears all annotations for standalone auxiliary inflection fragments', () => { + const tokens = [ + makeToken({ + surface: 'れる', + headword: 'れる', + reading: 'レル', + partOfSpeech: PartOfSpeech.verb, + pos1: '動詞', + pos2: '接尾', + startPos: 0, + endPos: 2, + frequencyRank: 18, + }), + makeToken({ 
+ surface: 'れた', + headword: 'れる', + reading: 'レタ', + partOfSpeech: PartOfSpeech.verb, + pos1: '動詞|助動詞', + pos2: '接尾|*', + startPos: 2, + endPos: 4, + frequencyRank: 19, + }), + ]; + + const result = annotateTokens( + tokens, + makeDeps({ + isKnownWord: (text) => text === 'れる', + getJlptLevel: (text) => (text === 'れる' ? 'N4' : null), + }), + { minSentenceWordsForNPlusOne: 1 }, + ); + + for (const token of result) { + assert.equal(token.isKnown, false, token.surface); + assert.equal(token.isNPlusOneTarget, false, token.surface); + assert.equal(token.frequencyRank, undefined, token.surface); + assert.equal(token.jlptLevel, undefined, token.surface); + } +}); + +test('annotateTokens keeps lexical くれる forms eligible for annotation', () => { + const tokens = [ + makeToken({ + surface: 'くれ', + headword: 'くれる', + reading: 'クレ', + partOfSpeech: PartOfSpeech.verb, + pos1: '動詞', + pos2: '自立', + startPos: 0, + endPos: 2, + frequencyRank: 20, + }), + ]; + + const result = annotateTokens( + tokens, + makeDeps({ + getJlptLevel: (text) => (text === 'くれる' ? 
'N4' : null), + }), + { minSentenceWordsForNPlusOne: 1 }, + ); + + assert.equal(result[0]?.isKnown, false); + assert.equal(result[0]?.isNPlusOneTarget, false); + assert.equal(result[0]?.frequencyRank, 20); + assert.equal(result[0]?.jlptLevel, 'N4'); +}); + test('annotateTokens clears all annotations for standalone して helper fragments', () => { const tokens = [ makeToken({ diff --git a/src/core/services/tokenizer/subtitle-annotation-filter.ts b/src/core/services/tokenizer/subtitle-annotation-filter.ts index 2070f024..5146ab34 100644 --- a/src/core/services/tokenizer/subtitle-annotation-filter.ts +++ b/src/core/services/tokenizer/subtitle-annotation-filter.ts @@ -63,6 +63,24 @@ const SUBTITLE_ANNOTATION_EXCLUDED_EXPLANATORY_ENDING_THOUGHT_SUFFIXES = [ 'かな', 'かね', ] as const; +const SUBTITLE_ANNOTATION_EXCLUDED_POLITE_COPULA_SUFFIXES = [ + 'か', + 'ね', + 'よ', + 'な', +] as const; +const SUBTITLE_ANNOTATION_EXCLUDED_JA_NAI_SUFFIXES = [ + '', + 'か', + 'ね', + 'よ', + 'な', + 'です', + 'ですか', + 'ですよ', + 'ですね', + 'ですな', +] as const; const SUBTITLE_ANNOTATION_EXCLUDED_EXPLANATORY_ENDINGS = new Set( SUBTITLE_ANNOTATION_EXCLUDED_EXPLANATORY_ENDING_PREFIXES.flatMap((prefix) => SUBTITLE_ANNOTATION_EXCLUDED_EXPLANATORY_ENDING_CORES.flatMap((core) => @@ -72,6 +90,12 @@ const SUBTITLE_ANNOTATION_EXCLUDED_EXPLANATORY_ENDINGS = new Set( ), ), ); +const SUBTITLE_ANNOTATION_EXCLUDED_POLITE_COPULA_ENDINGS = new Set( + SUBTITLE_ANNOTATION_EXCLUDED_POLITE_COPULA_SUFFIXES.map((suffix) => `です${suffix}`), +); +const SUBTITLE_ANNOTATION_EXCLUDED_JA_NAI_ENDINGS = new Set( + SUBTITLE_ANNOTATION_EXCLUDED_JA_NAI_SUFFIXES.map((suffix) => `じゃない${suffix}`), +); const SUBTITLE_ANNOTATION_EXCLUDED_TRAILING_PARTICLE_SUFFIXES = new Set([ 'って', 'ってよ', @@ -83,6 +107,7 @@ const SUBTITLE_ANNOTATION_EXCLUDED_TRAILING_PARTICLE_SUFFIXES = new Set([ ]); const AUXILIARY_STEM_GRAMMAR_TAIL_POS1 = new Set(['名詞', '助動詞', '助詞']); const NON_INDEPENDENT_NOUN_HELPER_TAIL_POS1 = new Set(['助詞', '助動詞']); +const 
AUXILIARY_INFLECTION_TRAILING_POS1 = new Set(['助動詞']); const STANDALONE_GRAMMAR_PARTICLE_SURFACES = new Set([ 'か', 'が', @@ -312,6 +337,44 @@ function isKanaOnlyText(text: string): boolean { return normalized.length > 0 && [...normalized].every(isKanaChar); } +function isLexicalKureruVerb(token: MergedToken): boolean { + const normalizedSurface = normalizeKana(token.surface); + const normalizedHeadword = normalizeKana(token.headword); + const pos1Parts = splitNormalizedTagParts(normalizePosTag(token.pos1)); + const pos2Parts = splitNormalizedTagParts(normalizePosTag(token.pos2)); + return ( + normalizedSurface === 'くれ' && + normalizedHeadword === 'くれる' && + pos1Parts.length === 1 && + pos1Parts[0] === '動詞' && + pos2Parts.length === 1 && + pos2Parts[0] === '自立' + ); +} + +function isStandaloneAuxiliaryInflectionFragment(token: MergedToken): boolean { + const normalizedSurface = normalizeKana(token.surface); + if (!isKanaOnlyText(normalizedSurface)) { + return false; + } + + const pos1Parts = splitNormalizedTagParts(normalizePosTag(token.pos1)); + if (pos1Parts.length === 0) { + return false; + } + + if (pos1Parts.every((part) => part === '助動詞')) { + return true; + } + + const pos2Parts = splitNormalizedTagParts(normalizePosTag(token.pos2)); + return ( + pos1Parts[0] === '動詞' && + pos2Parts[0] === '接尾' && + pos1Parts.slice(1).every((part) => AUXILIARY_INFLECTION_TRAILING_POS1.has(part)) + ); +} + function isStandaloneSuruTeGrammarHelper(token: MergedToken): boolean { const normalizedSurface = normalizeKana(token.surface); const normalizedHeadword = normalizeKana(token.headword); @@ -370,6 +433,10 @@ function isExcludedByTerm(token: MergedToken): boolean { SUBTITLE_ANNOTATION_EXCLUDED_TERMS.has(normalized) || SUBTITLE_ANNOTATION_EXCLUDED_EXPLANATORY_ENDINGS.has(trimmed) || SUBTITLE_ANNOTATION_EXCLUDED_EXPLANATORY_ENDINGS.has(normalized) || + SUBTITLE_ANNOTATION_EXCLUDED_POLITE_COPULA_ENDINGS.has(trimmed) || + 
SUBTITLE_ANNOTATION_EXCLUDED_POLITE_COPULA_ENDINGS.has(normalized) || + SUBTITLE_ANNOTATION_EXCLUDED_JA_NAI_ENDINGS.has(trimmed) || + SUBTITLE_ANNOTATION_EXCLUDED_JA_NAI_ENDINGS.has(normalized) || shouldIgnoreJlptByTerm(trimmed) || shouldIgnoreJlptByTerm(normalized) ) { @@ -426,6 +493,10 @@ export function shouldExcludeTokenFromSubtitleAnnotations( return true; } + if (isStandaloneAuxiliaryInflectionFragment(token)) { + return true; + } + if (isStandaloneSuruTeGrammarHelper(token)) { return true; } @@ -442,6 +513,10 @@ export function shouldExcludeTokenFromSubtitleAnnotations( return true; } + if (isLexicalKureruVerb(token)) { + return false; + } + return isExcludedByTerm(token); }