mirror of
https://github.com/ksyasuda/SubMiner.git
synced 2026-05-02 16:19:25 -07:00
Suppress subtitle annotations for grammar fragments
- Hide annotation metadata for auxiliary inflection and ja-nai endings
- Preserve lexical `くれる` forms and add regression coverage
This commit is contained in:
@@ -0,0 +1,43 @@
|
|||||||
|
---
|
||||||
|
id: TASK-311
|
||||||
|
title: Suppress auxiliary inflection fragments from subtitle annotations
|
||||||
|
status: Done
|
||||||
|
assignee: []
|
||||||
|
created_date: '2026-05-02 09:07'
|
||||||
|
updated_date: '2026-05-02 09:10'
|
||||||
|
labels:
|
||||||
|
- tokenizer
|
||||||
|
- annotations
|
||||||
|
- bug
|
||||||
|
dependencies: []
|
||||||
|
priority: medium
|
||||||
|
---
|
||||||
|
|
||||||
|
## Description
|
||||||
|
|
||||||
|
<!-- SECTION:DESCRIPTION:BEGIN -->
|
||||||
|
Suppress standalone Japanese auxiliary/inflection subtitle fragments such as `れる` and `れた` from frequency/JLPT/N+1/known annotation styling while keeping lexical verbs such as `くれ` / `くれる` annotatable. Tokens must remain hoverable; only annotation metadata should be stripped.
|
||||||
|
<!-- SECTION:DESCRIPTION:END -->
|
||||||
|
|
||||||
|
## Acceptance Criteria
|
||||||
|
<!-- AC:BEGIN -->
|
||||||
|
- [x] #1 `れる` and `れた`-style standalone helper fragments render as plain hoverable subtitle tokens.
|
||||||
|
- [x] #2 Lexical verbs like `くれ` / `くれる` remain eligible for annotation.
|
||||||
|
- [x] #3 Regression tests cover unit filter behavior and tokenizer integration.
|
||||||
|
<!-- AC:END -->
|
||||||
|
|
||||||
|
## Implementation Notes
|
||||||
|
|
||||||
|
<!-- SECTION:NOTES:BEGIN -->
|
||||||
|
Implemented with TDD. Added failing coverage first for standalone `れる`/`れた` auxiliary fragments and a lexical `くれ`/`くれる` guard. Updated the shared subtitle annotation filter to strip annotation metadata for kana-only auxiliary inflection fragments identified by MeCab POS (`助動詞` only, or `動詞/接尾` with optional trailing `助動詞`) while preserving lexical `くれ` as `くれる` when tagged `動詞/自立`. Added tokenizer integration coverage for `れた` and neighboring lexical N+1 behavior.
|
||||||
|
<!-- SECTION:NOTES:END -->
|
||||||
|
|
||||||
|
## Final Summary
|
||||||
|
|
||||||
|
<!-- SECTION:FINAL_SUMMARY:BEGIN -->
|
||||||
|
Suppressed annotation metadata for standalone auxiliary inflection fragments such as `れる` and `れた` in subtitle tokens, leaving them hoverable but plain. Preserved lexical `くれ` -> `くれる` verb metadata when MeCab tags it as `動詞/自立`.
|
||||||
|
|
||||||
|
Added unit and tokenizer regression coverage, plus a release fragment in `changes/311-auxiliary-inflection-annotation-filter.md`.
|
||||||
|
|
||||||
|
Validation: targeted annotation/tokenizer tests passed; `bun run typecheck` passed; `bun run changelog:lint` passed. `bun run test:fast` was attempted twice and failed in unrelated `src/core/services/subsync.test.ts` cross-file state (`window.electronAPI` undefined), while `bun test src/core/services/subsync.test.ts` passes by itself.
|
||||||
|
<!-- SECTION:FINAL_SUMMARY:END -->
|
||||||
@@ -0,0 +1,42 @@
|
|||||||
|
---
|
||||||
|
id: TASK-312
|
||||||
|
title: Suppress ja-nai explanatory ending subtitle annotations
|
||||||
|
status: Done
|
||||||
|
assignee: []
|
||||||
|
created_date: '2026-05-02 09:55'
|
||||||
|
updated_date: '2026-05-02 10:03'
|
||||||
|
labels:
|
||||||
|
- tokenizer
|
||||||
|
- annotations
|
||||||
|
- bug
|
||||||
|
dependencies: []
|
||||||
|
priority: medium
|
||||||
|
---
|
||||||
|
|
||||||
|
## Description
|
||||||
|
|
||||||
|
<!-- SECTION:DESCRIPTION:BEGIN -->
|
||||||
|
Suppress subtitle annotation styling for grammar-only explanatory endings like `じゃない` and `じゃないですか` while preserving nearby lexical content annotations.
|
||||||
|
<!-- SECTION:DESCRIPTION:END -->
|
||||||
|
|
||||||
|
## Acceptance Criteria
|
||||||
|
<!-- AC:BEGIN -->
|
||||||
|
- [x] #1 `じゃない` and `じゃないですか`-style endings render as plain hoverable subtitle tokens.
|
||||||
|
- [x] #2 The reported phrase `みたいなのあるじゃないですか` does not annotate `じゃない`/`じゃないですか` as lexical/frequency content.
|
||||||
|
- [x] #3 Regression tests cover unit filter behavior and tokenizer integration without suppressing lexical content tokens.
|
||||||
|
- [x] #4 Standalone polite copula endings such as `です` / `ですよ` render as plain hoverable subtitle tokens even if POS metadata is missing or too lexical.
|
||||||
|
<!-- AC:END -->
|
||||||
|
|
||||||
|
## Implementation Notes
|
||||||
|
|
||||||
|
<!-- SECTION:NOTES:BEGIN -->
|
||||||
|
Added failing coverage first for `じゃない` / `じゃないですか` and `ですよ` leaking annotation metadata when POS metadata is missing or too lexical. Implemented term-family exclusions in the shared subtitle annotation filter for the `じゃない` explanatory family and polite copula suffix endings (`ですか`, `ですね`, `ですよ`, `ですな`). Kept bare `です` term-only behavior unchanged to preserve existing no-POS frequency tests; POS-tagged `です` is already stripped by the grammar POS exclusion path.
|
||||||
|
<!-- SECTION:NOTES:END -->
|
||||||
|
|
||||||
|
## Final Summary
|
||||||
|
|
||||||
|
<!-- SECTION:FINAL_SUMMARY:BEGIN -->
|
||||||
|
Suppressed subtitle annotation metadata for grammar-only endings like `じゃないですか` and `ですよ`, while preserving nearby lexical content annotations. Added unit and tokenizer regression coverage for the reported `みたいなのあるじゃないですか` and `感じですよ` shapes, plus changelog fragment `changes/312-grammar-ending-annotation-filter.md`.
|
||||||
|
|
||||||
|
Validation: `bun test src/core/services/tokenizer/annotation-stage.test.ts`; `bun test src/core/services/tokenizer.test.ts`; `bun run typecheck`; `bun run changelog:lint`; `git diff --check`.
|
||||||
|
<!-- SECTION:FINAL_SUMMARY:END -->
|
||||||
4
changes/311-auxiliary-inflection-annotation-filter.md
Normal file
4
changes/311-auxiliary-inflection-annotation-filter.md
Normal file
@@ -0,0 +1,4 @@
|
|||||||
|
type: fixed
|
||||||
|
area: overlay
|
||||||
|
|
||||||
|
- Suppressed subtitle annotation styling for standalone auxiliary inflection fragments such as `れる` and `れた` while keeping lexical `くれる` forms eligible for lookup metadata.
|
||||||
4
changes/312-grammar-ending-annotation-filter.md
Normal file
4
changes/312-grammar-ending-annotation-filter.md
Normal file
@@ -0,0 +1,4 @@
|
|||||||
|
type: fixed
|
||||||
|
area: overlay
|
||||||
|
|
||||||
|
- Suppressed subtitle annotation styling for grammar-only endings such as `じゃないですか` and standalone polite copula tails like `ですよ` / `ですか` (bare `です` is still stripped only when it carries POS tags).
|
||||||
@@ -4227,6 +4227,211 @@ test('tokenizeSubtitle clears all annotations for explanatory contrast endings',
|
|||||||
);
|
);
|
||||||
});
|
});
|
||||||
|
|
||||||
|
// Regression (TASK-312): a merged `じゃないですか` ending must come back with every
// annotation field cleared, while the lexical verb `ある` in the same subtitle keeps
// its frequency rank and JLPT level.
test('tokenizeSubtitle clears annotations for ja-nai explanatory endings while preserving lexical content', async () => {
  const result = await tokenizeSubtitle(
    'みたいなのあるじゃないですか',
    makeDepsFromYomitanTokens(
      [
        { surface: 'みたいな', reading: 'みたいな', headword: 'みたい' },
        { surface: 'の', reading: 'の', headword: 'の' },
        { surface: 'ある', reading: 'ある', headword: 'ある' },
        { surface: 'じゃないですか', reading: 'じゃないですか', headword: 'じゃない' },
      ],
      {
        getFrequencyDictionaryEnabled: () => true,
        // `じゃない` is given frequency and JLPT data on purpose so a leak would show up
        // in the assertions below.
        getFrequencyRank: (text) =>
          text === 'みたい' ? 320 : text === 'ある' ? 240 : text === 'じゃない' ? 80 : null,
        getJlptLevel: (text) =>
          text === 'みたい' ? 'N4' : text === 'ある' ? 'N5' : text === 'じゃない' ? 'N5' : null,
        isKnownWord: (text) => text === 'みたい' || text === 'の',
        getMinSentenceWordsForNPlusOne: () => 1,
        // MeCab splits the ending into じゃない + です + か; the offsets cover the
        // whole 14-character subtitle without gaps.
        tokenizeWithMecab: async () => [
          {
            headword: 'みたい',
            surface: 'みたい',
            reading: 'ミタイ',
            startPos: 0,
            endPos: 3,
            partOfSpeech: PartOfSpeech.noun,
            pos1: '名詞',
            pos2: '非自立',
            pos3: '形容動詞語幹',
            isMerged: false,
            isKnown: false,
            isNPlusOneTarget: false,
          },
          {
            headword: 'だ',
            surface: 'な',
            reading: 'ナ',
            startPos: 3,
            endPos: 4,
            partOfSpeech: PartOfSpeech.bound_auxiliary,
            pos1: '助動詞',
            pos2: '*',
            isMerged: false,
            isKnown: false,
            isNPlusOneTarget: false,
          },
          {
            headword: 'の',
            surface: 'の',
            reading: 'ノ',
            startPos: 4,
            endPos: 5,
            partOfSpeech: PartOfSpeech.noun,
            pos1: '名詞',
            pos2: '非自立',
            isMerged: false,
            isKnown: false,
            isNPlusOneTarget: false,
          },
          {
            headword: 'ある',
            surface: 'ある',
            reading: 'アル',
            startPos: 5,
            endPos: 7,
            partOfSpeech: PartOfSpeech.verb,
            pos1: '動詞',
            pos2: '自立',
            isMerged: false,
            isKnown: false,
            isNPlusOneTarget: false,
          },
          {
            headword: 'じゃない',
            surface: 'じゃない',
            reading: 'ジャナイ',
            startPos: 7,
            endPos: 11,
            partOfSpeech: PartOfSpeech.i_adjective,
            pos1: '接続詞|形容詞',
            pos2: '*|自立',
            isMerged: false,
            isKnown: false,
            isNPlusOneTarget: false,
          },
          {
            headword: 'です',
            surface: 'です',
            reading: 'デス',
            startPos: 11,
            endPos: 13,
            partOfSpeech: PartOfSpeech.bound_auxiliary,
            pos1: '助動詞',
            pos2: '*',
            isMerged: false,
            isKnown: false,
            isNPlusOneTarget: false,
          },
          {
            headword: 'か',
            surface: 'か',
            reading: 'カ',
            startPos: 13,
            endPos: 14,
            partOfSpeech: PartOfSpeech.particle,
            pos1: '助詞',
            pos2: '副助詞/並立助詞/終助詞',
            isMerged: false,
            isKnown: false,
            isNPlusOneTarget: false,
          },
        ],
      },
    ),
  );

  // Project each token down to the annotation-relevant fields only.
  const tokenSummary = result.tokens?.map((token) => ({
    surface: token.surface,
    headword: token.headword,
    isKnown: token.isKnown,
    isNPlusOneTarget: token.isNPlusOneTarget,
    frequencyRank: token.frequencyRank,
    jlptLevel: token.jlptLevel,
  }));

  // The grammar-only ending stays in the token list but carries no annotation metadata.
  assert.deepEqual(
    tokenSummary?.find((token) => token.surface === 'じゃないですか'),
    {
      surface: 'じゃないですか',
      headword: 'じゃない',
      isKnown: false,
      isNPlusOneTarget: false,
      frequencyRank: undefined,
      jlptLevel: undefined,
    },
  );
  // The neighbouring lexical verb keeps its dictionary metadata.
  assert.deepEqual(
    tokenSummary?.find((token) => token.surface === 'ある'),
    {
      surface: 'ある',
      headword: 'ある',
      isKnown: false,
      isNPlusOneTarget: false,
      frequencyRank: 240,
      jlptLevel: 'N5',
    },
  );
});
|
||||||
|
|
||||||
|
// Regression (TASK-312): with no MeCab output at all, a `ですよ` tail must still be
// stripped of annotation metadata, and the lexical noun `感じ` must remain the N+1 target.
test('tokenizeSubtitle clears annotations for standalone polite copula endings without POS metadata', async () => {
  const result = await tokenizeSubtitle(
    '現実は感じですよ',
    makeDepsFromYomitanTokens(
      [
        { surface: '現実', reading: 'げんじつ', headword: '現実' },
        { surface: 'は', reading: 'は', headword: 'は' },
        { surface: '感じ', reading: 'かんじ', headword: '感じ' },
        { surface: 'ですよ', reading: 'ですよ', headword: 'です' },
      ],
      {
        getFrequencyDictionaryEnabled: () => true,
        // `です` is known and has frequency/JLPT data so any leak onto `ですよ` is detectable.
        getFrequencyRank: (text) =>
          text === '現実' ? 600 : text === '感じ' ? 240 : text === 'です' ? 50 : null,
        getJlptLevel: (text) =>
          text === '現実' ? 'N3' : text === '感じ' ? 'N4' : text === 'です' ? 'N5' : null,
        isKnownWord: (text) => text === '現実' || text === 'は' || text === 'です',
        getMinSentenceWordsForNPlusOne: () => 1,
        // MeCab yields nothing here, so the tokens carry no POS tags.
        tokenizeWithMecab: async () => null,
      },
    ),
  );

  // Project each token down to the annotation-relevant fields only.
  const tokenSummary = result.tokens?.map((token) => ({
    surface: token.surface,
    headword: token.headword,
    isKnown: token.isKnown,
    isNPlusOneTarget: token.isNPlusOneTarget,
    frequencyRank: token.frequencyRank,
    jlptLevel: token.jlptLevel,
  }));

  // The copula tail is present but plain.
  assert.deepEqual(
    tokenSummary?.find((token) => token.surface === 'ですよ'),
    {
      surface: 'ですよ',
      headword: 'です',
      isKnown: false,
      isNPlusOneTarget: false,
      frequencyRank: undefined,
      jlptLevel: undefined,
    },
  );
  // The only unknown lexical word keeps its metadata and is flagged as the N+1 target.
  assert.deepEqual(
    tokenSummary?.find((token) => token.surface === '感じ'),
    {
      surface: '感じ',
      headword: '感じ',
      isKnown: false,
      isNPlusOneTarget: true,
      frequencyRank: 240,
      jlptLevel: 'N4',
    },
  );
});
|
||||||
|
|
||||||
test('tokenizeSubtitle clears annotations for ことに while preserving lexical N+1 target', async () => {
|
test('tokenizeSubtitle clears annotations for ことに while preserving lexical N+1 target', async () => {
|
||||||
const result = await tokenizeSubtitle(
|
const result = await tokenizeSubtitle(
|
||||||
'さっきの俺と違うことに気付かないのかい?',
|
'さっきの俺と違うことに気付かないのかい?',
|
||||||
@@ -4446,6 +4651,114 @@ test('tokenizeSubtitle clears annotations for ことに while preserving lexical
|
|||||||
);
|
);
|
||||||
});
|
});
|
||||||
|
|
||||||
|
// Regression (TASK-311): a standalone `れた` fragment (MeCab: 動詞/接尾 + 助動詞) must be
// returned without annotation metadata, while the unknown noun `猫` stays the N+1 target.
test('tokenizeSubtitle clears annotations for auxiliary inflection fragments while preserving lexical N+1 target', async () => {
  const result = await tokenizeSubtitle(
    '私れた猫',
    makeDepsFromYomitanTokens(
      [
        { surface: '私', reading: 'わたし', headword: '私' },
        { surface: 'れた', reading: 'れた', headword: 'れる' },
        { surface: '猫', reading: 'ねこ', headword: '猫' },
      ],
      {
        getFrequencyDictionaryEnabled: () => true,
        // `れる` is known and has frequency/JLPT data so any leak onto `れた` is detectable.
        getFrequencyRank: (text) =>
          text === '私' ? 50 : text === 'れる' ? 18 : text === '猫' ? 900 : null,
        getJlptLevel: (text) =>
          text === '私' ? 'N5' : text === 'れる' ? 'N4' : text === '猫' ? 'N5' : null,
        isKnownWord: (text) => text === '私' || text === 'れる',
        getMinSentenceWordsForNPlusOne: () => 1,
        // MeCab splits `れた` into the suffix verb `れ` and the auxiliary `た`.
        tokenizeWithMecab: async () => [
          {
            headword: '私',
            surface: '私',
            reading: 'ワタシ',
            startPos: 0,
            endPos: 1,
            partOfSpeech: PartOfSpeech.noun,
            pos1: '名詞',
            pos2: '代名詞',
            isMerged: false,
            isKnown: false,
            isNPlusOneTarget: false,
          },
          {
            headword: 'れる',
            surface: 'れ',
            reading: 'レ',
            startPos: 1,
            endPos: 2,
            partOfSpeech: PartOfSpeech.verb,
            pos1: '動詞',
            pos2: '接尾',
            isMerged: false,
            isKnown: false,
            isNPlusOneTarget: false,
          },
          {
            headword: 'た',
            surface: 'た',
            reading: 'タ',
            startPos: 2,
            endPos: 3,
            partOfSpeech: PartOfSpeech.bound_auxiliary,
            pos1: '助動詞',
            pos2: '*',
            isMerged: false,
            isKnown: false,
            isNPlusOneTarget: false,
          },
          {
            headword: '猫',
            surface: '猫',
            reading: 'ネコ',
            startPos: 3,
            endPos: 4,
            partOfSpeech: PartOfSpeech.noun,
            pos1: '名詞',
            pos2: '一般',
            isMerged: false,
            isKnown: false,
            isNPlusOneTarget: false,
          },
        ],
      },
    ),
  );

  // Project each token down to the annotation-relevant fields only.
  const tokenSummary = result.tokens?.map((token) => ({
    surface: token.surface,
    headword: token.headword,
    isKnown: token.isKnown,
    isNPlusOneTarget: token.isNPlusOneTarget,
    frequencyRank: token.frequencyRank,
    jlptLevel: token.jlptLevel,
  }));

  // The auxiliary fragment is present but plain.
  assert.deepEqual(
    tokenSummary?.find((token) => token.surface === 'れた'),
    {
      surface: 'れた',
      headword: 'れる',
      isKnown: false,
      isNPlusOneTarget: false,
      frequencyRank: undefined,
      jlptLevel: undefined,
    },
  );
  // The unknown lexical noun keeps its metadata and is flagged as the N+1 target.
  assert.deepEqual(
    tokenSummary?.find((token) => token.surface === '猫'),
    {
      surface: '猫',
      headword: '猫',
      isKnown: false,
      isNPlusOneTarget: true,
      frequencyRank: 900,
      jlptLevel: 'N5',
    },
  );
});
|
||||||
|
|
||||||
test('tokenizeSubtitle excludes default non-independent pos2 from N+1 when JLPT/frequency are disabled', async () => {
|
test('tokenizeSubtitle excludes default non-independent pos2 from N+1 when JLPT/frequency are disabled', async () => {
|
||||||
let mecabCalls = 0;
|
let mecabCalls = 0;
|
||||||
const result = await tokenizeSubtitle(
|
const result = await tokenizeSubtitle(
|
||||||
|
|||||||
@@ -258,6 +258,48 @@ test('shouldExcludeTokenFromSubtitleAnnotations excludes explanatory contrast en
|
|||||||
assert.equal(shouldExcludeTokenFromSubtitleAnnotations(token), true);
|
assert.equal(shouldExcludeTokenFromSubtitleAnnotations(token), true);
|
||||||
});
|
});
|
||||||
|
|
||||||
|
// Unit check: both the bare `じゃない` token and the merged `じゃないですか` token are
// excluded even though their POS tags include the lexical-looking 形容詞/自立 part.
test('shouldExcludeTokenFromSubtitleAnnotations excludes ja-nai explanatory endings', () => {
  const tokens = [
    makeToken({
      surface: 'じゃない',
      headword: 'じゃない',
      reading: 'ジャナイ',
      partOfSpeech: PartOfSpeech.i_adjective,
      pos1: '接続詞|形容詞',
      pos2: '*|自立',
    }),
    makeToken({
      surface: 'じゃないですか',
      headword: 'じゃない',
      reading: 'ジャナイデスカ',
      partOfSpeech: PartOfSpeech.i_adjective,
      pos1: '接続詞|形容詞|助動詞|助詞',
      pos2: '*|自立|*|副助詞/並立助詞/終助詞',
    }),
  ];

  for (const token of tokens) {
    // The surface is passed as the assertion message so a failure names the offending token.
    assert.equal(shouldExcludeTokenFromSubtitleAnnotations(token), true, token.surface);
  }
});
|
||||||
|
|
||||||
|
// Unit check: `ですよ` is excluded on its surface alone, with empty pos1/pos2 tags.
test('shouldExcludeTokenFromSubtitleAnnotations excludes standalone polite copula suffix endings without POS tags', () => {
  const tokens = [
    makeToken({
      surface: 'ですよ',
      headword: 'です',
      reading: 'デスヨ',
      partOfSpeech: PartOfSpeech.other,
      pos1: '',
      pos2: '',
    }),
  ];

  for (const token of tokens) {
    // The surface is passed as the assertion message so a failure names the offending token.
    assert.equal(shouldExcludeTokenFromSubtitleAnnotations(token), true, token.surface);
  }
});
|
||||||
|
|
||||||
test('shouldExcludeTokenFromSubtitleAnnotations excludes auxiliary-stem そうだ grammar tails', () => {
|
test('shouldExcludeTokenFromSubtitleAnnotations excludes auxiliary-stem そうだ grammar tails', () => {
|
||||||
const token = makeToken({
|
const token = makeToken({
|
||||||
surface: 'そうだ',
|
surface: 'そうだ',
|
||||||
@@ -1204,6 +1246,78 @@ test('annotateTokens clears all annotations for kana-only non-independent noun h
|
|||||||
assert.equal(result[0]?.jlptLevel, undefined);
|
assert.equal(result[0]?.jlptLevel, undefined);
|
||||||
});
|
});
|
||||||
|
|
||||||
|
// Unit check: `れる` (動詞/接尾) and `れた` (動詞/接尾 + 助動詞) arrive with a frequency
// rank and a known/JLPT-bearing headword, and must leave annotateTokens with all of
// that cleared.
test('annotateTokens clears all annotations for standalone auxiliary inflection fragments', () => {
  const tokens = [
    makeToken({
      surface: 'れる',
      headword: 'れる',
      reading: 'レル',
      partOfSpeech: PartOfSpeech.verb,
      pos1: '動詞',
      pos2: '接尾',
      startPos: 0,
      endPos: 2,
      frequencyRank: 18,
    }),
    makeToken({
      surface: 'れた',
      headword: 'れる',
      reading: 'レタ',
      partOfSpeech: PartOfSpeech.verb,
      pos1: '動詞|助動詞',
      pos2: '接尾|*',
      startPos: 2,
      endPos: 4,
      frequencyRank: 19,
    }),
  ];

  const result = annotateTokens(
    tokens,
    makeDeps({
      isKnownWord: (text) => text === 'れる',
      getJlptLevel: (text) => (text === 'れる' ? 'N4' : null),
    }),
    { minSentenceWordsForNPlusOne: 1 },
  );

  for (const token of result) {
    // The surface is passed as the assertion message so a failure names the offending token.
    assert.equal(token.isKnown, false, token.surface);
    assert.equal(token.isNPlusOneTarget, false, token.surface);
    assert.equal(token.frequencyRank, undefined, token.surface);
    assert.equal(token.jlptLevel, undefined, token.surface);
  }
});
|
||||||
|
|
||||||
|
// Guard for the auxiliary-fragment filter: `くれ` tagged 動詞/自立 with headword `くれる`
// is a real verb and must keep its frequency rank and JLPT level.
test('annotateTokens keeps lexical くれる forms eligible for annotation', () => {
  const tokens = [
    makeToken({
      surface: 'くれ',
      headword: 'くれる',
      reading: 'クレ',
      partOfSpeech: PartOfSpeech.verb,
      pos1: '動詞',
      pos2: '自立',
      startPos: 0,
      endPos: 2,
      frequencyRank: 20,
    }),
  ];

  const result = annotateTokens(
    tokens,
    makeDeps({
      getJlptLevel: (text) => (text === 'くれる' ? 'N4' : null),
    }),
    { minSentenceWordsForNPlusOne: 1 },
  );

  assert.equal(result[0]?.isKnown, false);
  assert.equal(result[0]?.isNPlusOneTarget, false);
  assert.equal(result[0]?.frequencyRank, 20);
  assert.equal(result[0]?.jlptLevel, 'N4');
});
|
||||||
|
|
||||||
test('annotateTokens clears all annotations for standalone して helper fragments', () => {
|
test('annotateTokens clears all annotations for standalone して helper fragments', () => {
|
||||||
const tokens = [
|
const tokens = [
|
||||||
makeToken({
|
makeToken({
|
||||||
|
|||||||
@@ -63,6 +63,24 @@ const SUBTITLE_ANNOTATION_EXCLUDED_EXPLANATORY_ENDING_THOUGHT_SUFFIXES = [
|
|||||||
'かな',
|
'かな',
|
||||||
'かね',
|
'かね',
|
||||||
] as const;
|
] as const;
|
||||||
|
/**
 * Sentence-final particles that may follow `です`. Each one is appended to `です` to
 * build the polite-copula endings excluded from subtitle annotations. The bare form
 * (empty suffix) is intentionally absent, so plain `です` is not in this family.
 */
const SUBTITLE_ANNOTATION_EXCLUDED_POLITE_COPULA_SUFFIXES = ['か', 'ね', 'よ', 'な'] as const;

/**
 * Tails that may follow `じゃない`. Each one is appended to `じゃない` to build the
 * explanatory endings excluded from subtitle annotations. The empty string covers
 * bare `じゃない`.
 */
const SUBTITLE_ANNOTATION_EXCLUDED_JA_NAI_SUFFIXES = [
  '',
  'か',
  'ね',
  'よ',
  'な',
  'です',
  'ですか',
  'ですよ',
  'ですね',
  'ですな',
] as const;
|
||||||
const SUBTITLE_ANNOTATION_EXCLUDED_EXPLANATORY_ENDINGS = new Set(
|
const SUBTITLE_ANNOTATION_EXCLUDED_EXPLANATORY_ENDINGS = new Set(
|
||||||
SUBTITLE_ANNOTATION_EXCLUDED_EXPLANATORY_ENDING_PREFIXES.flatMap((prefix) =>
|
SUBTITLE_ANNOTATION_EXCLUDED_EXPLANATORY_ENDING_PREFIXES.flatMap((prefix) =>
|
||||||
SUBTITLE_ANNOTATION_EXCLUDED_EXPLANATORY_ENDING_CORES.flatMap((core) =>
|
SUBTITLE_ANNOTATION_EXCLUDED_EXPLANATORY_ENDING_CORES.flatMap((core) =>
|
||||||
@@ -72,6 +90,12 @@ const SUBTITLE_ANNOTATION_EXCLUDED_EXPLANATORY_ENDINGS = new Set(
|
|||||||
),
|
),
|
||||||
),
|
),
|
||||||
);
|
);
|
||||||
|
/** Every `です` + suffix surface that is excluded from subtitle annotations by term. */
const SUBTITLE_ANNOTATION_EXCLUDED_POLITE_COPULA_ENDINGS = new Set(
  SUBTITLE_ANNOTATION_EXCLUDED_POLITE_COPULA_SUFFIXES.map((suffix) => `です${suffix}`),
);

/** Every `じゃない` + suffix surface that is excluded from subtitle annotations by term. */
const SUBTITLE_ANNOTATION_EXCLUDED_JA_NAI_ENDINGS = new Set(
  SUBTITLE_ANNOTATION_EXCLUDED_JA_NAI_SUFFIXES.map((suffix) => `じゃない${suffix}`),
);
|
||||||
const SUBTITLE_ANNOTATION_EXCLUDED_TRAILING_PARTICLE_SUFFIXES = new Set([
|
const SUBTITLE_ANNOTATION_EXCLUDED_TRAILING_PARTICLE_SUFFIXES = new Set([
|
||||||
'って',
|
'って',
|
||||||
'ってよ',
|
'ってよ',
|
||||||
@@ -83,6 +107,7 @@ const SUBTITLE_ANNOTATION_EXCLUDED_TRAILING_PARTICLE_SUFFIXES = new Set([
|
|||||||
]);
|
]);
|
||||||
const AUXILIARY_STEM_GRAMMAR_TAIL_POS1 = new Set(['名詞', '助動詞', '助詞']);
|
const AUXILIARY_STEM_GRAMMAR_TAIL_POS1 = new Set(['名詞', '助動詞', '助詞']);
|
||||||
const NON_INDEPENDENT_NOUN_HELPER_TAIL_POS1 = new Set(['助詞', '助動詞']);
|
const NON_INDEPENDENT_NOUN_HELPER_TAIL_POS1 = new Set(['助詞', '助動詞']);
|
||||||
|
// pos1 tags permitted after a leading 動詞/接尾 part for a token to still count as a
// standalone auxiliary inflection fragment.
const AUXILIARY_INFLECTION_TRAILING_POS1 = new Set(['助動詞']);
|
||||||
const STANDALONE_GRAMMAR_PARTICLE_SURFACES = new Set([
|
const STANDALONE_GRAMMAR_PARTICLE_SURFACES = new Set([
|
||||||
'か',
|
'か',
|
||||||
'が',
|
'が',
|
||||||
@@ -312,6 +337,44 @@ function isKanaOnlyText(text: string): boolean {
|
|||||||
return normalized.length > 0 && [...normalized].every(isKanaChar);
|
return normalized.length > 0 && [...normalized].every(isKanaChar);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Reports whether `token` is the verb `くれる` appearing with the surface `くれ`.
 *
 * @param token - Merged subtitle token to inspect.
 * @returns `true` only when the kana-normalized surface is `くれ`, the kana-normalized
 *   headword is `くれる`, and the POS tags resolve to exactly one part each:
 *   pos1 `動詞` and pos2 `自立`. Any merged (multi-part) tag yields `false`.
 */
function isLexicalKureruVerb(token: MergedToken): boolean {
  const normalizedSurface = normalizeKana(token.surface);
  const normalizedHeadword = normalizeKana(token.headword);
  const pos1Parts = splitNormalizedTagParts(normalizePosTag(token.pos1));
  const pos2Parts = splitNormalizedTagParts(normalizePosTag(token.pos2));

  return (
    normalizedSurface === 'くれ' &&
    normalizedHeadword === 'くれる' &&
    // Exactly one part on each tag: a token merged with trailing helpers does not qualify.
    pos1Parts.length === 1 &&
    pos1Parts[0] === '動詞' &&
    pos2Parts.length === 1 &&
    pos2Parts[0] === '自立'
  );
}
|
||||||
|
|
||||||
|
/**
 * Reports whether `token` is a kana-only auxiliary/inflection fragment (for example
 * `れる` or `れた`) rather than a lexical word.
 *
 * A token qualifies when its kana-normalized surface is kana-only, it has at least one
 * pos1 part, and either:
 *  - every pos1 part is `助動詞`, or
 *  - the first pos1 part is `動詞`, the first pos2 part is `接尾`, and every remaining
 *    pos1 part is in `AUXILIARY_INFLECTION_TRAILING_POS1`.
 *
 * @param token - Merged subtitle token to inspect.
 * @returns `true` when the token matches one of the two shapes above.
 */
function isStandaloneAuxiliaryInflectionFragment(token: MergedToken): boolean {
  const normalizedSurface = normalizeKana(token.surface);
  if (!isKanaOnlyText(normalizedSurface)) {
    return false;
  }

  const pos1Parts = splitNormalizedTagParts(normalizePosTag(token.pos1));
  // Without POS information there is nothing to classify on.
  if (pos1Parts.length === 0) {
    return false;
  }

  // Pure auxiliary chain.
  if (pos1Parts.every((part) => part === '助動詞')) {
    return true;
  }

  // Suffix verb, optionally followed only by permitted trailing tags.
  const pos2Parts = splitNormalizedTagParts(normalizePosTag(token.pos2));
  return (
    pos1Parts[0] === '動詞' &&
    pos2Parts[0] === '接尾' &&
    pos1Parts.slice(1).every((part) => AUXILIARY_INFLECTION_TRAILING_POS1.has(part))
  );
}
|
||||||
|
|
||||||
function isStandaloneSuruTeGrammarHelper(token: MergedToken): boolean {
|
function isStandaloneSuruTeGrammarHelper(token: MergedToken): boolean {
|
||||||
const normalizedSurface = normalizeKana(token.surface);
|
const normalizedSurface = normalizeKana(token.surface);
|
||||||
const normalizedHeadword = normalizeKana(token.headword);
|
const normalizedHeadword = normalizeKana(token.headword);
|
||||||
@@ -370,6 +433,10 @@ function isExcludedByTerm(token: MergedToken): boolean {
|
|||||||
SUBTITLE_ANNOTATION_EXCLUDED_TERMS.has(normalized) ||
|
SUBTITLE_ANNOTATION_EXCLUDED_TERMS.has(normalized) ||
|
||||||
SUBTITLE_ANNOTATION_EXCLUDED_EXPLANATORY_ENDINGS.has(trimmed) ||
|
SUBTITLE_ANNOTATION_EXCLUDED_EXPLANATORY_ENDINGS.has(trimmed) ||
|
||||||
SUBTITLE_ANNOTATION_EXCLUDED_EXPLANATORY_ENDINGS.has(normalized) ||
|
SUBTITLE_ANNOTATION_EXCLUDED_EXPLANATORY_ENDINGS.has(normalized) ||
|
||||||
|
SUBTITLE_ANNOTATION_EXCLUDED_POLITE_COPULA_ENDINGS.has(trimmed) ||
|
||||||
|
SUBTITLE_ANNOTATION_EXCLUDED_POLITE_COPULA_ENDINGS.has(normalized) ||
|
||||||
|
SUBTITLE_ANNOTATION_EXCLUDED_JA_NAI_ENDINGS.has(trimmed) ||
|
||||||
|
SUBTITLE_ANNOTATION_EXCLUDED_JA_NAI_ENDINGS.has(normalized) ||
|
||||||
shouldIgnoreJlptByTerm(trimmed) ||
|
shouldIgnoreJlptByTerm(trimmed) ||
|
||||||
shouldIgnoreJlptByTerm(normalized)
|
shouldIgnoreJlptByTerm(normalized)
|
||||||
) {
|
) {
|
||||||
@@ -426,6 +493,10 @@ export function shouldExcludeTokenFromSubtitleAnnotations(
|
|||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (isStandaloneAuxiliaryInflectionFragment(token)) {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
if (isStandaloneSuruTeGrammarHelper(token)) {
|
if (isStandaloneSuruTeGrammarHelper(token)) {
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
@@ -442,6 +513,10 @@ export function shouldExcludeTokenFromSubtitleAnnotations(
|
|||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (isLexicalKureruVerb(token)) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
return isExcludedByTerm(token);
|
return isExcludedByTerm(token);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user