mirror of
https://github.com/ksyasuda/SubMiner.git
synced 2026-04-26 04:19:27 -07:00
fix: exclude kana grammar helper annotations
This commit is contained in:
@@ -0,0 +1,54 @@
|
|||||||
|
---
|
||||||
|
id: TASK-298
|
||||||
|
title: Exclude kana grammar-helper merges like ことに from subtitle annotations
|
||||||
|
status: Done
|
||||||
|
assignee:
|
||||||
|
- codex
|
||||||
|
created_date: '2026-04-26 00:08'
|
||||||
|
updated_date: '2026-04-26 00:15'
|
||||||
|
labels:
|
||||||
|
- tokenizer
|
||||||
|
- annotations
|
||||||
|
- bug
|
||||||
|
dependencies: []
|
||||||
|
priority: medium
|
||||||
|
---
|
||||||
|
|
||||||
|
## Description
|
||||||
|
|
||||||
|
<!-- SECTION:DESCRIPTION:BEGIN -->
|
||||||
|
Investigate and fix the subtitle tokenizer annotation behavior where all-hiragana grammar-helper merged tokens such as `ことに` can be marked as N+1. Likely current path: Yomitan emits `ことに` with headword `こと`; MeCab enrichment supplies content-led POS (pos1 `名詞|助詞`, pos2 likely `非自立|格助詞`); the shared subtitle annotation filter does not exclude this family unless a token matches narrower rules such as `これで` or explanatory endings.
|
||||||
|
<!-- SECTION:DESCRIPTION:END -->
|
||||||
|
|
||||||
|
## Acceptance Criteria
|
||||||
|
<!-- AC:BEGIN -->
|
||||||
|
- [x] #1 `ことに`-style kana grammar-helper merges are not marked known, N+1, JLPT, or frequency-highlighted when their MeCab metadata indicates a non-independent noun plus helper particle.
|
||||||
|
- [x] #2 Regression coverage demonstrates the reported subtitle phrase does not mark `ことに` as N+1 while preserving annotation for real lexical content tokens.
|
||||||
|
- [x] #3 Existing tokenizer annotation tests pass.
|
||||||
|
<!-- AC:END -->
|
||||||
|
|
||||||
|
## Implementation Plan
|
||||||
|
|
||||||
|
<!-- SECTION:PLAN:BEGIN -->
|
||||||
|
Approved approach (user: "let's do it"):
|
||||||
|
1. Add a regression test for the reported `ことに` case using Yomitan token `ことに` -> headword `こと` and MeCab metadata `名詞|助詞` / `非自立|格助詞`; assert all annotation fields are stripped while nearby lexical content can still be N+1.
|
||||||
|
2. Verify the new test fails before production changes.
|
||||||
|
3. Update the shared subtitle annotation filter to exclude conservative kana-only grammar-helper merges: merged surface differs from headword, surface is kana-only, first POS component is `名詞`, first POS2 component is `非自立`, and remaining POS components are grammar helpers (`助詞`/`助動詞`).
|
||||||
|
4. Run targeted tokenizer/annotation tests and update the task acceptance criteria/final notes.
|
||||||
|
<!-- SECTION:PLAN:END -->
|
||||||
|
|
||||||
|
## Implementation Notes
|
||||||
|
|
||||||
|
<!-- SECTION:NOTES:BEGIN -->
|
||||||
|
The red test initially passed (instead of failing) with headword `こと`, because `こと` is already in `JLPT_EXCLUDED_TERMS` and the shared subtitle annotation filter checks that set. The regression test was updated to the live-risk shape `surface=ことに`, `headword=事`, with MeCab POS `名詞|助詞` / `非自立|格助詞`; this version failed before the filter change and passed after it.
|
||||||
|
<!-- SECTION:NOTES:END -->
|
||||||
|
|
||||||
|
## Final Summary
|
||||||
|
|
||||||
|
<!-- SECTION:FINAL_SUMMARY:BEGIN -->
|
||||||
|
Implemented a conservative shared subtitle annotation filter for kana-only non-independent noun helper merges. Tokens such as `ことに` with a kanji dictionary headword like `事` are now stripped of known-word, N+1, JLPT, and frequency metadata when MeCab shows the first component as `名詞/非自立` and trailing components as grammar helpers.
|
||||||
|
|
||||||
|
Added unit coverage in `src/core/services/tokenizer/annotation-stage.test.ts` and an integration-style tokenizer regression for the reported phrase shape in `src/core/services/tokenizer.test.ts`, verifying `ことに` stays plain while a real lexical token can still become the N+1 target.
|
||||||
|
|
||||||
|
Validation: `bun test src/core/services/tokenizer/annotation-stage.test.ts`; `bun test src/core/services/tokenizer.test.ts`; `bun run test:fast`; `bun run changelog:lint`.
|
||||||
|
<!-- SECTION:FINAL_SUMMARY:END -->
|
||||||
4
changes/298-kana-grammar-helper-annotations.md
Normal file
4
changes/298-kana-grammar-helper-annotations.md
Normal file
@@ -0,0 +1,4 @@
|
|||||||
|
type: fixed
|
||||||
|
area: tokenizer
|
||||||
|
|
||||||
|
- Stopped kana-only grammar-helper merges such as `ことに` from receiving subtitle annotation metadata like N+1, JLPT, known-word, or frequency highlighting.
|
||||||
@@ -4069,6 +4069,226 @@ test('tokenizeSubtitle clears all annotations for explanatory contrast endings',
|
|||||||
);
|
);
|
||||||
});
|
});
|
||||||
|
|
||||||
|
// Regression for TASK-298: the kana-only merge `ことに` (headword `事`) must stay
// free of known/N+1/frequency/JLPT metadata, while the unknown lexical token
// `違う` in the same sentence is still picked as the N+1 target.
test('tokenizeSubtitle clears annotations for ことに while preserving lexical N+1 target', async () => {
  const subtitleText = 'さっきの俺と違うことに気付かないのかい?';

  // Yomitan-side segmentation: `ことに` arrives as one merged token.
  const yomitanTokens = [
    { surface: 'さっき', reading: 'さっき', headword: 'さっき' },
    { surface: 'の', reading: 'の', headword: 'の' },
    { surface: '俺', reading: 'おれ', headword: '俺' },
    { surface: 'と', reading: 'と', headword: 'と' },
    { surface: '違う', reading: 'ちがう', headword: '違う' },
    { surface: 'ことに', reading: 'ことに', headword: '事' },
    { surface: '気付かない', reading: 'きづかない', headword: '気付く' },
    { surface: 'の', reading: 'の', headword: 'の' },
    { surface: 'かい', reading: 'かい', headword: 'かい' },
    { surface: '?', reading: '', headword: '?' },
  ];

  // Everything except `違う` and `事` is treated as already known.
  const knownWords = ['さっき', 'の', '俺', 'と', '気付く', 'かい', '?'];

  // Flags shared by every raw MeCab token in this fixture.
  const unannotated = { isMerged: false, isKnown: false, isNPlusOneTarget: false };

  const result = await tokenizeSubtitle(
    subtitleText,
    makeDepsFromYomitanTokens(yomitanTokens, {
      getFrequencyDictionaryEnabled: () => true,
      getFrequencyRank: (text) => {
        if (text === '違う') return 900;
        if (text === '事') return 81;
        if (text === '気付く') return 1500;
        return null;
      },
      getJlptLevel: (text) => {
        if (text === '違う' || text === '事') return 'N4';
        if (text === '気付く') return 'N3';
        return null;
      },
      isKnownWord: (text) => knownWords.includes(text),
      getMinSentenceWordsForNPlusOne: () => 1,
      // MeCab-side segmentation: `こと` (名詞/非自立) and `に` (助詞/格助詞) are separate.
      tokenizeWithMecab: async () => [
        {
          ...unannotated,
          headword: 'さっき',
          surface: 'さっき',
          reading: 'サッキ',
          startPos: 0,
          endPos: 3,
          partOfSpeech: PartOfSpeech.noun,
          pos1: '名詞',
          pos2: '副詞可能',
        },
        {
          ...unannotated,
          headword: 'の',
          surface: 'の',
          reading: 'ノ',
          startPos: 3,
          endPos: 4,
          partOfSpeech: PartOfSpeech.particle,
          pos1: '助詞',
          pos2: '連体化',
        },
        {
          ...unannotated,
          headword: '俺',
          surface: '俺',
          reading: 'オレ',
          startPos: 4,
          endPos: 5,
          partOfSpeech: PartOfSpeech.noun,
          pos1: '名詞',
          pos2: '代名詞',
        },
        {
          ...unannotated,
          headword: 'と',
          surface: 'と',
          reading: 'ト',
          startPos: 5,
          endPos: 6,
          partOfSpeech: PartOfSpeech.particle,
          pos1: '助詞',
          pos2: '格助詞',
        },
        {
          ...unannotated,
          headword: '違う',
          surface: '違う',
          reading: 'チガウ',
          startPos: 6,
          endPos: 8,
          partOfSpeech: PartOfSpeech.verb,
          pos1: '動詞',
          pos2: '自立',
        },
        {
          ...unannotated,
          headword: '事',
          surface: 'こと',
          reading: 'コト',
          startPos: 8,
          endPos: 10,
          partOfSpeech: PartOfSpeech.noun,
          pos1: '名詞',
          pos2: '非自立',
        },
        {
          ...unannotated,
          headword: 'に',
          surface: 'に',
          reading: 'ニ',
          startPos: 10,
          endPos: 11,
          partOfSpeech: PartOfSpeech.particle,
          pos1: '助詞',
          pos2: '格助詞',
        },
        {
          ...unannotated,
          headword: '気付く',
          surface: '気付か',
          reading: 'キヅカ',
          startPos: 11,
          endPos: 14,
          partOfSpeech: PartOfSpeech.verb,
          pos1: '動詞',
          pos2: '自立',
        },
        {
          ...unannotated,
          headword: 'ない',
          surface: 'ない',
          reading: 'ナイ',
          startPos: 14,
          endPos: 16,
          partOfSpeech: PartOfSpeech.bound_auxiliary,
          pos1: '助動詞',
          pos2: '*',
        },
        {
          ...unannotated,
          headword: 'の',
          surface: 'の',
          reading: 'ノ',
          startPos: 16,
          endPos: 17,
          partOfSpeech: PartOfSpeech.particle,
          pos1: '助詞',
          pos2: '終助詞',
        },
        {
          ...unannotated,
          headword: 'かい',
          surface: 'かい',
          reading: 'カイ',
          startPos: 17,
          endPos: 19,
          partOfSpeech: PartOfSpeech.particle,
          pos1: '助詞',
          pos2: '終助詞',
        },
        {
          ...unannotated,
          headword: '?',
          surface: '?',
          reading: '',
          startPos: 19,
          endPos: 20,
          partOfSpeech: PartOfSpeech.symbol,
          pos1: '記号',
          pos2: '一般',
        },
      ],
    }),
  );

  // Reduce the first token with the given surface to the annotation fields under test.
  const summarize = (surface: string) => {
    const match = result.tokens?.find((token) => token.surface === surface);
    if (match === undefined) {
      return undefined;
    }
    return {
      surface: match.surface,
      headword: match.headword,
      isKnown: match.isKnown,
      isNPlusOneTarget: match.isNPlusOneTarget,
      frequencyRank: match.frequencyRank,
      jlptLevel: match.jlptLevel,
    };
  };

  // The grammar-helper merge keeps its surface/headword but loses every annotation.
  assert.deepEqual(summarize('ことに'), {
    surface: 'ことに',
    headword: '事',
    isKnown: false,
    isNPlusOneTarget: false,
    frequencyRank: undefined,
    jlptLevel: undefined,
  });

  // The real lexical token is still fully annotated and becomes the N+1 target.
  assert.deepEqual(summarize('違う'), {
    surface: '違う',
    headword: '違う',
    isKnown: false,
    isNPlusOneTarget: true,
    frequencyRank: 900,
    jlptLevel: 'N4',
  });
});
|
||||||
|
|
||||||
test('tokenizeSubtitle excludes default non-independent pos2 from N+1 when JLPT/frequency are disabled', async () => {
|
test('tokenizeSubtitle excludes default non-independent pos2 from N+1 when JLPT/frequency are disabled', async () => {
|
||||||
let mecabCalls = 0;
|
let mecabCalls = 0;
|
||||||
const result = await tokenizeSubtitle(
|
const result = await tokenizeSubtitle(
|
||||||
|
|||||||
@@ -353,6 +353,19 @@ test('shouldExcludeTokenFromSubtitleAnnotations excludes kana-only demonstrative
|
|||||||
assert.equal(shouldExcludeTokenFromSubtitleAnnotations(token), true);
|
assert.equal(shouldExcludeTokenFromSubtitleAnnotations(token), true);
|
||||||
});
|
});
|
||||||
|
|
||||||
|
// `ことに` merged from 名詞/非自立 + 助詞/格助詞 with a kanji headword must be excluded.
test('shouldExcludeTokenFromSubtitleAnnotations excludes kana-only non-independent noun helper merges', () => {
  const helperMerge = makeToken({
    surface: 'ことに',
    headword: '事',
    reading: 'コトニ',
    partOfSpeech: PartOfSpeech.noun,
    pos1: '名詞|助詞',
    pos2: '非自立|格助詞',
  });

  const excluded = shouldExcludeTokenFromSubtitleAnnotations(helperMerge);

  assert.equal(excluded, true);
});
|
||||||
|
|
||||||
test('stripSubtitleAnnotationMetadata keeps token hover data while clearing annotation fields', () => {
|
test('stripSubtitleAnnotationMetadata keeps token hover data while clearing annotation fields', () => {
|
||||||
const token = makeToken({
|
const token = makeToken({
|
||||||
surface: 'は',
|
surface: 'は',
|
||||||
@@ -813,6 +826,36 @@ test('annotateTokens applies one shared exclusion gate across known N+1 frequenc
|
|||||||
assert.equal(result[0]?.jlptLevel, undefined);
|
assert.equal(result[0]?.jlptLevel, undefined);
|
||||||
});
|
});
|
||||||
|
|
||||||
|
// Even when the headword `事` is known, JLPT-rated, and carries a frequency rank,
// the kana-only helper merge must come out of annotateTokens with nothing set.
test('annotateTokens clears all annotations for kana-only non-independent noun helper merges', () => {
  const helperMerge = makeToken({
    surface: 'ことに',
    headword: '事',
    reading: 'コトニ',
    partOfSpeech: PartOfSpeech.noun,
    pos1: '名詞|助詞',
    pos2: '非自立|格助詞',
    startPos: 0,
    endPos: 3,
    frequencyRank: 81,
  });
  const deps = makeDeps({
    isKnownWord: (text) => text === '事',
    getJlptLevel: (text) => (text === '事' ? 'N4' : null),
  });

  const [annotated] = annotateTokens([helperMerge], deps, { minSentenceWordsForNPlusOne: 1 });

  assert.equal(annotated?.isKnown, false);
  assert.equal(annotated?.isNPlusOneTarget, false);
  assert.equal(annotated?.frequencyRank, undefined);
  assert.equal(annotated?.jlptLevel, undefined);
});
|
||||||
|
|
||||||
test('annotateTokens clears all annotations from standalone あ interjections without POS tags', () => {
|
test('annotateTokens clears all annotations from standalone あ interjections without POS tags', () => {
|
||||||
const tokens = [
|
const tokens = [
|
||||||
makeToken({
|
makeToken({
|
||||||
|
|||||||
@@ -71,6 +71,7 @@ const SUBTITLE_ANNOTATION_EXCLUDED_TRAILING_PARTICLE_SUFFIXES = new Set([
|
|||||||
'ってば',
|
'ってば',
|
||||||
]);
|
]);
|
||||||
const AUXILIARY_STEM_GRAMMAR_TAIL_POS1 = new Set(['名詞', '助動詞', '助詞']);
|
const AUXILIARY_STEM_GRAMMAR_TAIL_POS1 = new Set(['名詞', '助動詞', '助詞']);
|
||||||
|
// POS1 tags allowed for the components that follow the leading non-independent
// noun in a kana-only helper merge: particles (助詞) and auxiliary verbs (助動詞).
const NON_INDEPENDENT_NOUN_HELPER_TAIL_POS1 = new Set(['助詞', '助動詞']);
|
||||||
|
|
||||||
export interface SubtitleAnnotationFilterOptions {
|
export interface SubtitleAnnotationFilterOptions {
|
||||||
pos1Exclusions?: ReadonlySet<string>;
|
pos1Exclusions?: ReadonlySet<string>;
|
||||||
@@ -252,6 +253,31 @@ function isAuxiliaryStemGrammarTailToken(token: MergedToken): boolean {
|
|||||||
return pos3Parts.includes('助動詞語幹');
|
return pos3Parts.includes('助動詞語幹');
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Detects kana-only merged tokens made of a non-independent noun followed
 * solely by grammar helpers (for example `ことに` with headword `事`).
 *
 * A token matches only when all of the following hold:
 * - surface and headword are both non-empty after `normalizeKana`, and differ;
 * - every character of the normalized surface satisfies `isKanaChar`;
 * - `pos1` has at least two parts and its first part is `名詞`;
 * - the first part of `pos2` is `非自立`;
 * - every `pos1` part after the first is in `NON_INDEPENDENT_NOUN_HELPER_TAIL_POS1`.
 *
 * @param token - Merged token whose surface, headword, and POS tags are inspected.
 * @returns `true` when the token has the helper-merge shape described above.
 */
function isKanaOnlyNonIndependentNounHelperMerge(token: MergedToken): boolean {
  const surface = normalizeKana(token.surface);
  const headword = normalizeKana(token.headword);

  // Both forms must be present, and a merge must differ from its headword.
  if (!surface || !headword) {
    return false;
  }
  if (surface === headword) {
    return false;
  }

  const surfaceIsAllKana = [...surface].every(isKanaChar);
  if (!surfaceIsAllKana) {
    return false;
  }

  // Need a leading noun plus at least one trailing component.
  const pos1Parts = splitNormalizedTagParts(normalizePosTag(token.pos1));
  if (pos1Parts.length < 2) {
    return false;
  }
  const [leadingPos1, ...trailingPos1] = pos1Parts;
  if (leadingPos1 !== '名詞') {
    return false;
  }

  // The leading noun must be tagged non-independent.
  const [leadingPos2] = splitNormalizedTagParts(normalizePosTag(token.pos2));
  if (leadingPos2 !== '非自立') {
    return false;
  }

  // Reject the token if any trailing component is not a grammar helper.
  const hasNonHelperTail = trailingPos1.some(
    (part) => !NON_INDEPENDENT_NOUN_HELPER_TAIL_POS1.has(part),
  );
  return !hasNonHelperTail;
}
|
||||||
|
|
||||||
function isExcludedByTerm(token: MergedToken): boolean {
|
function isExcludedByTerm(token: MergedToken): boolean {
|
||||||
const candidates = [token.surface, token.reading, token.headword].filter(
|
const candidates = [token.surface, token.reading, token.headword].filter(
|
||||||
(candidate): candidate is string => typeof candidate === 'string' && candidate.length > 0,
|
(candidate): candidate is string => typeof candidate === 'string' && candidate.length > 0,
|
||||||
@@ -335,6 +361,10 @@ export function shouldExcludeTokenFromSubtitleAnnotations(
|
|||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (isKanaOnlyNonIndependentNounHelperMerge(token)) {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
if (isExcludedTrailingParticleMergedToken(token)) {
|
if (isExcludedTrailingParticleMergedToken(token)) {
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|||||||
Reference in New Issue
Block a user