mirror of
https://github.com/ksyasuda/SubMiner.git
synced 2026-04-28 04:19:27 -07:00
fix: exclude kana-only n+1 targets
This commit is contained in:
@@ -0,0 +1,58 @@
|
|||||||
|
---
|
||||||
|
id: TASK-307
|
||||||
|
title: Exclude kana-only words from N+1 subtitle targets
|
||||||
|
status: Done
|
||||||
|
assignee:
|
||||||
|
- codex
|
||||||
|
created_date: '2026-04-27 01:52'
|
||||||
|
updated_date: '2026-04-27 01:57'
|
||||||
|
labels:
|
||||||
|
- tokenizer
|
||||||
|
- annotations
|
||||||
|
dependencies: []
|
||||||
|
priority: medium
|
||||||
|
---
|
||||||
|
|
||||||
|
## Description
|
||||||
|
|
||||||
|
<!-- SECTION:DESCRIPTION:BEGIN -->
|
||||||
|
Subtitle N+1 annotation is over-targeting kana-only or hiragana/katakana tokens that collapse to dictionary words. Adjust targeting so kana-only tokens are not selected as N+1 candidates, while preserving tokenization/hover behavior and other annotation metadata where existing filters allow it.
|
||||||
|
<!-- SECTION:DESCRIPTION:END -->
|
||||||
|
|
||||||
|
## Acceptance Criteria
|
||||||
|
<!-- AC:BEGIN -->
|
||||||
|
- [x] #1 Kana-only subtitle tokens are not marked as N+1 targets.
|
||||||
|
- [x] #2 Kanji or mixed lexical tokens can still be marked as N+1 targets when they are the single unknown candidate in a sentence.
|
||||||
|
- [x] #3 Regression coverage demonstrates the kana-only N+1 exclusion.
|
||||||
|
<!-- AC:END -->
|
||||||
|
|
||||||
|
## Implementation Plan
|
||||||
|
|
||||||
|
<!-- SECTION:PLAN:BEGIN -->
|
||||||
|
1. Add a failing regression in `src/core/services/tokenizer.test.ts` showing a kana-only Yomitan token is not selected as the single N+1 target, while a mixed lexical token in the same style still can be targeted.
|
||||||
|
2. Implement the smallest filter in `src/token-merger.ts`: N+1 candidate selection rejects tokens whose surface is entirely kana; word-count behavior remains governed by existing annotation/POS filters.
|
||||||
|
3. Run the focused tokenizer tests, then update task acceptance criteria/final summary.
|
||||||
|
<!-- SECTION:PLAN:END -->
|
||||||
|
|
||||||
|
## Implementation Notes
|
||||||
|
|
||||||
|
<!-- SECTION:NOTES:BEGIN -->
|
||||||
|
Implemented a surface-level kana-only guard in N+1 candidate selection. Kept existing word-count/POS filtering behavior intact; updated tokenizer and annotation-stage expectations where old tests intentionally allowed kana-only N+1 targets.
|
||||||
|
<!-- SECTION:NOTES:END -->
|
||||||
|
|
||||||
|
## Final Summary
|
||||||
|
|
||||||
|
<!-- SECTION:FINAL_SUMMARY:BEGIN -->
|
||||||
|
Summary:
|
||||||
|
- Added kana-only surface detection to `isNPlusOneCandidateToken` so hiragana/katakana-only subtitle tokens are not selected as N+1 targets.
|
||||||
|
- Added/updated tokenizer and annotation-stage regressions for kana-only targets while preserving non-kana N+1 behavior.
|
||||||
|
- Added changelog fragment `changes/307-kana-nplusone-targets.md`.
|
||||||
|
|
||||||
|
Verification:
|
||||||
|
- `bun test src/core/services/tokenizer.test.ts --test-name-pattern "kana-only N\+1"` failed before the fix with `true !== false`.
|
||||||
|
- `bun test src/core/services/tokenizer/annotation-stage.test.ts src/core/services/tokenizer.test.ts` passed.
|
||||||
|
- `bun run typecheck` passed.
|
||||||
|
- `bun run test:fast` passed.
|
||||||
|
- `bun run changelog:lint` passed.
|
||||||
|
- `bunx prettier --check src/core/services/tokenizer.test.ts src/core/services/tokenizer/annotation-stage.test.ts src/token-merger.ts changes/307-kana-nplusone-targets.md` passed.
|
||||||
|
<!-- SECTION:FINAL_SUMMARY:END -->
|
||||||
4
changes/307-kana-nplusone-targets.md
Normal file
4
changes/307-kana-nplusone-targets.md
Normal file
@@ -0,0 +1,4 @@
|
|||||||
|
type: fixed
|
||||||
|
area: tokenizer
|
||||||
|
|
||||||
|
- Stopped kana-only subtitle tokens from being selected as N+1 targets.
|
||||||
@@ -2306,6 +2306,29 @@ test('tokenizeSubtitle selects one N+1 target token', async () => {
|
|||||||
assert.equal(targets[0]?.surface, '犬');
|
assert.equal(targets[0]?.surface, '犬');
|
||||||
});
|
});
|
||||||
|
|
||||||
|
test('tokenizeSubtitle does not select kana-only N+1 target tokens', async () => {
|
||||||
|
const result = await tokenizeSubtitle(
|
||||||
|
'私のばあい',
|
||||||
|
makeDepsFromYomitanTokens(
|
||||||
|
[
|
||||||
|
{ surface: '私', reading: 'わたし', headword: '私' },
|
||||||
|
{ surface: 'の', reading: 'の', headword: 'の' },
|
||||||
|
{ surface: 'ばあい', reading: 'ばあい', headword: '場合' },
|
||||||
|
],
|
||||||
|
{
|
||||||
|
getMinSentenceWordsForNPlusOne: () => 2,
|
||||||
|
isKnownWord: (text) => text === '私',
|
||||||
|
},
|
||||||
|
),
|
||||||
|
);
|
||||||
|
|
||||||
|
assert.equal(result.tokens?.length, 3);
|
||||||
|
assert.equal(
|
||||||
|
result.tokens?.some((token) => token.isNPlusOneTarget),
|
||||||
|
false,
|
||||||
|
);
|
||||||
|
});
|
||||||
|
|
||||||
test('tokenizeSubtitle does not mark target when sentence has multiple candidates', async () => {
|
test('tokenizeSubtitle does not mark target when sentence has multiple candidates', async () => {
|
||||||
const result = await tokenizeSubtitle(
|
const result = await tokenizeSubtitle(
|
||||||
'猫犬',
|
'猫犬',
|
||||||
@@ -3040,15 +3063,18 @@ test('tokenizeSubtitle uses Yomitan word classes to classify standalone particle
|
|||||||
let mecabCalls = 0;
|
let mecabCalls = 0;
|
||||||
const result = await tokenizeSubtitle(
|
const result = await tokenizeSubtitle(
|
||||||
'は',
|
'は',
|
||||||
makeDepsFromYomitanTokens([{ surface: 'は', reading: 'は', headword: 'は', wordClasses: ['prt'] }], {
|
makeDepsFromYomitanTokens(
|
||||||
getFrequencyDictionaryEnabled: () => true,
|
[{ surface: 'は', reading: 'は', headword: 'は', wordClasses: ['prt'] }],
|
||||||
getFrequencyRank: (text) => (text === 'は' ? 10 : null),
|
{
|
||||||
getJlptLevel: (text) => (text === 'は' ? 'N5' : null),
|
getFrequencyDictionaryEnabled: () => true,
|
||||||
tokenizeWithMecab: async () => {
|
getFrequencyRank: (text) => (text === 'は' ? 10 : null),
|
||||||
mecabCalls += 1;
|
getJlptLevel: (text) => (text === 'は' ? 'N5' : null),
|
||||||
return null;
|
tokenizeWithMecab: async () => {
|
||||||
|
mecabCalls += 1;
|
||||||
|
return null;
|
||||||
|
},
|
||||||
},
|
},
|
||||||
}),
|
),
|
||||||
);
|
);
|
||||||
|
|
||||||
assert.equal(mecabCalls, 1);
|
assert.equal(mecabCalls, 1);
|
||||||
@@ -3063,24 +3089,27 @@ test('tokenizeSubtitle uses Yomitan word classes to classify standalone particle
|
|||||||
test('tokenizeSubtitle fills detailed MeCab POS when Yomitan word class supplies coarse POS', async () => {
|
test('tokenizeSubtitle fills detailed MeCab POS when Yomitan word class supplies coarse POS', async () => {
|
||||||
const result = await tokenizeSubtitle(
|
const result = await tokenizeSubtitle(
|
||||||
'は',
|
'は',
|
||||||
makeDepsFromYomitanTokens([{ surface: 'は', reading: 'は', headword: 'は', wordClasses: ['prt'] }], {
|
makeDepsFromYomitanTokens(
|
||||||
tokenizeWithMecab: async () => [
|
[{ surface: 'は', reading: 'は', headword: 'は', wordClasses: ['prt'] }],
|
||||||
{
|
{
|
||||||
headword: 'は',
|
tokenizeWithMecab: async () => [
|
||||||
surface: 'は',
|
{
|
||||||
reading: 'ハ',
|
headword: 'は',
|
||||||
startPos: 0,
|
surface: 'は',
|
||||||
endPos: 1,
|
reading: 'ハ',
|
||||||
partOfSpeech: PartOfSpeech.particle,
|
startPos: 0,
|
||||||
pos1: '助詞',
|
endPos: 1,
|
||||||
pos2: '係助詞',
|
partOfSpeech: PartOfSpeech.particle,
|
||||||
pos3: '*',
|
pos1: '助詞',
|
||||||
isMerged: false,
|
pos2: '係助詞',
|
||||||
isKnown: false,
|
pos3: '*',
|
||||||
isNPlusOneTarget: false,
|
isMerged: false,
|
||||||
},
|
isKnown: false,
|
||||||
],
|
isNPlusOneTarget: false,
|
||||||
}),
|
},
|
||||||
|
],
|
||||||
|
},
|
||||||
|
),
|
||||||
);
|
);
|
||||||
|
|
||||||
assert.equal(result.tokens?.[0]?.partOfSpeech, PartOfSpeech.particle);
|
assert.equal(result.tokens?.[0]?.partOfSpeech, PartOfSpeech.particle);
|
||||||
@@ -3682,7 +3711,7 @@ test('tokenizeSubtitle excludes single-kana merged tokens from frequency highlig
|
|||||||
assert.equal(result.tokens?.[0]?.frequencyRank, undefined);
|
assert.equal(result.tokens?.[0]?.frequencyRank, undefined);
|
||||||
});
|
});
|
||||||
|
|
||||||
test('tokenizeSubtitle excludes merged function/content token from frequency highlighting but keeps N+1', async () => {
|
test('tokenizeSubtitle excludes merged kana-only function/content token from frequency and N+1', async () => {
|
||||||
const result = await tokenizeSubtitle(
|
const result = await tokenizeSubtitle(
|
||||||
'になれば',
|
'になれば',
|
||||||
makeDepsFromYomitanTokens([{ surface: 'になれば', reading: 'になれば', headword: 'なる' }], {
|
makeDepsFromYomitanTokens([{ surface: 'になれば', reading: 'になれば', headword: 'なる' }], {
|
||||||
@@ -3736,7 +3765,7 @@ test('tokenizeSubtitle excludes merged function/content token from frequency hig
|
|||||||
assert.equal(result.tokens?.length, 1);
|
assert.equal(result.tokens?.length, 1);
|
||||||
assert.equal(result.tokens?.[0]?.pos1, '助詞|動詞');
|
assert.equal(result.tokens?.[0]?.pos1, '助詞|動詞');
|
||||||
assert.equal(result.tokens?.[0]?.frequencyRank, undefined);
|
assert.equal(result.tokens?.[0]?.frequencyRank, undefined);
|
||||||
assert.equal(result.tokens?.[0]?.isNPlusOneTarget, true);
|
assert.equal(result.tokens?.[0]?.isNPlusOneTarget, false);
|
||||||
});
|
});
|
||||||
|
|
||||||
test('tokenizeSubtitle clears all annotations for kana-only demonstrative helper merges', async () => {
|
test('tokenizeSubtitle clears all annotations for kana-only demonstrative helper merges', async () => {
|
||||||
@@ -3935,7 +3964,7 @@ test('tokenizeSubtitle clears all annotations for explanatory pondering endings'
|
|||||||
surface: 'どうかしちゃった',
|
surface: 'どうかしちゃった',
|
||||||
headword: 'どうかしちゃう',
|
headword: 'どうかしちゃう',
|
||||||
isKnown: false,
|
isKnown: false,
|
||||||
isNPlusOneTarget: true,
|
isNPlusOneTarget: false,
|
||||||
frequencyRank: 3200,
|
frequencyRank: 3200,
|
||||||
jlptLevel: 'N3',
|
jlptLevel: 'N3',
|
||||||
},
|
},
|
||||||
|
|||||||
@@ -570,13 +570,13 @@ test('annotateTokens keeps other annotations for name matches when name highligh
|
|||||||
let jlptLookupCalls = 0;
|
let jlptLookupCalls = 0;
|
||||||
const tokens = [
|
const tokens = [
|
||||||
makeToken({
|
makeToken({
|
||||||
surface: 'オリヴィア',
|
surface: '山田',
|
||||||
reading: 'オリヴィア',
|
reading: 'ヤマダ',
|
||||||
headword: 'オリヴィア',
|
headword: '山田',
|
||||||
isNameMatch: true,
|
isNameMatch: true,
|
||||||
frequencyRank: 42,
|
frequencyRank: 42,
|
||||||
startPos: 0,
|
startPos: 0,
|
||||||
endPos: 5,
|
endPos: 2,
|
||||||
}),
|
}),
|
||||||
];
|
];
|
||||||
|
|
||||||
@@ -770,7 +770,7 @@ test('annotateTokens allows previously default-excluded pos1 when removed from e
|
|||||||
});
|
});
|
||||||
|
|
||||||
assert.equal(result[0]?.frequencyRank, 8);
|
assert.equal(result[0]?.frequencyRank, 8);
|
||||||
assert.equal(result[0]?.isNPlusOneTarget, true);
|
assert.equal(result[0]?.isNPlusOneTarget, false);
|
||||||
});
|
});
|
||||||
|
|
||||||
test('annotateTokens excludes default non-independent pos2 from frequency and N+1', () => {
|
test('annotateTokens excludes default non-independent pos2 from frequency and N+1', () => {
|
||||||
@@ -787,13 +787,9 @@ test('annotateTokens excludes default non-independent pos2 from frequency and N+
|
|||||||
}),
|
}),
|
||||||
];
|
];
|
||||||
|
|
||||||
const result = annotateTokens(
|
const result = annotateTokens(tokens, makeDeps(), {
|
||||||
tokens,
|
minSentenceWordsForNPlusOne: 1,
|
||||||
makeDeps(),
|
});
|
||||||
{
|
|
||||||
minSentenceWordsForNPlusOne: 1,
|
|
||||||
},
|
|
||||||
);
|
|
||||||
|
|
||||||
assert.equal(result[0]?.frequencyRank, undefined);
|
assert.equal(result[0]?.frequencyRank, undefined);
|
||||||
assert.equal(result[0]?.isNPlusOneTarget, false);
|
assert.equal(result[0]?.isNPlusOneTarget, false);
|
||||||
@@ -969,10 +965,10 @@ test('annotateTokens allows previously default-excluded pos2 when removed from e
|
|||||||
});
|
});
|
||||||
|
|
||||||
assert.equal(result[0]?.frequencyRank, 9);
|
assert.equal(result[0]?.frequencyRank, 9);
|
||||||
assert.equal(result[0]?.isNPlusOneTarget, true);
|
assert.equal(result[0]?.isNPlusOneTarget, false);
|
||||||
});
|
});
|
||||||
|
|
||||||
test('annotateTokens excludes composite function/content tokens from frequency but keeps N+1 eligible', () => {
|
test('annotateTokens excludes kana-only composite function/content tokens from frequency and N+1', () => {
|
||||||
const tokens = [
|
const tokens = [
|
||||||
makeToken({
|
makeToken({
|
||||||
surface: 'になれば',
|
surface: 'になれば',
|
||||||
@@ -990,7 +986,7 @@ test('annotateTokens excludes composite function/content tokens from frequency b
|
|||||||
});
|
});
|
||||||
|
|
||||||
assert.equal(result[0]?.frequencyRank, undefined);
|
assert.equal(result[0]?.frequencyRank, undefined);
|
||||||
assert.equal(result[0]?.isNPlusOneTarget, true);
|
assert.equal(result[0]?.isNPlusOneTarget, false);
|
||||||
});
|
});
|
||||||
|
|
||||||
test('annotateTokens excludes composite tokens when all component pos tags are excluded', () => {
|
test('annotateTokens excludes composite tokens when all component pos tags are excluded', () => {
|
||||||
|
|||||||
@@ -282,6 +282,26 @@ function isExcludedByTagSet(normalizedTag: string, exclusions: ReadonlySet<strin
|
|||||||
return parts.every((part) => exclusions.has(part));
|
return parts.every((part) => exclusions.has(part));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
function isKanaChar(char: string): boolean {
|
||||||
|
const code = char.codePointAt(0);
|
||||||
|
if (code === undefined) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
return (
|
||||||
|
(code >= 0x3041 && code <= 0x3096) ||
|
||||||
|
(code >= 0x309b && code <= 0x309f) ||
|
||||||
|
code === 0x30fc ||
|
||||||
|
(code >= 0x30a0 && code <= 0x30fa) ||
|
||||||
|
(code >= 0x30fd && code <= 0x30ff)
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
function isKanaOnlyText(text: string): boolean {
|
||||||
|
const normalized = text.trim();
|
||||||
|
return normalized.length > 0 && Array.from(normalized).every((char) => isKanaChar(char));
|
||||||
|
}
|
||||||
|
|
||||||
export function isNPlusOneCandidateToken(
|
export function isNPlusOneCandidateToken(
|
||||||
token: MergedToken,
|
token: MergedToken,
|
||||||
pos1Exclusions: ReadonlySet<string> = N_PLUS_ONE_IGNORED_POS1,
|
pos1Exclusions: ReadonlySet<string> = N_PLUS_ONE_IGNORED_POS1,
|
||||||
@@ -290,6 +310,9 @@ export function isNPlusOneCandidateToken(
|
|||||||
if (token.isKnown) {
|
if (token.isKnown) {
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
if (isKanaOnlyText(token.surface)) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
return isNPlusOneWordCountToken(token, pos1Exclusions, pos2Exclusions);
|
return isNPlusOneWordCountToken(token, pos1Exclusions, pos2Exclusions);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user