mirror of https://github.com/ksyasuda/SubMiner.git (synced 2026-04-28 04:19:27 -07:00)
fix: suppress N+1 for kana-only candidates and fix minSentenceWords count
- Treat kana-only tokens with surrounding subtitle punctuation (…, ―, etc.) as kana-only so they are not promoted to N+1 targets
- Exclude unknown tokens filtered from N+1 targeting from the minSentenceWords count so filtered kana-only unknowns cannot satisfy the sentence length threshold
- Add regression tests for kana-only candidate suppression and filtered-unknown padding cases
@@ -0,0 +1,58 @@
---
id: TASK-310
title: Suppress N+1 highlight for kana-only candidate sentences
status: Done
assignee:
  - Codex
created_date: '2026-04-28 06:55'
updated_date: '2026-04-28 07:04'
labels:
  - tokenizer
  - n+1
dependencies: []
priority: medium
---

## Description

<!-- SECTION:DESCRIPTION:BEGIN -->
Reduce noisy N+1 subtitle annotations when the only unknown candidates in a sentence are kana-only hiragana or katakana words, for example mostly-kana subtitle lines where highlighting a particle- or helper-like token adds little value.
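
For illustration, here is a minimal sketch of the selection rule this task is after. The names and token shape are hypothetical, not the token-merger API; the real logic is the kana-only guard in isNPlusOneCandidateToken in src/token-merger.ts.

```ts
// Hypothetical sketch: N+1 wants exactly one worthwhile unknown per sentence, and
// kana-only unknowns are treated as not worthwhile, so a mostly-kana line whose
// only unknowns are kana-only gets no highlight at all.
type Candidate = { surface: string; isKnown: boolean; isKanaOnly: boolean };

function pickNPlusOneTarget(sentence: Candidate[]): Candidate | null {
  const unknowns = sentence.filter((t) => !t.isKnown && !t.isKanaOnly);
  return unknowns.length === 1 ? (unknowns[0] ?? null) : null;
}

// The only unknown ('でしょ') is kana-only, so nothing is marked.
pickNPlusOneTarget([
  { surface: 'そう', isKnown: true, isKanaOnly: true },
  { surface: 'でしょ', isKnown: false, isKanaOnly: true },
]); // -> null
```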
<!-- SECTION:DESCRIPTION:END -->

## Acceptance Criteria

<!-- AC:BEGIN -->
- [x] #1 N+1 annotation does not mark a kana-only unknown target when all N+1 candidates in the sentence are kana-only.
- [x] #2 N+1 annotation continues to mark kanji or mixed-script unknown targets in otherwise eligible sentences.
- [x] #3 A focused regression test covers the kana-only candidate case.
- [x] #4 N+1 minimum sentence word count excludes tokens stripped by the subtitle annotation filter, so filtered grammar/noise tokens cannot satisfy minSentenceWords.
<!-- AC:END -->

## Implementation Plan

<!-- SECTION:PLAN:BEGIN -->
1. Keep the existing N+1 target eligibility guard: kana-only subtitle surfaces do not become N+1 targets.
2. Add a focused regression in src/core/services/tokenizer/annotation-stage.test.ts proving that annotation-filtered tokens do not count toward ankiConnect.nPlusOne.minSentenceWords.
3. Verify the new regression fails before the code change.
4. Patch src/token-merger.ts so the N+1 minimum sentence word count uses the same subtitle-annotation eligibility filter as annotation rendering, excluding filtered particles/auxiliaries/noise from the count (see the sketch after this list).
5. Re-run the focused tokenizer tests, then update the TASK-310 acceptance criteria and final notes.
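
A rough sketch of the counting rule in step 4, assuming the eligibility filter can be reduced to simple boolean checks (the shape is illustrative; the real predicates are isNPlusOneWordCountToken and isNPlusOneCandidateToken in src/token-merger.ts):

```ts
// Illustrative only: a token counts toward minSentenceWords when it survives the
// annotation/word-count filter AND is either already known or a genuine N+1
// candidate. Unknown tokens filtered from targeting no longer pad the sentence.
interface CountedToken {
  isKnown: boolean;
  isEligibleWord: boolean;      // survives the particle/auxiliary/noise filter
  isNPlusOneCandidate: boolean; // unknown token that could become the target
}

function countsTowardMinSentenceWords(token: CountedToken): boolean {
  if (!token.isEligibleWord) return false;
  return token.isKnown || token.isNPlusOneCandidate;
}

function sentenceWordCount(tokens: CountedToken[]): number {
  return tokens.filter(countsTowardMinSentenceWords).length;
}
```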
<!-- SECTION:PLAN:END -->

## Implementation Notes

<!-- SECTION:NOTES:BEGIN -->
Initial context: the current token-merger already has a surface-level kana-only guard in isNPlusOneCandidateToken, added in commit 9e4ad907. We need to decide whether to broaden the behavior to lookup/headword forms or only verify the current behavior.

Implemented by treating kana-only N+1 candidates as kana-only even when their token surface includes surrounding subtitle punctuation such as ellipses or dashes. The focused regression was red before the token-merger change (スイッチ… was marked true) and passed after the guard update. test:env initially hit an unrelated immersion-tracker active_days timing/order failure and a Bun follow-on loader error; the failing test passed in isolation and the full test:env rerun passed.
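
For reference, a simplified standalone rewrite of the updated check, traced over the surfaces from the regression. The character classes here are assumptions for illustration; the real implementation is isKanaOnlyText plus isKanaCandidateIgnorableChar in the diff below.

```ts
// Simplified copy for illustration; not the exact character sets used in token-merger.
const isKana = (ch: string) => /^[\u3040-\u30FFー]$/u.test(ch);
const isIgnorable = (ch: string) => /^[\s…‥・。、!?「」『』()―-]$/u.test(ch);

function kanaOnly(text: string): boolean {
  const chars = Array.from(text.trim());
  if (chars.length === 0) return false;
  let hasKana = false;
  for (const ch of chars) {
    if (isKana(ch)) { hasKana = true; continue; }
    if (!isIgnorable(ch)) return false; // kanji, latin, digits, etc. disqualify
  }
  return hasKana;
}

kanaOnly('スイッチ…'); // true  -> suppressed as an N+1 target
kanaOnly('装置…');     // false -> a kanji surface remains targetable
kanaOnly('…');         // false -> punctuation alone never counts as kana
```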

Reopened for follow-up scope: minSentenceWords must count annotation-eligible tokens only, not tokens stripped from annotation metadata.

Implemented follow-up minSentenceWords behavior: unknown tokens filtered from N+1 targeting no longer contribute to sentence length; known eligible tokens and true N+1 candidates still count.
<!-- SECTION:NOTES:END -->

## Final Summary

<!-- SECTION:FINAL_SUMMARY:BEGIN -->
Changed N+1 sentence-length counting so minSentenceWords only counts known eligible words and actual N+1 target candidates. Unknown tokens filtered from N+1 targeting, including kana-only unknowns, no longer pad a sentence into eligibility. Existing annotation-filtered particles/auxiliaries remain excluded. Added regression coverage for the filtered unknown padding case while preserving kanji/mixed-script target behavior.
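
Concretely, for the new regression's sentence with minSentenceWordsForNPlusOne set to 4, the counting now works out as below (an illustrative tally, not the markNPlusOneTargets implementation):

```ts
// 私 and 猫 are known, スイッチ is a kana-only unknown filtered from targeting,
// and 装置 is the genuine N+1 candidate.
const sentence = [
  { surface: '私', counts: true },       // known eligible word
  { surface: '猫', counts: true },       // known eligible word
  { surface: 'スイッチ', counts: false }, // filtered unknown: no longer pads the count
  { surface: '装置', counts: true },     // actual N+1 candidate
];

const wordCount = sentence.filter((t) => t.counts).length; // 3
const meetsThreshold = wordCount >= 4;                      // false
// Previously スイッチ still counted, the sentence reached 4 words, and 装置 was
// promoted; with the fix, result[3]?.isNPlusOneTarget stays false.
```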

Verification: the new regression failed before implementation; `bun test src/core/services/tokenizer/annotation-stage.test.ts -t "N\\+1"` passed; the full `bun test src/core/services/tokenizer/annotation-stage.test.ts` passed; `bun test src/core/services/tokenizer.test.ts -t "N\\+1"` passed; `bun run typecheck` passed.
<!-- SECTION:FINAL_SUMMARY:END -->

src/core/services/tokenizer/annotation-stage.test.ts

@@ -627,6 +627,63 @@ test('annotateTokens N+1 handoff marks expected target when threshold is satisfi
   assert.equal(result[2]?.isNPlusOneTarget, false);
 });
 
+test('annotateTokens does not mark kana-only unknown target with subtitle punctuation as N+1', () => {
+  const tokens = [
+    makeToken({
+      surface: '何やら',
+      headword: '何やら',
+      reading: 'ナニヤラ',
+      pos1: '副詞',
+      startPos: 0,
+      endPos: 3,
+    }),
+    makeToken({
+      surface: 'ボタン',
+      headword: 'ボタン',
+      reading: 'ボタン',
+      pos1: '名詞',
+      startPos: 3,
+      endPos: 6,
+    }),
+    makeToken({
+      surface: 'スイッチ…',
+      headword: 'スイッチ',
+      reading: 'スイッチ',
+      pos1: '名詞',
+      startPos: 6,
+      endPos: 11,
+    }),
+  ];
+
+  const result = annotateTokens(
+    tokens,
+    makeDeps({
+      isKnownWord: (text) => text === '何やら' || text === 'ボタン',
+    }),
+    { minSentenceWordsForNPlusOne: 3 },
+  );
+
+  assert.equal(result[2]?.isNPlusOneTarget, false);
+});
+
+test('annotateTokens still marks kanji unknown target in otherwise eligible sentence as N+1', () => {
+  const tokens = [
+    makeToken({ surface: '私', headword: '私', pos1: '名詞', startPos: 0, endPos: 1 }),
+    makeToken({ surface: '猫', headword: '猫', pos1: '名詞', startPos: 1, endPos: 2 }),
+    makeToken({ surface: '装置…', headword: '装置', pos1: '名詞', startPos: 2, endPos: 5 }),
+  ];
+
+  const result = annotateTokens(
+    tokens,
+    makeDeps({
+      isKnownWord: (text) => text === '私' || text === '猫',
+    }),
+    { minSentenceWordsForNPlusOne: 3 },
+  );
+
+  assert.equal(result[2]?.isNPlusOneTarget, true);
+});
+
 test('annotateTokens N+1 minimum sentence words counts only eligible word tokens', () => {
   const tokens = [
     makeToken({ surface: '猫', headword: '猫', startPos: 0, endPos: 1 }),
@@ -662,6 +719,32 @@ test('annotateTokens N+1 minimum sentence words counts only eligible word tokens
   assert.equal(result[0]?.isNPlusOneTarget, false);
 });
 
+test('annotateTokens N+1 minimum sentence words excludes unknown tokens filtered from N+1 targeting', () => {
+  const tokens = [
+    makeToken({ surface: '私', headword: '私', pos1: '名詞', startPos: 0, endPos: 1 }),
+    makeToken({ surface: '猫', headword: '猫', pos1: '名詞', startPos: 1, endPos: 2 }),
+    makeToken({
+      surface: 'スイッチ',
+      headword: 'スイッチ',
+      reading: 'スイッチ',
+      pos1: '名詞',
+      startPos: 2,
+      endPos: 6,
+    }),
+    makeToken({ surface: '装置', headword: '装置', pos1: '名詞', startPos: 6, endPos: 8 }),
+  ];
+
+  const result = annotateTokens(
+    tokens,
+    makeDeps({
+      isKnownWord: (text) => text === '私' || text === '猫',
+    }),
+    { minSentenceWordsForNPlusOne: 4 },
+  );
+
+  assert.equal(result[3]?.isNPlusOneTarget, false);
+});
+
 test('annotateTokens N+1 sentence word count respects source punctuation gaps omitted by Yomitan', () => {
   const tokens = [
     makeToken({
src/token-merger.ts

@@ -298,9 +298,28 @@ function isKanaChar(char: string): boolean {
   );
 }
 
+function isKanaCandidateIgnorableChar(char: string): boolean {
+  return /^[\s.,!?;:()[\]{}"'`、。!?…‥・「」『』()[]{}〈〉《》【】―-]$/u.test(char);
+}
+
 function isKanaOnlyText(text: string): boolean {
   const normalized = text.trim();
-  return normalized.length > 0 && Array.from(normalized).every((char) => isKanaChar(char));
+  if (normalized.length === 0) {
+    return false;
+  }
+
+  let hasKana = false;
+  for (const char of normalized) {
+    if (isKanaChar(char)) {
+      hasKana = true;
+      continue;
+    }
+    if (!isKanaCandidateIgnorableChar(char)) {
+      return false;
+    }
+  }
+
+  return hasKana;
 }
 
 function normalizeSourceTextForTokenOffsets(sourceText: string | undefined): string | undefined {
@@ -367,6 +386,18 @@ function isNPlusOneWordCountToken(
   return true;
 }
 
+function isNPlusOneSentenceLengthToken(
+  token: MergedToken,
+  pos1Exclusions: ReadonlySet<string> = N_PLUS_ONE_IGNORED_POS1,
+  pos2Exclusions: ReadonlySet<string> = N_PLUS_ONE_IGNORED_POS2,
+): boolean {
+  if (!isNPlusOneWordCountToken(token, pos1Exclusions, pos2Exclusions)) {
+    return false;
+  }
+
+  return token.isKnown || isNPlusOneCandidateToken(token, pos1Exclusions, pos2Exclusions);
+}
+
 function isSentenceBoundaryToken(token: MergedToken): boolean {
   if (token.partOfSpeech !== PartOfSpeech.symbol) {
     return false;
@@ -418,7 +449,7 @@ export function markNPlusOneTargets(
   for (let i = start; i < endExclusive; i++) {
     const token = markedTokens[i];
     if (!token) continue;
-    if (isNPlusOneWordCountToken(token, pos1Exclusions, pos2Exclusions)) {
+    if (isNPlusOneSentenceLengthToken(token, pos1Exclusions, pos2Exclusions)) {
       sentenceWordCount += 1;
     }