Add configurable minimum sentence length for N+1 targets

This commit is contained in:
2026-02-15 18:34:10 -08:00
parent f1b5082801
commit 667bde944c
11 changed files with 180 additions and 9 deletions

View File

@@ -173,6 +173,7 @@ This example is intentionally compact. The option table below documents availabl
| `ankiConnect.nPlusOne.nPlusOne` | hex color string | Text color for the single target token to study when exactly one unknown candidate exists in a sentence (default: `"#c6a0f6"`). |
| `ankiConnect.nPlusOne.knownWord` | hex color string | Legacy known-word color kept for backward compatibility (default: `"#a6da95"`). |
| `ankiConnect.nPlusOne.matchMode` | `"headword"`, `"surface"` | Matching strategy for known-word highlighting (default: `"headword"`). `headword` uses token headwords; `surface` uses visible subtitle text. |
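| `ankiConnect.nPlusOne.minSentenceWords` | number | Minimum number of words a sentence must contain before N+1 highlighting of its single unknown word can trigger. Must be a positive integer; any other value is rejected with a warning and the default is used (default: `3`). |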
| `ankiConnect.nPlusOne.refreshMinutes` | number | Minutes between known-word cache refreshes (default: `1440`) |
| `ankiConnect.nPlusOne.decks` | array of strings | Decks used by known-word cache refresh. Leave empty for compatibility with legacy `deck` scope. |
| `behavior.notificationType` | `"osd"`, `"system"`, `"both"`, `"none"` | Notification type on card update (default: `"osd"`) |
@@ -198,6 +199,7 @@ Known-word cache policy:
- Initial sync runs when the integration starts if the cache is missing or stale.
- `ankiConnect.nPlusOne.refreshMinutes` controls the minimum time between refreshes; between refreshes, cached words are reused without querying Anki.
- `ankiConnect.nPlusOne.nPlusOne` sets the color for the single target token when exactly one eligible unknown word exists.
- `ankiConnect.nPlusOne.minSentenceWords` sets the minimum number of tokens a sentence must contain before N+1 highlighting applies; sentence-boundary tokens and whitespace-only tokens are not counted (default: `3`).
- `ankiConnect.nPlusOne.knownWord` sets the legacy known-word highlight color for tokens already in Anki.
- `ankiConnect.nPlusOne.decks` accepts one or more decks. If empty, it uses the legacy single `ankiConnect.deck` value as scope.
- Cache state is persisted to `known-words-cache.json` under the app `userData` directory.

View File

@@ -172,6 +172,7 @@ When enabled, SubMiner highlights words you already know in your Anki deck, maki
"highlightEnabled": true,
"refreshMinutes": 1440,
"matchMode": "headword",
"minSentenceWords": 3,
"decks": ["Learning::Japanese"]
}
}
@@ -183,6 +184,7 @@ When enabled, SubMiner highlights words you already know in your Anki deck, maki
| `highlightEnabled` | Turn on/off the highlighting feature |
| `refreshMinutes` | How often to refresh the known-word cache (default: 1440 = daily) |
| `matchMode` | `"headword"` (dictionary form) or `"surface"` (exact text match) |
| `minSentenceWords` | Minimum sentence length in tokens required to allow N+1 highlighting (default: `3`) |
| `decks` | Which Anki decks to consider "known" (empty = uses `ankiConnect.deck`) |
### Use Cases

View File

@@ -87,6 +87,7 @@
"refreshMinutes": 1440,
"matchMode": "headword",
"decks": [],
"minSentenceWords": 3,
"nPlusOne": "#c6a0f6",
"knownWord": "#a6da95"
},

View File

@@ -137,6 +137,55 @@ test("accepts valid ankiConnect n+1 behavior values", () => {
assert.equal(config.ankiConnect.nPlusOne.refreshMinutes, 120);
});
// An invalid `minSentenceWords` (here 0) must not reach the resolved config:
// the default value is expected instead, and a warning must be recorded for
// the `ankiConnect.nPlusOne.minSentenceWords` path.
test("validates ankiConnect n+1 minimum sentence word count", () => {
  const tempDir = makeTempDir();
  const configFile = path.join(tempDir, "config.jsonc");
  // The template literal stays flush-left so the bytes written to disk are unchanged.
  const rawConfig = `{
"ankiConnect": {
"nPlusOne": {
"minSentenceWords": 0
}
}
}`;
  fs.writeFileSync(configFile, rawConfig, "utf-8");

  const configService = new ConfigService(tempDir);
  const resolved = configService.getConfig();
  const reported = configService.getWarnings();

  // The rejected value is replaced by the built-in default.
  const expectedFallback = DEFAULT_CONFIG.ankiConnect.nPlusOne.minSentenceWords;
  assert.equal(resolved.ankiConnect.nPlusOne.minSentenceWords, expectedFallback);

  // At least one warning must point at the offending option.
  const flaggedMinSentenceWords = reported.some(
    (entry) => entry.path === "ankiConnect.nPlusOne.minSentenceWords",
  );
  assert.ok(flaggedMinSentenceWords);
});
// A positive integer `minSentenceWords` (here 4) must be carried through to
// the resolved config unchanged.
test("accepts valid ankiConnect n+1 minimum sentence word count", () => {
  const tempDir = makeTempDir();
  const configFile = path.join(tempDir, "config.jsonc");
  // The template literal stays flush-left so the bytes written to disk are unchanged.
  const rawConfig = `{
"ankiConnect": {
"nPlusOne": {
"minSentenceWords": 4
}
}
}`;
  fs.writeFileSync(configFile, rawConfig, "utf-8");

  const resolved = new ConfigService(tempDir).getConfig();

  assert.equal(resolved.ankiConnect.nPlusOne.minSentenceWords, 4);
});
test("validates ankiConnect n+1 match mode values", () => {
const dir = makeTempDir();
fs.writeFileSync(
@@ -328,5 +377,6 @@ test("template generator includes known keys", () => {
assert.match(output, /"nPlusOne"\s*:\s*\{/);
assert.match(output, /"nPlusOne": "#c6a0f6"/);
assert.match(output, /"knownWord": "#a6da95"/);
assert.match(output, /"minSentenceWords": 3/);
assert.match(output, /auto-generated from src\/config\/definitions.ts/);
});

View File

@@ -128,6 +128,7 @@ export const DEFAULT_CONFIG: ResolvedConfig = {
refreshMinutes: 1440,
matchMode: "headword",
decks: [],
minSentenceWords: 3,
nPlusOne: "#c6a0f6",
knownWord: "#a6da95",
},
@@ -333,6 +334,13 @@ export const CONFIG_OPTION_REGISTRY: ConfigOptionRegistryEntry[] = [
defaultValue: DEFAULT_CONFIG.ankiConnect.nPlusOne.refreshMinutes,
description: "Minutes between known-word cache refreshes.",
},
{
path: "ankiConnect.nPlusOne.minSentenceWords",
kind: "number",
defaultValue: DEFAULT_CONFIG.ankiConnect.nPlusOne.minSentenceWords,
description:
"Minimum sentence word count required for N+1 targeting (default: 3).",
},
{
path: "ankiConnect.nPlusOne.decks",
kind: "array",

View File

@@ -698,6 +698,32 @@ export class ConfigService {
DEFAULT_CONFIG.ankiConnect.nPlusOne.refreshMinutes;
}
const nPlusOneMinSentenceWords = asNumber(
nPlusOneConfig.minSentenceWords,
);
const hasValidNPlusOneMinSentenceWords =
nPlusOneMinSentenceWords !== undefined &&
Number.isInteger(nPlusOneMinSentenceWords) &&
nPlusOneMinSentenceWords > 0;
if (nPlusOneMinSentenceWords !== undefined) {
if (hasValidNPlusOneMinSentenceWords) {
resolved.ankiConnect.nPlusOne.minSentenceWords =
nPlusOneMinSentenceWords;
} else {
warn(
"ankiConnect.nPlusOne.minSentenceWords",
nPlusOneConfig.minSentenceWords,
resolved.ankiConnect.nPlusOne.minSentenceWords,
"Expected a positive integer.",
);
resolved.ankiConnect.nPlusOne.minSentenceWords =
DEFAULT_CONFIG.ankiConnect.nPlusOne.minSentenceWords;
}
} else {
resolved.ankiConnect.nPlusOne.minSentenceWords =
DEFAULT_CONFIG.ankiConnect.nPlusOne.minSentenceWords;
}
const nPlusOneMatchMode = asString(nPlusOneConfig.matchMode);
const legacyNPlusOneMatchMode = asString(behavior.nPlusOneMatchMode);
const hasValidNPlusOneMatchMode =

View File

@@ -480,6 +480,7 @@ test("tokenizeSubtitleService selects one N+1 target token", async () => {
isNPlusOneTarget: false,
},
],
getMinSentenceWordsForNPlusOne: () => 2,
}),
);
@@ -561,6 +562,7 @@ test("tokenizeSubtitleService applies N+1 target marking to Yomitan results", as
getYomitanParserWindow: () => parserWindow,
tokenizeWithMecab: async () => null,
isKnownWord: (text) => text === "です",
getMinSentenceWordsForNPlusOne: () => 2,
}),
);
@@ -571,6 +573,43 @@ test("tokenizeSubtitleService applies N+1 target marking to Yomitan results", as
assert.equal(result.tokens?.[1]?.isNPlusOneTarget, false);
});
// A sentence of two tokens (one known, one unknown) must not get an N+1
// target when the test supplies no `getMinSentenceWordsForNPlusOne` override,
// i.e. when the default minimum sentence length is in effect.
// NOTE(review): the subtitle text passed in ("猫です") does not match the mocked
// token surfaces (私 / 犬). The mocked tokenizer ignores its argument, so this
// presumably does not affect the assertion — confirm.
test("tokenizeSubtitleService does not color 1-2 word sentences by default", async () => {
  // Builds a fresh token list on every tokenizer call, as the inline literal did.
  const buildTwoTokenSentence = () => [
    {
      surface: "私",
      reading: "ワタシ",
      headword: "私",
      startPos: 0,
      endPos: 1,
      partOfSpeech: PartOfSpeech.noun,
      isMerged: false,
      isKnown: true,
      isNPlusOneTarget: false,
    },
    {
      surface: "犬",
      reading: "イヌ",
      headword: "犬",
      startPos: 1,
      endPos: 2,
      partOfSpeech: PartOfSpeech.noun,
      isMerged: false,
      isKnown: false,
      isNPlusOneTarget: false,
    },
  ];

  const deps = makeDeps({
    tokenizeWithMecab: async () => buildTwoTokenSentence(),
  });
  const result = await tokenizeSubtitleService("猫です", deps);

  // No token may be flagged as the N+1 target.
  const anyTargetMarked = result.tokens?.some(
    (candidate) => candidate.isNPlusOneTarget,
  );
  assert.equal(anyTargetMarked, false);
});
test("tokenizeSubtitleService checks known words by headword, not surface", async () => {
const result = await tokenizeSubtitleService(
"猫です",

View File

@@ -59,6 +59,7 @@ export interface TokenizerServiceDeps {
getKnownWordMatchMode: () => NPlusOneMatchMode;
getJlptLevel: (text: string) => JlptLevel | null;
getJlptEnabled?: () => boolean;
getMinSentenceWordsForNPlusOne?: () => number;
tokenizeWithMecab: (text: string) => Promise<MergedToken[] | null>;
}
@@ -78,6 +79,7 @@ export interface TokenizerDepsRuntimeOptions {
getKnownWordMatchMode: () => NPlusOneMatchMode;
getJlptLevel: (text: string) => JlptLevel | null;
getJlptEnabled?: () => boolean;
getMinSentenceWordsForNPlusOne?: () => number;
getMecabTokenizer: () => MecabTokenizerLike | null;
}
@@ -133,6 +135,8 @@ export function createTokenizerDepsRuntimeService(
getKnownWordMatchMode: options.getKnownWordMatchMode,
getJlptLevel: options.getJlptLevel,
getJlptEnabled: options.getJlptEnabled,
getMinSentenceWordsForNPlusOne:
options.getMinSentenceWordsForNPlusOne ?? (() => 3),
tokenizeWithMecab: async (text) => {
const mecabTokenizer = options.getMecabTokenizer();
if (!mecabTokenizer) {
@@ -724,6 +728,14 @@ export async function tokenizeSubtitleService(
text: string,
deps: TokenizerServiceDeps,
): Promise<SubtitleData> {
const minSentenceWordsForNPlusOne = deps.getMinSentenceWordsForNPlusOne?.();
const sanitizedMinSentenceWordsForNPlusOne =
minSentenceWordsForNPlusOne !== undefined &&
Number.isInteger(minSentenceWordsForNPlusOne) &&
minSentenceWordsForNPlusOne > 0
? minSentenceWordsForNPlusOne
: 3;
const displayText = text
.replace(/\r\n/g, "\n")
.replace(/\\N/g, "\n")
@@ -750,7 +762,13 @@ export async function tokenizeSubtitleService(
const jlptMarkedTokens = jlptEnabled
? applyJlptMarking(knownMarkedTokens, deps.getJlptLevel)
: knownMarkedTokens.map((token) => ({ ...token, jlptLevel: undefined }));
return { text: displayText, tokens: markNPlusOneTargets(jlptMarkedTokens) };
return {
text: displayText,
tokens: markNPlusOneTargets(
jlptMarkedTokens,
sanitizedMinSentenceWordsForNPlusOne,
),
};
}
try {
@@ -764,7 +782,13 @@ export async function tokenizeSubtitleService(
const jlptMarkedTokens = jlptEnabled
? applyJlptMarking(knownMarkedTokens, deps.getJlptLevel)
: knownMarkedTokens.map((token) => ({ ...token, jlptLevel: undefined }));
return { text: displayText, tokens: markNPlusOneTargets(jlptMarkedTokens) };
return {
text: displayText,
tokens: markNPlusOneTargets(
jlptMarkedTokens,
sanitizedMinSentenceWordsForNPlusOne,
),
};
}
} catch (err) {
console.error("Tokenization error:", (err as Error).message);

View File

@@ -896,6 +896,8 @@ async function tokenizeSubtitle(text: string): Promise<SubtitleData> {
getKnownWordMatchMode: () =>
appState.ankiIntegration?.getKnownWordMatchMode() ??
getResolvedConfig().ankiConnect.nPlusOne.matchMode,
getMinSentenceWordsForNPlusOne: () =>
getResolvedConfig().ankiConnect.nPlusOne.minSentenceWords,
getJlptLevel: (text) => appState.jlptLevelLookup(text),
getJlptEnabled: () =>
getResolvedConfig().subtitleStyle.enableJlpt,

View File

@@ -305,7 +305,10 @@ function isSentenceBoundaryToken(token: MergedToken): boolean {
return SENTENCE_BOUNDARY_SURFACES.has(token.surface);
}
export function markNPlusOneTargets(tokens: MergedToken[]): MergedToken[] {
export function markNPlusOneTargets(
tokens: MergedToken[],
minSentenceWords = 3,
): MergedToken[] {
if (tokens.length === 0) {
return [];
}
@@ -316,16 +319,28 @@ export function markNPlusOneTargets(tokens: MergedToken[]): MergedToken[] {
}));
let sentenceStart = 0;
const minimumSentenceWords = Number.isInteger(minSentenceWords)
? Math.max(1, minSentenceWords)
: 3;
const markSentence = (start: number, endExclusive: number): void => {
const sentenceCandidates: number[] = [];
let sentenceWordCount = 0;
for (let i = start; i < endExclusive; i++) {
if (isNPlusOneCandidateToken(markedTokens[i])) {
const token = markedTokens[i];
if (!isSentenceBoundaryToken(token) && token.surface.trim().length > 0) {
sentenceWordCount += 1;
}
if (isNPlusOneCandidateToken(token)) {
sentenceCandidates.push(i);
}
}
if (sentenceCandidates.length === 1) {
if (
sentenceWordCount >= minimumSentenceWords &&
sentenceCandidates.length === 1
) {
markedTokens[sentenceCandidates[0]] = {
...markedTokens[sentenceCandidates[0]],
isNPlusOneTarget: true,

View File

@@ -239,6 +239,7 @@ export interface AnkiConnectConfig {
decks?: string[];
nPlusOne?: string;
knownWord?: string;
minSentenceWords?: number;
};
behavior?: {
overwriteAudio?: boolean;
@@ -399,6 +400,7 @@ export interface ResolvedConfig {
decks: string[];
nPlusOne: string;
knownWord: string;
minSentenceWords: number;
};
behavior: {
overwriteAudio: boolean;