diff --git a/docs/configuration.md b/docs/configuration.md index 3f048f4..ec4dac0 100644 --- a/docs/configuration.md +++ b/docs/configuration.md @@ -173,6 +173,7 @@ This example is intentionally compact. The option table below documents availabl | `ankiConnect.nPlusOne.nPlusOne` | hex color string | Text color for the single target token to study when exactly one unknown candidate exists in a sentence (default: `"#c6a0f6"`). | | `ankiConnect.nPlusOne.knownWord` | hex color string | Legacy known-word color kept for backward compatibility (default: `"#a6da95"`). | | `ankiConnect.nPlusOne.matchMode` | `"headword"`, `"surface"` | Matching strategy for known-word highlighting (default: `"headword"`). `headword` uses token headwords; `surface` uses visible subtitle text. | +| `ankiConnect.nPlusOne.minSentenceWords` | number | Minimum number of words required in a sentence before single unknown-word N+1 highlighting can trigger (default: `3`). | | `ankiConnect.nPlusOne.refreshMinutes` | number | Minutes between known-word cache refreshes (default: `1440`) | | `ankiConnect.nPlusOne.decks` | array of strings | Decks used by known-word cache refresh. Leave empty for compatibility with legacy `deck` scope. | | `behavior.notificationType` | `"osd"`, `"system"`, `"both"`, `"none"` | Notification type on card update (default: `"osd"`) | @@ -198,6 +199,7 @@ Known-word cache policy: - Initial sync runs when the integration starts if the cache is missing or stale. - `ankiConnect.nPlusOne.refreshMinutes` controls the minimum time between refreshes; between refreshes, cached words are reused without querying Anki. - `ankiConnect.nPlusOne.nPlusOne` sets the color for the single target token when exactly one eligible unknown word exists. +- `ankiConnect.nPlusOne.minSentenceWords` sets the minimum token count required in a sentence for N+1 highlighting (default: `3`). - `ankiConnect.nPlusOne.knownWord` sets the legacy known-word highlight color for tokens already in Anki. - `ankiConnect.nPlusOne.decks` accepts one or more decks. If empty, it uses the legacy single `ankiConnect.deck` value as scope. - Cache state is persisted to `known-words-cache.json` under the app `userData` directory. diff --git a/docs/mining-workflow.md b/docs/mining-workflow.md index 8ed8927..768685b 100644 --- a/docs/mining-workflow.md +++ b/docs/mining-workflow.md @@ -172,6 +172,7 @@ When enabled, SubMiner highlights words you already know in your Anki deck, maki "highlightEnabled": true, "refreshMinutes": 1440, "matchMode": "headword", + "minSentenceWords": 3, "decks": ["Learning::Japanese"] } } @@ -183,6 +184,7 @@ When enabled, SubMiner highlights words you already know in your Anki deck, maki | `highlightEnabled` | Turn on/off the highlighting feature | | `refreshMinutes` | How often to refresh the known-word cache (default: 1440 = daily) | | `matchMode` | `"headword"` (dictionary form) or `"surface"` (exact text match) | +| `minSentenceWords` | Minimum sentence length in tokens required to allow N+1 highlighting (default: `3`) | | `decks` | Which Anki decks to consider "known" (empty = uses `ankiConnect.deck`) | ### Use Cases diff --git a/docs/public/config.example.jsonc b/docs/public/config.example.jsonc index fdf142c..c488ca0 100644 --- a/docs/public/config.example.jsonc +++ b/docs/public/config.example.jsonc @@ -87,6 +87,7 @@ "refreshMinutes": 1440, "matchMode": "headword", "decks": [], + "minSentenceWords": 3, "nPlusOne": "#c6a0f6", "knownWord": "#a6da95" }, diff --git a/src/config/config.test.ts b/src/config/config.test.ts index d887267..559a040 100644 --- a/src/config/config.test.ts +++ b/src/config/config.test.ts @@ -137,6 +137,55 @@ test("accepts valid ankiConnect n+1 behavior values", () => { assert.equal(config.ankiConnect.nPlusOne.refreshMinutes, 120); }); +test("validates ankiConnect n+1 minimum sentence word count", () => { + const dir = makeTempDir(); + fs.writeFileSync( + path.join(dir, "config.jsonc"), + `{ + "ankiConnect": { + "nPlusOne": { + "minSentenceWords": 0 + } + } + }`, + "utf-8", + ); + + const service = new ConfigService(dir); + const config = service.getConfig(); + const warnings = service.getWarnings(); + + assert.equal( + config.ankiConnect.nPlusOne.minSentenceWords, + DEFAULT_CONFIG.ankiConnect.nPlusOne.minSentenceWords, + ); + assert.ok( + warnings.some( + (warning) => warning.path === "ankiConnect.nPlusOne.minSentenceWords", + ), + ); +}); + +test("accepts valid ankiConnect n+1 minimum sentence word count", () => { + const dir = makeTempDir(); + fs.writeFileSync( + path.join(dir, "config.jsonc"), + `{ + "ankiConnect": { + "nPlusOne": { + "minSentenceWords": 4 + } + } + }`, + "utf-8", + ); + + const service = new ConfigService(dir); + const config = service.getConfig(); + + assert.equal(config.ankiConnect.nPlusOne.minSentenceWords, 4); +}); + test("validates ankiConnect n+1 match mode values", () => { const dir = makeTempDir(); fs.writeFileSync( @@ -328,5 +377,6 @@ test("template generator includes known keys", () => { assert.match(output, /"nPlusOne"\s*:\s*\{/); assert.match(output, /"nPlusOne": "#c6a0f6"/); assert.match(output, /"knownWord": "#a6da95"/); + assert.match(output, /"minSentenceWords": 3/); assert.match(output, /auto-generated from src\/config\/definitions.ts/); }); diff --git a/src/config/definitions.ts b/src/config/definitions.ts index d5b1685..be4eb70 100644 --- a/src/config/definitions.ts +++ b/src/config/definitions.ts @@ -128,6 +128,7 @@ export const DEFAULT_CONFIG: ResolvedConfig = { refreshMinutes: 1440, matchMode: "headword", decks: [], + minSentenceWords: 3, nPlusOne: "#c6a0f6", knownWord: "#a6da95", }, @@ -333,6 +334,13 @@ export const CONFIG_OPTION_REGISTRY: ConfigOptionRegistryEntry[] = [ defaultValue: DEFAULT_CONFIG.ankiConnect.nPlusOne.refreshMinutes, description: "Minutes between known-word cache refreshes.", }, + { + path: "ankiConnect.nPlusOne.minSentenceWords", + kind: "number", + defaultValue: DEFAULT_CONFIG.ankiConnect.nPlusOne.minSentenceWords, + description: + "Minimum sentence word count required for N+1 targeting (default: 3).", + }, { path: "ankiConnect.nPlusOne.decks", kind: "array", diff --git a/src/config/service.ts b/src/config/service.ts index 2007438..0a46428 100644 --- a/src/config/service.ts +++ b/src/config/service.ts @@ -698,6 +698,32 @@ export class ConfigService { DEFAULT_CONFIG.ankiConnect.nPlusOne.refreshMinutes; } + const nPlusOneMinSentenceWords = asNumber( + nPlusOneConfig.minSentenceWords, + ); + const hasValidNPlusOneMinSentenceWords = + nPlusOneMinSentenceWords !== undefined && + Number.isInteger(nPlusOneMinSentenceWords) && + nPlusOneMinSentenceWords > 0; + if (nPlusOneMinSentenceWords !== undefined) { + if (hasValidNPlusOneMinSentenceWords) { + resolved.ankiConnect.nPlusOne.minSentenceWords = + nPlusOneMinSentenceWords; + } else { + warn( + "ankiConnect.nPlusOne.minSentenceWords", + nPlusOneConfig.minSentenceWords, + resolved.ankiConnect.nPlusOne.minSentenceWords, + "Expected a positive integer.", + ); + resolved.ankiConnect.nPlusOne.minSentenceWords = + DEFAULT_CONFIG.ankiConnect.nPlusOne.minSentenceWords; + } + } else { + resolved.ankiConnect.nPlusOne.minSentenceWords = + DEFAULT_CONFIG.ankiConnect.nPlusOne.minSentenceWords; + } + const nPlusOneMatchMode = asString(nPlusOneConfig.matchMode); const legacyNPlusOneMatchMode = asString(behavior.nPlusOneMatchMode); const hasValidNPlusOneMatchMode = diff --git a/src/core/services/tokenizer-service.test.ts b/src/core/services/tokenizer-service.test.ts index 3ce56f1..3fea01f 100644 --- a/src/core/services/tokenizer-service.test.ts +++ b/src/core/services/tokenizer-service.test.ts @@ -476,10 +476,11 @@ test("tokenizeSubtitleService selects one N+1 target token", async () => { endPos: 2, partOfSpeech: PartOfSpeech.noun, isMerged: false, - isKnown: false, + isKnown: false, isNPlusOneTarget: false, }, ], + getMinSentenceWordsForNPlusOne: () => 2, }), ); @@ -561,6 +562,7 @@ test("tokenizeSubtitleService applies N+1 target marking to Yomitan results", as getYomitanParserWindow: () => parserWindow, tokenizeWithMecab: async () => null, isKnownWord: (text) => text === "です", + getMinSentenceWordsForNPlusOne: () => 2, }), ); @@ -571,6 +573,43 @@ test("tokenizeSubtitleService applies N+1 target marking to Yomitan results", as assert.equal(result.tokens?.[1]?.isNPlusOneTarget, false); }); +test("tokenizeSubtitleService does not color 1-2 word sentences by default", async () => { + const result = await tokenizeSubtitleService( + "猫です", + makeDeps({ + tokenizeWithMecab: async () => [ + { + surface: "私", + reading: "ワタシ", + headword: "私", + startPos: 0, + endPos: 1, + partOfSpeech: PartOfSpeech.noun, + isMerged: false, + isKnown: true, + isNPlusOneTarget: false, + }, + { + surface: "犬", + reading: "イヌ", + headword: "犬", + startPos: 1, + endPos: 2, + partOfSpeech: PartOfSpeech.noun, + isMerged: false, + isKnown: false, + isNPlusOneTarget: false, + }, + ], + }), + ); + + assert.equal( + result.tokens?.some((token) => token.isNPlusOneTarget), + false, + ); +}); + test("tokenizeSubtitleService checks known words by headword, not surface", async () => { const result = await tokenizeSubtitleService( "猫です", diff --git a/src/core/services/tokenizer-service.ts b/src/core/services/tokenizer-service.ts index a9fdabe..0cac83e 100644 --- a/src/core/services/tokenizer-service.ts +++ b/src/core/services/tokenizer-service.ts @@ -59,6 +59,7 @@ export interface TokenizerServiceDeps { getKnownWordMatchMode: () => NPlusOneMatchMode; getJlptLevel: (text: string) => JlptLevel | null; getJlptEnabled?: () => boolean; + getMinSentenceWordsForNPlusOne?: () => number; tokenizeWithMecab: (text: string) => Promise; } @@ -78,6 +79,7 @@ export interface TokenizerDepsRuntimeOptions { getKnownWordMatchMode: () => NPlusOneMatchMode; getJlptLevel: (text: string) => JlptLevel | null; getJlptEnabled?: () => boolean; + getMinSentenceWordsForNPlusOne?: () => number; getMecabTokenizer: () => MecabTokenizerLike | null; } @@ -133,6 +135,8 @@ export function createTokenizerDepsRuntimeService( getKnownWordMatchMode: options.getKnownWordMatchMode, getJlptLevel: options.getJlptLevel, getJlptEnabled: options.getJlptEnabled, + getMinSentenceWordsForNPlusOne: + options.getMinSentenceWordsForNPlusOne ?? (() => 3), tokenizeWithMecab: async (text) => { const mecabTokenizer = options.getMecabTokenizer(); if (!mecabTokenizer) { @@ -724,6 +728,14 @@ export async function tokenizeSubtitleService( text: string, deps: TokenizerServiceDeps, ): Promise { + const minSentenceWordsForNPlusOne = deps.getMinSentenceWordsForNPlusOne?.(); + const sanitizedMinSentenceWordsForNPlusOne = + minSentenceWordsForNPlusOne !== undefined && + Number.isInteger(minSentenceWordsForNPlusOne) && + minSentenceWordsForNPlusOne > 0 + ? minSentenceWordsForNPlusOne + : 3; + const displayText = text .replace(/\r\n/g, "\n") .replace(/\\N/g, "\n") @@ -747,10 +759,16 @@ export async function tokenizeSubtitleService( deps.isKnownWord, deps.getKnownWordMatchMode(), ); - const jlptMarkedTokens = jlptEnabled - ? applyJlptMarking(knownMarkedTokens, deps.getJlptLevel) - : knownMarkedTokens.map((token) => ({ ...token, jlptLevel: undefined })); - return { text: displayText, tokens: markNPlusOneTargets(jlptMarkedTokens) }; + const jlptMarkedTokens = jlptEnabled + ? applyJlptMarking(knownMarkedTokens, deps.getJlptLevel) + : knownMarkedTokens.map((token) => ({ ...token, jlptLevel: undefined })); + return { + text: displayText, + tokens: markNPlusOneTargets( + jlptMarkedTokens, + sanitizedMinSentenceWordsForNPlusOne, + ), + }; } try { @@ -764,7 +782,13 @@ export async function tokenizeSubtitleService( const jlptMarkedTokens = jlptEnabled ? applyJlptMarking(knownMarkedTokens, deps.getJlptLevel) : knownMarkedTokens.map((token) => ({ ...token, jlptLevel: undefined })); - return { text: displayText, tokens: markNPlusOneTargets(jlptMarkedTokens) }; + return { + text: displayText, + tokens: markNPlusOneTargets( + jlptMarkedTokens, + sanitizedMinSentenceWordsForNPlusOne, + ), + }; } } catch (err) { console.error("Tokenization error:", (err as Error).message); diff --git a/src/main.ts b/src/main.ts index 4b2324c..a58f00a 100644 --- a/src/main.ts +++ b/src/main.ts @@ -896,6 +896,8 @@ async function tokenizeSubtitle(text: string): Promise { getKnownWordMatchMode: () => appState.ankiIntegration?.getKnownWordMatchMode() ?? getResolvedConfig().ankiConnect.nPlusOne.matchMode, + getMinSentenceWordsForNPlusOne: () => + getResolvedConfig().ankiConnect.nPlusOne.minSentenceWords, getJlptLevel: (text) => appState.jlptLevelLookup(text), getJlptEnabled: () => getResolvedConfig().subtitleStyle.enableJlpt, diff --git a/src/token-merger.ts b/src/token-merger.ts index 6176bde..192f015 100644 --- a/src/token-merger.ts +++ b/src/token-merger.ts @@ -305,7 +305,10 @@ function isSentenceBoundaryToken(token: MergedToken): boolean { return SENTENCE_BOUNDARY_SURFACES.has(token.surface); } -export function markNPlusOneTargets(tokens: MergedToken[]): MergedToken[] { +export function markNPlusOneTargets( + tokens: MergedToken[], + minSentenceWords = 3, +): MergedToken[] { if (tokens.length === 0) { return []; } @@ -316,16 +319,28 @@ export function markNPlusOneTargets(tokens: MergedToken[]): MergedToken[] { })); let sentenceStart = 0; + const minimumSentenceWords = Number.isInteger(minSentenceWords) + ? Math.max(1, minSentenceWords) + : 3; const markSentence = (start: number, endExclusive: number): void => { const sentenceCandidates: number[] = []; + let sentenceWordCount = 0; for (let i = start; i < endExclusive; i++) { - if (isNPlusOneCandidateToken(markedTokens[i])) { + const token = markedTokens[i]; + if (!isSentenceBoundaryToken(token) && token.surface.trim().length > 0) { + sentenceWordCount += 1; + } + + if (isNPlusOneCandidateToken(token)) { sentenceCandidates.push(i); } } - if (sentenceCandidates.length === 1) { + if ( + sentenceWordCount >= minimumSentenceWords && + sentenceCandidates.length === 1 + ) { markedTokens[sentenceCandidates[0]] = { ...markedTokens[sentenceCandidates[0]], isNPlusOneTarget: true, diff --git a/src/types.ts b/src/types.ts index 692cc1f..07826a3 100644 --- a/src/types.ts +++ b/src/types.ts @@ -239,6 +239,7 @@ export interface AnkiConnectConfig { decks?: string[]; nPlusOne?: string; knownWord?: string; + minSentenceWords?: number; }; behavior?: { overwriteAudio?: boolean; @@ -399,6 +400,7 @@ export interface ResolvedConfig { decks: string[]; nPlusOne: string; knownWord: string; + minSentenceWords: number; }; behavior: { overwriteAudio: boolean;