mirror of
https://github.com/ksyasuda/SubMiner.git
synced 2026-02-27 18:22:41 -08:00
Add configurable minimum sentence length for N+1 targets
This commit is contained in:
@@ -173,6 +173,7 @@ This example is intentionally compact. The option table below documents availabl
|
||||
| `ankiConnect.nPlusOne.nPlusOne` | hex color string | Text color for the single target token to study when exactly one unknown candidate exists in a sentence (default: `"#c6a0f6"`). |
|
||||
| `ankiConnect.nPlusOne.knownWord` | hex color string | Legacy known-word color kept for backward compatibility (default: `"#a6da95"`). |
|
||||
| `ankiConnect.nPlusOne.matchMode` | `"headword"`, `"surface"` | Matching strategy for known-word highlighting (default: `"headword"`). `headword` uses token headwords; `surface` uses visible subtitle text. |
|
||||
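| `ankiConnect.nPlusOne.minSentenceWords` | number | Minimum number of words (tokens, not counting sentence-boundary tokens or whitespace-only tokens) a sentence must contain before single unknown-word N+1 highlighting can trigger. Must be a positive integer; any other value produces a config warning and falls back to the default (default: `3`). |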
|
||||
| `ankiConnect.nPlusOne.refreshMinutes` | number | Minutes between known-word cache refreshes (default: `1440`). |
|
||||
| `ankiConnect.nPlusOne.decks` | array of strings | Decks used by known-word cache refresh. Leave empty for compatibility with legacy `deck` scope. |
|
||||
| `behavior.notificationType` | `"osd"`, `"system"`, `"both"`, `"none"` | Notification type on card update (default: `"osd"`) |
|
||||
@@ -198,6 +199,7 @@ Known-word cache policy:
|
||||
- Initial sync runs when the integration starts if the cache is missing or stale.
|
||||
- `ankiConnect.nPlusOne.refreshMinutes` controls the minimum time between refreshes; between refreshes, cached words are reused without querying Anki.
|
||||
- `ankiConnect.nPlusOne.nPlusOne` sets the color for the single target token when exactly one eligible unknown word exists.
|
||||
- `ankiConnect.nPlusOne.minSentenceWords` sets the minimum number of word tokens a sentence must contain before N+1 highlighting is applied; shorter sentences are never given an N+1 target (default: `3`).
|
||||
- `ankiConnect.nPlusOne.knownWord` sets the legacy known-word highlight color for tokens already in Anki.
|
||||
- `ankiConnect.nPlusOne.decks` accepts one or more decks. If empty, it uses the legacy single `ankiConnect.deck` value as scope.
|
||||
- Cache state is persisted to `known-words-cache.json` under the app `userData` directory.
|
||||
|
||||
@@ -172,6 +172,7 @@ When enabled, SubMiner highlights words you already know in your Anki deck, maki
|
||||
"highlightEnabled": true,
|
||||
"refreshMinutes": 1440,
|
||||
"matchMode": "headword",
|
||||
"minSentenceWords": 3,
|
||||
"decks": ["Learning::Japanese"]
|
||||
}
|
||||
}
|
||||
@@ -183,6 +184,7 @@ When enabled, SubMiner highlights words you already know in your Anki deck, maki
|
||||
| `highlightEnabled` | Turn on/off the highlighting feature |
|
||||
| `refreshMinutes` | How often to refresh the known-word cache (default: 1440 = daily) |
|
||||
| `matchMode` | `"headword"` (dictionary form) or `"surface"` (exact text match) |
|
||||
| `minSentenceWords` | Minimum sentence length in tokens required to allow N+1 highlighting (default: `3`) |
|
||||
| `decks` | Which Anki decks to consider "known" (empty = uses `ankiConnect.deck`) |
|
||||
|
||||
### Use Cases
|
||||
|
||||
@@ -87,6 +87,7 @@
|
||||
"refreshMinutes": 1440,
|
||||
"matchMode": "headword",
|
||||
"decks": [],
|
||||
"minSentenceWords": 3,
|
||||
"nPlusOne": "#c6a0f6",
|
||||
"knownWord": "#a6da95"
|
||||
},
|
||||
|
||||
@@ -137,6 +137,55 @@ test("accepts valid ankiConnect n+1 behavior values", () => {
|
||||
assert.equal(config.ankiConnect.nPlusOne.refreshMinutes, 120);
|
||||
});
|
||||
|
||||
test("validates ankiConnect n+1 minimum sentence word count", () => {
|
||||
const dir = makeTempDir();
|
||||
fs.writeFileSync(
|
||||
path.join(dir, "config.jsonc"),
|
||||
`{
|
||||
"ankiConnect": {
|
||||
"nPlusOne": {
|
||||
"minSentenceWords": 0
|
||||
}
|
||||
}
|
||||
}`,
|
||||
"utf-8",
|
||||
);
|
||||
|
||||
const service = new ConfigService(dir);
|
||||
const config = service.getConfig();
|
||||
const warnings = service.getWarnings();
|
||||
|
||||
assert.equal(
|
||||
config.ankiConnect.nPlusOne.minSentenceWords,
|
||||
DEFAULT_CONFIG.ankiConnect.nPlusOne.minSentenceWords,
|
||||
);
|
||||
assert.ok(
|
||||
warnings.some(
|
||||
(warning) => warning.path === "ankiConnect.nPlusOne.minSentenceWords",
|
||||
),
|
||||
);
|
||||
});
|
||||
|
||||
test("accepts valid ankiConnect n+1 minimum sentence word count", () => {
|
||||
const dir = makeTempDir();
|
||||
fs.writeFileSync(
|
||||
path.join(dir, "config.jsonc"),
|
||||
`{
|
||||
"ankiConnect": {
|
||||
"nPlusOne": {
|
||||
"minSentenceWords": 4
|
||||
}
|
||||
}
|
||||
}`,
|
||||
"utf-8",
|
||||
);
|
||||
|
||||
const service = new ConfigService(dir);
|
||||
const config = service.getConfig();
|
||||
|
||||
assert.equal(config.ankiConnect.nPlusOne.minSentenceWords, 4);
|
||||
});
|
||||
|
||||
test("validates ankiConnect n+1 match mode values", () => {
|
||||
const dir = makeTempDir();
|
||||
fs.writeFileSync(
|
||||
@@ -328,5 +377,6 @@ test("template generator includes known keys", () => {
|
||||
assert.match(output, /"nPlusOne"\s*:\s*\{/);
|
||||
assert.match(output, /"nPlusOne": "#c6a0f6"/);
|
||||
assert.match(output, /"knownWord": "#a6da95"/);
|
||||
assert.match(output, /"minSentenceWords": 3/);
|
||||
assert.match(output, /auto-generated from src\/config\/definitions.ts/);
|
||||
});
|
||||
|
||||
@@ -128,6 +128,7 @@ export const DEFAULT_CONFIG: ResolvedConfig = {
|
||||
refreshMinutes: 1440,
|
||||
matchMode: "headword",
|
||||
decks: [],
|
||||
minSentenceWords: 3,
|
||||
nPlusOne: "#c6a0f6",
|
||||
knownWord: "#a6da95",
|
||||
},
|
||||
@@ -333,6 +334,13 @@ export const CONFIG_OPTION_REGISTRY: ConfigOptionRegistryEntry[] = [
|
||||
defaultValue: DEFAULT_CONFIG.ankiConnect.nPlusOne.refreshMinutes,
|
||||
description: "Minutes between known-word cache refreshes.",
|
||||
},
|
||||
{
|
||||
path: "ankiConnect.nPlusOne.minSentenceWords",
|
||||
kind: "number",
|
||||
defaultValue: DEFAULT_CONFIG.ankiConnect.nPlusOne.minSentenceWords,
|
||||
description:
|
||||
"Minimum sentence word count required for N+1 targeting (default: 3).",
|
||||
},
|
||||
{
|
||||
path: "ankiConnect.nPlusOne.decks",
|
||||
kind: "array",
|
||||
|
||||
@@ -698,6 +698,32 @@ export class ConfigService {
|
||||
DEFAULT_CONFIG.ankiConnect.nPlusOne.refreshMinutes;
|
||||
}
|
||||
|
||||
const nPlusOneMinSentenceWords = asNumber(
|
||||
nPlusOneConfig.minSentenceWords,
|
||||
);
|
||||
const hasValidNPlusOneMinSentenceWords =
|
||||
nPlusOneMinSentenceWords !== undefined &&
|
||||
Number.isInteger(nPlusOneMinSentenceWords) &&
|
||||
nPlusOneMinSentenceWords > 0;
|
||||
if (nPlusOneMinSentenceWords !== undefined) {
|
||||
if (hasValidNPlusOneMinSentenceWords) {
|
||||
resolved.ankiConnect.nPlusOne.minSentenceWords =
|
||||
nPlusOneMinSentenceWords;
|
||||
} else {
|
||||
warn(
|
||||
"ankiConnect.nPlusOne.minSentenceWords",
|
||||
nPlusOneConfig.minSentenceWords,
|
||||
resolved.ankiConnect.nPlusOne.minSentenceWords,
|
||||
"Expected a positive integer.",
|
||||
);
|
||||
resolved.ankiConnect.nPlusOne.minSentenceWords =
|
||||
DEFAULT_CONFIG.ankiConnect.nPlusOne.minSentenceWords;
|
||||
}
|
||||
} else {
|
||||
resolved.ankiConnect.nPlusOne.minSentenceWords =
|
||||
DEFAULT_CONFIG.ankiConnect.nPlusOne.minSentenceWords;
|
||||
}
|
||||
|
||||
const nPlusOneMatchMode = asString(nPlusOneConfig.matchMode);
|
||||
const legacyNPlusOneMatchMode = asString(behavior.nPlusOneMatchMode);
|
||||
const hasValidNPlusOneMatchMode =
|
||||
|
||||
@@ -476,10 +476,11 @@ test("tokenizeSubtitleService selects one N+1 target token", async () => {
|
||||
endPos: 2,
|
||||
partOfSpeech: PartOfSpeech.noun,
|
||||
isMerged: false,
|
||||
isKnown: false,
|
||||
isKnown: false,
|
||||
isNPlusOneTarget: false,
|
||||
},
|
||||
],
|
||||
getMinSentenceWordsForNPlusOne: () => 2,
|
||||
}),
|
||||
);
|
||||
|
||||
@@ -561,6 +562,7 @@ test("tokenizeSubtitleService applies N+1 target marking to Yomitan results", as
|
||||
getYomitanParserWindow: () => parserWindow,
|
||||
tokenizeWithMecab: async () => null,
|
||||
isKnownWord: (text) => text === "です",
|
||||
getMinSentenceWordsForNPlusOne: () => 2,
|
||||
}),
|
||||
);
|
||||
|
||||
@@ -571,6 +573,43 @@ test("tokenizeSubtitleService applies N+1 target marking to Yomitan results", as
|
||||
assert.equal(result.tokens?.[1]?.isNPlusOneTarget, false);
|
||||
});
|
||||
|
||||
test("tokenizeSubtitleService does not color 1-2 word sentences by default", async () => {
|
||||
const result = await tokenizeSubtitleService(
|
||||
"猫です",
|
||||
makeDeps({
|
||||
tokenizeWithMecab: async () => [
|
||||
{
|
||||
surface: "私",
|
||||
reading: "ワタシ",
|
||||
headword: "私",
|
||||
startPos: 0,
|
||||
endPos: 1,
|
||||
partOfSpeech: PartOfSpeech.noun,
|
||||
isMerged: false,
|
||||
isKnown: true,
|
||||
isNPlusOneTarget: false,
|
||||
},
|
||||
{
|
||||
surface: "犬",
|
||||
reading: "イヌ",
|
||||
headword: "犬",
|
||||
startPos: 1,
|
||||
endPos: 2,
|
||||
partOfSpeech: PartOfSpeech.noun,
|
||||
isMerged: false,
|
||||
isKnown: false,
|
||||
isNPlusOneTarget: false,
|
||||
},
|
||||
],
|
||||
}),
|
||||
);
|
||||
|
||||
assert.equal(
|
||||
result.tokens?.some((token) => token.isNPlusOneTarget),
|
||||
false,
|
||||
);
|
||||
});
|
||||
|
||||
test("tokenizeSubtitleService checks known words by headword, not surface", async () => {
|
||||
const result = await tokenizeSubtitleService(
|
||||
"猫です",
|
||||
|
||||
@@ -59,6 +59,7 @@ export interface TokenizerServiceDeps {
|
||||
getKnownWordMatchMode: () => NPlusOneMatchMode;
|
||||
getJlptLevel: (text: string) => JlptLevel | null;
|
||||
getJlptEnabled?: () => boolean;
|
||||
getMinSentenceWordsForNPlusOne?: () => number;
|
||||
tokenizeWithMecab: (text: string) => Promise<MergedToken[] | null>;
|
||||
}
|
||||
|
||||
@@ -78,6 +79,7 @@ export interface TokenizerDepsRuntimeOptions {
|
||||
getKnownWordMatchMode: () => NPlusOneMatchMode;
|
||||
getJlptLevel: (text: string) => JlptLevel | null;
|
||||
getJlptEnabled?: () => boolean;
|
||||
getMinSentenceWordsForNPlusOne?: () => number;
|
||||
getMecabTokenizer: () => MecabTokenizerLike | null;
|
||||
}
|
||||
|
||||
@@ -133,6 +135,8 @@ export function createTokenizerDepsRuntimeService(
|
||||
getKnownWordMatchMode: options.getKnownWordMatchMode,
|
||||
getJlptLevel: options.getJlptLevel,
|
||||
getJlptEnabled: options.getJlptEnabled,
|
||||
getMinSentenceWordsForNPlusOne:
|
||||
options.getMinSentenceWordsForNPlusOne ?? (() => 3),
|
||||
tokenizeWithMecab: async (text) => {
|
||||
const mecabTokenizer = options.getMecabTokenizer();
|
||||
if (!mecabTokenizer) {
|
||||
@@ -724,6 +728,14 @@ export async function tokenizeSubtitleService(
|
||||
text: string,
|
||||
deps: TokenizerServiceDeps,
|
||||
): Promise<SubtitleData> {
|
||||
const minSentenceWordsForNPlusOne = deps.getMinSentenceWordsForNPlusOne?.();
|
||||
const sanitizedMinSentenceWordsForNPlusOne =
|
||||
minSentenceWordsForNPlusOne !== undefined &&
|
||||
Number.isInteger(minSentenceWordsForNPlusOne) &&
|
||||
minSentenceWordsForNPlusOne > 0
|
||||
? minSentenceWordsForNPlusOne
|
||||
: 3;
|
||||
|
||||
const displayText = text
|
||||
.replace(/\r\n/g, "\n")
|
||||
.replace(/\\N/g, "\n")
|
||||
@@ -747,10 +759,16 @@ export async function tokenizeSubtitleService(
|
||||
deps.isKnownWord,
|
||||
deps.getKnownWordMatchMode(),
|
||||
);
|
||||
const jlptMarkedTokens = jlptEnabled
|
||||
? applyJlptMarking(knownMarkedTokens, deps.getJlptLevel)
|
||||
: knownMarkedTokens.map((token) => ({ ...token, jlptLevel: undefined }));
|
||||
return { text: displayText, tokens: markNPlusOneTargets(jlptMarkedTokens) };
|
||||
const jlptMarkedTokens = jlptEnabled
|
||||
? applyJlptMarking(knownMarkedTokens, deps.getJlptLevel)
|
||||
: knownMarkedTokens.map((token) => ({ ...token, jlptLevel: undefined }));
|
||||
return {
|
||||
text: displayText,
|
||||
tokens: markNPlusOneTargets(
|
||||
jlptMarkedTokens,
|
||||
sanitizedMinSentenceWordsForNPlusOne,
|
||||
),
|
||||
};
|
||||
}
|
||||
|
||||
try {
|
||||
@@ -764,7 +782,13 @@ export async function tokenizeSubtitleService(
|
||||
const jlptMarkedTokens = jlptEnabled
|
||||
? applyJlptMarking(knownMarkedTokens, deps.getJlptLevel)
|
||||
: knownMarkedTokens.map((token) => ({ ...token, jlptLevel: undefined }));
|
||||
return { text: displayText, tokens: markNPlusOneTargets(jlptMarkedTokens) };
|
||||
return {
|
||||
text: displayText,
|
||||
tokens: markNPlusOneTargets(
|
||||
jlptMarkedTokens,
|
||||
sanitizedMinSentenceWordsForNPlusOne,
|
||||
),
|
||||
};
|
||||
}
|
||||
} catch (err) {
|
||||
console.error("Tokenization error:", (err as Error).message);
|
||||
|
||||
@@ -896,6 +896,8 @@ async function tokenizeSubtitle(text: string): Promise<SubtitleData> {
|
||||
getKnownWordMatchMode: () =>
|
||||
appState.ankiIntegration?.getKnownWordMatchMode() ??
|
||||
getResolvedConfig().ankiConnect.nPlusOne.matchMode,
|
||||
getMinSentenceWordsForNPlusOne: () =>
|
||||
getResolvedConfig().ankiConnect.nPlusOne.minSentenceWords,
|
||||
getJlptLevel: (text) => appState.jlptLevelLookup(text),
|
||||
getJlptEnabled: () =>
|
||||
getResolvedConfig().subtitleStyle.enableJlpt,
|
||||
|
||||
@@ -305,7 +305,10 @@ function isSentenceBoundaryToken(token: MergedToken): boolean {
|
||||
return SENTENCE_BOUNDARY_SURFACES.has(token.surface);
|
||||
}
|
||||
|
||||
export function markNPlusOneTargets(tokens: MergedToken[]): MergedToken[] {
|
||||
export function markNPlusOneTargets(
|
||||
tokens: MergedToken[],
|
||||
minSentenceWords = 3,
|
||||
): MergedToken[] {
|
||||
if (tokens.length === 0) {
|
||||
return [];
|
||||
}
|
||||
@@ -316,16 +319,28 @@ export function markNPlusOneTargets(tokens: MergedToken[]): MergedToken[] {
|
||||
}));
|
||||
|
||||
let sentenceStart = 0;
|
||||
const minimumSentenceWords = Number.isInteger(minSentenceWords)
|
||||
? Math.max(1, minSentenceWords)
|
||||
: 3;
|
||||
|
||||
const markSentence = (start: number, endExclusive: number): void => {
|
||||
const sentenceCandidates: number[] = [];
|
||||
let sentenceWordCount = 0;
|
||||
for (let i = start; i < endExclusive; i++) {
|
||||
if (isNPlusOneCandidateToken(markedTokens[i])) {
|
||||
const token = markedTokens[i];
|
||||
if (!isSentenceBoundaryToken(token) && token.surface.trim().length > 0) {
|
||||
sentenceWordCount += 1;
|
||||
}
|
||||
|
||||
if (isNPlusOneCandidateToken(token)) {
|
||||
sentenceCandidates.push(i);
|
||||
}
|
||||
}
|
||||
|
||||
if (sentenceCandidates.length === 1) {
|
||||
if (
|
||||
sentenceWordCount >= minimumSentenceWords &&
|
||||
sentenceCandidates.length === 1
|
||||
) {
|
||||
markedTokens[sentenceCandidates[0]] = {
|
||||
...markedTokens[sentenceCandidates[0]],
|
||||
isNPlusOneTarget: true,
|
||||
|
||||
@@ -239,6 +239,7 @@ export interface AnkiConnectConfig {
|
||||
decks?: string[];
|
||||
nPlusOne?: string;
|
||||
knownWord?: string;
|
||||
minSentenceWords?: number;
|
||||
};
|
||||
behavior?: {
|
||||
overwriteAudio?: boolean;
|
||||
@@ -399,6 +400,7 @@ export interface ResolvedConfig {
|
||||
decks: string[];
|
||||
nPlusOne: string;
|
||||
knownWord: string;
|
||||
minSentenceWords: number;
|
||||
};
|
||||
behavior: {
|
||||
overwriteAudio: boolean;
|
||||
|
||||
Reference in New Issue
Block a user