Add configurable minimum sentence length for N+1 targets

This commit is contained in:
2026-02-15 18:34:10 -08:00
parent f1b5082801
commit 667bde944c
11 changed files with 180 additions and 9 deletions

View File

@@ -476,10 +476,11 @@ test("tokenizeSubtitleService selects one N+1 target token", async () => {
endPos: 2,
partOfSpeech: PartOfSpeech.noun,
isMerged: false,
isKnown: false,
isKnown: false,
isNPlusOneTarget: false,
},
],
getMinSentenceWordsForNPlusOne: () => 2,
}),
);
@@ -561,6 +562,7 @@ test("tokenizeSubtitleService applies N+1 target marking to Yomitan results", as
getYomitanParserWindow: () => parserWindow,
tokenizeWithMecab: async () => null,
isKnownWord: (text) => text === "です",
getMinSentenceWordsForNPlusOne: () => 2,
}),
);
@@ -571,6 +573,43 @@ test("tokenizeSubtitleService applies N+1 target marking to Yomitan results", as
assert.equal(result.tokens?.[1]?.isNPlusOneTarget, false);
});
test("tokenizeSubtitleService does not color 1-2 word sentences by default", async () => {
  // Input is "猫です" (2 tokens). The default minimum sentence length for
  // N+1 marking is 3 words, so no token in this sentence may be colored —
  // even though exactly one token (です) is unknown, which would otherwise
  // make it an N+1 target.
  const result = await tokenizeSubtitleService(
    "猫です",
    makeDeps({
      // Mock tokens mirror the actual input text: 猫 spans [0, 1),
      // です spans [1, 3). (Previously the mock returned 私/犬 tokens
      // that did not correspond to the input at all.)
      tokenizeWithMecab: async () => [
        {
          surface: "猫",
          reading: "ネコ",
          headword: "猫",
          startPos: 0,
          endPos: 1,
          partOfSpeech: PartOfSpeech.noun,
          isMerged: false,
          isKnown: true,
          isNPlusOneTarget: false,
        },
        {
          surface: "です",
          reading: "デス",
          headword: "です",
          startPos: 1,
          endPos: 3,
          partOfSpeech: PartOfSpeech.noun,
          isMerged: false,
          isKnown: false,
          isNPlusOneTarget: false,
        },
      ],
    }),
  );
  // With only 2 tokens (< default minimum of 3), nothing is marked.
  assert.equal(
    result.tokens?.some((token) => token.isNPlusOneTarget),
    false,
  );
});
test("tokenizeSubtitleService checks known words by headword, not surface", async () => {
const result = await tokenizeSubtitleService(
"猫です",

View File

@@ -59,6 +59,7 @@ export interface TokenizerServiceDeps {
getKnownWordMatchMode: () => NPlusOneMatchMode;
getJlptLevel: (text: string) => JlptLevel | null;
getJlptEnabled?: () => boolean;
getMinSentenceWordsForNPlusOne?: () => number;
tokenizeWithMecab: (text: string) => Promise<MergedToken[] | null>;
}
@@ -78,6 +79,7 @@ export interface TokenizerDepsRuntimeOptions {
getKnownWordMatchMode: () => NPlusOneMatchMode;
getJlptLevel: (text: string) => JlptLevel | null;
getJlptEnabled?: () => boolean;
getMinSentenceWordsForNPlusOne?: () => number;
getMecabTokenizer: () => MecabTokenizerLike | null;
}
@@ -133,6 +135,8 @@ export function createTokenizerDepsRuntimeService(
getKnownWordMatchMode: options.getKnownWordMatchMode,
getJlptLevel: options.getJlptLevel,
getJlptEnabled: options.getJlptEnabled,
getMinSentenceWordsForNPlusOne:
options.getMinSentenceWordsForNPlusOne ?? (() => 3),
tokenizeWithMecab: async (text) => {
const mecabTokenizer = options.getMecabTokenizer();
if (!mecabTokenizer) {
@@ -724,6 +728,14 @@ export async function tokenizeSubtitleService(
text: string,
deps: TokenizerServiceDeps,
): Promise<SubtitleData> {
const minSentenceWordsForNPlusOne = deps.getMinSentenceWordsForNPlusOne?.();
const sanitizedMinSentenceWordsForNPlusOne =
minSentenceWordsForNPlusOne !== undefined &&
Number.isInteger(minSentenceWordsForNPlusOne) &&
minSentenceWordsForNPlusOne > 0
? minSentenceWordsForNPlusOne
: 3;
const displayText = text
.replace(/\r\n/g, "\n")
.replace(/\\N/g, "\n")
@@ -747,10 +759,16 @@ export async function tokenizeSubtitleService(
deps.isKnownWord,
deps.getKnownWordMatchMode(),
);
const jlptMarkedTokens = jlptEnabled
? applyJlptMarking(knownMarkedTokens, deps.getJlptLevel)
: knownMarkedTokens.map((token) => ({ ...token, jlptLevel: undefined }));
return { text: displayText, tokens: markNPlusOneTargets(jlptMarkedTokens) };
const jlptMarkedTokens = jlptEnabled
? applyJlptMarking(knownMarkedTokens, deps.getJlptLevel)
: knownMarkedTokens.map((token) => ({ ...token, jlptLevel: undefined }));
return {
text: displayText,
tokens: markNPlusOneTargets(
jlptMarkedTokens,
sanitizedMinSentenceWordsForNPlusOne,
),
};
}
try {
@@ -764,7 +782,13 @@ export async function tokenizeSubtitleService(
const jlptMarkedTokens = jlptEnabled
? applyJlptMarking(knownMarkedTokens, deps.getJlptLevel)
: knownMarkedTokens.map((token) => ({ ...token, jlptLevel: undefined }));
return { text: displayText, tokens: markNPlusOneTargets(jlptMarkedTokens) };
return {
text: displayText,
tokens: markNPlusOneTargets(
jlptMarkedTokens,
sanitizedMinSentenceWordsForNPlusOne,
),
};
}
} catch (err) {
console.error("Tokenization error:", (err as Error).message);