mirror of
https://github.com/ksyasuda/SubMiner.git
synced 2026-02-28 06:22:45 -08:00
Add configurable minimum sentence length for N+1 targets
This commit is contained in:
@@ -476,10 +476,11 @@ test("tokenizeSubtitleService selects one N+1 target token", async () => {
|
||||
endPos: 2,
|
||||
partOfSpeech: PartOfSpeech.noun,
|
||||
isMerged: false,
|
||||
isKnown: false,
|
||||
isKnown: false,
|
||||
isNPlusOneTarget: false,
|
||||
},
|
||||
],
|
||||
getMinSentenceWordsForNPlusOne: () => 2,
|
||||
}),
|
||||
);
|
||||
|
||||
@@ -561,6 +562,7 @@ test("tokenizeSubtitleService applies N+1 target marking to Yomitan results", as
|
||||
getYomitanParserWindow: () => parserWindow,
|
||||
tokenizeWithMecab: async () => null,
|
||||
isKnownWord: (text) => text === "です",
|
||||
getMinSentenceWordsForNPlusOne: () => 2,
|
||||
}),
|
||||
);
|
||||
|
||||
@@ -571,6 +573,43 @@ test("tokenizeSubtitleService applies N+1 target marking to Yomitan results", as
|
||||
assert.equal(result.tokens?.[1]?.isNPlusOneTarget, false);
|
||||
});
|
||||
|
||||
test("tokenizeSubtitleService does not color 1-2 word sentences by default", async () => {
|
||||
const result = await tokenizeSubtitleService(
|
||||
"猫です",
|
||||
makeDeps({
|
||||
tokenizeWithMecab: async () => [
|
||||
{
|
||||
surface: "私",
|
||||
reading: "ワタシ",
|
||||
headword: "私",
|
||||
startPos: 0,
|
||||
endPos: 1,
|
||||
partOfSpeech: PartOfSpeech.noun,
|
||||
isMerged: false,
|
||||
isKnown: true,
|
||||
isNPlusOneTarget: false,
|
||||
},
|
||||
{
|
||||
surface: "犬",
|
||||
reading: "イヌ",
|
||||
headword: "犬",
|
||||
startPos: 1,
|
||||
endPos: 2,
|
||||
partOfSpeech: PartOfSpeech.noun,
|
||||
isMerged: false,
|
||||
isKnown: false,
|
||||
isNPlusOneTarget: false,
|
||||
},
|
||||
],
|
||||
}),
|
||||
);
|
||||
|
||||
assert.equal(
|
||||
result.tokens?.some((token) => token.isNPlusOneTarget),
|
||||
false,
|
||||
);
|
||||
});
|
||||
|
||||
test("tokenizeSubtitleService checks known words by headword, not surface", async () => {
|
||||
const result = await tokenizeSubtitleService(
|
||||
"猫です",
|
||||
|
||||
@@ -59,6 +59,7 @@ export interface TokenizerServiceDeps {
|
||||
getKnownWordMatchMode: () => NPlusOneMatchMode;
|
||||
getJlptLevel: (text: string) => JlptLevel | null;
|
||||
getJlptEnabled?: () => boolean;
|
||||
getMinSentenceWordsForNPlusOne?: () => number;
|
||||
tokenizeWithMecab: (text: string) => Promise<MergedToken[] | null>;
|
||||
}
|
||||
|
||||
@@ -78,6 +79,7 @@ export interface TokenizerDepsRuntimeOptions {
|
||||
getKnownWordMatchMode: () => NPlusOneMatchMode;
|
||||
getJlptLevel: (text: string) => JlptLevel | null;
|
||||
getJlptEnabled?: () => boolean;
|
||||
getMinSentenceWordsForNPlusOne?: () => number;
|
||||
getMecabTokenizer: () => MecabTokenizerLike | null;
|
||||
}
|
||||
|
||||
@@ -133,6 +135,8 @@ export function createTokenizerDepsRuntimeService(
|
||||
getKnownWordMatchMode: options.getKnownWordMatchMode,
|
||||
getJlptLevel: options.getJlptLevel,
|
||||
getJlptEnabled: options.getJlptEnabled,
|
||||
getMinSentenceWordsForNPlusOne:
|
||||
options.getMinSentenceWordsForNPlusOne ?? (() => 3),
|
||||
tokenizeWithMecab: async (text) => {
|
||||
const mecabTokenizer = options.getMecabTokenizer();
|
||||
if (!mecabTokenizer) {
|
||||
@@ -724,6 +728,14 @@ export async function tokenizeSubtitleService(
|
||||
text: string,
|
||||
deps: TokenizerServiceDeps,
|
||||
): Promise<SubtitleData> {
|
||||
const minSentenceWordsForNPlusOne = deps.getMinSentenceWordsForNPlusOne?.();
|
||||
const sanitizedMinSentenceWordsForNPlusOne =
|
||||
minSentenceWordsForNPlusOne !== undefined &&
|
||||
Number.isInteger(minSentenceWordsForNPlusOne) &&
|
||||
minSentenceWordsForNPlusOne > 0
|
||||
? minSentenceWordsForNPlusOne
|
||||
: 3;
|
||||
|
||||
const displayText = text
|
||||
.replace(/\r\n/g, "\n")
|
||||
.replace(/\\N/g, "\n")
|
||||
@@ -747,10 +759,16 @@ export async function tokenizeSubtitleService(
|
||||
deps.isKnownWord,
|
||||
deps.getKnownWordMatchMode(),
|
||||
);
|
||||
const jlptMarkedTokens = jlptEnabled
|
||||
? applyJlptMarking(knownMarkedTokens, deps.getJlptLevel)
|
||||
: knownMarkedTokens.map((token) => ({ ...token, jlptLevel: undefined }));
|
||||
return { text: displayText, tokens: markNPlusOneTargets(jlptMarkedTokens) };
|
||||
const jlptMarkedTokens = jlptEnabled
|
||||
? applyJlptMarking(knownMarkedTokens, deps.getJlptLevel)
|
||||
: knownMarkedTokens.map((token) => ({ ...token, jlptLevel: undefined }));
|
||||
return {
|
||||
text: displayText,
|
||||
tokens: markNPlusOneTargets(
|
||||
jlptMarkedTokens,
|
||||
sanitizedMinSentenceWordsForNPlusOne,
|
||||
),
|
||||
};
|
||||
}
|
||||
|
||||
try {
|
||||
@@ -764,7 +782,13 @@ export async function tokenizeSubtitleService(
|
||||
const jlptMarkedTokens = jlptEnabled
|
||||
? applyJlptMarking(knownMarkedTokens, deps.getJlptLevel)
|
||||
: knownMarkedTokens.map((token) => ({ ...token, jlptLevel: undefined }));
|
||||
return { text: displayText, tokens: markNPlusOneTargets(jlptMarkedTokens) };
|
||||
return {
|
||||
text: displayText,
|
||||
tokens: markNPlusOneTargets(
|
||||
jlptMarkedTokens,
|
||||
sanitizedMinSentenceWordsForNPlusOne,
|
||||
),
|
||||
};
|
||||
}
|
||||
} catch (err) {
|
||||
console.error("Tokenization error:", (err as Error).message);
|
||||
|
||||
Reference in New Issue
Block a user