mirror of
https://github.com/ksyasuda/SubMiner.git
synced 2026-05-04 00:41:33 -07:00
fix: suppress N+1 for kana-only candidates and fix minSentenceWords count
- Treat kana-only tokens with surrounding subtitle punctuation (…, ―, etc.) as kana-only so they are not promoted to N+1 targets
- Exclude unknown tokens filtered from N+1 targeting from the minSentenceWords count so filtered kana-only unknowns cannot satisfy the sentence length threshold
- Add regression tests for kana-only candidate suppression and filtered-unknown padding cases
This commit is contained in:
@@ -709,6 +709,63 @@ test('annotateTokens N+1 handoff marks expected target when threshold is satisfi
|
||||
assert.equal(result[2]?.isNPlusOneTarget, false);
|
||||
});
|
||||
|
||||
test('annotateTokens does not mark kana-only unknown target with subtitle punctuation as N+1', () => {
|
||||
const tokens = [
|
||||
makeToken({
|
||||
surface: '何やら',
|
||||
headword: '何やら',
|
||||
reading: 'ナニヤラ',
|
||||
pos1: '副詞',
|
||||
startPos: 0,
|
||||
endPos: 3,
|
||||
}),
|
||||
makeToken({
|
||||
surface: 'ボタン',
|
||||
headword: 'ボタン',
|
||||
reading: 'ボタン',
|
||||
pos1: '名詞',
|
||||
startPos: 3,
|
||||
endPos: 6,
|
||||
}),
|
||||
makeToken({
|
||||
surface: 'スイッチ…',
|
||||
headword: 'スイッチ',
|
||||
reading: 'スイッチ',
|
||||
pos1: '名詞',
|
||||
startPos: 6,
|
||||
endPos: 11,
|
||||
}),
|
||||
];
|
||||
|
||||
const result = annotateTokens(
|
||||
tokens,
|
||||
makeDeps({
|
||||
isKnownWord: (text) => text === '何やら' || text === 'ボタン',
|
||||
}),
|
||||
{ minSentenceWordsForNPlusOne: 3 },
|
||||
);
|
||||
|
||||
assert.equal(result[2]?.isNPlusOneTarget, false);
|
||||
});
|
||||
|
||||
test('annotateTokens still marks kanji unknown target in otherwise eligible sentence as N+1', () => {
|
||||
const tokens = [
|
||||
makeToken({ surface: '私', headword: '私', pos1: '名詞', startPos: 0, endPos: 1 }),
|
||||
makeToken({ surface: '猫', headword: '猫', pos1: '名詞', startPos: 1, endPos: 2 }),
|
||||
makeToken({ surface: '装置…', headword: '装置', pos1: '名詞', startPos: 2, endPos: 5 }),
|
||||
];
|
||||
|
||||
const result = annotateTokens(
|
||||
tokens,
|
||||
makeDeps({
|
||||
isKnownWord: (text) => text === '私' || text === '猫',
|
||||
}),
|
||||
{ minSentenceWordsForNPlusOne: 3 },
|
||||
);
|
||||
|
||||
assert.equal(result[2]?.isNPlusOneTarget, true);
|
||||
});
|
||||
|
||||
test('annotateTokens N+1 minimum sentence words counts only eligible word tokens', () => {
|
||||
const tokens = [
|
||||
makeToken({ surface: '猫', headword: '猫', startPos: 0, endPos: 1 }),
|
||||
@@ -744,6 +801,32 @@ test('annotateTokens N+1 minimum sentence words counts only eligible word tokens
|
||||
assert.equal(result[0]?.isNPlusOneTarget, false);
|
||||
});
|
||||
|
||||
test('annotateTokens N+1 minimum sentence words excludes unknown tokens filtered from N+1 targeting', () => {
|
||||
const tokens = [
|
||||
makeToken({ surface: '私', headword: '私', pos1: '名詞', startPos: 0, endPos: 1 }),
|
||||
makeToken({ surface: '猫', headword: '猫', pos1: '名詞', startPos: 1, endPos: 2 }),
|
||||
makeToken({
|
||||
surface: 'スイッチ',
|
||||
headword: 'スイッチ',
|
||||
reading: 'スイッチ',
|
||||
pos1: '名詞',
|
||||
startPos: 2,
|
||||
endPos: 6,
|
||||
}),
|
||||
makeToken({ surface: '装置', headword: '装置', pos1: '名詞', startPos: 6, endPos: 8 }),
|
||||
];
|
||||
|
||||
const result = annotateTokens(
|
||||
tokens,
|
||||
makeDeps({
|
||||
isKnownWord: (text) => text === '私' || text === '猫',
|
||||
}),
|
||||
{ minSentenceWordsForNPlusOne: 4 },
|
||||
);
|
||||
|
||||
assert.equal(result[3]?.isNPlusOneTarget, false);
|
||||
});
|
||||
|
||||
test('annotateTokens N+1 sentence word count respects source punctuation gaps omitted by Yomitan', () => {
|
||||
const tokens = [
|
||||
makeToken({
|
||||
|
||||
+33
-2
@@ -298,9 +298,28 @@ function isKanaChar(char: string): boolean {
|
||||
);
|
||||
}
|
||||
|
||||
function isKanaCandidateIgnorableChar(char: string): boolean {
|
||||
return /^[\s.,!?;:()[\]{}"'`、。!?…‥・「」『』()[]{}〈〉《》【】―-]$/u.test(char);
|
||||
}
|
||||
|
||||
function isKanaOnlyText(text: string): boolean {
|
||||
const normalized = text.trim();
|
||||
return normalized.length > 0 && Array.from(normalized).every((char) => isKanaChar(char));
|
||||
if (normalized.length === 0) {
|
||||
return false;
|
||||
}
|
||||
|
||||
let hasKana = false;
|
||||
for (const char of normalized) {
|
||||
if (isKanaChar(char)) {
|
||||
hasKana = true;
|
||||
continue;
|
||||
}
|
||||
if (!isKanaCandidateIgnorableChar(char)) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
return hasKana;
|
||||
}
|
||||
|
||||
function normalizeSourceTextForTokenOffsets(sourceText: string | undefined): string | undefined {
|
||||
@@ -367,6 +386,18 @@ function isNPlusOneWordCountToken(
|
||||
return true;
|
||||
}
|
||||
|
||||
function isNPlusOneSentenceLengthToken(
|
||||
token: MergedToken,
|
||||
pos1Exclusions: ReadonlySet<string> = N_PLUS_ONE_IGNORED_POS1,
|
||||
pos2Exclusions: ReadonlySet<string> = N_PLUS_ONE_IGNORED_POS2,
|
||||
): boolean {
|
||||
if (!isNPlusOneWordCountToken(token, pos1Exclusions, pos2Exclusions)) {
|
||||
return false;
|
||||
}
|
||||
|
||||
return token.isKnown || isNPlusOneCandidateToken(token, pos1Exclusions, pos2Exclusions);
|
||||
}
|
||||
|
||||
function isSentenceBoundaryToken(token: MergedToken): boolean {
|
||||
if (token.partOfSpeech !== PartOfSpeech.symbol) {
|
||||
return false;
|
||||
@@ -418,7 +449,7 @@ export function markNPlusOneTargets(
|
||||
for (let i = start; i < endExclusive; i++) {
|
||||
const token = markedTokens[i];
|
||||
if (!token) continue;
|
||||
if (isNPlusOneWordCountToken(token, pos1Exclusions, pos2Exclusions)) {
|
||||
if (isNPlusOneSentenceLengthToken(token, pos1Exclusions, pos2Exclusions)) {
|
||||
sentenceWordCount += 1;
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user