mirror of
https://github.com/ksyasuda/SubMiner.git
synced 2026-05-13 20:12:54 -07:00
feat(tokenizer): use Yomitan word classes for subtitle POS filtering
- Carry matched headword wordClasses from termsFind into YomitanScanToken
- Map recognized Yomitan wordClasses to SubMiner coarse POS before annotation
- MeCab enrichment now fills only missing POS fields, preserving existing coarse pos1
- Exclude standalone grammar particles, して helper fragments, and single-kana surfaces from annotations
- Respect source-text punctuation gaps when counting N+1 sentence words
- Preserve known-word highlight on excluded kanji-containing tokens
- Add backlog tasks 304 (N+1 boundary bug) and 305 (wordClasses POS, done)
This commit is contained in:
@@ -347,11 +347,25 @@ function isSentenceBoundaryToken(token: MergedToken): boolean {
|
||||
return SENTENCE_BOUNDARY_SURFACES.has(token.surface);
|
||||
}
|
||||
|
||||
function hasSentenceBoundaryInSourceGap(
|
||||
sourceText: string | undefined,
|
||||
previousEnd: number | null,
|
||||
nextStart: number,
|
||||
): boolean {
|
||||
if (typeof sourceText !== 'string' || previousEnd === null || nextStart <= previousEnd) {
|
||||
return false;
|
||||
}
|
||||
|
||||
const gap = sourceText.slice(previousEnd, nextStart);
|
||||
return [...gap].some((char) => SENTENCE_BOUNDARY_SURFACES.has(char));
|
||||
}
|
||||
|
||||
export function markNPlusOneTargets(
|
||||
tokens: MergedToken[],
|
||||
minSentenceWords = 3,
|
||||
pos1Exclusions: ReadonlySet<string> = N_PLUS_ONE_IGNORED_POS1,
|
||||
pos2Exclusions: ReadonlySet<string> = N_PLUS_ONE_IGNORED_POS2,
|
||||
sourceText?: string,
|
||||
): MergedToken[] {
|
||||
if (tokens.length === 0) {
|
||||
return [];
|
||||
@@ -363,6 +377,7 @@ export function markNPlusOneTargets(
|
||||
}));
|
||||
|
||||
let sentenceStart = 0;
|
||||
let previousTokenEnd: number | null = null;
|
||||
const minimumSentenceWords = Number.isInteger(minSentenceWords)
|
||||
? Math.max(1, minSentenceWords)
|
||||
: 3;
|
||||
@@ -393,10 +408,15 @@ export function markNPlusOneTargets(
|
||||
for (let i = 0; i < markedTokens.length; i++) {
|
||||
const token = markedTokens[i];
|
||||
if (!token) continue;
|
||||
if (hasSentenceBoundaryInSourceGap(sourceText, previousTokenEnd, token.startPos)) {
|
||||
markSentence(sentenceStart, i);
|
||||
sentenceStart = i;
|
||||
}
|
||||
if (isSentenceBoundaryToken(token)) {
|
||||
markSentence(sentenceStart, i);
|
||||
sentenceStart = i + 1;
|
||||
}
|
||||
previousTokenEnd = token.endPos;
|
||||
}
|
||||
|
||||
if (sentenceStart < markedTokens.length) {
|
||||
|
||||
Reference in New Issue
Block a user