mirror of
https://github.com/ksyasuda/SubMiner.git
synced 2026-05-15 20:12:59 -07:00
430373f010
* feat(tokenizer): use Yomitan word classes for subtitle POS filtering - Carry matched headword wordClasses from termsFind into YomitanScanToken - Map recognized Yomitan wordClasses to SubMiner coarse POS before annotation - MeCab enrichment now fills only missing POS fields, preserving existing coarse pos1 - Exclude standalone grammar particles, して helper fragments, and single-kana surfaces from annotations - Respect source-text punctuation gaps when counting N+1 sentence words - Preserve known-word highlight on excluded kanji-containing tokens - Add backlog tasks 304 (N+1 boundary bug) and 305 (wordClasses POS, done) * fix(tokenizer): preserve annotation and enrichment behavior * fix: restore jlpt subtitle underlines * fix: exclude kana-only n+1 targets * fix: refresh overlay on Hyprland fullscreen * fix: address fullscreen and n-plus-one review notes * fix: address CodeRabbit review comments * fix: accept modified digits for multi-line sentence mining * Cancel pending Linux MPV fullscreen overlay refresh bursts - return a cancel handle from the Linux refresh burst scheduler - clear pending refresh bursts when overlays hide or windows close - tighten the burst test polling to wait for the async refresh * fix: suppress N+1 for kana-only candidates and fix minSentenceWords coun - Treat kana-only tokens with surrounding subtitle punctuation (…, ―, etc.) 
as kana-only so they are not promoted to N+1 targets - Exclude unknown tokens filtered from N+1 targeting from the minSentenceWords count so filtered kana-only unknowns cannot satisfy sentence length threshold - Add regression tests for kana-only candidate suppression and filtered-unknown padding cases * Suppress subtitle annotations for grammar fragments - Hide annotation metadata for auxiliary inflection and ja-nai endings - Preserve lexical `くれる` forms and add regression coverage * Fix kana-only N+1 tokenizer regression test - Use a pure-kana fixture for the subtitle token N+1 case - Update task notes for the latest CodeRabbit follow-up * Fix managed playback exit and tokenizer grammar splits - Ignore background stats daemons during regular app startup - Split standalone grammar endings before applying annotations - Clear helper-span annotations for auxiliary-only tokens * fix: refresh current subtitle after known-word mining * fix: suppress sigh interjection annotations * fix: preserve jlpt underline color after lookup * Replace grammar-ending permutations with shared matcher; preserve word a - Extract `grammar-ending.ts` with `isStandaloneGrammarEndingText` / `isSubtitleGrammarEndingText` pattern matchers - Replace `STANDALONE_GRAMMAR_ENDINGS` set in parser-selection-stage with shared matcher - Replace generated phrase sets in subtitle-annotation-filter with shared matcher - Remove stale duplicate subtitle-exclusion constants and helpers from annotation-stage - Manual clipboard card updates now write only to the sentence audio field, leaving word/expression audio untouched * fix: CI changelog, annotation options threading, and Jellyfin quit - Add `type: fixed` / `area:` frontmatter to `changes/319` to pass `changelog:lint` - Thread `TokenizerAnnotationOptions` through `stripSubtitleAnnotationMetadata` so `sourceText` is honored - Include `jellyfinPlay` in `shouldQuitOnDisconnectWhenOverlayRuntimeInitialized` predicate - Make mouse test `elementFromPoint` stubs 
coordinate-sensitive - Make Lua test `.tmp` mkdir portable on Windows * Preserve overlay across macOS flaps and mpv playlist changes - keep visible overlays alive during transient macOS tracker loss - reuse the running mpv overlay path on playlist navigation - update regression coverage and changelog fragments * fix: restore stats daemon deferral * fix: keep subtitle prefetch alive after cache hits * Fix JLPT underline color drift and AniList skipped-threshold sync - Replace JLPT `text-decoration` underlines with `border-bottom` so Chromium selection/hover cannot repaint them to another annotation's color - Lock JLPT underline color for combined annotation selectors (known, n+1, frequency) and character hover/selection states - Trigger AniList post-watch check on every mpv time-position update to catch skipped completion thresholds - Fall back to filename-parser season/episode when guessit omits them * fix: address coderabbit feedback * fix: sync AniList after seeked completion * fix: preserve ordinal frequency annotations * fix: preserve known highlighting for filtered tokens * fix: address PR #57 CodeRabbit feedback - Acquire AniList post-watch in-flight lock before async gating to prevent duplicate writes - Isolate manual watched mark result from AniList post-watch callback failures - Report known-word cache clears as mutations during immediate append when state existed - Add regression tests for each fix * fix: stop AniList setup reopening on Linux when keyring token exists - Gate setup success on token persistence: `saveToken` now returns `boolean`; on failure, keeps the setup window open instead of reporting success - Config reload passes `allowSetupPrompt: false` so playback reloads don't re-open the setup window - Add regression test for persistence-failure path * fix: suppress known highlights for subtitle particles * fix: retry transient AniList safeStorage failures * fix: hide overlay focus ring * fix: align Hyprland fullscreen overlays * fix: restore 
subtitle playback keybindings * fix: align Hyprland overlay windows to mpv and stop pinning them - Force-apply exact Hyprland move/resize/setprop dispatches when bounds are provided - Stop pinning overlay windows; toggle pin off when Hyprland reports pinned=true - Compensate stats overlay outer placement for Electron/Wayland content insets - Make stats overlay window and page opaque so mpv cannot show through transparent insets - Constrain stats app to h-screen with internal scroll so content covers mpv from y=0 - Lock overlay/stats window titles against page-title-updated events - Add regression coverage for placement dispatches, inset compensation, and CSS overlay mode * fix: retain frequency rank for honorific prefix-noun tokens - Add `shouldAllowHonorificPrefixNounFrequency` to exempt お/ご/御 + noun merged tokens from frequency exclusion - Add regression test for `ご機嫌` asserting rank 5484 is preserved after MeCab enrichment and annotation - Close TASK-341 * fix: map openCharacterDictionary session action to --open-character-dict - Add missing Lua CLI dispatch entry for openCharacterDictionary - Add regression test for Alt+Meta+A binding and CLI flag forwarding * fix: keep macOS overlay interactive while mpv remains active - Overlay no longer hides or becomes click-through during tracker refreshes when mpv is the focused window - Preserve already-visible overlay when tracker is temporarily not ready but mpv target signal is active - Add regression tests for active-mpv tracker refresh and transient tracker-not-ready paths * fix: address coderabbit subtitle follow-ups * fix: resolve media detail from sessions when lifetime summary is absent - Change `getMediaDetail` JOIN to LEFT JOIN on `imm_lifetime_media` and fall back to aggregated session metrics when no lifetime row exists - Add filter `AND (lm.video_id IS NOT NULL OR s.session_id IS NOT NULL)` to keep results valid - Add regression test covering the session-visible / media-detail-missing mismatch * fix: 
address PR-57 CodeRabbit findings and CI failures - use filtered word counts in media detail session token aggregation - cancel fullscreen refresh burst on exit via updateLinuxMpvFullscreenOverlayRefreshBurst - guard Hyprland JSON.parse in try/catch; exclude windowtitle from geometry events - narrow focus suppression from :focus to :focus-visible - apply JLPT lock selectors to word-name-match tokens (N1–N5) * fix: macOS overlay z-order and Yomitan compound token known highlighting - Release always-on-top when tracked mpv loses foreground on macOS - Skip visible overlay blur restacking on macOS to avoid covering unrelated windows - Prefer Yomitan internal parse tokens over fragmented scanner output for known-word decisions - Add regression tests for both behaviors * fix: macOS visible-overlay blur no longer invokes Windows-only blur call - Split win32/darwin branches in handleOverlayWindowBlurred so darwin visible blur returns early without calling onWindowsVisibleOverlayBlur - Add regression test asserting Windows callback stays inactive on macOS visible overlay blur - Close TASK-347
489 lines
14 KiB
TypeScript
489 lines
14 KiB
TypeScript
/*
 * SubMiner - All-in-one sentence mining overlay
 * Copyright (C) 2024 sudacode
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program. If not, see <https://www.gnu.org/licenses/>.
 */

import { PartOfSpeech, Token, MergedToken } from './types';
|
||
import { DEFAULT_ANNOTATION_POS1_EXCLUSION_CONFIG } from './token-pos1-exclusions';
|
||
import { DEFAULT_ANNOTATION_POS2_EXCLUSION_CONFIG } from './token-pos2-exclusions';
|
||
import { shouldExcludeTokenFromSubtitleAnnotations } from './core/services/tokenizer/subtitle-annotation-filter';
|
||
|
||
export function isNoun(tok: Token): boolean {
|
||
return tok.partOfSpeech === PartOfSpeech.noun;
|
||
}
|
||
|
||
export function isProperNoun(tok: Token): boolean {
|
||
return tok.partOfSpeech === PartOfSpeech.noun && tok.pos2 === '固有名詞';
|
||
}
|
||
|
||
export function ignoreReading(tok: Token): boolean {
|
||
return tok.partOfSpeech === PartOfSpeech.symbol && tok.pos2 === '文字';
|
||
}
|
||
|
||
export function isCopula(tok: Token): boolean {
|
||
const raw = tok.inflectionType;
|
||
if (!raw) {
|
||
return false;
|
||
}
|
||
return ['特殊・ダ', '特殊・デス', '特殊|だ', '特殊|デス'].includes(raw);
|
||
}
|
||
|
||
export function isAuxVerb(tok: Token): boolean {
|
||
return tok.partOfSpeech === PartOfSpeech.bound_auxiliary && !isCopula(tok);
|
||
}
|
||
|
||
export function isContinuativeForm(tok: Token): boolean {
|
||
if (!tok.inflectionForm) {
|
||
return false;
|
||
}
|
||
const inflectionForm = tok.inflectionForm;
|
||
const isContinuative =
|
||
inflectionForm === '連用デ接続' ||
|
||
inflectionForm === '連用タ接続' ||
|
||
inflectionForm.startsWith('連用形');
|
||
|
||
if (!isContinuative) {
|
||
return false;
|
||
}
|
||
return tok.headword !== 'ない';
|
||
}
|
||
|
||
export function isVerbSuffix(tok: Token): boolean {
|
||
return tok.partOfSpeech === PartOfSpeech.verb && (tok.pos2 === '非自立' || tok.pos2 === '接尾');
|
||
}
|
||
|
||
export function isTatteParticle(tok: Token): boolean {
|
||
return (
|
||
tok.partOfSpeech === PartOfSpeech.particle &&
|
||
tok.pos2 === '接続助詞' &&
|
||
tok.headword === 'たって'
|
||
);
|
||
}
|
||
|
||
export function isBaParticle(tok: Token): boolean {
|
||
return tok.partOfSpeech === PartOfSpeech.particle && tok.pos2 === '接続助詞' && tok.word === 'ば';
|
||
}
|
||
|
||
export function isTeDeParticle(tok: Token): boolean {
|
||
return (
|
||
tok.partOfSpeech === PartOfSpeech.particle &&
|
||
tok.pos2 === '接続助詞' &&
|
||
['て', 'で', 'ちゃ'].includes(tok.word)
|
||
);
|
||
}
|
||
|
||
export function isTaDaParticle(tok: Token): boolean {
|
||
return isAuxVerb(tok) && ['た', 'だ'].includes(tok.word);
|
||
}
|
||
|
||
export function isVerb(tok: Token): boolean {
|
||
return [PartOfSpeech.verb, PartOfSpeech.bound_auxiliary].includes(tok.partOfSpeech);
|
||
}
|
||
|
||
export function isVerbNonIndependent(): boolean {
|
||
return true;
|
||
}
|
||
|
||
export function canReceiveAuxiliary(tok: Token): boolean {
|
||
return [PartOfSpeech.verb, PartOfSpeech.bound_auxiliary, PartOfSpeech.i_adjective].includes(
|
||
tok.partOfSpeech,
|
||
);
|
||
}
|
||
|
||
export function isNounSuffix(tok: Token): boolean {
|
||
return tok.partOfSpeech === PartOfSpeech.verb && tok.pos2 === '接尾';
|
||
}
|
||
|
||
export function isCounter(tok: Token): boolean {
|
||
return (
|
||
tok.partOfSpeech === PartOfSpeech.noun &&
|
||
tok.pos3 !== undefined &&
|
||
tok.pos3.startsWith('助数詞')
|
||
);
|
||
}
|
||
|
||
export function isNumeral(tok: Token): boolean {
|
||
return (
|
||
tok.partOfSpeech === PartOfSpeech.noun && tok.pos2 !== undefined && tok.pos2.startsWith('数')
|
||
);
|
||
}
|
||
|
||
export function shouldMerge(lastStandaloneToken: Token, token: Token): boolean {
|
||
if (isVerb(lastStandaloneToken)) {
|
||
if (isAuxVerb(token)) {
|
||
return true;
|
||
}
|
||
if (isContinuativeForm(lastStandaloneToken) && isVerbSuffix(token)) {
|
||
return true;
|
||
}
|
||
if (isVerbSuffix(token) && isVerbNonIndependent()) {
|
||
return true;
|
||
}
|
||
}
|
||
|
||
if (isNoun(lastStandaloneToken) && !isProperNoun(lastStandaloneToken) && isNounSuffix(token)) {
|
||
return true;
|
||
}
|
||
|
||
if (isCounter(token) && isNumeral(lastStandaloneToken)) {
|
||
return true;
|
||
}
|
||
|
||
if (isBaParticle(token) && canReceiveAuxiliary(lastStandaloneToken)) {
|
||
return true;
|
||
}
|
||
|
||
if (isTatteParticle(token) && canReceiveAuxiliary(lastStandaloneToken)) {
|
||
return true;
|
||
}
|
||
|
||
if (isTeDeParticle(token) && isContinuativeForm(lastStandaloneToken)) {
|
||
return true;
|
||
}
|
||
|
||
if (isTaDaParticle(token) && canReceiveAuxiliary(lastStandaloneToken)) {
|
||
return true;
|
||
}
|
||
|
||
if (isTeDeParticle(lastStandaloneToken) && isVerbSuffix(token)) {
|
||
return true;
|
||
}
|
||
|
||
return false;
|
||
}
|
||
|
||
/**
 * Builds MergedToken spans from raw tokenizer output.
 *
 * Walks tokens in order, fusing each one into the previous standalone token
 * when shouldMerge allows (auxiliaries, suffixes, attached particles), and
 * computes start/end character offsets — anchored against `sourceText` when
 * the surface can be found there, otherwise by simple accumulation.
 *
 * @param tokens raw tokens to merge; empty/missing input yields []
 * @param isKnownWord predicate flagging known vocabulary (default: nothing known)
 * @param knownWordMatchMode key the known-word lookup on headword or surface form
 * @param shouldLookupKnownWords when false, every token reports isKnown=false
 * @param sourceText original subtitle text used to anchor token offsets
 * @returns merged tokens with isNPlusOneTarget initialized to false
 */
export function mergeTokens(
  tokens: Token[],
  isKnownWord: (text: string) => boolean = () => false,
  knownWordMatchMode: 'headword' | 'surface' = 'headword',
  shouldLookupKnownWords = true,
  sourceText?: string,
): MergedToken[] {
  if (!tokens || tokens.length === 0) {
    return [];
  }

  const result: MergedToken[] = [];
  // Newlines collapsed to spaces so offsets match a single-line rendering.
  const normalizedSourceText = normalizeSourceTextForTokenOffsets(sourceText);
  let charOffset = 0; // fallback offset when the surface is not found in sourceText
  let sourceCursor = 0; // forward-only search position within normalizedSourceText
  let lastStandaloneToken: Token | null = null;
  // Known-word lookup, short-circuited when lookups are disabled or text is empty.
  const resolveKnownMatch = (text: string | undefined): boolean => {
    if (!shouldLookupKnownWords || !text) {
      return false;
    }
    return isKnownWord(text);
  };

  for (const token of tokens) {
    // indexOf with a fromIndex returns -1 or a position >= sourceCursor, so
    // the comparison below is simply "surface found in the source text".
    const matchedStart =
      typeof normalizedSourceText === 'string'
        ? normalizedSourceText.indexOf(token.word, sourceCursor)
        : -1;
    const start = matchedStart >= sourceCursor ? matchedStart : charOffset;
    const end = start + token.word.length;
    charOffset = end;
    sourceCursor = end;

    let shouldMergeToken = false;

    if (result.length > 0 && lastStandaloneToken !== null) {
      shouldMergeToken = shouldMerge(lastStandaloneToken, token);
    }

    const tokenReading = ignoreReading(token) ? '' : token.katakanaReading || token.word;
    if (shouldMergeToken && result.length > 0) {
      // Fuse into the previously emitted span: extend surface/reading/endPos,
      // keep the original headword, startPos and coarse POS.
      const prev = result.pop()!;
      const mergedHeadword = prev.headword;
      const headwordForKnownMatch = (() => {
        if (knownWordMatchMode === 'surface') {
          return prev.surface;
        }
        return mergedHeadword;
      })();
      result.push({
        surface: prev.surface + token.word,
        reading: prev.reading + tokenReading,
        headword: prev.headword,
        startPos: prev.startPos,
        endPos: end,
        partOfSpeech: prev.partOfSpeech,
        // Prefer existing tags; fill gaps from the token being merged in.
        pos1: prev.pos1 ?? token.pos1,
        pos2: prev.pos2 ?? token.pos2,
        pos3: prev.pos3 ?? token.pos3,
        isMerged: true,
        isKnown: resolveKnownMatch(headwordForKnownMatch),
        isNPlusOneTarget: false,
      });
    } else {
      const headwordForKnownMatch = (() => {
        if (knownWordMatchMode === 'surface') {
          return token.word;
        }
        return token.headword;
      })();
      result.push({
        surface: token.word,
        reading: tokenReading,
        headword: token.headword,
        startPos: start,
        endPos: end,
        partOfSpeech: token.partOfSpeech,
        pos1: token.pos1,
        pos2: token.pos2,
        pos3: token.pos3,
        isMerged: false,
        isKnown: resolveKnownMatch(headwordForKnownMatch),
        isNPlusOneTarget: false,
      });
    }

    // NOTE(review): the raw token (not the merged span) becomes the next
    // merge anchor even when it was just fused — this enables chains like
    // verb + aux + aux; confirm that is intended.
    lastStandaloneToken = token;
  }

  return result;
}

// Surfaces/characters treated as sentence terminators when splitting a
// subtitle into sentences for N+1 targeting.
// NOTE(review): '…' and '\u2026' appear to be the same code point (U+2026),
// in which case the Set collapses them — confirm the literal is not a
// look-alike character before removing either entry.
const SENTENCE_BOUNDARY_SURFACES = new Set(['。', '?', '!', '?', '!', '…', '\u2026']);
// Default pos1/pos2 tag exclusions applied when deciding N+1 eligibility.
const N_PLUS_ONE_IGNORED_POS1 = new Set(DEFAULT_ANNOTATION_POS1_EXCLUSION_CONFIG.defaults);
const N_PLUS_ONE_IGNORED_POS2 = new Set(DEFAULT_ANNOTATION_POS2_EXCLUSION_CONFIG.defaults);

function normalizePos1Tag(pos1: string | undefined): string {
|
||
return typeof pos1 === 'string' ? pos1.trim() : '';
|
||
}
|
||
|
||
function normalizePos2Tag(pos2: string | undefined): string {
|
||
return typeof pos2 === 'string' ? pos2.trim() : '';
|
||
}
|
||
|
||
function isExcludedByTagSet(normalizedTag: string, exclusions: ReadonlySet<string>): boolean {
|
||
if (!normalizedTag) {
|
||
return false;
|
||
}
|
||
const parts = normalizedTag
|
||
.split('|')
|
||
.map((part) => part.trim())
|
||
.filter((part) => part.length > 0);
|
||
if (parts.length === 0) {
|
||
return false;
|
||
}
|
||
return parts.every((part) => exclusions.has(part));
|
||
}
|
||
|
||
function isKanaChar(char: string): boolean {
|
||
const code = char.codePointAt(0);
|
||
if (code === undefined) {
|
||
return false;
|
||
}
|
||
|
||
return (
|
||
(code >= 0x3041 && code <= 0x3096) ||
|
||
(code >= 0x309b && code <= 0x309f) ||
|
||
code === 0x30fc ||
|
||
(code >= 0x30a0 && code <= 0x30fa) ||
|
||
(code >= 0x30fd && code <= 0x30ff)
|
||
);
|
||
}
|
||
|
||
function isKanaCandidateIgnorableChar(char: string): boolean {
|
||
return /^[\s.,!?;:()[\]{}"'`、。!?…‥・「」『』()[]{}〈〉《》【】―-]$/u.test(char);
|
||
}
|
||
|
||
function isKanaOnlyText(text: string): boolean {
|
||
const normalized = text.trim();
|
||
if (normalized.length === 0) {
|
||
return false;
|
||
}
|
||
|
||
let hasKana = false;
|
||
for (const char of normalized) {
|
||
if (isKanaChar(char)) {
|
||
hasKana = true;
|
||
continue;
|
||
}
|
||
if (!isKanaCandidateIgnorableChar(char)) {
|
||
return false;
|
||
}
|
||
}
|
||
|
||
return hasKana;
|
||
}
|
||
|
||
function normalizeSourceTextForTokenOffsets(sourceText: string | undefined): string | undefined {
|
||
return typeof sourceText === 'string' ? sourceText.replace(/\r?\n/g, ' ').trim() : undefined;
|
||
}
|
||
|
||
export function isNPlusOneCandidateToken(
|
||
token: MergedToken,
|
||
pos1Exclusions: ReadonlySet<string> = N_PLUS_ONE_IGNORED_POS1,
|
||
pos2Exclusions: ReadonlySet<string> = N_PLUS_ONE_IGNORED_POS2,
|
||
): boolean {
|
||
if (token.isKnown) {
|
||
return false;
|
||
}
|
||
if (isKanaOnlyText(token.surface)) {
|
||
return false;
|
||
}
|
||
return isNPlusOneWordCountToken(token, pos1Exclusions, pos2Exclusions);
|
||
}
|
||
|
||
function isNPlusOneWordCountToken(
|
||
token: MergedToken,
|
||
pos1Exclusions: ReadonlySet<string> = N_PLUS_ONE_IGNORED_POS1,
|
||
pos2Exclusions: ReadonlySet<string> = N_PLUS_ONE_IGNORED_POS2,
|
||
): boolean {
|
||
if (shouldExcludeTokenFromSubtitleAnnotations(token, { pos1Exclusions, pos2Exclusions })) {
|
||
return false;
|
||
}
|
||
|
||
const normalizedPos1 = normalizePos1Tag(token.pos1);
|
||
const hasPos1 = normalizedPos1.length > 0;
|
||
if (isExcludedByTagSet(normalizedPos1, pos1Exclusions)) {
|
||
return false;
|
||
}
|
||
|
||
const normalizedPos2 = normalizePos2Tag(token.pos2);
|
||
const hasPos2 = normalizedPos2.length > 0;
|
||
if (isExcludedByTagSet(normalizedPos2, pos2Exclusions)) {
|
||
return false;
|
||
}
|
||
|
||
if (
|
||
!hasPos1 &&
|
||
!hasPos2 &&
|
||
(token.partOfSpeech === PartOfSpeech.particle ||
|
||
token.partOfSpeech === PartOfSpeech.bound_auxiliary ||
|
||
token.partOfSpeech === PartOfSpeech.symbol)
|
||
) {
|
||
return false;
|
||
}
|
||
|
||
if (token.partOfSpeech === PartOfSpeech.noun && token.pos2 === '固有名詞') {
|
||
return false;
|
||
}
|
||
|
||
if (token.pos3 && token.pos3.startsWith('助数詞')) {
|
||
return false;
|
||
}
|
||
|
||
if (token.surface.trim().length === 0) {
|
||
return false;
|
||
}
|
||
|
||
return true;
|
||
}
|
||
|
||
function isNPlusOneSentenceLengthToken(
|
||
token: MergedToken,
|
||
pos1Exclusions: ReadonlySet<string> = N_PLUS_ONE_IGNORED_POS1,
|
||
pos2Exclusions: ReadonlySet<string> = N_PLUS_ONE_IGNORED_POS2,
|
||
): boolean {
|
||
if (!isNPlusOneWordCountToken(token, pos1Exclusions, pos2Exclusions)) {
|
||
return false;
|
||
}
|
||
|
||
return token.isKnown || isNPlusOneCandidateToken(token, pos1Exclusions, pos2Exclusions);
|
||
}
|
||
|
||
function isSentenceBoundaryToken(token: MergedToken): boolean {
|
||
if (token.partOfSpeech !== PartOfSpeech.symbol) {
|
||
return false;
|
||
}
|
||
|
||
return SENTENCE_BOUNDARY_SURFACES.has(token.surface);
|
||
}
|
||
|
||
function hasSentenceBoundaryInSourceGap(
|
||
sourceText: string | undefined,
|
||
previousEnd: number | null,
|
||
nextStart: number,
|
||
): boolean {
|
||
if (typeof sourceText !== 'string' || previousEnd === null || nextStart <= previousEnd) {
|
||
return false;
|
||
}
|
||
|
||
const gap = sourceText.slice(previousEnd, nextStart);
|
||
return [...gap].some((char) => SENTENCE_BOUNDARY_SURFACES.has(char));
|
||
}
|
||
|
||
/**
 * Marks at most one N+1 target per sentence.
 *
 * Tokens are split into sentences at boundary symbol tokens and at boundary
 * characters found in source-text gaps between tokens (punctuation the
 * tokenizer dropped). A sentence receives a target only when it contains at
 * least `minSentenceWords` countable words and exactly one unknown candidate.
 *
 * @param tokens merged tokens for one subtitle line
 * @param minSentenceWords minimum countable words per sentence (non-integer values fall back to 3; clamped to >= 1)
 * @param pos1Exclusions pos1 tags excluded from candidacy / word counting
 * @param pos2Exclusions pos2 tags excluded from candidacy / word counting
 * @param sourceText original subtitle text; enables gap-based boundary detection
 * @returns a new array of copies — the input tokens are not mutated
 */
export function markNPlusOneTargets(
  tokens: MergedToken[],
  minSentenceWords = 3,
  pos1Exclusions: ReadonlySet<string> = N_PLUS_ONE_IGNORED_POS1,
  pos2Exclusions: ReadonlySet<string> = N_PLUS_ONE_IGNORED_POS2,
  sourceText?: string,
): MergedToken[] {
  if (tokens.length === 0) {
    return [];
  }

  const normalizedSourceText = normalizeSourceTextForTokenOffsets(sourceText);

  // Work on copies with the target flag reset.
  const markedTokens = tokens.map((token) => ({
    ...token,
    isNPlusOneTarget: false,
  }));

  let sentenceStart = 0;
  let previousTokenEnd: number | null = null;
  const minimumSentenceWords = Number.isInteger(minSentenceWords)
    ? Math.max(1, minSentenceWords)
    : 3;

  // Flags the sentence's single unknown candidate when the sentence has
  // enough countable words. Range is [start, endExclusive).
  const markSentence = (start: number, endExclusive: number): void => {
    const sentenceCandidates: number[] = [];
    let sentenceWordCount = 0;
    for (let i = start; i < endExclusive; i++) {
      const token = markedTokens[i];
      if (!token) continue;
      if (isNPlusOneSentenceLengthToken(token, pos1Exclusions, pos2Exclusions)) {
        sentenceWordCount += 1;
      }

      if (isNPlusOneCandidateToken(token, pos1Exclusions, pos2Exclusions)) {
        sentenceCandidates.push(i);
      }
    }

    // Exactly one unknown candidate → that token becomes the N+1 target.
    if (sentenceWordCount >= minimumSentenceWords && sentenceCandidates.length === 1) {
      markedTokens[sentenceCandidates[0]!] = {
        ...markedTokens[sentenceCandidates[0]!]!,
        isNPlusOneTarget: true,
      };
    }
  };

  for (let i = 0; i < markedTokens.length; i++) {
    const token = markedTokens[i];
    if (!token) continue;
    // Boundary punctuation that exists only in the source text (between the
    // previous token's end and this token's start) also closes a sentence;
    // the current token starts the next one.
    if (hasSentenceBoundaryInSourceGap(normalizedSourceText, previousTokenEnd, token.startPos)) {
      markSentence(sentenceStart, i);
      sentenceStart = i;
    }
    // A boundary symbol token closes the sentence and is itself excluded
    // from the marked range.
    if (isSentenceBoundaryToken(token)) {
      markSentence(sentenceStart, i);
      sentenceStart = i + 1;
    }
    previousTokenEnd = token.endPos;
  }

  // Trailing sentence without a closing boundary.
  if (sentenceStart < markedTokens.length) {
    markSentence(sentenceStart, markedTokens.length);
  }

  return markedTokens;
}