/*
 * SubMiner - All-in-one sentence mining overlay
 * Copyright (C) 2024 sudacode
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program. If not, see <https://www.gnu.org/licenses/>.
 */

import { PartOfSpeech, Token, MergedToken } from './types';
import { DEFAULT_ANNOTATION_POS1_EXCLUSION_CONFIG } from './token-pos1-exclusions';
import { DEFAULT_ANNOTATION_POS2_EXCLUSION_CONFIG } from './token-pos2-exclusions';

/** Returns true when the token's part of speech is `PartOfSpeech.noun`. */
export function isNoun(tok: Token): boolean {
  return tok.partOfSpeech === PartOfSpeech.noun;
}

/** Returns true for a noun whose `pos2` tag is '固有名詞' (proper noun). */
export function isProperNoun(tok: Token): boolean {
  return tok.partOfSpeech === PartOfSpeech.noun && tok.pos2 === '固有名詞';
}

/**
 * Returns true for a symbol whose `pos2` tag is '文字'.
 * `mergeTokens` gives such tokens an empty reading.
 */
export function ignoreReading(tok: Token): boolean {
  return tok.partOfSpeech === PartOfSpeech.symbol && tok.pos2 === '文字';
}

/**
 * Returns true when the token's `inflectionType` is one of the recognised
 * copula inflection types. A missing/empty `inflectionType` yields false.
 */
export function isCopula(tok: Token): boolean {
  const raw = tok.inflectionType;
  if (!raw) {
    return false;
  }
  return ['特殊・ダ', '特殊・デス', '特殊|だ', '特殊|デス'].includes(raw);
}

/** Returns true for a bound auxiliary that is not a copula (see `isCopula`). */
export function isAuxVerb(tok: Token): boolean {
  return tok.partOfSpeech === PartOfSpeech.bound_auxiliary && !isCopula(tok);
}

/**
 * Returns true when `inflectionForm` is '連用デ接続', '連用タ接続', or starts
 * with '連用形', and the token's headword is not 'ない'.
 */
export function isContinuativeForm(tok: Token): boolean {
  if (!tok.inflectionForm) {
    return false;
  }
  const inflectionForm = tok.inflectionForm;
  const isContinuative =
    inflectionForm === '連用デ接続' ||
    inflectionForm === '連用タ接続' ||
    inflectionForm.startsWith('連用形');
  if (!isContinuative) {
    return false;
  }
  return tok.headword !== 'ない';
}

/** Returns true for a verb whose `pos2` tag is '非自立' or '接尾'. */
export function isVerbSuffix(tok: Token): boolean {
  return tok.partOfSpeech === PartOfSpeech.verb && (tok.pos2 === '非自立' || tok.pos2 === '接尾');
}

/** Returns true for a '接続助詞' particle whose headword is 'たって'. */
export function isTatteParticle(tok: Token): boolean {
  return (
    tok.partOfSpeech === PartOfSpeech.particle &&
    tok.pos2 === '接続助詞' &&
    tok.headword === 'たって'
  );
}

/** Returns true for a '接続助詞' particle whose surface form is 'ば'. */
export function isBaParticle(tok: Token): boolean {
  return tok.partOfSpeech === PartOfSpeech.particle && tok.pos2 === '接続助詞' && tok.word === 'ば';
}

/** Returns true for a '接続助詞' particle whose surface form is 'て', 'で' or 'ちゃ'. */
export function isTeDeParticle(tok: Token): boolean {
  return (
    tok.partOfSpeech === PartOfSpeech.particle &&
    tok.pos2 === '接続助詞' &&
    ['て', 'で', 'ちゃ'].includes(tok.word)
  );
}

/** Returns true for a non-copula bound auxiliary whose surface form is 'た' or 'だ'. */
export function isTaDaParticle(tok: Token): boolean {
  return isAuxVerb(tok) && ['た', 'だ'].includes(tok.word);
}

/** Returns true when the token is tagged as a verb or a bound auxiliary. */
export function isVerb(tok: Token): boolean {
  return [PartOfSpeech.verb, PartOfSpeech.bound_auxiliary].includes(tok.partOfSpeech);
}

/**
 * Always returns true.
 * NOTE(review): takes no token and performs no check; this looks like a
 * placeholder — confirm whether a real test was intended.
 */
export function isVerbNonIndependent(): boolean {
  return true;
}

/** Returns true when the token is a verb, a bound auxiliary, or an i-adjective. */
export function canReceiveAuxiliary(tok: Token): boolean {
  return [PartOfSpeech.verb, PartOfSpeech.bound_auxiliary, PartOfSpeech.i_adjective].includes(
    tok.partOfSpeech,
  );
}

/**
 * Returns true for a verb-tagged token whose `pos2` tag is '接尾'.
 * NOTE(review): despite the name, this tests `PartOfSpeech.verb`, not
 * `PartOfSpeech.noun` — confirm this is intended.
 */
export function isNounSuffix(tok: Token): boolean {
  return tok.partOfSpeech === PartOfSpeech.verb && tok.pos2 === '接尾';
}

/** Returns true for a noun whose `pos3` tag starts with '助数詞'. */
export function isCounter(tok: Token): boolean {
  return (
    tok.partOfSpeech === PartOfSpeech.noun &&
    tok.pos3 !== undefined &&
    tok.pos3.startsWith('助数詞')
  );
}

/** Returns true for a noun whose `pos2` tag starts with '数'. */
export function isNumeral(tok: Token): boolean {
  return (
    tok.partOfSpeech === PartOfSpeech.noun && tok.pos2 !== undefined && tok.pos2.startsWith('数')
  );
}

/**
 * Decides whether `token` should be appended to the merged entry that ends
 * with `lastStandaloneToken`.
 *
 * @param lastStandaloneToken - The token immediately preceding `token`.
 * @param token - The token being considered for merging.
 * @returns true when any of the merge rules below matches.
 */
export function shouldMerge(lastStandaloneToken: Token, token: Token): boolean {
  if (isVerb(lastStandaloneToken)) {
    if (isAuxVerb(token)) {
      return true;
    }
    if (isContinuativeForm(lastStandaloneToken) && isVerbSuffix(token)) {
      return true;
    }
    // isVerbNonIndependent() currently always returns true, so this rule
    // already covers the continuative-form rule above.
    if (isVerbSuffix(token) && isVerbNonIndependent()) {
      return true;
    }
  }

  if (isNoun(lastStandaloneToken) && !isProperNoun(lastStandaloneToken) && isNounSuffix(token)) {
    return true;
  }

  if (isCounter(token) && isNumeral(lastStandaloneToken)) {
    return true;
  }

  if (isBaParticle(token) && canReceiveAuxiliary(lastStandaloneToken)) {
    return true;
  }

  if (isTatteParticle(token) && canReceiveAuxiliary(lastStandaloneToken)) {
    return true;
  }

  if (isTeDeParticle(token) && isContinuativeForm(lastStandaloneToken)) {
    return true;
  }

  if (isTaDaParticle(token) && canReceiveAuxiliary(lastStandaloneToken)) {
    return true;
  }

  if (isTeDeParticle(lastStandaloneToken) && isVerbSuffix(token)) {
    return true;
  }

  return false;
}

/**
 * Folds a token sequence into merged tokens, joining each token onto the
 * previous result entry whenever `shouldMerge` says so.
 *
 * `startPos`/`endPos` are cumulative offsets computed from `token.word.length`.
 * A merged entry keeps the headword, part of speech and start position of the
 * entry it extends; its surface and reading are concatenated.
 *
 * @param tokens - Tokens in text order. A falsy or empty value yields `[]`.
 * @param isKnownWord - Predicate used to populate `isKnown`.
 * @param knownWordMatchMode - Whether `isKnownWord` receives the headword or the surface.
 * @param shouldLookupKnownWords - When false, `isKnown` is always false and
 *   `isKnownWord` is never called.
 * @returns The merged tokens, all with `isNPlusOneTarget` set to false.
 */
export function mergeTokens(
  tokens: Token[],
  isKnownWord: (text: string) => boolean = () => false,
  knownWordMatchMode: 'headword' | 'surface' = 'headword',
  shouldLookupKnownWords = true,
): MergedToken[] {
  if (!tokens || tokens.length === 0) {
    return [];
  }

  const result: MergedToken[] = [];
  let charOffset = 0;
  let lastStandaloneToken: Token | null = null;

  const resolveKnownMatch = (text: string | undefined): boolean => {
    if (!shouldLookupKnownWords || !text) {
      return false;
    }
    return isKnownWord(text);
  };

  for (const token of tokens) {
    const start = charOffset;
    const end = charOffset + token.word.length;
    charOffset = end;

    let shouldMergeToken = false;
    if (result.length > 0 && lastStandaloneToken !== null) {
      shouldMergeToken = shouldMerge(lastStandaloneToken, token);
    }

    const tokenReading = ignoreReading(token) ? '' : token.katakanaReading || token.word;

    // Only pop when merging; `prev` is undefined for a standalone token.
    const prev = shouldMergeToken ? result.pop() : undefined;

    if (prev !== undefined) {
      // NOTE(review): in 'surface' mode the lookup uses the surface from
      // before this token was appended — confirm this is intended.
      const headwordForKnownMatch =
        knownWordMatchMode === 'surface' ? prev.surface : prev.headword;

      result.push({
        surface: prev.surface + token.word,
        reading: prev.reading + tokenReading,
        headword: prev.headword,
        startPos: prev.startPos,
        endPos: end,
        partOfSpeech: prev.partOfSpeech,
        pos1: prev.pos1 ?? token.pos1,
        pos2: prev.pos2 ?? token.pos2,
        pos3: prev.pos3 ?? token.pos3,
        isMerged: true,
        isKnown: resolveKnownMatch(headwordForKnownMatch),
        isNPlusOneTarget: false,
      });
    } else {
      const headwordForKnownMatch =
        knownWordMatchMode === 'surface' ? token.word : token.headword;

      result.push({
        surface: token.word,
        reading: tokenReading,
        headword: token.headword,
        startPos: start,
        endPos: end,
        partOfSpeech: token.partOfSpeech,
        pos1: token.pos1,
        pos2: token.pos2,
        pos3: token.pos3,
        isMerged: false,
        isKnown: resolveKnownMatch(headwordForKnownMatch),
        isNPlusOneTarget: false,
      });
    }

    // Updated for every token, including ones that were merged.
    lastStandaloneToken = token;
  }

  return result;
}

// Surfaces that end a sentence. Non-ASCII entries other than '。' are written
// as escapes so the half-width and full-width forms cannot be confused.
const SENTENCE_BOUNDARY_SURFACES: ReadonlySet<string> = new Set<string>([
  '。',
  '?',
  '!',
  '\uFF1F', // fullwidth question mark
  '\uFF01', // fullwidth exclamation mark
  '\u2026', // horizontal ellipsis
]);

const N_PLUS_ONE_IGNORED_POS1: ReadonlySet<string> = new Set<string>(
  DEFAULT_ANNOTATION_POS1_EXCLUSION_CONFIG.defaults,
);
const N_PLUS_ONE_IGNORED_POS2: ReadonlySet<string> = new Set<string>(
  DEFAULT_ANNOTATION_POS2_EXCLUSION_CONFIG.defaults,
);

/** Trims a pos1 tag; any non-string value becomes the empty string. */
function normalizePos1Tag(pos1: string | undefined): string {
  return typeof pos1 === 'string' ? pos1.trim() : '';
}

/** Trims a pos2 tag; any non-string value becomes the empty string. */
function normalizePos2Tag(pos2: string | undefined): string {
  return typeof pos2 === 'string' ? pos2.trim() : '';
}

/**
 * Returns true when the tag, split on '|', has at least one non-empty part and
 * every non-empty part is contained in `exclusions`.
 */
function isExcludedByTagSet(normalizedTag: string, exclusions: ReadonlySet<string>): boolean {
  if (!normalizedTag) {
    return false;
  }
  const parts = normalizedTag
    .split('|')
    .map((part) => part.trim())
    .filter((part) => part.length > 0);
  if (parts.length === 0) {
    return false;
  }
  return parts.every((part) => exclusions.has(part));
}

/**
 * Returns true when the token is not known and also counts as a word
 * (see `isNPlusOneWordCountToken`).
 *
 * @param token - The merged token to test.
 * @param pos1Exclusions - pos1 tags that disqualify a token.
 * @param pos2Exclusions - pos2 tags that disqualify a token.
 */
export function isNPlusOneCandidateToken(
  token: MergedToken,
  pos1Exclusions: ReadonlySet<string> = N_PLUS_ONE_IGNORED_POS1,
  pos2Exclusions: ReadonlySet<string> = N_PLUS_ONE_IGNORED_POS2,
): boolean {
  if (token.isKnown) {
    return false;
  }
  return isNPlusOneWordCountToken(token, pos1Exclusions, pos2Exclusions);
}

/**
 * Returns true when the token counts toward a sentence's word count.
 *
 * A token is rejected when its pos1 or pos2 tag is fully covered by the
 * exclusion sets, when it has neither tag and is a particle, bound auxiliary
 * or symbol, when it is a proper noun, when its pos3 starts with '助数詞', or
 * when its surface is empty or whitespace only.
 */
function isNPlusOneWordCountToken(
  token: MergedToken,
  pos1Exclusions: ReadonlySet<string> = N_PLUS_ONE_IGNORED_POS1,
  pos2Exclusions: ReadonlySet<string> = N_PLUS_ONE_IGNORED_POS2,
): boolean {
  const normalizedPos1 = normalizePos1Tag(token.pos1);
  const hasPos1 = normalizedPos1.length > 0;
  if (isExcludedByTagSet(normalizedPos1, pos1Exclusions)) {
    return false;
  }

  const normalizedPos2 = normalizePos2Tag(token.pos2);
  const hasPos2 = normalizedPos2.length > 0;
  if (isExcludedByTagSet(normalizedPos2, pos2Exclusions)) {
    return false;
  }

  // With no pos1/pos2 tags to consult, fall back to the coarse part of speech.
  if (
    !hasPos1 &&
    !hasPos2 &&
    (token.partOfSpeech === PartOfSpeech.particle ||
      token.partOfSpeech === PartOfSpeech.bound_auxiliary ||
      token.partOfSpeech === PartOfSpeech.symbol)
  ) {
    return false;
  }

  if (token.partOfSpeech === PartOfSpeech.noun && token.pos2 === '固有名詞') {
    return false;
  }

  if (token.pos3 && token.pos3.startsWith('助数詞')) {
    return false;
  }

  if (token.surface.trim().length === 0) {
    return false;
  }

  return true;
}

/** Returns true for a symbol token whose surface is a sentence-ending mark. */
function isSentenceBoundaryToken(token: MergedToken): boolean {
  if (token.partOfSpeech !== PartOfSpeech.symbol) {
    return false;
  }
  return SENTENCE_BOUNDARY_SURFACES.has(token.surface);
}

/**
 * Returns a copy of `tokens` in which, for each sentence, the single unknown
 * candidate token is flagged with `isNPlusOneTarget: true`.
 *
 * Sentences are delimited by sentence-boundary symbol tokens; the boundary
 * token itself belongs to no sentence. A sentence gets a target only when it
 * has at least the minimum number of word-count tokens and exactly one
 * candidate token. The input array and its tokens are not mutated.
 *
 * @param tokens - Merged tokens in text order.
 * @param minSentenceWords - Minimum word count per sentence. Integers are
 *   clamped to at least 1; non-integers fall back to 3.
 * @param pos1Exclusions - pos1 tags excluded from counting and candidacy.
 * @param pos2Exclusions - pos2 tags excluded from counting and candidacy.
 */
export function markNPlusOneTargets(
  tokens: MergedToken[],
  minSentenceWords = 3,
  pos1Exclusions: ReadonlySet<string> = N_PLUS_ONE_IGNORED_POS1,
  pos2Exclusions: ReadonlySet<string> = N_PLUS_ONE_IGNORED_POS2,
): MergedToken[] {
  if (tokens.length === 0) {
    return [];
  }

  // Work on copies with the flag reset so stale marks never survive.
  const markedTokens = tokens.map((token) => ({
    ...token,
    isNPlusOneTarget: false,
  }));

  let sentenceStart = 0;
  const minimumSentenceWords = Number.isInteger(minSentenceWords)
    ? Math.max(1, minSentenceWords)
    : 3;

  const markSentence = (start: number, endExclusive: number): void => {
    const sentenceCandidates: number[] = [];
    let sentenceWordCount = 0;

    for (let i = start; i < endExclusive; i++) {
      const token = markedTokens[i];
      if (!token) continue;
      if (isNPlusOneWordCountToken(token, pos1Exclusions, pos2Exclusions)) {
        sentenceWordCount += 1;
      }
      if (isNPlusOneCandidateToken(token, pos1Exclusions, pos2Exclusions)) {
        sentenceCandidates.push(i);
      }
    }

    if (sentenceWordCount < minimumSentenceWords || sentenceCandidates.length !== 1) {
      return;
    }

    const targetIndex = sentenceCandidates[0];
    if (targetIndex === undefined) {
      return;
    }
    const target = markedTokens[targetIndex];
    if (target) {
      markedTokens[targetIndex] = { ...target, isNPlusOneTarget: true };
    }
  };

  for (let i = 0; i < markedTokens.length; i++) {
    const token = markedTokens[i];
    if (!token) continue;
    if (isSentenceBoundaryToken(token)) {
      markSentence(sentenceStart, i);
      sentenceStart = i + 1;
    }
  }

  // Trailing text after the last boundary (or the whole input if none).
  if (sentenceStart < markedTokens.length) {
    markSentence(sentenceStart, markedTokens.length);
  }

  return markedTokens;
}