/*
* SubMiner - All-in-one sentence mining overlay
* Copyright (C) 2024 sudacode
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <https://www.gnu.org/licenses/>.
*/
import { PartOfSpeech, Token, MergedToken } from './types';
import { DEFAULT_ANNOTATION_POS1_EXCLUSION_CONFIG } from './token-pos1-exclusions';
import { DEFAULT_ANNOTATION_POS2_EXCLUSION_CONFIG } from './token-pos2-exclusions';
/**
 * Reports whether a token is tagged as a noun.
 *
 * @param tok - Token to inspect.
 * @returns `true` when `tok.partOfSpeech` equals `PartOfSpeech.noun`.
 */
export function isNoun(tok: Token): boolean {
  const { partOfSpeech } = tok;
  return partOfSpeech === PartOfSpeech.noun;
}
/**
 * Reports whether a token is a proper noun.
 *
 * @param tok - Token to inspect.
 * @returns `true` when the token is a noun whose `pos2` is '固有名詞' (proper noun).
 */
export function isProperNoun(tok: Token): boolean {
  if (tok.partOfSpeech !== PartOfSpeech.noun) {
    return false;
  }
  return tok.pos2 === '固有名詞';
}
/**
 * Reports whether a token's reading should be dropped.
 *
 * `mergeTokens` uses an empty string as the reading for tokens where this
 * returns `true`.
 *
 * @param tok - Token to inspect.
 * @returns `true` when the token is a symbol whose `pos2` is '文字'.
 */
export function ignoreReading(tok: Token): boolean {
  const isSymbol = tok.partOfSpeech === PartOfSpeech.symbol;
  return isSymbol && tok.pos2 === '文字';
}
/**
 * Reports whether a token is a copula, judged solely by its inflection type.
 *
 * @param tok - Token to inspect.
 * @returns `true` when `tok.inflectionType` is one of the recognised copula
 *   inflection types; `false` otherwise, including when it is missing or empty.
 */
export function isCopula(tok: Token): boolean {
  switch (tok.inflectionType) {
    case '特殊・ダ':
    case '特殊・デス':
    case '特殊|だ':
    case '特殊|デス':
      return true;
    default:
      // Covers undefined / empty values as well as any other inflection type.
      return false;
  }
}
/**
 * Reports whether a token is a bound auxiliary that is not a copula.
 *
 * @param tok - Token to inspect.
 * @returns `true` when the part of speech is `PartOfSpeech.bound_auxiliary`
 *   and `isCopula(tok)` is `false`.
 */
export function isAuxVerb(tok: Token): boolean {
  if (tok.partOfSpeech !== PartOfSpeech.bound_auxiliary) {
    return false;
  }
  return !isCopula(tok);
}
/**
 * Reports whether a token is in a continuative inflection form.
 *
 * A token qualifies when its `inflectionForm` is '連用デ接続', '連用タ接続', or
 * starts with '連用形'. Tokens whose headword is 'ない' are always rejected.
 *
 * @param tok - Token to inspect.
 * @returns `true` for a continuative-form token other than 'ない'; `false`
 *   when the inflection form is missing, empty, or not continuative.
 */
export function isContinuativeForm(tok: Token): boolean {
  const form = tok.inflectionForm;
  if (!form) {
    return false;
  }

  const continuative =
    form.startsWith('連用形') || form === '連用デ接続' || form === '連用タ接続';

  return continuative && tok.headword !== 'ない';
}
/**
 * Reports whether a token is a dependent verb or a verb suffix.
 *
 * @param tok - Token to inspect.
 * @returns `true` when the token is a verb whose `pos2` is '非自立'
 *   (non-independent) or '接尾' (suffix).
 */
export function isVerbSuffix(tok: Token): boolean {
  if (tok.partOfSpeech !== PartOfSpeech.verb) {
    return false;
  }
  const { pos2 } = tok;
  return pos2 === '非自立' || pos2 === '接尾';
}
/**
 * Reports whether a token is the conjunctive particle 'たって'.
 *
 * @param tok - Token to inspect.
 * @returns `true` when the token is a particle with `pos2` '接続助詞'
 *   (conjunctive particle) and headword 'たって'.
 */
export function isTatteParticle(tok: Token): boolean {
  const isConjunctiveParticle =
    tok.partOfSpeech === PartOfSpeech.particle && tok.pos2 === '接続助詞';
  return isConjunctiveParticle && tok.headword === 'たって';
}
/**
 * Reports whether a token is the conjunctive particle 'ば'.
 *
 * Unlike `isTatteParticle`, the match is on the surface `word`, not the headword.
 *
 * @param tok - Token to inspect.
 * @returns `true` when the token is a particle with `pos2` '接続助詞' and
 *   surface word 'ば'.
 */
export function isBaParticle(tok: Token): boolean {
  const isConjunctiveParticle =
    tok.partOfSpeech === PartOfSpeech.particle && tok.pos2 === '接続助詞';
  return isConjunctiveParticle && tok.word === 'ば';
}
/**
 * Reports whether a token is one of the conjunctive particles 'て', 'で' or 'ちゃ'.
 *
 * @param tok - Token to inspect.
 * @returns `true` when the token is a particle with `pos2` '接続助詞' whose
 *   surface word is 'て', 'で' or 'ちゃ'.
 */
export function isTeDeParticle(tok: Token): boolean {
  if (tok.partOfSpeech !== PartOfSpeech.particle || tok.pos2 !== '接続助詞') {
    return false;
  }
  const surface = tok.word;
  return surface === 'て' || surface === 'で' || surface === 'ちゃ';
}
/**
 * Reports whether a token is the auxiliary 'た' or 'だ'.
 *
 * Despite the name, the check is for a non-copula bound auxiliary
 * (`isAuxVerb`), not for a particle part of speech.
 *
 * @param tok - Token to inspect.
 * @returns `true` when `isAuxVerb(tok)` holds and the surface word is 'た' or 'だ'.
 */
export function isTaDaParticle(tok: Token): boolean {
  if (!isAuxVerb(tok)) {
    return false;
  }
  return tok.word === 'た' || tok.word === 'だ';
}
/**
 * Reports whether a token is verb-like for merging purposes.
 *
 * @param tok - Token to inspect.
 * @returns `true` when the part of speech is `PartOfSpeech.verb` or
 *   `PartOfSpeech.bound_auxiliary`.
 */
export function isVerb(tok: Token): boolean {
  const pos = tok.partOfSpeech;
  return pos === PartOfSpeech.verb || pos === PartOfSpeech.bound_auxiliary;
}
/**
 * Constant predicate used by `shouldMerge` when deciding whether a verb
 * suffix attaches to a preceding verb.
 *
 * It takes no token and unconditionally answers `true`.
 * NOTE(review): this looks like a placeholder for a real check — confirm
 * whether a token-based test was intended.
 *
 * @returns Always `true`.
 */
export function isVerbNonIndependent(): boolean {
  return true;
}
/**
 * Reports whether a token can have an auxiliary or conjunctive ending
 * attached to it.
 *
 * @param tok - Token to inspect.
 * @returns `true` when the part of speech is a verb, a bound auxiliary or an
 *   i-adjective.
 */
export function canReceiveAuxiliary(tok: Token): boolean {
  switch (tok.partOfSpeech) {
    case PartOfSpeech.verb:
    case PartOfSpeech.bound_auxiliary:
    case PartOfSpeech.i_adjective:
      return true;
    default:
      return false;
  }
}
/**
 * Reports whether a token is a suffix that `shouldMerge` may attach to a
 * preceding common noun.
 *
 * NOTE(review): despite the name, this tests for a *verb* whose `pos2` is
 * '接尾' (suffix), not a noun-tagged suffix — confirm that this is intended.
 *
 * @param tok - Token to inspect.
 * @returns `true` when the token is a verb with `pos2` equal to '接尾'.
 */
export function isNounSuffix(tok: Token): boolean {
  const isSuffix = tok.pos2 === '接尾';
  return isSuffix && tok.partOfSpeech === PartOfSpeech.verb;
}
/**
 * Reports whether a token is a counter word.
 *
 * @param tok - Token to inspect.
 * @returns `true` when the token is a noun whose `pos3` is defined and starts
 *   with '助数詞' (counter).
 */
export function isCounter(tok: Token): boolean {
  if (tok.partOfSpeech !== PartOfSpeech.noun) {
    return false;
  }
  const { pos3 } = tok;
  if (pos3 === undefined) {
    return false;
  }
  return pos3.startsWith('助数詞');
}
/**
 * Reports whether a token is a numeral.
 *
 * @param tok - Token to inspect.
 * @returns `true` when the token is a noun whose `pos2` is defined and starts
 *   with '数' (number).
 */
export function isNumeral(tok: Token): boolean {
  if (tok.partOfSpeech !== PartOfSpeech.noun) {
    return false;
  }
  const { pos2 } = tok;
  if (pos2 === undefined) {
    return false;
  }
  return pos2.startsWith('数');
}
/**
 * Decides whether `token` should be glued onto the merged token that ends
 * with `lastStandaloneToken`.
 *
 * The rules are tried in order and the first match wins:
 *  1. verb-like + auxiliary verb
 *  2. verb-like + verb suffix
 *  3. common (non-proper) noun + "noun suffix" (see `isNounSuffix`)
 *  4. numeral + counter
 *  5. verb / auxiliary / i-adjective + 'ば'
 *  6. verb / auxiliary / i-adjective + 'たって'
 *  7. continuative form + 'て' / 'で' / 'ちゃ'
 *  8. verb / auxiliary / i-adjective + 'た' / 'だ'
 *  9. 'て' / 'で' / 'ちゃ' + verb suffix
 *
 * @param lastStandaloneToken - The raw token immediately before `token`.
 * @param token - The token being considered for merging.
 * @returns `true` when any rule matches, otherwise `false`.
 */
export function shouldMerge(lastStandaloneToken: Token, token: Token): boolean {
  const previous = lastStandaloneToken;

  if (isVerb(previous)) {
    if (isAuxVerb(token)) {
      return true;
    }
    // This continuative-form rule is currently subsumed by the rule after it,
    // because isVerbNonIndependent() unconditionally returns true.
    if (isContinuativeForm(previous) && isVerbSuffix(token)) {
      return true;
    }
    if (isVerbSuffix(token) && isVerbNonIndependent()) {
      return true;
    }
  }

  return (
    (isNoun(previous) && !isProperNoun(previous) && isNounSuffix(token)) ||
    (isCounter(token) && isNumeral(previous)) ||
    (isBaParticle(token) && canReceiveAuxiliary(previous)) ||
    (isTatteParticle(token) && canReceiveAuxiliary(previous)) ||
    (isTeDeParticle(token) && isContinuativeForm(previous)) ||
    (isTaDaParticle(token) && canReceiveAuxiliary(previous)) ||
    (isTeDeParticle(previous) && isVerbSuffix(token))
  );
}
/**
 * Collapses a tokenizer's raw tokens into display tokens by attaching
 * dependent tokens (auxiliaries, suffixes, certain particles, counters) to
 * the token before them, as decided by `shouldMerge`.
 *
 * Positions are computed by summing `token.word.length` from 0, so they are
 * UTF-16 code-unit offsets into the concatenation of all token surfaces.
 *
 * A merged token keeps the headword, part of speech and start position of
 * the token it was attached to; its surface and reading are concatenated,
 * and `pos1`/`pos2`/`pos3` fall back to the attached token's values only
 * when the earlier ones are null/undefined.
 *
 * @param tokens - Raw tokens in sentence order. A missing or empty list
 *   yields `[]`.
 * @param isKnownWord - Lookup used to fill `isKnown`; defaults to "nothing is
 *   known".
 * @param knownWordMatchMode - Whether the lookup text is the headword
 *   (default) or the surface form.
 * @param shouldLookupKnownWords - When `false`, `isKnownWord` is never called
 *   and every token is reported as not known.
 * @returns Newly created merged tokens, all with `isNPlusOneTarget` set to
 *   `false`.
 */
export function mergeTokens(
  tokens: Token[],
  isKnownWord: (text: string) => boolean = () => false,
  knownWordMatchMode: 'headword' | 'surface' = 'headword',
  shouldLookupKnownWords = true,
): MergedToken[] {
  if (!tokens || tokens.length === 0) {
    return [];
  }

  const matchOnSurface = knownWordMatchMode === 'surface';

  // Empty / missing text is never looked up, and lookups can be disabled wholesale.
  const lookupKnown = (text: string | undefined): boolean => {
    if (text && shouldLookupKnownWords) {
      return isKnownWord(text);
    }
    return false;
  };

  const merged: MergedToken[] = [];
  let previousToken: Token | null = null;
  let cursor = 0;

  for (const token of tokens) {
    const startPos = cursor;
    const endPos = startPos + token.word.length;
    cursor = endPos;

    // The merge decision looks at the previous *raw* token, while the text is
    // appended to the last *merged* entry.
    const tail = merged[merged.length - 1];
    const attachTo =
      tail !== undefined && previousToken !== null && shouldMerge(previousToken, token)
        ? tail
        : undefined;

    const tokenReading = ignoreReading(token) ? '' : token.katakanaReading || token.word;

    if (attachTo !== undefined) {
      // NOTE(review): in 'surface' mode the lookup text is the surface
      // accumulated *before* this token is appended, not the full merged
      // surface — confirm this is intended.
      const lookupText = matchOnSurface ? attachTo.surface : attachTo.headword;
      merged[merged.length - 1] = {
        surface: attachTo.surface + token.word,
        reading: attachTo.reading + tokenReading,
        headword: attachTo.headword,
        startPos: attachTo.startPos,
        endPos,
        partOfSpeech: attachTo.partOfSpeech,
        pos1: attachTo.pos1 ?? token.pos1,
        pos2: attachTo.pos2 ?? token.pos2,
        pos3: attachTo.pos3 ?? token.pos3,
        isMerged: true,
        isKnown: lookupKnown(lookupText),
        isNPlusOneTarget: false,
      };
    } else {
      const lookupText = matchOnSurface ? token.word : token.headword;
      merged.push({
        surface: token.word,
        reading: tokenReading,
        headword: token.headword,
        startPos,
        endPos,
        partOfSpeech: token.partOfSpeech,
        pos1: token.pos1,
        pos2: token.pos2,
        pos3: token.pos3,
        isMerged: false,
        isKnown: lookupKnown(lookupText),
        isNPlusOneTarget: false,
      });
    }

    previousToken = token;
  }

  return merged;
}
/**
 * Surfaces that end a sentence when they appear as a symbol token.
 *
 * Code points that are easily lost to width normalization are written as
 * escapes so the fullwidth forms survive alongside their ASCII counterparts.
 */
const SENTENCE_BOUNDARY_SURFACES = new Set([
  '。',
  '?',
  '!',
  '\uFF1F', // fullwidth question mark
  '\uFF01', // fullwidth exclamation mark
  '\u2026', // horizontal ellipsis
]);
// Default exclusion sets for N+1 annotation, built once from the `defaults`
// lists of the imported POS1 / POS2 exclusion configs. They serve as the
// default `pos1Exclusions` / `pos2Exclusions` arguments of the N+1 helpers.
const N_PLUS_ONE_IGNORED_POS1 = new Set(DEFAULT_ANNOTATION_POS1_EXCLUSION_CONFIG.defaults);
const N_PLUS_ONE_IGNORED_POS2 = new Set(DEFAULT_ANNOTATION_POS2_EXCLUSION_CONFIG.defaults);
/**
 * Normalizes a POS1 tag for comparison.
 *
 * @param pos1 - Raw tag, possibly missing.
 * @returns The tag with surrounding whitespace removed, or `''` when the
 *   value is not a string.
 */
function normalizePos1Tag(pos1: string | undefined): string {
  if (typeof pos1 !== 'string') {
    return '';
  }
  return pos1.trim();
}
/**
 * Normalizes a POS2 tag for comparison.
 *
 * @param pos2 - Raw tag, possibly missing.
 * @returns The tag with surrounding whitespace removed, or `''` when the
 *   value is not a string.
 */
function normalizePos2Tag(pos2: string | undefined): string {
  if (typeof pos2 !== 'string') {
    return '';
  }
  return pos2.trim();
}
/**
 * Reports whether a (possibly compound) tag is fully covered by an exclusion set.
 *
 * The tag is split on '|'; each part is trimmed and empty parts are dropped.
 * The tag counts as excluded only when at least one part remains and *every*
 * remaining part is in `exclusions`, so a compound tag with any non-excluded
 * component is kept.
 *
 * @param normalizedTag - Tag text, expected to be already trimmed.
 * @param exclusions - Set of tag names to exclude.
 * @returns `true` when all non-empty parts are excluded; `false` for an empty
 *   tag or a tag with no non-empty parts.
 */
function isExcludedByTagSet(normalizedTag: string, exclusions: ReadonlySet<string>): boolean {
  if (!normalizedTag) {
    return false;
  }
  const parts = normalizedTag
    .split('|')
    .map((part) => part.trim())
    .filter((part) => part.length > 0);
  // `every` is vacuously true on an empty array, so bail out explicitly.
  if (parts.length === 0) {
    return false;
  }
  return parts.every((part) => exclusions.has(part));
}
/**
 * Reports whether a merged token could be the single unknown word of an N+1
 * sentence.
 *
 * A candidate is a token that is not already known and that also counts as a
 * word under `isNPlusOneWordCountToken`.
 *
 * @param token - Merged token to inspect.
 * @param pos1Exclusions - POS1 tags to ignore; defaults to the module's
 *   default POS1 exclusion set.
 * @param pos2Exclusions - POS2 tags to ignore; defaults to the module's
 *   default POS2 exclusion set.
 * @returns `true` when the token is unknown and countable as a word.
 */
export function isNPlusOneCandidateToken(
  token: MergedToken,
  pos1Exclusions: ReadonlySet<string> = N_PLUS_ONE_IGNORED_POS1,
  pos2Exclusions: ReadonlySet<string> = N_PLUS_ONE_IGNORED_POS2,
): boolean {
  if (token.isKnown) {
    return false;
  }
  return isNPlusOneWordCountToken(token, pos1Exclusions, pos2Exclusions);
}
/**
 * Reports whether a merged token counts as a "word" for N+1 sentence-length
 * purposes, regardless of whether it is known.
 *
 * A token is rejected when any of the following holds:
 *  - its POS1 or POS2 tag is fully covered by the matching exclusion set;
 *  - it has neither a POS1 nor a POS2 tag and its part of speech is a
 *    particle, bound auxiliary or symbol;
 *  - it is a noun with `pos2` '固有名詞' (proper noun);
 *  - its `pos3` starts with '助数詞' (counter);
 *  - its surface is empty or whitespace only.
 *
 * @param token - Merged token to inspect.
 * @param pos1Exclusions - POS1 tags to ignore; defaults to the module's
 *   default POS1 exclusion set.
 * @param pos2Exclusions - POS2 tags to ignore; defaults to the module's
 *   default POS2 exclusion set.
 * @returns `true` when none of the rejection rules apply.
 */
function isNPlusOneWordCountToken(
  token: MergedToken,
  pos1Exclusions: ReadonlySet<string> = N_PLUS_ONE_IGNORED_POS1,
  pos2Exclusions: ReadonlySet<string> = N_PLUS_ONE_IGNORED_POS2,
): boolean {
  const normalizedPos1 = normalizePos1Tag(token.pos1);
  const hasPos1 = normalizedPos1.length > 0;
  if (isExcludedByTagSet(normalizedPos1, pos1Exclusions)) {
    return false;
  }

  const normalizedPos2 = normalizePos2Tag(token.pos2);
  const hasPos2 = normalizedPos2.length > 0;
  if (isExcludedByTagSet(normalizedPos2, pos2Exclusions)) {
    return false;
  }

  // Without any POS tags the exclusion sets cannot apply, so fall back to the
  // coarse part of speech to drop function words and symbols.
  if (
    !hasPos1 &&
    !hasPos2 &&
    (token.partOfSpeech === PartOfSpeech.particle ||
      token.partOfSpeech === PartOfSpeech.bound_auxiliary ||
      token.partOfSpeech === PartOfSpeech.symbol)
  ) {
    return false;
  }

  if (token.partOfSpeech === PartOfSpeech.noun && token.pos2 === '固有名詞') {
    return false;
  }

  if (token.pos3 && token.pos3.startsWith('助数詞')) {
    return false;
  }

  if (token.surface.trim().length === 0) {
    return false;
  }

  return true;
}
/**
 * Reports whether a merged token ends a sentence.
 *
 * @param token - Merged token to inspect.
 * @returns `true` when the token is a symbol whose surface is listed in
 *   `SENTENCE_BOUNDARY_SURFACES`.
 */
function isSentenceBoundaryToken(token: MergedToken): boolean {
  return (
    token.partOfSpeech === PartOfSpeech.symbol && SENTENCE_BOUNDARY_SURFACES.has(token.surface)
  );
}
/**
 * Flags the N+1 target in each sentence of a merged-token sequence.
 *
 * The sequence is split into sentences at sentence-boundary symbol tokens
 * (the boundary token itself belongs to no sentence). A sentence gets a
 * target only when it contains at least the minimum number of countable
 * words and exactly one N+1 candidate; that candidate is then marked.
 *
 * The input array and its tokens are not mutated: every token is shallow
 * copied with `isNPlusOneTarget` reset to `false` before marking.
 *
 * @param tokens - Merged tokens in sentence order.
 * @param minSentenceWords - Minimum countable words a sentence needs.
 *   Integer values are clamped to at least 1; non-integer values fall back to 3.
 * @param pos1Exclusions - POS1 tags to ignore; defaults to the module's
 *   default POS1 exclusion set.
 * @param pos2Exclusions - POS2 tags to ignore; defaults to the module's
 *   default POS2 exclusion set.
 * @returns A new array of token copies with `isNPlusOneTarget` set.
 */
export function markNPlusOneTargets(
  tokens: MergedToken[],
  minSentenceWords = 3,
  pos1Exclusions: ReadonlySet<string> = N_PLUS_ONE_IGNORED_POS1,
  pos2Exclusions: ReadonlySet<string> = N_PLUS_ONE_IGNORED_POS2,
): MergedToken[] {
  if (tokens.length === 0) {
    return [];
  }

  const markedTokens = tokens.map((token) => ({
    ...token,
    isNPlusOneTarget: false,
  }));

  let sentenceStart = 0;
  const minimumSentenceWords = Number.isInteger(minSentenceWords)
    ? Math.max(1, minSentenceWords)
    : 3;

  // Examines the half-open range [start, endExclusive) as one sentence.
  const markSentence = (start: number, endExclusive: number): void => {
    const sentenceCandidates: number[] = [];
    let sentenceWordCount = 0;

    for (let i = start; i < endExclusive; i++) {
      const token = markedTokens[i];
      if (!token) continue;
      if (isNPlusOneWordCountToken(token, pos1Exclusions, pos2Exclusions)) {
        sentenceWordCount += 1;
      }
      if (isNPlusOneCandidateToken(token, pos1Exclusions, pos2Exclusions)) {
        sentenceCandidates.push(i);
      }
    }

    // Exactly one unknown word in a long-enough sentence is the N+1 case.
    if (sentenceWordCount >= minimumSentenceWords && sentenceCandidates.length === 1) {
      markedTokens[sentenceCandidates[0]!] = {
        ...markedTokens[sentenceCandidates[0]!]!,
        isNPlusOneTarget: true,
      };
    }
  };

  for (let i = 0; i < markedTokens.length; i++) {
    const token = markedTokens[i];
    if (!token) continue;
    if (isSentenceBoundaryToken(token)) {
      markSentence(sentenceStart, i);
      sentenceStart = i + 1;
    }
  }

  // Trailing text without a closing boundary still forms a sentence.
  if (sentenceStart < markedTokens.length) {
    markSentence(sentenceStart, markedTokens.length);
  }

  return markedTokens;
}