Files
SubMiner/src/token-merger.ts

396 lines
11 KiB
TypeScript
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
/*
* SubMiner - All-in-one sentence mining overlay
* Copyright (C) 2024 sudacode
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <https://www.gnu.org/licenses/>.
*/
import { PartOfSpeech, Token, MergedToken } from './types';
import { DEFAULT_ANNOTATION_POS1_EXCLUSION_CONFIG } from './token-pos1-exclusions';
import { DEFAULT_ANNOTATION_POS2_EXCLUSION_CONFIG } from './token-pos2-exclusions';
export function isNoun(tok: Token): boolean {
return tok.partOfSpeech === PartOfSpeech.noun;
}
/** True for proper nouns: noun POS with the 固有名詞 sub-category tag. */
export function isProperNoun(tok: Token): boolean {
  if (tok.partOfSpeech !== PartOfSpeech.noun) {
    return false;
  }
  return tok.pos2 === '固有名詞';
}
/** True for symbol/文字 tokens whose reading is dropped when concatenating. */
export function ignoreReading(tok: Token): boolean {
  if (tok.partOfSpeech !== PartOfSpeech.symbol) {
    return false;
  }
  return tok.pos2 === '文字';
}
/**
 * True when the token's inflection type marks it as a copula (だ/です).
 * NOTE(review): the list mixes '・' and '|' separators and scripts
 * ('特殊|だ' is hiragana while '特殊|デス' is katakana) — presumably to cover
 * two tokenizer tag formats; confirm the hiragana variant is intentional.
 */
export function isCopula(tok: Token): boolean {
  const copulaInflectionTypes = ['特殊・ダ', '特殊・デス', '特殊|だ', '特殊|デス'];
  const inflectionType = tok.inflectionType;
  return inflectionType ? copulaInflectionTypes.includes(inflectionType) : false;
}
/** True for bound auxiliaries (助動詞) that are not the copula. */
export function isAuxVerb(tok: Token): boolean {
  if (tok.partOfSpeech !== PartOfSpeech.bound_auxiliary) {
    return false;
  }
  return !isCopula(tok);
}
/**
 * True when the token is in a continuative (連用) inflection form that can
 * host a following suffix — except for headword 'ない', which stays standalone.
 */
export function isContinuativeForm(tok: Token): boolean {
  const form = tok.inflectionForm;
  if (!form) {
    return false;
  }
  const continuative =
    form === '連用デ接続' || form === '連用タ接続' || form.startsWith('連用形');
  // 'ない' is kept as its own token even in continuative form.
  return continuative && tok.headword !== 'ない';
}
/** True for non-independent (非自立) or suffix (接尾) verbs that attach to a stem. */
export function isVerbSuffix(tok: Token): boolean {
  const isVerbPos = tok.partOfSpeech === PartOfSpeech.verb;
  const isAttachingSubPos = tok.pos2 === '非自立' || tok.pos2 === '接尾';
  return isVerbPos && isAttachingSubPos;
}
/** True for the conjunctive particle たって (headword match). */
export function isTatteParticle(tok: Token): boolean {
  if (tok.partOfSpeech !== PartOfSpeech.particle || tok.pos2 !== '接続助詞') {
    return false;
  }
  return tok.headword === 'たって';
}
/**
 * True for the conditional particle ば.
 * NOTE(review): matches the surface (word), unlike isTatteParticle which
 * matches the headword — confirm the asymmetry is intentional.
 */
export function isBaParticle(tok: Token): boolean {
  if (tok.partOfSpeech !== PartOfSpeech.particle) {
    return false;
  }
  return tok.pos2 === '接続助詞' && tok.word === 'ば';
}
/** True for the conjunctive particles て/で/ちゃ that chain onto continuative forms. */
export function isTeDeParticle(tok: Token): boolean {
  const isConjunctive =
    tok.partOfSpeech === PartOfSpeech.particle && tok.pos2 === '接続助詞';
  if (!isConjunctive) {
    return false;
  }
  switch (tok.word) {
    case 'て':
    case 'で':
    case 'ちゃ':
      return true;
    default:
      return false;
  }
}
/** True for the past-tense auxiliaries た/だ (copula excluded via isAuxVerb). */
export function isTaDaParticle(tok: Token): boolean {
  if (!isAuxVerb(tok)) {
    return false;
  }
  return tok.word === 'た' || tok.word === 'だ';
}
/** True for verbs proper and bound auxiliaries. */
export function isVerb(tok: Token): boolean {
  const pos = tok.partOfSpeech;
  return pos === PartOfSpeech.verb || pos === PartOfSpeech.bound_auxiliary;
}
/**
 * NOTE(review): stub — always returns true and inspects no token, so the
 * `isVerbSuffix(token) && isVerbNonIndependent()` guard in shouldMerge
 * degenerates to `isVerbSuffix(token)`. Kept as-is to preserve behavior;
 * consider implementing a real 非自立 check or removing the call.
 */
export function isVerbNonIndependent(): boolean {
  return true;
}
/** True for POS classes that can host a following auxiliary: verbs, auxiliaries, i-adjectives. */
export function canReceiveAuxiliary(tok: Token): boolean {
  switch (tok.partOfSpeech) {
    case PartOfSpeech.verb:
    case PartOfSpeech.bound_auxiliary:
    case PartOfSpeech.i_adjective:
      return true;
    default:
      return false;
  }
}
/**
 * True for suffix (接尾) tokens that attach to a preceding noun.
 * NOTE(review): despite the name this checks PartOfSpeech.verb, not noun —
 * presumably targeting verbal suffixes attaching to nouns; confirm.
 */
export function isNounSuffix(tok: Token): boolean {
  return tok.pos2 === '接尾' && tok.partOfSpeech === PartOfSpeech.verb;
}
/** True for counter words, identified via a pos3 tag starting with 助数詞. */
export function isCounter(tok: Token): boolean {
  if (tok.partOfSpeech !== PartOfSpeech.noun) {
    return false;
  }
  return tok.pos3?.startsWith('助数詞') ?? false;
}
/** True for numeric nouns, identified via a pos2 tag starting with 数. */
export function isNumeral(tok: Token): boolean {
  if (tok.partOfSpeech !== PartOfSpeech.noun) {
    return false;
  }
  return tok.pos2?.startsWith('数') ?? false;
}
/**
 * Decides whether `token` should be folded into the preceding standalone
 * token: verb + auxiliary/suffix chains, noun + suffix, numeral + counter,
 * and the conjunctive-particle patterns ば, たって, た/だ, て/で/ちゃ.
 * All predicates are pure, so check order does not affect the result.
 */
export function shouldMerge(lastStandaloneToken: Token, token: Token): boolean {
  const prev = lastStandaloneToken;
  if (isVerb(prev)) {
    if (isAuxVerb(token)) {
      return true;
    }
    // NOTE: isVerbNonIndependent() is currently always true, so the
    // continuative-form check below is subsumed by the suffix check; both
    // are kept for fidelity with the original logic.
    if (isContinuativeForm(prev) && isVerbSuffix(token)) {
      return true;
    }
    if (isVerbSuffix(token) && isVerbNonIndependent()) {
      return true;
    }
  }
  // Noun + suffix (proper nouns stay standalone); numeral + counter.
  if (isNounSuffix(token) && isNoun(prev) && !isProperNoun(prev)) {
    return true;
  }
  if (isNumeral(prev) && isCounter(token)) {
    return true;
  }
  // Particles/auxiliaries that attach to inflectable words.
  if (
    canReceiveAuxiliary(prev) &&
    (isBaParticle(token) || isTatteParticle(token) || isTaDaParticle(token))
  ) {
    return true;
  }
  if (isContinuativeForm(prev) && isTeDeParticle(token)) {
    return true;
  }
  // After a merged て/で, a following non-independent verb (e.g. いる) also merges.
  if (isTeDeParticle(prev) && isVerbSuffix(token)) {
    return true;
  }
  return false;
}
/**
 * Tokenizer post-pass: folds auxiliary/suffix/particle tokens into their
 * preceding host token (see shouldMerge) and annotates every result with
 * character offsets and a known-word flag.
 *
 * @param tokens raw tokenizer output, in document order
 * @param isKnownWord membership test for the user's known-word list
 * @param knownWordMatchMode whether lookup uses the headword or the surface form
 * @param shouldLookupKnownWords disables known-word lookup entirely when false
 * @returns merged tokens with startPos/endPos character offsets
 */
export function mergeTokens(
  tokens: Token[],
  isKnownWord: (text: string) => boolean = () => false,
  knownWordMatchMode: 'headword' | 'surface' = 'headword',
  shouldLookupKnownWords = true,
): MergedToken[] {
  if (!tokens || tokens.length === 0) {
    return [];
  }

  const isKnown = (text: string | undefined): boolean =>
    shouldLookupKnownWords && !!text && isKnownWord(text);

  const merged: MergedToken[] = [];
  let offset = 0;
  let previousToken: Token | null = null;

  for (const token of tokens) {
    const start = offset;
    const end = start + token.word.length;
    offset = end;

    // Symbol/文字 tokens contribute no reading when concatenated.
    const reading = ignoreReading(token) ? '' : token.katakanaReading || token.word;
    const mergeWithPrevious =
      merged.length > 0 && previousToken !== null && shouldMerge(previousToken, token);

    if (mergeWithPrevious) {
      const host = merged.pop()!;
      // NOTE(review): in 'surface' mode the known-word lookup uses the host's
      // pre-merge surface, not the concatenated one — confirm intentional.
      const lookupText = knownWordMatchMode === 'surface' ? host.surface : host.headword;
      merged.push({
        surface: host.surface + token.word,
        reading: host.reading + reading,
        headword: host.headword,
        startPos: host.startPos,
        endPos: end,
        partOfSpeech: host.partOfSpeech,
        pos1: host.pos1 ?? token.pos1,
        pos2: host.pos2 ?? token.pos2,
        pos3: host.pos3 ?? token.pos3,
        isMerged: true,
        isKnown: isKnown(lookupText),
        isNPlusOneTarget: false,
      });
    } else {
      const lookupText = knownWordMatchMode === 'surface' ? token.word : token.headword;
      merged.push({
        surface: token.word,
        reading,
        headword: token.headword,
        startPos: start,
        endPos: end,
        partOfSpeech: token.partOfSpeech,
        pos1: token.pos1,
        pos2: token.pos2,
        pos3: token.pos3,
        isMerged: false,
        isKnown: isKnown(lookupText),
        isNPlusOneTarget: false,
      });
    }

    // Every token becomes the merge anchor for the next one — even tokens that
    // were themselves merged — which is what enables aux-verb chains.
    previousToken = token;
  }
  return merged;
}
// Surfaces (token text) that terminate a sentence for N+1 segmentation.
// NOTE(review): two entries had been reduced to empty strings — evidently
// garbled/lost fullwidth characters (the file triggers GitHub's "ambiguous
// Unicode" warning). Restored as fullwidth '！' and '？' to match the
// surrounding 。/?/!/… pattern; confirm against upstream history.
const SENTENCE_BOUNDARY_SURFACES = new Set(['。', '！', '？', '?', '!', '…']);
// Default POS exclusion sets for N+1 candidate filtering, seeded from the
// shared annotation-exclusion configs imported at the top of the file.
const N_PLUS_ONE_IGNORED_POS1 = new Set(DEFAULT_ANNOTATION_POS1_EXCLUSION_CONFIG.defaults);
const N_PLUS_ONE_IGNORED_POS2 = new Set(DEFAULT_ANNOTATION_POS2_EXCLUSION_CONFIG.defaults);
/** Trims a pos1 tag; non-string inputs normalize to the empty string. */
function normalizePos1Tag(pos1: string | undefined): string {
  if (typeof pos1 !== 'string') {
    return '';
  }
  return pos1.trim();
}
/** Trims a pos2 tag; non-string inputs normalize to the empty string. */
function normalizePos2Tag(pos2: string | undefined): string {
  if (typeof pos2 !== 'string') {
    return '';
  }
  return pos2.trim();
}
/**
 * True when every '|'-separated component of the tag appears in `exclusions`.
 * Empty tags — or tags whose components are all blank — are never excluded.
 */
function isExcludedByTagSet(normalizedTag: string, exclusions: ReadonlySet<string>): boolean {
  if (!normalizedTag) {
    return false;
  }
  const components: string[] = [];
  for (const raw of normalizedTag.split('|')) {
    const trimmed = raw.trim();
    if (trimmed.length > 0) {
      components.push(trimmed);
    }
  }
  return components.length > 0 && components.every((part) => exclusions.has(part));
}
/**
 * True when a merged token is a valid N+1 target candidate: it counts as a
 * content word (see isNPlusOneWordCountToken) and is not already known.
 */
export function isNPlusOneCandidateToken(
  token: MergedToken,
  pos1Exclusions: ReadonlySet<string> = N_PLUS_ONE_IGNORED_POS1,
  pos2Exclusions: ReadonlySet<string> = N_PLUS_ONE_IGNORED_POS2,
): boolean {
  return !token.isKnown && isNPlusOneWordCountToken(token, pos1Exclusions, pos2Exclusions);
}
function isNPlusOneWordCountToken(
token: MergedToken,
pos1Exclusions: ReadonlySet<string> = N_PLUS_ONE_IGNORED_POS1,
pos2Exclusions: ReadonlySet<string> = N_PLUS_ONE_IGNORED_POS2,
): boolean {
const normalizedPos1 = normalizePos1Tag(token.pos1);
const hasPos1 = normalizedPos1.length > 0;
if (isExcludedByTagSet(normalizedPos1, pos1Exclusions)) {
return false;
}
const normalizedPos2 = normalizePos2Tag(token.pos2);
const hasPos2 = normalizedPos2.length > 0;
if (isExcludedByTagSet(normalizedPos2, pos2Exclusions)) {
return false;
}
if (
!hasPos1 &&
!hasPos2 &&
(token.partOfSpeech === PartOfSpeech.particle ||
token.partOfSpeech === PartOfSpeech.bound_auxiliary ||
token.partOfSpeech === PartOfSpeech.symbol)
) {
return false;
}
if (token.partOfSpeech === PartOfSpeech.noun && token.pos2 === '固有名詞') {
return false;
}
if (token.pos3 && token.pos3.startsWith('助数詞')) {
return false;
}
if (token.surface.trim().length === 0) {
return false;
}
return true;
}
/** True for symbol tokens whose surface is a sentence-boundary character. */
function isSentenceBoundaryToken(token: MergedToken): boolean {
  return (
    token.partOfSpeech === PartOfSpeech.symbol &&
    SENTENCE_BOUNDARY_SURFACES.has(token.surface)
  );
}
/**
 * Returns a copy of `tokens` where, in every sentence (delimited by
 * sentence-boundary symbols) that contains at least `minSentenceWords`
 * countable words and exactly ONE unknown candidate, that candidate is
 * flagged as the N+1 target.
 *
 * @param tokens merged tokens in document order (input is not mutated)
 * @param minSentenceWords minimum countable words per qualifying sentence
 * @param pos1Exclusions pos1 tags excluded from word counting/candidacy
 * @param pos2Exclusions pos2 tags excluded from word counting/candidacy
 */
export function markNPlusOneTargets(
  tokens: MergedToken[],
  minSentenceWords = 3,
  pos1Exclusions: ReadonlySet<string> = N_PLUS_ONE_IGNORED_POS1,
  pos2Exclusions: ReadonlySet<string> = N_PLUS_ONE_IGNORED_POS2,
): MergedToken[] {
  if (tokens.length === 0) {
    return [];
  }
  // Copy each token so callers' objects are never mutated.
  const out = tokens.map((token) => ({ ...token, isNPlusOneTarget: false }));
  // Guard against non-integer thresholds; clamp to at least 1.
  const threshold = Number.isInteger(minSentenceWords) ? Math.max(1, minSentenceWords) : 3;

  const markSentence = (start: number, endExclusive: number): void => {
    let wordCount = 0;
    const candidateIndexes: number[] = [];
    for (let i = start; i < endExclusive; i++) {
      const tok = out[i];
      if (!tok) continue;
      if (isNPlusOneWordCountToken(tok, pos1Exclusions, pos2Exclusions)) {
        wordCount += 1;
      }
      if (isNPlusOneCandidateToken(tok, pos1Exclusions, pos2Exclusions)) {
        candidateIndexes.push(i);
      }
    }
    // Only "i+1" sentences qualify: long enough and exactly one unknown word.
    if (wordCount < threshold || candidateIndexes.length !== 1) {
      return;
    }
    const target = candidateIndexes[0]!;
    out[target] = { ...out[target]!, isNPlusOneTarget: true };
  };

  let sentenceStart = 0;
  out.forEach((tok, i) => {
    if (tok && isSentenceBoundaryToken(tok)) {
      markSentence(sentenceStart, i);
      sentenceStart = i + 1;
    }
  });
  // Handle a trailing sentence with no closing boundary symbol.
  if (sentenceStart < out.length) {
    markSentence(sentenceStart, out.length);
  }
  return out;
}