mirror of
https://github.com/ksyasuda/SubMiner.git
synced 2026-05-15 20:12:59 -07:00
430373f010
* feat(tokenizer): use Yomitan word classes for subtitle POS filtering - Carry matched headword wordClasses from termsFind into YomitanScanToken - Map recognized Yomitan wordClasses to SubMiner coarse POS before annotation - MeCab enrichment now fills only missing POS fields, preserving existing coarse pos1 - Exclude standalone grammar particles, して helper fragments, and single-kana surfaces from annotations - Respect source-text punctuation gaps when counting N+1 sentence words - Preserve known-word highlight on excluded kanji-containing tokens - Add backlog tasks 304 (N+1 boundary bug) and 305 (wordClasses POS, done) * fix(tokenizer): preserve annotation and enrichment behavior * fix: restore jlpt subtitle underlines * fix: exclude kana-only n+1 targets * fix: refresh overlay on Hyprland fullscreen * fix: address fullscreen and n-plus-one review notes * fix: address CodeRabbit review comments * fix: accept modified digits for multi-line sentence mining * Cancel pending Linux MPV fullscreen overlay refresh bursts - return a cancel handle from the Linux refresh burst scheduler - clear pending refresh bursts when overlays hide or windows close - tighten the burst test polling to wait for the async refresh * fix: suppress N+1 for kana-only candidates and fix minSentenceWords coun - Treat kana-only tokens with surrounding subtitle punctuation (…, ―, etc.) 
as kana-only so they are not promoted to N+1 targets - Exclude unknown tokens filtered from N+1 targeting from the minSentenceWords count so filtered kana-only unknowns cannot satisfy sentence length threshold - Add regression tests for kana-only candidate suppression and filtered-unknown padding cases * Suppress subtitle annotations for grammar fragments - Hide annotation metadata for auxiliary inflection and ja-nai endings - Preserve lexical `くれる` forms and add regression coverage * Fix kana-only N+1 tokenizer regression test - Use a pure-kana fixture for the subtitle token N+1 case - Update task notes for the latest CodeRabbit follow-up * Fix managed playback exit and tokenizer grammar splits - Ignore background stats daemons during regular app startup - Split standalone grammar endings before applying annotations - Clear helper-span annotations for auxiliary-only tokens * fix: refresh current subtitle after known-word mining * fix: suppress sigh interjection annotations * fix: preserve jlpt underline color after lookup * Replace grammar-ending permutations with shared matcher; preserve word a - Extract `grammar-ending.ts` with `isStandaloneGrammarEndingText` / `isSubtitleGrammarEndingText` pattern matchers - Replace `STANDALONE_GRAMMAR_ENDINGS` set in parser-selection-stage with shared matcher - Replace generated phrase sets in subtitle-annotation-filter with shared matcher - Remove stale duplicate subtitle-exclusion constants and helpers from annotation-stage - Manual clipboard card updates now write only to the sentence audio field, leaving word/expression audio untouched * fix: CI changelog, annotation options threading, and Jellyfin quit - Add `type: fixed` / `area:` frontmatter to `changes/319` to pass `changelog:lint` - Thread `TokenizerAnnotationOptions` through `stripSubtitleAnnotationMetadata` so `sourceText` is honored - Include `jellyfinPlay` in `shouldQuitOnDisconnectWhenOverlayRuntimeInitialized` predicate - Make mouse test `elementFromPoint` stubs 
coordinate-sensitive - Make Lua test `.tmp` mkdir portable on Windows * Preserve overlay across macOS flaps and mpv playlist changes - keep visible overlays alive during transient macOS tracker loss - reuse the running mpv overlay path on playlist navigation - update regression coverage and changelog fragments * fix: restore stats daemon deferral * fix: keep subtitle prefetch alive after cache hits * Fix JLPT underline color drift and AniList skipped-threshold sync - Replace JLPT `text-decoration` underlines with `border-bottom` so Chromium selection/hover cannot repaint them to another annotation's color - Lock JLPT underline color for combined annotation selectors (known, n+1, frequency) and character hover/selection states - Trigger AniList post-watch check on every mpv time-position update to catch skipped completion thresholds - Fall back to filename-parser season/episode when guessit omits them * fix: address coderabbit feedback * fix: sync AniList after seeked completion * fix: preserve ordinal frequency annotations * fix: preserve known highlighting for filtered tokens * fix: address PR #57 CodeRabbit feedback - Acquire AniList post-watch in-flight lock before async gating to prevent duplicate writes - Isolate manual watched mark result from AniList post-watch callback failures - Report known-word cache clears as mutations during immediate append when state existed - Add regression tests for each fix * fix: stop AniList setup reopening on Linux when keyring token exists - Gate setup success on token persistence: `saveToken` now returns `boolean`; on failure, keeps the setup window open instead of reporting success - Config reload passes `allowSetupPrompt: false` so playback reloads don't re-open the setup window - Add regression test for persistence-failure path * fix: suppress known highlights for subtitle particles * fix: retry transient AniList safeStorage failures * fix: hide overlay focus ring * fix: align Hyprland fullscreen overlays * fix: restore 
subtitle playback keybindings * fix: align Hyprland overlay windows to mpv and stop pinning them - Force-apply exact Hyprland move/resize/setprop dispatches when bounds are provided - Stop pinning overlay windows; toggle pin off when Hyprland reports pinned=true - Compensate stats overlay outer placement for Electron/Wayland content insets - Make stats overlay window and page opaque so mpv cannot show through transparent insets - Constrain stats app to h-screen with internal scroll so content covers mpv from y=0 - Lock overlay/stats window titles against page-title-updated events - Add regression coverage for placement dispatches, inset compensation, and CSS overlay mode * fix: retain frequency rank for honorific prefix-noun tokens - Add `shouldAllowHonorificPrefixNounFrequency` to exempt お/ご/御 + noun merged tokens from frequency exclusion - Add regression test for `ご機嫌` asserting rank 5484 is preserved after MeCab enrichment and annotation - Close TASK-341 * fix: map openCharacterDictionary session action to --open-character-dict - Add missing Lua CLI dispatch entry for openCharacterDictionary - Add regression test for Alt+Meta+A binding and CLI flag forwarding * fix: keep macOS overlay interactive while mpv remains active - Overlay no longer hides or becomes click-through during tracker refreshes when mpv is the focused window - Preserve already-visible overlay when tracker is temporarily not ready but mpv target signal is active - Add regression tests for active-mpv tracker refresh and transient tracker-not-ready paths * fix: address coderabbit subtitle follow-ups * fix: resolve media detail from sessions when lifetime summary is absent - Change `getMediaDetail` JOIN to LEFT JOIN on `imm_lifetime_media` and fall back to aggregated session metrics when no lifetime row exists - Add filter `AND (lm.video_id IS NOT NULL OR s.session_id IS NOT NULL)` to keep results valid - Add regression test covering the session-visible / media-detail-missing mismatch * fix: 
address PR-57 CodeRabbit findings and CI failures - use filtered word counts in media detail session token aggregation - cancel fullscreen refresh burst on exit via updateLinuxMpvFullscreenOverlayRefreshBurst - guard Hyprland JSON.parse in try/catch; exclude windowtitle from geometry events - narrow focus suppression from :focus to :focus-visible - apply JLPT lock selectors to word-name-match tokens (N1–N5) * fix: macOS overlay z-order and Yomitan compound token known highlighting - Release always-on-top when tracked mpv loses foreground on macOS - Skip visible overlay blur restacking on macOS to avoid covering unrelated windows - Prefer Yomitan internal parse tokens over fragmented scanner output for known-word decisions - Add regression tests for both behaviors * fix: macOS visible-overlay blur no longer invokes Windows-only blur call - Split win32/darwin branches in handleOverlayWindowBlurred so darwin visible blur returns early without calling onWindowsVisibleOverlayBlur - Add regression test asserting Windows callback stays inactive on macOS visible overlay blur - Close TASK-347
489 lines
14 KiB
TypeScript
489 lines
14 KiB
TypeScript
/*
 * SubMiner - All-in-one sentence mining overlay
 * Copyright (C) 2024 sudacode
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program. If not, see <https://www.gnu.org/licenses/>.
 */

import { PartOfSpeech, Token, MergedToken } from './types';
|
||
import { DEFAULT_ANNOTATION_POS1_EXCLUSION_CONFIG } from './token-pos1-exclusions';
|
||
import { DEFAULT_ANNOTATION_POS2_EXCLUSION_CONFIG } from './token-pos2-exclusions';
|
||
import { shouldExcludeTokenFromSubtitleAnnotations } from './core/services/tokenizer/subtitle-annotation-filter';
|
||
|
||
export function isNoun(tok: Token): boolean {
|
||
return tok.partOfSpeech === PartOfSpeech.noun;
|
||
}
|
||
|
||
export function isProperNoun(tok: Token): boolean {
|
||
return tok.partOfSpeech === PartOfSpeech.noun && tok.pos2 === '固有名詞';
|
||
}
|
||
|
||
export function ignoreReading(tok: Token): boolean {
|
||
return tok.partOfSpeech === PartOfSpeech.symbol && tok.pos2 === '文字';
|
||
}
|
||
|
||
export function isCopula(tok: Token): boolean {
|
||
const raw = tok.inflectionType;
|
||
if (!raw) {
|
||
return false;
|
||
}
|
||
return ['特殊・ダ', '特殊・デス', '特殊|だ', '特殊|デス'].includes(raw);
|
||
}
|
||
|
||
export function isAuxVerb(tok: Token): boolean {
|
||
return tok.partOfSpeech === PartOfSpeech.bound_auxiliary && !isCopula(tok);
|
||
}
|
||
|
||
export function isContinuativeForm(tok: Token): boolean {
|
||
if (!tok.inflectionForm) {
|
||
return false;
|
||
}
|
||
const inflectionForm = tok.inflectionForm;
|
||
const isContinuative =
|
||
inflectionForm === '連用デ接続' ||
|
||
inflectionForm === '連用タ接続' ||
|
||
inflectionForm.startsWith('連用形');
|
||
|
||
if (!isContinuative) {
|
||
return false;
|
||
}
|
||
return tok.headword !== 'ない';
|
||
}
|
||
|
||
export function isVerbSuffix(tok: Token): boolean {
|
||
return tok.partOfSpeech === PartOfSpeech.verb && (tok.pos2 === '非自立' || tok.pos2 === '接尾');
|
||
}
|
||
|
||
export function isTatteParticle(tok: Token): boolean {
|
||
return (
|
||
tok.partOfSpeech === PartOfSpeech.particle &&
|
||
tok.pos2 === '接続助詞' &&
|
||
tok.headword === 'たって'
|
||
);
|
||
}
|
||
|
||
export function isBaParticle(tok: Token): boolean {
|
||
return tok.partOfSpeech === PartOfSpeech.particle && tok.pos2 === '接続助詞' && tok.word === 'ば';
|
||
}
|
||
|
||
export function isTeDeParticle(tok: Token): boolean {
|
||
return (
|
||
tok.partOfSpeech === PartOfSpeech.particle &&
|
||
tok.pos2 === '接続助詞' &&
|
||
['て', 'で', 'ちゃ'].includes(tok.word)
|
||
);
|
||
}
|
||
|
||
export function isTaDaParticle(tok: Token): boolean {
|
||
return isAuxVerb(tok) && ['た', 'だ'].includes(tok.word);
|
||
}
|
||
|
||
export function isVerb(tok: Token): boolean {
|
||
return [PartOfSpeech.verb, PartOfSpeech.bound_auxiliary].includes(tok.partOfSpeech);
|
||
}
|
||
|
||
export function isVerbNonIndependent(): boolean {
|
||
return true;
|
||
}
|
||
|
||
export function canReceiveAuxiliary(tok: Token): boolean {
|
||
return [PartOfSpeech.verb, PartOfSpeech.bound_auxiliary, PartOfSpeech.i_adjective].includes(
|
||
tok.partOfSpeech,
|
||
);
|
||
}
|
||
|
||
export function isNounSuffix(tok: Token): boolean {
|
||
return tok.partOfSpeech === PartOfSpeech.verb && tok.pos2 === '接尾';
|
||
}
|
||
|
||
export function isCounter(tok: Token): boolean {
|
||
return (
|
||
tok.partOfSpeech === PartOfSpeech.noun &&
|
||
tok.pos3 !== undefined &&
|
||
tok.pos3.startsWith('助数詞')
|
||
);
|
||
}
|
||
|
||
export function isNumeral(tok: Token): boolean {
|
||
return (
|
||
tok.partOfSpeech === PartOfSpeech.noun && tok.pos2 !== undefined && tok.pos2.startsWith('数')
|
||
);
|
||
}
|
||
|
||
export function shouldMerge(lastStandaloneToken: Token, token: Token): boolean {
|
||
if (isVerb(lastStandaloneToken)) {
|
||
if (isAuxVerb(token)) {
|
||
return true;
|
||
}
|
||
if (isContinuativeForm(lastStandaloneToken) && isVerbSuffix(token)) {
|
||
return true;
|
||
}
|
||
if (isVerbSuffix(token) && isVerbNonIndependent()) {
|
||
return true;
|
||
}
|
||
}
|
||
|
||
if (isNoun(lastStandaloneToken) && !isProperNoun(lastStandaloneToken) && isNounSuffix(token)) {
|
||
return true;
|
||
}
|
||
|
||
if (isCounter(token) && isNumeral(lastStandaloneToken)) {
|
||
return true;
|
||
}
|
||
|
||
if (isBaParticle(token) && canReceiveAuxiliary(lastStandaloneToken)) {
|
||
return true;
|
||
}
|
||
|
||
if (isTatteParticle(token) && canReceiveAuxiliary(lastStandaloneToken)) {
|
||
return true;
|
||
}
|
||
|
||
if (isTeDeParticle(token) && isContinuativeForm(lastStandaloneToken)) {
|
||
return true;
|
||
}
|
||
|
||
if (isTaDaParticle(token) && canReceiveAuxiliary(lastStandaloneToken)) {
|
||
return true;
|
||
}
|
||
|
||
if (isTeDeParticle(lastStandaloneToken) && isVerbSuffix(token)) {
|
||
return true;
|
||
}
|
||
|
||
return false;
|
||
}
|
||
|
||
/**
 * Builds MergedToken spans from raw tokenizer output.
 *
 * Walks tokens in order, fusing each one into the previous standalone token
 * when shouldMerge allows (auxiliaries, suffixes, attached particles), and
 * computes start/end character offsets — anchored against `sourceText` when
 * the surface can be found there, otherwise by simple accumulation.
 *
 * @param tokens raw tokens to merge; empty/missing input yields []
 * @param isKnownWord predicate flagging known vocabulary (default: nothing known)
 * @param knownWordMatchMode key the known-word lookup on headword or surface form
 * @param shouldLookupKnownWords when false, every token reports isKnown=false
 * @param sourceText original subtitle text used to anchor token offsets
 * @returns merged tokens with isNPlusOneTarget initialized to false
 */
export function mergeTokens(
  tokens: Token[],
  isKnownWord: (text: string) => boolean = () => false,
  knownWordMatchMode: 'headword' | 'surface' = 'headword',
  shouldLookupKnownWords = true,
  sourceText?: string,
): MergedToken[] {
  if (!tokens || tokens.length === 0) {
    return [];
  }

  const result: MergedToken[] = [];
  // Newlines collapsed to spaces so offsets match a single-line rendering.
  const normalizedSourceText = normalizeSourceTextForTokenOffsets(sourceText);
  let charOffset = 0; // fallback offset when the surface is not found in sourceText
  let sourceCursor = 0; // forward-only search position within normalizedSourceText
  let lastStandaloneToken: Token | null = null;
  // Known-word lookup, short-circuited when lookups are disabled or text is empty.
  const resolveKnownMatch = (text: string | undefined): boolean => {
    if (!shouldLookupKnownWords || !text) {
      return false;
    }
    return isKnownWord(text);
  };

  for (const token of tokens) {
    // indexOf with a fromIndex returns -1 or a position >= sourceCursor, so
    // the comparison below is simply "surface found in the source text".
    const matchedStart =
      typeof normalizedSourceText === 'string'
        ? normalizedSourceText.indexOf(token.word, sourceCursor)
        : -1;
    const start = matchedStart >= sourceCursor ? matchedStart : charOffset;
    const end = start + token.word.length;
    charOffset = end;
    sourceCursor = end;

    let shouldMergeToken = false;

    if (result.length > 0 && lastStandaloneToken !== null) {
      shouldMergeToken = shouldMerge(lastStandaloneToken, token);
    }

    const tokenReading = ignoreReading(token) ? '' : token.katakanaReading || token.word;
    if (shouldMergeToken && result.length > 0) {
      // Fuse into the previously emitted span: extend surface/reading/endPos,
      // keep the original headword, startPos and coarse POS.
      const prev = result.pop()!;
      const mergedHeadword = prev.headword;
      const headwordForKnownMatch = (() => {
        if (knownWordMatchMode === 'surface') {
          return prev.surface;
        }
        return mergedHeadword;
      })();
      result.push({
        surface: prev.surface + token.word,
        reading: prev.reading + tokenReading,
        headword: prev.headword,
        startPos: prev.startPos,
        endPos: end,
        partOfSpeech: prev.partOfSpeech,
        // Prefer existing tags; fill gaps from the token being merged in.
        pos1: prev.pos1 ?? token.pos1,
        pos2: prev.pos2 ?? token.pos2,
        pos3: prev.pos3 ?? token.pos3,
        isMerged: true,
        isKnown: resolveKnownMatch(headwordForKnownMatch),
        isNPlusOneTarget: false,
      });
    } else {
      const headwordForKnownMatch = (() => {
        if (knownWordMatchMode === 'surface') {
          return token.word;
        }
        return token.headword;
      })();
      result.push({
        surface: token.word,
        reading: tokenReading,
        headword: token.headword,
        startPos: start,
        endPos: end,
        partOfSpeech: token.partOfSpeech,
        pos1: token.pos1,
        pos2: token.pos2,
        pos3: token.pos3,
        isMerged: false,
        isKnown: resolveKnownMatch(headwordForKnownMatch),
        isNPlusOneTarget: false,
      });
    }

    // NOTE(review): the raw token (not the merged span) becomes the next
    // merge anchor even when it was just fused — this enables chains like
    // verb + aux + aux; confirm that is intended.
    lastStandaloneToken = token;
  }

  return result;
}

// Surfaces/characters treated as sentence terminators when splitting a
// subtitle into sentences for N+1 targeting.
// NOTE(review): '…' and '\u2026' appear to be the same code point (U+2026),
// in which case the Set collapses them — confirm the literal is not a
// look-alike character before removing either entry.
const SENTENCE_BOUNDARY_SURFACES = new Set(['。', '?', '!', '?', '!', '…', '\u2026']);
// Default pos1/pos2 tag exclusions applied when deciding N+1 eligibility.
const N_PLUS_ONE_IGNORED_POS1 = new Set(DEFAULT_ANNOTATION_POS1_EXCLUSION_CONFIG.defaults);
const N_PLUS_ONE_IGNORED_POS2 = new Set(DEFAULT_ANNOTATION_POS2_EXCLUSION_CONFIG.defaults);

function normalizePos1Tag(pos1: string | undefined): string {
|
||
return typeof pos1 === 'string' ? pos1.trim() : '';
|
||
}
|
||
|
||
function normalizePos2Tag(pos2: string | undefined): string {
|
||
return typeof pos2 === 'string' ? pos2.trim() : '';
|
||
}
|
||
|
||
function isExcludedByTagSet(normalizedTag: string, exclusions: ReadonlySet<string>): boolean {
|
||
if (!normalizedTag) {
|
||
return false;
|
||
}
|
||
const parts = normalizedTag
|
||
.split('|')
|
||
.map((part) => part.trim())
|
||
.filter((part) => part.length > 0);
|
||
if (parts.length === 0) {
|
||
return false;
|
||
}
|
||
return parts.every((part) => exclusions.has(part));
|
||
}
|
||
|
||
function isKanaChar(char: string): boolean {
|
||
const code = char.codePointAt(0);
|
||
if (code === undefined) {
|
||
return false;
|
||
}
|
||
|
||
return (
|
||
(code >= 0x3041 && code <= 0x3096) ||
|
||
(code >= 0x309b && code <= 0x309f) ||
|
||
code === 0x30fc ||
|
||
(code >= 0x30a0 && code <= 0x30fa) ||
|
||
(code >= 0x30fd && code <= 0x30ff)
|
||
);
|
||
}
|
||
|
||
function isKanaCandidateIgnorableChar(char: string): boolean {
|
||
return /^[\s.,!?;:()[\]{}"'`、。!?…‥・「」『』()[]{}〈〉《》【】―-]$/u.test(char);
|
||
}
|
||
|
||
function isKanaOnlyText(text: string): boolean {
|
||
const normalized = text.trim();
|
||
if (normalized.length === 0) {
|
||
return false;
|
||
}
|
||
|
||
let hasKana = false;
|
||
for (const char of normalized) {
|
||
if (isKanaChar(char)) {
|
||
hasKana = true;
|
||
continue;
|
||
}
|
||
if (!isKanaCandidateIgnorableChar(char)) {
|
||
return false;
|
||
}
|
||
}
|
||
|
||
return hasKana;
|
||
}
|
||
|
||
function normalizeSourceTextForTokenOffsets(sourceText: string | undefined): string | undefined {
|
||
return typeof sourceText === 'string' ? sourceText.replace(/\r?\n/g, ' ').trim() : undefined;
|
||
}
|
||
|
||
export function isNPlusOneCandidateToken(
|
||
token: MergedToken,
|
||
pos1Exclusions: ReadonlySet<string> = N_PLUS_ONE_IGNORED_POS1,
|
||
pos2Exclusions: ReadonlySet<string> = N_PLUS_ONE_IGNORED_POS2,
|
||
): boolean {
|
||
if (token.isKnown) {
|
||
return false;
|
||
}
|
||
if (isKanaOnlyText(token.surface)) {
|
||
return false;
|
||
}
|
||
return isNPlusOneWordCountToken(token, pos1Exclusions, pos2Exclusions);
|
||
}
|
||
|
||
function isNPlusOneWordCountToken(
|
||
token: MergedToken,
|
||
pos1Exclusions: ReadonlySet<string> = N_PLUS_ONE_IGNORED_POS1,
|
||
pos2Exclusions: ReadonlySet<string> = N_PLUS_ONE_IGNORED_POS2,
|
||
): boolean {
|
||
if (shouldExcludeTokenFromSubtitleAnnotations(token, { pos1Exclusions, pos2Exclusions })) {
|
||
return false;
|
||
}
|
||
|
||
const normalizedPos1 = normalizePos1Tag(token.pos1);
|
||
const hasPos1 = normalizedPos1.length > 0;
|
||
if (isExcludedByTagSet(normalizedPos1, pos1Exclusions)) {
|
||
return false;
|
||
}
|
||
|
||
const normalizedPos2 = normalizePos2Tag(token.pos2);
|
||
const hasPos2 = normalizedPos2.length > 0;
|
||
if (isExcludedByTagSet(normalizedPos2, pos2Exclusions)) {
|
||
return false;
|
||
}
|
||
|
||
if (
|
||
!hasPos1 &&
|
||
!hasPos2 &&
|
||
(token.partOfSpeech === PartOfSpeech.particle ||
|
||
token.partOfSpeech === PartOfSpeech.bound_auxiliary ||
|
||
token.partOfSpeech === PartOfSpeech.symbol)
|
||
) {
|
||
return false;
|
||
}
|
||
|
||
if (token.partOfSpeech === PartOfSpeech.noun && token.pos2 === '固有名詞') {
|
||
return false;
|
||
}
|
||
|
||
if (token.pos3 && token.pos3.startsWith('助数詞')) {
|
||
return false;
|
||
}
|
||
|
||
if (token.surface.trim().length === 0) {
|
||
return false;
|
||
}
|
||
|
||
return true;
|
||
}
|
||
|
||
function isNPlusOneSentenceLengthToken(
|
||
token: MergedToken,
|
||
pos1Exclusions: ReadonlySet<string> = N_PLUS_ONE_IGNORED_POS1,
|
||
pos2Exclusions: ReadonlySet<string> = N_PLUS_ONE_IGNORED_POS2,
|
||
): boolean {
|
||
if (!isNPlusOneWordCountToken(token, pos1Exclusions, pos2Exclusions)) {
|
||
return false;
|
||
}
|
||
|
||
return token.isKnown || isNPlusOneCandidateToken(token, pos1Exclusions, pos2Exclusions);
|
||
}
|
||
|
||
function isSentenceBoundaryToken(token: MergedToken): boolean {
|
||
if (token.partOfSpeech !== PartOfSpeech.symbol) {
|
||
return false;
|
||
}
|
||
|
||
return SENTENCE_BOUNDARY_SURFACES.has(token.surface);
|
||
}
|
||
|
||
function hasSentenceBoundaryInSourceGap(
|
||
sourceText: string | undefined,
|
||
previousEnd: number | null,
|
||
nextStart: number,
|
||
): boolean {
|
||
if (typeof sourceText !== 'string' || previousEnd === null || nextStart <= previousEnd) {
|
||
return false;
|
||
}
|
||
|
||
const gap = sourceText.slice(previousEnd, nextStart);
|
||
return [...gap].some((char) => SENTENCE_BOUNDARY_SURFACES.has(char));
|
||
}
|
||
|
||
/**
 * Marks at most one N+1 target per sentence.
 *
 * Tokens are split into sentences at boundary symbol tokens and at boundary
 * characters found in source-text gaps between tokens (punctuation the
 * tokenizer dropped). A sentence receives a target only when it contains at
 * least `minSentenceWords` countable words and exactly one unknown candidate.
 *
 * @param tokens merged tokens for one subtitle line
 * @param minSentenceWords minimum countable words per sentence (non-integer values fall back to 3; clamped to >= 1)
 * @param pos1Exclusions pos1 tags excluded from candidacy / word counting
 * @param pos2Exclusions pos2 tags excluded from candidacy / word counting
 * @param sourceText original subtitle text; enables gap-based boundary detection
 * @returns a new array of copies — the input tokens are not mutated
 */
export function markNPlusOneTargets(
  tokens: MergedToken[],
  minSentenceWords = 3,
  pos1Exclusions: ReadonlySet<string> = N_PLUS_ONE_IGNORED_POS1,
  pos2Exclusions: ReadonlySet<string> = N_PLUS_ONE_IGNORED_POS2,
  sourceText?: string,
): MergedToken[] {
  if (tokens.length === 0) {
    return [];
  }

  const normalizedSourceText = normalizeSourceTextForTokenOffsets(sourceText);

  // Work on copies with the target flag reset.
  const markedTokens = tokens.map((token) => ({
    ...token,
    isNPlusOneTarget: false,
  }));

  let sentenceStart = 0;
  let previousTokenEnd: number | null = null;
  const minimumSentenceWords = Number.isInteger(minSentenceWords)
    ? Math.max(1, minSentenceWords)
    : 3;

  // Flags the sentence's single unknown candidate when the sentence has
  // enough countable words. Range is [start, endExclusive).
  const markSentence = (start: number, endExclusive: number): void => {
    const sentenceCandidates: number[] = [];
    let sentenceWordCount = 0;
    for (let i = start; i < endExclusive; i++) {
      const token = markedTokens[i];
      if (!token) continue;
      if (isNPlusOneSentenceLengthToken(token, pos1Exclusions, pos2Exclusions)) {
        sentenceWordCount += 1;
      }

      if (isNPlusOneCandidateToken(token, pos1Exclusions, pos2Exclusions)) {
        sentenceCandidates.push(i);
      }
    }

    // Exactly one unknown candidate → that token becomes the N+1 target.
    if (sentenceWordCount >= minimumSentenceWords && sentenceCandidates.length === 1) {
      markedTokens[sentenceCandidates[0]!] = {
        ...markedTokens[sentenceCandidates[0]!]!,
        isNPlusOneTarget: true,
      };
    }
  };

  for (let i = 0; i < markedTokens.length; i++) {
    const token = markedTokens[i];
    if (!token) continue;
    // Boundary punctuation that exists only in the source text (between the
    // previous token's end and this token's start) also closes a sentence;
    // the current token starts the next one.
    if (hasSentenceBoundaryInSourceGap(normalizedSourceText, previousTokenEnd, token.startPos)) {
      markSentence(sentenceStart, i);
      sentenceStart = i;
    }
    // A boundary symbol token closes the sentence and is itself excluded
    // from the marked range.
    if (isSentenceBoundaryToken(token)) {
      markSentence(sentenceStart, i);
      sentenceStart = i + 1;
    }
    previousTokenEnd = token.endPos;
  }

  // Trailing sentence without a closing boundary.
  if (sentenceStart < markedTokens.length) {
    markSentence(sentenceStart, markedTokens.length);
  }

  return markedTokens;
}