mirror of
https://github.com/ksyasuda/SubMiner.git
synced 2026-02-28 06:22:45 -08:00
234 lines
5.9 KiB
TypeScript
234 lines
5.9 KiB
TypeScript
/*
 * SubMiner - All-in-one sentence mining overlay
 * Copyright (C) 2024 sudacode
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program. If not, see <https://www.gnu.org/licenses/>.
 */
|
|
|
|
import { PartOfSpeech, Token, MergedToken } from "./types";
|
|
|
|
/**
 * Reports whether a token is tagged as a noun.
 *
 * @param tok - Token to inspect.
 * @returns `true` when `tok.partOfSpeech` equals `PartOfSpeech.noun`.
 */
export function isNoun(tok: Token): boolean {
  return tok.partOfSpeech === PartOfSpeech.noun;
}
|
|
|
|
/**
 * Reports whether a token is a proper noun.
 *
 * @param tok - Token to inspect.
 * @returns `true` when the token is a noun whose `pos2` sub-category is
 *   exactly "固有名詞" (proper noun).
 */
export function isProperNoun(tok: Token): boolean {
  return tok.partOfSpeech === PartOfSpeech.noun && tok.pos2 === "固有名詞";
}
|
|
|
|
/**
 * Reports whether a token's reading should be dropped.
 *
 * @param tok - Token to inspect.
 * @returns `true` when the token is a symbol whose `pos2` sub-category is
 *   exactly "文字" (character).
 */
export function ignoreReading(tok: Token): boolean {
  return tok.partOfSpeech === PartOfSpeech.symbol && tok.pos2 === "文字";
}
|
|
|
|
/**
 * Reports whether a token's inflection type marks it as a copula (だ / です).
 *
 * @param tok - Token to inspect.
 * @returns `true` when `tok.inflectionType` is one of the recognised copula
 *   inflection types; `false` when it is missing, empty, or anything else.
 */
export function isCopula(tok: Token): boolean {
  const inflectionType = tok.inflectionType;
  if (!inflectionType) {
    return false;
  }

  // NOTE(review): the last two entries use "|" as separator and the third uses
  // hiragana "だ" while the others use katakana; presumably alternate
  // spellings of the same tags. Confirm against the tokenizer output.
  const copulaInflectionTypes = ["特殊・ダ", "特殊・デス", "特殊|だ", "特殊|デス"];
  return copulaInflectionTypes.includes(inflectionType);
}
|
|
|
|
/**
 * Reports whether a token is a bound auxiliary other than the copula.
 *
 * @param tok - Token to inspect.
 * @returns `true` when the token's part of speech is
 *   `PartOfSpeech.bound_auxiliary` and `isCopula(tok)` is `false`.
 */
export function isAuxVerb(tok: Token): boolean {
  return tok.partOfSpeech === PartOfSpeech.bound_auxiliary && !isCopula(tok);
}
|
|
|
|
/**
 * Reports whether a token is in a continuative (連用) inflection form.
 *
 * @param tok - Token to inspect.
 * @returns `true` when `tok.inflectionForm` is "連用デ接続", "連用タ接続", or
 *   starts with "連用形", and the token's headword is not "ない". Returns
 *   `false` when the inflection form is missing or empty.
 */
export function isContinuativeForm(tok: Token): boolean {
  const form = tok.inflectionForm;
  if (!form) {
    return false;
  }

  const isContinuative =
    form === "連用デ接続" ||
    form === "連用タ接続" ||
    form.startsWith("連用形");

  // Tokens whose headword is "ない" are excluded even in a continuative form.
  return isContinuative && tok.headword !== "ない";
}
|
|
|
|
/**
 * Reports whether a token is a dependent or suffix verb.
 *
 * @param tok - Token to inspect.
 * @returns `true` when the token is a verb whose `pos2` sub-category is
 *   "非自立" (non-independent) or "接尾" (suffix).
 */
export function isVerbSuffix(tok: Token): boolean {
  return (
    tok.partOfSpeech === PartOfSpeech.verb &&
    (tok.pos2 === "非自立" || tok.pos2 === "接尾")
  );
}
|
|
|
|
/**
 * Reports whether a token is the conjunctive particle "たって".
 *
 * @param tok - Token to inspect.
 * @returns `true` when the token is a particle, its `pos2` is "接続助詞"
 *   (conjunctive particle), and its headword is "たって".
 */
export function isTatteParticle(tok: Token): boolean {
  return (
    tok.partOfSpeech === PartOfSpeech.particle &&
    tok.pos2 === "接続助詞" &&
    tok.headword === "たって"
  );
}
|
|
|
|
/**
 * Reports whether a token is the conjunctive particle "ば".
 *
 * @param tok - Token to inspect.
 * @returns `true` when the token is a particle, its `pos2` is "接続助詞"
 *   (conjunctive particle), and its surface form (`word`) is "ば".
 */
export function isBaParticle(tok: Token): boolean {
  return (
    tok.partOfSpeech === PartOfSpeech.particle &&
    tok.pos2 === "接続助詞" &&
    tok.word === "ば"
  );
}
|
|
|
|
/**
 * Reports whether a token is one of the conjunctive particles "て", "で",
 * or "ちゃ".
 *
 * @param tok - Token to inspect.
 * @returns `true` when the token is a particle, its `pos2` is "接続助詞"
 *   (conjunctive particle), and its surface form (`word`) is one of the three
 *   listed strings.
 */
export function isTeDeParticle(tok: Token): boolean {
  return (
    tok.partOfSpeech === PartOfSpeech.particle &&
    tok.pos2 === "接続助詞" &&
    ["て", "で", "ちゃ"].includes(tok.word)
  );
}
|
|
|
|
/**
 * Reports whether a token is the auxiliary "た" or "だ".
 *
 * @param tok - Token to inspect.
 * @returns `true` when `isAuxVerb(tok)` holds and the token's surface form
 *   (`word`) is "た" or "だ".
 */
export function isTaDaParticle(tok: Token): boolean {
  return isAuxVerb(tok) && ["た", "だ"].includes(tok.word);
}
|
|
|
|
/**
 * Reports whether a token counts as verbal for merging purposes.
 *
 * @param tok - Token to inspect.
 * @returns `true` when the token's part of speech is `PartOfSpeech.verb` or
 *   `PartOfSpeech.bound_auxiliary`.
 */
export function isVerb(tok: Token): boolean {
  // Direct comparisons avoid allocating a lookup array on every call.
  return (
    tok.partOfSpeech === PartOfSpeech.verb ||
    tok.partOfSpeech === PartOfSpeech.bound_auxiliary
  );
}
|
|
|
|
/**
 * Placeholder predicate that currently accepts unconditionally.
 *
 * It takes no token and always returns `true`, so any condition it is
 * combined with via `&&` is decided entirely by the other operand.
 *
 * NOTE(review): presumably meant to inspect a token for non-independent verb
 * status; confirm whether a real check is still planned.
 *
 * @returns Always `true`.
 */
export function isVerbNonIndependent(): boolean {
  return true;
}
|
|
|
|
/**
 * Reports whether a token belongs to a class that an auxiliary can attach to.
 *
 * @param tok - Token to inspect.
 * @returns `true` when the token's part of speech is `PartOfSpeech.verb`,
 *   `PartOfSpeech.bound_auxiliary`, or `PartOfSpeech.i_adjective`.
 */
export function canReceiveAuxiliary(tok: Token): boolean {
  // Direct comparisons avoid allocating a lookup array on every call.
  return (
    tok.partOfSpeech === PartOfSpeech.verb ||
    tok.partOfSpeech === PartOfSpeech.bound_auxiliary ||
    tok.partOfSpeech === PartOfSpeech.i_adjective
  );
}
|
|
|
|
/**
 * Reports whether a token is a verb-class suffix.
 *
 * NOTE(review): despite the name, the check is on `PartOfSpeech.verb`, not
 * `PartOfSpeech.noun`. Presumably this means "a suffix that attaches to a
 * noun"; confirm the intent before changing it.
 *
 * @param tok - Token to inspect.
 * @returns `true` when the token is a verb whose `pos2` sub-category is
 *   exactly "接尾" (suffix).
 */
export function isNounSuffix(tok: Token): boolean {
  return tok.partOfSpeech === PartOfSpeech.verb && tok.pos2 === "接尾";
}
|
|
|
|
/**
 * Reports whether a token is a counter word.
 *
 * @param tok - Token to inspect.
 * @returns `true` when the token is a noun whose `pos3` sub-category is a
 *   string starting with "助数詞" (counter). A missing or non-string `pos3`
 *   yields `false`.
 */
export function isCounter(tok: Token): boolean {
  return (
    tok.partOfSpeech === PartOfSpeech.noun &&
    // typeof guard also rejects null, which a bare `!== undefined` would let
    // through to a throwing `.startsWith` call.
    typeof tok.pos3 === "string" &&
    tok.pos3.startsWith("助数詞")
  );
}
|
|
|
|
/**
 * Reports whether a token is a numeral.
 *
 * @param tok - Token to inspect.
 * @returns `true` when the token is a noun whose `pos2` sub-category is a
 *   string starting with "数" (number). A missing or non-string `pos2` yields
 *   `false`.
 */
export function isNumeral(tok: Token): boolean {
  return (
    tok.partOfSpeech === PartOfSpeech.noun &&
    // typeof guard also rejects null, which a bare `!== undefined` would let
    // through to a throwing `.startsWith` call.
    typeof tok.pos2 === "string" &&
    tok.pos2.startsWith("数")
  );
}
|
|
|
|
/**
 * Decides whether `token` should be glued onto the token that precedes it.
 *
 * The rules are checked in order and the first match wins; if none match the
 * tokens stay separate.
 *
 * @param lastStandaloneToken - The token immediately before `token`.
 * @param token - The token being considered for merging.
 * @returns `true` when any merge rule applies, otherwise `false`.
 */
export function shouldMerge(lastStandaloneToken: Token, token: Token): boolean {
  if (isVerb(lastStandaloneToken)) {
    // Verbal token + non-copula auxiliary.
    if (isAuxVerb(token)) {
      return true;
    }
    // Continuative verbal token + dependent/suffix verb.
    if (isContinuativeForm(lastStandaloneToken) && isVerbSuffix(token)) {
      return true;
    }
    // Verbal token + dependent/suffix verb, gated by isVerbNonIndependent().
    if (isVerbSuffix(token) && isVerbNonIndependent()) {
      return true;
    }
  }

  // Common (non-proper) noun + suffix.
  if (
    isNoun(lastStandaloneToken) &&
    !isProperNoun(lastStandaloneToken) &&
    isNounSuffix(token)
  ) {
    return true;
  }

  // Numeral + counter.
  if (isCounter(token) && isNumeral(lastStandaloneToken)) {
    return true;
  }

  // Verb / auxiliary / i-adjective + "ば".
  if (isBaParticle(token) && canReceiveAuxiliary(lastStandaloneToken)) {
    return true;
  }

  // Verb / auxiliary / i-adjective + "たって".
  if (isTatteParticle(token) && canReceiveAuxiliary(lastStandaloneToken)) {
    return true;
  }

  // Continuative form + "て" / "で" / "ちゃ".
  if (isTeDeParticle(token) && isContinuativeForm(lastStandaloneToken)) {
    return true;
  }

  // Verb / auxiliary / i-adjective + "た" / "だ".
  if (isTaDaParticle(token) && canReceiveAuxiliary(lastStandaloneToken)) {
    return true;
  }

  // "て" / "で" / "ちゃ" + dependent/suffix verb.
  if (isTeDeParticle(lastStandaloneToken) && isVerbSuffix(token)) {
    return true;
  }

  return false;
}
|
|
|
|
/**
 * Collapses a token sequence into merged display units.
 *
 * Tokens are visited in order. Each one is either appended as a new entry or,
 * when `shouldMerge(previous, current)` holds, folded into the last entry of
 * the result. A folded entry keeps the headword, part of speech, and start
 * position of its first token; surface and reading are concatenated and the
 * end position advances.
 *
 * Positions are offsets into the concatenation of all `word` values, measured
 * with `String.prototype.length`.
 *
 * @param tokens - Tokens in sentence order. A nullish or empty array yields `[]`.
 * @returns The merged tokens, in order.
 */
export function mergeTokens(tokens: Token[]): MergedToken[] {
  if (!tokens || tokens.length === 0) {
    return [];
  }

  const result: MergedToken[] = [];
  let charOffset = 0;
  // The token seen on the previous iteration, whether or not it was itself
  // folded into an earlier entry.
  let previousToken: Token | null = null;

  for (const token of tokens) {
    const startPos = charOffset;
    const endPos = startPos + token.word.length;
    charOffset = endPos;

    const lastIndex = result.length - 1;
    const mergeIntoLast =
      lastIndex >= 0 &&
      previousToken !== null &&
      shouldMerge(previousToken, token);

    // `||` (not `??`) so an empty reading also falls back to the surface form.
    const tokenReading = ignoreReading(token)
      ? ""
      : token.katakanaReading || token.word;

    if (mergeIntoLast) {
      // Replace the last entry in place instead of pop()! + push().
      const last = result[lastIndex];
      result[lastIndex] = {
        surface: last.surface + token.word,
        reading: last.reading + tokenReading,
        headword: last.headword,
        startPos: last.startPos,
        endPos,
        partOfSpeech: last.partOfSpeech,
        isMerged: true,
      };
    } else {
      result.push({
        surface: token.word,
        reading: tokenReading,
        headword: token.headword,
        startPos,
        endPos,
        partOfSpeech: token.partOfSpeech,
        isMerged: false,
      });
    }

    previousToken = token;
  }

  return result;
}
|