mirror of
https://github.com/ksyasuda/SubMiner.git
synced 2026-03-20 12:11:28 -07:00
fix(subtitle): unify annotation token filtering
This commit is contained in:
@@ -3628,6 +3628,119 @@ test('tokenizeSubtitle excludes merged function/content token from frequency hig
|
|||||||
assert.equal(result.tokens?.[0]?.isNPlusOneTarget, true);
|
assert.equal(result.tokens?.[0]?.isNPlusOneTarget, true);
|
||||||
});
|
});
|
||||||
|
|
||||||
|
test('tokenizeSubtitle clears all annotations for kana-only demonstrative helper merges', async () => {
|
||||||
|
const result = await tokenizeSubtitle(
|
||||||
|
'これで実力どおりか',
|
||||||
|
makeDepsFromYomitanTokens(
|
||||||
|
[
|
||||||
|
{ surface: 'これで', reading: 'これで', headword: 'これ' },
|
||||||
|
{ surface: '実力どおり', reading: 'じつりょくどおり', headword: '実力どおり' },
|
||||||
|
{ surface: 'か', reading: 'か', headword: 'か' },
|
||||||
|
],
|
||||||
|
{
|
||||||
|
getFrequencyDictionaryEnabled: () => true,
|
||||||
|
getFrequencyRank: (text) =>
|
||||||
|
text === 'これ' ? 9 : text === '実力どおり' ? 2500 : text === 'か' ? 800 : null,
|
||||||
|
getJlptLevel: (text) =>
|
||||||
|
text === 'これ' ? 'N5' : text === '実力どおり' ? 'N1' : text === 'か' ? 'N5' : null,
|
||||||
|
isKnownWord: (text) => text === 'これ',
|
||||||
|
getMinSentenceWordsForNPlusOne: () => 1,
|
||||||
|
tokenizeWithMecab: async () => [
|
||||||
|
{
|
||||||
|
headword: 'これ',
|
||||||
|
surface: 'これ',
|
||||||
|
reading: 'コレ',
|
||||||
|
startPos: 0,
|
||||||
|
endPos: 2,
|
||||||
|
partOfSpeech: PartOfSpeech.noun,
|
||||||
|
pos1: '名詞',
|
||||||
|
pos2: '代名詞',
|
||||||
|
isMerged: false,
|
||||||
|
isKnown: false,
|
||||||
|
isNPlusOneTarget: false,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
headword: 'で',
|
||||||
|
surface: 'で',
|
||||||
|
reading: 'デ',
|
||||||
|
startPos: 2,
|
||||||
|
endPos: 3,
|
||||||
|
partOfSpeech: PartOfSpeech.particle,
|
||||||
|
pos1: '助詞',
|
||||||
|
pos2: '格助詞',
|
||||||
|
isMerged: false,
|
||||||
|
isKnown: false,
|
||||||
|
isNPlusOneTarget: false,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
headword: '実力どおり',
|
||||||
|
surface: '実力どおり',
|
||||||
|
reading: 'ジツリョクドオリ',
|
||||||
|
startPos: 3,
|
||||||
|
endPos: 8,
|
||||||
|
partOfSpeech: PartOfSpeech.noun,
|
||||||
|
pos1: '名詞',
|
||||||
|
pos2: '一般',
|
||||||
|
isMerged: false,
|
||||||
|
isKnown: false,
|
||||||
|
isNPlusOneTarget: false,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
headword: 'か',
|
||||||
|
surface: 'か',
|
||||||
|
reading: 'カ',
|
||||||
|
startPos: 8,
|
||||||
|
endPos: 9,
|
||||||
|
partOfSpeech: PartOfSpeech.particle,
|
||||||
|
pos1: '助詞',
|
||||||
|
pos2: '終助詞',
|
||||||
|
isMerged: false,
|
||||||
|
isKnown: false,
|
||||||
|
isNPlusOneTarget: false,
|
||||||
|
},
|
||||||
|
],
|
||||||
|
},
|
||||||
|
),
|
||||||
|
);
|
||||||
|
|
||||||
|
assert.deepEqual(
|
||||||
|
result.tokens?.map((token) => ({
|
||||||
|
surface: token.surface,
|
||||||
|
headword: token.headword,
|
||||||
|
isKnown: token.isKnown,
|
||||||
|
isNPlusOneTarget: token.isNPlusOneTarget,
|
||||||
|
frequencyRank: token.frequencyRank,
|
||||||
|
jlptLevel: token.jlptLevel,
|
||||||
|
})),
|
||||||
|
[
|
||||||
|
{
|
||||||
|
surface: 'これで',
|
||||||
|
headword: 'これ',
|
||||||
|
isKnown: false,
|
||||||
|
isNPlusOneTarget: false,
|
||||||
|
frequencyRank: undefined,
|
||||||
|
jlptLevel: undefined,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
surface: '実力どおり',
|
||||||
|
headword: '実力どおり',
|
||||||
|
isKnown: false,
|
||||||
|
isNPlusOneTarget: true,
|
||||||
|
frequencyRank: 2500,
|
||||||
|
jlptLevel: 'N1',
|
||||||
|
},
|
||||||
|
{
|
||||||
|
surface: 'か',
|
||||||
|
headword: 'か',
|
||||||
|
isKnown: false,
|
||||||
|
isNPlusOneTarget: false,
|
||||||
|
frequencyRank: undefined,
|
||||||
|
jlptLevel: undefined,
|
||||||
|
},
|
||||||
|
],
|
||||||
|
);
|
||||||
|
});
|
||||||
|
|
||||||
test('tokenizeSubtitle keeps frequency for content-led merged token with trailing colloquial suffixes', async () => {
|
test('tokenizeSubtitle keeps frequency for content-led merged token with trailing colloquial suffixes', async () => {
|
||||||
const result = await tokenizeSubtitle(
|
const result = await tokenizeSubtitle(
|
||||||
'張り切ってんじゃ',
|
'張り切ってんじゃ',
|
||||||
|
|||||||
@@ -316,6 +316,19 @@ test('shouldExcludeTokenFromSubtitleAnnotations excludes merged lexical tokens w
|
|||||||
assert.equal(shouldExcludeTokenFromSubtitleAnnotations(token), true);
|
assert.equal(shouldExcludeTokenFromSubtitleAnnotations(token), true);
|
||||||
});
|
});
|
||||||
|
|
||||||
|
test('shouldExcludeTokenFromSubtitleAnnotations excludes kana-only demonstrative helper merges', () => {
|
||||||
|
const token = makeToken({
|
||||||
|
surface: 'これで',
|
||||||
|
headword: 'これ',
|
||||||
|
reading: 'コレデ',
|
||||||
|
partOfSpeech: PartOfSpeech.noun,
|
||||||
|
pos1: '名詞|助詞',
|
||||||
|
pos2: '代名詞|格助詞',
|
||||||
|
});
|
||||||
|
|
||||||
|
assert.equal(shouldExcludeTokenFromSubtitleAnnotations(token), true);
|
||||||
|
});
|
||||||
|
|
||||||
test('stripSubtitleAnnotationMetadata keeps token hover data while clearing annotation fields', () => {
|
test('stripSubtitleAnnotationMetadata keeps token hover data while clearing annotation fields', () => {
|
||||||
const token = makeToken({
|
const token = makeToken({
|
||||||
surface: 'は',
|
surface: 'は',
|
||||||
@@ -481,8 +494,8 @@ test('annotateTokens N+1 minimum sentence words counts only eligible word tokens
|
|||||||
);
|
);
|
||||||
|
|
||||||
assert.equal(result[0]?.isKnown, false);
|
assert.equal(result[0]?.isKnown, false);
|
||||||
assert.equal(result[1]?.isKnown, true);
|
assert.equal(result[1]?.isKnown, false);
|
||||||
assert.equal(result[2]?.isKnown, true);
|
assert.equal(result[2]?.isKnown, false);
|
||||||
assert.equal(result[0]?.isNPlusOneTarget, false);
|
assert.equal(result[0]?.isNPlusOneTarget, false);
|
||||||
});
|
});
|
||||||
|
|
||||||
@@ -568,7 +581,7 @@ test('annotateTokens excludes default non-independent pos2 from frequency and N+
|
|||||||
assert.equal(result[0]?.isNPlusOneTarget, false);
|
assert.equal(result[0]?.isNPlusOneTarget, false);
|
||||||
});
|
});
|
||||||
|
|
||||||
test('annotateTokens keeps frequency for kanji noun tokens even when mecab marks them non-independent', () => {
|
test('annotateTokens clears all annotations for non-independent kanji noun tokens under unified gate', () => {
|
||||||
const tokens = [
|
const tokens = [
|
||||||
makeToken({
|
makeToken({
|
||||||
surface: '者',
|
surface: '者',
|
||||||
@@ -588,7 +601,10 @@ test('annotateTokens keeps frequency for kanji noun tokens even when mecab marks
|
|||||||
minSentenceWordsForNPlusOne: 1,
|
minSentenceWordsForNPlusOne: 1,
|
||||||
});
|
});
|
||||||
|
|
||||||
assert.equal(result[0]?.frequencyRank, 475);
|
assert.equal(result[0]?.isKnown, false);
|
||||||
|
assert.equal(result[0]?.isNPlusOneTarget, false);
|
||||||
|
assert.equal(result[0]?.frequencyRank, undefined);
|
||||||
|
assert.equal(result[0]?.jlptLevel, undefined);
|
||||||
});
|
});
|
||||||
|
|
||||||
test('annotateTokens excludes likely kana SFX tokens from frequency when POS tags are missing', () => {
|
test('annotateTokens excludes likely kana SFX tokens from frequency when POS tags are missing', () => {
|
||||||
@@ -742,3 +758,33 @@ test('annotateTokens excludes composite tokens when all component pos tags are e
|
|||||||
assert.equal(result[0]?.frequencyRank, undefined);
|
assert.equal(result[0]?.frequencyRank, undefined);
|
||||||
assert.equal(result[0]?.isNPlusOneTarget, false);
|
assert.equal(result[0]?.isNPlusOneTarget, false);
|
||||||
});
|
});
|
||||||
|
|
||||||
|
test('annotateTokens applies one shared exclusion gate across known N+1 frequency and JLPT', () => {
|
||||||
|
const tokens = [
|
||||||
|
makeToken({
|
||||||
|
surface: 'これで',
|
||||||
|
headword: 'これ',
|
||||||
|
reading: 'コレデ',
|
||||||
|
partOfSpeech: PartOfSpeech.noun,
|
||||||
|
pos1: '名詞|助詞',
|
||||||
|
pos2: '代名詞|格助詞',
|
||||||
|
startPos: 0,
|
||||||
|
endPos: 3,
|
||||||
|
frequencyRank: 9,
|
||||||
|
}),
|
||||||
|
];
|
||||||
|
|
||||||
|
const result = annotateTokens(
|
||||||
|
tokens,
|
||||||
|
makeDeps({
|
||||||
|
isKnownWord: (text) => text === 'これ',
|
||||||
|
getJlptLevel: (text) => (text === 'これ' ? 'N5' : null),
|
||||||
|
}),
|
||||||
|
{ minSentenceWordsForNPlusOne: 1 },
|
||||||
|
);
|
||||||
|
|
||||||
|
assert.equal(result[0]?.isKnown, false);
|
||||||
|
assert.equal(result[0]?.isNPlusOneTarget, false);
|
||||||
|
assert.equal(result[0]?.frequencyRank, undefined);
|
||||||
|
assert.equal(result[0]?.jlptLevel, undefined);
|
||||||
|
});
|
||||||
|
|||||||
@@ -9,6 +9,10 @@ import {
|
|||||||
} from '../../../token-pos2-exclusions';
|
} from '../../../token-pos2-exclusions';
|
||||||
import { JlptLevel, MergedToken, NPlusOneMatchMode, PartOfSpeech } from '../../../types';
|
import { JlptLevel, MergedToken, NPlusOneMatchMode, PartOfSpeech } from '../../../types';
|
||||||
import { shouldIgnoreJlptByTerm, shouldIgnoreJlptForMecabPos1 } from '../jlpt-token-filter';
|
import { shouldIgnoreJlptByTerm, shouldIgnoreJlptForMecabPos1 } from '../jlpt-token-filter';
|
||||||
|
import {
|
||||||
|
shouldExcludeTokenFromSubtitleAnnotations as sharedShouldExcludeTokenFromSubtitleAnnotations,
|
||||||
|
stripSubtitleAnnotationMetadata as sharedStripSubtitleAnnotationMetadata,
|
||||||
|
} from './subtitle-annotation-filter';
|
||||||
|
|
||||||
const KATAKANA_TO_HIRAGANA_OFFSET = 0x60;
|
const KATAKANA_TO_HIRAGANA_OFFSET = 0x60;
|
||||||
const KATAKANA_CODEPOINT_START = 0x30a1;
|
const KATAKANA_CODEPOINT_START = 0x30a1;
|
||||||
@@ -633,34 +637,11 @@ function isExcludedFromSubtitleAnnotationsByTerm(token: MergedToken): boolean {
|
|||||||
}
|
}
|
||||||
|
|
||||||
export function shouldExcludeTokenFromSubtitleAnnotations(token: MergedToken): boolean {
|
export function shouldExcludeTokenFromSubtitleAnnotations(token: MergedToken): boolean {
|
||||||
if (isExcludedFromSubtitleAnnotationsByPos1(normalizePos1Tag(token.pos1))) {
|
return sharedShouldExcludeTokenFromSubtitleAnnotations(token);
|
||||||
return true;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (isAuxiliaryStemGrammarTailToken(token)) {
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (isExcludedTrailingParticleMergedToken(token)) {
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
|
|
||||||
return isExcludedFromSubtitleAnnotationsByTerm(token);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
export function stripSubtitleAnnotationMetadata(token: MergedToken): MergedToken {
|
export function stripSubtitleAnnotationMetadata(token: MergedToken): MergedToken {
|
||||||
if (!shouldExcludeTokenFromSubtitleAnnotations(token)) {
|
return sharedStripSubtitleAnnotationMetadata(token);
|
||||||
return token;
|
|
||||||
}
|
|
||||||
|
|
||||||
return {
|
|
||||||
...token,
|
|
||||||
isKnown: false,
|
|
||||||
isNPlusOneTarget: false,
|
|
||||||
isNameMatch: false,
|
|
||||||
jlptLevel: undefined,
|
|
||||||
frequencyRank: undefined,
|
|
||||||
};
|
|
||||||
}
|
}
|
||||||
|
|
||||||
function computeTokenKnownStatus(
|
function computeTokenKnownStatus(
|
||||||
@@ -737,6 +718,18 @@ export function annotateTokens(
|
|||||||
|
|
||||||
// Single pass: compute known word status, frequency filtering, and JLPT level together
|
// Single pass: compute known word status, frequency filtering, and JLPT level together
|
||||||
const annotated = tokens.map((token) => {
|
const annotated = tokens.map((token) => {
|
||||||
|
if (
|
||||||
|
sharedShouldExcludeTokenFromSubtitleAnnotations(token, {
|
||||||
|
pos1Exclusions,
|
||||||
|
pos2Exclusions,
|
||||||
|
})
|
||||||
|
) {
|
||||||
|
return sharedStripSubtitleAnnotationMetadata(token, {
|
||||||
|
pos1Exclusions,
|
||||||
|
pos2Exclusions,
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
const prioritizedNameMatch = nameMatchEnabled && token.isNameMatch === true;
|
const prioritizedNameMatch = nameMatchEnabled && token.isNameMatch === true;
|
||||||
const isKnown = nPlusOneEnabled
|
const isKnown = nPlusOneEnabled
|
||||||
? computeTokenKnownStatus(token, deps.isKnownWord, deps.knownWordMatchMode)
|
? computeTokenKnownStatus(token, deps.isKnownWord, deps.knownWordMatchMode)
|
||||||
|
|||||||
341
src/core/services/tokenizer/subtitle-annotation-filter.ts
Normal file
341
src/core/services/tokenizer/subtitle-annotation-filter.ts
Normal file
@@ -0,0 +1,341 @@
|
|||||||
|
import {
|
||||||
|
DEFAULT_ANNOTATION_POS1_EXCLUSION_CONFIG,
|
||||||
|
resolveAnnotationPos1ExclusionSet,
|
||||||
|
} from '../../../token-pos1-exclusions';
|
||||||
|
import {
|
||||||
|
DEFAULT_ANNOTATION_POS2_EXCLUSION_CONFIG,
|
||||||
|
resolveAnnotationPos2ExclusionSet,
|
||||||
|
} from '../../../token-pos2-exclusions';
|
||||||
|
import { MergedToken, PartOfSpeech } from '../../../types';
|
||||||
|
import { shouldIgnoreJlptByTerm } from '../jlpt-token-filter';
|
||||||
|
|
||||||
|
// Katakana folding parameters used by normalizeKana: code points inside
// [KATAKANA_CODEPOINT_START, KATAKANA_CODEPOINT_END] are shifted down by
// KATAKANA_TO_HIRAGANA_OFFSET.
const KATAKANA_TO_HIRAGANA_OFFSET = 0x60;
const KATAKANA_CODEPOINT_START = 0x30a1;
const KATAKANA_CODEPOINT_END = 0x30f6;

// Terms that the term-based check excludes outright when they match a token's
// surface, reading, or headword.
const SUBTITLE_ANNOTATION_EXCLUDED_TERMS = new Set([
  'ああ',
  'ええ',
  'うう',
  'おお',
  'はあ',
  'はは',
  'へえ',
  'ふう',
  'ほう',
]);

// Building blocks for the excluded "explanatory ending" strings. Each ending is
// one prefix, followed by one core, followed by one trailing particle (the empty
// string stands for "no trailing particle").
const SUBTITLE_ANNOTATION_EXCLUDED_EXPLANATORY_ENDING_PREFIXES = ['ん', 'の', 'なん', 'なの'];
const SUBTITLE_ANNOTATION_EXCLUDED_EXPLANATORY_ENDING_CORES = [
  'だ',
  'です',
  'でした',
  'だった',
  'では',
  'じゃ',
  'でしょう',
  'だろう',
] as const;
const SUBTITLE_ANNOTATION_EXCLUDED_EXPLANATORY_ENDING_TRAILING_PARTICLES = [
  '',
  'か',
  'ね',
  'よ',
  'な',
  'よね',
  'かな',
  'かね',
] as const;

// Full cross product of prefix x core x trailing particle, materialised once at
// module load (prefix-major, then core, then particle).
const SUBTITLE_ANNOTATION_EXCLUDED_EXPLANATORY_ENDINGS = (() => {
  const endings = new Set<string>();
  for (const prefix of SUBTITLE_ANNOTATION_EXCLUDED_EXPLANATORY_ENDING_PREFIXES) {
    for (const core of SUBTITLE_ANNOTATION_EXCLUDED_EXPLANATORY_ENDING_CORES) {
      for (const particle of SUBTITLE_ANNOTATION_EXCLUDED_EXPLANATORY_ENDING_TRAILING_PARTICLES) {
        endings.add(`${prefix}${core}${particle}`);
      }
    }
  }
  return endings;
})();

// Surface-minus-headword remainders that mark a merged token as carrying an
// excluded trailing particle sequence.
const SUBTITLE_ANNOTATION_EXCLUDED_TRAILING_PARTICLE_SUFFIXES = new Set([
  'って',
  'ってよ',
  'ってね',
  'ってな',
  'ってさ',
  'ってか',
  'ってば',
]);

// POS1 tags a token may consist of for the auxiliary-stem grammar-tail check.
const AUXILIARY_STEM_GRAMMAR_TAIL_POS1 = new Set(['名詞', '助動詞', '助詞']);
|
||||||
|
|
||||||
|
/**
 * Optional overrides for the part-of-speech exclusion sets used by the subtitle
 * annotation filter. Any set left undefined is replaced by the set resolved from
 * the corresponding default annotation exclusion config.
 */
export interface SubtitleAnnotationFilterOptions {
  /** POS1 tags to treat as excluded; defaults are resolved when omitted. */
  pos1Exclusions?: ReadonlySet<string>;
  /** POS2 tags to treat as excluded; defaults are resolved when omitted. */
  pos2Exclusions?: ReadonlySet<string>;
}
|
||||||
|
|
||||||
|
/**
 * Returns the POS tag with surrounding whitespace removed, or an empty string
 * when the tag is not a string (e.g. undefined).
 */
function normalizePosTag(pos: string | undefined): string {
  if (typeof pos !== 'string') {
    return '';
  }

  return pos.trim();
}
|
||||||
|
|
||||||
|
/**
 * Splits a `|`-joined POS tag into its individual components.
 *
 * Each component is trimmed and empty components are dropped, so an empty tag
 * (or one made only of separators/whitespace) yields an empty array.
 */
function splitNormalizedTagParts(normalizedTag: string): string[] {
  const parts: string[] = [];
  if (!normalizedTag) {
    return parts;
  }

  for (const rawPart of normalizedTag.split('|')) {
    const part = rawPart.trim();
    if (part.length > 0) {
      parts.push(part);
    }
  }

  return parts;
}
|
||||||
|
|
||||||
|
/**
 * Reports whether a (possibly `|`-joined) POS tag is fully covered by an
 * exclusion set.
 *
 * Returns true only when the tag has at least one component and every component
 * is a member of `exclusions`; a tag with no components is never excluded.
 */
function isExcludedByTagSet(normalizedTag: string, exclusions: ReadonlySet<string>): boolean {
  const tagParts = splitNormalizedTagParts(normalizedTag);

  for (const part of tagParts) {
    if (!exclusions.has(part)) {
      return false;
    }
  }

  // An empty tag must not count as "all parts excluded".
  return tagParts.length > 0;
}
|
||||||
|
|
||||||
|
/**
 * Picks the POS1 exclusion set to use: the caller-supplied
 * `options.pos1Exclusions` when present, otherwise the set resolved from
 * DEFAULT_ANNOTATION_POS1_EXCLUSION_CONFIG.
 */
function resolvePos1Exclusions(
  options: SubtitleAnnotationFilterOptions = {},
): ReadonlySet<string> {
  const { pos1Exclusions } = options;

  return (
    pos1Exclusions || resolveAnnotationPos1ExclusionSet(DEFAULT_ANNOTATION_POS1_EXCLUSION_CONFIG)
  );
}
|
||||||
|
|
||||||
|
/**
 * Picks the POS2 exclusion set to use: the caller-supplied
 * `options.pos2Exclusions` when present, otherwise the set resolved from
 * DEFAULT_ANNOTATION_POS2_EXCLUSION_CONFIG.
 */
function resolvePos2Exclusions(
  options: SubtitleAnnotationFilterOptions = {},
): ReadonlySet<string> {
  const { pos2Exclusions } = options;

  return (
    pos2Exclusions || resolveAnnotationPos2ExclusionSet(DEFAULT_ANNOTATION_POS2_EXCLUSION_CONFIG)
  );
}
|
||||||
|
|
||||||
|
/**
 * Trims the text and folds katakana into hiragana.
 *
 * Every code point within [KATAKANA_CODEPOINT_START, KATAKANA_CODEPOINT_END] is
 * shifted down by KATAKANA_TO_HIRAGANA_OFFSET; all other characters are kept
 * as-is. Blank input yields an empty string.
 */
function normalizeKana(text: string): string {
  const trimmed = text.trim();

  // Array.from walks the string by code point, matching a for...of traversal.
  const folded = Array.from(trimmed, (char) => {
    const codePoint = char.codePointAt(0);
    if (codePoint === undefined) {
      return '';
    }

    const isFoldableKatakana =
      codePoint >= KATAKANA_CODEPOINT_START && codePoint <= KATAKANA_CODEPOINT_END;
    return isFoldableKatakana
      ? String.fromCodePoint(codePoint - KATAKANA_TO_HIRAGANA_OFFSET)
      : char;
  });

  return folded.join('');
}
|
||||||
|
|
||||||
|
/**
 * Reports whether the first code point of `char` is accepted as kana.
 *
 * Accepted ranges: U+3041–U+3096, U+309B–U+309F, and U+30A0–U+30FF with the
 * single exception of U+30FB. An empty string is not kana.
 */
function isKanaChar(char: string): boolean {
  const codePoint = char.codePointAt(0);
  if (codePoint === undefined) {
    return false;
  }

  if (codePoint >= 0x3041 && codePoint <= 0x3096) {
    return true;
  }

  if (codePoint >= 0x309b && codePoint <= 0x309f) {
    return true;
  }

  // U+30A0–U+30FA, U+30FC and U+30FD–U+30FF together are the whole U+30A0–U+30FF
  // range minus U+30FB.
  return codePoint >= 0x30a0 && codePoint <= 0x30ff && codePoint !== 0x30fb;
}
|
||||||
|
|
||||||
|
/**
 * Matches short all-kana strings that end in a small っ.
 *
 * After kana folding the text must be 2–4 characters long, consist solely of
 * kana characters, and have 'っ' as its final character.
 */
function isTrailingSmallTsuKanaSfx(text: string): boolean {
  const chars = [...normalizeKana(text)];
  const hasAllowedLength = chars.length >= 2 && chars.length <= 4;

  return (
    hasAllowedLength &&
    chars.every((char) => isKanaChar(char)) &&
    chars[chars.length - 1] === 'っ'
  );
}
|
||||||
|
|
||||||
|
/**
 * Matches all-kana strings whose second half repeats the first half.
 *
 * After kana folding the text must have an even length of at least four
 * characters, contain only kana, and split into two identical halves.
 */
function isReduplicatedKanaSfx(text: string): boolean {
  const chars = [...normalizeKana(text)];
  const count = chars.length;
  if (count < 4 || count % 2 !== 0) {
    return false;
  }

  if (!chars.every((char) => isKanaChar(char))) {
    return false;
  }

  const half = count / 2;
  const leadingHalf = chars.slice(0, half);
  const trailingHalf = chars.slice(half);
  return leadingHalf.every((char, index) => char === trailingHalf[index]);
}
|
||||||
|
|
||||||
|
/**
 * Like isReduplicatedKanaSfx, but also accepts a reduplicated kana string that
 * is followed by a single trailing 'と'.
 */
function isReduplicatedKanaSfxWithOptionalTrailingTo(text: string): boolean {
  const normalized = normalizeKana(text);

  // Plain reduplication (an empty string never matches here).
  if (isReduplicatedKanaSfx(normalized)) {
    return true;
  }

  // Otherwise retry with one trailing 'と' removed.
  const hasTrailingTo = normalized.length > 1 && normalized.endsWith('と');
  return hasTrailingTo && isReduplicatedKanaSfx(normalized.slice(0, -1));
}
|
||||||
|
|
||||||
|
/**
 * Detects merged tokens of the form "headword + excluded trailing particle
 * suffix".
 *
 * All of the following must hold:
 * - the kana-folded surface starts with the kana-folded headword (both non-empty);
 * - the remainder of the surface is one of
 *   SUBTITLE_ANNOTATION_EXCLUDED_TRAILING_PARTICLE_SUFFIXES;
 * - pos1 has at least two `|`-separated components;
 * - the first pos1 component is not in the default POS1 exclusion set;
 * - every remaining pos1 component is '助詞'.
 *
 * NOTE(review): the leading-component check always consults the default POS1
 * exclusions, not any caller-supplied options — confirm this is intended.
 */
function isExcludedTrailingParticleMergedToken(token: MergedToken): boolean {
  const surface = normalizeKana(token.surface);
  const headword = normalizeKana(token.headword);
  const surfaceExtendsHeadword =
    surface.length > 0 && headword.length > 0 && surface.startsWith(headword);
  if (!surfaceExtendsHeadword) {
    return false;
  }

  const trailingText = surface.slice(headword.length);
  if (!SUBTITLE_ANNOTATION_EXCLUDED_TRAILING_PARTICLE_SUFFIXES.has(trailingText)) {
    return false;
  }

  const [leadingPos1, ...trailingPos1] = splitNormalizedTagParts(normalizePosTag(token.pos1));
  // Fewer than two components: no leading tag, or nothing trailing it.
  if (!leadingPos1 || trailingPos1.length === 0) {
    return false;
  }

  if (resolvePos1Exclusions().has(leadingPos1)) {
    return false;
  }

  return !trailingPos1.some((part) => part !== '助詞');
}
|
||||||
|
|
||||||
|
/**
 * Detects tokens whose pos1 components all belong to
 * AUXILIARY_STEM_GRAMMAR_TAIL_POS1 and whose pos3 components include
 * '助動詞語幹'. Tokens without any pos1 component never match.
 */
function isAuxiliaryStemGrammarTailToken(token: MergedToken): boolean {
  const pos1Parts = splitNormalizedTagParts(normalizePosTag(token.pos1));
  if (pos1Parts.length === 0) {
    return false;
  }

  for (const part of pos1Parts) {
    if (!AUXILIARY_STEM_GRAMMAR_TAIL_POS1.has(part)) {
      return false;
    }
  }

  return splitNormalizedTagParts(normalizePosTag(token.pos3)).includes('助動詞語幹');
}
|
||||||
|
|
||||||
|
/**
 * Term-based exclusion check over a token's surface, reading, and headword.
 *
 * Each non-empty string candidate is trimmed and kana-folded, and the token is
 * excluded as soon as one candidate:
 * - is in SUBTITLE_ANNOTATION_EXCLUDED_TERMS or
 *   SUBTITLE_ANNOTATION_EXCLUDED_EXPLANATORY_ENDINGS (trimmed or folded form),
 * - is rejected by shouldIgnoreJlptByTerm (trimmed or folded form), or
 * - matches one of the kana SFX shapes (trailing small っ, or reduplicated kana
 *   with an optional trailing と).
 *
 * @param token Token whose surface/reading/headword are inspected.
 * @returns true when any candidate matches one of the rules above.
 */
function isExcludedByTerm(token: MergedToken): boolean {
  const candidates = [token.surface, token.reading, token.headword].filter(
    (candidate): candidate is string => typeof candidate === 'string' && candidate.length > 0,
  );

  return candidates.some((candidate) => {
    const trimmed = candidate.trim();
    if (!trimmed) {
      return false;
    }

    const normalized = normalizeKana(trimmed);
    if (!normalized) {
      return false;
    }

    if (
      SUBTITLE_ANNOTATION_EXCLUDED_TERMS.has(trimmed) ||
      SUBTITLE_ANNOTATION_EXCLUDED_TERMS.has(normalized) ||
      SUBTITLE_ANNOTATION_EXCLUDED_EXPLANATORY_ENDINGS.has(trimmed) ||
      SUBTITLE_ANNOTATION_EXCLUDED_EXPLANATORY_ENDINGS.has(normalized) ||
      shouldIgnoreJlptByTerm(trimmed) ||
      shouldIgnoreJlptByTerm(normalized)
    ) {
      return true;
    }

    // Both SFX helpers run their argument through normalizeKana themselves, and
    // that folding is idempotent, so testing `trimmed` as well as `normalized`
    // would only repeat the same computation. One call per helper is enough.
    return (
      isTrailingSmallTsuKanaSfx(normalized) ||
      isReduplicatedKanaSfxWithOptionalTrailingTo(normalized)
    );
  });
}
|
||||||
|
|
||||||
|
/**
 * Shared gate deciding whether a token should carry no subtitle annotations.
 *
 * The token is excluded when any of these holds, checked in order:
 * 1. every pos1 component is in the POS1 exclusion set;
 * 2. every pos2 component is in the POS2 exclusion set;
 * 3. both pos1 and pos2 are blank and partOfSpeech is particle,
 *    bound_auxiliary, or symbol;
 * 4. it is an auxiliary-stem grammar tail token;
 * 5. it is a merged token with an excluded trailing particle suffix;
 * 6. one of its terms is excluded by the term-based check.
 *
 * @param token Token to evaluate.
 * @param options Optional POS1/POS2 exclusion overrides; defaults are resolved
 *   for whichever set is omitted.
 * @returns true when the token must be excluded from annotations.
 */
export function shouldExcludeTokenFromSubtitleAnnotations(
  token: MergedToken,
  options: SubtitleAnnotationFilterOptions = {},
): boolean {
  const pos1Exclusions = resolvePos1Exclusions(options);
  const pos2Exclusions = resolvePos2Exclusions(options);
  const pos1Tag = normalizePosTag(token.pos1);
  const pos2Tag = normalizePosTag(token.pos2);

  if (
    isExcludedByTagSet(pos1Tag, pos1Exclusions) ||
    isExcludedByTagSet(pos2Tag, pos2Exclusions)
  ) {
    return true;
  }

  // Without any POS tags, fall back to the coarse partOfSpeech classification.
  const lacksPosTags = pos1Tag.length === 0 && pos2Tag.length === 0;
  if (lacksPosTags) {
    const { partOfSpeech } = token;
    if (
      partOfSpeech === PartOfSpeech.particle ||
      partOfSpeech === PartOfSpeech.bound_auxiliary ||
      partOfSpeech === PartOfSpeech.symbol
    ) {
      return true;
    }
  }

  return (
    isAuxiliaryStemGrammarTailToken(token) ||
    isExcludedTrailingParticleMergedToken(token) ||
    isExcludedByTerm(token)
  );
}
|
||||||
|
|
||||||
|
/**
 * Clears annotation fields on tokens that the shared exclusion gate rejects.
 *
 * Tokens that are not excluded are returned unchanged (same object). Excluded
 * tokens are returned as a shallow copy with isKnown, isNPlusOneTarget and
 * isNameMatch set to false and jlptLevel / frequencyRank set to undefined; all
 * other fields are carried over.
 *
 * @param token Token to sanitise.
 * @param options Optional POS1/POS2 exclusion overrides forwarded to the gate.
 */
export function stripSubtitleAnnotationMetadata(
  token: MergedToken,
  options: SubtitleAnnotationFilterOptions = {},
): MergedToken {
  const isExcluded = shouldExcludeTokenFromSubtitleAnnotations(token, options);

  return isExcluded
    ? {
        ...token,
        isKnown: false,
        isNPlusOneTarget: false,
        isNameMatch: false,
        jlptLevel: undefined,
        frequencyRank: undefined,
      }
    : token;
}
|
||||||
@@ -19,6 +19,7 @@
|
|||||||
import { PartOfSpeech, Token, MergedToken } from './types';
|
import { PartOfSpeech, Token, MergedToken } from './types';
|
||||||
import { DEFAULT_ANNOTATION_POS1_EXCLUSION_CONFIG } from './token-pos1-exclusions';
|
import { DEFAULT_ANNOTATION_POS1_EXCLUSION_CONFIG } from './token-pos1-exclusions';
|
||||||
import { DEFAULT_ANNOTATION_POS2_EXCLUSION_CONFIG } from './token-pos2-exclusions';
|
import { DEFAULT_ANNOTATION_POS2_EXCLUSION_CONFIG } from './token-pos2-exclusions';
|
||||||
|
import { shouldExcludeTokenFromSubtitleAnnotations } from './core/services/tokenizer/subtitle-annotation-filter';
|
||||||
|
|
||||||
export function isNoun(tok: Token): boolean {
|
export function isNoun(tok: Token): boolean {
|
||||||
return tok.partOfSpeech === PartOfSpeech.noun;
|
return tok.partOfSpeech === PartOfSpeech.noun;
|
||||||
@@ -297,6 +298,10 @@ function isNPlusOneWordCountToken(
|
|||||||
pos1Exclusions: ReadonlySet<string> = N_PLUS_ONE_IGNORED_POS1,
|
pos1Exclusions: ReadonlySet<string> = N_PLUS_ONE_IGNORED_POS1,
|
||||||
pos2Exclusions: ReadonlySet<string> = N_PLUS_ONE_IGNORED_POS2,
|
pos2Exclusions: ReadonlySet<string> = N_PLUS_ONE_IGNORED_POS2,
|
||||||
): boolean {
|
): boolean {
|
||||||
|
if (shouldExcludeTokenFromSubtitleAnnotations(token, { pos1Exclusions, pos2Exclusions })) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
const normalizedPos1 = normalizePos1Tag(token.pos1);
|
const normalizedPos1 = normalizePos1Tag(token.pos1);
|
||||||
const hasPos1 = normalizedPos1.length > 0;
|
const hasPos1 = normalizedPos1.length > 0;
|
||||||
if (isExcludedByTagSet(normalizedPos1, pos1Exclusions)) {
|
if (isExcludedByTagSet(normalizedPos1, pos1Exclusions)) {
|
||||||
|
|||||||
Reference in New Issue
Block a user