mirror of
https://github.com/ksyasuda/SubMiner.git
synced 2026-03-20 03:16:46 -07:00
fix(subtitle): unify annotation token filtering
This commit is contained in:
@@ -3628,6 +3628,119 @@ test('tokenizeSubtitle excludes merged function/content token from frequency hig
|
||||
assert.equal(result.tokens?.[0]?.isNPlusOneTarget, true);
|
||||
});
|
||||
|
||||
// Scenario: Yomitan merges the demonstrative 'これ' with the particle 'で' into one
// token ('これで', headword 'これ'), while MeCab reports them as separate noun/particle
// tokens. Although the stubbed deps report 'これ' as a known word with a frequency rank
// and a JLPT level, the merged token is expected to carry none of those annotations.
// The trailing particle 'か' is expected to be cleared as well, while the content word
// '実力どおり' keeps its frequency/JLPT data and becomes the N+1 target.
test('tokenizeSubtitle clears all annotations for kana-only demonstrative helper merges', async () => {
  const result = await tokenizeSubtitle(
    'これで実力どおりか',
    makeDepsFromYomitanTokens(
      [
        { surface: 'これで', reading: 'これで', headword: 'これ' },
        { surface: '実力どおり', reading: 'じつりょくどおり', headword: '実力どおり' },
        { surface: 'か', reading: 'か', headword: 'か' },
      ],
      {
        getFrequencyDictionaryEnabled: () => true,
        getFrequencyRank: (text) =>
          text === 'これ' ? 9 : text === '実力どおり' ? 2500 : text === 'か' ? 800 : null,
        getJlptLevel: (text) =>
          text === 'これ' ? 'N5' : text === '実力どおり' ? 'N1' : text === 'か' ? 'N5' : null,
        isKnownWord: (text) => text === 'これ',
        getMinSentenceWordsForNPlusOne: () => 1,
        // MeCab view of the same sentence: four tokens with contiguous offsets 0..9.
        tokenizeWithMecab: async () => [
          {
            headword: 'これ',
            surface: 'これ',
            reading: 'コレ',
            startPos: 0,
            endPos: 2,
            partOfSpeech: PartOfSpeech.noun,
            pos1: '名詞',
            pos2: '代名詞',
            isMerged: false,
            isKnown: false,
            isNPlusOneTarget: false,
          },
          {
            headword: 'で',
            surface: 'で',
            reading: 'デ',
            startPos: 2,
            endPos: 3,
            partOfSpeech: PartOfSpeech.particle,
            pos1: '助詞',
            pos2: '格助詞',
            isMerged: false,
            isKnown: false,
            isNPlusOneTarget: false,
          },
          {
            headword: '実力どおり',
            surface: '実力どおり',
            reading: 'ジツリョクドオリ',
            startPos: 3,
            endPos: 8,
            partOfSpeech: PartOfSpeech.noun,
            pos1: '名詞',
            pos2: '一般',
            isMerged: false,
            isKnown: false,
            isNPlusOneTarget: false,
          },
          {
            headword: 'か',
            surface: 'か',
            reading: 'カ',
            startPos: 8,
            endPos: 9,
            partOfSpeech: PartOfSpeech.particle,
            pos1: '助詞',
            pos2: '終助詞',
            isMerged: false,
            isKnown: false,
            isNPlusOneTarget: false,
          },
        ],
      },
    ),
  );

  // Compare only the annotation-related fields of each resulting token.
  assert.deepEqual(
    result.tokens?.map((token) => ({
      surface: token.surface,
      headword: token.headword,
      isKnown: token.isKnown,
      isNPlusOneTarget: token.isNPlusOneTarget,
      frequencyRank: token.frequencyRank,
      jlptLevel: token.jlptLevel,
    })),
    [
      {
        surface: 'これで',
        headword: 'これ',
        isKnown: false,
        isNPlusOneTarget: false,
        frequencyRank: undefined,
        jlptLevel: undefined,
      },
      {
        surface: '実力どおり',
        headword: '実力どおり',
        isKnown: false,
        isNPlusOneTarget: true,
        frequencyRank: 2500,
        jlptLevel: 'N1',
      },
      {
        surface: 'か',
        headword: 'か',
        isKnown: false,
        isNPlusOneTarget: false,
        frequencyRank: undefined,
        jlptLevel: undefined,
      },
    ],
  );
});
|
||||
|
||||
test('tokenizeSubtitle keeps frequency for content-led merged token with trailing colloquial suffixes', async () => {
|
||||
const result = await tokenizeSubtitle(
|
||||
'張り切ってんじゃ',
|
||||
|
||||
@@ -316,6 +316,19 @@ test('shouldExcludeTokenFromSubtitleAnnotations excludes merged lexical tokens w
|
||||
assert.equal(shouldExcludeTokenFromSubtitleAnnotations(token), true);
|
||||
});
|
||||
|
||||
// A merged token whose pos1/pos2 are pipe-joined component tags (noun + particle,
// pronoun + case particle) and whose surface is the headword 'これ' plus 'で' is
// expected to be excluded from subtitle annotations.
test('shouldExcludeTokenFromSubtitleAnnotations excludes kana-only demonstrative helper merges', () => {
  const token = makeToken({
    surface: 'これで',
    headword: 'これ',
    reading: 'コレデ',
    partOfSpeech: PartOfSpeech.noun,
    pos1: '名詞|助詞',
    pos2: '代名詞|格助詞',
  });

  assert.equal(shouldExcludeTokenFromSubtitleAnnotations(token), true);
});
|
||||
|
||||
test('stripSubtitleAnnotationMetadata keeps token hover data while clearing annotation fields', () => {
|
||||
const token = makeToken({
|
||||
surface: 'は',
|
||||
@@ -481,8 +494,8 @@ test('annotateTokens N+1 minimum sentence words counts only eligible word tokens
|
||||
);
|
||||
|
||||
assert.equal(result[0]?.isKnown, false);
|
||||
assert.equal(result[1]?.isKnown, true);
|
||||
assert.equal(result[2]?.isKnown, true);
|
||||
assert.equal(result[1]?.isKnown, false);
|
||||
assert.equal(result[2]?.isKnown, false);
|
||||
assert.equal(result[0]?.isNPlusOneTarget, false);
|
||||
});
|
||||
|
||||
@@ -568,7 +581,7 @@ test('annotateTokens excludes default non-independent pos2 from frequency and N+
|
||||
assert.equal(result[0]?.isNPlusOneTarget, false);
|
||||
});
|
||||
|
||||
test('annotateTokens keeps frequency for kanji noun tokens even when mecab marks them non-independent', () => {
|
||||
test('annotateTokens clears all annotations for non-independent kanji noun tokens under unified gate', () => {
|
||||
const tokens = [
|
||||
makeToken({
|
||||
surface: '者',
|
||||
@@ -588,7 +601,10 @@ test('annotateTokens keeps frequency for kanji noun tokens even when mecab marks
|
||||
minSentenceWordsForNPlusOne: 1,
|
||||
});
|
||||
|
||||
assert.equal(result[0]?.frequencyRank, 475);
|
||||
assert.equal(result[0]?.isKnown, false);
|
||||
assert.equal(result[0]?.isNPlusOneTarget, false);
|
||||
assert.equal(result[0]?.frequencyRank, undefined);
|
||||
assert.equal(result[0]?.jlptLevel, undefined);
|
||||
});
|
||||
|
||||
test('annotateTokens excludes likely kana SFX tokens from frequency when POS tags are missing', () => {
|
||||
@@ -742,3 +758,33 @@ test('annotateTokens excludes composite tokens when all component pos tags are e
|
||||
assert.equal(result[0]?.frequencyRank, undefined);
|
||||
assert.equal(result[0]?.isNPlusOneTarget, false);
|
||||
});
|
||||
|
||||
// The input token arrives with a pre-set frequency rank, and the stubbed deps would
// mark its headword as known and JLPT N5. The test expects annotateTokens to clear
// all four annotation kinds (known, N+1 target, frequency, JLPT) for this token.
test('annotateTokens applies one shared exclusion gate across known N+1 frequency and JLPT', () => {
  const tokens = [
    makeToken({
      surface: 'これで',
      headword: 'これ',
      reading: 'コレデ',
      partOfSpeech: PartOfSpeech.noun,
      pos1: '名詞|助詞',
      pos2: '代名詞|格助詞',
      startPos: 0,
      endPos: 3,
      frequencyRank: 9,
    }),
  ];

  const result = annotateTokens(
    tokens,
    makeDeps({
      isKnownWord: (text) => text === 'これ',
      getJlptLevel: (text) => (text === 'これ' ? 'N5' : null),
    }),
    { minSentenceWordsForNPlusOne: 1 },
  );

  assert.equal(result[0]?.isKnown, false);
  assert.equal(result[0]?.isNPlusOneTarget, false);
  assert.equal(result[0]?.frequencyRank, undefined);
  assert.equal(result[0]?.jlptLevel, undefined);
});
|
||||
|
||||
@@ -9,6 +9,10 @@ import {
|
||||
} from '../../../token-pos2-exclusions';
|
||||
import { JlptLevel, MergedToken, NPlusOneMatchMode, PartOfSpeech } from '../../../types';
|
||||
import { shouldIgnoreJlptByTerm, shouldIgnoreJlptForMecabPos1 } from '../jlpt-token-filter';
|
||||
import {
|
||||
shouldExcludeTokenFromSubtitleAnnotations as sharedShouldExcludeTokenFromSubtitleAnnotations,
|
||||
stripSubtitleAnnotationMetadata as sharedStripSubtitleAnnotationMetadata,
|
||||
} from './subtitle-annotation-filter';
|
||||
|
||||
const KATAKANA_TO_HIRAGANA_OFFSET = 0x60;
|
||||
const KATAKANA_CODEPOINT_START = 0x30a1;
|
||||
@@ -633,34 +637,11 @@ function isExcludedFromSubtitleAnnotationsByTerm(token: MergedToken): boolean {
|
||||
}
|
||||
|
||||
/**
 * Reports whether `token` should be excluded from subtitle annotations.
 *
 * Delegates to the shared implementation imported from
 * './subtitle-annotation-filter', called without options so that it falls back
 * to its own default exclusion sets.
 *
 * The rendered diff showed the former inline checks followed by an unreachable
 * delegation; the hunk header (34 old lines, 11 new lines) only adds up when the
 * inline checks are the removed side, so only the delegation is kept here.
 *
 * @param token - Token to evaluate.
 * @returns `true` when the shared filter excludes the token.
 */
export function shouldExcludeTokenFromSubtitleAnnotations(token: MergedToken): boolean {
  return sharedShouldExcludeTokenFromSubtitleAnnotations(token);
}
|
||||
|
||||
/**
 * Returns `token` with its annotation metadata cleared when the shared filter
 * excludes it.
 *
 * Delegates to the shared implementation imported from
 * './subtitle-annotation-filter', called without options. The rendered diff
 * showed the former inline copy followed by an unreachable delegation; only the
 * delegation (the added side of the hunk) is kept.
 *
 * @param token - Token to evaluate.
 * @returns The result of the shared `stripSubtitleAnnotationMetadata`.
 */
export function stripSubtitleAnnotationMetadata(token: MergedToken): MergedToken {
  return sharedStripSubtitleAnnotationMetadata(token);
}
|
||||
|
||||
function computeTokenKnownStatus(
|
||||
@@ -737,6 +718,18 @@ export function annotateTokens(
|
||||
|
||||
// Single pass: compute known word status, frequency filtering, and JLPT level together
|
||||
const annotated = tokens.map((token) => {
|
||||
if (
|
||||
sharedShouldExcludeTokenFromSubtitleAnnotations(token, {
|
||||
pos1Exclusions,
|
||||
pos2Exclusions,
|
||||
})
|
||||
) {
|
||||
return sharedStripSubtitleAnnotationMetadata(token, {
|
||||
pos1Exclusions,
|
||||
pos2Exclusions,
|
||||
});
|
||||
}
|
||||
|
||||
const prioritizedNameMatch = nameMatchEnabled && token.isNameMatch === true;
|
||||
const isKnown = nPlusOneEnabled
|
||||
? computeTokenKnownStatus(token, deps.isKnownWord, deps.knownWordMatchMode)
|
||||
|
||||
341
src/core/services/tokenizer/subtitle-annotation-filter.ts
Normal file
341
src/core/services/tokenizer/subtitle-annotation-filter.ts
Normal file
@@ -0,0 +1,341 @@
|
||||
import {
|
||||
DEFAULT_ANNOTATION_POS1_EXCLUSION_CONFIG,
|
||||
resolveAnnotationPos1ExclusionSet,
|
||||
} from '../../../token-pos1-exclusions';
|
||||
import {
|
||||
DEFAULT_ANNOTATION_POS2_EXCLUSION_CONFIG,
|
||||
resolveAnnotationPos2ExclusionSet,
|
||||
} from '../../../token-pos2-exclusions';
|
||||
import { MergedToken, PartOfSpeech } from '../../../types';
|
||||
import { shouldIgnoreJlptByTerm } from '../jlpt-token-filter';
|
||||
|
||||
// Katakana -> hiragana folding: code points within [START, END] are shifted down
// by OFFSET (0x30a1 'ァ' - 0x60 = 0x3041 'ぁ').
const KATAKANA_TO_HIRAGANA_OFFSET = 0x60;
const KATAKANA_CODEPOINT_START = 0x30a1;
const KATAKANA_CODEPOINT_END = 0x30f6;

// Exact kana terms that are never annotated (matched against surface, reading and
// headword, both as written and after katakana folding).
const SUBTITLE_ANNOTATION_EXCLUDED_TERMS = new Set([
  'ああ',
  'ええ',
  'うう',
  'おお',
  'はあ',
  'はは',
  'へえ',
  'ふう',
  'ほう',
]);

// Building blocks for the excluded "explanatory ending" strings. Every
// prefix + core + trailing-particle combination is generated below.
const SUBTITLE_ANNOTATION_EXCLUDED_EXPLANATORY_ENDING_PREFIXES = [
  'ん',
  'の',
  'なん',
  'なの',
] as const;
const SUBTITLE_ANNOTATION_EXCLUDED_EXPLANATORY_ENDING_CORES = [
  'だ',
  'です',
  'でした',
  'だった',
  'では',
  'じゃ',
  'でしょう',
  'だろう',
] as const;
// The empty string yields the bare prefix + core form.
const SUBTITLE_ANNOTATION_EXCLUDED_EXPLANATORY_ENDING_TRAILING_PARTICLES = [
  '',
  'か',
  'ね',
  'よ',
  'な',
  'よね',
  'かな',
  'かね',
] as const;

// Cartesian product of the three lists above, e.g. 'んだ', 'のですか', 'なんだろうね'.
const SUBTITLE_ANNOTATION_EXCLUDED_EXPLANATORY_ENDINGS = new Set(
  SUBTITLE_ANNOTATION_EXCLUDED_EXPLANATORY_ENDING_PREFIXES.flatMap((prefix) =>
    SUBTITLE_ANNOTATION_EXCLUDED_EXPLANATORY_ENDING_CORES.flatMap((core) =>
      SUBTITLE_ANNOTATION_EXCLUDED_EXPLANATORY_ENDING_TRAILING_PARTICLES.map(
        (particle) => `${prefix}${core}${particle}`,
      ),
    ),
  ),
);

// Suffixes that, when they are exactly what follows the headword inside a merged
// token's surface, mark the token as a "headword + trailing particle" merge.
const SUBTITLE_ANNOTATION_EXCLUDED_TRAILING_PARTICLE_SUFFIXES = new Set([
  'って',
  'ってよ',
  'ってね',
  'ってな',
  'ってさ',
  'ってか',
  'ってば',
]);

// pos1 values every component must have for the auxiliary-stem grammar-tail check.
const AUXILIARY_STEM_GRAMMAR_TAIL_POS1 = new Set(['名詞', '助動詞', '助詞']);
|
||||
|
||||
/**
 * Optional overrides for the part-of-speech exclusion sets used by the subtitle
 * annotation filter. When a set is omitted, the default configuration for that
 * level is resolved instead.
 */
export interface SubtitleAnnotationFilterOptions {
  /** pos1 tags whose tokens are excluded from annotations. */
  pos1Exclusions?: ReadonlySet<string>;
  /** pos2 tags whose tokens are excluded from annotations. */
  pos2Exclusions?: ReadonlySet<string>;
}
|
||||
|
||||
/**
 * Normalizes a part-of-speech tag for comparison.
 *
 * @param pos - Raw tag, possibly `undefined`.
 * @returns The tag with surrounding whitespace removed, or `''` when `pos` is not a string.
 */
function normalizePosTag(pos: string | undefined): string {
  return typeof pos === 'string' ? pos.trim() : '';
}
|
||||
|
||||
/**
 * Splits a pipe-joined tag (e.g. `'名詞|助詞'`) into its component tags.
 *
 * @param normalizedTag - Tag string; an empty string yields no parts.
 * @returns The trimmed, non-empty components in their original order.
 */
function splitNormalizedTagParts(normalizedTag: string): string[] {
  if (!normalizedTag) {
    return [];
  }

  return normalizedTag
    .split('|')
    .map((part) => part.trim())
    .filter((part) => part.length > 0);
}
|
||||
|
||||
/**
 * Checks whether every component of a (possibly pipe-joined) tag is excluded.
 *
 * @param normalizedTag - Tag string as produced by `normalizePosTag`.
 * @param exclusions - Set of excluded tag values.
 * @returns `true` only when the tag has at least one component and all of its
 *   components are in `exclusions`; a tag with no components is never excluded.
 */
function isExcludedByTagSet(normalizedTag: string, exclusions: ReadonlySet<string>): boolean {
  const parts = splitNormalizedTagParts(normalizedTag);
  if (parts.length === 0) {
    return false;
  }

  return parts.every((part) => exclusions.has(part));
}
|
||||
|
||||
/**
 * Picks the pos1 exclusion set to use.
 *
 * @param options - Filter options; `pos1Exclusions` takes precedence when present.
 * @returns The caller-supplied set, or the set resolved from the default pos1
 *   exclusion configuration.
 */
function resolvePos1Exclusions(
  options: SubtitleAnnotationFilterOptions = {},
): ReadonlySet<string> {
  return (
    options.pos1Exclusions ??
    resolveAnnotationPos1ExclusionSet(DEFAULT_ANNOTATION_POS1_EXCLUSION_CONFIG)
  );
}
|
||||
|
||||
/**
 * Picks the pos2 exclusion set to use.
 *
 * @param options - Filter options; `pos2Exclusions` takes precedence when present.
 * @returns The caller-supplied set, or the set resolved from the default pos2
 *   exclusion configuration.
 */
function resolvePos2Exclusions(
  options: SubtitleAnnotationFilterOptions = {},
): ReadonlySet<string> {
  return (
    options.pos2Exclusions ??
    resolveAnnotationPos2ExclusionSet(DEFAULT_ANNOTATION_POS2_EXCLUSION_CONFIG)
  );
}
|
||||
|
||||
/**
 * Trims `text` and folds katakana to hiragana.
 *
 * Code points in [KATAKANA_CODEPOINT_START, KATAKANA_CODEPOINT_END] are shifted
 * down by KATAKANA_TO_HIRAGANA_OFFSET; every other character is copied unchanged.
 *
 * @param text - Input string.
 * @returns The folded string, or `''` when `text` is empty or whitespace only.
 */
function normalizeKana(text: string): string {
  const raw = text.trim();
  if (!raw) {
    return '';
  }

  let normalized = '';
  // for...of iterates by code point, so surrogate pairs stay intact.
  for (const char of raw) {
    const code = char.codePointAt(0);
    if (code === undefined) {
      continue;
    }

    const isFoldableKatakana =
      code >= KATAKANA_CODEPOINT_START && code <= KATAKANA_CODEPOINT_END;
    normalized += isFoldableKatakana
      ? String.fromCodePoint(code - KATAKANA_TO_HIRAGANA_OFFSET)
      : char;
  }

  return normalized;
}
|
||||
|
||||
/**
 * Tests whether the first code point of `char` is treated as kana.
 *
 * Accepted code points: U+3041–U+3096, U+309B–U+309F, U+30A0–U+30FA, U+30FC
 * and U+30FD–U+30FF. U+30FB (the katakana middle dot) is the one code point in
 * the U+30A0–U+30FF range that is rejected.
 *
 * @param char - String whose first code point is inspected.
 * @returns `true` when that code point is in one of the accepted ranges; `false`
 *   otherwise, including for the empty string.
 */
function isKanaChar(char: string): boolean {
  const code = char.codePointAt(0);
  if (code === undefined) {
    return false;
  }

  return (
    (code >= 0x3041 && code <= 0x3096) ||
    (code >= 0x309b && code <= 0x309f) ||
    code === 0x30fc ||
    (code >= 0x30a0 && code <= 0x30fa) ||
    (code >= 0x30fd && code <= 0x30ff)
  );
}
|
||||
|
||||
/**
 * Detects short all-kana strings that end in a small 'っ'.
 *
 * The text is katakana-folded first, so a trailing 'ッ' matches as well.
 *
 * @param text - Candidate text.
 * @returns `true` when the folded text is 2–4 characters long, consists only of
 *   kana characters, and its last character is 'っ'.
 */
function isTrailingSmallTsuKanaSfx(text: string): boolean {
  const normalized = normalizeKana(text);
  if (!normalized) {
    return false;
  }

  const chars = [...normalized];
  const hasSfxLength = chars.length >= 2 && chars.length <= 4;
  if (!hasSfxLength || !chars.every(isKanaChar)) {
    return false;
  }

  return chars[chars.length - 1] === 'っ';
}
|
||||
|
||||
/**
 * Detects all-kana strings made of the same half repeated twice (e.g. 'どきどき').
 *
 * The text is katakana-folded before the comparison.
 *
 * @param text - Candidate text.
 * @returns `true` when the folded text has an even length of at least 4
 *   characters, is entirely kana, and its first half equals its second half.
 */
function isReduplicatedKanaSfx(text: string): boolean {
  const normalized = normalizeKana(text);
  if (!normalized) {
    return false;
  }

  const chars = [...normalized];
  if (chars.length < 4 || chars.length % 2 !== 0) {
    return false;
  }

  if (!chars.every(isKanaChar)) {
    return false;
  }

  const half = chars.length / 2;
  const firstHalf = chars.slice(0, half).join('');
  const secondHalf = chars.slice(half).join('');
  return firstHalf === secondHalf;
}
|
||||
|
||||
/**
 * Like `isReduplicatedKanaSfx`, but also accepts the reduplicated form followed
 * by a single trailing 'と'.
 *
 * @param text - Candidate text (katakana is folded before matching).
 * @returns `true` when the folded text is a reduplicated kana string, or becomes
 *   one after removing a final 'と'.
 */
function isReduplicatedKanaSfxWithOptionalTrailingTo(text: string): boolean {
  const normalized = normalizeKana(text);
  if (!normalized) {
    return false;
  }

  if (isReduplicatedKanaSfx(normalized)) {
    return true;
  }

  // Retry without the trailing 'と' only when something would be left over.
  if (normalized.length <= 1 || !normalized.endsWith('と')) {
    return false;
  }

  return isReduplicatedKanaSfx(normalized.slice(0, -1));
}
|
||||
|
||||
/**
 * Detects merged tokens of the form "headword + excluded trailing particle
 * suffix" (for example a surface that is the headword followed by 'って').
 *
 * All of the following must hold:
 * - the folded surface starts with the folded headword, and the remainder is one
 *   of SUBTITLE_ANNOTATION_EXCLUDED_TRAILING_PARTICLE_SUFFIXES;
 * - pos1 has at least two pipe-separated components;
 * - the first pos1 component is not itself in the pos1 exclusion set;
 * - every remaining pos1 component is '助詞'.
 *
 * @param token - Token to inspect.
 * @returns `true` when the token matches the pattern above.
 */
function isExcludedTrailingParticleMergedToken(token: MergedToken): boolean {
  const normalizedSurface = normalizeKana(token.surface);
  const normalizedHeadword = normalizeKana(token.headword);
  if (
    !normalizedSurface ||
    !normalizedHeadword ||
    !normalizedSurface.startsWith(normalizedHeadword)
  ) {
    return false;
  }

  const suffix = normalizedSurface.slice(normalizedHeadword.length);
  if (!SUBTITLE_ANNOTATION_EXCLUDED_TRAILING_PARTICLE_SUFFIXES.has(suffix)) {
    return false;
  }

  const pos1Parts = splitNormalizedTagParts(normalizePosTag(token.pos1));
  if (pos1Parts.length < 2) {
    return false;
  }

  const [leadingPos1, ...trailingPos1] = pos1Parts;
  // NOTE(review): this consults the default pos1 exclusion set, not any
  // caller-supplied `pos1Exclusions` option — confirm that is intended.
  if (!leadingPos1 || resolvePos1Exclusions().has(leadingPos1)) {
    return false;
  }

  return trailingPos1.length > 0 && trailingPos1.every((part) => part === '助詞');
}
|
||||
|
||||
/**
 * Detects tokens whose pos1 components all belong to
 * AUXILIARY_STEM_GRAMMAR_TAIL_POS1 and whose pos3 contains '助動詞語幹'.
 *
 * @param token - Token to inspect.
 * @returns `true` when pos1 has at least one component, every pos1 component is
 *   in the allowed set, and one of the pos3 components is '助動詞語幹'.
 */
function isAuxiliaryStemGrammarTailToken(token: MergedToken): boolean {
  const pos1Parts = splitNormalizedTagParts(normalizePosTag(token.pos1));
  const isGrammarOnlyPos1 =
    pos1Parts.length > 0 && pos1Parts.every((part) => AUXILIARY_STEM_GRAMMAR_TAIL_POS1.has(part));
  if (!isGrammarOnlyPos1) {
    return false;
  }

  const pos3Parts = splitNormalizedTagParts(normalizePosTag(token.pos3));
  return pos3Parts.includes('助動詞語幹');
}
|
||||
|
||||
/**
 * Checks the token's surface, reading and headword against the term-based
 * exclusion rules.
 *
 * A candidate string excludes the token when, either as written (trimmed) or
 * after katakana folding, it is in SUBTITLE_ANNOTATION_EXCLUDED_TERMS, is in
 * SUBTITLE_ANNOTATION_EXCLUDED_EXPLANATORY_ENDINGS, or is rejected by
 * `shouldIgnoreJlptByTerm`; or when it matches one of the kana SFX shapes
 * (trailing small 'っ', reduplication with optional trailing 'と').
 *
 * @param token - Token to inspect.
 * @returns `true` as soon as any candidate matches; `false` otherwise.
 */
function isExcludedByTerm(token: MergedToken): boolean {
  const candidates = [token.surface, token.reading, token.headword].filter(
    (candidate): candidate is string => typeof candidate === 'string' && candidate.length > 0,
  );

  for (const candidate of candidates) {
    const trimmed = candidate.trim();
    if (!trimmed) {
      continue;
    }

    const normalized = normalizeKana(trimmed);
    if (!normalized) {
      continue;
    }

    if (
      SUBTITLE_ANNOTATION_EXCLUDED_TERMS.has(trimmed) ||
      SUBTITLE_ANNOTATION_EXCLUDED_TERMS.has(normalized) ||
      SUBTITLE_ANNOTATION_EXCLUDED_EXPLANATORY_ENDINGS.has(trimmed) ||
      SUBTITLE_ANNOTATION_EXCLUDED_EXPLANATORY_ENDINGS.has(normalized) ||
      shouldIgnoreJlptByTerm(trimmed) ||
      shouldIgnoreJlptByTerm(normalized)
    ) {
      return true;
    }

    // The SFX helpers fold katakana themselves and folding is idempotent, so
    // checking `trimmed` already covers the `normalized` form.
    if (
      isTrailingSmallTsuKanaSfx(trimmed) ||
      isReduplicatedKanaSfxWithOptionalTrailingTo(trimmed)
    ) {
      return true;
    }
  }

  return false;
}
|
||||
|
||||
/**
 * Decides whether a token should receive no subtitle annotations.
 *
 * Checks run in this order and the first match wins:
 * 1. every pos1 component is in the pos1 exclusion set;
 * 2. every pos2 component is in the pos2 exclusion set;
 * 3. both pos1 and pos2 are empty and `partOfSpeech` is particle,
 *    bound auxiliary or symbol;
 * 4. the token is an auxiliary-stem grammar tail;
 * 5. the token is a "headword + trailing particle" merge;
 * 6. one of the term-based rules matches.
 *
 * @param token - Token to evaluate.
 * @param options - Optional exclusion-set overrides; defaults are resolved for
 *   any set that is omitted.
 * @returns `true` when the token should be excluded from annotations.
 */
export function shouldExcludeTokenFromSubtitleAnnotations(
  token: MergedToken,
  options: SubtitleAnnotationFilterOptions = {},
): boolean {
  const pos1Exclusions = resolvePos1Exclusions(options);
  const pos2Exclusions = resolvePos2Exclusions(options);
  const normalizedPos1 = normalizePosTag(token.pos1);
  const normalizedPos2 = normalizePosTag(token.pos2);

  if (isExcludedByTagSet(normalizedPos1, pos1Exclusions)) {
    return true;
  }

  if (isExcludedByTagSet(normalizedPos2, pos2Exclusions)) {
    return true;
  }

  // Without any pos1/pos2 tags, fall back to the coarse part-of-speech value.
  const hasNoPosTags = normalizedPos1.length === 0 && normalizedPos2.length === 0;
  const isGrammarPartOfSpeech =
    token.partOfSpeech === PartOfSpeech.particle ||
    token.partOfSpeech === PartOfSpeech.bound_auxiliary ||
    token.partOfSpeech === PartOfSpeech.symbol;
  if (hasNoPosTags && isGrammarPartOfSpeech) {
    return true;
  }

  if (isAuxiliaryStemGrammarTailToken(token)) {
    return true;
  }

  if (isExcludedTrailingParticleMergedToken(token)) {
    return true;
  }

  return isExcludedByTerm(token);
}
|
||||
|
||||
/**
 * Clears annotation metadata on tokens that are excluded from subtitle
 * annotations.
 *
 * @param token - Token to evaluate.
 * @param options - Optional exclusion-set overrides forwarded to
 *   `shouldExcludeTokenFromSubtitleAnnotations`.
 * @returns The same `token` object when it is not excluded; otherwise a shallow
 *   copy with `isKnown`, `isNPlusOneTarget` and `isNameMatch` set to `false` and
 *   `jlptLevel` and `frequencyRank` set to `undefined`. All other fields are kept.
 */
export function stripSubtitleAnnotationMetadata(
  token: MergedToken,
  options: SubtitleAnnotationFilterOptions = {},
): MergedToken {
  if (!shouldExcludeTokenFromSubtitleAnnotations(token, options)) {
    return token;
  }

  return {
    ...token,
    isKnown: false,
    isNPlusOneTarget: false,
    isNameMatch: false,
    jlptLevel: undefined,
    frequencyRank: undefined,
  };
}
|
||||
@@ -19,6 +19,7 @@
|
||||
import { PartOfSpeech, Token, MergedToken } from './types';
|
||||
import { DEFAULT_ANNOTATION_POS1_EXCLUSION_CONFIG } from './token-pos1-exclusions';
|
||||
import { DEFAULT_ANNOTATION_POS2_EXCLUSION_CONFIG } from './token-pos2-exclusions';
|
||||
import { shouldExcludeTokenFromSubtitleAnnotations } from './core/services/tokenizer/subtitle-annotation-filter';
|
||||
|
||||
export function isNoun(tok: Token): boolean {
|
||||
return tok.partOfSpeech === PartOfSpeech.noun;
|
||||
@@ -297,6 +298,10 @@ function isNPlusOneWordCountToken(
|
||||
pos1Exclusions: ReadonlySet<string> = N_PLUS_ONE_IGNORED_POS1,
|
||||
pos2Exclusions: ReadonlySet<string> = N_PLUS_ONE_IGNORED_POS2,
|
||||
): boolean {
|
||||
if (shouldExcludeTokenFromSubtitleAnnotations(token, { pos1Exclusions, pos2Exclusions })) {
|
||||
return false;
|
||||
}
|
||||
|
||||
const normalizedPos1 = normalizePos1Tag(token.pos1);
|
||||
const hasPos1 = normalizedPos1.length > 0;
|
||||
if (isExcludedByTagSet(normalizedPos1, pos1Exclusions)) {
|
||||
|
||||
Reference in New Issue
Block a user