fix(subtitle): unify annotation token filtering

This commit is contained in:
2026-03-19 23:48:38 -07:00
parent 4a01cebca6
commit 42028d0a4d
5 changed files with 527 additions and 29 deletions

View File

@@ -3628,6 +3628,119 @@ test('tokenizeSubtitle excludes merged function/content token from frequency hig
assert.equal(result.tokens?.[0]?.isNPlusOneTarget, true);
});
test('tokenizeSubtitle clears all annotations for kana-only demonstrative helper merges', async () => {
  // Factory for the un-merged, un-annotated tokens handed back by the MeCab stub.
  // Every MeCab token in this scenario has a headword identical to its surface.
  const mecabToken = (
    text: string,
    reading: string,
    startPos: number,
    endPos: number,
    partOfSpeech: PartOfSpeech,
    pos1: string,
    pos2: string,
  ) => ({
    headword: text,
    surface: text,
    reading,
    startPos,
    endPos,
    partOfSpeech,
    pos1,
    pos2,
    isMerged: false,
    isKnown: false,
    isNPlusOneTarget: false,
  });

  const deps = makeDepsFromYomitanTokens(
    [
      { surface: 'これで', reading: 'これで', headword: 'これ' },
      { surface: '実力どおり', reading: 'じつりょくどおり', headword: '実力どおり' },
      { surface: 'か', reading: 'か', headword: 'か' },
    ],
    {
      getFrequencyDictionaryEnabled: () => true,
      getFrequencyRank: (text) => {
        switch (text) {
          case 'これ':
            return 9;
          case '実力どおり':
            return 2500;
          case 'か':
            return 800;
          default:
            return null;
        }
      },
      getJlptLevel: (text) => {
        switch (text) {
          case 'これ':
            return 'N5';
          case '実力どおり':
            return 'N1';
          case 'か':
            return 'N5';
          default:
            return null;
        }
      },
      isKnownWord: (text) => text === 'これ',
      getMinSentenceWordsForNPlusOne: () => 1,
      tokenizeWithMecab: async () => [
        mecabToken('これ', 'コレ', 0, 2, PartOfSpeech.noun, '名詞', '代名詞'),
        mecabToken('で', 'デ', 2, 3, PartOfSpeech.particle, '助詞', '格助詞'),
        mecabToken('実力どおり', 'ジツリョクドオリ', 3, 8, PartOfSpeech.noun, '名詞', '一般'),
        mecabToken('か', 'カ', 8, 9, PartOfSpeech.particle, '助詞', '終助詞'),
      ],
    },
  );

  const result = await tokenizeSubtitle('これで実力どおりか', deps);

  // Project each token down to the annotation-related fields under test.
  const annotationSummary = result.tokens?.map((token) => ({
    surface: token.surface,
    headword: token.headword,
    isKnown: token.isKnown,
    isNPlusOneTarget: token.isNPlusOneTarget,
    frequencyRank: token.frequencyRank,
    jlptLevel: token.jlptLevel,
  }));

  // Shape of a token whose annotations were all cleared.
  const clearedAnnotations = {
    isKnown: false,
    isNPlusOneTarget: false,
    frequencyRank: undefined,
    jlptLevel: undefined,
  };

  // The merged 'これで' and the particle 'か' carry no annotations even though the
  // stubs know 'これ' and rank both; only the content word keeps its data.
  const expectedSummary = [
    { surface: 'これで', headword: 'これ', ...clearedAnnotations },
    {
      surface: '実力どおり',
      headword: '実力どおり',
      isKnown: false,
      isNPlusOneTarget: true,
      frequencyRank: 2500,
      jlptLevel: 'N1',
    },
    { surface: 'か', headword: 'か', ...clearedAnnotations },
  ];

  assert.deepEqual(annotationSummary, expectedSummary);
});
test('tokenizeSubtitle keeps frequency for content-led merged token with trailing colloquial suffixes', async () => {
const result = await tokenizeSubtitle(
'張り切ってんじゃ',

View File

@@ -316,6 +316,19 @@ test('shouldExcludeTokenFromSubtitleAnnotations excludes merged lexical tokens w
assert.equal(shouldExcludeTokenFromSubtitleAnnotations(token), true);
});
test('shouldExcludeTokenFromSubtitleAnnotations excludes kana-only demonstrative helper merges', () => {
  // Merged token: pronoun 'これ' plus case particle 'で', with pipe-joined POS tags.
  const demonstrativeWithParticle = makeToken({
    surface: 'これで',
    headword: 'これ',
    reading: 'コレデ',
    partOfSpeech: PartOfSpeech.noun,
    pos1: '名詞|助詞',
    pos2: '代名詞|格助詞',
  });

  const isExcluded = shouldExcludeTokenFromSubtitleAnnotations(demonstrativeWithParticle);

  assert.equal(isExcluded, true);
});
test('stripSubtitleAnnotationMetadata keeps token hover data while clearing annotation fields', () => {
const token = makeToken({
surface: 'は',
@@ -481,8 +494,8 @@ test('annotateTokens N+1 minimum sentence words counts only eligible word tokens
);
assert.equal(result[0]?.isKnown, false);
assert.equal(result[1]?.isKnown, true);
assert.equal(result[2]?.isKnown, true);
assert.equal(result[1]?.isKnown, false);
assert.equal(result[2]?.isKnown, false);
assert.equal(result[0]?.isNPlusOneTarget, false);
});
@@ -568,7 +581,7 @@ test('annotateTokens excludes default non-independent pos2 from frequency and N+
assert.equal(result[0]?.isNPlusOneTarget, false);
});
test('annotateTokens keeps frequency for kanji noun tokens even when mecab marks them non-independent', () => {
test('annotateTokens clears all annotations for non-independent kanji noun tokens under unified gate', () => {
const tokens = [
makeToken({
surface: '者',
@@ -588,7 +601,10 @@ test('annotateTokens keeps frequency for kanji noun tokens even when mecab marks
minSentenceWordsForNPlusOne: 1,
});
assert.equal(result[0]?.frequencyRank, 475);
assert.equal(result[0]?.isKnown, false);
assert.equal(result[0]?.isNPlusOneTarget, false);
assert.equal(result[0]?.frequencyRank, undefined);
assert.equal(result[0]?.jlptLevel, undefined);
});
test('annotateTokens excludes likely kana SFX tokens from frequency when POS tags are missing', () => {
@@ -742,3 +758,33 @@ test('annotateTokens excludes composite tokens when all component pos tags are e
assert.equal(result[0]?.frequencyRank, undefined);
assert.equal(result[0]?.isNPlusOneTarget, false);
});
test('annotateTokens applies one shared exclusion gate across known N+1 frequency and JLPT', () => {
  // Merged pronoun + particle token that arrives with a frequency rank already set.
  const mergedDemonstrative = makeToken({
    surface: 'これで',
    headword: 'これ',
    reading: 'コレデ',
    partOfSpeech: PartOfSpeech.noun,
    pos1: '名詞|助詞',
    pos2: '代名詞|格助詞',
    startPos: 0,
    endPos: 3,
    frequencyRank: 9,
  });

  // The deps would mark the headword as known and N5 if the token were not excluded.
  const deps = makeDeps({
    isKnownWord: (text) => text === 'これ',
    getJlptLevel: (text) => (text === 'これ' ? 'N5' : null),
  });

  const [annotated] = annotateTokens([mergedDemonstrative], deps, {
    minSentenceWordsForNPlusOne: 1,
  });

  // Every annotation channel must be cleared together.
  assert.equal(annotated?.isKnown, false);
  assert.equal(annotated?.isNPlusOneTarget, false);
  assert.equal(annotated?.frequencyRank, undefined);
  assert.equal(annotated?.jlptLevel, undefined);
});

View File

@@ -9,6 +9,10 @@ import {
} from '../../../token-pos2-exclusions';
import { JlptLevel, MergedToken, NPlusOneMatchMode, PartOfSpeech } from '../../../types';
import { shouldIgnoreJlptByTerm, shouldIgnoreJlptForMecabPos1 } from '../jlpt-token-filter';
import {
shouldExcludeTokenFromSubtitleAnnotations as sharedShouldExcludeTokenFromSubtitleAnnotations,
stripSubtitleAnnotationMetadata as sharedStripSubtitleAnnotationMetadata,
} from './subtitle-annotation-filter';
const KATAKANA_TO_HIRAGANA_OFFSET = 0x60;
const KATAKANA_CODEPOINT_START = 0x30a1;
@@ -633,34 +637,11 @@ function isExcludedFromSubtitleAnnotationsByTerm(token: MergedToken): boolean {
}
/**
 * Reports whether a token must be left out of all subtitle annotations.
 *
 * Thin compatibility wrapper: the decision is made entirely by the shared gate
 * imported from `./subtitle-annotation-filter`, so this export cannot drift
 * from the filter used elsewhere in this module. The shared gate is called
 * without options, which means it falls back to its default POS exclusion sets.
 *
 * @param token - The merged token to classify.
 * @returns `true` when the shared gate excludes the token.
 */
export function shouldExcludeTokenFromSubtitleAnnotations(token: MergedToken): boolean {
  return sharedShouldExcludeTokenFromSubtitleAnnotations(token);
}
/**
 * Returns the token with its annotation metadata cleared when it is excluded
 * from subtitle annotations, and the token unchanged otherwise.
 *
 * Thin compatibility wrapper: both the exclusion decision and the stripping are
 * performed by the shared implementation imported from
 * `./subtitle-annotation-filter`. It is called without options, so the shared
 * default POS exclusion sets apply.
 *
 * @param token - The merged token to sanitize.
 * @returns Whatever the shared implementation returns for this token.
 */
export function stripSubtitleAnnotationMetadata(token: MergedToken): MergedToken {
  return sharedStripSubtitleAnnotationMetadata(token);
}
function computeTokenKnownStatus(
@@ -737,6 +718,18 @@ export function annotateTokens(
// Single pass: compute known word status, frequency filtering, and JLPT level together
const annotated = tokens.map((token) => {
if (
sharedShouldExcludeTokenFromSubtitleAnnotations(token, {
pos1Exclusions,
pos2Exclusions,
})
) {
return sharedStripSubtitleAnnotationMetadata(token, {
pos1Exclusions,
pos2Exclusions,
});
}
const prioritizedNameMatch = nameMatchEnabled && token.isNameMatch === true;
const isKnown = nPlusOneEnabled
? computeTokenKnownStatus(token, deps.isKnownWord, deps.knownWordMatchMode)

View File

@@ -0,0 +1,341 @@
import {
DEFAULT_ANNOTATION_POS1_EXCLUSION_CONFIG,
resolveAnnotationPos1ExclusionSet,
} from '../../../token-pos1-exclusions';
import {
DEFAULT_ANNOTATION_POS2_EXCLUSION_CONFIG,
resolveAnnotationPos2ExclusionSet,
} from '../../../token-pos2-exclusions';
import { MergedToken, PartOfSpeech } from '../../../types';
import { shouldIgnoreJlptByTerm } from '../jlpt-token-filter';
// Katakana folding parameters: a code point inside [START, END] is mapped to
// hiragana by subtracting the offset.
const KATAKANA_TO_HIRAGANA_OFFSET = 0x60;
const KATAKANA_CODEPOINT_START = 0x30a1;
const KATAKANA_CODEPOINT_END = 0x30f6;

// Exact terms that are never annotated.
const SUBTITLE_ANNOTATION_EXCLUDED_TERMS = new Set([
  'ああ', 'ええ', 'うう', 'おお', 'はあ', 'はは', 'へえ', 'ふう', 'ほう',
]);

// Building blocks for the excluded explanatory endings: every
// prefix + core + trailing-particle concatenation is excluded.
const SUBTITLE_ANNOTATION_EXCLUDED_EXPLANATORY_ENDING_PREFIXES = ['ん', 'の', 'なん', 'なの'];
const SUBTITLE_ANNOTATION_EXCLUDED_EXPLANATORY_ENDING_CORES = [
  'だ', 'です', 'でした', 'だった', 'では', 'じゃ', 'でしょう', 'だろう',
] as const;
const SUBTITLE_ANNOTATION_EXCLUDED_EXPLANATORY_ENDING_TRAILING_PARTICLES = [
  '', 'か', 'ね', 'よ', 'な', 'よね', 'かな', 'かね',
] as const;

// Cartesian product of the three lists above, inserted prefix-major, then core,
// then trailing particle.
const SUBTITLE_ANNOTATION_EXCLUDED_EXPLANATORY_ENDINGS = new Set<string>();
for (const prefix of SUBTITLE_ANNOTATION_EXCLUDED_EXPLANATORY_ENDING_PREFIXES) {
  for (const core of SUBTITLE_ANNOTATION_EXCLUDED_EXPLANATORY_ENDING_CORES) {
    for (const particle of SUBTITLE_ANNOTATION_EXCLUDED_EXPLANATORY_ENDING_TRAILING_PARTICLES) {
      SUBTITLE_ANNOTATION_EXCLUDED_EXPLANATORY_ENDINGS.add(`${prefix}${core}${particle}`);
    }
  }
}

// Suffixes that, when left over after the headword of a merged token, mark it
// as a trailing-particle merge.
const SUBTITLE_ANNOTATION_EXCLUDED_TRAILING_PARTICLE_SUFFIXES = new Set([
  'って', 'ってよ', 'ってね', 'ってな', 'ってさ', 'ってか', 'ってば',
]);

// POS1 tags allowed in a token that is treated as an auxiliary-stem grammar tail.
const AUXILIARY_STEM_GRAMMAR_TAIL_POS1 = new Set(['名詞', '助動詞', '助詞']);
/**
 * Optional overrides for the part-of-speech exclusion sets consulted by the
 * subtitle annotation filter. Any set left undefined is replaced by the one
 * resolved from the corresponding default exclusion config.
 */
export interface SubtitleAnnotationFilterOptions {
/** POS1 tags to exclude; when omitted, the default POS1 exclusion set is resolved instead. */
pos1Exclusions?: ReadonlySet<string>;
/** POS2 tags to exclude; when omitted, the default POS2 exclusion set is resolved instead. */
pos2Exclusions?: ReadonlySet<string>;
}
/**
 * Normalizes a raw part-of-speech tag.
 *
 * @param pos - The raw tag, possibly missing.
 * @returns The tag with surrounding whitespace removed, or `''` when it is not a string.
 */
function normalizePosTag(pos: string | undefined): string {
  if (typeof pos !== 'string') {
    return '';
  }
  return pos.trim();
}
/**
 * Splits a pipe-joined tag (as carried by merged tokens) into its components.
 *
 * @param normalizedTag - A tag such as `'名詞|助詞'`.
 * @returns The trimmed, non-empty components in their original order; an empty
 *   array when the tag is empty.
 */
function splitNormalizedTagParts(normalizedTag: string): string[] {
  const parts: string[] = [];
  if (!normalizedTag) {
    return parts;
  }
  for (const rawPart of normalizedTag.split('|')) {
    const part = rawPart.trim();
    if (part.length > 0) {
      parts.push(part);
    }
  }
  return parts;
}
/**
 * Checks whether a (possibly pipe-joined) tag is fully covered by an exclusion set.
 *
 * @param normalizedTag - The tag to test.
 * @param exclusions - The excluded tag values.
 * @returns `true` only when the tag has at least one component and every
 *   component is in `exclusions`.
 */
function isExcludedByTagSet(normalizedTag: string, exclusions: ReadonlySet<string>): boolean {
  const tagParts = splitNormalizedTagParts(normalizedTag);
  // `every` is vacuously true for an empty list, so require at least one part.
  return tagParts.length > 0 && tagParts.every((tagPart) => exclusions.has(tagPart));
}
/**
 * Picks the POS1 exclusion set to use.
 *
 * @param options - Filter options; may carry a caller-supplied POS1 set.
 * @returns The caller's set when provided, otherwise the set resolved from the
 *   default POS1 exclusion config.
 */
function resolvePos1Exclusions(
  options: SubtitleAnnotationFilterOptions = {},
): ReadonlySet<string> {
  const { pos1Exclusions } = options;
  return pos1Exclusions
    ? pos1Exclusions
    : resolveAnnotationPos1ExclusionSet(DEFAULT_ANNOTATION_POS1_EXCLUSION_CONFIG);
}
/**
 * Picks the POS2 exclusion set to use.
 *
 * @param options - Filter options; may carry a caller-supplied POS2 set.
 * @returns The caller's set when provided, otherwise the set resolved from the
 *   default POS2 exclusion config.
 */
function resolvePos2Exclusions(
  options: SubtitleAnnotationFilterOptions = {},
): ReadonlySet<string> {
  const { pos2Exclusions } = options;
  return pos2Exclusions
    ? pos2Exclusions
    : resolveAnnotationPos2ExclusionSet(DEFAULT_ANNOTATION_POS2_EXCLUSION_CONFIG);
}
/**
 * Trims the text and folds katakana into hiragana.
 *
 * Code points within [KATAKANA_CODEPOINT_START, KATAKANA_CODEPOINT_END] are
 * shifted down by KATAKANA_TO_HIRAGANA_OFFSET; every other character is kept
 * as-is.
 *
 * @param text - The text to normalize.
 * @returns The folded text, or `''` when the input is empty after trimming.
 */
function normalizeKana(text: string): string {
  const trimmed = text.trim();
  if (!trimmed) {
    return '';
  }

  // Array.from walks the string by code point, just like for...of.
  return Array.from(trimmed, (char) => {
    const codePoint = char.codePointAt(0);
    if (codePoint === undefined) {
      return '';
    }
    const isFoldableKatakana =
      codePoint >= KATAKANA_CODEPOINT_START && codePoint <= KATAKANA_CODEPOINT_END;
    return isFoldableKatakana
      ? String.fromCodePoint(codePoint - KATAKANA_TO_HIRAGANA_OFFSET)
      : char;
  }).join('');
}
/**
 * Tells whether the first code point of `char` counts as kana for this filter.
 *
 * @param char - A single-character string; only its first code point is examined.
 * @returns `true` for the accepted hiragana/katakana ranges, `false` otherwise
 *   (including for an empty string).
 */
function isKanaChar(char: string): boolean {
  const codePoint = char.codePointAt(0);
  if (codePoint === undefined) {
    return false;
  }

  // U+3041–U+3096
  if (codePoint >= 0x3041 && codePoint <= 0x3096) {
    return true;
  }
  // U+309B–U+309F
  if (codePoint >= 0x309b && codePoint <= 0x309f) {
    return true;
  }
  // U+30FC
  if (codePoint === 0x30fc) {
    return true;
  }
  // U+30A0–U+30FA
  if (codePoint >= 0x30a0 && codePoint <= 0x30fa) {
    return true;
  }
  // U+30FD–U+30FF (U+30FB is deliberately not accepted)
  return codePoint >= 0x30fd && codePoint <= 0x30ff;
}
/**
 * Detects short kana-only strings that end in a small 'っ'.
 *
 * @param text - The candidate text; it is kana-normalized before checking.
 * @returns `true` when the normalized text is 2–4 characters long, consists
 *   solely of kana, and its last character is 'っ'.
 */
function isTrailingSmallTsuKanaSfx(text: string): boolean {
  const kana = normalizeKana(text);
  if (!kana) {
    return false;
  }

  const glyphs = Array.from(kana);
  const hasSfxLength = glyphs.length >= 2 && glyphs.length <= 4;
  if (!hasSfxLength) {
    return false;
  }

  const isAllKana = glyphs.every((glyph) => isKanaChar(glyph));
  return isAllKana && glyphs[glyphs.length - 1] === 'っ';
}
/**
 * Detects kana-only strings made of one sequence repeated twice.
 *
 * @param text - The candidate text; it is kana-normalized before checking.
 * @returns `true` when the normalized text has an even length of at least four
 *   characters, is entirely kana, and its first half equals its second half.
 */
function isReduplicatedKanaSfx(text: string): boolean {
  const kana = normalizeKana(text);
  if (!kana) {
    return false;
  }

  const glyphs = Array.from(kana);
  const glyphCount = glyphs.length;
  if (glyphCount < 4 || glyphCount % 2 !== 0) {
    return false;
  }
  if (!glyphs.every((glyph) => isKanaChar(glyph))) {
    return false;
  }

  // Compare the two halves position by position.
  const half = glyphCount / 2;
  for (let index = 0; index < half; index += 1) {
    if (glyphs[index] !== glyphs[index + half]) {
      return false;
    }
  }
  return true;
}
/**
 * Detects reduplicated kana strings, optionally followed by a single 'と'.
 *
 * @param text - The candidate text; it is kana-normalized before checking.
 * @returns `true` when the normalized text is itself reduplicated kana, or
 *   becomes so once one trailing 'と' is removed.
 */
function isReduplicatedKanaSfxWithOptionalTrailingTo(text: string): boolean {
  const kana = normalizeKana(text);
  if (!kana) {
    return false;
  }
  if (isReduplicatedKanaSfx(kana)) {
    return true;
  }

  const hasTrailingTo = kana.length > 1 && kana.endsWith('と');
  return hasTrailingTo && isReduplicatedKanaSfx(kana.slice(0, -1));
}
/**
 * Detects merged tokens whose surface is the headword followed by one of the
 * excluded trailing-particle suffixes.
 *
 * @param token - The merged token to inspect.
 * @returns `true` when all of the following hold: the kana-normalized surface
 *   starts with the kana-normalized headword; the remainder is one of
 *   SUBTITLE_ANNOTATION_EXCLUDED_TRAILING_PARTICLE_SUFFIXES; pos1 has at least
 *   two components; the first component is not in the default POS1 exclusion
 *   set; and every later component is '助詞'.
 */
function isExcludedTrailingParticleMergedToken(token: MergedToken): boolean {
  const surfaceKana = normalizeKana(token.surface);
  const headwordKana = normalizeKana(token.headword);
  if (!(surfaceKana && headwordKana && surfaceKana.startsWith(headwordKana))) {
    return false;
  }

  const trailingText = surfaceKana.slice(headwordKana.length);
  if (!SUBTITLE_ANNOTATION_EXCLUDED_TRAILING_PARTICLE_SUFFIXES.has(trailingText)) {
    return false;
  }

  const pos1Parts = splitNormalizedTagParts(normalizePosTag(token.pos1));
  if (pos1Parts.length < 2) {
    return false;
  }

  const leadingPos1 = pos1Parts[0];
  const trailingPos1 = pos1Parts.slice(1);
  // NOTE(review): this helper receives no filter options, so the leading tag is
  // always checked against the default POS1 exclusion set — confirm that is intended.
  if (!leadingPos1 || resolvePos1Exclusions().has(leadingPos1)) {
    return false;
  }

  return trailingPos1.length > 0 && trailingPos1.every((part) => part === '助詞');
}
/**
 * Detects tokens tagged as an auxiliary-verb stem.
 *
 * @param token - The merged token to inspect.
 * @returns `true` when pos1 is non-empty, every pos1 component is in
 *   AUXILIARY_STEM_GRAMMAR_TAIL_POS1, and pos3 contains the component '助動詞語幹'.
 */
function isAuxiliaryStemGrammarTailToken(token: MergedToken): boolean {
  const pos1Parts = splitNormalizedTagParts(normalizePosTag(token.pos1));
  const hasOnlyGrammarTailPos1 =
    pos1Parts.length > 0 && pos1Parts.every((part) => AUXILIARY_STEM_GRAMMAR_TAIL_POS1.has(part));
  if (!hasOnlyGrammarTailPos1) {
    return false;
  }

  return splitNormalizedTagParts(normalizePosTag(token.pos3)).includes('助動詞語幹');
}
/**
 * Checks the token's surface, reading, and headword (in that order) against the
 * term-based exclusion rules.
 *
 * Each non-empty candidate is tested both trimmed and kana-normalized against:
 * the excluded term set, the excluded explanatory endings, the JLPT term
 * ignore list, and the kana sound-effect patterns (trailing small 'っ',
 * reduplication with optional trailing 'と').
 *
 * @param token - The merged token to inspect.
 * @returns `true` as soon as any candidate matches any rule, else `false`.
 */
function isExcludedByTerm(token: MergedToken): boolean {
  for (const candidate of [token.surface, token.reading, token.headword]) {
    if (typeof candidate !== 'string' || candidate.length === 0) {
      continue;
    }

    const trimmed = candidate.trim();
    if (!trimmed) {
      continue;
    }
    const normalized = normalizeKana(trimmed);
    if (!normalized) {
      continue;
    }

    // Raw form first, then the kana-normalized form, for every rule.
    const forms = [trimmed, normalized];

    const matchesExcludedTerm =
      forms.some((form) => SUBTITLE_ANNOTATION_EXCLUDED_TERMS.has(form)) ||
      forms.some((form) => SUBTITLE_ANNOTATION_EXCLUDED_EXPLANATORY_ENDINGS.has(form)) ||
      forms.some((form) => shouldIgnoreJlptByTerm(form));
    if (matchesExcludedTerm) {
      return true;
    }

    const looksLikeKanaSfx =
      forms.some((form) => isTrailingSmallTsuKanaSfx(form)) ||
      forms.some((form) => isReduplicatedKanaSfxWithOptionalTrailingTo(form));
    if (looksLikeKanaSfx) {
      return true;
    }
  }

  return false;
}
/**
 * Shared gate deciding whether a token is excluded from subtitle annotations.
 *
 * Rules are tried in this order and the first match excludes the token:
 * 1. every pos1 component is in the POS1 exclusion set;
 * 2. every pos2 component is in the POS2 exclusion set;
 * 3. neither pos1 nor pos2 is present and `partOfSpeech` is particle,
 *    bound auxiliary, or symbol;
 * 4. the token is an auxiliary-stem grammar tail;
 * 5. the token is a headword plus an excluded trailing-particle suffix;
 * 6. the surface, reading, or headword matches a term-based exclusion.
 *
 * @param token - The merged token to classify.
 * @param options - Optional POS1/POS2 exclusion sets; defaults are resolved
 *   for any that are omitted.
 * @returns `true` when the token must not be annotated.
 */
export function shouldExcludeTokenFromSubtitleAnnotations(
  token: MergedToken,
  options: SubtitleAnnotationFilterOptions = {},
): boolean {
  const excludedPos1 = resolvePos1Exclusions(options);
  const excludedPos2 = resolvePos2Exclusions(options);
  const pos1Tag = normalizePosTag(token.pos1);
  const pos2Tag = normalizePosTag(token.pos2);

  if (isExcludedByTagSet(pos1Tag, excludedPos1) || isExcludedByTagSet(pos2Tag, excludedPos2)) {
    return true;
  }

  // Without any POS tags, fall back to the coarse part-of-speech category.
  const lacksPosTags = pos1Tag.length === 0 && pos2Tag.length === 0;
  if (lacksPosTags) {
    switch (token.partOfSpeech) {
      case PartOfSpeech.particle:
      case PartOfSpeech.bound_auxiliary:
      case PartOfSpeech.symbol:
        return true;
      default:
        break;
    }
  }

  return (
    isAuxiliaryStemGrammarTailToken(token) ||
    isExcludedTrailingParticleMergedToken(token) ||
    isExcludedByTerm(token)
  );
}
/**
 * Clears the annotation fields of a token that is excluded from subtitle
 * annotations.
 *
 * @param token - The merged token to sanitize.
 * @param options - Optional POS1/POS2 exclusion sets forwarded to the gate.
 * @returns The same token object when it is not excluded; otherwise a shallow
 *   copy with `isKnown`, `isNPlusOneTarget`, and `isNameMatch` set to `false`
 *   and `jlptLevel` and `frequencyRank` set to `undefined`. All other fields
 *   are carried over unchanged.
 */
export function stripSubtitleAnnotationMetadata(
  token: MergedToken,
  options: SubtitleAnnotationFilterOptions = {},
): MergedToken {
  const isExcluded = shouldExcludeTokenFromSubtitleAnnotations(token, options);
  return isExcluded
    ? {
        ...token,
        isKnown: false,
        isNPlusOneTarget: false,
        isNameMatch: false,
        jlptLevel: undefined,
        frequencyRank: undefined,
      }
    : token;
}

View File

@@ -19,6 +19,7 @@
import { PartOfSpeech, Token, MergedToken } from './types';
import { DEFAULT_ANNOTATION_POS1_EXCLUSION_CONFIG } from './token-pos1-exclusions';
import { DEFAULT_ANNOTATION_POS2_EXCLUSION_CONFIG } from './token-pos2-exclusions';
import { shouldExcludeTokenFromSubtitleAnnotations } from './core/services/tokenizer/subtitle-annotation-filter';
export function isNoun(tok: Token): boolean {
return tok.partOfSpeech === PartOfSpeech.noun;
@@ -297,6 +298,10 @@ function isNPlusOneWordCountToken(
pos1Exclusions: ReadonlySet<string> = N_PLUS_ONE_IGNORED_POS1,
pos2Exclusions: ReadonlySet<string> = N_PLUS_ONE_IGNORED_POS2,
): boolean {
if (shouldExcludeTokenFromSubtitleAnnotations(token, { pos1Exclusions, pos2Exclusions })) {
return false;
}
const normalizedPos1 = normalizePos1Tag(token.pos1);
const hasPos1 = normalizedPos1.length > 0;
if (isExcludedByTagSet(normalizedPos1, pos1Exclusions)) {