feat(tokenizer): exclude interjections and sound effects from subtitle annotations

- Filter out 感動詞 (interjection) POS1 tokens from annotation payloads
- Exclude common interjection terms (ああ, ええ, はあ, etc.)
- Exclude reduplicated kana SFX with optional trailing と
- shouldExcludeTokenFromSubtitleAnnotations checks both POS1 and term patterns
- filterSubtitleAnnotationTokens applied after annotation stage
This commit is contained in:
2026-03-16 01:45:58 -07:00
parent 5767667d51
commit a317019bb9
4 changed files with 253 additions and 4 deletions

View File

@@ -14,6 +14,17 @@ const KATAKANA_TO_HIRAGANA_OFFSET = 0x60;
const KATAKANA_CODEPOINT_START = 0x30a1;
const KATAKANA_CODEPOINT_END = 0x30f6;
const JLPT_LEVEL_LOOKUP_CACHE_LIMIT = 2048;
// Interjection-like filler terms that must never receive subtitle annotations,
// even when the tokenizer treats them as ordinary words. Entries are plain
// hiragana; kept sorted for easy scanning when adding new terms.
const SUBTITLE_ANNOTATION_EXCLUDED_TERMS = new Set<string>([
  'ああ',
  'うう',
  'ええ',
  'おお',
  'はあ',
  'はは',
  'ふう',
  'へえ',
  'ほう',
]);
const jlptLevelLookupCaches = new WeakMap<
(text: string) => JlptLevel | null,
@@ -48,6 +59,8 @@ function normalizePos1Tag(pos1: string | undefined): string {
return typeof pos1 === 'string' ? pos1.trim() : '';
}
// POS1 tags whose tokens are dropped from subtitle annotations.
// 感動詞 = interjection (per the commit intent of filtering interjections).
const SUBTITLE_ANNOTATION_EXCLUDED_POS1 = new Set<string>(['感動詞']);
function splitNormalizedTagParts(normalizedTag: string): string[] {
if (!normalizedTag) {
return [];
@@ -69,6 +82,11 @@ function isExcludedByTagSet(normalizedTag: string, exclusions: ReadonlySet<strin
return parts.some((part) => exclusions.has(part));
}
/**
 * Returns true when any component of the normalized POS1 tag appears in the
 * subtitle-annotation POS1 exclusion set (e.g. interjections).
 */
function isExcludedFromSubtitleAnnotationsByPos1(normalizedPos1: string): boolean {
  return splitNormalizedTagParts(normalizedPos1).some((tagPart) =>
    SUBTITLE_ANNOTATION_EXCLUDED_POS1.has(tagPart),
  );
}
function resolvePos1Exclusions(options: AnnotationStageOptions): ReadonlySet<string> {
if (options.pos1Exclusions) {
return options.pos1Exclusions;
@@ -383,6 +401,23 @@ function isReduplicatedKanaSfx(text: string): boolean {
return chars.slice(0, half).join('') === chars.slice(half).join('');
}
/**
 * Detects a reduplicated kana sound-effect, optionally followed by a trailing と
 * (adverbializing particle). The input is normalized for exclusion matching
 * first; an empty normalization result is never considered SFX.
 */
function isReduplicatedKanaSfxWithOptionalTrailingTo(text: string): boolean {
  const normalized = normalizeJlptTextForExclusion(text);
  if (!normalized) {
    return false;
  }
  // Plain reduplication with no trailing particle.
  if (isReduplicatedKanaSfx(normalized)) {
    return true;
  }
  // Otherwise, only a form longer than one character ending in と can qualify:
  // strip the particle and re-test the remaining run.
  const hasTrailingTo = normalized.length > 1 && normalized.endsWith('と');
  return hasTrailingTo && isReduplicatedKanaSfx(normalized.slice(0, -1));
}
function hasAdjacentKanaRepeat(text: string): boolean {
const normalized = normalizeJlptTextForExclusion(text);
if (!normalized) {
@@ -485,6 +520,55 @@ function isJlptEligibleToken(token: MergedToken): boolean {
return true;
}
/**
 * Returns true when any textual form of the token (JLPT lookup text, surface,
 * headword, or reading) matches an excluded interjection term or a kana
 * sound-effect pattern, in either raw-trimmed or normalized form.
 */
function isExcludedFromSubtitleAnnotationsByTerm(token: MergedToken): boolean {
  // Dedupe candidates with a Set: kana-only tokens commonly have identical
  // surface/headword/reading, and the original array repeated the full
  // trim/normalize/pattern pipeline for each duplicate.
  const candidates = new Set(
    [resolveJlptLookupText(token), token.surface, token.headword, token.reading].filter(
      (candidate): candidate is string => typeof candidate === 'string' && candidate.length > 0,
    ),
  );
  for (const candidate of candidates) {
    const trimmedCandidate = candidate.trim();
    if (!trimmedCandidate) {
      continue;
    }
    const normalizedCandidate = normalizeJlptTextForExclusion(trimmedCandidate);
    if (!normalizedCandidate) {
      continue;
    }
    // Exact match against the curated interjection term list, raw or normalized.
    if (
      SUBTITLE_ANNOTATION_EXCLUDED_TERMS.has(trimmedCandidate) ||
      SUBTITLE_ANNOTATION_EXCLUDED_TERMS.has(normalizedCandidate)
    ) {
      return true;
    }
    // Sound-effect patterns: trailing small-tsu kana SFX, or reduplicated kana
    // with an optional trailing と — checked on both raw and normalized forms.
    if (
      isTrailingSmallTsuKanaSfx(trimmedCandidate) ||
      isTrailingSmallTsuKanaSfx(normalizedCandidate) ||
      isReduplicatedKanaSfxWithOptionalTrailingTo(trimmedCandidate) ||
      isReduplicatedKanaSfxWithOptionalTrailingTo(normalizedCandidate)
    ) {
      return true;
    }
  }
  return false;
}
export function shouldExcludeTokenFromSubtitleAnnotations(token: MergedToken): boolean {
if (isExcludedFromSubtitleAnnotationsByPos1(normalizePos1Tag(token.pos1))) {
return true;
}
return isExcludedFromSubtitleAnnotationsByTerm(token);
}
function computeTokenKnownStatus(
token: MergedToken,
isKnownWord: (text: string) => boolean,

View File

@@ -212,3 +212,57 @@ test('merges trailing katakana continuation without headword into previous token
],
);
});
// Regression guard: a candidate that fuses a content word with trailing function
// particles into one token must lose to a competing candidate that splits them.
// Yomitan scanning can emit かかってこいよ as a single token (headword かかってくる);
// if that candidate wins, the content verb stops being a standalone
// frequency-eligible unit, so the multi-token split has to be preferred.
test('multi-token candidate beats single merged content+function token candidate (frequency regression)', () => {
  // Fused candidate: verb and sentence-final particle collapsed into one token.
  // Downstream annotation would drop frequency for the whole token because the
  // merged pos1 would carry a function-word component.
  const fused = makeParseItem('scanning-parser', [
    [{ text: 'かかってこいよ', reading: 'かかってこいよ', headword: 'かかってくる' }],
  ]);
  // Split candidate: the content verb stands alone (frequency-eligible) and the
  // particle is its own token.
  const split = makeParseItem('scanning-parser', [
    [{ text: 'かかってこい', reading: 'かかってこい', headword: 'かかってくる' }],
    [{ text: 'よ', reading: 'よ', headword: 'よ' }],
  ]);
  // The fused candidate is listed first, yet the split must still be selected.
  const selected = selectYomitanParseTokens([fused, split], () => false, 'headword');
  assert.equal(selected?.length, 2);
  assert.equal(selected?.[0]?.surface, 'かかってこい');
  assert.equal(selected?.[0]?.headword, 'かかってくる');
  assert.equal(selected?.[1]?.surface, 'よ');
});
// Same scenario as above, but with the candidate order reversed: selection must
// be order-independent.
test('multi-token candidate beats single merged content+function token regardless of input order', () => {
  const fused = makeParseItem('scanning-parser', [
    [{ text: 'かかってこいよ', reading: 'かかってこいよ', headword: 'かかってくる' }],
  ]);
  const split = makeParseItem('scanning-parser', [
    [{ text: 'かかってこい', reading: 'かかってこい', headword: 'かかってくる' }],
    [{ text: 'よ', reading: 'よ', headword: 'よ' }],
  ]);
  // Split candidate first this time — it should still win over the fused one.
  const selected = selectYomitanParseTokens([split, fused], () => false, 'headword');
  assert.equal(selected?.length, 2);
  assert.equal(selected?.[0]?.surface, 'かかってこい');
  assert.equal(selected?.[1]?.surface, 'よ');
});