mirror of
https://github.com/ksyasuda/SubMiner.git
synced 2026-03-20 12:11:28 -07:00
feat(tokenizer): exclude interjections and sound effects from subtitle annotations
- Filter out 感動詞 (interjection) POS1 tokens from annotation payloads
- Exclude common interjection terms (ああ, ええ, はあ, etc.)
- Exclude reduplicated kana SFX with optional trailing と
- shouldExcludeTokenFromSubtitleAnnotations checks both POS1 and term patterns
- filterSubtitleAnnotationTokens applied after annotation stage
This commit is contained in:
@@ -1460,7 +1460,7 @@ test('tokenizeSubtitle skips JLPT level for excluded demonstratives', async () =
|
|||||||
assert.equal(result.tokens?.[0]?.jlptLevel, undefined);
|
assert.equal(result.tokens?.[0]?.jlptLevel, undefined);
|
||||||
});
|
});
|
||||||
|
|
||||||
test('tokenizeSubtitle skips JLPT level for repeated kana SFX', async () => {
|
test('tokenizeSubtitle excludes repeated kana interjections from annotation payloads entirely', async () => {
|
||||||
const result = await tokenizeSubtitle(
|
const result = await tokenizeSubtitle(
|
||||||
'ああ',
|
'ああ',
|
||||||
makeDeps({
|
makeDeps({
|
||||||
@@ -1491,8 +1491,7 @@ test('tokenizeSubtitle skips JLPT level for repeated kana SFX', async () => {
|
|||||||
}),
|
}),
|
||||||
);
|
);
|
||||||
|
|
||||||
assert.equal(result.tokens?.length, 1);
|
assert.deepEqual(result, { text: 'ああ', tokens: null });
|
||||||
assert.equal(result.tokens?.[0]?.jlptLevel, undefined);
|
|
||||||
});
|
});
|
||||||
|
|
||||||
test('tokenizeSubtitle assigns JLPT level to Yomitan tokens', async () => {
|
test('tokenizeSubtitle assigns JLPT level to Yomitan tokens', async () => {
|
||||||
@@ -3057,6 +3056,102 @@ test('tokenizeSubtitle excludes default non-independent pos2 from N+1 and freque
|
|||||||
assert.equal(result.tokens?.[0]?.isNPlusOneTarget, false);
|
assert.equal(result.tokens?.[0]?.isNPlusOneTarget, false);
|
||||||
});
|
});
|
||||||
|
|
||||||
|
test('tokenizeSubtitle excludes mecab-tagged interjections from annotation payloads entirely', async () => {
  // The only token is tagged 感動詞 by the MeCab stub, so the annotation
  // payload is expected to collapse to `null` while the text is preserved.
  const deps = makeDepsFromYomitanTokens(
    [{ surface: 'ぐはっ', reading: 'ぐはっ', headword: 'ぐはっ' }],
    {
      getFrequencyDictionaryEnabled: () => true,
      getFrequencyRank: () => 17,
      getJlptLevel: () => 'N5',
      tokenizeWithMecab: async () => [
        {
          headword: 'ぐはっ',
          surface: 'ぐはっ',
          reading: 'グハッ',
          startPos: 0,
          endPos: 3,
          partOfSpeech: PartOfSpeech.other,
          pos1: '感動詞',
          isMerged: true,
          isKnown: false,
          isNPlusOneTarget: false,
        },
      ],
    },
  );

  const result = await tokenizeSubtitle('ぐはっ', deps);

  assert.deepEqual(result, { text: 'ぐはっ', tokens: null });
});
|
||||||
|
|
||||||
|
test('tokenizeSubtitle keeps visible text while excluding interjections from mixed annotation payloads', async () => {
  const deps = makeDeps({
    getFrequencyDictionaryEnabled: () => true,
    getFrequencyRank: (text) => (text === '猫' ? 11 : 17),
    getJlptLevel: (text) => (text === '猫' ? 'N5' : null),
    getYomitanExt: () => ({ id: 'dummy-ext' }) as any,
    // Stub parser window: term-frequency scripts get an empty result, every
    // other script gets one scanning-parser result holding both tokens.
    getYomitanParserWindow: () =>
      ({
        isDestroyed: () => false,
        webContents: {
          executeJavaScript: async (script: string) =>
            script.includes('getTermFrequencies')
              ? []
              : [
                  {
                    source: 'scanning-parser',
                    index: 0,
                    content: [
                      [{ text: 'ぐはっ', reading: 'ぐはっ', headwords: [[{ term: 'ぐはっ' }]] }],
                      [{ text: '猫', reading: 'ねこ', headwords: [[{ term: '猫' }]] }],
                    ],
                  },
                ],
        },
      }) as unknown as Electron.BrowserWindow,
    // MeCab stub: the first token is an interjection (感動詞), the second a noun.
    tokenizeWithMecab: async () => [
      {
        headword: 'ぐはっ',
        surface: 'ぐはっ',
        reading: 'グハッ',
        startPos: 0,
        endPos: 3,
        partOfSpeech: PartOfSpeech.other,
        pos1: '感動詞',
        isMerged: true,
        isKnown: false,
        isNPlusOneTarget: false,
      },
      {
        headword: '猫',
        surface: '猫',
        reading: 'ネコ',
        startPos: 4,
        endPos: 5,
        partOfSpeech: PartOfSpeech.noun,
        pos1: '名詞',
        isMerged: true,
        isKnown: false,
        isNPlusOneTarget: false,
      },
    ],
  });

  const result = await tokenizeSubtitle('ぐはっ 猫', deps);

  // The displayed text is untouched; only the noun survives in the payload.
  assert.equal(result.text, 'ぐはっ 猫');
  const annotated = result.tokens?.map(({ surface, headword }) => ({ surface, headword }));
  assert.deepEqual(annotated, [{ surface: '猫', headword: '猫' }]);
});
|
||||||
|
|
||||||
test('tokenizeSubtitle excludes single-kana merged tokens from frequency highlighting', async () => {
|
test('tokenizeSubtitle excludes single-kana merged tokens from frequency highlighting', async () => {
|
||||||
const result = await tokenizeSubtitle(
|
const result = await tokenizeSubtitle(
|
||||||
'た',
|
'た',
|
||||||
|
|||||||
@@ -178,6 +178,19 @@ async function applyAnnotationStage(
|
|||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Drops every token that the annotation stage marks as excluded from
 * subtitle annotations.
 *
 * An empty input is returned as-is without loading the annotation-stage
 * module. Otherwise the module is loaded through the shared
 * `annotationStageModulePromise`, which is created on first use.
 *
 * @param tokens Tokens to filter.
 * @returns The same array when it is empty, otherwise a new array holding
 *   only the tokens that are not excluded, in their original order.
 */
async function filterSubtitleAnnotationTokens(tokens: MergedToken[]): Promise<MergedToken[]> {
  if (tokens.length === 0) {
    return tokens;
  }

  if (!annotationStageModulePromise) {
    annotationStageModulePromise = import('./tokenizer/annotation-stage');
  }
  const { shouldExcludeTokenFromSubtitleAnnotations } = await annotationStageModulePromise;

  const kept: MergedToken[] = [];
  for (const token of tokens) {
    if (!shouldExcludeTokenFromSubtitleAnnotations(token)) {
      kept.push(token);
    }
  }
  return kept;
}
|
||||||
|
|
||||||
export function createTokenizerDepsRuntime(
|
export function createTokenizerDepsRuntime(
|
||||||
options: TokenizerDepsRuntimeOptions,
|
options: TokenizerDepsRuntimeOptions,
|
||||||
): TokenizerServiceDeps {
|
): TokenizerServiceDeps {
|
||||||
@@ -698,9 +711,12 @@ export async function tokenizeSubtitle(
|
|||||||
|
|
||||||
const yomitanTokens = await parseWithYomitanInternalParser(tokenizeText, deps, annotationOptions);
|
const yomitanTokens = await parseWithYomitanInternalParser(tokenizeText, deps, annotationOptions);
|
||||||
if (yomitanTokens && yomitanTokens.length > 0) {
|
if (yomitanTokens && yomitanTokens.length > 0) {
|
||||||
|
const filteredTokens = await filterSubtitleAnnotationTokens(
|
||||||
|
await applyAnnotationStage(yomitanTokens, deps, annotationOptions),
|
||||||
|
);
|
||||||
return {
|
return {
|
||||||
text: displayText,
|
text: displayText,
|
||||||
tokens: await applyAnnotationStage(yomitanTokens, deps, annotationOptions),
|
tokens: filteredTokens.length > 0 ? filteredTokens : null,
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -14,6 +14,17 @@ const KATAKANA_TO_HIRAGANA_OFFSET = 0x60;
|
|||||||
const KATAKANA_CODEPOINT_START = 0x30a1;
|
const KATAKANA_CODEPOINT_START = 0x30a1;
|
||||||
const KATAKANA_CODEPOINT_END = 0x30f6;
|
const KATAKANA_CODEPOINT_END = 0x30f6;
|
||||||
const JLPT_LEVEL_LOOKUP_CACHE_LIMIT = 2048;
|
const JLPT_LEVEL_LOOKUP_CACHE_LIMIT = 2048;
|
||||||
|
/**
 * Exact-match interjection terms that are removed from subtitle annotation
 * payloads. Candidate token text is checked against this set both trimmed
 * and after exclusion normalization.
 */
const SUBTITLE_ANNOTATION_EXCLUDED_TERMS = new Set<string>([
  'ああ', 'ええ', 'うう', 'おお',
  'はあ', 'はは', 'へえ', 'ふう', 'ほう',
]);
|
||||||
|
|
||||||
const jlptLevelLookupCaches = new WeakMap<
|
const jlptLevelLookupCaches = new WeakMap<
|
||||||
(text: string) => JlptLevel | null,
|
(text: string) => JlptLevel | null,
|
||||||
@@ -48,6 +59,8 @@ function normalizePos1Tag(pos1: string | undefined): string {
|
|||||||
return typeof pos1 === 'string' ? pos1.trim() : '';
|
return typeof pos1 === 'string' ? pos1.trim() : '';
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// POS1 tags whose tokens are removed from subtitle annotation payloads.
// 感動詞 is the interjection tag; membership is tested per tag part in
// isExcludedFromSubtitleAnnotationsByPos1.
const SUBTITLE_ANNOTATION_EXCLUDED_POS1 = new Set(['感動詞']);
|
||||||
|
|
||||||
function splitNormalizedTagParts(normalizedTag: string): string[] {
|
function splitNormalizedTagParts(normalizedTag: string): string[] {
|
||||||
if (!normalizedTag) {
|
if (!normalizedTag) {
|
||||||
return [];
|
return [];
|
||||||
@@ -69,6 +82,11 @@ function isExcludedByTagSet(normalizedTag: string, exclusions: ReadonlySet<strin
|
|||||||
return parts.some((part) => exclusions.has(part));
|
return parts.some((part) => exclusions.has(part));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Reports whether any part of an already-normalized POS1 tag is listed in
 * `SUBTITLE_ANNOTATION_EXCLUDED_POS1`.
 *
 * @param normalizedPos1 POS1 tag as produced by `normalizePos1Tag`.
 * @returns `true` when at least one tag part is excluded, otherwise `false`.
 */
function isExcludedFromSubtitleAnnotationsByPos1(normalizedPos1: string): boolean {
  for (const tagPart of splitNormalizedTagParts(normalizedPos1)) {
    if (SUBTITLE_ANNOTATION_EXCLUDED_POS1.has(tagPart)) {
      return true;
    }
  }
  return false;
}
|
||||||
|
|
||||||
function resolvePos1Exclusions(options: AnnotationStageOptions): ReadonlySet<string> {
|
function resolvePos1Exclusions(options: AnnotationStageOptions): ReadonlySet<string> {
|
||||||
if (options.pos1Exclusions) {
|
if (options.pos1Exclusions) {
|
||||||
return options.pos1Exclusions;
|
return options.pos1Exclusions;
|
||||||
@@ -383,6 +401,23 @@ function isReduplicatedKanaSfx(text: string): boolean {
|
|||||||
return chars.slice(0, half).join('') === chars.slice(half).join('');
|
return chars.slice(0, half).join('') === chars.slice(half).join('');
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Reports whether `text`, once normalized for exclusion matching, passes
 * `isReduplicatedKanaSfx` either as a whole or after one trailing と is
 * removed.
 *
 * @param text Raw candidate text.
 * @returns `true` for a match, `false` otherwise (including when
 *   normalization yields an empty value).
 */
function isReduplicatedKanaSfxWithOptionalTrailingTo(text: string): boolean {
  const candidate = normalizeJlptTextForExclusion(text);
  if (!candidate) {
    return false;
  }

  if (isReduplicatedKanaSfx(candidate)) {
    return true;
  }

  // Retry without the final character only when that character is と and
  // something is left over once it is removed.
  const stem = candidate.length > 1 && candidate.endsWith('と') ? candidate.slice(0, -1) : null;
  return stem !== null ? isReduplicatedKanaSfx(stem) : false;
}
|
||||||
|
|
||||||
function hasAdjacentKanaRepeat(text: string): boolean {
|
function hasAdjacentKanaRepeat(text: string): boolean {
|
||||||
const normalized = normalizeJlptTextForExclusion(text);
|
const normalized = normalizeJlptTextForExclusion(text);
|
||||||
if (!normalized) {
|
if (!normalized) {
|
||||||
@@ -485,6 +520,55 @@ function isJlptEligibleToken(token: MergedToken): boolean {
|
|||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Reports whether any textual form of `token` identifies it as a term to
 * drop from subtitle annotations.
 *
 * The JLPT lookup text, surface, headword and reading are checked in that
 * order. Each is tested both trimmed and exclusion-normalized against the
 * excluded-term set, the trailing-small-tsu SFX check and the
 * reduplicated-kana SFX check (with optional trailing と).
 *
 * @param token Token to inspect.
 * @returns `true` as soon as one form matches, otherwise `false`.
 */
function isExcludedFromSubtitleAnnotationsByTerm(token: MergedToken): boolean {
  const rawTexts = [resolveJlptLookupText(token), token.surface, token.headword, token.reading];

  return rawTexts.some((rawText) => {
    // Ignore missing or empty fields.
    if (typeof rawText !== 'string' || rawText.length === 0) {
      return false;
    }

    const trimmed = rawText.trim();
    if (!trimmed) {
      return false;
    }

    const normalized = normalizeJlptTextForExclusion(trimmed);
    if (!normalized) {
      return false;
    }

    // Each check sees the trimmed form first, then the normalized form.
    const forms = [trimmed, normalized];
    return (
      forms.some((form) => SUBTITLE_ANNOTATION_EXCLUDED_TERMS.has(form)) ||
      forms.some((form) => isTrailingSmallTsuKanaSfx(form)) ||
      forms.some((form) => isReduplicatedKanaSfxWithOptionalTrailingTo(form))
    );
  });
}
|
||||||
|
|
||||||
|
/**
 * Decides whether a token must be left out of subtitle annotation payloads.
 *
 * The POS1 tag is checked first; the term-based checks run only when the
 * POS1 tag does not already exclude the token.
 *
 * @param token Token to evaluate.
 * @returns `true` when the token should be excluded.
 */
export function shouldExcludeTokenFromSubtitleAnnotations(token: MergedToken): boolean {
  const normalizedPos1 = normalizePos1Tag(token.pos1);
  return (
    isExcludedFromSubtitleAnnotationsByPos1(normalizedPos1) ||
    isExcludedFromSubtitleAnnotationsByTerm(token)
  );
}
|
||||||
|
|
||||||
function computeTokenKnownStatus(
|
function computeTokenKnownStatus(
|
||||||
token: MergedToken,
|
token: MergedToken,
|
||||||
isKnownWord: (text: string) => boolean,
|
isKnownWord: (text: string) => boolean,
|
||||||
|
|||||||
@@ -212,3 +212,57 @@ test('merges trailing katakana continuation without headword into previous token
|
|||||||
],
|
],
|
||||||
);
|
);
|
||||||
});
|
});
|
||||||
|
|
||||||
|
// Regression guard: a single merged content+function token candidate must not
// win over a multi-token split candidate that keeps the content token as its
// own frequency-eligible unit.
// Background: Yomitan scanning can emit a one-token candidate in which a
// content word is fused with trailing function particles
// (e.g. かかってこいよ → headword かかってくる). When a competing candidate splits
// content and function apart, the split candidate should be selected so the
// content token stays frequency-highlightable.
test('multi-token candidate beats single merged content+function token candidate (frequency regression)', () => {
  // "Bad" candidate: content verb fused with the sentence-final particle.
  // Downstream annotation would exclude frequency for the whole token because
  // the merged pos1 would contain a function-word component.
  const fused = makeParseItem('scanning-parser', [
    [{ text: 'かかってこいよ', reading: 'かかってこいよ', headword: 'かかってくる' }],
  ]);

  // "Good" candidate: verb surface and particle as separate tokens, leaving
  // the content token frequency-eligible on its own.
  const separated = makeParseItem('scanning-parser', [
    [{ text: 'かかってこい', reading: 'かかってこい', headword: 'かかってくる' }],
    [{ text: 'よ', reading: 'よ', headword: 'よ' }],
  ]);

  // The merged candidate is listed first; the split one must still win.
  const candidates = [fused, separated];
  const tokens = selectYomitanParseTokens(candidates, () => false, 'headword');

  const [contentToken, particleToken] = tokens ?? [];
  assert.equal(tokens?.length, 2);
  assert.equal(contentToken?.surface, 'かかってこい');
  assert.equal(contentToken?.headword, 'かかってくる');
  assert.equal(particleToken?.surface, 'よ');
});
|
||||||
|
|
||||||
|
test('multi-token candidate beats single merged content+function token regardless of input order', () => {
  // Same two candidates as the frequency-regression case, with the split
  // candidate placed first this time.
  const fused = makeParseItem('scanning-parser', [
    [{ text: 'かかってこいよ', reading: 'かかってこいよ', headword: 'かかってくる' }],
  ]);
  const separated = makeParseItem('scanning-parser', [
    [{ text: 'かかってこい', reading: 'かかってこい', headword: 'かかってくる' }],
    [{ text: 'よ', reading: 'よ', headword: 'よ' }],
  ]);

  const candidates = [separated, fused];
  const tokens = selectYomitanParseTokens(candidates, () => false, 'headword');

  // The split candidate should still be selected over the merged one.
  const [contentToken, particleToken] = tokens ?? [];
  assert.equal(tokens?.length, 2);
  assert.equal(contentToken?.surface, 'かかってこい');
  assert.equal(particleToken?.surface, 'よ');
});
|
||||||
|
|||||||
Reference in New Issue
Block a user