fix(subtitle): tighten frequency token filtering

This commit is contained in:
2026-03-07 01:28:37 -08:00
parent 3dff6c2515
commit 1d76e05cd3
7 changed files with 343 additions and 17 deletions

View File

@@ -1861,9 +1861,9 @@ test('tokenizeSubtitle keeps parsing explicit by scanning-parser source only', a
assert.equal(result.tokens?.[4]?.frequencyRank, 1500);
});
test('tokenizeSubtitle still assigns frequency to non-known Yomitan tokens', async () => {
test('tokenizeSubtitle still assigns frequency to non-known multi-character Yomitan tokens', async () => {
const result = await tokenizeSubtitle(
'小園',
'小園友達',
makeDeps({
getYomitanExt: () => ({ id: 'dummy-ext' }) as any,
getYomitanParserWindow: () =>
@@ -1884,9 +1884,9 @@ test('tokenizeSubtitle still assigns frequency to non-known Yomitan tokens', asy
],
[
{
text: '',
reading: '',
headwords: [[{ term: '' }]],
text: '友達',
reading: 'ともだち',
headwords: [[{ term: '友達' }]],
},
],
],
@@ -1895,7 +1895,7 @@ test('tokenizeSubtitle still assigns frequency to non-known Yomitan tokens', asy
},
}) as unknown as Electron.BrowserWindow,
getFrequencyDictionaryEnabled: () => true,
getFrequencyRank: (text) => (text === '小園' ? 75 : text === '' ? 3000 : null),
getFrequencyRank: (text) => (text === '小園' ? 75 : text === '友達' ? 3000 : null),
isKnownWord: (text) => text === '小園',
}),
);
@@ -2635,6 +2635,21 @@ test('tokenizeSubtitle excludes default non-independent pos2 from N+1 and freque
assert.equal(result.tokens?.[0]?.isNPlusOneTarget, false);
});
// A one-character kana subtitle: the frequency dictionary is enabled and knows
// a rank (17) for the surface, yet the resulting token must carry no rank.
test('tokenizeSubtitle excludes single-kana merged tokens from frequency highlighting', async () => {
  const kana = 'た';
  const deps = makeDepsFromYomitanTokens([{ surface: kana, reading: kana, headword: kana }], {
    getFrequencyDictionaryEnabled: () => true,
    getFrequencyRank: (text) => (text === kana ? 17 : null),
    getMinSentenceWordsForNPlusOne: () => 1,
    // The MeCab tokenizer stub resolves to null in this scenario.
    tokenizeWithMecab: async () => null,
  });

  const { tokens } = await tokenizeSubtitle(kana, deps);

  assert.equal(tokens?.length, 1);
  assert.equal(tokens?.[0]?.frequencyRank, undefined);
});
test('tokenizeSubtitle excludes merged function/content token from frequency highlighting but keeps N+1', async () => {
const result = await tokenizeSubtitle(
'になれば',

View File

@@ -252,12 +252,12 @@ test('annotateTokens applies configured pos1 exclusions to both frequency and N+
test('annotateTokens allows previously default-excluded pos1 when removed from effective set', () => {
const tokens = [
makeToken({
surface: '',
headword: '',
surface: 'まで',
headword: 'まで',
partOfSpeech: PartOfSpeech.other,
pos1: '助詞',
startPos: 0,
endPos: 1,
endPos: 2,
frequencyRank: 8,
}),
];
@@ -314,6 +314,52 @@ test('annotateTokens excludes likely kana SFX tokens from frequency when POS tag
assert.equal(result[0]?.frequencyRank, undefined);
});
// Three adjacent one-character tokens with empty POS tags: a hiragana, a
// katakana and a kanji. Only the kanji token is expected to keep its rank.
test('annotateTokens excludes single hiragana and katakana tokens from frequency when POS tags are missing', () => {
  // Each row: [surface (also used as headword), reading, incoming rank, rank expected afterwards].
  const cases: ReadonlyArray<readonly [string, string, number, number | undefined]> = [
    ['た', 'た', 21, undefined],
    ['ア', 'ア', 22, undefined],
    ['山', 'やま', 23, 23],
  ];

  // Tokens are laid out back to back, one position each, in row order.
  const tokens = cases.map(([surface, reading, frequencyRank], index) =>
    makeToken({
      surface,
      reading,
      headword: surface,
      pos1: '',
      pos2: '',
      partOfSpeech: PartOfSpeech.other,
      frequencyRank,
      startPos: index,
      endPos: index + 1,
    }),
  );

  const result = annotateTokens(tokens, makeDeps(), {
    minSentenceWordsForNPlusOne: 1,
  });

  cases.forEach(([, , , expectedRank], index) => {
    assert.equal(result[index]?.frequencyRank, expectedRank);
  });
});
test('annotateTokens keeps frequency when mecab tags classify token as content-bearing', () => {
const tokens = [
makeToken({

View File

@@ -103,6 +103,10 @@ function isFrequencyExcludedByPos(
pos1Exclusions: ReadonlySet<string>,
pos2Exclusions: ReadonlySet<string>,
): boolean {
if (isSingleKanaFrequencyNoiseToken(token.surface)) {
return true;
}
const normalizedPos1 = normalizePos1Tag(token.pos1);
const hasPos1 = normalizedPos1.length > 0;
if (isExcludedByTagSet(normalizedPos1, pos1Exclusions)) {
@@ -363,6 +367,20 @@ function isLikelyFrequencyNoiseToken(token: MergedToken): boolean {
return false;
}
/**
 * Decides whether `text` is a lone kana character once surrounding whitespace
 * has been trimmed away.
 *
 * The trimmed string is split by code point (not by UTF-16 unit) and the single
 * remaining character is classified by `isKanaChar`.
 *
 * @param text - Candidate token text; non-string values never match.
 * @returns `true` only when the trimmed text is exactly one character that
 *   `isKanaChar` accepts; `false` for non-strings, blank strings and anything
 *   longer than one character.
 */
function isSingleKanaFrequencyNoiseToken(text: string | undefined): boolean {
  if (typeof text !== 'string') {
    return false;
  }

  // An empty (or whitespace-only) input leaves `onlyChar` undefined, so it is
  // rejected without ever reaching `isKanaChar`.
  const [onlyChar, ...extraChars] = Array.from(text.trim());
  return onlyChar !== undefined && extraChars.length === 0 && isKanaChar(onlyChar);
}
function isJlptEligibleToken(token: MergedToken): boolean {
if (token.pos1 && shouldIgnoreJlptForMecabPos1(token.pos1)) {
return false;

View File

@@ -643,6 +643,175 @@ test('requestYomitanScanTokens marks grouped entries when SubMiner dictionary al
assert.equal((result as Array<{ isNameMatch?: boolean }>)[0]?.isNameMatch, true);
});
// Scans a sentence against a stubbed Yomitan backend and checks which tokens
// come back. The stub answers `termsFind` from a fixed table keyed by how the
// queried text starts; the expected output contains no token for the 'が' at
// position 12, whose canned headword ('があ') does not match its source text.
test('requestYomitanScanTokens skips fallback fragments without exact primary source matches', async () => {
  // Canned `termsFind` answers, tried top to bottom; the first matching prefix wins.
  const cannedLookups = [
    { prefix: 'だが ', originalTextLength: 2, term: 'だが', reading: 'だが', originalText: 'だが' },
    {
      prefix: 'それでも',
      originalTextLength: 4,
      term: 'それでも',
      reading: 'それでも',
      originalText: 'それでも',
    },
    { prefix: '届かぬ', originalTextLength: 3, term: '届く', reading: 'とどく', originalText: '届かぬ' },
    { prefix: '高み', originalTextLength: 2, term: '高み', reading: 'たかみ', originalText: '高み' },
    // Reported for two characters with headword 'があ', yet the source text is only 'が'.
    { prefix: 'があった', originalTextLength: 2, term: 'があ', reading: '', originalText: 'が' },
    { prefix: 'あった', originalTextLength: 3, term: 'ある', reading: 'ある', originalText: 'あった' },
  ];

  const deps = createDeps(async (script) => {
    // The options request gets a single profile with a scan length of 40.
    if (script.includes('optionsGetFull')) {
      return {
        profileCurrent: 0,
        profiles: [{ options: { scanning: { length: 40 } } }],
      };
    }

    return await runInjectedYomitanScript(script, (action, params) => {
      if (action !== 'termsFind') {
        throw new Error(`unexpected action: ${action}`);
      }

      const text = (params as { text?: string } | undefined)?.text ?? '';
      const hit = cannedLookups.find((lookup) => text.startsWith(lookup.prefix));
      if (hit === undefined) {
        // Anything not in the table is reported as "no match".
        return { originalTextLength: 0, dictionaryEntries: [] };
      }

      return {
        originalTextLength: hit.originalTextLength,
        dictionaryEntries: [
          {
            headwords: [
              {
                term: hit.term,
                reading: hit.reading,
                sources: [{ originalText: hit.originalText, isPrimary: true, matchType: 'exact' }],
              },
            ],
          },
        ],
      };
    });
  });

  const result = await requestYomitanScanTokens('だが それでも届かぬ高みがあった', deps, {
    error: () => undefined,
  });

  // Only the fields under test are compared.
  const scanned = result?.map(({ surface, headword, startPos, endPos }) => ({
    surface,
    headword,
    startPos,
    endPos,
  }));

  assert.deepEqual(scanned, [
    { surface: 'だが', headword: 'だが', startPos: 0, endPos: 2 },
    { surface: 'それでも', headword: 'それでも', startPos: 3, endPos: 7 },
    { surface: '届かぬ', headword: '届く', startPos: 7, endPos: 10 },
    { surface: '高み', headword: '高み', startPos: 10, endPos: 12 },
    // Position 12 ('が') is absent: the next token starts at 13.
    { surface: 'あった', headword: 'ある', startPos: 13, endPos: 16 },
  ]);
});
test('getYomitanDictionaryInfo requests dictionary info via backend action', async () => {
let scriptValue = '';
const deps = createDeps(async (script) => {

View File

@@ -843,14 +843,7 @@ const YOMITAN_SCANNING_HELPERS = String.raw`
};
}
}
const fallback = dictionaryEntries?.[0]?.headwords?.[0];
return fallback
? {
term: fallback.term,
reading: fallback.reading,
isNameMatch: matchedNameDictionary || isNameDictionaryEntry(dictionaryEntries?.[0])
}
: null;
return null;
}
`;