fix(subtitle): tighten frequency token filtering

This commit is contained in:
2026-03-07 01:28:37 -08:00
parent 3dff6c2515
commit 1d76e05cd3
7 changed files with 343 additions and 17 deletions

View File

@@ -0,0 +1,42 @@
---
id: TASK-107
title: 'Fix Yomitan scan-token fallback fragmentation on exact-source misses'
status: Done
assignee: []
created_date: '2026-03-07 01:10'
updated_date: '2026-03-07 01:12'
labels: []
dependencies: []
priority: high
ordinal: 9007
---
## Description
<!-- SECTION:DESCRIPTION:BEGIN -->
Left-to-right Yomitan scanning can emit bogus fallback tokens when `termsFind` returns entries but none of their headwords carries an exact primary source for the consumed substring. Repro: `だが それでも届かぬ高みがあった` currently yields trailing fragments like `があ` / `た`, which blocks the real `あった` token from receiving frequency highlighting.
<!-- SECTION:DESCRIPTION:END -->
## Acceptance Criteria
<!-- AC:BEGIN -->
- [x] #1 Scanner skips `termsFind` fallback entries that are not backed by an exact primary source for the consumed substring.
- [x] #2 Repro line no longer yields bogus trailing fragments such as `があ`.
- [x] #3 Regression coverage added for the scan-token path.
<!-- AC:END -->
## Final Summary
<!-- SECTION:FINAL_SUMMARY:BEGIN -->
Removed the scan-token helper fallback that previously emitted a token from the first returned headword even when Yomitan did not report an exact primary source for the consumed substring. Added a focused regression test covering `だが それでも届かぬ高みがあった`, ensuring bogus `があ` fragmentation is skipped so the later `あった` exact match can still be tokenized and highlighted.
Verification:
- `bun test src/core/services/tokenizer/yomitan-parser-runtime.test.ts src/core/services/tokenizer.test.ts --timeout 20000`
<!-- SECTION:FINAL_SUMMARY:END -->

View File

@@ -0,0 +1,43 @@
---
id: TASK-108
title: 'Exclude single kana tokens from frequency highlighting'
status: Done
assignee: []
created_date: '2026-03-07 01:18'
updated_date: '2026-03-07 01:22'
labels: []
dependencies: []
priority: medium
ordinal: 9008
---
## Description
<!-- SECTION:DESCRIPTION:BEGIN -->
Suppress frequency highlighting for single-character hiragana or katakana tokens. Scope is frequency-only: known/N+1/JLPT behavior stays unchanged.
<!-- SECTION:DESCRIPTION:END -->
## Acceptance Criteria
<!-- AC:BEGIN -->
- [x] #1 Single-character hiragana tokens do not retain `frequencyRank`.
- [x] #2 Single-character katakana tokens do not retain `frequencyRank`.
- [x] #3 Regression coverage exists at annotation-stage and tokenizer levels.
<!-- AC:END -->
## Final Summary
<!-- SECTION:FINAL_SUMMARY:BEGIN -->
Added a frequency-only suppression rule for single-character kana tokens based on token `surface`, so bogus merged fragments like `た` and standalone one-character kana no longer keep `frequencyRank`. Regression coverage now exists both in the annotation stage and in the tokenizer path, while multi-character tokens and N+1/JLPT behavior remain unchanged.
Verification:
- `bun test src/core/services/tokenizer/annotation-stage.test.ts --timeout 20000`
- `bun test src/core/services/tokenizer.test.ts --timeout 20000`
<!-- SECTION:FINAL_SUMMARY:END -->

View File

@@ -1861,9 +1861,9 @@ test('tokenizeSubtitle keeps parsing explicit by scanning-parser source only', a
assert.equal(result.tokens?.[4]?.frequencyRank, 1500);
});
test('tokenizeSubtitle still assigns frequency to non-known Yomitan tokens', async () => {
test('tokenizeSubtitle still assigns frequency to non-known multi-character Yomitan tokens', async () => {
const result = await tokenizeSubtitle(
'小園',
'小園友達',
makeDeps({
getYomitanExt: () => ({ id: 'dummy-ext' }) as any,
getYomitanParserWindow: () =>
@@ -1884,9 +1884,9 @@ test('tokenizeSubtitle still assigns frequency to non-known Yomitan tokens', asy
],
[
{
text: '',
reading: '',
headwords: [[{ term: '' }]],
text: '友達',
reading: 'ともだち',
headwords: [[{ term: '友達' }]],
},
],
],
@@ -1895,7 +1895,7 @@ test('tokenizeSubtitle still assigns frequency to non-known Yomitan tokens', asy
},
}) as unknown as Electron.BrowserWindow,
getFrequencyDictionaryEnabled: () => true,
getFrequencyRank: (text) => (text === '小園' ? 75 : text === '' ? 3000 : null),
getFrequencyRank: (text) => (text === '小園' ? 75 : text === '友達' ? 3000 : null),
isKnownWord: (text) => text === '小園',
}),
);
@@ -2635,6 +2635,21 @@ test('tokenizeSubtitle excludes default non-independent pos2 from N+1 and freque
assert.equal(result.tokens?.[0]?.isNPlusOneTarget, false);
});
// Regression: a merged token that is a single kana character must not keep a
// frequencyRank, even when the frequency dictionary reports a rank for it.
test('tokenizeSubtitle excludes single-kana merged tokens from frequency highlighting', async () => {
  const deps = makeDepsFromYomitanTokens(
    [{ surface: 'た', reading: 'た', headword: 'た' }],
    {
      getFrequencyDictionaryEnabled: () => true,
      getFrequencyRank: (text) => (text === 'た' ? 17 : null),
      getMinSentenceWordsForNPlusOne: () => 1,
      tokenizeWithMecab: async () => null,
    },
  );
  const result = await tokenizeSubtitle('た', deps);
  const tokens = result.tokens ?? [];
  assert.equal(tokens.length, 1);
  // The rank (17) supplied by the dictionary is suppressed for single kana.
  assert.equal(tokens[0]?.frequencyRank, undefined);
});
test('tokenizeSubtitle excludes merged function/content token from frequency highlighting but keeps N+1', async () => {
const result = await tokenizeSubtitle(
'になれば',

View File

@@ -252,12 +252,12 @@ test('annotateTokens applies configured pos1 exclusions to both frequency and N+
test('annotateTokens allows previously default-excluded pos1 when removed from effective set', () => {
const tokens = [
makeToken({
surface: '',
headword: '',
surface: 'まで',
headword: 'まで',
partOfSpeech: PartOfSpeech.other,
pos1: '助詞',
startPos: 0,
endPos: 1,
endPos: 2,
frequencyRank: 8,
}),
];
@@ -314,6 +314,52 @@ test('annotateTokens excludes likely kana SFX tokens from frequency when POS tag
assert.equal(result[0]?.frequencyRank, undefined);
});
// Regression: with POS tags missing, single-character hiragana and katakana
// tokens lose their frequencyRank, while a single-kanji token keeps its rank.
test('annotateTokens excludes single hiragana and katakana tokens from frequency when POS tags are missing', () => {
  const fixtures = [
    { surface: 'た', reading: 'た', headword: 'た', frequencyRank: 21 },
    { surface: 'ア', reading: 'ア', headword: 'ア', frequencyRank: 22 },
    { surface: '山', reading: 'やま', headword: '山', frequencyRank: 23 },
  ];
  const tokens = fixtures.map((fields, index) =>
    makeToken({
      ...fields,
      pos1: '',
      pos2: '',
      partOfSpeech: PartOfSpeech.other,
      startPos: index,
      endPos: index + 1,
    }),
  );
  const result = annotateTokens(tokens, makeDeps(), {
    minSentenceWordsForNPlusOne: 1,
  });
  // Hiragana and katakana singles suppressed; kanji token unaffected.
  assert.equal(result[0]?.frequencyRank, undefined);
  assert.equal(result[1]?.frequencyRank, undefined);
  assert.equal(result[2]?.frequencyRank, 23);
});
test('annotateTokens keeps frequency when mecab tags classify token as content-bearing', () => {
const tokens = [
makeToken({

View File

@@ -103,6 +103,10 @@ function isFrequencyExcludedByPos(
pos1Exclusions: ReadonlySet<string>,
pos2Exclusions: ReadonlySet<string>,
): boolean {
if (isSingleKanaFrequencyNoiseToken(token.surface)) {
return true;
}
const normalizedPos1 = normalizePos1Tag(token.pos1);
const hasPos1 = normalizedPos1.length > 0;
if (isExcludedByTagSet(normalizedPos1, pos1Exclusions)) {
@@ -363,6 +367,20 @@ function isLikelyFrequencyNoiseToken(token: MergedToken): boolean {
return false;
}
/**
 * Returns true when `text`, after trimming whitespace, consists of exactly one
 * Unicode code point and that code point is kana (per `isKanaChar`). Such
 * tokens are treated as noise for frequency highlighting purposes.
 *
 * @param text - Token surface text; `undefined` and non-strings yield false.
 */
function isSingleKanaFrequencyNoiseToken(text: string | undefined): boolean {
  if (typeof text !== 'string') {
    return false;
  }
  // Spread iterates code points, so astral characters count as one unit;
  // an empty/whitespace-only string yields zero code points and fails here.
  const codePoints = [...text.trim()];
  if (codePoints.length !== 1) {
    return false;
  }
  return isKanaChar(codePoints[0]!);
}
function isJlptEligibleToken(token: MergedToken): boolean {
if (token.pos1 && shouldIgnoreJlptForMecabPos1(token.pos1)) {
return false;

View File

@@ -643,6 +643,175 @@ test('requestYomitanScanTokens marks grouped entries when SubMiner dictionary al
assert.equal((result as Array<{ isNameMatch?: boolean }>)[0]?.isNameMatch, true);
});
// Regression for the scan-token fallback fix: termsFind entries whose headwords
// do not carry an exact primary source for the consumed substring must be
// skipped, so a bogus 'があ' fragment cannot shadow the real 'あった' token.
test('requestYomitanScanTokens skips fallback fragments without exact primary source matches', async () => {
const deps = createDeps(async (script) => {
// First injected script fetches Yomitan options; report a scan length of 40.
if (script.includes('optionsGetFull')) {
return {
profileCurrent: 0,
profiles: [
{
options: {
scanning: { length: 40 },
},
},
],
};
}
// All other scripts are termsFind lookups; answer from the canned fixtures below.
return await runInjectedYomitanScript(script, (action, params) => {
if (action !== 'termsFind') {
throw new Error(`unexpected action: ${action}`);
}
const text = (params as { text?: string } | undefined)?.text ?? '';
// Each branch mocks the lookup for one left-to-right scan position.
if (text.startsWith('だが ')) {
return {
originalTextLength: 2,
dictionaryEntries: [
{
headwords: [
{
term: 'だが',
reading: 'だが',
sources: [{ originalText: 'だが', isPrimary: true, matchType: 'exact' }],
},
],
},
],
};
}
if (text.startsWith('それでも')) {
return {
originalTextLength: 4,
dictionaryEntries: [
{
headwords: [
{
term: 'それでも',
reading: 'それでも',
sources: [{ originalText: 'それでも', isPrimary: true, matchType: 'exact' }],
},
],
},
],
};
}
if (text.startsWith('届かぬ')) {
return {
originalTextLength: 3,
dictionaryEntries: [
{
headwords: [
{
term: '届く',
reading: 'とどく',
sources: [{ originalText: '届かぬ', isPrimary: true, matchType: 'exact' }],
},
],
},
],
};
}
if (text.startsWith('高み')) {
return {
originalTextLength: 2,
dictionaryEntries: [
{
headwords: [
{
term: '高み',
reading: 'たかみ',
sources: [{ originalText: '高み', isPrimary: true, matchType: 'exact' }],
},
],
},
],
};
}
// The poisoned fallback: two characters ('があ') are consumed, but the only
// source is an exact match for 'が' alone — no exact primary source covers
// the consumed substring, so the scanner must reject this entry entirely.
if (text.startsWith('があった')) {
return {
originalTextLength: 2,
dictionaryEntries: [
{
headwords: [
{
term: 'があ',
reading: '',
sources: [{ originalText: 'が', isPrimary: true, matchType: 'exact' }],
},
],
},
],
};
}
// After skipping past 'が', the genuine exact match for 'あった' is found.
if (text.startsWith('あった')) {
return {
originalTextLength: 3,
dictionaryEntries: [
{
headwords: [
{
term: 'ある',
reading: 'ある',
sources: [{ originalText: 'あった', isPrimary: true, matchType: 'exact' }],
},
],
},
],
};
}
return { originalTextLength: 0, dictionaryEntries: [] };
});
});
const result = await requestYomitanScanTokens(
'だが それでも届かぬ高みがあった',
deps,
{ error: () => undefined },
);
// Expected positions in 'だが それでも届かぬ高みがあった': the space (index 2)
// and the rejected 'が' (index 12) are untokenized, so 'あった' starts at 13.
// No 'があ' fragment appears anywhere in the output.
assert.deepEqual(
result?.map((token) => ({
surface: token.surface,
headword: token.headword,
startPos: token.startPos,
endPos: token.endPos,
})),
[
{
surface: 'だが',
headword: 'だが',
startPos: 0,
endPos: 2,
},
{
surface: 'それでも',
headword: 'それでも',
startPos: 3,
endPos: 7,
},
{
surface: '届かぬ',
headword: '届く',
startPos: 7,
endPos: 10,
},
{
surface: '高み',
headword: '高み',
startPos: 10,
endPos: 12,
},
{
surface: 'あった',
headword: 'ある',
startPos: 13,
endPos: 16,
},
],
);
});
test('getYomitanDictionaryInfo requests dictionary info via backend action', async () => {
let scriptValue = '';
const deps = createDeps(async (script) => {

View File

@@ -843,14 +843,7 @@ const YOMITAN_SCANNING_HELPERS = String.raw`
};
}
}
const fallback = dictionaryEntries?.[0]?.headwords?.[0];
return fallback
? {
term: fallback.term,
reading: fallback.reading,
isNameMatch: matchedNameDictionary || isNameDictionaryEntry(dictionaryEntries?.[0])
}
: null;
return null;
}
`;