mirror of
https://github.com/ksyasuda/SubMiner.git
synced 2026-03-07 03:22:17 -08:00
fix(subtitle): tighten frequency token filtering
This commit is contained in:
@@ -0,0 +1,42 @@
---
id: TASK-107
title: 'Fix Yomitan scan-token fallback fragmentation on exact-source misses'
status: Done
assignee: []
created_date: '2026-03-07 01:10'
updated_date: '2026-03-07 01:12'
labels: []
dependencies: []
priority: high
ordinal: 9007
---

## Description

<!-- SECTION:DESCRIPTION:BEGIN -->

Left-to-right Yomitan scanning can emit bogus fallback tokens when `termsFind` returns entries but none of their headwords carries an exact primary source for the consumed substring. Repro: `だが それでも届かぬ高みがあった` currently yields trailing fragments like `があ` / `た`, which blocks the real `あった` token from receiving frequency highlighting.

<!-- SECTION:DESCRIPTION:END -->

## Acceptance Criteria

<!-- AC:BEGIN -->

- [x] #1 Scanner skips `termsFind` fallback entries that are not backed by an exact primary source for the consumed substring.
- [x] #2 Repro line no longer yields bogus trailing fragments such as `があ`.
- [x] #3 Regression coverage added for the scan-token path.

<!-- AC:END -->

## Final Summary

<!-- SECTION:FINAL_SUMMARY:BEGIN -->

Removed the scan-token helper fallback that previously emitted a token from the first returned headword even when Yomitan did not report an exact primary source for the consumed substring. Added a focused regression test covering `だが それでも届かぬ高みがあった`, ensuring bogus `があ` fragmentation is skipped so the later `あった` exact match can still be tokenized and highlighted.

Verification:

- `bun test src/core/services/tokenizer/yomitan-parser-runtime.test.ts src/core/services/tokenizer.test.ts --timeout 20000`

<!-- SECTION:FINAL_SUMMARY:END -->
@@ -0,0 +1,43 @@
---
id: TASK-108
title: 'Exclude single kana tokens from frequency highlighting'
status: Done
assignee: []
created_date: '2026-03-07 01:18'
updated_date: '2026-03-07 01:22'
labels: []
dependencies: []
priority: medium
ordinal: 9008
---

## Description

<!-- SECTION:DESCRIPTION:BEGIN -->

Suppress frequency highlighting for single-character hiragana or katakana tokens. Scope is frequency-only: known/N+1/JLPT behavior stays unchanged.

<!-- SECTION:DESCRIPTION:END -->

## Acceptance Criteria

<!-- AC:BEGIN -->

- [x] #1 Single-character hiragana tokens do not retain `frequencyRank`.
- [x] #2 Single-character katakana tokens do not retain `frequencyRank`.
- [x] #3 Regression coverage exists at annotation-stage and tokenizer levels.

<!-- AC:END -->

## Final Summary

<!-- SECTION:FINAL_SUMMARY:BEGIN -->

Added a frequency-only suppression rule for single-character kana tokens based on token `surface`, so bogus merged fragments like `た` and standalone one-character kana no longer keep `frequencyRank`. Regression coverage now exists both in the annotation stage and in the tokenizer path, while multi-character tokens and N+1/JLPT behavior remain unchanged.

Verification:

- `bun test src/core/services/tokenizer/annotation-stage.test.ts --timeout 20000`
- `bun test src/core/services/tokenizer.test.ts --timeout 20000`

<!-- SECTION:FINAL_SUMMARY:END -->
@@ -1861,9 +1861,9 @@ test('tokenizeSubtitle keeps parsing explicit by scanning-parser source only', a
|
||||
assert.equal(result.tokens?.[4]?.frequencyRank, 1500);
|
||||
});
|
||||
|
||||
test('tokenizeSubtitle still assigns frequency to non-known Yomitan tokens', async () => {
|
||||
test('tokenizeSubtitle still assigns frequency to non-known multi-character Yomitan tokens', async () => {
|
||||
const result = await tokenizeSubtitle(
|
||||
'小園に',
|
||||
'小園友達',
|
||||
makeDeps({
|
||||
getYomitanExt: () => ({ id: 'dummy-ext' }) as any,
|
||||
getYomitanParserWindow: () =>
|
||||
@@ -1884,9 +1884,9 @@ test('tokenizeSubtitle still assigns frequency to non-known Yomitan tokens', asy
|
||||
],
|
||||
[
|
||||
{
|
||||
text: 'に',
|
||||
reading: 'に',
|
||||
headwords: [[{ term: 'に' }]],
|
||||
text: '友達',
|
||||
reading: 'ともだち',
|
||||
headwords: [[{ term: '友達' }]],
|
||||
},
|
||||
],
|
||||
],
|
||||
@@ -1895,7 +1895,7 @@ test('tokenizeSubtitle still assigns frequency to non-known Yomitan tokens', asy
|
||||
},
|
||||
}) as unknown as Electron.BrowserWindow,
|
||||
getFrequencyDictionaryEnabled: () => true,
|
||||
getFrequencyRank: (text) => (text === '小園' ? 75 : text === 'に' ? 3000 : null),
|
||||
getFrequencyRank: (text) => (text === '小園' ? 75 : text === '友達' ? 3000 : null),
|
||||
isKnownWord: (text) => text === '小園',
|
||||
}),
|
||||
);
|
||||
@@ -2635,6 +2635,21 @@ test('tokenizeSubtitle excludes default non-independent pos2 from N+1 and freque
|
||||
assert.equal(result.tokens?.[0]?.isNPlusOneTarget, false);
|
||||
});
|
||||
|
||||
test('tokenizeSubtitle excludes single-kana merged tokens from frequency highlighting', async () => {
|
||||
const result = await tokenizeSubtitle(
|
||||
'た',
|
||||
makeDepsFromYomitanTokens([{ surface: 'た', reading: 'た', headword: 'た' }], {
|
||||
getFrequencyDictionaryEnabled: () => true,
|
||||
getFrequencyRank: (text) => (text === 'た' ? 17 : null),
|
||||
getMinSentenceWordsForNPlusOne: () => 1,
|
||||
tokenizeWithMecab: async () => null,
|
||||
}),
|
||||
);
|
||||
|
||||
assert.equal(result.tokens?.length, 1);
|
||||
assert.equal(result.tokens?.[0]?.frequencyRank, undefined);
|
||||
});
|
||||
|
||||
test('tokenizeSubtitle excludes merged function/content token from frequency highlighting but keeps N+1', async () => {
|
||||
const result = await tokenizeSubtitle(
|
||||
'になれば',
|
||||
|
||||
@@ -252,12 +252,12 @@ test('annotateTokens applies configured pos1 exclusions to both frequency and N+
|
||||
test('annotateTokens allows previously default-excluded pos1 when removed from effective set', () => {
|
||||
const tokens = [
|
||||
makeToken({
|
||||
surface: 'は',
|
||||
headword: 'は',
|
||||
surface: 'まで',
|
||||
headword: 'まで',
|
||||
partOfSpeech: PartOfSpeech.other,
|
||||
pos1: '助詞',
|
||||
startPos: 0,
|
||||
endPos: 1,
|
||||
endPos: 2,
|
||||
frequencyRank: 8,
|
||||
}),
|
||||
];
|
||||
@@ -314,6 +314,52 @@ test('annotateTokens excludes likely kana SFX tokens from frequency when POS tag
|
||||
assert.equal(result[0]?.frequencyRank, undefined);
|
||||
});
|
||||
|
||||
test('annotateTokens excludes single hiragana and katakana tokens from frequency when POS tags are missing', () => {
|
||||
const tokens = [
|
||||
makeToken({
|
||||
surface: 'た',
|
||||
reading: 'た',
|
||||
headword: 'た',
|
||||
pos1: '',
|
||||
pos2: '',
|
||||
partOfSpeech: PartOfSpeech.other,
|
||||
frequencyRank: 21,
|
||||
startPos: 0,
|
||||
endPos: 1,
|
||||
}),
|
||||
makeToken({
|
||||
surface: 'ア',
|
||||
reading: 'ア',
|
||||
headword: 'ア',
|
||||
pos1: '',
|
||||
pos2: '',
|
||||
partOfSpeech: PartOfSpeech.other,
|
||||
frequencyRank: 22,
|
||||
startPos: 1,
|
||||
endPos: 2,
|
||||
}),
|
||||
makeToken({
|
||||
surface: '山',
|
||||
reading: 'やま',
|
||||
headword: '山',
|
||||
pos1: '',
|
||||
pos2: '',
|
||||
partOfSpeech: PartOfSpeech.other,
|
||||
frequencyRank: 23,
|
||||
startPos: 2,
|
||||
endPos: 3,
|
||||
}),
|
||||
];
|
||||
|
||||
const result = annotateTokens(tokens, makeDeps(), {
|
||||
minSentenceWordsForNPlusOne: 1,
|
||||
});
|
||||
|
||||
assert.equal(result[0]?.frequencyRank, undefined);
|
||||
assert.equal(result[1]?.frequencyRank, undefined);
|
||||
assert.equal(result[2]?.frequencyRank, 23);
|
||||
});
|
||||
|
||||
test('annotateTokens keeps frequency when mecab tags classify token as content-bearing', () => {
|
||||
const tokens = [
|
||||
makeToken({
|
||||
|
||||
@@ -103,6 +103,10 @@ function isFrequencyExcludedByPos(
   pos1Exclusions: ReadonlySet<string>,
   pos2Exclusions: ReadonlySet<string>,
 ): boolean {
+  if (isSingleKanaFrequencyNoiseToken(token.surface)) {
+    return true;
+  }
+
   const normalizedPos1 = normalizePos1Tag(token.pos1);
   const hasPos1 = normalizedPos1.length > 0;
   if (isExcludedByTagSet(normalizedPos1, pos1Exclusions)) {
@@ -363,6 +367,20 @@ function isLikelyFrequencyNoiseToken(token: MergedToken): boolean {
   return false;
 }
 
+function isSingleKanaFrequencyNoiseToken(text: string | undefined): boolean {
+  if (typeof text !== 'string') {
+    return false;
+  }
+
+  const normalized = text.trim();
+  if (!normalized) {
+    return false;
+  }
+
+  const chars = [...normalized];
+  return chars.length === 1 && isKanaChar(chars[0]!);
+}
+
 function isJlptEligibleToken(token: MergedToken): boolean {
   if (token.pos1 && shouldIgnoreJlptForMecabPos1(token.pos1)) {
     return false;
|
||||
@@ -643,6 +643,175 @@ test('requestYomitanScanTokens marks grouped entries when SubMiner dictionary al
|
||||
assert.equal((result as Array<{ isNameMatch?: boolean }>)[0]?.isNameMatch, true);
|
||||
});
|
||||
|
||||
test('requestYomitanScanTokens skips fallback fragments without exact primary source matches', async () => {
|
||||
const deps = createDeps(async (script) => {
|
||||
if (script.includes('optionsGetFull')) {
|
||||
return {
|
||||
profileCurrent: 0,
|
||||
profiles: [
|
||||
{
|
||||
options: {
|
||||
scanning: { length: 40 },
|
||||
},
|
||||
},
|
||||
],
|
||||
};
|
||||
}
|
||||
|
||||
return await runInjectedYomitanScript(script, (action, params) => {
|
||||
if (action !== 'termsFind') {
|
||||
throw new Error(`unexpected action: ${action}`);
|
||||
}
|
||||
|
||||
const text = (params as { text?: string } | undefined)?.text ?? '';
|
||||
if (text.startsWith('だが ')) {
|
||||
return {
|
||||
originalTextLength: 2,
|
||||
dictionaryEntries: [
|
||||
{
|
||||
headwords: [
|
||||
{
|
||||
term: 'だが',
|
||||
reading: 'だが',
|
||||
sources: [{ originalText: 'だが', isPrimary: true, matchType: 'exact' }],
|
||||
},
|
||||
],
|
||||
},
|
||||
],
|
||||
};
|
||||
}
|
||||
if (text.startsWith('それでも')) {
|
||||
return {
|
||||
originalTextLength: 4,
|
||||
dictionaryEntries: [
|
||||
{
|
||||
headwords: [
|
||||
{
|
||||
term: 'それでも',
|
||||
reading: 'それでも',
|
||||
sources: [{ originalText: 'それでも', isPrimary: true, matchType: 'exact' }],
|
||||
},
|
||||
],
|
||||
},
|
||||
],
|
||||
};
|
||||
}
|
||||
if (text.startsWith('届かぬ')) {
|
||||
return {
|
||||
originalTextLength: 3,
|
||||
dictionaryEntries: [
|
||||
{
|
||||
headwords: [
|
||||
{
|
||||
term: '届く',
|
||||
reading: 'とどく',
|
||||
sources: [{ originalText: '届かぬ', isPrimary: true, matchType: 'exact' }],
|
||||
},
|
||||
],
|
||||
},
|
||||
],
|
||||
};
|
||||
}
|
||||
if (text.startsWith('高み')) {
|
||||
return {
|
||||
originalTextLength: 2,
|
||||
dictionaryEntries: [
|
||||
{
|
||||
headwords: [
|
||||
{
|
||||
term: '高み',
|
||||
reading: 'たかみ',
|
||||
sources: [{ originalText: '高み', isPrimary: true, matchType: 'exact' }],
|
||||
},
|
||||
],
|
||||
},
|
||||
],
|
||||
};
|
||||
}
|
||||
if (text.startsWith('があった')) {
|
||||
return {
|
||||
originalTextLength: 2,
|
||||
dictionaryEntries: [
|
||||
{
|
||||
headwords: [
|
||||
{
|
||||
term: 'があ',
|
||||
reading: '',
|
||||
sources: [{ originalText: 'が', isPrimary: true, matchType: 'exact' }],
|
||||
},
|
||||
],
|
||||
},
|
||||
],
|
||||
};
|
||||
}
|
||||
if (text.startsWith('あった')) {
|
||||
return {
|
||||
originalTextLength: 3,
|
||||
dictionaryEntries: [
|
||||
{
|
||||
headwords: [
|
||||
{
|
||||
term: 'ある',
|
||||
reading: 'ある',
|
||||
sources: [{ originalText: 'あった', isPrimary: true, matchType: 'exact' }],
|
||||
},
|
||||
],
|
||||
},
|
||||
],
|
||||
};
|
||||
}
|
||||
return { originalTextLength: 0, dictionaryEntries: [] };
|
||||
});
|
||||
});
|
||||
|
||||
const result = await requestYomitanScanTokens(
|
||||
'だが それでも届かぬ高みがあった',
|
||||
deps,
|
||||
{ error: () => undefined },
|
||||
);
|
||||
|
||||
assert.deepEqual(
|
||||
result?.map((token) => ({
|
||||
surface: token.surface,
|
||||
headword: token.headword,
|
||||
startPos: token.startPos,
|
||||
endPos: token.endPos,
|
||||
})),
|
||||
[
|
||||
{
|
||||
surface: 'だが',
|
||||
headword: 'だが',
|
||||
startPos: 0,
|
||||
endPos: 2,
|
||||
},
|
||||
{
|
||||
surface: 'それでも',
|
||||
headword: 'それでも',
|
||||
startPos: 3,
|
||||
endPos: 7,
|
||||
},
|
||||
{
|
||||
surface: '届かぬ',
|
||||
headword: '届く',
|
||||
startPos: 7,
|
||||
endPos: 10,
|
||||
},
|
||||
{
|
||||
surface: '高み',
|
||||
headword: '高み',
|
||||
startPos: 10,
|
||||
endPos: 12,
|
||||
},
|
||||
{
|
||||
surface: 'あった',
|
||||
headword: 'ある',
|
||||
startPos: 13,
|
||||
endPos: 16,
|
||||
},
|
||||
],
|
||||
);
|
||||
});
|
||||
|
||||
test('getYomitanDictionaryInfo requests dictionary info via backend action', async () => {
|
||||
let scriptValue = '';
|
||||
const deps = createDeps(async (script) => {
|
||||
|
||||
@@ -843,14 +843,7 @@ const YOMITAN_SCANNING_HELPERS = String.raw`
       };
     }
   }
-  const fallback = dictionaryEntries?.[0]?.headwords?.[0];
-  return fallback
-    ? {
-        term: fallback.term,
-        reading: fallback.reading,
-        isNameMatch: matchedNameDictionary || isNameDictionaryEntry(dictionaryEntries?.[0])
-      }
-    : null;
+  return null;
 }
`;
|
||||
|
||||
|
||||
Reference in New Issue
Block a user