mirror of
https://github.com/ksyasuda/SubMiner.git
synced 2026-03-07 03:22:17 -08:00
fix(subtitle): tighten frequency token filtering
This commit is contained in:
@@ -0,0 +1,42 @@
|
|||||||
|
---
|
||||||
|
id: TASK-107
|
||||||
|
title: 'Fix Yomitan scan-token fallback fragmentation on exact-source misses'
|
||||||
|
status: Done
|
||||||
|
assignee: []
|
||||||
|
created_date: '2026-03-07 01:10'
|
||||||
|
updated_date: '2026-03-07 01:12'
|
||||||
|
labels: []
|
||||||
|
dependencies: []
|
||||||
|
priority: high
|
||||||
|
ordinal: 9007
|
||||||
|
---
|
||||||
|
|
||||||
|
## Description
|
||||||
|
|
||||||
|
<!-- SECTION:DESCRIPTION:BEGIN -->
|
||||||
|
|
||||||
|
Left-to-right Yomitan scanning can emit bogus fallback tokens when `termsFind` returns entries but none of their headwords carries an exact primary source for the consumed substring. Repro: `だが それでも届かぬ高みがあった` currently yields trailing fragments like `があ` / `た`, which blocks the real `あった` token from receiving frequency highlighting.
|
||||||
|
|
||||||
|
<!-- SECTION:DESCRIPTION:END -->
|
||||||
|
|
||||||
|
## Acceptance Criteria
|
||||||
|
|
||||||
|
<!-- AC:BEGIN -->
|
||||||
|
|
||||||
|
- [x] #1 Scanner skips `termsFind` fallback entries that are not backed by an exact primary source for the consumed substring.
|
||||||
|
- [x] #2 Repro line no longer yields bogus trailing fragments such as `があ`.
|
||||||
|
- [x] #3 Regression coverage added for the scan-token path.
|
||||||
|
|
||||||
|
<!-- AC:END -->
|
||||||
|
|
||||||
|
## Final Summary
|
||||||
|
|
||||||
|
<!-- SECTION:FINAL_SUMMARY:BEGIN -->
|
||||||
|
|
||||||
|
Removed the scan-token helper fallback that previously emitted a token from the first returned headword even when Yomitan did not report an exact primary source for the consumed substring. Added a focused regression test covering `だが それでも届かぬ高みがあった`, ensuring bogus `があ` fragmentation is skipped so the later `あった` exact match can still be tokenized and highlighted.
|
||||||
|
|
||||||
|
Verification:
|
||||||
|
|
||||||
|
- `bun test src/core/services/tokenizer/yomitan-parser-runtime.test.ts src/core/services/tokenizer.test.ts --timeout 20000`
|
||||||
|
|
||||||
|
<!-- SECTION:FINAL_SUMMARY:END -->
|
||||||
@@ -0,0 +1,43 @@
|
|||||||
|
---
|
||||||
|
id: TASK-108
|
||||||
|
title: 'Exclude single kana tokens from frequency highlighting'
|
||||||
|
status: Done
|
||||||
|
assignee: []
|
||||||
|
created_date: '2026-03-07 01:18'
|
||||||
|
updated_date: '2026-03-07 01:22'
|
||||||
|
labels: []
|
||||||
|
dependencies: []
|
||||||
|
priority: medium
|
||||||
|
ordinal: 9008
|
||||||
|
---
|
||||||
|
|
||||||
|
## Description
|
||||||
|
|
||||||
|
<!-- SECTION:DESCRIPTION:BEGIN -->
|
||||||
|
|
||||||
|
Suppress frequency highlighting for single-character hiragana or katakana tokens. Scope is frequency-only: known/N+1/JLPT behavior stays unchanged.
|
||||||
|
|
||||||
|
<!-- SECTION:DESCRIPTION:END -->
|
||||||
|
|
||||||
|
## Acceptance Criteria
|
||||||
|
|
||||||
|
<!-- AC:BEGIN -->
|
||||||
|
|
||||||
|
- [x] #1 Single-character hiragana tokens do not retain `frequencyRank`.
|
||||||
|
- [x] #2 Single-character katakana tokens do not retain `frequencyRank`.
|
||||||
|
- [x] #3 Regression coverage exists at annotation-stage and tokenizer levels.
|
||||||
|
|
||||||
|
<!-- AC:END -->
|
||||||
|
|
||||||
|
## Final Summary
|
||||||
|
|
||||||
|
<!-- SECTION:FINAL_SUMMARY:BEGIN -->
|
||||||
|
|
||||||
|
Added a frequency-only suppression rule for single-character kana tokens based on token `surface`, so bogus merged fragments like `た` and standalone one-character kana no longer keep `frequencyRank`. Regression coverage now exists both in the annotation stage and in the tokenizer path, while multi-character tokens and N+1/JLPT behavior remain unchanged.
|
||||||
|
|
||||||
|
Verification:
|
||||||
|
|
||||||
|
- `bun test src/core/services/tokenizer/annotation-stage.test.ts --timeout 20000`
|
||||||
|
- `bun test src/core/services/tokenizer.test.ts --timeout 20000`
|
||||||
|
|
||||||
|
<!-- SECTION:FINAL_SUMMARY:END -->
|
||||||
@@ -1861,9 +1861,9 @@ test('tokenizeSubtitle keeps parsing explicit by scanning-parser source only', a
|
|||||||
assert.equal(result.tokens?.[4]?.frequencyRank, 1500);
|
assert.equal(result.tokens?.[4]?.frequencyRank, 1500);
|
||||||
});
|
});
|
||||||
|
|
||||||
test('tokenizeSubtitle still assigns frequency to non-known Yomitan tokens', async () => {
|
test('tokenizeSubtitle still assigns frequency to non-known multi-character Yomitan tokens', async () => {
|
||||||
const result = await tokenizeSubtitle(
|
const result = await tokenizeSubtitle(
|
||||||
'小園に',
|
'小園友達',
|
||||||
makeDeps({
|
makeDeps({
|
||||||
getYomitanExt: () => ({ id: 'dummy-ext' }) as any,
|
getYomitanExt: () => ({ id: 'dummy-ext' }) as any,
|
||||||
getYomitanParserWindow: () =>
|
getYomitanParserWindow: () =>
|
||||||
@@ -1884,9 +1884,9 @@ test('tokenizeSubtitle still assigns frequency to non-known Yomitan tokens', asy
|
|||||||
],
|
],
|
||||||
[
|
[
|
||||||
{
|
{
|
||||||
text: 'に',
|
text: '友達',
|
||||||
reading: 'に',
|
reading: 'ともだち',
|
||||||
headwords: [[{ term: 'に' }]],
|
headwords: [[{ term: '友達' }]],
|
||||||
},
|
},
|
||||||
],
|
],
|
||||||
],
|
],
|
||||||
@@ -1895,7 +1895,7 @@ test('tokenizeSubtitle still assigns frequency to non-known Yomitan tokens', asy
|
|||||||
},
|
},
|
||||||
}) as unknown as Electron.BrowserWindow,
|
}) as unknown as Electron.BrowserWindow,
|
||||||
getFrequencyDictionaryEnabled: () => true,
|
getFrequencyDictionaryEnabled: () => true,
|
||||||
getFrequencyRank: (text) => (text === '小園' ? 75 : text === 'に' ? 3000 : null),
|
getFrequencyRank: (text) => (text === '小園' ? 75 : text === '友達' ? 3000 : null),
|
||||||
isKnownWord: (text) => text === '小園',
|
isKnownWord: (text) => text === '小園',
|
||||||
}),
|
}),
|
||||||
);
|
);
|
||||||
@@ -2635,6 +2635,21 @@ test('tokenizeSubtitle excludes default non-independent pos2 from N+1 and freque
|
|||||||
assert.equal(result.tokens?.[0]?.isNPlusOneTarget, false);
|
assert.equal(result.tokens?.[0]?.isNPlusOneTarget, false);
|
||||||
});
|
});
|
||||||
|
|
||||||
|
// Regression: a one-character kana token produced from Yomitan tokens must not
// keep a frequency rank, even though the frequency lookup has a rank for it.
test('tokenizeSubtitle excludes single-kana merged tokens from frequency highlighting', async () => {
  const kana = 'た';

  const deps = makeDepsFromYomitanTokens([{ surface: kana, reading: kana, headword: kana }], {
    getFrequencyDictionaryEnabled: () => true,
    // The lookup does report a rank (17) for the kana; the tokenizer is expected to drop it.
    getFrequencyRank: (text) => (text === kana ? 17 : null),
    getMinSentenceWordsForNPlusOne: () => 1,
    // MeCab stub resolves to null for this test.
    tokenizeWithMecab: async () => null,
  });

  const { tokens } = await tokenizeSubtitle(kana, deps);

  assert.equal(tokens?.length, 1);
  assert.equal(tokens?.[0]?.frequencyRank, undefined);
});
|
||||||
|
|
||||||
test('tokenizeSubtitle excludes merged function/content token from frequency highlighting but keeps N+1', async () => {
|
test('tokenizeSubtitle excludes merged function/content token from frequency highlighting but keeps N+1', async () => {
|
||||||
const result = await tokenizeSubtitle(
|
const result = await tokenizeSubtitle(
|
||||||
'になれば',
|
'になれば',
|
||||||
|
|||||||
@@ -252,12 +252,12 @@ test('annotateTokens applies configured pos1 exclusions to both frequency and N+
|
|||||||
test('annotateTokens allows previously default-excluded pos1 when removed from effective set', () => {
|
test('annotateTokens allows previously default-excluded pos1 when removed from effective set', () => {
|
||||||
const tokens = [
|
const tokens = [
|
||||||
makeToken({
|
makeToken({
|
||||||
surface: 'は',
|
surface: 'まで',
|
||||||
headword: 'は',
|
headword: 'まで',
|
||||||
partOfSpeech: PartOfSpeech.other,
|
partOfSpeech: PartOfSpeech.other,
|
||||||
pos1: '助詞',
|
pos1: '助詞',
|
||||||
startPos: 0,
|
startPos: 0,
|
||||||
endPos: 1,
|
endPos: 2,
|
||||||
frequencyRank: 8,
|
frequencyRank: 8,
|
||||||
}),
|
}),
|
||||||
];
|
];
|
||||||
@@ -314,6 +314,52 @@ test('annotateTokens excludes likely kana SFX tokens from frequency when POS tag
|
|||||||
assert.equal(result[0]?.frequencyRank, undefined);
|
assert.equal(result[0]?.frequencyRank, undefined);
|
||||||
});
|
});
|
||||||
|
|
||||||
|
// Regression: with empty pos1/pos2 tags, a single hiragana and a single katakana
// token lose their frequencyRank, while a single kanji token keeps its rank.
test('annotateTokens excludes single hiragana and katakana tokens from frequency when POS tags are missing', () => {
  // [surface (also used as headword), reading, incoming frequencyRank]
  const specs: ReadonlyArray<readonly [string, string, number]> = [
    ['た', 'た', 21],
    ['ア', 'ア', 22],
    ['山', 'やま', 23],
  ];

  // Each token is one character wide and placed back to back: [0,1), [1,2), [2,3).
  const tokens = specs.map(([surface, reading, frequencyRank], index) =>
    makeToken({
      surface,
      reading,
      headword: surface,
      pos1: '',
      pos2: '',
      partOfSpeech: PartOfSpeech.other,
      frequencyRank,
      startPos: index,
      endPos: index + 1,
    }),
  );

  const annotated = annotateTokens(tokens, makeDeps(), {
    minSentenceWordsForNPlusOne: 1,
  });

  const [hiragana, katakana, kanji] = [annotated[0], annotated[1], annotated[2]];
  assert.equal(hiragana?.frequencyRank, undefined);
  assert.equal(katakana?.frequencyRank, undefined);
  assert.equal(kanji?.frequencyRank, 23);
});
|
||||||
|
|
||||||
test('annotateTokens keeps frequency when mecab tags classify token as content-bearing', () => {
|
test('annotateTokens keeps frequency when mecab tags classify token as content-bearing', () => {
|
||||||
const tokens = [
|
const tokens = [
|
||||||
makeToken({
|
makeToken({
|
||||||
|
|||||||
@@ -103,6 +103,10 @@ function isFrequencyExcludedByPos(
|
|||||||
pos1Exclusions: ReadonlySet<string>,
|
pos1Exclusions: ReadonlySet<string>,
|
||||||
pos2Exclusions: ReadonlySet<string>,
|
pos2Exclusions: ReadonlySet<string>,
|
||||||
): boolean {
|
): boolean {
|
||||||
|
if (isSingleKanaFrequencyNoiseToken(token.surface)) {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
const normalizedPos1 = normalizePos1Tag(token.pos1);
|
const normalizedPos1 = normalizePos1Tag(token.pos1);
|
||||||
const hasPos1 = normalizedPos1.length > 0;
|
const hasPos1 = normalizedPos1.length > 0;
|
||||||
if (isExcludedByTagSet(normalizedPos1, pos1Exclusions)) {
|
if (isExcludedByTagSet(normalizedPos1, pos1Exclusions)) {
|
||||||
@@ -363,6 +367,20 @@ function isLikelyFrequencyNoiseToken(token: MergedToken): boolean {
|
|||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Reports whether `text`, after trimming surrounding whitespace, consists of
 * exactly one character that `isKanaChar` accepts.
 *
 * Characters are counted by string iteration (code points), not UTF-16 units.
 *
 * @param text - Token surface to inspect; non-string values are never a match.
 * @returns `true` only for a single-character string accepted by `isKanaChar`;
 *   `false` for non-strings, empty/whitespace-only input, and longer strings.
 */
function isSingleKanaFrequencyNoiseToken(text: string | undefined): boolean {
  if (typeof text !== 'string') {
    return false;
  }

  // Destructuring a string walks its iterator, so `first` is the first code
  // point and `rest` holds everything after it.
  const [first, ...rest] = text.trim();

  // `first` is undefined when the trimmed text is empty.
  if (first === undefined || rest.length > 0) {
    return false;
  }

  return isKanaChar(first);
}
|
||||||
|
|
||||||
function isJlptEligibleToken(token: MergedToken): boolean {
|
function isJlptEligibleToken(token: MergedToken): boolean {
|
||||||
if (token.pos1 && shouldIgnoreJlptForMecabPos1(token.pos1)) {
|
if (token.pos1 && shouldIgnoreJlptForMecabPos1(token.pos1)) {
|
||||||
return false;
|
return false;
|
||||||
|
|||||||
@@ -643,6 +643,175 @@ test('requestYomitanScanTokens marks grouped entries when SubMiner dictionary al
|
|||||||
assert.equal((result as Array<{ isNameMatch?: boolean }>)[0]?.isNameMatch, true);
|
assert.equal((result as Array<{ isNameMatch?: boolean }>)[0]?.isNameMatch, true);
|
||||||
});
|
});
|
||||||
|
|
||||||
|
// Regression for the scan-token path: a termsFind result whose headword has no
// source matching the consumed substring must not produce a token.
test('requestYomitanScanTokens skips fallback fragments without exact primary source matches', async () => {
  // Canned termsFind answers, matched in order against the start of the text
  // being scanned. `consumed` becomes originalTextLength; `sourceText` is the
  // originalText of the headword's single primary/exact source.
  const cannedLookups = [
    { prefix: 'だが ', consumed: 2, term: 'だが', reading: 'だが', sourceText: 'だが' },
    { prefix: 'それでも', consumed: 4, term: 'それでも', reading: 'それでも', sourceText: 'それでも' },
    { prefix: '届かぬ', consumed: 3, term: '届く', reading: 'とどく', sourceText: '届かぬ' },
    { prefix: '高み', consumed: 2, term: '高み', reading: 'たかみ', sourceText: '高み' },
    // The bogus case: two characters reported as consumed for headword 'があ',
    // but the only source text is 'が'. No token is expected for it below.
    { prefix: 'があった', consumed: 2, term: 'があ', reading: '', sourceText: 'が' },
    { prefix: 'あった', consumed: 3, term: 'ある', reading: 'ある', sourceText: 'あった' },
  ];

  const deps = createDeps(async (script) => {
    // Options request: answer with a single profile whose scan length is 40.
    if (script.includes('optionsGetFull')) {
      return {
        profileCurrent: 0,
        profiles: [{ options: { scanning: { length: 40 } } }],
      };
    }

    return await runInjectedYomitanScript(script, (action, params) => {
      if (action !== 'termsFind') {
        throw new Error(`unexpected action: ${action}`);
      }

      const text = (params as { text?: string } | undefined)?.text ?? '';
      const hit = cannedLookups.find((lookup) => text.startsWith(lookup.prefix));
      if (!hit) {
        return { originalTextLength: 0, dictionaryEntries: [] };
      }

      return {
        originalTextLength: hit.consumed,
        dictionaryEntries: [
          {
            headwords: [
              {
                term: hit.term,
                reading: hit.reading,
                sources: [{ originalText: hit.sourceText, isPrimary: true, matchType: 'exact' }],
              },
            ],
          },
        ],
      };
    });
  });

  const scanned = await requestYomitanScanTokens('だが それでも届かぬ高みがあった', deps, {
    error: () => undefined,
  });

  const summary = scanned?.map(({ surface, headword, startPos, endPos }) => ({
    surface,
    headword,
    startPos,
    endPos,
  }));

  // Nothing is emitted for position 12 ('が'); 'あった' is picked up at 13.
  assert.deepEqual(summary, [
    { surface: 'だが', headword: 'だが', startPos: 0, endPos: 2 },
    { surface: 'それでも', headword: 'それでも', startPos: 3, endPos: 7 },
    { surface: '届かぬ', headword: '届く', startPos: 7, endPos: 10 },
    { surface: '高み', headword: '高み', startPos: 10, endPos: 12 },
    { surface: 'あった', headword: 'ある', startPos: 13, endPos: 16 },
  ]);
});
|
||||||
|
|
||||||
test('getYomitanDictionaryInfo requests dictionary info via backend action', async () => {
|
test('getYomitanDictionaryInfo requests dictionary info via backend action', async () => {
|
||||||
let scriptValue = '';
|
let scriptValue = '';
|
||||||
const deps = createDeps(async (script) => {
|
const deps = createDeps(async (script) => {
|
||||||
|
|||||||
@@ -843,14 +843,7 @@ const YOMITAN_SCANNING_HELPERS = String.raw`
|
|||||||
};
|
};
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
const fallback = dictionaryEntries?.[0]?.headwords?.[0];
|
return null;
|
||||||
return fallback
|
|
||||||
? {
|
|
||||||
term: fallback.term,
|
|
||||||
reading: fallback.reading,
|
|
||||||
isNameMatch: matchedNameDictionary || isNameDictionaryEntry(dictionaryEntries?.[0])
|
|
||||||
}
|
|
||||||
: null;
|
|
||||||
}
|
}
|
||||||
`;
|
`;
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user