mirror of
https://github.com/ksyasuda/SubMiner.git
synced 2026-05-04 00:41:33 -07:00
fix: preserve ordinal frequency annotations
This commit is contained in:
+60
@@ -0,0 +1,60 @@
|
|||||||
|
---
|
||||||
|
id: TASK-332
|
||||||
|
title: Fix subtitle frequency annotation missing ranks shown in Yomitan popup
|
||||||
|
status: Done
|
||||||
|
assignee:
|
||||||
|
- Codex
|
||||||
|
created_date: '2026-05-04 03:29'
|
||||||
|
updated_date: '2026-05-04 03:41'
|
||||||
|
labels:
|
||||||
|
- bug
|
||||||
|
- tokenizer
|
||||||
|
dependencies: []
|
||||||
|
priority: medium
|
||||||
|
---
|
||||||
|
|
||||||
|
## Description
|
||||||
|
|
||||||
|
<!-- SECTION:DESCRIPTION:BEGIN -->
|
||||||
|
Subtitle frequency highlighting can miss a token even when the Yomitan popup shows a rank within the configured threshold. Reproduced with `第二走者とアンカーは\n中継地点に速やかに移動!`: Yomitan popup shows `第二` JPDB rank 1820, but SubMiner tokenizer output has no `frequencyRank` for `第二`, so renderer cannot annotate it.
|
||||||
|
<!-- SECTION:DESCRIPTION:END -->
|
||||||
|
|
||||||
|
## Acceptance Criteria
|
||||||
|
<!-- AC:BEGIN -->
|
||||||
|
- [x] #1 `第二` in `第二走者とアンカーは\n中継地点に速やかに移動!` receives the Yomitan rank shown by the popup when frequency highlighting is enabled.
|
||||||
|
- [x] #2 Regression test covers the Yomitan scan/frequency ingestion path for exact popup-derived ranks.
|
||||||
|
- [x] #3 Existing tokenizer frequency tests continue to pass.
|
||||||
|
<!-- AC:END -->
|
||||||
|
|
||||||
|
## Implementation Plan
|
||||||
|
|
||||||
|
<!-- SECTION:PLAN:BEGIN -->
|
||||||
|
1. Reproduce and inspect the missing `第二` rank path with tokenizer probes and focused tests.
|
||||||
|
2. Preserve exact Yomitan scan frequency ranks when the matching frequency entry omits reading metadata but has the same exact term.
|
||||||
|
3. Allow ranked ordinal prefix-noun tokens (`第` + numeric noun, e.g. `第二`) through annotation POS filtering while keeping standalone prefixes excluded.
|
||||||
|
4. Verify with focused tokenizer/runtime/annotation tests, typecheck, changelog lint, and a live-style Yomitan profile probe.
|
||||||
|
<!-- SECTION:PLAN:END -->
|
||||||
|
|
||||||
|
## Implementation Notes
|
||||||
|
|
||||||
|
<!-- SECTION:NOTES:BEGIN -->
|
||||||
|
Root-cause probe against temp copy of Yomitan profile: tokenizer returns no frequencyRank for `第二`; renderer config `topX` is 10000, so render threshold is not the blocker.
|
||||||
|
|
||||||
|
User approved implementation plan on 2026-05-04.
|
||||||
|
|
||||||
|
Verification: `bun test src/core/services/tokenizer.test.ts src/core/services/tokenizer/yomitan-parser-runtime.test.ts src/core/services/tokenizer/annotation-stage.test.ts` passed (192 tests).
|
||||||
|
|
||||||
|
Verification: `bun run typecheck` passed.
|
||||||
|
|
||||||
|
Verification: `bun run changelog:lint` passed.
|
||||||
|
|
||||||
|
Verification: `bun run get-frequency:electron -- --yomitan-user-data /tmp/subminer-yomitan-probe-909423 "第二走者とアンカーは\\n中継地点に速やかに移動!"` produced `第二` with `frequencyRank: 1820`.
|
||||||
|
|
||||||
|
Finalization check: implementation plan updated to reflect the discovered POS-filter root cause and completed solution.
|
||||||
|
<!-- SECTION:NOTES:END -->
|
||||||
|
|
||||||
|
## Final Summary
|
||||||
|
|
||||||
|
<!-- SECTION:FINAL_SUMMARY:BEGIN -->
|
||||||
|
Fixed subtitle frequency annotation for `第二` by allowing ranked ordinal prefix-noun compounds through annotation POS filtering. Also made scan rank matching tolerate exact frequency entries where one side omits reading metadata. Verified with tokenizer/runtime/annotation tests, typecheck, changelog lint, and a live-style Yomitan profile probe showing `第二` now receives frequencyRank 1820.
|
||||||
|
<!-- SECTION:FINAL_SUMMARY:END -->
|
||||||
@@ -0,0 +1,4 @@
|
|||||||
|
type: fixed
|
||||||
|
area: overlay
|
||||||
|
|
||||||
|
- Overlay: Fixed frequency highlighting for ordinal prefix-noun tokens like `第二` so popup ranks such as JPDB 1820 are preserved in subtitle annotations.
|
||||||
@@ -4077,6 +4077,69 @@ test('tokenizeSubtitle keeps frequency for content-led merged token with trailin
|
|||||||
assert.equal(result.tokens?.[0]?.frequencyRank, 5468);
|
assert.equal(result.tokens?.[0]?.frequencyRank, 5468);
|
||||||
});
|
});
|
||||||
|
|
||||||
|
// Regression: Yomitan yields `第二` as one token while MeCab splits it into
// `第` (pos1 接頭詞 / pos2 数接続) and `二` (pos1 名詞 / pos2 数). The merged token
// must still carry the frequency rank reported for `第二`.
test('tokenizeSubtitle keeps frequency for ordinal prefix-noun tokens', async () => {
  // Flags shared by every MeCab token in this fixture.
  const plainTokenFlags = { isMerged: false, isKnown: false, isNPlusOneTarget: false };

  const deps = makeDepsFromYomitanTokens(
    [
      { surface: '第二', reading: 'だいに', headword: '第二' },
      { surface: '走者', reading: 'そうしゃ', headword: '走者' },
    ],
    {
      getFrequencyDictionaryEnabled: () => true,
      getFrequencyRank: (text) => {
        if (text === '第二') {
          return 1820;
        }
        if (text === '走者') {
          return 41555;
        }
        return null;
      },
      // A fresh array is produced on every call, exactly like an inline literal.
      tokenizeWithMecab: async () => [
        {
          headword: '第',
          surface: '第',
          reading: 'ダイ',
          startPos: 0,
          endPos: 1,
          partOfSpeech: PartOfSpeech.other,
          pos1: '接頭詞',
          pos2: '数接続',
          ...plainTokenFlags,
        },
        {
          headword: '二',
          surface: '二',
          reading: 'ニ',
          startPos: 1,
          endPos: 2,
          partOfSpeech: PartOfSpeech.noun,
          pos1: '名詞',
          pos2: '数',
          ...plainTokenFlags,
        },
        {
          headword: '走者',
          surface: '走者',
          reading: 'ソウシャ',
          startPos: 2,
          endPos: 4,
          partOfSpeech: PartOfSpeech.noun,
          pos1: '名詞',
          pos2: '一般',
          ...plainTokenFlags,
        },
      ],
      getMinSentenceWordsForNPlusOne: () => 1,
    },
  );

  const result = await tokenizeSubtitle('第二走者', deps);
  const leadingToken = result.tokens?.[0];

  assert.equal(leadingToken?.surface, '第二');
  assert.equal(leadingToken?.pos1, '接頭詞|名詞');
  assert.equal(leadingToken?.pos2, '数接続|数');
  assert.equal(leadingToken?.frequencyRank, 1820);
});
|
||||||
|
|
||||||
test('tokenizeSubtitle clears all annotations for explanatory contrast endings', async () => {
|
test('tokenizeSubtitle clears all annotations for explanatory contrast endings', async () => {
|
||||||
const result = await tokenizeSubtitle(
|
const result = await tokenizeSubtitle(
|
||||||
'最近辛いものが続いとるんですけど',
|
'最近辛いものが続いとるんですけど',
|
||||||
|
|||||||
@@ -149,6 +149,24 @@ function shouldAllowContentLedMergedTokenFrequency(
|
|||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Reports whether a merged token looks like an ordinal prefix-noun compound
 * (for example `第二`) that should keep its frequency rank.
 *
 * The token qualifies only when:
 *  - its trimmed surface or trimmed headword begins with `第`,
 *  - the first pos1 part is `接頭詞` and some later pos1 part is `名詞`,
 *  - the first pos2 part is `数接続` and some later pos2 part is `数`.
 *
 * @param token Merged token to inspect; reads `surface`, `headword`, `pos1`, `pos2`.
 * @returns `true` when every condition above holds, otherwise `false`.
 */
function shouldAllowOrdinalPrefixNounFrequency(token: MergedToken): boolean {
  const surfaceText = token.surface.trim();
  const headwordText = token.headword.trim();
  const startsWithOrdinalMarker = surfaceText.startsWith('第') || headwordText.startsWith('第');
  if (!startsWithOrdinalMarker) {
    return false;
  }

  const pos1Parts = splitNormalizedTagParts(normalizePos1Tag(token.pos1));
  const pos2Parts = splitNormalizedTagParts(normalizePos2Tag(token.pos2));

  // A lone prefix (single pos1 part) must stay excluded.
  if (pos1Parts.length < 2 || pos1Parts[0] !== '接頭詞') {
    return false;
  }
  // Search from index 1 so the leading prefix tag itself never satisfies the check.
  if (!pos1Parts.includes('名詞', 1)) {
    return false;
  }
  if (pos2Parts[0] !== '数接続') {
    return false;
  }
  return pos2Parts.includes('数', 1);
}
|
||||||
|
|
||||||
function isFrequencyExcludedByPos(
|
function isFrequencyExcludedByPos(
|
||||||
token: MergedToken,
|
token: MergedToken,
|
||||||
pos1Exclusions: ReadonlySet<string>,
|
pos1Exclusions: ReadonlySet<string>,
|
||||||
@@ -168,12 +186,21 @@ function isFrequencyExcludedByPos(
|
|||||||
pos1Exclusions,
|
pos1Exclusions,
|
||||||
pos2Exclusions,
|
pos2Exclusions,
|
||||||
);
|
);
|
||||||
|
const allowOrdinalPrefixNounToken = shouldAllowOrdinalPrefixNounFrequency(token);
|
||||||
|
|
||||||
if (isExcludedByTagSet(normalizedPos1, pos1Exclusions) && !allowContentLedMergedToken) {
|
if (
|
||||||
|
isExcludedByTagSet(normalizedPos1, pos1Exclusions) &&
|
||||||
|
!allowContentLedMergedToken &&
|
||||||
|
!allowOrdinalPrefixNounToken
|
||||||
|
) {
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (isExcludedByTagSet(normalizedPos2, pos2Exclusions) && !allowContentLedMergedToken) {
|
if (
|
||||||
|
isExcludedByTagSet(normalizedPos2, pos2Exclusions) &&
|
||||||
|
!allowContentLedMergedToken &&
|
||||||
|
!allowOrdinalPrefixNounToken
|
||||||
|
) {
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -891,6 +891,105 @@ test('requestYomitanScanTokens can use frequency from later exact secondary-matc
|
|||||||
]);
|
]);
|
||||||
});
|
});
|
||||||
|
|
||||||
|
// Regression: the selected (primary) entry for `第二` has reading `だいに` and no
// frequencies, while a second exact, non-primary entry has an empty reading and
// carries the JPDB frequency. The scan result must still report rank 1820.
test('requestYomitanScanTokens uses exact frequency entry when selected reading differs', async () => {
  const dictionaryNames = ['JPDBv2㋕', 'Jiten', 'CC100'];

  // Options payload served for the `optionsGetFull` script; rebuilt per call.
  const buildOptionsFull = () => ({
    profileCurrent: 0,
    profileIndex: 0,
    scanLength: 40,
    dictionaries: [...dictionaryNames],
    dictionaryPriorityByName: {
      'JPDBv2㋕': 0,
      Jiten: 1,
      CC100: 2,
    },
    dictionaryFrequencyModeByName: {
      'JPDBv2㋕': 'rank-based',
      Jiten: 'rank-based',
      CC100: 'rank-based',
    },
    profiles: [
      {
        options: {
          scanning: { length: 40 },
          dictionaries: dictionaryNames.map((name, id) => ({ name, enabled: true, id })),
        },
      },
    ],
  });

  // Captures the injected scanner script so it can be replayed below.
  let scannerScript = '';
  const deps = createDeps(async (script) => {
    if (script.includes('termsFind')) {
      scannerScript = script;
      return [];
    }
    return script.includes('optionsGetFull') ? buildOptionsFull() : null;
  });

  await requestYomitanScanTokens('第二走者', deps, {
    error: () => undefined,
  });

  // Builds one exact-match headword for `第二`.
  const exactHeadword = (reading: string, isPrimary: boolean) => ({
    term: '第二',
    reading,
    sources: [{ originalText: '第二', isPrimary, matchType: 'exact' }],
  });

  const result = (await runInjectedYomitanScript(scannerScript, (action, params) => {
    if (action !== 'termsFind') {
      throw new Error(`unexpected action: ${action}`);
    }

    const text = (params as { text?: string } | undefined)?.text ?? '';
    if (!text.startsWith('第二')) {
      return { originalTextLength: 0, dictionaryEntries: [] };
    }

    return {
      originalTextLength: 2,
      dictionaryEntries: [
        // Primary entry: has a reading, but no frequency data.
        {
          headwords: [exactHeadword('だいに', true)],
          frequencies: [],
        },
        // Secondary entry: reading omitted, frequency data present.
        {
          headwords: [exactHeadword('', false)],
          frequencies: [
            {
              headwordIndex: 0,
              dictionary: 'JPDBv2㋕',
              frequency: 189513,
              // NOTE(review): expected rank 1820 presumably comes from the leading
              // number of this display value; confirm against the scanner helpers.
              displayValue: '1820,189513句',
            },
          ],
        },
      ],
    };
  })) as Array<Record<string, unknown>>;

  assert.deepEqual(result?.[0], {
    surface: '第二',
    reading: 'だいに',
    headword: '第二',
    startPos: 0,
    endPos: 2,
    isNameMatch: false,
    frequencyRank: 1820,
  });
});
|
||||||
|
|
||||||
test('requestYomitanScanTokens marks tokens backed by SubMiner character dictionary entries', async () => {
|
test('requestYomitanScanTokens marks tokens backed by SubMiner character dictionary entries', async () => {
|
||||||
const deps = createDeps(async (script) => {
|
const deps = createDeps(async (script) => {
|
||||||
if (script.includes('optionsGetFull')) {
|
if (script.includes('optionsGetFull')) {
|
||||||
|
|||||||
@@ -960,6 +960,9 @@ const YOMITAN_SCANNING_HELPERS = String.raw`
|
|||||||
const matchReading = typeof match.headword?.reading === 'string' ? match.headword.reading : '';
|
const matchReading = typeof match.headword?.reading === 'string' ? match.headword.reading : '';
|
||||||
const preferredReading =
|
const preferredReading =
|
||||||
typeof preferredMatch.headword?.reading === 'string' ? preferredMatch.headword.reading : '';
|
typeof preferredMatch.headword?.reading === 'string' ? preferredMatch.headword.reading : '';
|
||||||
|
if (!matchReading || !preferredReading) {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
return matchReading === preferredReading;
|
return matchReading === preferredReading;
|
||||||
}
|
}
|
||||||
function getBestFrequencyRankForMatches(matches, dictionaryPriorityByName, dictionaryFrequencyModeByName) {
|
function getBestFrequencyRankForMatches(matches, dictionaryPriorityByName, dictionaryFrequencyModeByName) {
|
||||||
|
|||||||
Reference in New Issue
Block a user