diff --git a/backlog/tasks/task-332 - Fix-subtitle-frequency-annotation-missing-ranks-shown-in-Yomitan-popup.md b/backlog/tasks/task-332 - Fix-subtitle-frequency-annotation-missing-ranks-shown-in-Yomitan-popup.md new file mode 100644 index 00000000..53fc6882 --- /dev/null +++ b/backlog/tasks/task-332 - Fix-subtitle-frequency-annotation-missing-ranks-shown-in-Yomitan-popup.md @@ -0,0 +1,60 @@ +--- +id: TASK-332 +title: Fix subtitle frequency annotation missing ranks shown in Yomitan popup +status: Done +assignee: + - Codex +created_date: '2026-05-04 03:29' +updated_date: '2026-05-04 03:41' +labels: + - bug + - tokenizer +dependencies: [] +priority: medium +--- + +## Description + +<!-- SECTION:DESCRIPTION:BEGIN --> +Subtitle frequency highlighting can miss a token even when the Yomitan popup shows a rank within the configured threshold. Reproduced with `第二走者とアンカーは\n中継地点に速やかに移動!`: Yomitan popup shows `第二` JPDB rank 1820, but SubMiner tokenizer output has no `frequencyRank` for `第二`, so the renderer cannot annotate it. +<!-- SECTION:DESCRIPTION:END --> + +## Acceptance Criteria +<!-- AC:BEGIN --> +- [x] #1 `第二` in `第二走者とアンカーは\n中継地点に速やかに移動!` receives the Yomitan rank shown by the popup when frequency highlighting is enabled. +- [x] #2 Regression test covers the Yomitan scan/frequency ingestion path for exact popup-derived ranks. +- [x] #3 Existing tokenizer frequency tests continue to pass. +<!-- AC:END --> + +## Implementation Plan + +<!-- SECTION:PLAN:BEGIN --> +1. Reproduce and inspect the missing `第二` rank path with tokenizer probes and focused tests. +2. Preserve exact Yomitan scan frequency ranks when the matching frequency entry omits reading metadata but has the same exact term. +3. Allow ranked ordinal prefix-noun tokens (`第` + numeric noun, e.g. `第二`) through annotation POS filtering while keeping standalone prefixes excluded. +4. Verify with focused tokenizer/runtime/annotation tests, typecheck, changelog lint, and a live-style Yomitan profile probe. 
+<!-- SECTION:PLAN:END --> + +## Implementation Notes + +<!-- SECTION:NOTES:BEGIN --> +Root-cause probe against a temp copy of the Yomitan profile: tokenizer returns no frequencyRank for `第二`; renderer config `topX` is 10000, so render threshold is not the blocker. + +User approved implementation plan on 2026-05-04. + +Verification: `bun test src/core/services/tokenizer.test.ts src/core/services/tokenizer/yomitan-parser-runtime.test.ts src/core/services/tokenizer/annotation-stage.test.ts` passed (192 tests). + +Verification: `bun run typecheck` passed. + +Verification: `bun run changelog:lint` passed. + +Verification: `bun run get-frequency:electron -- --yomitan-user-data /tmp/subminer-yomitan-probe-909423 "第二走者とアンカーは\\n中継地点に速やかに移動!"` produced `第二` with `frequencyRank: 1820`. + +Finalization check: implementation plan updated to reflect the discovered POS-filter root cause and completed solution. +<!-- SECTION:NOTES:END --> + +## Final Summary + +<!-- SECTION:FINAL_SUMMARY:BEGIN --> +Fixed subtitle frequency annotation for `第二` by allowing ranked ordinal prefix-noun compounds through annotation POS filtering. Also made scan rank matching tolerate exact frequency entries where one side omits reading metadata. Verified with tokenizer/runtime/annotation tests, typecheck, changelog lint, and a live-style Yomitan profile probe showing `第二` now receives frequencyRank 1820. +<!-- SECTION:FINAL_SUMMARY:END --> diff --git a/changes/332-subtitle-frequency-ordinal-prefix.md b/changes/332-subtitle-frequency-ordinal-prefix.md new file mode 100644 index 00000000..7676cb4d --- /dev/null +++ b/changes/332-subtitle-frequency-ordinal-prefix.md @@ -0,0 +1,4 @@ +type: fixed +area: overlay + +- Overlay: Fixed frequency highlighting for ordinal prefix-noun tokens like `第二` so popup ranks such as JPDB 1820 are preserved in subtitle annotations. 
diff --git a/src/core/services/tokenizer.test.ts b/src/core/services/tokenizer.test.ts index cd59e40c..b025eb1b 100644 --- a/src/core/services/tokenizer.test.ts +++ b/src/core/services/tokenizer.test.ts @@ -4077,6 +4077,69 @@ test('tokenizeSubtitle keeps frequency for content-led merged token with trailin assert.equal(result.tokens?.[0]?.frequencyRank, 5468); }); +/** Regression test for ordinal prefix-noun tokens: the Yomitan token `第二` covers the stubbed MeCab tokens `第` (接頭詞 / 数接続) and `二` (名詞 / 数). The assertions require the first result token to carry the pipe-joined tags `接頭詞|名詞` and `数接続|数` and to keep the rank 1820 returned by the stubbed getFrequencyRank. */ test('tokenizeSubtitle keeps frequency for ordinal prefix-noun tokens', async () => { + const result = await tokenizeSubtitle( + '第二走者', + makeDepsFromYomitanTokens( + [ + { surface: '第二', reading: 'だいに', headword: '第二' }, + { surface: '走者', reading: 'そうしゃ', headword: '走者' }, + ], + { + getFrequencyDictionaryEnabled: () => true, + getFrequencyRank: (text) => (text === '第二' ? 1820 : text === '走者' ? 
41555 : null), + tokenizeWithMecab: async () => [ /* MeCab stub splits the text more finely than Yomitan: positions 0-1 and 1-2 both fall inside the Yomitan token `第二` (0-2). */ + { + headword: '第', + surface: '第', + reading: 'ダイ', + startPos: 0, + endPos: 1, + partOfSpeech: PartOfSpeech.other, + pos1: '接頭詞', + pos2: '数接続', + isMerged: false, + isKnown: false, + isNPlusOneTarget: false, + }, + { + headword: '二', + surface: '二', + reading: 'ニ', + startPos: 1, + endPos: 2, + partOfSpeech: PartOfSpeech.noun, + pos1: '名詞', + pos2: '数', + isMerged: false, + isKnown: false, + isNPlusOneTarget: false, + }, + { + headword: '走者', + surface: '走者', + reading: 'ソウシャ', + startPos: 2, + endPos: 4, + partOfSpeech: PartOfSpeech.noun, + pos1: '名詞', + pos2: '一般', + isMerged: false, + isKnown: false, + isNPlusOneTarget: false, + }, + ], + getMinSentenceWordsForNPlusOne: () => 1, + }, + ), + ); + + assert.equal(result.tokens?.[0]?.surface, '第二'); + assert.equal(result.tokens?.[0]?.pos1, '接頭詞|名詞'); + assert.equal(result.tokens?.[0]?.pos2, '数接続|数'); + assert.equal(result.tokens?.[0]?.frequencyRank, 1820); +}); + test('tokenizeSubtitle clears all annotations for explanatory contrast endings', async () => { const result = await tokenizeSubtitle( '最近辛いものが続いとるんですけど', diff --git a/src/core/services/tokenizer/annotation-stage.ts 
b/src/core/services/tokenizer/annotation-stage.ts index 543541ba..9e3fa9af 100644 --- a/src/core/services/tokenizer/annotation-stage.ts +++ b/src/core/services/tokenizer/annotation-stage.ts @@ -149,6 +149,24 @@ function shouldAllowContentLedMergedTokenFrequency( return true; } +/** Reports whether a merged token looks like an ordinal prefix-noun compound such as `第二`. The trimmed surface or headword must start with `第`; the normalized pos1 must have at least two parts, begin with `接頭詞` and contain `名詞` in a later part; the normalized pos2 must begin with `数接続` and contain `数` in a later part. @param token Merged token whose surface, headword, pos1 and pos2 are inspected. @returns true only when every condition above holds. */ function shouldAllowOrdinalPrefixNounFrequency(token: MergedToken): boolean { + const normalizedSurface = token.surface.trim(); + const normalizedHeadword = token.headword.trim(); + if (!normalizedSurface.startsWith('第') && !normalizedHeadword.startsWith('第')) { + return false; + } + + /* NOTE(review): the `名詞` and `数` parts are searched independently below, so they are not required to sit at the same index of pos1/pos2 — confirm that is intended. 
*/ const pos1Parts = splitNormalizedTagParts(normalizePos1Tag(token.pos1)); + const pos2Parts = splitNormalizedTagParts(normalizePos2Tag(token.pos2)); + return ( + pos1Parts.length >= 2 && + pos1Parts[0] === '接頭詞' && + pos1Parts.slice(1).some((part) => part === '名詞') && + pos2Parts[0] === '数接続' && + pos2Parts.slice(1).some((part) => part === '数') + ); +} + function isFrequencyExcludedByPos( token: MergedToken, pos1Exclusions: ReadonlySet, @@ -168,12 +186,21 @@ function isFrequencyExcludedByPos( pos1Exclusions, pos2Exclusions, ); + /* Tokens recognised as ordinal prefix-noun compounds bypass both the pos1 and the pos2 exclusion checks below. */ const allowOrdinalPrefixNounToken = shouldAllowOrdinalPrefixNounFrequency(token); - if (isExcludedByTagSet(normalizedPos1, pos1Exclusions) && !allowContentLedMergedToken) { + if ( + isExcludedByTagSet(normalizedPos1, pos1Exclusions) && + !allowContentLedMergedToken && + !allowOrdinalPrefixNounToken + ) { return true; } - if (isExcludedByTagSet(normalizedPos2, pos2Exclusions) && !allowContentLedMergedToken) { + if ( + isExcludedByTagSet(normalizedPos2, pos2Exclusions) && + !allowContentLedMergedToken && + !allowOrdinalPrefixNounToken + ) { return true; } diff --git a/src/core/services/tokenizer/yomitan-parser-runtime.test.ts b/src/core/services/tokenizer/yomitan-parser-runtime.test.ts index 4a0fd903..098d2800 100644 --- a/src/core/services/tokenizer/yomitan-parser-runtime.test.ts +++ b/src/core/services/tokenizer/yomitan-parser-runtime.test.ts @@ -891,6 +891,105 @@ 
test('requestYomitanScanTokens can use frequency from later exact secondary-matc ]); }); +/** Regression test: the mocked `termsFind` returns two exact `第二` entries. The primary entry has reading `だいに` and no frequencies; the secondary entry has an empty reading and one JPDBv2㋕ frequency whose displayValue is `1820,189513句`. The first scan token is expected to keep reading `だいに` and report frequencyRank 1820, i.e. the leading number of displayValue rather than the `frequency` field (189513). NOTE(review): the title says the selected reading "differs", but the mocked secondary entry has an empty reading rather than a different non-empty one — confirm the wording. */ test('requestYomitanScanTokens uses exact frequency entry when selected reading differs', async () => { + let scannerScript = ''; + const deps = createDeps(async (script) => { + if (script.includes('termsFind')) { + /* Capture the injected scanner script so it can be re-run against the mocked backend further down. */ scannerScript = script; + return []; + } + if (script.includes('optionsGetFull')) { + return { + profileCurrent: 0, + profileIndex: 0, + scanLength: 40, + dictionaries: ['JPDBv2㋕', 'Jiten', 'CC100'], + dictionaryPriorityByName: { + 'JPDBv2㋕': 0, + Jiten: 1, + CC100: 2, + }, + dictionaryFrequencyModeByName: { + 'JPDBv2㋕': 'rank-based', + Jiten: 'rank-based', + CC100: 'rank-based', + }, + profiles: [ + { + options: { + scanning: { length: 40 }, + dictionaries: [ + { name: 'JPDBv2㋕', enabled: true, id: 0 }, + { name: 'Jiten', enabled: true, id: 1 }, + { name: 'CC100', enabled: true, id: 2 }, + ], + }, + }, + ], + }; + } + return null; + }); + + await requestYomitanScanTokens('第二走者', deps, { + error: () => undefined, + }); + + const result = (await runInjectedYomitanScript(scannerScript, (action, params) => { + if (action !== 'termsFind') { + throw new Error(`unexpected action: ${action}`); + } + + const text = (params as { text?: string } | undefined)?.text ?? 
''; + if (!text.startsWith('第二')) { + return { originalTextLength: 0, dictionaryEntries: [] }; + } + + return { + originalTextLength: 2, + dictionaryEntries: [ + { + headwords: [ + { + term: '第二', + reading: 'だいに', + sources: [{ originalText: '第二', isPrimary: true, matchType: 'exact' }], + }, + ], + frequencies: [], + }, + { + headwords: [ + { + term: '第二', + reading: '', + sources: [{ originalText: '第二', isPrimary: false, matchType: 'exact' }], + }, + ], + frequencies: [ + { + headwordIndex: 0, + dictionary: 'JPDBv2㋕', + frequency: 189513, + displayValue: '1820,189513句', + }, + ], + }, + ], + }; + })) as Array>; + + assert.deepEqual(result?.[0], { + surface: '第二', + reading: 'だいに', + headword: '第二', + startPos: 0, + endPos: 2, + isNameMatch: false, + frequencyRank: 1820, + }); +}); + test('requestYomitanScanTokens marks tokens backed by SubMiner character dictionary entries', async () => { const deps = createDeps(async (script) => { if (script.includes('optionsGetFull')) { diff --git a/src/core/services/tokenizer/yomitan-parser-runtime.ts b/src/core/services/tokenizer/yomitan-parser-runtime.ts index 1189dfd5..34afdef3 100644 --- a/src/core/services/tokenizer/yomitan-parser-runtime.ts +++ b/src/core/services/tokenizer/yomitan-parser-runtime.ts @@ -960,6 +960,9 @@ const YOMITAN_SCANNING_HELPERS = String.raw` const matchReading = typeof match.headword?.reading === 'string' ? match.headword.reading : ''; const preferredReading = typeof preferredMatch.headword?.reading === 'string' ? preferredMatch.headword.reading : ''; + if (!matchReading || !preferredReading) { + return true; + } return matchReading === preferredReading; } function getBestFrequencyRankForMatches(matches, dictionaryPriorityByName, dictionaryFrequencyModeByName) {