fix(tokenizer): lazy yomitan term-only frequency fallback

This commit is contained in:
2026-03-02 01:45:37 -08:00
parent 629fe97ef7
commit 737101fe9e
5 changed files with 298 additions and 79 deletions

View File

@@ -0,0 +1,56 @@
---
id: TASK-82
title: 'Subtitle frequency highlighting: fix noisy Yomitan readings and restore known/N+1 color priority'
status: Done
assignee: []
created_date: '2026-03-02 20:10'
updated_date: '2026-03-02 01:44'
labels: []
dependencies: []
priority: high
ordinal: 9002
---
## Description
<!-- SECTION:DESCRIPTION:BEGIN -->
Address frequency-highlighting regressions:
- tokens like `断じて` missed rank assignment when Yomitan merged-token reading was truncated/noisy;
- known/N+1 tokens were incorrectly colored by frequency color instead of known/N+1 color.
Expected behavior:
- known/N+1 color always wins;
- if token is frequent and within `topX`, frequency rank label can still appear on hover/metadata.
<!-- SECTION:DESCRIPTION:END -->
## Acceptance Criteria
<!-- AC:BEGIN -->
- [x] #1 Frequency lookup succeeds for noisy/truncated merged-token readings via a lazy term-only fallback query issued only after the `{term, reading}` lookup misses.
- [x] #2 Merged-token reading normalization restores missing kana suffixes where safe (`headword === surface` path).
- [x] #3 Known/N+1 tokens keep known/N+1 color classes; frequency color class does not override them.
- [x] #4 Frequency rank hover label remains available for in-range frequent tokens, including known/N+1.
- [x] #5 Regression tests added for tokenizer and renderer behavior.
<!-- AC:END -->
## Final Summary
<!-- SECTION:FINAL_SUMMARY:BEGIN -->
Implemented and validated:
- tokenizer now normalizes selected Yomitan merged-token readings by appending missing trailing kana suffixes when safe (`headword === surface`);
- frequency lookup now does lazy fallback: requests `{term, reading}` first, and only requests `{term, reading: null}` for misses;
- this removes eager `(term, null)` payload inflation on medium-frequency lines and reduces extension RPC payload/load;
- renderer restored known/N+1 color priority over frequency class coloring;
- frequency rank label display remains available for frequent known/N+1 tokens;
- added regression tests covering noisy-reading fallback, lazy fallback-query behavior, and renderer class/label precedence.
Related commits:
- `17a417e` (`fix(subtitle): improve frequency highlight reliability`)
- `79f37f3` (`fix(subtitle): prioritize known and n+1 colors over frequency`)
<!-- SECTION:FINAL_SUMMARY:END -->

View File

@@ -346,6 +346,11 @@ test('tokenizeSubtitle queries headword frequencies with token reading for disam
webContents: {
executeJavaScript: async (script: string) => {
if (script.includes('getTermFrequencies')) {
assert.equal(
script.includes('"term":"鍛える","reading":null'),
false,
'should not eagerly include term-only fallback pair when reading lookup is present',
);
if (!script.includes('"term":"鍛える","reading":"きた"')) {
return [];
}

View File

@@ -329,11 +329,6 @@ function buildYomitanFrequencyTermReadingList(
const readingRaw =
token.reading && token.reading.trim().length > 0 ? token.reading.trim() : null;
termReadingList.push({ term, reading: readingRaw });
// Yomitan parse readings can be noisy/truncated on merged tokens; include term-only fallback.
if (readingRaw !== null) {
termReadingList.push({ term, reading: null });
}
}
return termReadingList;

View File

@@ -157,6 +157,102 @@ test('requestYomitanTermFrequencies prefers primary rank from displayValue array
assert.equal(result[0]?.frequency, 7141);
});
test('requestYomitanTermFrequencies requests term-only fallback only after reading miss', async () => {
  // Records every getTermFrequencies RPC script so the call order can be asserted.
  const capturedScripts: string[] = [];
  const silentLogger = { error: () => undefined };
  const deps = createDeps(async (script) => {
    if (script.includes('optionsGetFull')) {
      // Minimal profile payload: one enabled frequency dictionary.
      return {
        profileCurrent: 0,
        profiles: [
          {
            options: {
              scanning: { length: 40 },
              dictionaries: [{ name: 'freq-dict', enabled: true, id: 0 }],
            },
          },
        ],
      };
    }
    if (!script.includes('getTermFrequencies')) {
      return [];
    }
    capturedScripts.push(script);
    // First request: the exact (term, reading) lookup deliberately misses.
    if (script.includes('"term":"断じて","reading":"だん"')) {
      return [];
    }
    // Second request: the term-only fallback lookup returns a hit.
    if (script.includes('"term":"断じて","reading":null')) {
      const fallbackHit = {
        term: '断じて',
        reading: null,
        dictionary: 'freq-dict',
        frequency: 7082,
        displayValue: '7082',
        displayValueParsed: true,
      };
      return [fallbackHit];
    }
    return [];
  });
  const result = await requestYomitanTermFrequencies(
    [{ term: '断じて', reading: 'だん' }],
    deps,
    silentLogger,
  );
  // The fallback hit surfaces as exactly one entry for the requested pair.
  assert.equal(result.length, 1);
  assert.equal(result[0]?.frequency, 7082);
  // Exactly two RPCs: reading lookup first, lazy term-only fallback second.
  assert.equal(capturedScripts.length, 2);
  assert.match(capturedScripts[0] ?? '', /"term":"断じて","reading":"だん"/);
  assert.doesNotMatch(capturedScripts[0] ?? '', /"term":"断じて","reading":null/);
  assert.match(capturedScripts[1] ?? '', /"term":"断じて","reading":null/);
});
test('requestYomitanTermFrequencies avoids term-only fallback request when reading lookup succeeds', async () => {
  // Records every getTermFrequencies RPC script to prove only one request fires.
  const capturedScripts: string[] = [];
  const silentLogger = { error: () => undefined };
  const deps = createDeps(async (script) => {
    if (script.includes('optionsGetFull')) {
      // Minimal profile payload: one enabled frequency dictionary.
      return {
        profileCurrent: 0,
        profiles: [
          {
            options: {
              scanning: { length: 40 },
              dictionaries: [{ name: 'freq-dict', enabled: true, id: 0 }],
            },
          },
        ],
      };
    }
    if (!script.includes('getTermFrequencies')) {
      return [];
    }
    capturedScripts.push(script);
    // The reading lookup succeeds immediately (term match with a fuller reading),
    // so no term-only fallback request should follow.
    const readingHit = {
      term: '鍛える',
      reading: 'きたえる',
      dictionary: 'freq-dict',
      frequency: 2847,
      displayValue: '2847',
      displayValueParsed: true,
    };
    return [readingHit];
  });
  const result = await requestYomitanTermFrequencies(
    [{ term: '鍛える', reading: 'きた' }],
    deps,
    silentLogger,
  );
  assert.equal(result.length, 1);
  // Exactly one RPC, carrying the reading — never the eager term-only pair.
  assert.equal(capturedScripts.length, 1);
  assert.match(capturedScripts[0] ?? '', /"term":"鍛える","reading":"きた"/);
  assert.doesNotMatch(capturedScripts[0] ?? '', /"term":"鍛える","reading":null/);
});
test('requestYomitanTermFrequencies caches profile metadata between calls', async () => {
const scripts: string[] = [];
const deps = createDeps(async (script) => {

View File

@@ -578,50 +578,12 @@ export async function requestYomitanParseResults(
}
}
export async function requestYomitanTermFrequencies(
async function fetchYomitanTermFrequencies(
parserWindow: BrowserWindow,
termReadingList: YomitanTermReadingPair[],
deps: YomitanParserRuntimeDeps,
metadata: YomitanProfileMetadata | null,
logger: LoggerLike,
): Promise<YomitanTermFrequency[]> {
const normalizedTermReadingList = normalizeTermReadingList(termReadingList);
const yomitanExt = deps.getYomitanExt();
if (normalizedTermReadingList.length === 0 || !yomitanExt) {
return [];
}
const isReady = await ensureYomitanParserWindow(deps, logger);
const parserWindow = deps.getYomitanParserWindow();
if (!isReady || !parserWindow || parserWindow.isDestroyed()) {
return [];
}
const metadata = await requestYomitanProfileMetadata(parserWindow, logger);
const frequencyCache = getWindowFrequencyCache(parserWindow);
const missingTermReadingList: YomitanTermReadingPair[] = [];
const buildCachedResult = (): YomitanTermFrequency[] => {
const result: YomitanTermFrequency[] = [];
for (const pair of normalizedTermReadingList) {
const key = makeTermReadingCacheKey(pair.term, pair.reading);
const cached = frequencyCache.get(key);
if (cached && cached.length > 0) {
result.push(...cached);
}
}
return result;
};
for (const pair of normalizedTermReadingList) {
const key = makeTermReadingCacheKey(pair.term, pair.reading);
if (!frequencyCache.has(key)) {
missingTermReadingList.push(pair);
}
}
if (missingTermReadingList.length === 0) {
return buildCachedResult();
}
): Promise<YomitanTermFrequency[] | null> {
if (metadata && metadata.dictionaries.length > 0) {
const script = `
(async () => {
@@ -645,7 +607,7 @@ export async function requestYomitanTermFrequencies(
});
return await invoke("getTermFrequencies", {
termReadingList: ${JSON.stringify(missingTermReadingList)},
termReadingList: ${JSON.stringify(termReadingList)},
dictionaries: ${JSON.stringify(metadata.dictionaries)}
});
})();
@@ -653,28 +615,13 @@ export async function requestYomitanTermFrequencies(
try {
const rawResult = await parserWindow.webContents.executeJavaScript(script, true);
const fetchedEntries = Array.isArray(rawResult)
return Array.isArray(rawResult)
? normalizeFrequencyEntriesWithPriority(rawResult, metadata.dictionaryPriorityByName)
: [];
const groupedByPair = groupFrequencyEntriesByPair(fetchedEntries);
const groupedByTerm = groupFrequencyEntriesByTerm(fetchedEntries);
const missingTerms = new Set(missingTermReadingList.map((pair) => pair.term));
for (const pair of missingTermReadingList) {
const key = makeTermReadingCacheKey(pair.term, pair.reading);
const exactEntries = groupedByPair.get(key);
const termEntries = groupedByTerm.get(pair.term) ?? [];
frequencyCache.set(key, exactEntries ?? termEntries);
}
const cachedResult = buildCachedResult();
const unmatchedEntries = fetchedEntries.filter((entry) => !missingTerms.has(entry.term.trim()));
return [...cachedResult, ...unmatchedEntries];
} catch (err) {
logger.error('Yomitan term frequency request failed:', (err as Error).message);
return null;
}
return buildCachedResult();
}
const script = `
@@ -721,7 +668,7 @@ export async function requestYomitanTermFrequencies(
}
const rawFrequencies = await invoke("getTermFrequencies", {
termReadingList: ${JSON.stringify(missingTermReadingList)},
termReadingList: ${JSON.stringify(termReadingList)},
dictionaries
});
@@ -743,27 +690,147 @@ export async function requestYomitanTermFrequencies(
try {
const rawResult = await parserWindow.webContents.executeJavaScript(script, true);
const fetchedEntries = Array.isArray(rawResult)
return Array.isArray(rawResult)
? rawResult
.map((entry) => toYomitanTermFrequency(entry))
.filter((entry): entry is YomitanTermFrequency => entry !== null)
: [];
const groupedByPair = groupFrequencyEntriesByPair(fetchedEntries);
const groupedByTerm = groupFrequencyEntriesByTerm(fetchedEntries);
const missingTerms = new Set(missingTermReadingList.map((pair) => pair.term));
for (const pair of missingTermReadingList) {
const key = makeTermReadingCacheKey(pair.term, pair.reading);
const exactEntries = groupedByPair.get(key);
const termEntries = groupedByTerm.get(pair.term) ?? [];
frequencyCache.set(key, exactEntries ?? termEntries);
}
const cachedResult = buildCachedResult();
const unmatchedEntries = fetchedEntries.filter((entry) => !missingTerms.has(entry.term.trim()));
return [...cachedResult, ...unmatchedEntries];
} catch (err) {
logger.error('Yomitan term frequency request failed:', (err as Error).message);
return null;
}
}
/**
 * Stores fetched frequency entries in the window cache, one slot per queried
 * (term, reading) pair. An exact pair match is preferred; otherwise any entries
 * fetched for the same term are cached for the pair, and a miss caches an empty
 * array so the pair is not re-queried.
 */
function cacheFrequencyEntriesForPairs(
  frequencyCache: Map<string, YomitanTermFrequency[]>,
  termReadingList: YomitanTermReadingPair[],
  fetchedEntries: YomitanTermFrequency[],
): void {
  // Index the fetched entries once: by exact (term, reading) pair and by bare term.
  const byPair = groupFrequencyEntriesByPair(fetchedEntries);
  const byTerm = groupFrequencyEntriesByTerm(fetchedEntries);
  for (const { term, reading } of termReadingList) {
    const cacheKey = makeTermReadingCacheKey(term, reading);
    const entries = byPair.get(cacheKey) ?? byTerm.get(term) ?? [];
    frequencyCache.set(cacheKey, entries);
  }
}
/**
 * Resolves Yomitan frequency entries for the given (term, reading) pairs.
 *
 * Strategy (results memoized in a per-parser-window cache):
 * 1. serve already-cached pairs; batch-fetch only the cache misses, with their
 *    readings attached;
 * 2. for pairs whose reading lookup came back empty, issue a second, lazy
 *    term-only (`reading: null`) batch query and reuse its hits for those pairs —
 *    avoiding the eager `(term, null)` payload inflation on every request.
 *
 * @param termReadingList pairs to look up; normalized before use.
 * @param deps runtime accessors for the Yomitan extension and parser window.
 * @param logger receives error messages from the underlying fetches.
 * @returns entries matched to the requested pairs, plus any fetched entries
 *   whose term was outside the queried set; empty when the extension or
 *   parser window is unavailable.
 */
export async function requestYomitanTermFrequencies(
  termReadingList: YomitanTermReadingPair[],
  deps: YomitanParserRuntimeDeps,
  logger: LoggerLike,
): Promise<YomitanTermFrequency[]> {
  const normalizedTermReadingList = normalizeTermReadingList(termReadingList);
  const yomitanExt = deps.getYomitanExt();
  if (normalizedTermReadingList.length === 0 || !yomitanExt) {
    return [];
  }
  const isReady = await ensureYomitanParserWindow(deps, logger);
  const parserWindow = deps.getYomitanParserWindow();
  if (!isReady || !parserWindow || parserWindow.isDestroyed()) {
    return [];
  }
  const metadata = await requestYomitanProfileMetadata(parserWindow, logger);
  // Cache is scoped to the parser window; entries survive across calls until
  // the window is recreated.
  const frequencyCache = getWindowFrequencyCache(parserWindow);
  const missingTermReadingList: YomitanTermReadingPair[] = [];
  // Collects non-empty cached entries for every requested pair, in request order.
  const buildCachedResult = (): YomitanTermFrequency[] => {
    const result: YomitanTermFrequency[] = [];
    for (const pair of normalizedTermReadingList) {
      const key = makeTermReadingCacheKey(pair.term, pair.reading);
      const cached = frequencyCache.get(key);
      if (cached && cached.length > 0) {
        result.push(...cached);
      }
    }
    return result;
  };
  for (const pair of normalizedTermReadingList) {
    const key = makeTermReadingCacheKey(pair.term, pair.reading);
    // `has` (not `get`) so a cached empty array still counts as "known miss"
    // and is not re-fetched.
    if (!frequencyCache.has(key)) {
      missingTermReadingList.push(pair);
    }
  }
  if (missingTermReadingList.length === 0) {
    return buildCachedResult();
  }
  // First pass: fetch only the cache misses, readings included.
  const fetchedEntries = await fetchYomitanTermFrequencies(
    parserWindow,
    missingTermReadingList,
    metadata,
    logger,
  );
  // null signals a failed RPC (already logged); fall back to whatever is cached.
  if (fetchedEntries === null) {
    return buildCachedResult();
  }
  cacheFrequencyEntriesForPairs(frequencyCache, missingTermReadingList, fetchedEntries);
  // Second pass (lazy fallback): build term-only queries for pairs whose
  // reading lookup missed, skipping pairs already satisfiable from the cache.
  const fallbackTermReadingList = normalizeTermReadingList(
    missingTermReadingList
      .filter((pair) => pair.reading !== null)
      .map((pair) => {
        const key = makeTermReadingCacheKey(pair.term, pair.reading);
        const cachedEntries = frequencyCache.get(key);
        if (cachedEntries && cachedEntries.length > 0) {
          return null;
        }
        // A prior term-only result can satisfy this pair without a new RPC.
        const fallbackKey = makeTermReadingCacheKey(pair.term, null);
        const cachedFallback = frequencyCache.get(fallbackKey);
        if (cachedFallback && cachedFallback.length > 0) {
          frequencyCache.set(key, cachedFallback);
          return null;
        }
        return { term: pair.term, reading: null };
      })
      .filter((pair): pair is YomitanTermReadingPair => pair !== null),
  ).filter((pair) => !frequencyCache.has(makeTermReadingCacheKey(pair.term, pair.reading)));
  let fallbackFetchedEntries: YomitanTermFrequency[] = [];
  if (fallbackTermReadingList.length > 0) {
    const fallbackFetchResult = await fetchYomitanTermFrequencies(
      parserWindow,
      fallbackTermReadingList,
      metadata,
      logger,
    );
    if (fallbackFetchResult !== null) {
      fallbackFetchedEntries = fallbackFetchResult;
      cacheFrequencyEntriesForPairs(frequencyCache, fallbackTermReadingList, fallbackFetchedEntries);
    }
    // Propagate fresh term-only hits onto the (term, reading) keys that are
    // still empty, so buildCachedResult (and future calls) see them.
    for (const pair of missingTermReadingList) {
      if (pair.reading === null) {
        continue;
      }
      const key = makeTermReadingCacheKey(pair.term, pair.reading);
      const cachedEntries = frequencyCache.get(key);
      if (cachedEntries && cachedEntries.length > 0) {
        continue;
      }
      const fallbackEntries = frequencyCache.get(makeTermReadingCacheKey(pair.term, null));
      if (fallbackEntries && fallbackEntries.length > 0) {
        frequencyCache.set(key, fallbackEntries);
      }
    }
  }
  const allFetchedEntries = [...fetchedEntries, ...fallbackFetchedEntries];
  const queriedTerms = new Set(
    [...missingTermReadingList, ...fallbackTermReadingList].map((pair) => pair.term),
  );
  const cachedResult = buildCachedResult();
  // Fetched entries for terms outside the queried set (e.g. related forms the
  // extension returned) are appended so callers don't lose them.
  const unmatchedEntries = allFetchedEntries.filter((entry) => !queriedTerms.has(entry.term.trim()));
  return [...cachedResult, ...unmatchedEntries];
}
export async function syncYomitanDefaultAnkiServer(