fix(subtitle): improve frequency highlight reliability

This commit is contained in:
2026-03-01 20:12:42 -08:00
parent 68e5a7fef3
commit 17a417e639
4 changed files with 161 additions and 20 deletions

View File

@@ -297,6 +297,43 @@ test('tokenizeSubtitle starts Yomitan frequency lookup and MeCab enrichment in p
assert.equal(result.tokens?.[0]?.frequencyRank, 77); assert.equal(result.tokens?.[0]?.frequencyRank, 77);
}); });
// Verifies that when a merged Yomitan token's headword equals its surface,
// the surface's trailing kana is appended to the merged reading
// (e.g. reading だん for surface 断じて becomes だんじて).
test('tokenizeSubtitle appends trailing kana to merged Yomitan readings when headword equals surface', async () => {
  // Fake parser output: two groups, each a kanji stem carrying the reading
  // followed by a trailing-kana continuation token with an empty reading.
  const parseResults = [
    {
      source: 'scanning-parser',
      index: 0,
      content: [
        [
          { text: '断', reading: 'だん', headwords: [[{ term: '断じて' }]] },
          { text: 'じて', reading: '', headwords: [[{ term: 'じて' }]] },
        ],
        [
          { text: '見', reading: 'み', headwords: [[{ term: '見る' }]] },
          { text: 'ていない', reading: '', headwords: [[{ term: 'ていない' }]] },
        ],
      ],
    },
  ];
  const fakeParserWindow = {
    isDestroyed: () => false,
    webContents: {
      executeJavaScript: async () => parseResults,
    },
  } as unknown as Electron.BrowserWindow;
  const result = await tokenizeSubtitle(
    '断じて見ていない',
    makeDeps({
      getYomitanExt: () => ({ id: 'dummy-ext' }) as any,
      getYomitanParserWindow: () => fakeParserWindow,
    }),
  );
  assert.equal(result.tokens?.length, 2);
  // 断じて: headword matches surface, so trailing kana じて is appended → だんじて.
  assert.equal(result.tokens?.[0]?.surface, '断じて');
  assert.equal(result.tokens?.[0]?.reading, 'だんじて');
  // 見ていない: headword (見る) differs from surface, so the reading stays み.
  assert.equal(result.tokens?.[1]?.surface, '見ていない');
  assert.equal(result.tokens?.[1]?.reading, 'み');
});
test('tokenizeSubtitle queries headword frequencies with token reading for disambiguation', async () => { test('tokenizeSubtitle queries headword frequencies with token reading for disambiguation', async () => {
const result = await tokenizeSubtitle( const result = await tokenizeSubtitle(
'鍛えた', '鍛えた',
@@ -351,6 +388,58 @@ test('tokenizeSubtitle queries headword frequencies with token reading for disam
assert.equal(result.tokens?.[0]?.frequencyRank, 2847); assert.equal(result.tokens?.[0]?.frequencyRank, 2847);
}); });
// Verifies the term-only frequency fallback: the parser reports a noisy,
// truncated reading (だん for 断じて), so the reading-paired frequency lookup
// misses and the {term, reading: null} fallback pair must recover the rank.
test('tokenizeSubtitle falls back to term-only Yomitan frequency lookup when reading is noisy', async () => {
  // Parse result whose reading is deliberately truncated.
  const parseResults = [
    {
      source: 'scanning-parser',
      index: 0,
      content: [
        [
          {
            text: '断じて',
            reading: 'だん',
            headwords: [[{ term: '断じて' }]],
          },
        ],
      ],
    },
  ];
  // Frequency hit that the fake dictionary returns only for the term-only pair.
  const termOnlyFrequencyHit = [
    {
      term: '断じて',
      reading: null,
      dictionary: 'freq-dict',
      frequency: 7082,
      displayValue: '7082',
      displayValueParsed: true,
    },
  ];
  const executeJavaScript = async (script: string) => {
    if (script.includes('getTermFrequencies')) {
      // Simulate a dictionary with no entry for the noisy reading pair:
      // only a lookup that includes {term, reading: null} produces a hit.
      return script.includes('"term":"断じて","reading":null') ? termOnlyFrequencyHit : [];
    }
    return parseResults;
  };
  const result = await tokenizeSubtitle(
    '断じて',
    makeDeps({
      getFrequencyDictionaryEnabled: () => true,
      getYomitanExt: () => ({ id: 'dummy-ext' }) as any,
      getYomitanParserWindow: () =>
        ({
          isDestroyed: () => false,
          webContents: { executeJavaScript },
        }) as unknown as Electron.BrowserWindow,
    }),
  );
  assert.equal(result.tokens?.length, 1);
  assert.equal(result.tokens?.[0]?.frequencyRank, 7082);
});
test('tokenizeSubtitle avoids headword term-only fallback rank when reading-specific frequency exists', async () => { test('tokenizeSubtitle avoids headword term-only fallback rank when reading-specific frequency exists', async () => {
const result = await tokenizeSubtitle( const result = await tokenizeSubtitle(
'無人', '無人',

View File

@@ -249,6 +249,50 @@ function normalizeFrequencyLookupText(rawText: string): string {
return rawText.trim().toLowerCase(); return rawText.trim().toLowerCase();
} }
/**
 * Returns true when the first code point of `char` is hiragana or katakana.
 *
 * Covered ranges: hiragana (U+3041–U+3096), kana sound/iteration marks
 * (U+309B–U+309F), katakana (U+30A0–U+30FA), and katakana iteration marks
 * (U+30FD–U+30FF). An empty string yields false.
 *
 * NOTE(review): the prolonged sound mark ー (U+30FC) and middle dot ・
 * (U+30FB) fall outside these ranges and are NOT treated as kana — confirm
 * that is intentional for trailing-suffix detection.
 */
function isKanaChar(char: string): boolean {
  const code = char.codePointAt(0);
  if (code === undefined) {
    return false;
  }
  const kanaRanges: ReadonlyArray<readonly [number, number]> = [
    [0x3041, 0x3096],
    [0x309b, 0x309f],
    [0x30a0, 0x30fa],
    [0x30fd, 0x30ff],
  ];
  return kanaRanges.some(([start, end]) => code >= start && code <= end);
}
/**
 * Extracts the run of kana characters at the end of `surface`.
 *
 * Returns '' when the surface ends in a non-kana character, or when it is
 * kana all the way through (there is no non-kana stem to attach a suffix to).
 * Uses Array.from so surrogate-pair code points are treated as one character.
 */
function getTrailingKanaSuffix(surface: string): string {
  const chars = Array.from(surface);
  let suffixStart = chars.length;
  // Walk backwards over the trailing kana run.
  while (suffixStart > 0 && isKanaChar(chars[suffixStart - 1]!)) {
    suffixStart -= 1;
  }
  // All-kana or no trailing kana: nothing usable as a suffix.
  if (suffixStart === 0 || suffixStart === chars.length) {
    return '';
  }
  return chars.slice(suffixStart).join('');
}
function normalizeYomitanMergedReading(token: MergedToken): string {
const reading = token.reading ?? '';
if (!reading || token.headword !== token.surface) {
return reading;
}
const trailingKanaSuffix = getTrailingKanaSuffix(token.surface);
if (!trailingKanaSuffix || reading.endsWith(trailingKanaSuffix)) {
return reading;
}
return `${reading}${trailingKanaSuffix}`;
}
/**
 * Returns a new array of tokens with each reading normalized via
 * normalizeYomitanMergedReading; the input array and its tokens are
 * left unmodified.
 */
function normalizeSelectedYomitanTokens(tokens: MergedToken[]): MergedToken[] {
  const normalized: MergedToken[] = [];
  for (const token of tokens) {
    normalized.push({ ...token, reading: normalizeYomitanMergedReading(token) });
  }
  return normalized;
}
function resolveFrequencyLookupText( function resolveFrequencyLookupText(
token: MergedToken, token: MergedToken,
matchMode: FrequencyDictionaryMatchMode, matchMode: FrequencyDictionaryMatchMode,
@@ -276,17 +320,24 @@ function buildYomitanFrequencyTermReadingList(
tokens: MergedToken[], tokens: MergedToken[],
matchMode: FrequencyDictionaryMatchMode, matchMode: FrequencyDictionaryMatchMode,
): Array<{ term: string; reading: string | null }> { ): Array<{ term: string; reading: string | null }> {
return tokens const termReadingList: Array<{ term: string; reading: string | null }> = [];
.map((token) => { for (const token of tokens) {
const term = resolveFrequencyLookupText(token, matchMode).trim(); const term = resolveFrequencyLookupText(token, matchMode).trim();
if (!term) { if (!term) {
return null; continue;
} }
const readingRaw =
token.reading && token.reading.trim().length > 0 ? token.reading.trim() : null; const readingRaw =
return { term, reading: readingRaw }; token.reading && token.reading.trim().length > 0 ? token.reading.trim() : null;
}) termReadingList.push({ term, reading: readingRaw });
.filter((pair): pair is { term: string; reading: string | null } => pair !== null);
// Yomitan parse readings can be noisy/truncated on merged tokens; include term-only fallback.
if (readingRaw !== null) {
termReadingList.push({ term, reading: null });
}
}
return termReadingList;
} }
function buildYomitanFrequencyRankMap( function buildYomitanFrequencyRankMap(
@@ -427,16 +478,17 @@ async function parseWithYomitanInternalParser(
if (!selectedTokens || selectedTokens.length === 0) { if (!selectedTokens || selectedTokens.length === 0) {
return null; return null;
} }
const normalizedSelectedTokens = normalizeSelectedYomitanTokens(selectedTokens);
if (deps.getYomitanGroupDebugEnabled?.() === true) { if (deps.getYomitanGroupDebugEnabled?.() === true) {
logSelectedYomitanGroups(text, selectedTokens); logSelectedYomitanGroups(text, normalizedSelectedTokens);
} }
const frequencyRankPromise: Promise<Map<string, number>> = options.frequencyEnabled const frequencyRankPromise: Promise<Map<string, number>> = options.frequencyEnabled
? (async () => { ? (async () => {
const frequencyMatchMode = options.frequencyMatchMode; const frequencyMatchMode = options.frequencyMatchMode;
const termReadingList = buildYomitanFrequencyTermReadingList( const termReadingList = buildYomitanFrequencyTermReadingList(
selectedTokens, normalizedSelectedTokens,
frequencyMatchMode, frequencyMatchMode,
); );
const yomitanFrequencies = await requestYomitanTermFrequencies(termReadingList, deps, logger); const yomitanFrequencies = await requestYomitanTermFrequencies(termReadingList, deps, logger);
@@ -449,19 +501,19 @@ async function parseWithYomitanInternalParser(
try { try {
const mecabTokens = await deps.tokenizeWithMecab(text); const mecabTokens = await deps.tokenizeWithMecab(text);
const enrichTokensWithMecab = deps.enrichTokensWithMecab ?? enrichTokensWithMecabAsync; const enrichTokensWithMecab = deps.enrichTokensWithMecab ?? enrichTokensWithMecabAsync;
return await enrichTokensWithMecab(selectedTokens, mecabTokens); return await enrichTokensWithMecab(normalizedSelectedTokens, mecabTokens);
} catch (err) { } catch (err) {
const error = err as Error; const error = err as Error;
logger.warn( logger.warn(
'Failed to enrich Yomitan tokens with MeCab POS:', 'Failed to enrich Yomitan tokens with MeCab POS:',
error.message, error.message,
`tokenCount=${selectedTokens.length}`, `tokenCount=${normalizedSelectedTokens.length}`,
`textLength=${text.length}`, `textLength=${text.length}`,
); );
return selectedTokens; return normalizedSelectedTokens;
} }
})() })()
: Promise.resolve(selectedTokens); : Promise.resolve(normalizedSelectedTokens);
const [yomitanRankByTerm, enrichedTokens] = await Promise.all([ const [yomitanRankByTerm, enrichedTokens] = await Promise.all([
frequencyRankPromise, frequencyRankPromise,

View File

@@ -79,7 +79,7 @@ test('computeWordClass preserves known and n+1 classes while adding JLPT classes
assert.equal(computeWordClass(nPlusOneJlpt), 'word word-n-plus-one word-jlpt-n2'); assert.equal(computeWordClass(nPlusOneJlpt), 'word word-n-plus-one word-jlpt-n2');
}); });
test('computeWordClass keeps known/N+1 color classes exclusive over frequency classes', () => { test('computeWordClass composes known class with frequency class while keeping N+1 exclusive', () => {
const known = createToken({ const known = createToken({
isKnown: true, isKnown: true,
frequencyRank: 10, frequencyRank: 10,
@@ -103,7 +103,7 @@ test('computeWordClass keeps known/N+1 color classes exclusive over frequency cl
singleColor: '#000000', singleColor: '#000000',
bandedColors: ['#000000', '#000000', '#000000', '#000000', '#000000'] as const, bandedColors: ['#000000', '#000000', '#000000', '#000000', '#000000'] as const,
}), }),
'word word-known', 'word word-known word-frequency-single',
); );
assert.equal( assert.equal(
computeWordClass(nPlusOne, { computeWordClass(nPlusOne, {

View File

@@ -429,7 +429,7 @@ export function computeWordClass(
classes.push(`word-jlpt-${token.jlptLevel.toLowerCase()}`); classes.push(`word-jlpt-${token.jlptLevel.toLowerCase()}`);
} }
if (!token.isKnown && !token.isNPlusOneTarget) { if (!token.isNPlusOneTarget) {
const frequencyClass = getFrequencyDictionaryClass(token, resolvedFrequencySettings); const frequencyClass = getFrequencyDictionaryClass(token, resolvedFrequencySettings);
if (frequencyClass) { if (frequencyClass) {
classes.push(frequencyClass); classes.push(frequencyClass);