fix(tokenizer): disambiguate Yomitan frequency lookup by reading

This commit is contained in:
2026-02-28 21:12:34 -08:00
parent 4309e0dec3
commit e4038127cb
4 changed files with 115 additions and 13 deletions

View File

@@ -297,7 +297,7 @@ test('tokenizeSubtitle starts Yomitan frequency lookup and MeCab enrichment in p
assert.equal(result.tokens?.[0]?.frequencyRank, 77);
});
test('tokenizeSubtitle queries headword frequencies without forcing surface reading', async () => {
test('tokenizeSubtitle queries headword frequencies with token reading for disambiguation', async () => {
const result = await tokenizeSubtitle(
'鍛えた',
makeDeps({
@@ -309,7 +309,7 @@ test('tokenizeSubtitle queries headword frequencies without forcing surface read
webContents: {
executeJavaScript: async (script: string) => {
if (script.includes('getTermFrequencies')) {
if (!script.includes('"term":"鍛える","reading":null')) {
if (!script.includes('"term":"鍛える","reading":"きた"')) {
return [];
}
return [
@@ -351,6 +351,68 @@ test('tokenizeSubtitle queries headword frequencies without forcing surface read
assert.equal(result.tokens?.[0]?.frequencyRank, 2847);
});
// Regression test: when the frequency lookup returns both a reading-less entry
// and a reading-specific entry for the same term, the token's rank must come
// from the reading-specific one (7141) rather than the term-only one (157632).
test('tokenizeSubtitle avoids headword term-only fallback rank when reading-specific frequency exists', async () => {
  // Frequency payload returned by the mocked getTermFrequencies call; built
  // fresh on every call so no object is shared between invocations.
  const buildFrequencyRows = () => [
    {
      term: '無人',
      reading: null,
      dictionary: 'CC100',
      dictionaryPriority: 0,
      frequency: 157632,
      displayValue: null,
      displayValueParsed: false,
    },
    {
      term: '無人',
      reading: 'むじん',
      dictionary: 'CC100',
      dictionaryPriority: 0,
      frequency: 7141,
      displayValue: null,
      displayValueParsed: false,
    },
  ];

  // Parse payload returned for every script that is not a frequency lookup.
  const buildParseRows = () => [
    {
      source: 'scanning-parser',
      index: 0,
      content: [
        [
          {
            text: '無人',
            reading: 'むじん',
            headwords: [[{ term: '無人' }]],
          },
        ],
      ],
    },
  ];

  const runScript = async (script: string) => {
    if (!script.includes('getTermFrequencies')) {
      return buildParseRows();
    }
    // Only answer the lookup when the request pairs the term with its reading;
    // any other frequency request yields no rows.
    if (script.includes('"term":"無人","reading":"むじん"')) {
      return buildFrequencyRows();
    }
    return [];
  };

  const deps = makeDeps({
    getFrequencyDictionaryEnabled: () => true,
    getYomitanExt: () => ({ id: 'dummy-ext' }) as any,
    getYomitanParserWindow: () =>
      ({
        isDestroyed: () => false,
        webContents: { executeJavaScript: runScript },
      }) as unknown as Electron.BrowserWindow,
  });

  const result = await tokenizeSubtitle('無人', deps);

  assert.equal(result.tokens?.length, 1);
  assert.equal(result.tokens?.[0]?.frequencyRank, 7141);
});
test('tokenizeSubtitle prefers Yomitan frequency from highest-priority dictionary', async () => {
const result = await tokenizeSubtitle(
'猫',

View File

@@ -284,8 +284,7 @@ function buildYomitanFrequencyTermReadingList(
}
const readingRaw =
token.reading && token.reading.trim().length > 0 ? token.reading.trim() : null;
const reading = matchMode === 'headword' ? null : readingRaw;
return { term, reading };
return { term, reading: readingRaw };
})
.filter((pair): pair is { term: string; reading: string | null } => pair !== null);
}

View File

@@ -130,6 +130,28 @@ test('requestYomitanTermFrequencies returns normalized frequency entries', async
assert.match(scriptValue, /optionsGetFull/);
});
// Verifies that when an entry carries an array-valued displayValue, the
// normalized frequency is the first element of that array (7141) rather than
// the raw `frequency` field (157632).
test('requestYomitanTermFrequencies prefers primary rank from displayValue array pair', async () => {
  // Raw entry handed back by the mocked script executor; rebuilt on each call.
  const buildRawEntry = () => ({
    term: '無人',
    reading: 'むじん',
    dictionary: 'freq-dict',
    dictionaryPriority: 0,
    frequency: 157632,
    displayValue: [7141, 157632],
    displayValueParsed: true,
  });
  const deps = createDeps(async () => [buildRawEntry()]);

  const result = await requestYomitanTermFrequencies(
    [{ term: '無人', reading: 'むじん' }],
    deps,
    { error: () => undefined },
  );

  assert.equal(result.length, 1);
  const onlyEntry = result[0];
  assert.equal(onlyEntry?.term, '無人');
  assert.equal(onlyEntry?.frequency, 7141);
});
test('requestYomitanTermFrequencies caches profile metadata between calls', async () => {
const scripts: string[] = [];
const deps = createDeps(async (script) => {

View File

@@ -96,6 +96,28 @@ function parsePositiveFrequencyString(value: string): number | null {
return parsed;
}
/**
 * Extracts a positive frequency number from a loosely-typed value.
 *
 * Resolution order:
 * 1. Whatever `asPositiveInteger` accepts is returned as-is.
 * 2. A string is delegated to `parsePositiveFrequencyString`, and its result
 *    (including `null`) is returned directly.
 * 3. An array is scanned in order, recursing into each item; the first item
 *    that yields a non-null result wins.
 *
 * @param value - Untrusted value to interpret (number, string, array, or anything else).
 * @returns The first frequency found, or `null` when none can be derived.
 */
function parsePositiveFrequencyValue(value: unknown): number | null {
  const direct = asPositiveInteger(value);
  if (direct !== null) {
    return direct;
  }

  if (typeof value === 'string') {
    return parsePositiveFrequencyString(value);
  }

  if (!Array.isArray(value)) {
    return null;
  }

  // Depth-first scan: nested arrays are handled by the recursive call.
  for (const entry of value) {
    const candidate = parsePositiveFrequencyValue(entry);
    if (candidate !== null) {
      return candidate;
    }
  }
  return null;
}
function toYomitanTermFrequency(value: unknown): YomitanTermFrequency | null {
if (!isObject(value)) {
return null;
@@ -103,15 +125,12 @@ function toYomitanTermFrequency(value: unknown): YomitanTermFrequency | null {
const term = typeof value.term === 'string' ? value.term.trim() : '';
const dictionary = typeof value.dictionary === 'string' ? value.dictionary.trim() : '';
const rawFrequency = asPositiveInteger(value.frequency);
const displayValueRaw =
value.displayValue === null
? null
: typeof value.displayValue === 'string'
? value.displayValue
: null;
const rawFrequency = parsePositiveFrequencyValue(value.frequency);
const displayValueRaw = value.displayValue;
const parsedDisplayFrequency =
displayValueRaw !== null ? parsePositiveFrequencyString(displayValueRaw) : null;
displayValueRaw !== null && displayValueRaw !== undefined
? parsePositiveFrequencyValue(displayValueRaw)
: null;
const frequency = parsedDisplayFrequency ?? rawFrequency;
if (!term || !dictionary || frequency === null) {
return null;
@@ -128,7 +147,7 @@ function toYomitanTermFrequency(value: unknown): YomitanTermFrequency | null {
: typeof value.reading === 'string'
? value.reading
: null;
const displayValue = displayValueRaw;
const displayValue = typeof displayValueRaw === 'string' ? displayValueRaw : null;
const displayValueParsed = value.displayValueParsed === true;
return {