mirror of
https://github.com/ksyasuda/SubMiner.git
synced 2026-03-01 18:22:41 -08:00
fix(tokenizer): disambiguate Yomitan frequency lookup by reading
This commit is contained in:
@@ -297,7 +297,7 @@ test('tokenizeSubtitle starts Yomitan frequency lookup and MeCab enrichment in p
|
||||
assert.equal(result.tokens?.[0]?.frequencyRank, 77);
|
||||
});
|
||||
|
||||
test('tokenizeSubtitle queries headword frequencies without forcing surface reading', async () => {
|
||||
test('tokenizeSubtitle queries headword frequencies with token reading for disambiguation', async () => {
|
||||
const result = await tokenizeSubtitle(
|
||||
'鍛えた',
|
||||
makeDeps({
|
||||
@@ -309,7 +309,7 @@ test('tokenizeSubtitle queries headword frequencies without forcing surface read
|
||||
webContents: {
|
||||
executeJavaScript: async (script: string) => {
|
||||
if (script.includes('getTermFrequencies')) {
|
||||
if (!script.includes('"term":"鍛える","reading":null')) {
|
||||
if (!script.includes('"term":"鍛える","reading":"きた"')) {
|
||||
return [];
|
||||
}
|
||||
return [
|
||||
@@ -351,6 +351,68 @@ test('tokenizeSubtitle queries headword frequencies without forcing surface read
|
||||
assert.equal(result.tokens?.[0]?.frequencyRank, 2847);
|
||||
});
|
||||
|
||||
// Regression test: the stubbed frequency lookup returns two CC100 entries for
// 無人 — a term-only one (reading: null, 157632) and a reading-specific one
// (むじん, 7141). The single resulting token must carry the reading-specific rank.
test('tokenizeSubtitle avoids headword term-only fallback rank when reading-specific frequency exists', async () => {
  // Builds a brand-new stub window on every call so no state is shared between lookups.
  const createParserWindow = () =>
    ({
      isDestroyed: () => false,
      webContents: {
        executeJavaScript: async (script: string) => {
          const isFrequencyRequest = script.includes('getTermFrequencies');

          if (!isFrequencyRequest) {
            // Any other script is treated as the parse request: one token with a reading.
            return [
              {
                source: 'scanning-parser',
                index: 0,
                content: [
                  [
                    {
                      text: '無人',
                      reading: 'むじん',
                      headwords: [[{ term: '無人' }]],
                    },
                  ],
                ],
              },
            ];
          }

          // Frequencies are only served when the query carries the token reading.
          const queriesWithReading = script.includes('"term":"無人","reading":"むじん"');
          if (!queriesWithReading) {
            return [];
          }

          const termOnlyEntry = {
            term: '無人',
            reading: null,
            dictionary: 'CC100',
            dictionaryPriority: 0,
            frequency: 157632,
            displayValue: null,
            displayValueParsed: false,
          };
          const readingSpecificEntry = {
            term: '無人',
            reading: 'むじん',
            dictionary: 'CC100',
            dictionaryPriority: 0,
            frequency: 7141,
            displayValue: null,
            displayValueParsed: false,
          };
          return [termOnlyEntry, readingSpecificEntry];
        },
      },
    }) as unknown as Electron.BrowserWindow;

  const deps = makeDeps({
    getFrequencyDictionaryEnabled: () => true,
    getYomitanExt: () => ({ id: 'dummy-ext' }) as any,
    getYomitanParserWindow: () => createParserWindow(),
  });

  const result = await tokenizeSubtitle('無人', deps);

  assert.equal(result.tokens?.length, 1);
  assert.equal(result.tokens?.[0]?.frequencyRank, 7141);
});
|
||||
|
||||
test('tokenizeSubtitle prefers Yomitan frequency from highest-priority dictionary', async () => {
|
||||
const result = await tokenizeSubtitle(
|
||||
'猫',
|
||||
|
||||
@@ -284,8 +284,7 @@ function buildYomitanFrequencyTermReadingList(
|
||||
}
|
||||
const readingRaw =
|
||||
token.reading && token.reading.trim().length > 0 ? token.reading.trim() : null;
|
||||
const reading = matchMode === 'headword' ? null : readingRaw;
|
||||
return { term, reading };
|
||||
return { term, reading: readingRaw };
|
||||
})
|
||||
.filter((pair): pair is { term: string; reading: string | null } => pair !== null);
|
||||
}
|
||||
|
||||
@@ -130,6 +130,28 @@ test('requestYomitanTermFrequencies returns normalized frequency entries', async
|
||||
assert.match(scriptValue, /optionsGetFull/);
|
||||
});
|
||||
|
||||
// When displayValue arrives as an array pair, the normalized frequency is
// expected to be its first entry (7141) rather than the raw `frequency`
// field (157632).
test('requestYomitanTermFrequencies prefers primary rank from displayValue array pair', async () => {
  // The stubbed script runner always answers with a single raw entry.
  const respondWithArrayDisplayValue = async () => [
    {
      term: '無人',
      reading: 'むじん',
      dictionary: 'freq-dict',
      dictionaryPriority: 0,
      frequency: 157632,
      displayValue: [7141, 157632],
      displayValueParsed: true,
    },
  ];
  const deps = createDeps(respondWithArrayDisplayValue);

  const termReadingList = [{ term: '無人', reading: 'むじん' }];
  const silentLogger = { error: () => undefined };

  const result = await requestYomitanTermFrequencies(termReadingList, deps, silentLogger);

  assert.equal(result.length, 1);
  const firstEntry = result[0];
  assert.equal(firstEntry?.term, '無人');
  assert.equal(firstEntry?.frequency, 7141);
});
|
||||
|
||||
test('requestYomitanTermFrequencies caches profile metadata between calls', async () => {
|
||||
const scripts: string[] = [];
|
||||
const deps = createDeps(async (script) => {
|
||||
|
||||
@@ -96,6 +96,28 @@ function parsePositiveFrequencyString(value: string): number | null {
|
||||
return parsed;
|
||||
}
|
||||
|
||||
/**
 * Extracts a frequency from a loosely-typed value.
 *
 * Resolution order:
 * 1. Whatever `asPositiveInteger(value)` yields, when it is not `null`.
 * 2. For strings, the result of `parsePositiveFrequencyString(value)`.
 * 3. For arrays, the first element (visited in order, recursing into nested
 *    arrays) that resolves to a non-null result.
 *
 * @param value - Raw value to interpret.
 * @returns The first frequency found, or `null` when nothing could be parsed.
 */
function parsePositiveFrequencyValue(value: unknown): number | null {
  const direct = asPositiveInteger(value);
  if (direct !== null) {
    return direct;
  }

  if (typeof value === 'string') {
    return parsePositiveFrequencyString(value);
  }

  if (!Array.isArray(value)) {
    return null;
  }

  // Arrays: the earliest parseable element wins.
  for (const element of value) {
    const candidate = parsePositiveFrequencyValue(element);
    if (candidate !== null) {
      return candidate;
    }
  }

  return null;
}
|
||||
|
||||
function toYomitanTermFrequency(value: unknown): YomitanTermFrequency | null {
|
||||
if (!isObject(value)) {
|
||||
return null;
|
||||
@@ -103,15 +125,12 @@ function toYomitanTermFrequency(value: unknown): YomitanTermFrequency | null {
|
||||
|
||||
const term = typeof value.term === 'string' ? value.term.trim() : '';
|
||||
const dictionary = typeof value.dictionary === 'string' ? value.dictionary.trim() : '';
|
||||
const rawFrequency = asPositiveInteger(value.frequency);
|
||||
const displayValueRaw =
|
||||
value.displayValue === null
|
||||
? null
|
||||
: typeof value.displayValue === 'string'
|
||||
? value.displayValue
|
||||
: null;
|
||||
const rawFrequency = parsePositiveFrequencyValue(value.frequency);
|
||||
const displayValueRaw = value.displayValue;
|
||||
const parsedDisplayFrequency =
|
||||
displayValueRaw !== null ? parsePositiveFrequencyString(displayValueRaw) : null;
|
||||
displayValueRaw !== null && displayValueRaw !== undefined
|
||||
? parsePositiveFrequencyValue(displayValueRaw)
|
||||
: null;
|
||||
const frequency = parsedDisplayFrequency ?? rawFrequency;
|
||||
if (!term || !dictionary || frequency === null) {
|
||||
return null;
|
||||
@@ -128,7 +147,7 @@ function toYomitanTermFrequency(value: unknown): YomitanTermFrequency | null {
|
||||
: typeof value.reading === 'string'
|
||||
? value.reading
|
||||
: null;
|
||||
const displayValue = displayValueRaw;
|
||||
const displayValue = typeof displayValueRaw === 'string' ? displayValueRaw : null;
|
||||
const displayValueParsed = value.displayValueParsed === true;
|
||||
|
||||
return {
|
||||
|
||||
Reference in New Issue
Block a user