mirror of
https://github.com/ksyasuda/SubMiner.git
synced 2026-03-01 18:22:41 -08:00
fix(tokenizer): disambiguate Yomitan frequency lookup by reading
This commit is contained in:
@@ -297,7 +297,7 @@ test('tokenizeSubtitle starts Yomitan frequency lookup and MeCab enrichment in p
|
|||||||
assert.equal(result.tokens?.[0]?.frequencyRank, 77);
|
assert.equal(result.tokens?.[0]?.frequencyRank, 77);
|
||||||
});
|
});
|
||||||
|
|
||||||
test('tokenizeSubtitle queries headword frequencies without forcing surface reading', async () => {
|
test('tokenizeSubtitle queries headword frequencies with token reading for disambiguation', async () => {
|
||||||
const result = await tokenizeSubtitle(
|
const result = await tokenizeSubtitle(
|
||||||
'鍛えた',
|
'鍛えた',
|
||||||
makeDeps({
|
makeDeps({
|
||||||
@@ -309,7 +309,7 @@ test('tokenizeSubtitle queries headword frequencies without forcing surface read
|
|||||||
webContents: {
|
webContents: {
|
||||||
executeJavaScript: async (script: string) => {
|
executeJavaScript: async (script: string) => {
|
||||||
if (script.includes('getTermFrequencies')) {
|
if (script.includes('getTermFrequencies')) {
|
||||||
if (!script.includes('"term":"鍛える","reading":null')) {
|
if (!script.includes('"term":"鍛える","reading":"きた"')) {
|
||||||
return [];
|
return [];
|
||||||
}
|
}
|
||||||
return [
|
return [
|
||||||
@@ -351,6 +351,68 @@ test('tokenizeSubtitle queries headword frequencies without forcing surface read
|
|||||||
assert.equal(result.tokens?.[0]?.frequencyRank, 2847);
|
assert.equal(result.tokens?.[0]?.frequencyRank, 2847);
|
||||||
});
|
});
|
||||||
|
|
||||||
|
test('tokenizeSubtitle avoids headword term-only fallback rank when reading-specific frequency exists', async () => {
|
||||||
|
const result = await tokenizeSubtitle(
|
||||||
|
'無人',
|
||||||
|
makeDeps({
|
||||||
|
getFrequencyDictionaryEnabled: () => true,
|
||||||
|
getYomitanExt: () => ({ id: 'dummy-ext' }) as any,
|
||||||
|
getYomitanParserWindow: () =>
|
||||||
|
({
|
||||||
|
isDestroyed: () => false,
|
||||||
|
webContents: {
|
||||||
|
executeJavaScript: async (script: string) => {
|
||||||
|
if (script.includes('getTermFrequencies')) {
|
||||||
|
if (!script.includes('"term":"無人","reading":"むじん"')) {
|
||||||
|
return [];
|
||||||
|
}
|
||||||
|
return [
|
||||||
|
{
|
||||||
|
term: '無人',
|
||||||
|
reading: null,
|
||||||
|
dictionary: 'CC100',
|
||||||
|
dictionaryPriority: 0,
|
||||||
|
frequency: 157632,
|
||||||
|
displayValue: null,
|
||||||
|
displayValueParsed: false,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
term: '無人',
|
||||||
|
reading: 'むじん',
|
||||||
|
dictionary: 'CC100',
|
||||||
|
dictionaryPriority: 0,
|
||||||
|
frequency: 7141,
|
||||||
|
displayValue: null,
|
||||||
|
displayValueParsed: false,
|
||||||
|
},
|
||||||
|
];
|
||||||
|
}
|
||||||
|
|
||||||
|
return [
|
||||||
|
{
|
||||||
|
source: 'scanning-parser',
|
||||||
|
index: 0,
|
||||||
|
content: [
|
||||||
|
[
|
||||||
|
{
|
||||||
|
text: '無人',
|
||||||
|
reading: 'むじん',
|
||||||
|
headwords: [[{ term: '無人' }]],
|
||||||
|
},
|
||||||
|
],
|
||||||
|
],
|
||||||
|
},
|
||||||
|
];
|
||||||
|
},
|
||||||
|
},
|
||||||
|
}) as unknown as Electron.BrowserWindow,
|
||||||
|
}),
|
||||||
|
);
|
||||||
|
|
||||||
|
assert.equal(result.tokens?.length, 1);
|
||||||
|
assert.equal(result.tokens?.[0]?.frequencyRank, 7141);
|
||||||
|
});
|
||||||
|
|
||||||
test('tokenizeSubtitle prefers Yomitan frequency from highest-priority dictionary', async () => {
|
test('tokenizeSubtitle prefers Yomitan frequency from highest-priority dictionary', async () => {
|
||||||
const result = await tokenizeSubtitle(
|
const result = await tokenizeSubtitle(
|
||||||
'猫',
|
'猫',
|
||||||
|
|||||||
@@ -284,8 +284,7 @@ function buildYomitanFrequencyTermReadingList(
|
|||||||
}
|
}
|
||||||
const readingRaw =
|
const readingRaw =
|
||||||
token.reading && token.reading.trim().length > 0 ? token.reading.trim() : null;
|
token.reading && token.reading.trim().length > 0 ? token.reading.trim() : null;
|
||||||
const reading = matchMode === 'headword' ? null : readingRaw;
|
return { term, reading: readingRaw };
|
||||||
return { term, reading };
|
|
||||||
})
|
})
|
||||||
.filter((pair): pair is { term: string; reading: string | null } => pair !== null);
|
.filter((pair): pair is { term: string; reading: string | null } => pair !== null);
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -130,6 +130,28 @@ test('requestYomitanTermFrequencies returns normalized frequency entries', async
|
|||||||
assert.match(scriptValue, /optionsGetFull/);
|
assert.match(scriptValue, /optionsGetFull/);
|
||||||
});
|
});
|
||||||
|
|
||||||
|
test('requestYomitanTermFrequencies prefers primary rank from displayValue array pair', async () => {
|
||||||
|
const deps = createDeps(async () => [
|
||||||
|
{
|
||||||
|
term: '無人',
|
||||||
|
reading: 'むじん',
|
||||||
|
dictionary: 'freq-dict',
|
||||||
|
dictionaryPriority: 0,
|
||||||
|
frequency: 157632,
|
||||||
|
displayValue: [7141, 157632],
|
||||||
|
displayValueParsed: true,
|
||||||
|
},
|
||||||
|
]);
|
||||||
|
|
||||||
|
const result = await requestYomitanTermFrequencies([{ term: '無人', reading: 'むじん' }], deps, {
|
||||||
|
error: () => undefined,
|
||||||
|
});
|
||||||
|
|
||||||
|
assert.equal(result.length, 1);
|
||||||
|
assert.equal(result[0]?.term, '無人');
|
||||||
|
assert.equal(result[0]?.frequency, 7141);
|
||||||
|
});
|
||||||
|
|
||||||
test('requestYomitanTermFrequencies caches profile metadata between calls', async () => {
|
test('requestYomitanTermFrequencies caches profile metadata between calls', async () => {
|
||||||
const scripts: string[] = [];
|
const scripts: string[] = [];
|
||||||
const deps = createDeps(async (script) => {
|
const deps = createDeps(async (script) => {
|
||||||
|
|||||||
@@ -96,6 +96,28 @@ function parsePositiveFrequencyString(value: string): number | null {
|
|||||||
return parsed;
|
return parsed;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
function parsePositiveFrequencyValue(value: unknown): number | null {
|
||||||
|
const numeric = asPositiveInteger(value);
|
||||||
|
if (numeric !== null) {
|
||||||
|
return numeric;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (typeof value === 'string') {
|
||||||
|
return parsePositiveFrequencyString(value);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (Array.isArray(value)) {
|
||||||
|
for (const item of value) {
|
||||||
|
const parsed = parsePositiveFrequencyValue(item);
|
||||||
|
if (parsed !== null) {
|
||||||
|
return parsed;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
|
||||||
function toYomitanTermFrequency(value: unknown): YomitanTermFrequency | null {
|
function toYomitanTermFrequency(value: unknown): YomitanTermFrequency | null {
|
||||||
if (!isObject(value)) {
|
if (!isObject(value)) {
|
||||||
return null;
|
return null;
|
||||||
@@ -103,15 +125,12 @@ function toYomitanTermFrequency(value: unknown): YomitanTermFrequency | null {
|
|||||||
|
|
||||||
const term = typeof value.term === 'string' ? value.term.trim() : '';
|
const term = typeof value.term === 'string' ? value.term.trim() : '';
|
||||||
const dictionary = typeof value.dictionary === 'string' ? value.dictionary.trim() : '';
|
const dictionary = typeof value.dictionary === 'string' ? value.dictionary.trim() : '';
|
||||||
const rawFrequency = asPositiveInteger(value.frequency);
|
const rawFrequency = parsePositiveFrequencyValue(value.frequency);
|
||||||
const displayValueRaw =
|
const displayValueRaw = value.displayValue;
|
||||||
value.displayValue === null
|
|
||||||
? null
|
|
||||||
: typeof value.displayValue === 'string'
|
|
||||||
? value.displayValue
|
|
||||||
: null;
|
|
||||||
const parsedDisplayFrequency =
|
const parsedDisplayFrequency =
|
||||||
displayValueRaw !== null ? parsePositiveFrequencyString(displayValueRaw) : null;
|
displayValueRaw !== null && displayValueRaw !== undefined
|
||||||
|
? parsePositiveFrequencyValue(displayValueRaw)
|
||||||
|
: null;
|
||||||
const frequency = parsedDisplayFrequency ?? rawFrequency;
|
const frequency = parsedDisplayFrequency ?? rawFrequency;
|
||||||
if (!term || !dictionary || frequency === null) {
|
if (!term || !dictionary || frequency === null) {
|
||||||
return null;
|
return null;
|
||||||
@@ -128,7 +147,7 @@ function toYomitanTermFrequency(value: unknown): YomitanTermFrequency | null {
|
|||||||
: typeof value.reading === 'string'
|
: typeof value.reading === 'string'
|
||||||
? value.reading
|
? value.reading
|
||||||
: null;
|
: null;
|
||||||
const displayValue = displayValueRaw;
|
const displayValue = typeof displayValueRaw === 'string' ? displayValueRaw : null;
|
||||||
const displayValueParsed = value.displayValueParsed === true;
|
const displayValueParsed = value.displayValueParsed === true;
|
||||||
|
|
||||||
return {
|
return {
|
||||||
|
|||||||
Reference in New Issue
Block a user