mirror of
https://github.com/ksyasuda/SubMiner.git
synced 2026-04-12 04:19:25 -07:00
feat: improve stats dashboard and annotation settings
This commit is contained in:
@@ -293,6 +293,29 @@ test('annotateTokens excludes default non-independent pos2 from frequency and N+
|
||||
assert.equal(result[0]?.isNPlusOneTarget, false);
|
||||
});
|
||||
|
||||
test('annotateTokens keeps frequency for kanji noun tokens even when mecab marks them non-independent', () => {
|
||||
const tokens = [
|
||||
makeToken({
|
||||
surface: '者',
|
||||
reading: 'もの',
|
||||
headword: '者',
|
||||
partOfSpeech: PartOfSpeech.other,
|
||||
pos1: '名詞',
|
||||
pos2: '非自立',
|
||||
pos3: '一般',
|
||||
startPos: 0,
|
||||
endPos: 1,
|
||||
frequencyRank: 475,
|
||||
}),
|
||||
];
|
||||
|
||||
const result = annotateTokens(tokens, makeDeps(), {
|
||||
minSentenceWordsForNPlusOne: 1,
|
||||
});
|
||||
|
||||
assert.equal(result[0]?.frequencyRank, 475);
|
||||
});
|
||||
|
||||
test('annotateTokens excludes likely kana SFX tokens from frequency when POS tags are missing', () => {
|
||||
const tokens = [
|
||||
makeToken({
|
||||
|
||||
@@ -89,6 +89,23 @@ function normalizePos2Tag(pos2: string | undefined): string {
|
||||
return typeof pos2 === 'string' ? pos2.trim() : '';
|
||||
}
|
||||
|
||||
function hasKanjiChar(text: string): boolean {
|
||||
for (const char of text) {
|
||||
const code = char.codePointAt(0);
|
||||
if (code === undefined) {
|
||||
continue;
|
||||
}
|
||||
if (
|
||||
(code >= 0x3400 && code <= 0x4dbf) ||
|
||||
(code >= 0x4e00 && code <= 0x9fff) ||
|
||||
(code >= 0xf900 && code <= 0xfaff)
|
||||
) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
function isExcludedComponent(
|
||||
pos1: string | undefined,
|
||||
pos2: string | undefined,
|
||||
@@ -169,6 +186,34 @@ function isFrequencyExcludedByPos(
|
||||
);
|
||||
}
|
||||
|
||||
function shouldKeepFrequencyForNonIndependentKanjiNoun(
|
||||
token: MergedToken,
|
||||
pos1Exclusions: ReadonlySet<string>,
|
||||
): boolean {
|
||||
if (pos1Exclusions.has('名詞')) {
|
||||
return false;
|
||||
}
|
||||
|
||||
const rank =
|
||||
typeof token.frequencyRank === 'number' && Number.isFinite(token.frequencyRank)
|
||||
? Math.max(1, Math.floor(token.frequencyRank))
|
||||
: null;
|
||||
if (rank === null) {
|
||||
return false;
|
||||
}
|
||||
|
||||
const pos1Parts = splitNormalizedTagParts(normalizePos1Tag(token.pos1));
|
||||
const pos2Parts = splitNormalizedTagParts(normalizePos2Tag(token.pos2));
|
||||
if (pos1Parts.length !== 1 || pos2Parts.length !== 1) {
|
||||
return false;
|
||||
}
|
||||
if (pos1Parts[0] !== '名詞' || pos2Parts[0] !== '非自立') {
|
||||
return false;
|
||||
}
|
||||
|
||||
return hasKanjiChar(token.surface) || hasKanjiChar(token.headword);
|
||||
}
|
||||
|
||||
export function shouldExcludeTokenFromVocabularyPersistence(
|
||||
token: MergedToken,
|
||||
options: Pick<AnnotationStageOptions, 'pos1Exclusions' | 'pos2Exclusions'> = {},
|
||||
@@ -454,7 +499,10 @@ function filterTokenFrequencyRank(
|
||||
pos1Exclusions: ReadonlySet<string>,
|
||||
pos2Exclusions: ReadonlySet<string>,
|
||||
): number | undefined {
|
||||
if (isFrequencyExcludedByPos(token, pos1Exclusions, pos2Exclusions)) {
|
||||
if (
|
||||
isFrequencyExcludedByPos(token, pos1Exclusions, pos2Exclusions) &&
|
||||
!shouldKeepFrequencyForNonIndependentKanjiNoun(token, pos1Exclusions)
|
||||
) {
|
||||
return undefined;
|
||||
}
|
||||
|
||||
|
||||
@@ -188,6 +188,7 @@ test('requestYomitanTermFrequencies returns normalized frequency entries', async
|
||||
{
|
||||
term: '猫',
|
||||
reading: 'ねこ',
|
||||
hasReading: true,
|
||||
dictionary: 'freq-dict',
|
||||
dictionaryPriority: 0,
|
||||
frequency: 77,
|
||||
@@ -197,6 +198,7 @@ test('requestYomitanTermFrequencies returns normalized frequency entries', async
|
||||
{
|
||||
term: '鍛える',
|
||||
reading: 'きたえる',
|
||||
hasReading: false,
|
||||
dictionary: 'freq-dict',
|
||||
dictionaryPriority: 1,
|
||||
frequency: 46961,
|
||||
@@ -217,9 +219,11 @@ test('requestYomitanTermFrequencies returns normalized frequency entries', async
|
||||
|
||||
assert.equal(result.length, 2);
|
||||
assert.equal(result[0]?.term, '猫');
|
||||
assert.equal(result[0]?.hasReading, true);
|
||||
assert.equal(result[0]?.frequency, 77);
|
||||
assert.equal(result[0]?.dictionaryPriority, 0);
|
||||
assert.equal(result[1]?.term, '鍛える');
|
||||
assert.equal(result[1]?.hasReading, false);
|
||||
assert.equal(result[1]?.frequency, 2847);
|
||||
assert.match(scriptValue, /getTermFrequencies/);
|
||||
assert.match(scriptValue, /optionsGetFull/);
|
||||
@@ -247,6 +251,96 @@ test('requestYomitanTermFrequencies prefers primary rank from displayValue array
|
||||
assert.equal(result[0]?.frequency, 7141);
|
||||
});
|
||||
|
||||
test('requestYomitanTermFrequencies prefers primary rank from displayValue string pair when raw frequency matches trailing count', async () => {
|
||||
const deps = createDeps(async () => [
|
||||
{
|
||||
term: '潜む',
|
||||
reading: 'ひそむ',
|
||||
dictionary: 'freq-dict',
|
||||
dictionaryPriority: 0,
|
||||
frequency: 121,
|
||||
displayValue: '118,121',
|
||||
displayValueParsed: false,
|
||||
},
|
||||
]);
|
||||
|
||||
const result = await requestYomitanTermFrequencies([{ term: '潜む', reading: 'ひそむ' }], deps, {
|
||||
error: () => undefined,
|
||||
});
|
||||
|
||||
assert.equal(result.length, 1);
|
||||
assert.equal(result[0]?.term, '潜む');
|
||||
assert.equal(result[0]?.frequency, 118);
|
||||
});
|
||||
|
||||
test('requestYomitanTermFrequencies uses leading display digits for displayValue strings', async () => {
|
||||
const deps = createDeps(async () => [
|
||||
{
|
||||
term: '例',
|
||||
reading: 'れい',
|
||||
dictionary: 'freq-dict',
|
||||
dictionaryPriority: 0,
|
||||
frequency: 1234,
|
||||
displayValue: '1,234',
|
||||
displayValueParsed: false,
|
||||
},
|
||||
]);
|
||||
|
||||
const result = await requestYomitanTermFrequencies([{ term: '例', reading: 'れい' }], deps, {
|
||||
error: () => undefined,
|
||||
});
|
||||
|
||||
assert.equal(result.length, 1);
|
||||
assert.equal(result[0]?.term, '例');
|
||||
assert.equal(result[0]?.frequency, 1);
|
||||
});
|
||||
|
||||
test('requestYomitanTermFrequencies ignores occurrence-based dictionaries for rank tagging', async () => {
|
||||
let metadataScript = '';
|
||||
const deps = createDeps(async (script) => {
|
||||
if (script.includes('getTermFrequencies')) {
|
||||
return [
|
||||
{
|
||||
term: '潜む',
|
||||
reading: 'ひそむ',
|
||||
dictionary: 'CC100',
|
||||
frequency: 118121,
|
||||
displayValue: null,
|
||||
displayValueParsed: false,
|
||||
},
|
||||
];
|
||||
}
|
||||
|
||||
if (script.includes('optionsGetFull')) {
|
||||
metadataScript = script;
|
||||
return {
|
||||
profileCurrent: 0,
|
||||
profileIndex: 0,
|
||||
scanLength: 40,
|
||||
dictionaries: ['CC100'],
|
||||
dictionaryPriorityByName: { CC100: 0 },
|
||||
dictionaryFrequencyModeByName: { CC100: 'occurrence-based' },
|
||||
profiles: [
|
||||
{
|
||||
options: {
|
||||
scanning: { length: 40 },
|
||||
dictionaries: [{ name: 'CC100', enabled: true, id: 0 }],
|
||||
},
|
||||
},
|
||||
],
|
||||
};
|
||||
}
|
||||
return [];
|
||||
});
|
||||
|
||||
const result = await requestYomitanTermFrequencies([{ term: '潜む', reading: 'ひそむ' }], deps, {
|
||||
error: () => undefined,
|
||||
});
|
||||
|
||||
assert.deepEqual(result, []);
|
||||
assert.match(metadataScript, /getDictionaryInfo/);
|
||||
});
|
||||
|
||||
test('requestYomitanTermFrequencies requests term-only fallback only after reading miss', async () => {
|
||||
const frequencyScripts: string[] = [];
|
||||
const deps = createDeps(async (script) => {
|
||||
@@ -485,6 +579,317 @@ test('requestYomitanScanTokens uses left-to-right termsFind scanning instead of
|
||||
assert.match(scannerScript ?? '', /deinflect:\s*true/);
|
||||
});
|
||||
|
||||
test('requestYomitanScanTokens extracts best frequency rank from selected termsFind entry', async () => {
|
||||
let scannerScript = '';
|
||||
const deps = createDeps(async (script) => {
|
||||
if (script.includes('termsFind')) {
|
||||
scannerScript = script;
|
||||
return [];
|
||||
}
|
||||
if (script.includes('optionsGetFull')) {
|
||||
return {
|
||||
profileCurrent: 0,
|
||||
profileIndex: 0,
|
||||
scanLength: 40,
|
||||
dictionaries: ['JPDBv2㋕', 'Jiten', 'CC100'],
|
||||
dictionaryPriorityByName: {
|
||||
'JPDBv2㋕': 0,
|
||||
Jiten: 1,
|
||||
CC100: 2,
|
||||
},
|
||||
dictionaryFrequencyModeByName: {
|
||||
'JPDBv2㋕': 'rank-based',
|
||||
Jiten: 'rank-based',
|
||||
CC100: 'rank-based',
|
||||
},
|
||||
profiles: [
|
||||
{
|
||||
options: {
|
||||
scanning: { length: 40 },
|
||||
dictionaries: [
|
||||
{ name: 'JPDBv2㋕', enabled: true, id: 0 },
|
||||
{ name: 'Jiten', enabled: true, id: 1 },
|
||||
{ name: 'CC100', enabled: true, id: 2 },
|
||||
],
|
||||
},
|
||||
},
|
||||
],
|
||||
};
|
||||
}
|
||||
return null;
|
||||
});
|
||||
|
||||
await requestYomitanScanTokens('潜み', deps, {
|
||||
error: () => undefined,
|
||||
});
|
||||
|
||||
const result = await runInjectedYomitanScript(scannerScript, (action, params) => {
|
||||
if (action !== 'termsFind') {
|
||||
throw new Error(`unexpected action: ${action}`);
|
||||
}
|
||||
|
||||
const text = (params as { text?: string } | undefined)?.text ?? '';
|
||||
if (!text.startsWith('潜み')) {
|
||||
return { originalTextLength: 0, dictionaryEntries: [] };
|
||||
}
|
||||
|
||||
return {
|
||||
originalTextLength: 2,
|
||||
dictionaryEntries: [
|
||||
{
|
||||
headwords: [
|
||||
{
|
||||
term: '潜む',
|
||||
reading: 'ひそむ',
|
||||
sources: [{ originalText: '潜み', isPrimary: true, matchType: 'exact' }],
|
||||
},
|
||||
],
|
||||
frequencies: [
|
||||
{
|
||||
headwordIndex: 0,
|
||||
dictionary: 'JPDBv2㋕',
|
||||
frequency: 20181,
|
||||
displayValue: '4073,20181句',
|
||||
},
|
||||
{
|
||||
headwordIndex: 0,
|
||||
dictionary: 'Jiten',
|
||||
frequency: 28594,
|
||||
displayValue: '4592,28594句',
|
||||
},
|
||||
{
|
||||
headwordIndex: 0,
|
||||
dictionary: 'CC100',
|
||||
frequency: 118121,
|
||||
displayValue: null,
|
||||
},
|
||||
],
|
||||
},
|
||||
],
|
||||
};
|
||||
});
|
||||
|
||||
assert.deepEqual(result, [
|
||||
{
|
||||
surface: '潜み',
|
||||
reading: 'ひそ',
|
||||
headword: '潜む',
|
||||
startPos: 0,
|
||||
endPos: 2,
|
||||
isNameMatch: false,
|
||||
frequencyRank: 4073,
|
||||
},
|
||||
]);
|
||||
});
|
||||
|
||||
test('requestYomitanScanTokens uses frequency from later exact-match entry when first exact entry has none', async () => {
|
||||
let scannerScript = '';
|
||||
const deps = createDeps(async (script) => {
|
||||
if (script.includes('termsFind')) {
|
||||
scannerScript = script;
|
||||
return [];
|
||||
}
|
||||
if (script.includes('optionsGetFull')) {
|
||||
return {
|
||||
profileCurrent: 0,
|
||||
profileIndex: 0,
|
||||
scanLength: 40,
|
||||
dictionaries: ['JPDBv2㋕', 'Jiten', 'CC100'],
|
||||
dictionaryPriorityByName: {
|
||||
'JPDBv2㋕': 0,
|
||||
Jiten: 1,
|
||||
CC100: 2,
|
||||
},
|
||||
dictionaryFrequencyModeByName: {
|
||||
'JPDBv2㋕': 'rank-based',
|
||||
Jiten: 'rank-based',
|
||||
CC100: 'rank-based',
|
||||
},
|
||||
profiles: [
|
||||
{
|
||||
options: {
|
||||
scanning: { length: 40 },
|
||||
dictionaries: [
|
||||
{ name: 'JPDBv2㋕', enabled: true, id: 0 },
|
||||
{ name: 'Jiten', enabled: true, id: 1 },
|
||||
{ name: 'CC100', enabled: true, id: 2 },
|
||||
],
|
||||
},
|
||||
},
|
||||
],
|
||||
};
|
||||
}
|
||||
return null;
|
||||
});
|
||||
|
||||
await requestYomitanScanTokens('者', deps, {
|
||||
error: () => undefined,
|
||||
});
|
||||
|
||||
const result = await runInjectedYomitanScript(scannerScript, (action, params) => {
|
||||
if (action !== 'termsFind') {
|
||||
throw new Error(`unexpected action: ${action}`);
|
||||
}
|
||||
|
||||
const text = (params as { text?: string } | undefined)?.text ?? '';
|
||||
if (!text.startsWith('者')) {
|
||||
return { originalTextLength: 0, dictionaryEntries: [] };
|
||||
}
|
||||
|
||||
return {
|
||||
originalTextLength: 1,
|
||||
dictionaryEntries: [
|
||||
{
|
||||
headwords: [
|
||||
{
|
||||
term: '者',
|
||||
reading: 'もの',
|
||||
sources: [{ originalText: '者', isPrimary: true, matchType: 'exact' }],
|
||||
},
|
||||
],
|
||||
frequencies: [],
|
||||
},
|
||||
{
|
||||
headwords: [
|
||||
{
|
||||
term: '者',
|
||||
reading: 'もの',
|
||||
sources: [{ originalText: '者', isPrimary: true, matchType: 'exact' }],
|
||||
},
|
||||
],
|
||||
frequencies: [
|
||||
{
|
||||
headwordIndex: 0,
|
||||
dictionary: 'JPDBv2㋕',
|
||||
frequency: 79601,
|
||||
displayValue: '475,79601句',
|
||||
},
|
||||
{
|
||||
headwordIndex: 0,
|
||||
dictionary: 'Jiten',
|
||||
frequency: 338,
|
||||
displayValue: '338',
|
||||
},
|
||||
],
|
||||
},
|
||||
],
|
||||
};
|
||||
});
|
||||
|
||||
assert.deepEqual(result, [
|
||||
{
|
||||
surface: '者',
|
||||
reading: 'もの',
|
||||
headword: '者',
|
||||
startPos: 0,
|
||||
endPos: 1,
|
||||
isNameMatch: false,
|
||||
frequencyRank: 475,
|
||||
},
|
||||
]);
|
||||
});
|
||||
|
||||
test('requestYomitanScanTokens can use frequency from later exact secondary-match entry', async () => {
|
||||
let scannerScript = '';
|
||||
const deps = createDeps(async (script) => {
|
||||
if (script.includes('termsFind')) {
|
||||
scannerScript = script;
|
||||
return [];
|
||||
}
|
||||
if (script.includes('optionsGetFull')) {
|
||||
return {
|
||||
profileCurrent: 0,
|
||||
profileIndex: 0,
|
||||
scanLength: 40,
|
||||
dictionaries: ['JPDBv2㋕', 'Jiten', 'CC100'],
|
||||
dictionaryPriorityByName: {
|
||||
'JPDBv2㋕': 0,
|
||||
Jiten: 1,
|
||||
CC100: 2,
|
||||
},
|
||||
dictionaryFrequencyModeByName: {
|
||||
'JPDBv2㋕': 'rank-based',
|
||||
Jiten: 'rank-based',
|
||||
CC100: 'rank-based',
|
||||
},
|
||||
profiles: [
|
||||
{
|
||||
options: {
|
||||
scanning: { length: 40 },
|
||||
dictionaries: [
|
||||
{ name: 'JPDBv2㋕', enabled: true, id: 0 },
|
||||
{ name: 'Jiten', enabled: true, id: 1 },
|
||||
{ name: 'CC100', enabled: true, id: 2 },
|
||||
],
|
||||
},
|
||||
},
|
||||
],
|
||||
};
|
||||
}
|
||||
return null;
|
||||
});
|
||||
|
||||
await requestYomitanScanTokens('者', deps, {
|
||||
error: () => undefined,
|
||||
});
|
||||
|
||||
const result = await runInjectedYomitanScript(scannerScript, (action, params) => {
|
||||
if (action !== 'termsFind') {
|
||||
throw new Error(`unexpected action: ${action}`);
|
||||
}
|
||||
|
||||
const text = (params as { text?: string } | undefined)?.text ?? '';
|
||||
if (!text.startsWith('者')) {
|
||||
return { originalTextLength: 0, dictionaryEntries: [] };
|
||||
}
|
||||
|
||||
return {
|
||||
originalTextLength: 1,
|
||||
dictionaryEntries: [
|
||||
{
|
||||
headwords: [
|
||||
{
|
||||
term: '者',
|
||||
reading: 'もの',
|
||||
sources: [{ originalText: '者', isPrimary: true, matchType: 'exact' }],
|
||||
},
|
||||
],
|
||||
frequencies: [],
|
||||
},
|
||||
{
|
||||
headwords: [
|
||||
{
|
||||
term: '者',
|
||||
reading: 'もの',
|
||||
sources: [{ originalText: '者', isPrimary: false, matchType: 'exact' }],
|
||||
},
|
||||
],
|
||||
frequencies: [
|
||||
{
|
||||
headwordIndex: 0,
|
||||
dictionary: 'JPDBv2㋕',
|
||||
frequency: 79601,
|
||||
displayValue: '475,79601句',
|
||||
},
|
||||
],
|
||||
},
|
||||
],
|
||||
};
|
||||
});
|
||||
|
||||
assert.deepEqual(result, [
|
||||
{
|
||||
surface: '者',
|
||||
reading: 'もの',
|
||||
headword: '者',
|
||||
startPos: 0,
|
||||
endPos: 1,
|
||||
isNameMatch: false,
|
||||
frequencyRank: 475,
|
||||
},
|
||||
]);
|
||||
});
|
||||
|
||||
test('requestYomitanScanTokens marks tokens backed by SubMiner character dictionary entries', async () => {
|
||||
const deps = createDeps(async (script) => {
|
||||
if (script.includes('optionsGetFull')) {
|
||||
|
||||
@@ -20,19 +20,24 @@ interface YomitanParserRuntimeDeps {
|
||||
createYomitanExtensionWindow?: (pageName: string) => Promise<BrowserWindow | null>;
|
||||
}
|
||||
|
||||
type YomitanFrequencyMode = 'occurrence-based' | 'rank-based';
|
||||
|
||||
export interface YomitanDictionaryInfo {
|
||||
title: string;
|
||||
revision?: string | number;
|
||||
frequencyMode?: YomitanFrequencyMode;
|
||||
}
|
||||
|
||||
export interface YomitanTermFrequency {
|
||||
term: string;
|
||||
reading: string | null;
|
||||
hasReading: boolean;
|
||||
dictionary: string;
|
||||
dictionaryPriority: number;
|
||||
frequency: number;
|
||||
displayValue: string | null;
|
||||
displayValueParsed: boolean;
|
||||
frequencyDerivedFromDisplayValue: boolean;
|
||||
}
|
||||
|
||||
export interface YomitanTermReadingPair {
|
||||
@@ -47,6 +52,7 @@ export interface YomitanScanToken {
|
||||
startPos: number;
|
||||
endPos: number;
|
||||
isNameMatch?: boolean;
|
||||
frequencyRank?: number;
|
||||
}
|
||||
|
||||
interface YomitanProfileMetadata {
|
||||
@@ -54,6 +60,7 @@ interface YomitanProfileMetadata {
|
||||
scanLength: number;
|
||||
dictionaries: string[];
|
||||
dictionaryPriorityByName: Record<string, number>;
|
||||
dictionaryFrequencyModeByName: Partial<Record<string, YomitanFrequencyMode>>;
|
||||
}
|
||||
|
||||
const DEFAULT_YOMITAN_SCAN_LENGTH = 40;
|
||||
@@ -78,7 +85,8 @@ function isScanTokenArray(value: unknown): value is YomitanScanToken[] {
|
||||
typeof entry.headword === 'string' &&
|
||||
typeof entry.startPos === 'number' &&
|
||||
typeof entry.endPos === 'number' &&
|
||||
(entry.isNameMatch === undefined || typeof entry.isNameMatch === 'boolean'),
|
||||
(entry.isNameMatch === undefined || typeof entry.isNameMatch === 'boolean') &&
|
||||
(entry.frequencyRank === undefined || typeof entry.frequencyRank === 'number'),
|
||||
)
|
||||
);
|
||||
}
|
||||
@@ -117,24 +125,22 @@ function parsePositiveFrequencyString(value: string): number | null {
|
||||
return null;
|
||||
}
|
||||
|
||||
const numericPrefix = trimmed.match(/^\d[\d,]*/)?.[0];
|
||||
if (!numericPrefix) {
|
||||
const numericMatch = trimmed.match(/[+-]?(\d+(\.\d*)?|\.\d+)([eE][+-]?\d+)?/)?.[0];
|
||||
if (!numericMatch) {
|
||||
return null;
|
||||
}
|
||||
|
||||
const chunks = numericPrefix.split(',');
|
||||
const normalizedNumber =
|
||||
chunks.length <= 1
|
||||
? (chunks[0] ?? '')
|
||||
: chunks.slice(1).every((chunk) => /^\d{3}$/.test(chunk))
|
||||
? chunks.join('')
|
||||
: (chunks[0] ?? '');
|
||||
const parsed = Number.parseInt(normalizedNumber, 10);
|
||||
const parsed = Number.parseFloat(numericMatch);
|
||||
if (!Number.isFinite(parsed) || parsed <= 0) {
|
||||
return null;
|
||||
}
|
||||
|
||||
return parsed;
|
||||
const normalized = Math.floor(parsed);
|
||||
if (!Number.isFinite(normalized) || normalized <= 0) {
|
||||
return null;
|
||||
}
|
||||
|
||||
return normalized;
|
||||
}
|
||||
|
||||
function parsePositiveFrequencyValue(value: unknown): number | null {
|
||||
@@ -159,6 +165,19 @@ function parsePositiveFrequencyValue(value: unknown): number | null {
|
||||
return null;
|
||||
}
|
||||
|
||||
function parseDisplayFrequencyValue(value: unknown): number | null {
|
||||
if (typeof value === 'string') {
|
||||
const leadingDigits = value.trim().match(/^\d+/)?.[0];
|
||||
if (!leadingDigits) {
|
||||
return null;
|
||||
}
|
||||
const parsed = Number.parseInt(leadingDigits, 10);
|
||||
return Number.isFinite(parsed) && parsed > 0 ? parsed : null;
|
||||
}
|
||||
|
||||
return parsePositiveFrequencyValue(value);
|
||||
}
|
||||
|
||||
function toYomitanTermFrequency(value: unknown): YomitanTermFrequency | null {
|
||||
if (!isObject(value)) {
|
||||
return null;
|
||||
@@ -169,9 +188,7 @@ function toYomitanTermFrequency(value: unknown): YomitanTermFrequency | null {
|
||||
const rawFrequency = parsePositiveFrequencyValue(value.frequency);
|
||||
const displayValueRaw = value.displayValue;
|
||||
const parsedDisplayFrequency =
|
||||
displayValueRaw !== null && displayValueRaw !== undefined
|
||||
? parsePositiveFrequencyValue(displayValueRaw)
|
||||
: null;
|
||||
displayValueRaw !== null && displayValueRaw !== undefined ? parseDisplayFrequencyValue(displayValueRaw) : null;
|
||||
const frequency = parsedDisplayFrequency ?? rawFrequency;
|
||||
if (!term || !dictionary || frequency === null) {
|
||||
return null;
|
||||
@@ -184,17 +201,20 @@ function toYomitanTermFrequency(value: unknown): YomitanTermFrequency | null {
|
||||
|
||||
const reading =
|
||||
value.reading === null ? null : typeof value.reading === 'string' ? value.reading : null;
|
||||
const hasReading = value.hasReading === false ? false : reading !== null;
|
||||
const displayValue = typeof displayValueRaw === 'string' ? displayValueRaw : null;
|
||||
const displayValueParsed = value.displayValueParsed === true;
|
||||
|
||||
return {
|
||||
term,
|
||||
reading,
|
||||
hasReading,
|
||||
dictionary,
|
||||
dictionaryPriority,
|
||||
frequency,
|
||||
displayValue,
|
||||
displayValueParsed,
|
||||
frequencyDerivedFromDisplayValue: parsedDisplayFrequency !== null,
|
||||
};
|
||||
}
|
||||
|
||||
@@ -300,17 +320,34 @@ function toYomitanProfileMetadata(value: unknown): YomitanProfileMetadata | null
|
||||
}
|
||||
}
|
||||
|
||||
const dictionaryFrequencyModeByNameRaw = value.dictionaryFrequencyModeByName;
|
||||
const dictionaryFrequencyModeByName: Partial<Record<string, YomitanFrequencyMode>> = {};
|
||||
if (isObject(dictionaryFrequencyModeByNameRaw)) {
|
||||
for (const [name, frequencyModeRaw] of Object.entries(dictionaryFrequencyModeByNameRaw)) {
|
||||
const normalizedName = name.trim();
|
||||
if (!normalizedName) {
|
||||
continue;
|
||||
}
|
||||
if (frequencyModeRaw !== 'occurrence-based' && frequencyModeRaw !== 'rank-based') {
|
||||
continue;
|
||||
}
|
||||
dictionaryFrequencyModeByName[normalizedName] = frequencyModeRaw;
|
||||
}
|
||||
}
|
||||
|
||||
return {
|
||||
profileIndex,
|
||||
scanLength,
|
||||
dictionaries,
|
||||
dictionaryPriorityByName,
|
||||
dictionaryFrequencyModeByName,
|
||||
};
|
||||
}
|
||||
|
||||
function normalizeFrequencyEntriesWithPriority(
|
||||
rawResult: unknown[],
|
||||
dictionaryPriorityByName: Record<string, number>,
|
||||
dictionaryFrequencyModeByName: Partial<Record<string, YomitanFrequencyMode>>,
|
||||
): YomitanTermFrequency[] {
|
||||
const normalized: YomitanTermFrequency[] = [];
|
||||
for (const entry of rawResult) {
|
||||
@@ -319,6 +356,10 @@ function normalizeFrequencyEntriesWithPriority(
|
||||
continue;
|
||||
}
|
||||
|
||||
if (dictionaryFrequencyModeByName[frequency.dictionary] === 'occurrence-based') {
|
||||
continue;
|
||||
}
|
||||
|
||||
const dictionaryPriority = dictionaryPriorityByName[frequency.dictionary];
|
||||
normalized.push({
|
||||
...frequency,
|
||||
@@ -425,8 +466,34 @@ async function requestYomitanProfileMetadata(
|
||||
acc[entry.name] = index;
|
||||
return acc;
|
||||
}, {});
|
||||
let dictionaryFrequencyModeByName = {};
|
||||
try {
|
||||
const dictionaryInfo = await invoke("getDictionaryInfo", undefined);
|
||||
dictionaryFrequencyModeByName = Array.isArray(dictionaryInfo)
|
||||
? dictionaryInfo.reduce((acc, entry) => {
|
||||
if (!entry || typeof entry !== "object" || typeof entry.title !== "string") {
|
||||
return acc;
|
||||
}
|
||||
if (
|
||||
entry.frequencyMode === "occurrence-based" ||
|
||||
entry.frequencyMode === "rank-based"
|
||||
) {
|
||||
acc[entry.title] = entry.frequencyMode;
|
||||
}
|
||||
return acc;
|
||||
}, {})
|
||||
: {};
|
||||
} catch {
|
||||
dictionaryFrequencyModeByName = {};
|
||||
}
|
||||
|
||||
return { profileIndex, scanLength, dictionaries, dictionaryPriorityByName };
|
||||
return {
|
||||
profileIndex,
|
||||
scanLength,
|
||||
dictionaries,
|
||||
dictionaryPriorityByName,
|
||||
dictionaryFrequencyModeByName
|
||||
};
|
||||
})();
|
||||
`;
|
||||
|
||||
@@ -774,7 +841,133 @@ const YOMITAN_SCANNING_HELPERS = String.raw`
|
||||
}
|
||||
return segments;
|
||||
}
|
||||
function getPreferredHeadword(dictionaryEntries, token) {
|
||||
function parsePositiveFrequencyNumber(value) {
|
||||
if (typeof value === 'number' && Number.isFinite(value) && value > 0) {
|
||||
return Math.max(1, Math.floor(value));
|
||||
}
|
||||
if (typeof value === 'string') {
|
||||
const numericMatch = value.trim().match(/[+-]?(\d+(\.\d*)?|\.\d+)([eE][+-]?\d+)?/)?.[0];
|
||||
if (!numericMatch) { return null; }
|
||||
const parsed = Number.parseFloat(numericMatch);
|
||||
if (!Number.isFinite(parsed) || parsed <= 0) { return null; }
|
||||
return Math.max(1, Math.floor(parsed));
|
||||
}
|
||||
if (Array.isArray(value)) {
|
||||
for (const item of value) {
|
||||
const parsed = parsePositiveFrequencyNumber(item);
|
||||
if (parsed !== null) { return parsed; }
|
||||
}
|
||||
}
|
||||
return null;
|
||||
}
|
||||
function parseDisplayFrequencyNumber(value) {
|
||||
if (typeof value === 'string') {
|
||||
const leadingDigits = value.trim().match(/^\d+/)?.[0];
|
||||
if (!leadingDigits) { return null; }
|
||||
const parsed = Number.parseInt(leadingDigits, 10);
|
||||
return Number.isFinite(parsed) && parsed > 0 ? parsed : null;
|
||||
}
|
||||
return parsePositiveFrequencyNumber(value);
|
||||
}
|
||||
function getFrequencyDictionaryName(frequency) {
|
||||
const candidates = [
|
||||
frequency?.dictionary,
|
||||
frequency?.dictionaryName,
|
||||
frequency?.name,
|
||||
frequency?.title,
|
||||
frequency?.dictionaryTitle,
|
||||
frequency?.dictionaryAlias
|
||||
];
|
||||
for (const candidate of candidates) {
|
||||
if (typeof candidate === 'string' && candidate.trim().length > 0) {
|
||||
return candidate.trim();
|
||||
}
|
||||
}
|
||||
return null;
|
||||
}
|
||||
function getBestFrequencyRank(dictionaryEntry, headwordIndex, dictionaryPriorityByName, dictionaryFrequencyModeByName) {
|
||||
let best = null;
|
||||
const headwordCount = Array.isArray(dictionaryEntry?.headwords) ? dictionaryEntry.headwords.length : 0;
|
||||
for (const frequency of dictionaryEntry?.frequencies || []) {
|
||||
if (!frequency || typeof frequency !== 'object') { continue; }
|
||||
const frequencyHeadwordIndex = frequency.headwordIndex;
|
||||
if (typeof frequencyHeadwordIndex === 'number') {
|
||||
if (frequencyHeadwordIndex !== headwordIndex) { continue; }
|
||||
} else if (headwordCount > 1) {
|
||||
continue;
|
||||
}
|
||||
const dictionary = getFrequencyDictionaryName(frequency);
|
||||
if (!dictionary) { continue; }
|
||||
if (dictionaryFrequencyModeByName[dictionary] === 'occurrence-based') { continue; }
|
||||
const rank =
|
||||
parseDisplayFrequencyNumber(frequency.displayValue) ??
|
||||
parsePositiveFrequencyNumber(frequency.frequency);
|
||||
if (rank === null) { continue; }
|
||||
const priorityRaw = dictionaryPriorityByName[dictionary];
|
||||
const fallbackPriority =
|
||||
typeof frequency.dictionaryIndex === 'number' && Number.isFinite(frequency.dictionaryIndex)
|
||||
? Math.max(0, Math.floor(frequency.dictionaryIndex))
|
||||
: Number.MAX_SAFE_INTEGER;
|
||||
const priority =
|
||||
typeof priorityRaw === 'number' && Number.isFinite(priorityRaw)
|
||||
? Math.max(0, Math.floor(priorityRaw))
|
||||
: fallbackPriority;
|
||||
if (best === null || priority < best.priority || (priority === best.priority && rank < best.rank)) {
|
||||
best = { priority, rank };
|
||||
}
|
||||
}
|
||||
return best?.rank ?? null;
|
||||
}
|
||||
function hasExactSource(headword, token, requirePrimary) {
|
||||
for (const src of headword.sources || []) {
|
||||
if (src.originalText !== token) { continue; }
|
||||
if (requirePrimary && !src.isPrimary) { continue; }
|
||||
if (src.matchType !== 'exact') { continue; }
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
function collectExactHeadwordMatches(dictionaryEntries, token, requirePrimary) {
|
||||
const matches = [];
|
||||
for (const dictionaryEntry of dictionaryEntries || []) {
|
||||
const headwords = Array.isArray(dictionaryEntry?.headwords) ? dictionaryEntry.headwords : [];
|
||||
for (let headwordIndex = 0; headwordIndex < headwords.length; headwordIndex += 1) {
|
||||
const headword = headwords[headwordIndex];
|
||||
if (!hasExactSource(headword, token, requirePrimary)) { continue; }
|
||||
matches.push({ dictionaryEntry, headword, headwordIndex });
|
||||
}
|
||||
}
|
||||
return matches;
|
||||
}
|
||||
function sameHeadword(match, preferredMatch) {
|
||||
if (!match || !preferredMatch) {
|
||||
return false;
|
||||
}
|
||||
if (match.headword?.term !== preferredMatch.headword?.term) {
|
||||
return false;
|
||||
}
|
||||
const matchReading = typeof match.headword?.reading === 'string' ? match.headword.reading : '';
|
||||
const preferredReading =
|
||||
typeof preferredMatch.headword?.reading === 'string' ? preferredMatch.headword.reading : '';
|
||||
return matchReading === preferredReading;
|
||||
}
|
||||
function getBestFrequencyRankForMatches(matches, dictionaryPriorityByName, dictionaryFrequencyModeByName) {
|
||||
let best = null;
|
||||
for (const match of matches) {
|
||||
const rank = getBestFrequencyRank(
|
||||
match.dictionaryEntry,
|
||||
match.headwordIndex,
|
||||
dictionaryPriorityByName,
|
||||
dictionaryFrequencyModeByName
|
||||
);
|
||||
if (rank === null) { continue; }
|
||||
if (best === null || rank < best) {
|
||||
best = rank;
|
||||
}
|
||||
}
|
||||
return best;
|
||||
}
|
||||
function getPreferredHeadword(dictionaryEntries, token, dictionaryPriorityByName, dictionaryFrequencyModeByName) {
|
||||
function appendDictionaryNames(target, value) {
|
||||
if (!value || typeof value !== 'object') {
|
||||
return;
|
||||
@@ -813,36 +1006,33 @@ const YOMITAN_SCANNING_HELPERS = String.raw`
|
||||
}
|
||||
return getDictionaryEntryNames(entry).some((name) => name.startsWith("SubMiner Character Dictionary"));
|
||||
}
|
||||
function hasExactPrimarySource(headword, token) {
|
||||
for (const src of headword.sources || []) {
|
||||
if (src.originalText !== token) { continue; }
|
||||
if (!src.isPrimary) { continue; }
|
||||
if (src.matchType !== 'exact') { continue; }
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
const exactPrimaryMatches = collectExactHeadwordMatches(dictionaryEntries, token, true);
|
||||
let matchedNameDictionary = false;
|
||||
if (includeNameMatchMetadata) {
|
||||
for (const dictionaryEntry of dictionaryEntries || []) {
|
||||
if (!isNameDictionaryEntry(dictionaryEntry)) { continue; }
|
||||
for (const headword of dictionaryEntry.headwords || []) {
|
||||
if (!hasExactPrimarySource(headword, token)) { continue; }
|
||||
for (const match of exactPrimaryMatches) {
|
||||
if (match.dictionaryEntry !== dictionaryEntry) { continue; }
|
||||
matchedNameDictionary = true;
|
||||
break;
|
||||
}
|
||||
if (matchedNameDictionary) { break; }
|
||||
}
|
||||
}
|
||||
for (const dictionaryEntry of dictionaryEntries || []) {
|
||||
for (const headword of dictionaryEntry.headwords || []) {
|
||||
if (!hasExactPrimarySource(headword, token)) { continue; }
|
||||
return {
|
||||
term: headword.term,
|
||||
reading: headword.reading,
|
||||
isNameMatch: matchedNameDictionary || isNameDictionaryEntry(dictionaryEntry)
|
||||
};
|
||||
}
|
||||
const preferredMatch = exactPrimaryMatches[0];
|
||||
if (preferredMatch) {
|
||||
const exactFrequencyMatches = collectExactHeadwordMatches(dictionaryEntries, token, false)
|
||||
.filter((match) => sameHeadword(match, preferredMatch));
|
||||
return {
|
||||
term: preferredMatch.headword.term,
|
||||
reading: preferredMatch.headword.reading,
|
||||
isNameMatch: matchedNameDictionary || isNameDictionaryEntry(preferredMatch.dictionaryEntry),
|
||||
frequencyRank: getBestFrequencyRankForMatches(
|
||||
exactFrequencyMatches.length > 0 ? exactFrequencyMatches : exactPrimaryMatches,
|
||||
dictionaryPriorityByName,
|
||||
dictionaryFrequencyModeByName
|
||||
)
|
||||
};
|
||||
}
|
||||
return null;
|
||||
}
|
||||
@@ -853,6 +1043,8 @@ function buildYomitanScanningScript(
|
||||
profileIndex: number,
|
||||
scanLength: number,
|
||||
includeNameMatchMetadata: boolean,
|
||||
dictionaryPriorityByName: Record<string, number>,
|
||||
dictionaryFrequencyModeByName: Partial<Record<string, YomitanFrequencyMode>>,
|
||||
): string {
|
||||
return `
|
||||
(async () => {
|
||||
@@ -876,6 +1068,8 @@ function buildYomitanScanningScript(
|
||||
});
|
||||
${YOMITAN_SCANNING_HELPERS}
|
||||
const includeNameMatchMetadata = ${includeNameMatchMetadata ? 'true' : 'false'};
|
||||
const dictionaryPriorityByName = ${JSON.stringify(dictionaryPriorityByName)};
|
||||
const dictionaryFrequencyModeByName = ${JSON.stringify(dictionaryFrequencyModeByName)};
|
||||
const text = ${JSON.stringify(text)};
|
||||
const details = {matchType: "exact", deinflect: true};
|
||||
const tokens = [];
|
||||
@@ -889,7 +1083,12 @@ ${YOMITAN_SCANNING_HELPERS}
|
||||
const originalTextLength = typeof result?.originalTextLength === "number" ? result.originalTextLength : 0;
|
||||
if (dictionaryEntries.length > 0 && originalTextLength > 0 && (originalTextLength !== character.length || isCodePointJapanese(codePoint))) {
|
||||
const source = substring.substring(0, originalTextLength);
|
||||
const preferredHeadword = getPreferredHeadword(dictionaryEntries, source);
|
||||
const preferredHeadword = getPreferredHeadword(
|
||||
dictionaryEntries,
|
||||
source,
|
||||
dictionaryPriorityByName,
|
||||
dictionaryFrequencyModeByName
|
||||
);
|
||||
if (preferredHeadword && typeof preferredHeadword.term === "string") {
|
||||
const reading = typeof preferredHeadword.reading === "string" ? preferredHeadword.reading : "";
|
||||
const segments = distributeFuriganaInflected(preferredHeadword.term, reading, source);
|
||||
@@ -900,6 +1099,10 @@ ${YOMITAN_SCANNING_HELPERS}
|
||||
startPos: i,
|
||||
endPos: i + originalTextLength,
|
||||
isNameMatch: includeNameMatchMetadata && preferredHeadword.isNameMatch === true,
|
||||
frequencyRank:
|
||||
typeof preferredHeadword.frequencyRank === "number" && Number.isFinite(preferredHeadword.frequencyRank)
|
||||
? Math.max(1, Math.floor(preferredHeadword.frequencyRank))
|
||||
: undefined,
|
||||
});
|
||||
i += originalTextLength;
|
||||
continue;
|
||||
@@ -1036,6 +1239,8 @@ export async function requestYomitanScanTokens(
|
||||
profileIndex,
|
||||
scanLength,
|
||||
options?.includeNameMatchMetadata === true,
|
||||
metadata?.dictionaryPriorityByName ?? {},
|
||||
metadata?.dictionaryFrequencyModeByName ?? {},
|
||||
),
|
||||
true,
|
||||
);
|
||||
@@ -1099,7 +1304,11 @@ async function fetchYomitanTermFrequencies(
|
||||
try {
|
||||
const rawResult = await parserWindow.webContents.executeJavaScript(script, true);
|
||||
return Array.isArray(rawResult)
|
||||
? normalizeFrequencyEntriesWithPriority(rawResult, metadata.dictionaryPriorityByName)
|
||||
? normalizeFrequencyEntriesWithPriority(
|
||||
rawResult,
|
||||
metadata.dictionaryPriorityByName,
|
||||
metadata.dictionaryFrequencyModeByName,
|
||||
)
|
||||
: [];
|
||||
} catch (err) {
|
||||
logger.error('Yomitan term frequency request failed:', (err as Error).message);
|
||||
@@ -1541,10 +1750,15 @@ export async function getYomitanDictionaryInfo(
|
||||
.map((entry) => {
|
||||
const title = typeof entry.title === 'string' ? entry.title.trim() : '';
|
||||
const revision = entry.revision;
|
||||
const frequencyMode: YomitanFrequencyMode | undefined =
|
||||
entry.frequencyMode === 'occurrence-based' || entry.frequencyMode === 'rank-based'
|
||||
? entry.frequencyMode
|
||||
: undefined;
|
||||
return {
|
||||
title,
|
||||
revision:
|
||||
typeof revision === 'string' || typeof revision === 'number' ? revision : undefined,
|
||||
frequencyMode,
|
||||
};
|
||||
})
|
||||
.filter((entry) => entry.title.length > 0);
|
||||
|
||||
Reference in New Issue
Block a user