feat: improve stats dashboard and annotation settings

This commit is contained in:
2026-03-15 21:18:35 -07:00
parent 650e95cdc3
commit 04682a02cc
75 changed files with 3420 additions and 619 deletions

View File

@@ -53,6 +53,7 @@ const VOCABULARY_STATS = [
pos2: '自立',
pos3: null,
frequency: 100,
frequencyRank: 42,
firstSeen: Date.now(),
lastSeen: Date.now(),
},
@@ -132,9 +133,7 @@ const EPISODES_PER_DAY = [
{ epochDay: Math.floor(Date.now() / 86_400_000), episodeCount: 1 },
];
const NEW_ANIME_PER_DAY = [
{ epochDay: Math.floor(Date.now() / 86_400_000) - 2, newAnimeCount: 2 },
];
const NEW_ANIME_PER_DAY = [{ epochDay: Math.floor(Date.now() / 86_400_000) - 2, newAnimeCount: 2 }];
const WATCH_TIME_PER_ANIME = [
{
@@ -210,7 +209,12 @@ function createMockTracker(
getSessionSummaries: async () => SESSION_SUMMARIES,
getDailyRollups: async () => DAILY_ROLLUPS,
getMonthlyRollups: async () => [],
getQueryHints: async () => ({ totalSessions: 5, activeSessions: 1, episodesToday: 2, activeAnimeCount: 3 }),
getQueryHints: async () => ({
totalSessions: 5,
activeSessions: 1,
episodesToday: 2,
activeAnimeCount: 3,
}),
getSessionTimeline: async () => [],
getSessionEvents: async () => [],
getVocabularyStats: async () => VOCABULARY_STATS,
@@ -445,7 +449,9 @@ describe('stats server API routes', () => {
}),
);
const res = await app.request('/api/stats/kanji/occurrences?kanji=%E6%97%A5&limit=999999&offset=10');
const res = await app.request(
'/api/stats/kanji/occurrences?kanji=%E6%97%A5&limit=999999&offset=10',
);
assert.equal(res.status, 200);
const body = await res.json();
assert.ok(Array.isArray(body));
@@ -711,6 +717,23 @@ describe('stats server API routes', () => {
assert.equal(res.status, 400);
});
// The DELETE session route must hand the parsed path id to the tracker and
// acknowledge with `{ ok: true }`.
it('DELETE /api/stats/sessions/:sessionId deletes a session', async () => {
  let receivedSessionId = 0;
  const tracker = createMockTracker({
    deleteSession: async (sessionId: number) => {
      receivedSessionId = sessionId;
    },
  });
  const app = createStatsApp(tracker);

  const response = await app.request('/api/stats/sessions/42', { method: 'DELETE' });

  assert.equal(response.status, 200);
  assert.equal(receivedSessionId, 42);
  assert.deepEqual(await response.json(), { ok: true });
});
it('POST /api/stats/anki/browse returns 400 for missing noteId', async () => {
const app = createStatsApp(createMockTracker());
const res = await app.request('/api/stats/anki/browse', { method: 'POST' });

View File

@@ -130,6 +130,56 @@ test('createFrequencyDictionaryLookup parses composite displayValue by primary r
assert.equal(lookup('高み'), 9933);
});
// A string displayValue contributes only its leading run of digits as the rank,
// so '118,121' resolves to 118 and '1,234' resolves to 1.
test('createFrequencyDictionaryLookup uses leading display digits for displayValue strings', async () => {
  const dictionaryDir = fs.mkdtempSync(path.join(os.tmpdir(), 'subminer-frequency-dict-'));
  const termBank = [
    ['潜む', 1, { frequency: { value: 121, displayValue: '118,121' } }],
    ['例', 2, { frequency: { value: 1234, displayValue: '1,234' } }],
  ];
  fs.writeFileSync(path.join(dictionaryDir, 'term_meta_bank_1.json'), JSON.stringify(termBank));

  const lookup = await createFrequencyDictionaryLookup({
    searchPaths: [dictionaryDir],
    log: () => undefined,
  });

  assert.equal(lookup('潜む'), 118);
  assert.equal(lookup('例'), 1);
});
// A dictionary whose index.json declares `frequencyMode: 'occurrence-based'` must
// contribute no ranks, and the skip must be logged with the mode and the title.
test('createFrequencyDictionaryLookup ignores occurrence-based Yomitan dictionaries', async () => {
  const logged: string[] = [];
  const dictionaryDir = fs.mkdtempSync(path.join(os.tmpdir(), 'subminer-frequency-dict-'));
  const dictionaryIndex = {
    title: 'CC100',
    revision: '1',
    frequencyMode: 'occurrence-based',
  };
  const termBank = [['潜む', 1, { frequency: { value: 118121 } }]];
  fs.writeFileSync(path.join(dictionaryDir, 'index.json'), JSON.stringify(dictionaryIndex));
  fs.writeFileSync(path.join(dictionaryDir, 'term_meta_bank_1.json'), JSON.stringify(termBank));

  const lookup = await createFrequencyDictionaryLookup({
    searchPaths: [dictionaryDir],
    log: (message) => {
      logged.push(message);
    },
  });

  assert.equal(lookup('潜む'), null);
  const skipWasLogged = logged.some(
    (entry) => entry.includes('occurrence-based') && entry.includes('CC100'),
  );
  assert.equal(skipWasLogged, true);
});
test('createFrequencyDictionaryLookup does not require synchronous fs APIs', async () => {
const tempDir = fs.mkdtempSync(path.join(os.tmpdir(), 'subminer-frequency-dict-'));
const bankPath = path.join(tempDir, 'term_meta_bank_1.json');

View File

@@ -6,6 +6,8 @@ export interface FrequencyDictionaryLookupOptions {
log: (message: string) => void;
}
type FrequencyDictionaryMode = 'occurrence-based' | 'rank-based';
interface FrequencyDictionaryEntry {
rank: number;
term: string;
@@ -29,30 +31,67 @@ function normalizeFrequencyTerm(value: string): string {
return value.trim().toLowerCase();
}
/**
 * Reads `index.json` from a dictionary directory and extracts its title and
 * declared frequency mode.
 *
 * Never rejects on a missing, unreadable or malformed index: every failure
 * resolves to `{ title: null, frequencyMode: null }`. Read errors other than
 * ENOENT and JSON parse errors are reported through `log`.
 *
 * @param dictionaryPath - Directory expected to contain `index.json`.
 * @param log - Sink for diagnostic messages.
 * @returns The trimmed non-empty title (or null) and the frequency mode when it
 *   is exactly 'occurrence-based' or 'rank-based' (otherwise null).
 */
async function readDictionaryMetadata(
  dictionaryPath: string,
  log: (message: string) => void,
): Promise<{ title: string | null; frequencyMode: FrequencyDictionaryMode | null }> {
  const unknownMetadata = (): { title: null; frequencyMode: null } => ({
    title: null,
    frequencyMode: null,
  });
  const indexPath = path.join(dictionaryPath, 'index.json');

  let indexText: string;
  try {
    indexText = await fs.readFile(indexPath, 'utf-8');
  } catch (error) {
    // A dictionary without an index file is normal; only other failures are logged.
    if (!isErrorCode(error, 'ENOENT')) {
      log(`Failed to read frequency dictionary index ${indexPath}: ${String(error)}`);
    }
    return unknownMetadata();
  }

  let parsedIndex: unknown;
  try {
    parsedIndex = JSON.parse(indexText) as unknown;
  } catch {
    log(`Failed to parse frequency dictionary index as JSON: ${indexPath}`);
    return unknownMetadata();
  }

  if (typeof parsedIndex !== 'object' || !parsedIndex) {
    return unknownMetadata();
  }

  const candidateTitle = (parsedIndex as { title?: unknown }).title;
  const candidateMode = (parsedIndex as { frequencyMode?: unknown }).frequencyMode;

  const trimmedTitle = typeof candidateTitle === 'string' ? candidateTitle.trim() : '';
  let frequencyMode: FrequencyDictionaryMode | null = null;
  if (candidateMode === 'occurrence-based' || candidateMode === 'rank-based') {
    frequencyMode = candidateMode;
  }

  return {
    title: trimmedTitle.length > 0 ? trimmedTitle : null,
    frequencyMode,
  };
}
function parsePositiveFrequencyString(value: string): number | null {
const trimmed = value.trim();
if (!trimmed) {
return null;
}
const numericPrefix = trimmed.match(/^\d[\d,]*/)?.[0];
if (!numericPrefix) {
const numericMatch = trimmed.match(/[+-]?(\d+(\.\d*)?|\.\d+)([eE][+-]?\d+)?/)?.[0];
if (!numericMatch) {
return null;
}
const chunks = numericPrefix.split(',');
const normalizedNumber =
chunks.length <= 1
? (chunks[0] ?? '')
: chunks.slice(1).every((chunk) => /^\d{3}$/.test(chunk))
? chunks.join('')
: (chunks[0] ?? '');
const parsed = Number.parseInt(normalizedNumber, 10);
const parsed = Number.parseFloat(numericMatch);
if (!Number.isFinite(parsed) || parsed <= 0) {
return null;
}
return parsed;
const normalized = Math.floor(parsed);
if (!Number.isFinite(normalized) || normalized <= 0) {
return null;
}
return normalized;
}
function parsePositiveFrequencyNumber(value: unknown): number | null {
@@ -68,18 +107,32 @@ function parsePositiveFrequencyNumber(value: unknown): number | null {
return null;
}
/**
 * Parses a dictionary `displayValue` into a positive integer rank.
 *
 * For strings, only the run of digits at the very start of the trimmed text is
 * used, so '118,121' yields 118 and '1,234' yields 1. Strings that do not start
 * with a digit, or whose leading digits are not a positive finite number, yield
 * null. Non-string values are delegated to `parsePositiveFrequencyNumber`.
 */
function parseDisplayFrequencyNumber(value: unknown): number | null {
  if (typeof value !== 'string') {
    return parsePositiveFrequencyNumber(value);
  }

  const digitRun = /^\d+/.exec(value.trim());
  if (digitRun === null) {
    return null;
  }

  const rank = Number.parseInt(digitRun[0], 10);
  if (!Number.isFinite(rank) || rank <= 0) {
    return null;
  }
  return rank;
}
function extractFrequencyDisplayValue(meta: unknown): number | null {
if (!meta || typeof meta !== 'object') return null;
const frequency = (meta as { frequency?: unknown }).frequency;
if (!frequency || typeof frequency !== 'object') return null;
const rawValue = (frequency as { value?: unknown }).value;
const parsedRawValue = parsePositiveFrequencyNumber(rawValue);
const displayValue = (frequency as { displayValue?: unknown }).displayValue;
const parsedDisplayValue = parsePositiveFrequencyNumber(displayValue);
const parsedDisplayValue = parseDisplayFrequencyNumber(displayValue);
if (parsedDisplayValue !== null) {
return parsedDisplayValue;
}
const rawValue = (frequency as { value?: unknown }).value;
return parsePositiveFrequencyNumber(rawValue);
return parsedRawValue;
}
function asFrequencyDictionaryEntry(entry: unknown): FrequencyDictionaryEntry | null {
@@ -141,6 +194,15 @@ async function collectDictionaryFromPath(
log: (message: string) => void,
): Promise<Map<string, number>> {
const terms = new Map<string, number>();
const metadata = await readDictionaryMetadata(dictionaryPath, log);
if (metadata.frequencyMode === 'occurrence-based') {
log(
`Skipping occurrence-based frequency dictionary ${
metadata.title ?? dictionaryPath
}; SubMiner frequency tags require rank-based values.`,
);
return terms;
}
let fileNames: string[];
try {

View File

@@ -57,6 +57,8 @@ import {
getWordOccurrences,
getVideoDurationMs,
markVideoWatched,
deleteSession as deleteSessionQuery,
deleteVideo as deleteVideoQuery,
} from './immersion-tracker/query';
import {
buildVideoKey,
@@ -125,6 +127,7 @@ import {
type WordDetailRow,
type WordOccurrenceRow,
type VocabularyStatsRow,
type CountedWordOccurrence,
} from './immersion-tracker/types';
import type { MergedToken } from '../../types';
import { shouldExcludeTokenFromVocabularyPersistence } from './tokenizer/annotation-stage';
@@ -402,6 +405,70 @@ export class ImmersionTrackerService {
markVideoWatched(this.db, videoId, watched);
}
/** Deletes one tracked session by delegating to the query-layer `deleteSession`. */
async deleteSession(sessionId: number): Promise<void> {
  const { db } = this;
  deleteSessionQuery(db, sessionId);
}
/** Deletes one tracked video by delegating to the query-layer `deleteVideo`. */
async deleteVideo(videoId: number): Promise<void> {
  const { db } = this;
  deleteVideoQuery(db, videoId);
}
/**
 * Re-links an anime row to a different AniList entry and refreshes its stored metadata.
 *
 * Title and episode-count columns keep their current value when the corresponding
 * field is null or omitted (COALESCE); `description` is always overwritten, so
 * omitting it clears the stored text. When `info.coverUrl` is set and the anime has
 * videos, the cover image is downloaded once (best effort) and an `imm_media_art`
 * row is upserted for every video. A failed download still updates the URL and
 * titles but keeps any previously stored blob.
 *
 * @param animeId - `anime_id` of the `imm_anime` row to update.
 * @param info - AniList metadata to store; only `anilistId` is required.
 */
async reassignAnimeAnilist(
  animeId: number,
  info: {
    anilistId: number;
    titleRomaji?: string | null;
    titleEnglish?: string | null;
    titleNative?: string | null;
    episodesTotal?: number | null;
    description?: string | null;
    coverUrl?: string | null;
  },
): Promise<void> {
  const titleRomaji = info.titleRomaji ?? null;
  const titleEnglish = info.titleEnglish ?? null;
  const episodesTotal = info.episodesTotal ?? null;

  this.db
    .prepare(
      `
      UPDATE imm_anime
      SET anilist_id = ?,
          title_romaji = COALESCE(?, title_romaji),
          title_english = COALESCE(?, title_english),
          title_native = COALESCE(?, title_native),
          episodes_total = COALESCE(?, episodes_total),
          description = ?,
          LAST_UPDATE_DATE = ?
      WHERE anime_id = ?
      `,
    )
    .run(
      info.anilistId,
      titleRomaji,
      titleEnglish,
      info.titleNative ?? null,
      episodesTotal,
      info.description ?? null,
      Date.now(),
      animeId,
    );

  if (!info.coverUrl) {
    return;
  }

  const videos = this.db
    .prepare('SELECT video_id FROM imm_videos WHERE anime_id = ?')
    .all(animeId) as Array<{ video_id: number }>;
  if (videos.length === 0) {
    // Nothing to attach the cover to, so skip the download entirely.
    return;
  }

  let coverBlob: Buffer | null = null;
  try {
    const res = await fetch(info.coverUrl);
    if (res.ok) coverBlob = Buffer.from(await res.arrayBuffer());
  } catch {
    // Best effort: a failed download leaves coverBlob null, and the upsert's
    // COALESCE then keeps whatever blob is already stored.
  }

  // One timestamp and one prepared statement for the whole batch, so every row
  // written by this call agrees on fetched/created/updated times.
  const writtenAtMs = Date.now();
  const upsertArt = this.db.prepare(`
    INSERT INTO imm_media_art (
      video_id, anilist_id, cover_url, cover_blob, title_romaji, title_english,
      episodes_total, fetched_at_ms, CREATED_DATE, LAST_UPDATE_DATE
    )
    VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
    ON CONFLICT(video_id) DO UPDATE SET
      anilist_id = excluded.anilist_id,
      cover_url = excluded.cover_url,
      cover_blob = COALESCE(excluded.cover_blob, cover_blob),
      title_romaji = excluded.title_romaji,
      title_english = excluded.title_english,
      episodes_total = excluded.episodes_total,
      fetched_at_ms = excluded.fetched_at_ms,
      LAST_UPDATE_DATE = excluded.LAST_UPDATE_DATE
  `);
  for (const { video_id: videoId } of videos) {
    upsertArt.run(
      videoId,
      info.anilistId,
      info.coverUrl,
      coverBlob,
      titleRomaji,
      titleEnglish,
      episodesTotal,
      writtenAtMs,
      writtenAtMs,
      writtenAtMs,
    );
  }
}
async getEpisodeCardEvents(videoId: number): Promise<EpisodeCardEventRow[]> {
return getEpisodeCardEvents(this.db, videoId);
}
@@ -571,19 +638,7 @@ export class ImmersionTrackerService {
this.sessionState.tokensSeen += metrics.tokens;
this.sessionState.pendingTelemetry = true;
const wordOccurrences = new Map<
string,
{
headword: string;
word: string;
reading: string;
partOfSpeech: string;
pos1: string;
pos2: string;
pos3: string;
occurrenceCount: number;
}
>();
const wordOccurrences = new Map<string, CountedWordOccurrence>();
for (const token of tokens ?? []) {
if (shouldExcludeTokenFromVocabularyPersistence(token)) {
continue;
@@ -617,6 +672,7 @@ export class ImmersionTrackerService {
pos2: token.pos2 ?? '',
pos3: token.pos3 ?? '',
occurrenceCount: 1,
frequencyRank: token.frequencyRank ?? null,
});
}

View File

@@ -14,6 +14,7 @@ import {
import { startSessionRecord } from '../session.js';
import {
cleanupVocabularyStats,
deleteSession,
getAnimeDetail,
getAnimeEpisodes,
getAnimeLibrary,
@@ -295,35 +296,32 @@ test('cleanupVocabularyStats repairs stored POS metadata and removes excluded im
{ headword: '旧', frequency: 1 },
],
);
assert.deepEqual(
repairedRows,
[
{
headword: '',
word: '',
reading: 'きゅう',
part_of_speech: 'noun',
pos1: '名詞',
pos2: '一般',
},
{
headword: '',
word: '',
reading: 'ねこ',
part_of_speech: 'noun',
pos1: '名詞',
pos2: '一般',
},
{
headword: 'る',
word: '知っている',
reading: 'しっている',
part_of_speech: 'verb',
pos1: '動詞',
pos2: '自立',
},
],
);
assert.deepEqual(repairedRows, [
{
headword: '旧',
word: '旧',
reading: 'きゅう',
part_of_speech: 'noun',
pos1: '名詞',
pos2: '一般',
},
{
headword: '猫',
word: '猫',
reading: 'ねこ',
part_of_speech: 'noun',
pos1: '名詞',
pos2: '一般',
},
{
headword: '知る',
word: '知っている',
reading: 'しっている',
part_of_speech: 'verb',
pos1: '動詞',
pos2: '自立',
},
]);
} finally {
db.close();
cleanupDbPath(dbPath);
@@ -708,7 +706,7 @@ test('anime-level queries group by anime_id and preserve episode-level rows', ()
canonicalTitle: 'Frieren',
anilistId: 52_921,
titleRomaji: 'Sousou no Frieren',
titleEnglish: 'Frieren: Beyond Journey\'s End',
titleEnglish: "Frieren: Beyond Journey's End",
titleNative: '葬送のフリーレン',
metadataJson: '{"source":"anilist"}',
});
@@ -1070,3 +1068,151 @@ test('getKanjiOccurrences maps a kanji back to anime, video, and subtitle line c
cleanupDbPath(dbPath);
}
});
// Seeds one session with telemetry, an event, a subtitle line and word/kanji
// occurrences, deletes the session, then checks that none of those rows remain.
test('deleteSession removes the session and all associated session-scoped rows', () => {
  const dbPath = makeDbPath();
  const db = new Database(dbPath);

  // Runs a single-parameter `SELECT COUNT(*) AS total ...` query and returns the count.
  const countRows = (sql: string, key: number): number => {
    const row = db.prepare(sql).get(key) as { total: number };
    return Number(row.total);
  };

  try {
    ensureSchema(db);
    const stmts = createTrackerPreparedStatements(db);
    const videoId = getOrCreateVideoRecord(db, 'local:/tmp/delete-session.mkv', {
      canonicalTitle: 'Delete Session Test',
      sourcePath: '/tmp/delete-session.mkv',
      sourceUrl: null,
      sourceType: SOURCE_TYPE_LOCAL,
    });
    const startedAtMs = 6_000_000;
    const { sessionId } = startSessionRecord(db, videoId, startedAtMs);
    const telemetryAtMs = startedAtMs + 1_000;
    const lineAtMs = startedAtMs + 1_500;

    stmts.telemetryInsertStmt.run(
      sessionId,
      telemetryAtMs,
      5_000,
      4_000,
      3,
      9,
      9,
      1,
      2,
      1,
      0,
      0,
      0,
      0,
      0,
      telemetryAtMs,
      telemetryAtMs,
    );

    const eventInsert = stmts.eventInsertStmt.run(
      sessionId,
      lineAtMs,
      EVENT_SUBTITLE_LINE,
      0,
      0,
      900,
      2,
      0,
      '{"line":"delete me"}',
      lineAtMs,
      lineAtMs,
    );
    const eventId = Number(eventInsert.lastInsertRowid);

    const wordInsert = db
      .prepare(
        `INSERT INTO imm_words (
headword, word, reading, part_of_speech, pos1, pos2, pos3, first_seen, last_seen, frequency
) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)`,
      )
      .run('削除', '削除', 'さくじょ', 'noun', '名詞', '一般', '', startedAtMs, startedAtMs, 1);
    const kanjiInsert = db
      .prepare(
        `INSERT INTO imm_kanji (
kanji, first_seen, last_seen, frequency
) VALUES (?, ?, ?, ?)`,
      )
      .run('削', startedAtMs, startedAtMs, 1);

    const lineInsert = stmts.subtitleLineInsertStmt.run(
      sessionId,
      eventId,
      videoId,
      null,
      0,
      0,
      900,
      'delete me',
      lineAtMs,
      lineAtMs,
    );
    const lineId = Number(lineInsert.lastInsertRowid);

    db.prepare(
      `INSERT INTO imm_word_line_occurrences (line_id, word_id, occurrence_count)
VALUES (?, ?, ?)`,
    ).run(lineId, Number(wordInsert.lastInsertRowid), 1);
    db.prepare(
      `INSERT INTO imm_kanji_line_occurrences (line_id, kanji_id, occurrence_count)
VALUES (?, ?, ?)`,
    ).run(lineId, Number(kanjiInsert.lastInsertRowid), 1);

    deleteSession(db, sessionId);

    // Collect every count first, then assert, so all queries run before any failure.
    const sessionCount = countRows(
      'SELECT COUNT(*) AS total FROM imm_sessions WHERE session_id = ?',
      sessionId,
    );
    const telemetryCount = countRows(
      'SELECT COUNT(*) AS total FROM imm_session_telemetry WHERE session_id = ?',
      sessionId,
    );
    const eventCount = countRows(
      'SELECT COUNT(*) AS total FROM imm_session_events WHERE session_id = ?',
      sessionId,
    );
    const subtitleLineCount = countRows(
      'SELECT COUNT(*) AS total FROM imm_subtitle_lines WHERE session_id = ?',
      sessionId,
    );
    const wordOccurrenceCount = countRows(
      'SELECT COUNT(*) AS total FROM imm_word_line_occurrences WHERE line_id = ?',
      lineId,
    );
    const kanjiOccurrenceCount = countRows(
      'SELECT COUNT(*) AS total FROM imm_kanji_line_occurrences WHERE line_id = ?',
      lineId,
    );

    assert.equal(sessionCount, 0);
    assert.equal(telemetryCount, 0);
    assert.equal(eventCount, 0);
    assert.equal(subtitleLineCount, 0);
    assert.equal(wordOccurrenceCount, 0);
    assert.equal(kanjiOccurrenceCount, 0);
  } finally {
    db.close();
    cleanupDbPath(dbPath);
  }
});

View File

@@ -223,7 +223,8 @@ export function getVocabularyStats(
const stmt = db.prepare(`
SELECT id AS wordId, headword, word, reading,
part_of_speech AS partOfSpeech, pos1, pos2, pos3,
frequency, first_seen AS firstSeen, last_seen AS lastSeen
frequency, frequency_rank AS frequencyRank,
first_seen AS firstSeen, last_seen AS lastSeen
FROM imm_words ${whereClause} ORDER BY frequency DESC LIMIT ?
`);
const params = hasExclude ? [...excludePos, limit] : [limit];
@@ -632,6 +633,7 @@ export function getAnimeDetail(db: DatabaseSync, animeId: number): AnimeDetailRo
a.title_romaji AS titleRomaji,
a.title_english AS titleEnglish,
a.title_native AS titleNative,
a.description AS description,
COUNT(DISTINCT s.session_id) AS totalSessions,
COALESCE(SUM(sm.max_active_ms), 0) AS totalActiveMs,
COALESCE(SUM(sm.max_cards), 0) AS totalCards,
@@ -1165,3 +1167,22 @@ export function isVideoWatched(db: DatabaseSync, videoId: number): boolean {
} | null;
return row?.watched === 1;
}
/**
 * Deletes a session together with its subtitle lines, telemetry samples and
 * session events.
 *
 * The four DELETEs run inside a savepoint so they apply atomically: if any of
 * them fails, the rows already removed are restored and the error is rethrown.
 * A savepoint is used instead of BEGIN/COMMIT so the function also works when
 * the caller already has a transaction open.
 *
 * NOTE(review): the word/kanji line-occurrence tables are not deleted here;
 * presumably they are removed through a cascade on `line_id` — confirm against
 * the schema.
 *
 * @param db - Open tracker database.
 * @param sessionId - `session_id` of the session to remove.
 */
export function deleteSession(db: DatabaseSync, sessionId: number): void {
  // Same statement order as before: dependent rows first, the session row last.
  const deleteStatements = [
    'DELETE FROM imm_subtitle_lines WHERE session_id = ?',
    'DELETE FROM imm_session_telemetry WHERE session_id = ?',
    'DELETE FROM imm_session_events WHERE session_id = ?',
    'DELETE FROM imm_sessions WHERE session_id = ?',
  ];

  db.exec('SAVEPOINT imm_delete_session');
  try {
    for (const sql of deleteStatements) {
      db.prepare(sql).run(sessionId);
    }
    db.exec('RELEASE SAVEPOINT imm_delete_session');
  } catch (error) {
    try {
      db.exec('ROLLBACK TO SAVEPOINT imm_delete_session');
      db.exec('RELEASE SAVEPOINT imm_delete_session');
    } catch {
      // The rollback itself failed; the original error below is the actionable one.
    }
    throw error;
  }
}
/**
 * Deletes a video and everything recorded for it: each of its sessions (via
 * `deleteSession`), then any remaining subtitle lines, daily and monthly
 * rollups, media art, and finally the video row itself.
 *
 * The whole operation runs inside a savepoint so it applies atomically: if any
 * step fails, all rows removed so far are restored and the error is rethrown.
 * A savepoint is used instead of BEGIN/COMMIT so the function also works when
 * the caller already has a transaction open.
 *
 * @param db - Open tracker database.
 * @param videoId - `video_id` of the video to remove.
 */
export function deleteVideo(db: DatabaseSync, videoId: number): void {
  // Same statement order as before: dependent rows first, the video row last.
  const deleteStatements = [
    'DELETE FROM imm_subtitle_lines WHERE video_id = ?',
    'DELETE FROM imm_daily_rollups WHERE video_id = ?',
    'DELETE FROM imm_monthly_rollups WHERE video_id = ?',
    'DELETE FROM imm_media_art WHERE video_id = ?',
    'DELETE FROM imm_videos WHERE video_id = ?',
  ];

  db.exec('SAVEPOINT imm_delete_video');
  try {
    const sessions = db
      .prepare('SELECT session_id FROM imm_sessions WHERE video_id = ?')
      .all(videoId) as Array<{ session_id: number }>;
    for (const { session_id: sessionId } of sessions) {
      deleteSession(db, sessionId);
    }
    for (const sql of deleteStatements) {
      db.prepare(sql).run(videoId);
    }
    db.exec('RELEASE SAVEPOINT imm_delete_video');
  } catch (error) {
    try {
      db.exec('ROLLBACK TO SAVEPOINT imm_delete_video');
      db.exec('RELEASE SAVEPOINT imm_delete_video');
    } catch {
      // The rollback itself failed; the original error below is the actionable one.
    }
    throw error;
  }
}

View File

@@ -345,6 +345,7 @@ export function ensureSchema(db: DatabaseSync): void {
title_english TEXT,
title_native TEXT,
episodes_total INTEGER,
description TEXT,
metadata_json TEXT,
CREATED_DATE INTEGER,
LAST_UPDATE_DATE INTEGER
@@ -479,6 +480,7 @@ export function ensureSchema(db: DatabaseSync): void {
first_seen REAL,
last_seen REAL,
frequency INTEGER,
frequency_rank INTEGER,
UNIQUE(headword, word, reading)
);
`);
@@ -672,6 +674,11 @@ export function ensureSchema(db: DatabaseSync): void {
`);
}
if (currentVersion?.schema_version && currentVersion.schema_version < 9) {
addColumnIfMissing(db, 'imm_anime', 'description', 'TEXT');
addColumnIfMissing(db, 'imm_words', 'frequency_rank', 'INTEGER');
}
db.exec(`
CREATE INDEX IF NOT EXISTS idx_anime_normalized_title
ON imm_anime(normalized_title_key)
@@ -776,9 +783,9 @@ export function createTrackerPreparedStatements(db: DatabaseSync): TrackerPrepar
`),
wordUpsertStmt: db.prepare(`
INSERT INTO imm_words (
headword, word, reading, part_of_speech, pos1, pos2, pos3, first_seen, last_seen, frequency
headword, word, reading, part_of_speech, pos1, pos2, pos3, first_seen, last_seen, frequency, frequency_rank
) VALUES (
?, ?, ?, ?, ?, ?, ?, ?, ?, 1
?, ?, ?, ?, ?, ?, ?, ?, ?, 1, ?
)
ON CONFLICT(headword, word, reading) DO UPDATE SET
frequency = COALESCE(frequency, 0) + 1,
@@ -792,7 +799,12 @@ export function createTrackerPreparedStatements(db: DatabaseSync): TrackerPrepar
pos2 = COALESCE(NULLIF(imm_words.pos2, ''), excluded.pos2),
pos3 = COALESCE(NULLIF(imm_words.pos3, ''), excluded.pos3),
first_seen = MIN(COALESCE(first_seen, excluded.first_seen), excluded.first_seen),
last_seen = MAX(COALESCE(last_seen, excluded.last_seen), excluded.last_seen)
last_seen = MAX(COALESCE(last_seen, excluded.last_seen), excluded.last_seen),
frequency_rank = CASE
WHEN excluded.frequency_rank IS NOT NULL AND (imm_words.frequency_rank IS NULL OR excluded.frequency_rank < imm_words.frequency_rank)
THEN excluded.frequency_rank
ELSE imm_words.frequency_rank
END
`),
kanjiUpsertStmt: db.prepare(`
INSERT INTO imm_kanji (
@@ -863,6 +875,7 @@ function incrementWordAggregate(
occurrence.pos3,
firstSeen,
lastSeen,
occurrence.frequencyRank ?? null,
);
}
const row = stmts.wordIdSelectStmt.get(
@@ -926,6 +939,7 @@ export function executeQueuedWrite(write: QueuedWrite, stmts: TrackerPreparedSta
write.pos3,
write.firstSeen,
write.lastSeen,
write.frequencyRank ?? null,
);
return;
}

View File

@@ -1,4 +1,4 @@
export const SCHEMA_VERSION = 7;
export const SCHEMA_VERSION = 9;
export const DEFAULT_QUEUE_CAP = 1_000;
export const DEFAULT_BATCH_SIZE = 25;
export const DEFAULT_FLUSH_INTERVAL_MS = 500;
@@ -128,6 +128,7 @@ interface QueuedWordWrite {
pos3: string;
firstSeen: number;
lastSeen: number;
frequencyRank: number | null;
}
interface QueuedKanjiWrite {
@@ -146,6 +147,7 @@ export interface CountedWordOccurrence {
pos2: string;
pos3: string;
occurrenceCount: number;
frequencyRank: number | null;
}
export interface CountedKanjiOccurrence {
@@ -240,6 +242,7 @@ export interface VocabularyStatsRow {
pos2: string | null;
pos3: string | null;
frequency: number;
frequencyRank: number | null;
firstSeen: number;
lastSeen: number;
}
@@ -395,6 +398,7 @@ export interface AnimeDetailRow {
titleRomaji: string | null;
titleEnglish: string | null;
titleNative: string | null;
description: string | null;
totalSessions: number;
totalActiveMs: number;
totalCards: number;

View File

@@ -18,6 +18,7 @@ export interface StatsServerConfig {
port: number;
staticDir: string; // Path to stats/dist/
tracker: ImmersionTrackerService;
knownWordCachePath?: string;
}
const STATS_STATIC_CONTENT_TYPES: Record<string, string> = {
@@ -79,7 +80,7 @@ function createStatsStaticResponse(staticDir: string, requestPath: string): Resp
export function createStatsApp(
tracker: ImmersionTrackerService,
options?: { staticDir?: string },
options?: { staticDir?: string; knownWordCachePath?: string },
) {
const app = new Hono();
@@ -259,6 +260,70 @@ export function createStatsApp(
return c.json({ ok: true });
});
// DELETE /api/stats/sessions/:sessionId — removes one tracked session.
// Responds 400 when the path id parses to a value <= 0.
app.delete('/api/stats/sessions/:sessionId', async (c) => {
  const rawSessionId = c.req.param('sessionId');
  const sessionId = parseIntQuery(rawSessionId, 0);
  if (sessionId <= 0) {
    return c.body(null, 400);
  }

  await tracker.deleteSession(sessionId);
  return c.json({ ok: true });
});
// DELETE /api/stats/media/:videoId — removes one tracked video.
// Responds 400 when the path id parses to a value <= 0.
app.delete('/api/stats/media/:videoId', async (c) => {
  const rawVideoId = c.req.param('videoId');
  const videoId = parseIntQuery(rawVideoId, 0);
  if (videoId <= 0) {
    return c.body(null, 400);
  }

  await tracker.deleteVideo(videoId);
  return c.json({ ok: true });
});
// GET /api/stats/anilist/search?q=... — proxies an anime title search to the
// AniList GraphQL endpoint and returns the matching media entries (up to 10 per
// the `perPage` argument). A blank query short-circuits to an empty list
// without contacting AniList.
app.get('/api/stats/anilist/search', async (c) => {
const query = (c.req.query('q') ?? '').trim();
if (!query) return c.json([]);
try {
const res = await fetch('https://graphql.anilist.co', {
method: 'POST',
headers: { 'Content-Type': 'application/json' },
body: JSON.stringify({
query: `query ($search: String!) {
Page(perPage: 10) {
media(search: $search, type: ANIME) {
id
episodes
season
seasonYear
description(asHtml: false)
coverImage { large medium }
title { romaji english native }
}
}
}`,
variables: { search: query },
}),
});
// The HTTP status is not inspected; the body is parsed as JSON either way.
const json = await res.json() as { data?: { Page?: { media?: unknown[] } } };
// `data.Page.media` is forwarded as-is; a response without it yields an empty list.
return c.json(json.data?.Page?.media ?? []);
} catch {
// Network and JSON-parse failures are reported as "no results" rather than an error status.
return c.json([]);
}
});
// GET /api/stats/known-words — returns the word list from the known-word cache file.
// Responds with an empty list when no cache path is configured, the file does not
// exist, it cannot be read or parsed, or it is not a version-1 cache with a `words` array.
app.get('/api/stats/known-words', (c) => {
  const cachePath = options?.knownWordCachePath;
  let words: string[] = [];

  if (cachePath && existsSync(cachePath)) {
    try {
      const cacheText = readFileSync(cachePath, 'utf-8');
      const cache = JSON.parse(cacheText) as { version?: number; words?: string[] };
      if (cache.version === 1 && Array.isArray(cache.words)) {
        words = cache.words;
      }
    } catch {
      /* ignore */
    }
  }

  return c.json(words);
});
// PATCH /api/stats/anime/:animeId/anilist — re-links an anime to a different AniList entry.
//
// The JSON body is untrusted, so it is validated field by field instead of being
// forwarded to the tracker as-is: `anilistId` must be a positive integer (a numeric
// string is accepted), and every optional field is passed on only when it has the
// expected primitive type, otherwise as null. Responds 400 for a bad path id, a
// missing/non-object body, or an invalid `anilistId`.
app.patch('/api/stats/anime/:animeId/anilist', async (c) => {
  const animeId = parseIntQuery(c.req.param('animeId'), 0);
  if (animeId <= 0) return c.body(null, 400);

  let body: unknown = null;
  try {
    body = await c.req.json();
  } catch {
    // A missing or malformed JSON body is handled by the shape check below.
  }
  if (typeof body !== 'object' || body === null || Array.isArray(body)) {
    return c.body(null, 400);
  }
  const payload = body as Record<string, unknown>;

  const rawAnilistId = payload.anilistId;
  const anilistId = typeof rawAnilistId === 'string' ? Number(rawAnilistId) : rawAnilistId;
  if (typeof anilistId !== 'number' || !Number.isInteger(anilistId) || anilistId <= 0) {
    return c.body(null, 400);
  }

  const stringOrNull = (value: unknown): string | null =>
    typeof value === 'string' ? value : null;
  const rawEpisodesTotal = payload.episodesTotal;
  const episodesTotal =
    typeof rawEpisodesTotal === 'number' && Number.isInteger(rawEpisodesTotal)
      ? rawEpisodesTotal
      : null;

  await tracker.reassignAnimeAnilist(animeId, {
    anilistId,
    titleRomaji: stringOrNull(payload.titleRomaji),
    titleEnglish: stringOrNull(payload.titleEnglish),
    titleNative: stringOrNull(payload.titleNative),
    episodesTotal,
    description: stringOrNull(payload.description),
    coverUrl: stringOrNull(payload.coverUrl),
  });
  return c.json({ ok: true });
});
app.get('/api/stats/anime/:animeId/cover', async (c) => {
const animeId = parseIntQuery(c.req.param('animeId'), 0);
if (animeId <= 0) return c.body(null, 404);
@@ -363,7 +428,7 @@ export function createStatsApp(
}
export function startStatsServer(config: StatsServerConfig): { close: () => void } {
const app = createStatsApp(config.tracker, { staticDir: config.staticDir });
const app = createStatsApp(config.tracker, { staticDir: config.staticDir, knownWordCachePath: config.knownWordCachePath });
const server = serve({
fetch: app.fetch,

View File

@@ -55,10 +55,13 @@ export function buildStatsWindowOptions(options: {
};
}
export function buildStatsWindowLoadFileOptions(): { query: Record<string, string> } {
/**
 * Builds the `loadFile` options for the stats window.
 *
 * The query always carries `overlay: '1'`; `apiBase` is added only when a
 * non-empty `apiBaseUrl` is supplied.
 *
 * @param apiBaseUrl - Optional base URL of the stats API to pass to the page.
 * @returns An options object whose `query` holds the string parameters.
 */
export function buildStatsWindowLoadFileOptions(apiBaseUrl?: string): {
  query: Record<string, string>;
} {
  const query: Record<string, string> = { overlay: '1' };
  if (apiBaseUrl) {
    query.apiBase = apiBaseUrl;
  }
  return { query };
}

View File

@@ -140,3 +140,12 @@ test('buildStatsWindowLoadFileOptions enables overlay rendering mode', () => {
},
});
});
// A supplied API base URL must appear as the `apiBase` query parameter next to `overlay`.
test('buildStatsWindowLoadFileOptions includes provided stats API base URL', () => {
  const loadFileOptions = buildStatsWindowLoadFileOptions('http://127.0.0.1:6123');
  const expected = {
    query: {
      overlay: '1',
      apiBase: 'http://127.0.0.1:6123',
    },
  };

  assert.deepEqual(loadFileOptions, expected);
});

View File

@@ -16,6 +16,8 @@ export interface StatsWindowOptions {
staticDir: string;
/** Absolute path to the compiled preload-stats.js */
preloadPath: string;
/** Resolve the active stats API base URL */
getApiBaseUrl?: () => string;
/** Resolve the active stats toggle key from config */
getToggleKey: () => string;
/** Resolve the tracked overlay/mpv bounds */
@@ -46,7 +48,7 @@ export function toggleStatsOverlay(options: StatsWindowOptions): void {
);
const indexPath = path.join(options.staticDir, 'index.html');
statsWindow.loadFile(indexPath, buildStatsWindowLoadFileOptions());
statsWindow.loadFile(indexPath, buildStatsWindowLoadFileOptions(options.getApiBaseUrl?.()));
statsWindow.on('closed', () => {
statsWindow = null;

View File

@@ -706,6 +706,240 @@ test('tokenizeSubtitle prefers Yomitan frequency from highest-priority dictionar
assert.equal(result.tokens?.[0]?.frequencyRank, 100);
});
// The mocked Yomitan options mark CC100 as 'occurrence-based', so its raw count
// (118121) for the headword 潜む must not be used as a rank: the single token is
// expected to come back without a frequencyRank.
test('tokenizeSubtitle ignores occurrence-based Yomitan frequencies for inflected terms', async () => {
const result = await tokenizeSubtitle(
'潜み',
makeDeps({
getFrequencyDictionaryEnabled: () => true,
getYomitanExt: () => ({ id: 'dummy-ext' }) as any,
getYomitanParserWindow: () =>
({
isDestroyed: () => false,
webContents: {
// The mock dispatches on substrings of the injected script.
executeJavaScript: async (script: string) => {
if (script.includes('getTermFrequencies')) {
return [
{
term: '潜む',
reading: 'ひそ',
dictionary: 'CC100',
frequency: 118121,
displayValue: null,
displayValueParsed: false,
},
];
}
if (script.includes('optionsGetFull')) {
return {
profileCurrent: 0,
profileIndex: 0,
scanLength: 40,
dictionaries: ['CC100'],
dictionaryPriorityByName: { CC100: 0 },
dictionaryFrequencyModeByName: { CC100: 'occurrence-based' },
profiles: [
{
options: {
scanning: { length: 40 },
dictionaries: [{ name: 'CC100', enabled: true, id: 0 }],
},
},
],
};
}
// Any other script receives the parsed token list.
return [
{
surface: '潜み',
reading: 'ひそ',
headword: '潜む',
startPos: 0,
endPos: 2,
},
];
},
},
}) as unknown as Electron.BrowserWindow,
}),
);
assert.equal(result.tokens?.length, 1);
assert.equal(result.tokens?.[0]?.frequencyRank, undefined);
});
// Same fixture shape as the occurrence-based case, but CC100 is declared
// 'rank-based' and the entry has no displayValue and `hasReading: false`: the
// raw frequency (118121) is expected to be used as the token's rank.
test('tokenizeSubtitle falls back to raw term-only Yomitan rank when no scan-derived rank exists', async () => {
const result = await tokenizeSubtitle(
'潜み',
makeDeps({
getFrequencyDictionaryEnabled: () => true,
getYomitanExt: () => ({ id: 'dummy-ext' }) as any,
getYomitanParserWindow: () =>
({
isDestroyed: () => false,
webContents: {
// The mock dispatches on substrings of the injected script.
executeJavaScript: async (script: string) => {
if (script.includes('getTermFrequencies')) {
return [
{
term: '潜む',
reading: 'ひそ',
hasReading: false,
dictionary: 'CC100',
frequency: 118121,
displayValue: null,
displayValueParsed: false,
},
];
}
if (script.includes('optionsGetFull')) {
return {
profileCurrent: 0,
profileIndex: 0,
scanLength: 40,
dictionaries: ['CC100'],
dictionaryPriorityByName: { CC100: 0 },
dictionaryFrequencyModeByName: { CC100: 'rank-based' },
profiles: [
{
options: {
scanning: { length: 40 },
dictionaries: [{ name: 'CC100', enabled: true, id: 0 }],
},
},
],
};
}
// Any other script receives the parsed token list (no frequencyRank on the token).
return [
{
surface: '潜み',
reading: 'ひそ',
headword: '潜む',
startPos: 0,
endPos: 2,
},
];
},
},
}) as unknown as Electron.BrowserWindow,
}),
);
assert.equal(result.tokens?.length, 1);
assert.equal(result.tokens?.[0]?.frequencyRank, 118121);
});
// The rank-based CC100 entry carries displayValue '118,121' with
// `displayValueParsed: false`; the expected rank is the leading number of the
// display string (118), not the raw frequency (118121).
test('tokenizeSubtitle keeps parsed display rank for term-only inflected headword fallback', async () => {
const result = await tokenizeSubtitle(
'潜み',
makeDeps({
getFrequencyDictionaryEnabled: () => true,
getYomitanExt: () => ({ id: 'dummy-ext' }) as any,
getYomitanParserWindow: () =>
({
isDestroyed: () => false,
webContents: {
// The mock dispatches on substrings of the injected script.
executeJavaScript: async (script: string) => {
if (script.includes('getTermFrequencies')) {
return [
{
term: '潜む',
reading: 'ひそ',
hasReading: false,
dictionary: 'CC100',
frequency: 118121,
displayValue: '118,121',
displayValueParsed: false,
},
];
}
if (script.includes('optionsGetFull')) {
return {
profileCurrent: 0,
profileIndex: 0,
scanLength: 40,
dictionaries: ['CC100'],
dictionaryPriorityByName: { CC100: 0 },
dictionaryFrequencyModeByName: { CC100: 'rank-based' },
profiles: [
{
options: {
scanning: { length: 40 },
dictionaries: [{ name: 'CC100', enabled: true, id: 0 }],
},
},
],
};
}
// Any other script receives the parsed token list.
return [
{
surface: '潜み',
reading: 'ひそ',
headword: '潜む',
startPos: 0,
endPos: 2,
},
];
},
},
}) as unknown as Electron.BrowserWindow,
}),
);
assert.equal(result.tokens?.length, 1);
assert.equal(result.tokens?.[0]?.frequencyRank, 118);
});
// The parsed token already carries `frequencyRank: 4073`, while the term
// frequency response offers 118121 from a dictionary with `dictionaryPriority: 2`.
// The token's existing rank is expected to be kept.
test('tokenizeSubtitle preserves scan-derived rank over lower-priority Yomitan fallback', async () => {
const result = await tokenizeSubtitle(
'潜み',
makeDeps({
getFrequencyDictionaryEnabled: () => true,
getYomitanExt: () => ({ id: 'dummy-ext' }) as any,
getYomitanParserWindow: () =>
({
isDestroyed: () => false,
webContents: {
// The mock dispatches on substrings of the injected script.
executeJavaScript: async (script: string) => {
if (script.includes('getTermFrequencies')) {
return [
{
term: '潜む',
reading: 'ひそ',
hasReading: false,
dictionary: 'CC100',
dictionaryPriority: 2,
frequency: 118121,
displayValue: null,
displayValueParsed: false,
},
];
}
// Any other script receives the parsed token list, rank included.
return [
{
surface: '潜み',
reading: 'ひそむ',
headword: '潜む',
startPos: 0,
endPos: 2,
frequencyRank: 4073,
},
];
},
},
}) as unknown as Electron.BrowserWindow,
}),
);
assert.equal(result.tokens?.length, 1);
assert.equal(result.tokens?.[0]?.frequencyRank, 4073);
});
test('tokenizeSubtitle uses only selected Yomitan headword for frequency lookup', async () => {
const result = await tokenizeSubtitle(
'猫です',
@@ -836,6 +1070,69 @@ test('tokenizeSubtitle prefers exact headword frequency over surface/reading whe
assert.equal(result.tokens?.[0]?.frequencyRank, 8);
});
// The mock answers a frequency request only when the script contains the surface
// pair ("陰に", "いんに"); every other frequency request returns nothing. The token
// keeps headword 陰 but is expected to take rank 5702 from the surface form, and
// both the headword and the surface pairs must have appeared in frequency scripts.
test('tokenizeSubtitle falls back to exact surface frequency when merged headword lookup misses', async () => {
// Every frequency-lookup script is recorded so the requested terms can be checked below.
const frequencyScripts: string[] = [];
const result = await tokenizeSubtitle(
'陰に',
makeDeps({
getFrequencyDictionaryEnabled: () => true,
getYomitanExt: () => ({ id: 'dummy-ext' }) as any,
getYomitanParserWindow: () =>
({
isDestroyed: () => false,
webContents: {
executeJavaScript: async (script: string) => {
if (script.includes('getTermFrequencies')) {
frequencyScripts.push(script);
return script.includes('"term":"陰に","reading":"いんに"')
? [
{
term: '陰に',
reading: 'いんに',
dictionary: 'freq-dict',
frequency: 5702,
displayValue: '5702',
displayValueParsed: true,
},
]
: [];
}
// Any other script receives a scanning-parser result with one merged segment.
return [
{
source: 'scanning-parser',
index: 0,
content: [
[
{
text: '陰に',
reading: 'いんに',
headwords: [[{ term: '陰' }]],
},
],
],
},
];
},
},
}) as unknown as Electron.BrowserWindow,
}),
);
assert.equal(result.tokens?.length, 1);
assert.equal(result.tokens?.[0]?.surface, '陰に');
assert.equal(result.tokens?.[0]?.headword, '陰');
assert.equal(result.tokens?.[0]?.frequencyRank, 5702);
assert.equal(
frequencyScripts.some((script) => script.includes('"term":"陰","reading":"いんに"')),
true,
);
assert.equal(
frequencyScripts.some((script) => script.includes('"term":"陰に","reading":"いんに"')),
true,
);
});
test('tokenizeSubtitle keeps no frequency when only reading matches and headword misses', async () => {
const result = await tokenizeSubtitle(
'猫です',
@@ -2287,6 +2584,131 @@ test('tokenizeSubtitle keeps correct MeCab pos1 enrichment when Yomitan offsets
assert.equal(targets[0]?.surface, '仮面');
});
// End-to-end check through createTokenizerDepsRuntime: the subtitle contains a newline, so the
// scan tokens skip offset 1 (X at 0-1, 陰に at 2-4). MeCab splits 陰に into 陰 + に; the test
// asserts the merged token gets both POS tags joined with '|' and still keeps its frequency.
test('tokenizeSubtitle preserves merged token frequency when MeCab positions cross a newline gap', async () => {
const parserWindow = {
isDestroyed: () => false,
webContents: {
executeJavaScript: async (script: string) => {
if (script.includes('getTermFrequencies')) {
// Frequency data exists only for the 陰に / いんに pair.
return script.includes('"term":"陰に","reading":"いんに"')
? [
{
term: '陰に',
reading: 'いんに',
dictionary: 'JPDBv2㋕',
frequency: 5702,
displayValue: '5702',
displayValueParsed: false,
},
]
: [];
}
// Scan tokens; note the gap between endPos 1 and startPos 2 where the newline sits.
return [
{
surface: 'X',
reading: 'えっくす',
headword: 'X',
startPos: 0,
endPos: 1,
},
{
surface: '陰に',
reading: 'いんに',
headword: '陰に',
startPos: 2,
endPos: 4,
},
{
surface: '潜み',
reading: 'ひそ',
headword: '潜む',
startPos: 4,
endPos: 6,
},
];
},
},
} as unknown as Electron.BrowserWindow;
const deps = createTokenizerDepsRuntime({
getYomitanExt: () => ({ id: 'dummy-ext' }) as any,
getYomitanParserWindow: () => parserWindow,
setYomitanParserWindow: () => {},
getYomitanParserReadyPromise: () => null,
setYomitanParserReadyPromise: () => {},
getYomitanParserInitPromise: () => null,
setYomitanParserInitPromise: () => {},
isKnownWord: () => false,
getKnownWordMatchMode: () => 'headword',
getJlptLevel: () => null,
getFrequencyDictionaryEnabled: () => true,
// MeCab stub: four morphemes, with 陰 and に as separate noun and particle entries.
getMecabTokenizer: () => ({
tokenize: async () => [
{
word: 'X',
partOfSpeech: PartOfSpeech.noun,
pos1: '名詞',
pos2: '一般',
pos3: '',
pos4: '',
inflectionType: '',
inflectionForm: '',
headword: 'X',
katakanaReading: 'エックス',
pronunciation: 'エックス',
},
{
word: '陰',
partOfSpeech: PartOfSpeech.noun,
pos1: '名詞',
pos2: '一般',
pos3: '',
pos4: '',
inflectionType: '',
inflectionForm: '',
headword: '陰',
katakanaReading: 'カゲ',
pronunciation: 'カゲ',
},
{
word: 'に',
partOfSpeech: PartOfSpeech.particle,
pos1: '助詞',
pos2: '格助詞',
pos3: '一般',
pos4: '',
inflectionType: '',
inflectionForm: '',
headword: 'に',
katakanaReading: 'ニ',
pronunciation: 'ニ',
},
{
word: '潜み',
partOfSpeech: PartOfSpeech.verb,
pos1: '動詞',
pos2: '自立',
pos3: '',
pos4: '',
inflectionType: '五段・マ行',
inflectionForm: '連用形',
headword: '潜む',
katakanaReading: 'ヒソミ',
pronunciation: 'ヒソミ',
},
],
}),
});
const result = await tokenizeSubtitle('X\n陰に潜み', deps);
// Token index 1 is the merged 陰に token that follows the newline.
assert.equal(result.tokens?.[1]?.surface, '陰に');
assert.equal(result.tokens?.[1]?.pos1, '名詞|助詞');
assert.equal(result.tokens?.[1]?.pos2, '一般|格助詞');
assert.equal(result.tokens?.[1]?.frequencyRank, 5702);
});
test('tokenizeSubtitle does not color 1-2 word sentences by default', async () => {
const result = await tokenizeSubtitle(
'猫です',

View File

@@ -23,6 +23,7 @@ import {
requestYomitanScanTokens,
requestYomitanTermFrequencies,
} from './tokenizer/yomitan-parser-runtime';
import type { YomitanTermFrequency } from './tokenizer/yomitan-parser-runtime';
const logger = createLogger('main:tokenizer');
@@ -225,7 +226,13 @@ export function createTokenizerDepsRuntime(
return null;
}
return mergeTokens(rawTokens, options.isKnownWord, options.getKnownWordMatchMode(), false);
return mergeTokens(
rawTokens,
options.isKnownWord,
options.getKnownWordMatchMode(),
false,
text,
);
},
enrichTokensWithMecab: async (tokens, mecabTokens) =>
enrichTokensWithMecabAsync(tokens, mecabTokens),
@@ -336,56 +343,154 @@ function resolveFrequencyLookupText(
return token.surface;
}
/**
 * Lists the texts to try against Yomitan frequency data for one token, in lookup order.
 *
 * The first element is always the trimmed result of `resolveFrequencyLookupText`. In
 * `'headword'` mode the trimmed surface is appended as a second candidate, but only when the
 * token has a non-empty headword and a non-empty surface that differs from both the headword
 * and the primary text.
 *
 * @param token - Token whose headword/surface supply the candidates.
 * @param matchMode - Frequency match mode; only `'headword'` adds the surface fallback.
 * @returns Zero, one, or two candidate texts; empty when the primary text is blank.
 */
function resolveYomitanFrequencyLookupTexts(
  token: MergedToken,
  matchMode: FrequencyDictionaryMatchMode,
): string[] {
  const primary = resolveFrequencyLookupText(token, matchMode).trim();
  if (primary.length === 0) {
    return [];
  }

  if (matchMode === 'headword') {
    const headword = token.headword.trim();
    const surface = token.surface.trim();
    const surfaceIsDistinctCandidate =
      headword.length > 0 && surface.length > 0 && surface !== headword && surface !== primary;
    if (surfaceIsDistinctCandidate) {
      return [primary, surface];
    }
  }

  return [primary];
}
/**
 * Builds the `{ term, reading }` pairs that are sent to Yomitan's term-frequency lookup.
 *
 * Each token contributes one pair per candidate text returned by
 * `resolveYomitanFrequencyLookupTexts`; a token with no candidate text contributes nothing.
 * All pairs of a token share the token's trimmed reading, or `null` when the reading is
 * missing or blank.
 *
 * @param tokens - Tokens to look up.
 * @param matchMode - Frequency match mode forwarded to the candidate resolver.
 * @returns Pairs in token order, candidates in resolver order.
 */
function buildYomitanFrequencyTermReadingList(
  tokens: MergedToken[],
  matchMode: FrequencyDictionaryMatchMode,
): Array<{ term: string; reading: string | null }> {
  const termReadingList: Array<{ term: string; reading: string | null }> = [];
  for (const token of tokens) {
    const trimmedReading = token.reading?.trim() ?? '';
    const reading = trimmedReading.length > 0 ? trimmedReading : null;

    // The resolver already yields the primary lookup text first, so the primary term must not
    // be pushed separately (that produced a duplicate pair per token).
    for (const term of resolveYomitanFrequencyLookupTexts(token, matchMode)) {
      termReadingList.push({ term, reading });
    }
  }
  return termReadingList;
}
/**
 * Builds the map key used to index a frequency entry by its exact (term, reading) pair.
 *
 * The two parts are joined with a NUL character (`\u0000`); a `null` reading is encoded as
 * the empty string.
 *
 * @param term - Term text, used as given.
 * @param reading - Reading text, or `null` when the entry has no reading.
 * @returns The composite key.
 */
function makeYomitanFrequencyPairKey(term: string, reading: string | null): string {
  return `${term}\u0000${reading ?? ''}`;
}
/**
 * A Yomitan frequency entry after normalization for indexing: same fields as
 * `YomitanTermFrequency`, with `reading` and `frequency` re-declared below.
 */
interface NormalizedYomitanTermFrequency extends YomitanTermFrequency {
// Trimmed reading, or null when the source reading was missing or blank.
reading: string | null;
// Rank produced by normalizePositiveFrequencyRank (entries it rejects are never stored).
frequency: number;
}
/**
 * Lookup index over normalized Yomitan frequency entries. Every entry is stored in both maps.
 */
interface YomitanFrequencyIndex {
// Entries keyed by the composite (term, reading) key from makeYomitanFrequencyPairKey.
byPair: Map<string, NormalizedYomitanTermFrequency[]>;
// Entries keyed by term alone; consulted when no exact pair entry exists.
byTerm: Map<string, NormalizedYomitanTermFrequency[]>;
}
/**
 * Adds `entry` to the list stored under `key`, creating the list on first use.
 *
 * @param map - Multi-map being filled; mutated in place.
 * @param key - Bucket key.
 * @param entry - Entry appended to the end of the bucket.
 */
function appendYomitanFrequencyEntry(
  map: Map<string, NormalizedYomitanTermFrequency[]>,
  key: string,
  entry: NormalizedYomitanTermFrequency,
): void {
  const bucket = map.get(key);
  if (bucket === undefined) {
    map.set(key, [entry]);
  } else {
    bucket.push(entry);
  }
}
/**
 * Indexes Yomitan frequency entries by exact (term, reading) pair and by term alone.
 *
 * Entries whose trimmed term is empty, or whose frequency is rejected by
 * `normalizePositiveFrequencyRank`, are skipped. Each kept entry is stored with its trimmed
 * term, its trimmed reading (or `null` when blank/missing) and the normalized rank, and is
 * appended to both maps in input order.
 *
 * @param frequencies - Entries returned by the Yomitan frequency request.
 * @returns The populated index; both maps are empty when nothing qualifies.
 */
function buildYomitanFrequencyIndex(
  frequencies: ReadonlyArray<YomitanTermFrequency>,
): YomitanFrequencyIndex {
  const byPair = new Map<string, NormalizedYomitanTermFrequency[]>();
  const byTerm = new Map<string, NormalizedYomitanTermFrequency[]>();

  for (const frequency of frequencies) {
    const term = frequency.term.trim();
    const rank = normalizePositiveFrequencyRank(frequency.frequency);
    if (!term || rank === null) {
      continue;
    }

    const reading =
      typeof frequency.reading === 'string' && frequency.reading.trim().length > 0
        ? frequency.reading.trim()
        : null;
    const normalizedEntry: NormalizedYomitanTermFrequency = {
      ...frequency,
      term,
      reading,
      frequency: rank,
    };

    // Store under both keys so lookups can prefer an exact pair and fall back to term-only.
    appendYomitanFrequencyEntry(
      byPair,
      makeYomitanFrequencyPairKey(term, reading),
      normalizedEntry,
    );
    appendYomitanFrequencyEntry(byTerm, term, normalizedEntry);
  }

  return { byPair, byTerm };
}
/**
 * Picks the rank of the most authoritative entry in a candidate list.
 *
 * The entry with the lowest `dictionaryPriority` value wins; among entries with equal
 * priority the lowest `frequency` wins. On a full tie the earliest entry is kept.
 *
 * @param entries - Candidate entries for one lookup text.
 * @returns The winning entry's `frequency`, or `null` for an empty list.
 */
function selectBestYomitanFrequencyRank(
  entries: ReadonlyArray<NormalizedYomitanTermFrequency>,
): number | null {
  let bestEntry: NormalizedYomitanTermFrequency | null = null;
  for (const entry of entries) {
    if (
      bestEntry === null ||
      entry.dictionaryPriority < bestEntry.dictionaryPriority ||
      (entry.dictionaryPriority === bestEntry.dictionaryPriority &&
        entry.frequency < bestEntry.frequency)
    ) {
      bestEntry = entry;
    }
  }
  return bestEntry?.frequency ?? null;
}
/**
 * Resolves the Yomitan frequency rank for one candidate lookup text of a token.
 *
 * Entries matching the exact (candidate text, token reading) pair are preferred; when there
 * are none, all entries for the candidate text regardless of reading are used. The best entry
 * among the chosen set is selected by `selectBestYomitanFrequencyRank`.
 *
 * @param token - Token supplying the reading used for the exact-pair lookup.
 * @param candidateText - Text to look up; trimmed before use.
 * @param matchMode - Kept for call-site compatibility; not consulted by the lookup itself.
 * @param frequencyIndex - Index built by `buildYomitanFrequencyIndex`.
 * @returns The selected rank, or `null` when the text is blank or has no entries.
 */
function getYomitanFrequencyRank(
  token: MergedToken,
  candidateText: string,
  matchMode: FrequencyDictionaryMatchMode,
  frequencyIndex: YomitanFrequencyIndex,
): number | null {
  const normalizedCandidateText = candidateText.trim();
  if (!normalizedCandidateText) {
    return null;
  }

  const reading =
    typeof token.reading === 'string' && token.reading.trim().length > 0
      ? token.reading.trim()
      : null;

  const pairEntries =
    frequencyIndex.byPair.get(makeYomitanFrequencyPairKey(normalizedCandidateText, reading)) ?? [];
  // Fall back to term-only entries only when no exact pair exists.
  const candidateEntries =
    pairEntries.length > 0
      ? pairEntries
      : (frequencyIndex.byTerm.get(normalizedCandidateText) ?? []);
  if (candidateEntries.length === 0) {
    return null;
  }

  return selectBestYomitanFrequencyRank(candidateEntries);
}
function getLocalFrequencyRank(
@@ -416,7 +521,7 @@ function getLocalFrequencyRank(
function applyFrequencyRanks(
tokens: MergedToken[],
matchMode: FrequencyDictionaryMatchMode,
yomitanRankByTerm: Map<string, number>,
yomitanFrequencyIndex: YomitanFrequencyIndex,
getFrequencyRank: FrequencyDictionaryLookup | undefined,
): MergedToken[] {
if (tokens.length === 0) {
@@ -441,12 +546,19 @@ function applyFrequencyRanks(
};
}
const yomitanRank = yomitanRankByTerm.get(lookupText);
if (yomitanRank !== undefined) {
return {
...token,
frequencyRank: yomitanRank,
};
for (const candidateText of resolveYomitanFrequencyLookupTexts(token, matchMode)) {
const yomitanRank = getYomitanFrequencyRank(
token,
candidateText,
matchMode,
yomitanFrequencyIndex,
);
if (yomitanRank !== null) {
return {
...token,
frequencyRank: yomitanRank,
};
}
}
if (!getFrequencyRank) {
@@ -501,6 +613,7 @@ async function parseWithYomitanInternalParser(
isKnown: false,
isNPlusOneTarget: false,
isNameMatch: token.isNameMatch ?? false,
frequencyRank: token.frequencyRank,
}),
),
);
@@ -510,7 +623,7 @@ async function parseWithYomitanInternalParser(
}
deps.onTokenizationReady?.(text);
const frequencyRankPromise: Promise<Map<string, number>> = options.frequencyEnabled
const frequencyRankPromise: Promise<YomitanFrequencyIndex> = options.frequencyEnabled
? (async () => {
const frequencyMatchMode = options.frequencyMatchMode;
const termReadingList = buildYomitanFrequencyTermReadingList(
@@ -522,9 +635,9 @@ async function parseWithYomitanInternalParser(
deps,
logger,
);
return buildYomitanFrequencyRankMap(yomitanFrequencies);
return buildYomitanFrequencyIndex(yomitanFrequencies);
})()
: Promise.resolve(new Map<string, number>());
: Promise.resolve({ byPair: new Map(), byTerm: new Map() });
const mecabEnrichmentPromise: Promise<MergedToken[]> = needsMecabPosEnrichment(options)
? (async () => {
@@ -545,7 +658,7 @@ async function parseWithYomitanInternalParser(
})()
: Promise.resolve(normalizedSelectedTokens);
const [yomitanRankByTerm, enrichedTokens] = await Promise.all([
const [yomitanFrequencyIndex, enrichedTokens] = await Promise.all([
frequencyRankPromise,
mecabEnrichmentPromise,
]);
@@ -554,7 +667,7 @@ async function parseWithYomitanInternalParser(
return applyFrequencyRanks(
enrichedTokens,
options.frequencyMatchMode,
yomitanRankByTerm,
yomitanFrequencyIndex,
deps.getFrequencyRank,
);
}

View File

@@ -293,6 +293,29 @@ test('annotateTokens excludes default non-independent pos2 from frequency and N+
assert.equal(result[0]?.isNPlusOneTarget, false);
});
test('annotateTokens keeps frequency for kanji noun tokens even when mecab marks them non-independent', () => {
  // A single-kanji noun tagged 名詞 / 非自立 that already carries a frequency rank.
  const nonIndependentKanjiNoun = makeToken({
    surface: '者',
    reading: 'もの',
    headword: '者',
    partOfSpeech: PartOfSpeech.other,
    pos1: '名詞',
    pos2: '非自立',
    pos3: '一般',
    startPos: 0,
    endPos: 1,
    frequencyRank: 475,
  });

  const annotated = annotateTokens([nonIndependentKanjiNoun], makeDeps(), {
    minSentenceWordsForNPlusOne: 1,
  });

  // The rank must survive the POS-based frequency filter.
  assert.equal(annotated[0]?.frequencyRank, 475);
});
test('annotateTokens excludes likely kana SFX tokens from frequency when POS tags are missing', () => {
const tokens = [
makeToken({

View File

@@ -89,6 +89,23 @@ function normalizePos2Tag(pos2: string | undefined): string {
return typeof pos2 === 'string' ? pos2.trim() : '';
}
/**
 * Reports whether `text` contains at least one CJK ideograph (kanji).
 *
 * Recognized ranges:
 * - U+3400–U+4DBF   CJK Unified Ideographs Extension A
 * - U+4E00–U+9FFF   CJK Unified Ideographs
 * - U+F900–U+FAFF   CJK Compatibility Ideographs
 * - U+20000–U+3FFFF Supplementary and Tertiary Ideographic Planes (Extension B onward and the
 *   compatibility supplement), so kanji such as U+20B9F (𠮟) are detected too.
 *
 * The string is walked by code point, so surrogate pairs are evaluated as one character.
 *
 * @param text - Text to inspect; may be empty.
 * @returns `true` as soon as one ideograph is found, otherwise `false`.
 */
function hasKanjiChar(text: string): boolean {
  for (const char of text) {
    const code = char.codePointAt(0);
    if (code === undefined) {
      continue;
    }
    if (
      (code >= 0x3400 && code <= 0x4dbf) ||
      (code >= 0x4e00 && code <= 0x9fff) ||
      (code >= 0xf900 && code <= 0xfaff) ||
      (code >= 0x20000 && code <= 0x3ffff)
    ) {
      return true;
    }
  }
  return false;
}
function isExcludedComponent(
pos1: string | undefined,
pos2: string | undefined,
@@ -169,6 +186,34 @@ function isFrequencyExcludedByPos(
);
}
/**
 * Decides whether a token keeps its frequency rank despite being excluded by POS rules.
 *
 * Returns `true` only when all of the following hold:
 * - `pos1Exclusions` does not contain '名詞';
 * - the token has a finite numeric `frequencyRank`;
 * - its normalized pos1 and pos2 each consist of exactly one part, '名詞' and '非自立';
 * - its surface or headword contains a kanji character.
 *
 * @param token - Token under consideration.
 * @param pos1Exclusions - Active pos1 exclusion set.
 * @returns Whether the frequency rank should be retained.
 */
function shouldKeepFrequencyForNonIndependentKanjiNoun(
  token: MergedToken,
  pos1Exclusions: ReadonlySet<string>,
): boolean {
  if (pos1Exclusions.has('名詞')) {
    return false;
  }

  const { frequencyRank } = token;
  const hasUsableRank = typeof frequencyRank === 'number' && Number.isFinite(frequencyRank);
  if (!hasUsableRank) {
    return false;
  }

  const pos1Parts = splitNormalizedTagParts(normalizePos1Tag(token.pos1));
  const pos2Parts = splitNormalizedTagParts(normalizePos2Tag(token.pos2));
  const isLoneNonIndependentNoun =
    pos1Parts.length === 1 &&
    pos2Parts.length === 1 &&
    pos1Parts[0] === '名詞' &&
    pos2Parts[0] === '非自立';

  return isLoneNonIndependentNoun && (hasKanjiChar(token.surface) || hasKanjiChar(token.headword));
}
export function shouldExcludeTokenFromVocabularyPersistence(
token: MergedToken,
options: Pick<AnnotationStageOptions, 'pos1Exclusions' | 'pos2Exclusions'> = {},
@@ -454,7 +499,10 @@ function filterTokenFrequencyRank(
pos1Exclusions: ReadonlySet<string>,
pos2Exclusions: ReadonlySet<string>,
): number | undefined {
if (isFrequencyExcludedByPos(token, pos1Exclusions, pos2Exclusions)) {
if (
isFrequencyExcludedByPos(token, pos1Exclusions, pos2Exclusions) &&
!shouldKeepFrequencyForNonIndependentKanjiNoun(token, pos1Exclusions)
) {
return undefined;
}

View File

@@ -188,6 +188,7 @@ test('requestYomitanTermFrequencies returns normalized frequency entries', async
{
term: '猫',
reading: 'ねこ',
hasReading: true,
dictionary: 'freq-dict',
dictionaryPriority: 0,
frequency: 77,
@@ -197,6 +198,7 @@ test('requestYomitanTermFrequencies returns normalized frequency entries', async
{
term: '鍛える',
reading: 'きたえる',
hasReading: false,
dictionary: 'freq-dict',
dictionaryPriority: 1,
frequency: 46961,
@@ -217,9 +219,11 @@ test('requestYomitanTermFrequencies returns normalized frequency entries', async
assert.equal(result.length, 2);
assert.equal(result[0]?.term, '猫');
assert.equal(result[0]?.hasReading, true);
assert.equal(result[0]?.frequency, 77);
assert.equal(result[0]?.dictionaryPriority, 0);
assert.equal(result[1]?.term, '鍛える');
assert.equal(result[1]?.hasReading, false);
assert.equal(result[1]?.frequency, 2847);
assert.match(scriptValue, /getTermFrequencies/);
assert.match(scriptValue, /optionsGetFull/);
@@ -247,6 +251,96 @@ test('requestYomitanTermFrequencies prefers primary rank from displayValue array
assert.equal(result[0]?.frequency, 7141);
});
test('requestYomitanTermFrequencies prefers primary rank from displayValue string pair when raw frequency matches trailing count', async () => {
  // displayValue is "118,121" while the raw frequency (121) equals the trailing number.
  const makeRawEntry = () => ({
    term: '潜む',
    reading: 'ひそむ',
    dictionary: 'freq-dict',
    dictionaryPriority: 0,
    frequency: 121,
    displayValue: '118,121',
    displayValueParsed: false,
  });
  const deps = createDeps(async () => [makeRawEntry()]);
  const silentLogger = { error: () => undefined };

  const frequencies = await requestYomitanTermFrequencies(
    [{ term: '潜む', reading: 'ひそむ' }],
    deps,
    silentLogger,
  );

  // The leading number of the display value is reported as the frequency.
  assert.equal(frequencies.length, 1);
  assert.equal(frequencies[0]?.term, '潜む');
  assert.equal(frequencies[0]?.frequency, 118);
});
test('requestYomitanTermFrequencies uses leading display digits for displayValue strings', async () => {
  // displayValue "1,234" is read up to the first non-digit, so the expected frequency is 1.
  const makeRawEntry = () => ({
    term: '例',
    reading: 'れい',
    dictionary: 'freq-dict',
    dictionaryPriority: 0,
    frequency: 1234,
    displayValue: '1,234',
    displayValueParsed: false,
  });
  const deps = createDeps(async () => [makeRawEntry()]);
  const silentLogger = { error: () => undefined };

  const frequencies = await requestYomitanTermFrequencies(
    [{ term: '例', reading: 'れい' }],
    deps,
    silentLogger,
  );

  assert.equal(frequencies.length, 1);
  assert.equal(frequencies[0]?.term, '例');
  assert.equal(frequencies[0]?.frequency, 1);
});
// The profile metadata marks CC100 as 'occurrence-based', so its entry must be dropped and the
// request must return no frequencies. Also checks that the metadata script asks Yomitan for
// dictionary info.
test('requestYomitanTermFrequencies ignores occurrence-based dictionaries for rank tagging', async () => {
// Captures the metadata script so its content can be asserted afterwards.
let metadataScript = '';
const deps = createDeps(async (script) => {
// Branch order matters: frequency scripts are answered first, metadata scripts second.
if (script.includes('getTermFrequencies')) {
return [
{
term: '潜む',
reading: 'ひそむ',
dictionary: 'CC100',
frequency: 118121,
displayValue: null,
displayValueParsed: false,
},
];
}
if (script.includes('optionsGetFull')) {
metadataScript = script;
return {
profileCurrent: 0,
profileIndex: 0,
scanLength: 40,
dictionaries: ['CC100'],
dictionaryPriorityByName: { CC100: 0 },
dictionaryFrequencyModeByName: { CC100: 'occurrence-based' },
profiles: [
{
options: {
scanning: { length: 40 },
dictionaries: [{ name: 'CC100', enabled: true, id: 0 }],
},
},
],
};
}
return [];
});
const result = await requestYomitanTermFrequencies([{ term: '潜む', reading: 'ひそむ' }], deps, {
error: () => undefined,
});
assert.deepEqual(result, []);
assert.match(metadataScript, /getDictionaryInfo/);
});
test('requestYomitanTermFrequencies requests term-only fallback only after reading miss', async () => {
const frequencyScripts: string[] = [];
const deps = createDeps(async (script) => {
@@ -485,6 +579,317 @@ test('requestYomitanScanTokens uses left-to-right termsFind scanning instead of
assert.match(scannerScript ?? '', /deinflect:\s*true/);
});
// Two-phase test: first capture the scanner script that requestYomitanScanTokens injects, then
// execute that script against a fake termsFind backend. The entry has frequencies from three
// rank-based dictionaries; the expected rank (4073) is the leading number of the display value
// from the dictionary with priority 0.
test('requestYomitanScanTokens extracts best frequency rank from selected termsFind entry', async () => {
let scannerScript = '';
const deps = createDeps(async (script) => {
if (script.includes('termsFind')) {
// Capture only; the real scan result comes from runInjectedYomitanScript below.
scannerScript = script;
return [];
}
if (script.includes('optionsGetFull')) {
return {
profileCurrent: 0,
profileIndex: 0,
scanLength: 40,
dictionaries: ['JPDBv2㋕', 'Jiten', 'CC100'],
dictionaryPriorityByName: {
'JPDBv2㋕': 0,
Jiten: 1,
CC100: 2,
},
dictionaryFrequencyModeByName: {
'JPDBv2㋕': 'rank-based',
Jiten: 'rank-based',
CC100: 'rank-based',
},
profiles: [
{
options: {
scanning: { length: 40 },
dictionaries: [
{ name: 'JPDBv2㋕', enabled: true, id: 0 },
{ name: 'Jiten', enabled: true, id: 1 },
{ name: 'CC100', enabled: true, id: 2 },
],
},
},
],
};
}
return null;
});
await requestYomitanScanTokens('潜み', deps, {
error: () => undefined,
});
// Fake backend: only text starting with 潜み yields a dictionary entry.
const result = await runInjectedYomitanScript(scannerScript, (action, params) => {
if (action !== 'termsFind') {
throw new Error(`unexpected action: ${action}`);
}
const text = (params as { text?: string } | undefined)?.text ?? '';
if (!text.startsWith('潜み')) {
return { originalTextLength: 0, dictionaryEntries: [] };
}
return {
originalTextLength: 2,
dictionaryEntries: [
{
headwords: [
{
term: '潜む',
reading: 'ひそむ',
sources: [{ originalText: '潜み', isPrimary: true, matchType: 'exact' }],
},
],
frequencies: [
{
headwordIndex: 0,
dictionary: 'JPDBv2㋕',
frequency: 20181,
displayValue: '4073,20181句',
},
{
headwordIndex: 0,
dictionary: 'Jiten',
frequency: 28594,
displayValue: '4592,28594句',
},
{
headwordIndex: 0,
dictionary: 'CC100',
frequency: 118121,
displayValue: null,
},
],
},
],
};
});
assert.deepEqual(result, [
{
surface: '潜み',
reading: 'ひそ',
headword: '潜む',
startPos: 0,
endPos: 2,
isNameMatch: false,
frequencyRank: 4073,
},
]);
});
// Same capture-then-run setup as the previous scan test. Here the first exact-match entry has
// an empty frequencies list and a second exact-match entry carries the data; the token must
// still receive rank 475 (priority-0 dictionary) rather than no rank at all.
test('requestYomitanScanTokens uses frequency from later exact-match entry when first exact entry has none', async () => {
let scannerScript = '';
const deps = createDeps(async (script) => {
if (script.includes('termsFind')) {
scannerScript = script;
return [];
}
if (script.includes('optionsGetFull')) {
return {
profileCurrent: 0,
profileIndex: 0,
scanLength: 40,
dictionaries: ['JPDBv2㋕', 'Jiten', 'CC100'],
dictionaryPriorityByName: {
'JPDBv2㋕': 0,
Jiten: 1,
CC100: 2,
},
dictionaryFrequencyModeByName: {
'JPDBv2㋕': 'rank-based',
Jiten: 'rank-based',
CC100: 'rank-based',
},
profiles: [
{
options: {
scanning: { length: 40 },
dictionaries: [
{ name: 'JPDBv2㋕', enabled: true, id: 0 },
{ name: 'Jiten', enabled: true, id: 1 },
{ name: 'CC100', enabled: true, id: 2 },
],
},
},
],
};
}
return null;
});
await requestYomitanScanTokens('者', deps, {
error: () => undefined,
});
const result = await runInjectedYomitanScript(scannerScript, (action, params) => {
if (action !== 'termsFind') {
throw new Error(`unexpected action: ${action}`);
}
const text = (params as { text?: string } | undefined)?.text ?? '';
if (!text.startsWith('者')) {
return { originalTextLength: 0, dictionaryEntries: [] };
}
return {
originalTextLength: 1,
dictionaryEntries: [
// First exact primary match: no frequency data.
{
headwords: [
{
term: '者',
reading: 'もの',
sources: [{ originalText: '者', isPrimary: true, matchType: 'exact' }],
},
],
frequencies: [],
},
// Second exact primary match: carries the frequencies.
{
headwords: [
{
term: '者',
reading: 'もの',
sources: [{ originalText: '者', isPrimary: true, matchType: 'exact' }],
},
],
frequencies: [
{
headwordIndex: 0,
dictionary: 'JPDBv2㋕',
frequency: 79601,
displayValue: '475,79601句',
},
{
headwordIndex: 0,
dictionary: 'Jiten',
frequency: 338,
displayValue: '338',
},
],
},
],
};
});
// Priority decides: 475 from the priority-0 dictionary beats the smaller 338 from priority 1.
assert.deepEqual(result, [
{
surface: '者',
reading: 'もの',
headword: '者',
startPos: 0,
endPos: 1,
isNameMatch: false,
frequencyRank: 475,
},
]);
});
// Variant of the later-entry test: the entry that carries frequencies matches the text exactly
// but its source has isPrimary: false. The rank (475) must still be picked up from it.
test('requestYomitanScanTokens can use frequency from later exact secondary-match entry', async () => {
let scannerScript = '';
const deps = createDeps(async (script) => {
if (script.includes('termsFind')) {
scannerScript = script;
return [];
}
if (script.includes('optionsGetFull')) {
return {
profileCurrent: 0,
profileIndex: 0,
scanLength: 40,
dictionaries: ['JPDBv2㋕', 'Jiten', 'CC100'],
dictionaryPriorityByName: {
'JPDBv2㋕': 0,
Jiten: 1,
CC100: 2,
},
dictionaryFrequencyModeByName: {
'JPDBv2㋕': 'rank-based',
Jiten: 'rank-based',
CC100: 'rank-based',
},
profiles: [
{
options: {
scanning: { length: 40 },
dictionaries: [
{ name: 'JPDBv2㋕', enabled: true, id: 0 },
{ name: 'Jiten', enabled: true, id: 1 },
{ name: 'CC100', enabled: true, id: 2 },
],
},
},
],
};
}
return null;
});
await requestYomitanScanTokens('者', deps, {
error: () => undefined,
});
const result = await runInjectedYomitanScript(scannerScript, (action, params) => {
if (action !== 'termsFind') {
throw new Error(`unexpected action: ${action}`);
}
const text = (params as { text?: string } | undefined)?.text ?? '';
if (!text.startsWith('者')) {
return { originalTextLength: 0, dictionaryEntries: [] };
}
return {
originalTextLength: 1,
dictionaryEntries: [
// Primary exact match without frequencies.
{
headwords: [
{
term: '者',
reading: 'もの',
sources: [{ originalText: '者', isPrimary: true, matchType: 'exact' }],
},
],
frequencies: [],
},
// Non-primary exact match that holds the only frequency entry.
{
headwords: [
{
term: '者',
reading: 'もの',
sources: [{ originalText: '者', isPrimary: false, matchType: 'exact' }],
},
],
frequencies: [
{
headwordIndex: 0,
dictionary: 'JPDBv2㋕',
frequency: 79601,
displayValue: '475,79601句',
},
],
},
],
};
});
assert.deepEqual(result, [
{
surface: '者',
reading: 'もの',
headword: '者',
startPos: 0,
endPos: 1,
isNameMatch: false,
frequencyRank: 475,
},
]);
});
test('requestYomitanScanTokens marks tokens backed by SubMiner character dictionary entries', async () => {
const deps = createDeps(async (script) => {
if (script.includes('optionsGetFull')) {

View File

@@ -20,19 +20,24 @@ interface YomitanParserRuntimeDeps {
createYomitanExtensionWindow?: (pageName: string) => Promise<BrowserWindow | null>;
}
/**
 * Frequency mode of a dictionary. Only these two literals are accepted when profile metadata
 * is parsed; frequency entries from 'occurrence-based' dictionaries are skipped during
 * normalization.
 */
type YomitanFrequencyMode = 'occurrence-based' | 'rank-based';
/**
 * Description of one dictionary: its title plus optional revision and frequency mode.
 * NOTE(review): presumably mirrors the records returned by Yomitan's getDictionaryInfo — confirm.
 */
export interface YomitanDictionaryInfo {
title: string;
revision?: string | number;
// Present only when the dictionary declares how its frequency values should be read.
frequencyMode?: YomitanFrequencyMode;
}
/**
 * One frequency record for a term, as produced by toYomitanTermFrequency from a raw entry.
 */
export interface YomitanTermFrequency {
term: string;
// Raw reading when it is a string, otherwise null.
reading: string | null;
// False when the raw entry says hasReading === false; otherwise true iff reading is non-null.
hasReading: boolean;
dictionary: string;
dictionaryPriority: number;
// Number parsed from displayValue when that succeeds, otherwise the parsed raw frequency.
frequency: number;
// Raw display string, or null when the raw value is not a string.
displayValue: string | null;
// True only when the raw entry has displayValueParsed === true.
displayValueParsed: boolean;
// True when `frequency` was taken from displayValue rather than from the raw frequency.
frequencyDerivedFromDisplayValue: boolean;
}
export interface YomitanTermReadingPair {
@@ -47,6 +52,7 @@ export interface YomitanScanToken {
startPos: number;
endPos: number;
isNameMatch?: boolean;
frequencyRank?: number;
}
interface YomitanProfileMetadata {
@@ -54,6 +60,7 @@ interface YomitanProfileMetadata {
scanLength: number;
dictionaries: string[];
dictionaryPriorityByName: Record<string, number>;
dictionaryFrequencyModeByName: Partial<Record<string, YomitanFrequencyMode>>;
}
const DEFAULT_YOMITAN_SCAN_LENGTH = 40;
@@ -78,7 +85,8 @@ function isScanTokenArray(value: unknown): value is YomitanScanToken[] {
typeof entry.headword === 'string' &&
typeof entry.startPos === 'number' &&
typeof entry.endPos === 'number' &&
(entry.isNameMatch === undefined || typeof entry.isNameMatch === 'boolean'),
(entry.isNameMatch === undefined || typeof entry.isNameMatch === 'boolean') &&
(entry.frequencyRank === undefined || typeof entry.frequencyRank === 'number'),
)
);
}
@@ -117,24 +125,22 @@ function parsePositiveFrequencyString(value: string): number | null {
return null;
}
const numericPrefix = trimmed.match(/^\d[\d,]*/)?.[0];
if (!numericPrefix) {
const numericMatch = trimmed.match(/[+-]?(\d+(\.\d*)?|\.\d+)([eE][+-]?\d+)?/)?.[0];
if (!numericMatch) {
return null;
}
const chunks = numericPrefix.split(',');
const normalizedNumber =
chunks.length <= 1
? (chunks[0] ?? '')
: chunks.slice(1).every((chunk) => /^\d{3}$/.test(chunk))
? chunks.join('')
: (chunks[0] ?? '');
const parsed = Number.parseInt(normalizedNumber, 10);
const parsed = Number.parseFloat(numericMatch);
if (!Number.isFinite(parsed) || parsed <= 0) {
return null;
}
return parsed;
const normalized = Math.floor(parsed);
if (!Number.isFinite(normalized) || normalized <= 0) {
return null;
}
return normalized;
}
function parsePositiveFrequencyValue(value: unknown): number | null {
@@ -159,6 +165,19 @@ function parsePositiveFrequencyValue(value: unknown): number | null {
return null;
}
/**
 * Parses a frequency from a dictionary display value.
 *
 * For strings, only the run of digits at the very start (after trimming) is used, so
 * "118,121" yields 118 and "1,234" yields 1. Non-string values are delegated to
 * `parsePositiveFrequencyValue`.
 *
 * @param value - Raw display value of unknown type.
 * @returns A positive integer, or `null` when no usable number is found.
 */
function parseDisplayFrequencyValue(value: unknown): number | null {
  if (typeof value !== 'string') {
    return parsePositiveFrequencyValue(value);
  }

  const leadingDigitsMatch = /^\d+/.exec(value.trim());
  if (leadingDigitsMatch === null) {
    return null;
  }

  const rank = Number.parseInt(leadingDigitsMatch[0], 10);
  if (!Number.isFinite(rank) || rank <= 0) {
    return null;
  }
  return rank;
}
function toYomitanTermFrequency(value: unknown): YomitanTermFrequency | null {
if (!isObject(value)) {
return null;
@@ -169,9 +188,7 @@ function toYomitanTermFrequency(value: unknown): YomitanTermFrequency | null {
const rawFrequency = parsePositiveFrequencyValue(value.frequency);
const displayValueRaw = value.displayValue;
const parsedDisplayFrequency =
displayValueRaw !== null && displayValueRaw !== undefined
? parsePositiveFrequencyValue(displayValueRaw)
: null;
displayValueRaw !== null && displayValueRaw !== undefined ? parseDisplayFrequencyValue(displayValueRaw) : null;
const frequency = parsedDisplayFrequency ?? rawFrequency;
if (!term || !dictionary || frequency === null) {
return null;
@@ -184,17 +201,20 @@ function toYomitanTermFrequency(value: unknown): YomitanTermFrequency | null {
const reading =
value.reading === null ? null : typeof value.reading === 'string' ? value.reading : null;
const hasReading = value.hasReading === false ? false : reading !== null;
const displayValue = typeof displayValueRaw === 'string' ? displayValueRaw : null;
const displayValueParsed = value.displayValueParsed === true;
return {
term,
reading,
hasReading,
dictionary,
dictionaryPriority,
frequency,
displayValue,
displayValueParsed,
frequencyDerivedFromDisplayValue: parsedDisplayFrequency !== null,
};
}
@@ -300,17 +320,34 @@ function toYomitanProfileMetadata(value: unknown): YomitanProfileMetadata | null
}
}
const dictionaryFrequencyModeByNameRaw = value.dictionaryFrequencyModeByName;
const dictionaryFrequencyModeByName: Partial<Record<string, YomitanFrequencyMode>> = {};
if (isObject(dictionaryFrequencyModeByNameRaw)) {
for (const [name, frequencyModeRaw] of Object.entries(dictionaryFrequencyModeByNameRaw)) {
const normalizedName = name.trim();
if (!normalizedName) {
continue;
}
if (frequencyModeRaw !== 'occurrence-based' && frequencyModeRaw !== 'rank-based') {
continue;
}
dictionaryFrequencyModeByName[normalizedName] = frequencyModeRaw;
}
}
return {
profileIndex,
scanLength,
dictionaries,
dictionaryPriorityByName,
dictionaryFrequencyModeByName,
};
}
function normalizeFrequencyEntriesWithPriority(
rawResult: unknown[],
dictionaryPriorityByName: Record<string, number>,
dictionaryFrequencyModeByName: Partial<Record<string, YomitanFrequencyMode>>,
): YomitanTermFrequency[] {
const normalized: YomitanTermFrequency[] = [];
for (const entry of rawResult) {
@@ -319,6 +356,10 @@ function normalizeFrequencyEntriesWithPriority(
continue;
}
if (dictionaryFrequencyModeByName[frequency.dictionary] === 'occurrence-based') {
continue;
}
const dictionaryPriority = dictionaryPriorityByName[frequency.dictionary];
normalized.push({
...frequency,
@@ -425,8 +466,34 @@ async function requestYomitanProfileMetadata(
acc[entry.name] = index;
return acc;
}, {});
let dictionaryFrequencyModeByName = {};
try {
const dictionaryInfo = await invoke("getDictionaryInfo", undefined);
dictionaryFrequencyModeByName = Array.isArray(dictionaryInfo)
? dictionaryInfo.reduce((acc, entry) => {
if (!entry || typeof entry !== "object" || typeof entry.title !== "string") {
return acc;
}
if (
entry.frequencyMode === "occurrence-based" ||
entry.frequencyMode === "rank-based"
) {
acc[entry.title] = entry.frequencyMode;
}
return acc;
}, {})
: {};
} catch {
dictionaryFrequencyModeByName = {};
}
return { profileIndex, scanLength, dictionaries, dictionaryPriorityByName };
return {
profileIndex,
scanLength,
dictionaries,
dictionaryPriorityByName,
dictionaryFrequencyModeByName
};
})();
`;
@@ -774,7 +841,133 @@ const YOMITAN_SCANNING_HELPERS = String.raw`
}
return segments;
}
function getPreferredHeadword(dictionaryEntries, token) {
function parsePositiveFrequencyNumber(value) {
if (typeof value === 'number' && Number.isFinite(value) && value > 0) {
return Math.max(1, Math.floor(value));
}
if (typeof value === 'string') {
const numericMatch = value.trim().match(/[+-]?(\d+(\.\d*)?|\.\d+)([eE][+-]?\d+)?/)?.[0];
if (!numericMatch) { return null; }
const parsed = Number.parseFloat(numericMatch);
if (!Number.isFinite(parsed) || parsed <= 0) { return null; }
return Math.max(1, Math.floor(parsed));
}
if (Array.isArray(value)) {
for (const item of value) {
const parsed = parsePositiveFrequencyNumber(item);
if (parsed !== null) { return parsed; }
}
}
return null;
}
function parseDisplayFrequencyNumber(value) {
if (typeof value === 'string') {
const leadingDigits = value.trim().match(/^\d+/)?.[0];
if (!leadingDigits) { return null; }
const parsed = Number.parseInt(leadingDigits, 10);
return Number.isFinite(parsed) && parsed > 0 ? parsed : null;
}
return parsePositiveFrequencyNumber(value);
}
function getFrequencyDictionaryName(frequency) {
const candidates = [
frequency?.dictionary,
frequency?.dictionaryName,
frequency?.name,
frequency?.title,
frequency?.dictionaryTitle,
frequency?.dictionaryAlias
];
for (const candidate of candidates) {
if (typeof candidate === 'string' && candidate.trim().length > 0) {
return candidate.trim();
}
}
return null;
}
/**
 * Chooses the frequency rank to report for one headword of a dictionary entry.
 *
 * A frequency record is considered only when:
 *  - it is an object,
 *  - its numeric headwordIndex equals headwordIndex, or it has no numeric
 *    headwordIndex and the entry has at most one headword,
 *  - a dictionary name can be resolved for it,
 *  - that dictionary is not marked 'occurrence-based', and
 *  - a rank can be parsed from displayValue, falling back to frequency.
 *
 * Among the remaining records the one with the lowest priority number wins;
 * ties are broken by the lowest rank. Priority comes from
 * dictionaryPriorityByName, then from the record's dictionaryIndex, and is
 * Number.MAX_SAFE_INTEGER when neither is a finite number.
 *
 * Returns the winning rank, or null when no record qualifies.
 */
function getBestFrequencyRank(dictionaryEntry, headwordIndex, dictionaryPriorityByName, dictionaryFrequencyModeByName) {
  // Finite numbers become non-negative integers; anything else is "no priority".
  const toPriority = (raw) =>
    typeof raw === 'number' && Number.isFinite(raw) ? Math.max(0, Math.floor(raw)) : null;

  const hasSeveralHeadwords =
    Array.isArray(dictionaryEntry?.headwords) && dictionaryEntry.headwords.length > 1;

  let winner = null;
  for (const record of dictionaryEntry?.frequencies || []) {
    if (!record || typeof record !== 'object') {
      continue;
    }

    // Records without a numeric headword index are ambiguous once the entry
    // carries more than one headword, so they are ignored in that case.
    const boundIndex = record.headwordIndex;
    const appliesToHeadword =
      typeof boundIndex === 'number' ? boundIndex === headwordIndex : !hasSeveralHeadwords;
    if (!appliesToHeadword) {
      continue;
    }

    const dictionaryName = getFrequencyDictionaryName(record);
    if (!dictionaryName) {
      continue;
    }
    if (dictionaryFrequencyModeByName[dictionaryName] === 'occurrence-based') {
      continue;
    }

    const rank =
      parseDisplayFrequencyNumber(record.displayValue) ??
      parsePositiveFrequencyNumber(record.frequency);
    if (rank === null) {
      continue;
    }

    const priority =
      toPriority(dictionaryPriorityByName[dictionaryName]) ??
      toPriority(record.dictionaryIndex) ??
      Number.MAX_SAFE_INTEGER;

    const beatsWinner =
      winner === null ||
      priority < winner.priority ||
      (priority === winner.priority && rank < winner.rank);
    if (beatsWinner) {
      winner = { priority, rank };
    }
  }
  return winner === null ? null : winner.rank;
}
/**
 * Reports whether a headword was reached from exactly the given source text.
 *
 * A source qualifies when its originalText equals token and its matchType is
 * 'exact'; when requirePrimary is truthy the source must also have a truthy
 * isPrimary flag.
 *
 * Malformed data is treated as "no match" rather than an error: a nullish
 * headword, a sources value that is not an array, and nullish entries inside
 * the sources array all result in false instead of a TypeError.
 */
function hasExactSource(headword, token, requirePrimary) {
  const sources = Array.isArray(headword?.sources) ? headword.sources : [];
  return sources.some((src) =>
    Boolean(src) &&
    src.originalText === token &&
    (!requirePrimary || Boolean(src.isPrimary)) &&
    src.matchType === 'exact'
  );
}
/**
 * Gathers every headword, across all dictionary entries, that hasExactSource
 * accepts for the given token.
 *
 * Entries whose headwords value is not an array contribute nothing. Each
 * result records the owning dictionaryEntry, the headword itself and the
 * headword's position inside that entry, in encounter order.
 */
function collectExactHeadwordMatches(dictionaryEntries, token, requirePrimary) {
  const found = [];
  for (const dictionaryEntry of dictionaryEntries || []) {
    if (!Array.isArray(dictionaryEntry?.headwords)) {
      continue;
    }
    for (const [headwordIndex, headword] of dictionaryEntry.headwords.entries()) {
      if (hasExactSource(headword, token, requirePrimary)) {
        found.push({ dictionaryEntry, headword, headwordIndex });
      }
    }
  }
  return found;
}
/**
 * Tells whether two match records refer to the same headword.
 *
 * Both records must be truthy, their headword terms must be strictly equal,
 * and their readings must be equal after any non-string reading is treated
 * as the empty string.
 */
function sameHeadword(match, preferredMatch) {
  // A reading that is absent or not a string compares as ''.
  const readingOf = (candidate) => {
    const reading = candidate.headword?.reading;
    return typeof reading === 'string' ? reading : '';
  };
  return (
    Boolean(match && preferredMatch) &&
    match.headword?.term === preferredMatch.headword?.term &&
    readingOf(match) === readingOf(preferredMatch)
  );
}
/**
 * Returns the smallest rank that getBestFrequencyRank produces for any of the
 * given matches, or null when none of them yields a rank.
 *
 * Each match supplies the dictionaryEntry and headwordIndex passed on to
 * getBestFrequencyRank together with the two lookup maps.
 */
function getBestFrequencyRankForMatches(matches, dictionaryPriorityByName, dictionaryFrequencyModeByName) {
  let lowest = null;
  for (const { dictionaryEntry, headwordIndex } of matches) {
    const candidate = getBestFrequencyRank(
      dictionaryEntry,
      headwordIndex,
      dictionaryPriorityByName,
      dictionaryFrequencyModeByName
    );
    const improves = candidate !== null && (lowest === null || candidate < lowest);
    if (improves) {
      lowest = candidate;
    }
  }
  return lowest;
}
function getPreferredHeadword(dictionaryEntries, token, dictionaryPriorityByName, dictionaryFrequencyModeByName) {
function appendDictionaryNames(target, value) {
if (!value || typeof value !== 'object') {
return;
@@ -813,36 +1006,33 @@ const YOMITAN_SCANNING_HELPERS = String.raw`
}
return getDictionaryEntryNames(entry).some((name) => name.startsWith("SubMiner Character Dictionary"));
}
function hasExactPrimarySource(headword, token) {
for (const src of headword.sources || []) {
if (src.originalText !== token) { continue; }
if (!src.isPrimary) { continue; }
if (src.matchType !== 'exact') { continue; }
return true;
}
return false;
}
const exactPrimaryMatches = collectExactHeadwordMatches(dictionaryEntries, token, true);
let matchedNameDictionary = false;
if (includeNameMatchMetadata) {
for (const dictionaryEntry of dictionaryEntries || []) {
if (!isNameDictionaryEntry(dictionaryEntry)) { continue; }
for (const headword of dictionaryEntry.headwords || []) {
if (!hasExactPrimarySource(headword, token)) { continue; }
for (const match of exactPrimaryMatches) {
if (match.dictionaryEntry !== dictionaryEntry) { continue; }
matchedNameDictionary = true;
break;
}
if (matchedNameDictionary) { break; }
}
}
for (const dictionaryEntry of dictionaryEntries || []) {
for (const headword of dictionaryEntry.headwords || []) {
if (!hasExactPrimarySource(headword, token)) { continue; }
return {
term: headword.term,
reading: headword.reading,
isNameMatch: matchedNameDictionary || isNameDictionaryEntry(dictionaryEntry)
};
}
const preferredMatch = exactPrimaryMatches[0];
if (preferredMatch) {
const exactFrequencyMatches = collectExactHeadwordMatches(dictionaryEntries, token, false)
.filter((match) => sameHeadword(match, preferredMatch));
return {
term: preferredMatch.headword.term,
reading: preferredMatch.headword.reading,
isNameMatch: matchedNameDictionary || isNameDictionaryEntry(preferredMatch.dictionaryEntry),
frequencyRank: getBestFrequencyRankForMatches(
exactFrequencyMatches.length > 0 ? exactFrequencyMatches : exactPrimaryMatches,
dictionaryPriorityByName,
dictionaryFrequencyModeByName
)
};
}
return null;
}
@@ -853,6 +1043,8 @@ function buildYomitanScanningScript(
profileIndex: number,
scanLength: number,
includeNameMatchMetadata: boolean,
dictionaryPriorityByName: Record<string, number>,
dictionaryFrequencyModeByName: Partial<Record<string, YomitanFrequencyMode>>,
): string {
return `
(async () => {
@@ -876,6 +1068,8 @@ function buildYomitanScanningScript(
});
${YOMITAN_SCANNING_HELPERS}
const includeNameMatchMetadata = ${includeNameMatchMetadata ? 'true' : 'false'};
const dictionaryPriorityByName = ${JSON.stringify(dictionaryPriorityByName)};
const dictionaryFrequencyModeByName = ${JSON.stringify(dictionaryFrequencyModeByName)};
const text = ${JSON.stringify(text)};
const details = {matchType: "exact", deinflect: true};
const tokens = [];
@@ -889,7 +1083,12 @@ ${YOMITAN_SCANNING_HELPERS}
const originalTextLength = typeof result?.originalTextLength === "number" ? result.originalTextLength : 0;
if (dictionaryEntries.length > 0 && originalTextLength > 0 && (originalTextLength !== character.length || isCodePointJapanese(codePoint))) {
const source = substring.substring(0, originalTextLength);
const preferredHeadword = getPreferredHeadword(dictionaryEntries, source);
const preferredHeadword = getPreferredHeadword(
dictionaryEntries,
source,
dictionaryPriorityByName,
dictionaryFrequencyModeByName
);
if (preferredHeadword && typeof preferredHeadword.term === "string") {
const reading = typeof preferredHeadword.reading === "string" ? preferredHeadword.reading : "";
const segments = distributeFuriganaInflected(preferredHeadword.term, reading, source);
@@ -900,6 +1099,10 @@ ${YOMITAN_SCANNING_HELPERS}
startPos: i,
endPos: i + originalTextLength,
isNameMatch: includeNameMatchMetadata && preferredHeadword.isNameMatch === true,
frequencyRank:
typeof preferredHeadword.frequencyRank === "number" && Number.isFinite(preferredHeadword.frequencyRank)
? Math.max(1, Math.floor(preferredHeadword.frequencyRank))
: undefined,
});
i += originalTextLength;
continue;
@@ -1036,6 +1239,8 @@ export async function requestYomitanScanTokens(
profileIndex,
scanLength,
options?.includeNameMatchMetadata === true,
metadata?.dictionaryPriorityByName ?? {},
metadata?.dictionaryFrequencyModeByName ?? {},
),
true,
);
@@ -1099,7 +1304,11 @@ async function fetchYomitanTermFrequencies(
try {
const rawResult = await parserWindow.webContents.executeJavaScript(script, true);
return Array.isArray(rawResult)
? normalizeFrequencyEntriesWithPriority(rawResult, metadata.dictionaryPriorityByName)
? normalizeFrequencyEntriesWithPriority(
rawResult,
metadata.dictionaryPriorityByName,
metadata.dictionaryFrequencyModeByName,
)
: [];
} catch (err) {
logger.error('Yomitan term frequency request failed:', (err as Error).message);
@@ -1541,10 +1750,15 @@ export async function getYomitanDictionaryInfo(
.map((entry) => {
const title = typeof entry.title === 'string' ? entry.title.trim() : '';
const revision = entry.revision;
const frequencyMode: YomitanFrequencyMode | undefined =
entry.frequencyMode === 'occurrence-based' || entry.frequencyMode === 'rank-based'
? entry.frequencyMode
: undefined;
return {
title,
revision:
typeof revision === 'string' || typeof revision === 'number' ? revision : undefined,
frequencyMode,
};
})
.filter((entry) => entry.title.length > 0);