mirror of
https://github.com/ksyasuda/SubMiner.git
synced 2026-03-20 12:11:28 -07:00
feat: improve stats dashboard and annotation settings
This commit is contained in:
@@ -53,6 +53,7 @@ const VOCABULARY_STATS = [
|
||||
pos2: '自立',
|
||||
pos3: null,
|
||||
frequency: 100,
|
||||
frequencyRank: 42,
|
||||
firstSeen: Date.now(),
|
||||
lastSeen: Date.now(),
|
||||
},
|
||||
@@ -132,9 +133,7 @@ const EPISODES_PER_DAY = [
|
||||
{ epochDay: Math.floor(Date.now() / 86_400_000), episodeCount: 1 },
|
||||
];
|
||||
|
||||
// Fixture: a single entry, dated two epoch-days before "now", reporting two newly started anime.
const NEW_ANIME_PER_DAY = [{ epochDay: Math.floor(Date.now() / 86_400_000) - 2, newAnimeCount: 2 }];
|
||||
|
||||
const WATCH_TIME_PER_ANIME = [
|
||||
{
|
||||
@@ -210,7 +209,12 @@ function createMockTracker(
|
||||
getSessionSummaries: async () => SESSION_SUMMARIES,
|
||||
getDailyRollups: async () => DAILY_ROLLUPS,
|
||||
getMonthlyRollups: async () => [],
|
||||
getQueryHints: async () => ({ totalSessions: 5, activeSessions: 1, episodesToday: 2, activeAnimeCount: 3 }),
|
||||
getQueryHints: async () => ({
|
||||
totalSessions: 5,
|
||||
activeSessions: 1,
|
||||
episodesToday: 2,
|
||||
activeAnimeCount: 3,
|
||||
}),
|
||||
getSessionTimeline: async () => [],
|
||||
getSessionEvents: async () => [],
|
||||
getVocabularyStats: async () => VOCABULARY_STATS,
|
||||
@@ -445,7 +449,9 @@ describe('stats server API routes', () => {
|
||||
}),
|
||||
);
|
||||
|
||||
const res = await app.request('/api/stats/kanji/occurrences?kanji=%E6%97%A5&limit=999999&offset=10');
|
||||
const res = await app.request(
|
||||
'/api/stats/kanji/occurrences?kanji=%E6%97%A5&limit=999999&offset=10',
|
||||
);
|
||||
assert.equal(res.status, 200);
|
||||
const body = await res.json();
|
||||
assert.ok(Array.isArray(body));
|
||||
@@ -711,6 +717,23 @@ describe('stats server API routes', () => {
|
||||
assert.equal(res.status, 400);
|
||||
});
|
||||
|
||||
it('DELETE /api/stats/sessions/:sessionId deletes a session', async () => {
|
||||
let deletedSessionId = 0;
|
||||
const app = createStatsApp(
|
||||
createMockTracker({
|
||||
deleteSession: async (sessionId: number) => {
|
||||
deletedSessionId = sessionId;
|
||||
},
|
||||
}),
|
||||
);
|
||||
|
||||
const res = await app.request('/api/stats/sessions/42', { method: 'DELETE' });
|
||||
|
||||
assert.equal(res.status, 200);
|
||||
assert.equal(deletedSessionId, 42);
|
||||
assert.deepEqual(await res.json(), { ok: true });
|
||||
});
|
||||
|
||||
it('POST /api/stats/anki/browse returns 400 for missing noteId', async () => {
|
||||
const app = createStatsApp(createMockTracker());
|
||||
const res = await app.request('/api/stats/anki/browse', { method: 'POST' });
|
||||
|
||||
@@ -130,6 +130,56 @@ test('createFrequencyDictionaryLookup parses composite displayValue by primary r
|
||||
assert.equal(lookup('高み'), 9933);
|
||||
});
|
||||
|
||||
test('createFrequencyDictionaryLookup uses leading display digits for displayValue strings', async () => {
|
||||
const tempDir = fs.mkdtempSync(path.join(os.tmpdir(), 'subminer-frequency-dict-'));
|
||||
const bankPath = path.join(tempDir, 'term_meta_bank_1.json');
|
||||
fs.writeFileSync(
|
||||
bankPath,
|
||||
JSON.stringify([
|
||||
['潜む', 1, { frequency: { value: 121, displayValue: '118,121' } }],
|
||||
['例', 2, { frequency: { value: 1234, displayValue: '1,234' } }],
|
||||
]),
|
||||
);
|
||||
|
||||
const lookup = await createFrequencyDictionaryLookup({
|
||||
searchPaths: [tempDir],
|
||||
log: () => undefined,
|
||||
});
|
||||
|
||||
assert.equal(lookup('潜む'), 118);
|
||||
assert.equal(lookup('例'), 1);
|
||||
});
|
||||
|
||||
test('createFrequencyDictionaryLookup ignores occurrence-based Yomitan dictionaries', async () => {
|
||||
const logs: string[] = [];
|
||||
const tempDir = fs.mkdtempSync(path.join(os.tmpdir(), 'subminer-frequency-dict-'));
|
||||
fs.writeFileSync(
|
||||
path.join(tempDir, 'index.json'),
|
||||
JSON.stringify({
|
||||
title: 'CC100',
|
||||
revision: '1',
|
||||
frequencyMode: 'occurrence-based',
|
||||
}),
|
||||
);
|
||||
fs.writeFileSync(
|
||||
path.join(tempDir, 'term_meta_bank_1.json'),
|
||||
JSON.stringify([['潜む', 1, { frequency: { value: 118121 } }]]),
|
||||
);
|
||||
|
||||
const lookup = await createFrequencyDictionaryLookup({
|
||||
searchPaths: [tempDir],
|
||||
log: (message) => {
|
||||
logs.push(message);
|
||||
},
|
||||
});
|
||||
|
||||
assert.equal(lookup('潜む'), null);
|
||||
assert.equal(
|
||||
logs.some((entry) => entry.includes('occurrence-based') && entry.includes('CC100')),
|
||||
true,
|
||||
);
|
||||
});
|
||||
|
||||
test('createFrequencyDictionaryLookup does not require synchronous fs APIs', async () => {
|
||||
const tempDir = fs.mkdtempSync(path.join(os.tmpdir(), 'subminer-frequency-dict-'));
|
||||
const bankPath = path.join(tempDir, 'term_meta_bank_1.json');
|
||||
|
||||
@@ -6,6 +6,8 @@ export interface FrequencyDictionaryLookupOptions {
|
||||
log: (message: string) => void;
|
||||
}
|
||||
|
||||
type FrequencyDictionaryMode = 'occurrence-based' | 'rank-based';
|
||||
|
||||
interface FrequencyDictionaryEntry {
|
||||
rank: number;
|
||||
term: string;
|
||||
@@ -29,30 +31,67 @@ function normalizeFrequencyTerm(value: string): string {
|
||||
return value.trim().toLowerCase();
|
||||
}
|
||||
|
||||
/**
 * Loads `index.json` from a dictionary directory and extracts the fields this module uses.
 *
 * Any failure yields `{ title: null, frequencyMode: null }`. A missing file (`ENOENT`) is
 * silent; other read errors and invalid JSON are reported through `log`.
 *
 * @param dictionaryPath - Directory expected to contain `index.json`.
 * @param log - Sink for diagnostic messages.
 * @returns The trimmed non-empty title (or `null`) and the frequency mode when it is one of
 *   the two recognised literals (otherwise `null`).
 */
async function readDictionaryMetadata(
  dictionaryPath: string,
  log: (message: string) => void,
): Promise<{ title: string | null; frequencyMode: FrequencyDictionaryMode | null }> {
  // Fresh object per call so callers never share a mutable result.
  const noMetadata = (): { title: null; frequencyMode: null } => ({
    title: null,
    frequencyMode: null,
  });

  const indexPath = path.join(dictionaryPath, 'index.json');

  let indexText: string;
  try {
    indexText = await fs.readFile(indexPath, 'utf-8');
  } catch (error) {
    // A dictionary without an index is normal; only unexpected read errors are logged.
    if (!isErrorCode(error, 'ENOENT')) {
      log(`Failed to read frequency dictionary index ${indexPath}: ${String(error)}`);
    }
    return noMetadata();
  }

  let parsedIndex: unknown;
  try {
    parsedIndex = JSON.parse(indexText);
  } catch {
    log(`Failed to parse frequency dictionary index as JSON: ${indexPath}`);
    return noMetadata();
  }

  if (typeof parsedIndex !== 'object' || parsedIndex === null) {
    return noMetadata();
  }

  const { title: titleField, frequencyMode: modeField } = parsedIndex as {
    title?: unknown;
    frequencyMode?: unknown;
  };

  const trimmedTitle = typeof titleField === 'string' ? titleField.trim() : '';

  let frequencyMode: FrequencyDictionaryMode | null = null;
  if (modeField === 'occurrence-based' || modeField === 'rank-based') {
    frequencyMode = modeField;
  }

  return {
    title: trimmedTitle.length > 0 ? trimmedTitle : null,
    frequencyMode,
  };
}
|
||||
|
||||
/**
 * Extracts the first numeric token from a string and returns it as a positive integer.
 *
 * The token may appear anywhere in the text and may carry a sign, a fractional part and an
 * exponent (e.g. `"1.9e2"`). The value is floored. A comma is not part of a number here, so
 * `"118,121"` yields `118`.
 *
 * @param value - Raw text to scan.
 * @returns The floored value, or `null` when the text is blank, contains no number, or the
 *   number is non-finite or floors to zero or below.
 */
function parsePositiveFrequencyString(value: string): number | null {
  const trimmed = value.trim();
  if (!trimmed) {
    return null;
  }

  // Optional sign, then digits with optional fraction (or a bare fraction), then optional exponent.
  const numericMatch = trimmed.match(/[+-]?(\d+(\.\d*)?|\.\d+)([eE][+-]?\d+)?/)?.[0];
  if (!numericMatch) {
    return null;
  }

  const parsed = Number.parseFloat(numericMatch);
  if (!Number.isFinite(parsed) || parsed <= 0) {
    return null;
  }

  // Values in (0, 1) floor to 0 and must be rejected as well.
  const normalized = Math.floor(parsed);
  return normalized > 0 ? normalized : null;
}
|
||||
|
||||
function parsePositiveFrequencyNumber(value: unknown): number | null {
|
||||
@@ -68,18 +107,32 @@ function parsePositiveFrequencyNumber(value: unknown): number | null {
|
||||
return null;
|
||||
}
|
||||
|
||||
/**
 * Parses a dictionary `displayValue` into a positive integer.
 *
 * For strings, only the run of digits at the very start (after trimming) is used, so
 * `"118,121"` yields `118`; a string that does not start with a digit yields `null`.
 * Non-string values are handed to `parsePositiveFrequencyNumber`.
 *
 * @param value - The raw `displayValue` field.
 * @returns The parsed positive integer, or `null` when nothing usable is found.
 */
function parseDisplayFrequencyNumber(value: unknown): number | null {
  if (typeof value !== 'string') {
    return parsePositiveFrequencyNumber(value);
  }

  const digitRun = /^\d+/.exec(value.trim());
  if (digitRun === null) {
    return null;
  }

  const rank = Number.parseInt(digitRun[0], 10);
  if (!Number.isFinite(rank) || rank <= 0) {
    return null;
  }
  return rank;
}
|
||||
|
||||
/**
 * Resolves the frequency number for a term-meta entry whose `frequency` field is an object.
 *
 * The `displayValue` field wins when `parseDisplayFrequencyNumber` can parse it; otherwise
 * the numeric `value` field (via `parsePositiveFrequencyNumber`) is used.
 *
 * @param meta - The metadata object of a term-meta bank entry.
 * @returns The resolved positive number, or `null` when `meta` or `meta.frequency` is not
 *   an object or neither field yields a value.
 */
function extractFrequencyDisplayValue(meta: unknown): number | null {
  if (!meta || typeof meta !== 'object') return null;

  const frequency = (meta as { frequency?: unknown }).frequency;
  if (!frequency || typeof frequency !== 'object') return null;

  const rawValue = (frequency as { value?: unknown }).value;
  const parsedRawValue = parsePositiveFrequencyNumber(rawValue);

  const displayValue = (frequency as { displayValue?: unknown }).displayValue;
  const parsedDisplayValue = parseDisplayFrequencyNumber(displayValue);

  // Prefer the display value; fall back to the raw value.
  return parsedDisplayValue ?? parsedRawValue;
}
|
||||
|
||||
function asFrequencyDictionaryEntry(entry: unknown): FrequencyDictionaryEntry | null {
|
||||
@@ -141,6 +194,15 @@ async function collectDictionaryFromPath(
|
||||
log: (message: string) => void,
|
||||
): Promise<Map<string, number>> {
|
||||
const terms = new Map<string, number>();
|
||||
const metadata = await readDictionaryMetadata(dictionaryPath, log);
|
||||
if (metadata.frequencyMode === 'occurrence-based') {
|
||||
log(
|
||||
`Skipping occurrence-based frequency dictionary ${
|
||||
metadata.title ?? dictionaryPath
|
||||
}; SubMiner frequency tags require rank-based values.`,
|
||||
);
|
||||
return terms;
|
||||
}
|
||||
|
||||
let fileNames: string[];
|
||||
try {
|
||||
|
||||
@@ -57,6 +57,8 @@ import {
|
||||
getWordOccurrences,
|
||||
getVideoDurationMs,
|
||||
markVideoWatched,
|
||||
deleteSession as deleteSessionQuery,
|
||||
deleteVideo as deleteVideoQuery,
|
||||
} from './immersion-tracker/query';
|
||||
import {
|
||||
buildVideoKey,
|
||||
@@ -125,6 +127,7 @@ import {
|
||||
type WordDetailRow,
|
||||
type WordOccurrenceRow,
|
||||
type VocabularyStatsRow,
|
||||
type CountedWordOccurrence,
|
||||
} from './immersion-tracker/types';
|
||||
import type { MergedToken } from '../../types';
|
||||
import { shouldExcludeTokenFromVocabularyPersistence } from './tokenizer/annotation-stage';
|
||||
@@ -402,6 +405,70 @@ export class ImmersionTrackerService {
|
||||
markVideoWatched(this.db, videoId, watched);
|
||||
}
|
||||
|
||||
/**
 * Deletes one tracked session by passing this service's database handle and the session id
 * to the imported `deleteSession` query helper (aliased as `deleteSessionQuery`).
 *
 * @param sessionId - Id of the session to delete.
 */
async deleteSession(sessionId: number): Promise<void> {
|
||||
deleteSessionQuery(this.db, sessionId);
|
||||
}
|
||||
|
||||
/**
 * Deletes one tracked video by passing this service's database handle and the video id
 * to the imported `deleteVideo` query helper (aliased as `deleteVideoQuery`).
 *
 * @param videoId - Id of the video to delete.
 */
async deleteVideo(videoId: number): Promise<void> {
|
||||
deleteVideoQuery(this.db, videoId);
|
||||
}
|
||||
|
||||
/**
 * Points an `imm_anime` row at a different AniList entry and refreshes per-video cover art.
 *
 * Titles and the episode total only replace the stored values when a non-null value is
 * supplied (`COALESCE`). `description` is written unconditionally, so omitting it stores NULL.
 *
 * When `info.coverUrl` is set and the anime has videos, the image is downloaded once on a
 * best-effort basis and an `imm_media_art` row is upserted for every video. If the download
 * fails, an existing stored blob is kept (`COALESCE(excluded.cover_blob, cover_blob)`).
 *
 * @param animeId - `anime_id` of the row to update.
 * @param info - AniList metadata to apply; `anilistId` is required.
 */
async reassignAnimeAnilist(animeId: number, info: {
  anilistId: number;
  titleRomaji?: string | null;
  titleEnglish?: string | null;
  titleNative?: string | null;
  episodesTotal?: number | null;
  description?: string | null;
  coverUrl?: string | null;
}): Promise<void> {
  const titleRomaji = info.titleRomaji ?? null;
  const titleEnglish = info.titleEnglish ?? null;
  const episodesTotal = info.episodesTotal ?? null;

  this.db.prepare(`
    UPDATE imm_anime
    SET anilist_id = ?,
        title_romaji = COALESCE(?, title_romaji),
        title_english = COALESCE(?, title_english),
        title_native = COALESCE(?, title_native),
        episodes_total = COALESCE(?, episodes_total),
        description = ?,
        LAST_UPDATE_DATE = ?
    WHERE anime_id = ?
  `).run(
    info.anilistId,
    titleRomaji,
    titleEnglish,
    info.titleNative ?? null,
    episodesTotal,
    info.description ?? null,
    Date.now(),
    animeId,
  );

  if (!info.coverUrl) return;

  const videos = this.db
    .prepare('SELECT video_id FROM imm_videos WHERE anime_id = ?')
    .all(animeId) as Array<{ video_id: number }>;
  // No videos means no art rows to write, so skip the download entirely.
  if (videos.length === 0) return;

  let coverBlob: Buffer | null = null;
  try {
    const res = await fetch(info.coverUrl);
    if (res.ok) coverBlob = Buffer.from(await res.arrayBuffer());
  } catch {
    // Best effort: a failed download still records the cover URL below, without a blob.
  }

  // Prepare once and share one timestamp so every column of every row agrees.
  const upsertMediaArt = this.db.prepare(`
    INSERT INTO imm_media_art (video_id, anilist_id, cover_url, cover_blob, title_romaji, title_english, episodes_total, fetched_at_ms, CREATED_DATE, LAST_UPDATE_DATE)
    VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
    ON CONFLICT(video_id) DO UPDATE SET
      anilist_id = excluded.anilist_id, cover_url = excluded.cover_url, cover_blob = COALESCE(excluded.cover_blob, cover_blob),
      title_romaji = excluded.title_romaji, title_english = excluded.title_english, episodes_total = excluded.episodes_total,
      fetched_at_ms = excluded.fetched_at_ms, LAST_UPDATE_DATE = excluded.LAST_UPDATE_DATE
  `);
  const fetchedAtMs = Date.now();
  for (const { video_id: videoId } of videos) {
    upsertMediaArt.run(
      videoId, info.anilistId, info.coverUrl, coverBlob,
      titleRomaji, titleEnglish, episodesTotal,
      fetchedAtMs, fetchedAtMs, fetchedAtMs,
    );
  }
}
|
||||
|
||||
async getEpisodeCardEvents(videoId: number): Promise<EpisodeCardEventRow[]> {
|
||||
return getEpisodeCardEvents(this.db, videoId);
|
||||
}
|
||||
@@ -571,19 +638,7 @@ export class ImmersionTrackerService {
|
||||
this.sessionState.tokensSeen += metrics.tokens;
|
||||
this.sessionState.pendingTelemetry = true;
|
||||
|
||||
const wordOccurrences = new Map<
|
||||
string,
|
||||
{
|
||||
headword: string;
|
||||
word: string;
|
||||
reading: string;
|
||||
partOfSpeech: string;
|
||||
pos1: string;
|
||||
pos2: string;
|
||||
pos3: string;
|
||||
occurrenceCount: number;
|
||||
}
|
||||
>();
|
||||
const wordOccurrences = new Map<string, CountedWordOccurrence>();
|
||||
for (const token of tokens ?? []) {
|
||||
if (shouldExcludeTokenFromVocabularyPersistence(token)) {
|
||||
continue;
|
||||
@@ -617,6 +672,7 @@ export class ImmersionTrackerService {
|
||||
pos2: token.pos2 ?? '',
|
||||
pos3: token.pos3 ?? '',
|
||||
occurrenceCount: 1,
|
||||
frequencyRank: token.frequencyRank ?? null,
|
||||
});
|
||||
}
|
||||
|
||||
|
||||
@@ -14,6 +14,7 @@ import {
|
||||
import { startSessionRecord } from '../session.js';
|
||||
import {
|
||||
cleanupVocabularyStats,
|
||||
deleteSession,
|
||||
getAnimeDetail,
|
||||
getAnimeEpisodes,
|
||||
getAnimeLibrary,
|
||||
@@ -295,35 +296,32 @@ test('cleanupVocabularyStats repairs stored POS metadata and removes excluded im
|
||||
{ headword: '旧', frequency: 1 },
|
||||
],
|
||||
);
|
||||
assert.deepEqual(
|
||||
repairedRows,
|
||||
[
|
||||
{
|
||||
headword: '旧',
|
||||
word: '旧',
|
||||
reading: 'きゅう',
|
||||
part_of_speech: 'noun',
|
||||
pos1: '名詞',
|
||||
pos2: '一般',
|
||||
},
|
||||
{
|
||||
headword: '猫',
|
||||
word: '猫',
|
||||
reading: 'ねこ',
|
||||
part_of_speech: 'noun',
|
||||
pos1: '名詞',
|
||||
pos2: '一般',
|
||||
},
|
||||
{
|
||||
headword: '知る',
|
||||
word: '知っている',
|
||||
reading: 'しっている',
|
||||
part_of_speech: 'verb',
|
||||
pos1: '動詞',
|
||||
pos2: '自立',
|
||||
},
|
||||
],
|
||||
);
|
||||
assert.deepEqual(repairedRows, [
|
||||
{
|
||||
headword: '旧',
|
||||
word: '旧',
|
||||
reading: 'きゅう',
|
||||
part_of_speech: 'noun',
|
||||
pos1: '名詞',
|
||||
pos2: '一般',
|
||||
},
|
||||
{
|
||||
headword: '猫',
|
||||
word: '猫',
|
||||
reading: 'ねこ',
|
||||
part_of_speech: 'noun',
|
||||
pos1: '名詞',
|
||||
pos2: '一般',
|
||||
},
|
||||
{
|
||||
headword: '知る',
|
||||
word: '知っている',
|
||||
reading: 'しっている',
|
||||
part_of_speech: 'verb',
|
||||
pos1: '動詞',
|
||||
pos2: '自立',
|
||||
},
|
||||
]);
|
||||
} finally {
|
||||
db.close();
|
||||
cleanupDbPath(dbPath);
|
||||
@@ -708,7 +706,7 @@ test('anime-level queries group by anime_id and preserve episode-level rows', ()
|
||||
canonicalTitle: 'Frieren',
|
||||
anilistId: 52_921,
|
||||
titleRomaji: 'Sousou no Frieren',
|
||||
titleEnglish: 'Frieren: Beyond Journey\'s End',
|
||||
titleEnglish: "Frieren: Beyond Journey's End",
|
||||
titleNative: '葬送のフリーレン',
|
||||
metadataJson: '{"source":"anilist"}',
|
||||
});
|
||||
@@ -1070,3 +1068,151 @@ test('getKanjiOccurrences maps a kanji back to anime, video, and subtitle line c
|
||||
cleanupDbPath(dbPath);
|
||||
}
|
||||
});
|
||||
|
||||
test('deleteSession removes the session and all associated session-scoped rows', () => {
|
||||
const dbPath = makeDbPath();
|
||||
const db = new Database(dbPath);
|
||||
|
||||
try {
|
||||
ensureSchema(db);
|
||||
const stmts = createTrackerPreparedStatements(db);
|
||||
|
||||
const videoId = getOrCreateVideoRecord(db, 'local:/tmp/delete-session.mkv', {
|
||||
canonicalTitle: 'Delete Session Test',
|
||||
sourcePath: '/tmp/delete-session.mkv',
|
||||
sourceUrl: null,
|
||||
sourceType: SOURCE_TYPE_LOCAL,
|
||||
});
|
||||
|
||||
const startedAtMs = 6_000_000;
|
||||
const { sessionId } = startSessionRecord(db, videoId, startedAtMs);
|
||||
|
||||
stmts.telemetryInsertStmt.run(
|
||||
sessionId,
|
||||
startedAtMs + 1_000,
|
||||
5_000,
|
||||
4_000,
|
||||
3,
|
||||
9,
|
||||
9,
|
||||
1,
|
||||
2,
|
||||
1,
|
||||
0,
|
||||
0,
|
||||
0,
|
||||
0,
|
||||
0,
|
||||
startedAtMs + 1_000,
|
||||
startedAtMs + 1_000,
|
||||
);
|
||||
const eventResult = stmts.eventInsertStmt.run(
|
||||
sessionId,
|
||||
startedAtMs + 1_500,
|
||||
EVENT_SUBTITLE_LINE,
|
||||
0,
|
||||
0,
|
||||
900,
|
||||
2,
|
||||
0,
|
||||
'{"line":"delete me"}',
|
||||
startedAtMs + 1_500,
|
||||
startedAtMs + 1_500,
|
||||
);
|
||||
const eventId = Number(eventResult.lastInsertRowid);
|
||||
const wordResult = db
|
||||
.prepare(
|
||||
`INSERT INTO imm_words (
|
||||
headword, word, reading, part_of_speech, pos1, pos2, pos3, first_seen, last_seen, frequency
|
||||
) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)`,
|
||||
)
|
||||
.run('削除', '削除', 'さくじょ', 'noun', '名詞', '一般', '', startedAtMs, startedAtMs, 1);
|
||||
const kanjiResult = db
|
||||
.prepare(
|
||||
`INSERT INTO imm_kanji (
|
||||
kanji, first_seen, last_seen, frequency
|
||||
) VALUES (?, ?, ?, ?)`,
|
||||
)
|
||||
.run('削', startedAtMs, startedAtMs, 1);
|
||||
const lineResult = stmts.subtitleLineInsertStmt.run(
|
||||
sessionId,
|
||||
eventId,
|
||||
videoId,
|
||||
null,
|
||||
0,
|
||||
0,
|
||||
900,
|
||||
'delete me',
|
||||
startedAtMs + 1_500,
|
||||
startedAtMs + 1_500,
|
||||
);
|
||||
const lineId = Number(lineResult.lastInsertRowid);
|
||||
db.prepare(
|
||||
`INSERT INTO imm_word_line_occurrences (line_id, word_id, occurrence_count)
|
||||
VALUES (?, ?, ?)`,
|
||||
).run(lineId, Number(wordResult.lastInsertRowid), 1);
|
||||
db.prepare(
|
||||
`INSERT INTO imm_kanji_line_occurrences (line_id, kanji_id, occurrence_count)
|
||||
VALUES (?, ?, ?)`,
|
||||
).run(lineId, Number(kanjiResult.lastInsertRowid), 1);
|
||||
|
||||
deleteSession(db, sessionId);
|
||||
|
||||
const sessionCount = Number(
|
||||
(
|
||||
db
|
||||
.prepare('SELECT COUNT(*) AS total FROM imm_sessions WHERE session_id = ?')
|
||||
.get(sessionId) as {
|
||||
total: number;
|
||||
}
|
||||
).total,
|
||||
);
|
||||
const telemetryCount = Number(
|
||||
(
|
||||
db
|
||||
.prepare('SELECT COUNT(*) AS total FROM imm_session_telemetry WHERE session_id = ?')
|
||||
.get(sessionId) as { total: number }
|
||||
).total,
|
||||
);
|
||||
const eventCount = Number(
|
||||
(
|
||||
db
|
||||
.prepare('SELECT COUNT(*) AS total FROM imm_session_events WHERE session_id = ?')
|
||||
.get(sessionId) as {
|
||||
total: number;
|
||||
}
|
||||
).total,
|
||||
);
|
||||
const subtitleLineCount = Number(
|
||||
(
|
||||
db
|
||||
.prepare('SELECT COUNT(*) AS total FROM imm_subtitle_lines WHERE session_id = ?')
|
||||
.get(sessionId) as { total: number }
|
||||
).total,
|
||||
);
|
||||
const wordOccurrenceCount = Number(
|
||||
(
|
||||
db
|
||||
.prepare('SELECT COUNT(*) AS total FROM imm_word_line_occurrences WHERE line_id = ?')
|
||||
.get(lineId) as { total: number }
|
||||
).total,
|
||||
);
|
||||
const kanjiOccurrenceCount = Number(
|
||||
(
|
||||
db
|
||||
.prepare('SELECT COUNT(*) AS total FROM imm_kanji_line_occurrences WHERE line_id = ?')
|
||||
.get(lineId) as { total: number }
|
||||
).total,
|
||||
);
|
||||
|
||||
assert.equal(sessionCount, 0);
|
||||
assert.equal(telemetryCount, 0);
|
||||
assert.equal(eventCount, 0);
|
||||
assert.equal(subtitleLineCount, 0);
|
||||
assert.equal(wordOccurrenceCount, 0);
|
||||
assert.equal(kanjiOccurrenceCount, 0);
|
||||
} finally {
|
||||
db.close();
|
||||
cleanupDbPath(dbPath);
|
||||
}
|
||||
});
|
||||
|
||||
@@ -223,7 +223,8 @@ export function getVocabularyStats(
|
||||
const stmt = db.prepare(`
|
||||
SELECT id AS wordId, headword, word, reading,
|
||||
part_of_speech AS partOfSpeech, pos1, pos2, pos3,
|
||||
frequency, first_seen AS firstSeen, last_seen AS lastSeen
|
||||
frequency, frequency_rank AS frequencyRank,
|
||||
first_seen AS firstSeen, last_seen AS lastSeen
|
||||
FROM imm_words ${whereClause} ORDER BY frequency DESC LIMIT ?
|
||||
`);
|
||||
const params = hasExclude ? [...excludePos, limit] : [limit];
|
||||
@@ -632,6 +633,7 @@ export function getAnimeDetail(db: DatabaseSync, animeId: number): AnimeDetailRo
|
||||
a.title_romaji AS titleRomaji,
|
||||
a.title_english AS titleEnglish,
|
||||
a.title_native AS titleNative,
|
||||
a.description AS description,
|
||||
COUNT(DISTINCT s.session_id) AS totalSessions,
|
||||
COALESCE(SUM(sm.max_active_ms), 0) AS totalActiveMs,
|
||||
COALESCE(SUM(sm.max_cards), 0) AS totalCards,
|
||||
@@ -1165,3 +1167,22 @@ export function isVideoWatched(db: DatabaseSync, videoId: number): boolean {
|
||||
} | null;
|
||||
return row?.watched === 1;
|
||||
}
|
||||
|
||||
/**
 * Deletes a session together with its rows in `imm_subtitle_lines`,
 * `imm_session_telemetry` and `imm_session_events`, as one atomic unit.
 *
 * The statements run inside a SAVEPOINT, so a failure part-way through rolls back every
 * delete instead of leaving a half-removed session. A SAVEPOINT is used rather than
 * `BEGIN` because it also nests inside a transaction the caller may already hold.
 *
 * @param db - Open database handle.
 * @param sessionId - `session_id` of the session to remove.
 * @throws Rethrows any error raised by the underlying statements after rolling back.
 */
export function deleteSession(db: DatabaseSync, sessionId: number): void {
  db.exec('SAVEPOINT delete_session');
  try {
    // Dependent rows first, the session row itself last.
    db.prepare('DELETE FROM imm_subtitle_lines WHERE session_id = ?').run(sessionId);
    db.prepare('DELETE FROM imm_session_telemetry WHERE session_id = ?').run(sessionId);
    db.prepare('DELETE FROM imm_session_events WHERE session_id = ?').run(sessionId);
    db.prepare('DELETE FROM imm_sessions WHERE session_id = ?').run(sessionId);
    db.exec('RELEASE delete_session');
  } catch (error) {
    // ROLLBACK TO undoes the work but keeps the savepoint open; RELEASE then closes it.
    db.exec('ROLLBACK TO delete_session');
    db.exec('RELEASE delete_session');
    throw error;
  }
}
|
||||
|
||||
/**
 * Deletes a video and everything recorded against it, as one atomic unit.
 *
 * Every session of the video is removed through `deleteSession`. Then the remaining
 * subtitle lines, daily and monthly rollups, media art and finally the `imm_videos` row
 * keyed by `video_id` are deleted. The work runs inside a SAVEPOINT, so a failure leaves
 * the database unchanged, and the function can be called inside an existing transaction.
 *
 * @param db - Open database handle.
 * @param videoId - `video_id` of the video to remove.
 * @throws Rethrows any error raised by the underlying statements after rolling back.
 */
export function deleteVideo(db: DatabaseSync, videoId: number): void {
  db.exec('SAVEPOINT delete_video');
  try {
    const sessions = db
      .prepare('SELECT session_id FROM imm_sessions WHERE video_id = ?')
      .all(videoId) as Array<{ session_id: number }>;
    for (const { session_id: sessionId } of sessions) {
      deleteSession(db, sessionId);
    }

    db.prepare('DELETE FROM imm_subtitle_lines WHERE video_id = ?').run(videoId);
    db.prepare('DELETE FROM imm_daily_rollups WHERE video_id = ?').run(videoId);
    db.prepare('DELETE FROM imm_monthly_rollups WHERE video_id = ?').run(videoId);
    db.prepare('DELETE FROM imm_media_art WHERE video_id = ?').run(videoId);
    db.prepare('DELETE FROM imm_videos WHERE video_id = ?').run(videoId);
    db.exec('RELEASE delete_video');
  } catch (error) {
    // ROLLBACK TO undoes the work but keeps the savepoint open; RELEASE then closes it.
    db.exec('ROLLBACK TO delete_video');
    db.exec('RELEASE delete_video');
    throw error;
  }
}
|
||||
|
||||
@@ -345,6 +345,7 @@ export function ensureSchema(db: DatabaseSync): void {
|
||||
title_english TEXT,
|
||||
title_native TEXT,
|
||||
episodes_total INTEGER,
|
||||
description TEXT,
|
||||
metadata_json TEXT,
|
||||
CREATED_DATE INTEGER,
|
||||
LAST_UPDATE_DATE INTEGER
|
||||
@@ -479,6 +480,7 @@ export function ensureSchema(db: DatabaseSync): void {
|
||||
first_seen REAL,
|
||||
last_seen REAL,
|
||||
frequency INTEGER,
|
||||
frequency_rank INTEGER,
|
||||
UNIQUE(headword, word, reading)
|
||||
);
|
||||
`);
|
||||
@@ -672,6 +674,11 @@ export function ensureSchema(db: DatabaseSync): void {
|
||||
`);
|
||||
}
|
||||
|
||||
if (currentVersion?.schema_version && currentVersion.schema_version < 9) {
|
||||
addColumnIfMissing(db, 'imm_anime', 'description', 'TEXT');
|
||||
addColumnIfMissing(db, 'imm_words', 'frequency_rank', 'INTEGER');
|
||||
}
|
||||
|
||||
db.exec(`
|
||||
CREATE INDEX IF NOT EXISTS idx_anime_normalized_title
|
||||
ON imm_anime(normalized_title_key)
|
||||
@@ -776,9 +783,9 @@ export function createTrackerPreparedStatements(db: DatabaseSync): TrackerPrepar
|
||||
`),
|
||||
wordUpsertStmt: db.prepare(`
|
||||
INSERT INTO imm_words (
|
||||
headword, word, reading, part_of_speech, pos1, pos2, pos3, first_seen, last_seen, frequency
|
||||
headword, word, reading, part_of_speech, pos1, pos2, pos3, first_seen, last_seen, frequency, frequency_rank
|
||||
) VALUES (
|
||||
?, ?, ?, ?, ?, ?, ?, ?, ?, 1
|
||||
?, ?, ?, ?, ?, ?, ?, ?, ?, 1, ?
|
||||
)
|
||||
ON CONFLICT(headword, word, reading) DO UPDATE SET
|
||||
frequency = COALESCE(frequency, 0) + 1,
|
||||
@@ -792,7 +799,12 @@ export function createTrackerPreparedStatements(db: DatabaseSync): TrackerPrepar
|
||||
pos2 = COALESCE(NULLIF(imm_words.pos2, ''), excluded.pos2),
|
||||
pos3 = COALESCE(NULLIF(imm_words.pos3, ''), excluded.pos3),
|
||||
first_seen = MIN(COALESCE(first_seen, excluded.first_seen), excluded.first_seen),
|
||||
last_seen = MAX(COALESCE(last_seen, excluded.last_seen), excluded.last_seen)
|
||||
last_seen = MAX(COALESCE(last_seen, excluded.last_seen), excluded.last_seen),
|
||||
frequency_rank = CASE
|
||||
WHEN excluded.frequency_rank IS NOT NULL AND (imm_words.frequency_rank IS NULL OR excluded.frequency_rank < imm_words.frequency_rank)
|
||||
THEN excluded.frequency_rank
|
||||
ELSE imm_words.frequency_rank
|
||||
END
|
||||
`),
|
||||
kanjiUpsertStmt: db.prepare(`
|
||||
INSERT INTO imm_kanji (
|
||||
@@ -863,6 +875,7 @@ function incrementWordAggregate(
|
||||
occurrence.pos3,
|
||||
firstSeen,
|
||||
lastSeen,
|
||||
occurrence.frequencyRank ?? null,
|
||||
);
|
||||
}
|
||||
const row = stmts.wordIdSelectStmt.get(
|
||||
@@ -926,6 +939,7 @@ export function executeQueuedWrite(write: QueuedWrite, stmts: TrackerPreparedSta
|
||||
write.pos3,
|
||||
write.firstSeen,
|
||||
write.lastSeen,
|
||||
write.frequencyRank ?? null,
|
||||
);
|
||||
return;
|
||||
}
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
export const SCHEMA_VERSION = 7;
|
||||
export const SCHEMA_VERSION = 9;
|
||||
export const DEFAULT_QUEUE_CAP = 1_000;
|
||||
export const DEFAULT_BATCH_SIZE = 25;
|
||||
export const DEFAULT_FLUSH_INTERVAL_MS = 500;
|
||||
@@ -128,6 +128,7 @@ interface QueuedWordWrite {
|
||||
pos3: string;
|
||||
firstSeen: number;
|
||||
lastSeen: number;
|
||||
frequencyRank: number | null;
|
||||
}
|
||||
|
||||
interface QueuedKanjiWrite {
|
||||
@@ -146,6 +147,7 @@ export interface CountedWordOccurrence {
|
||||
pos2: string;
|
||||
pos3: string;
|
||||
occurrenceCount: number;
|
||||
frequencyRank: number | null;
|
||||
}
|
||||
|
||||
export interface CountedKanjiOccurrence {
|
||||
@@ -240,6 +242,7 @@ export interface VocabularyStatsRow {
|
||||
pos2: string | null;
|
||||
pos3: string | null;
|
||||
frequency: number;
|
||||
frequencyRank: number | null;
|
||||
firstSeen: number;
|
||||
lastSeen: number;
|
||||
}
|
||||
@@ -395,6 +398,7 @@ export interface AnimeDetailRow {
|
||||
titleRomaji: string | null;
|
||||
titleEnglish: string | null;
|
||||
titleNative: string | null;
|
||||
description: string | null;
|
||||
totalSessions: number;
|
||||
totalActiveMs: number;
|
||||
totalCards: number;
|
||||
|
||||
@@ -18,6 +18,7 @@ export interface StatsServerConfig {
|
||||
port: number;
|
||||
staticDir: string; // Path to stats/dist/
|
||||
tracker: ImmersionTrackerService;
|
||||
knownWordCachePath?: string;
|
||||
}
|
||||
|
||||
const STATS_STATIC_CONTENT_TYPES: Record<string, string> = {
|
||||
@@ -79,7 +80,7 @@ function createStatsStaticResponse(staticDir: string, requestPath: string): Resp
|
||||
|
||||
export function createStatsApp(
|
||||
tracker: ImmersionTrackerService,
|
||||
options?: { staticDir?: string },
|
||||
options?: { staticDir?: string; knownWordCachePath?: string },
|
||||
) {
|
||||
const app = new Hono();
|
||||
|
||||
@@ -259,6 +260,70 @@ export function createStatsApp(
|
||||
return c.json({ ok: true });
|
||||
});
|
||||
|
||||
app.delete('/api/stats/sessions/:sessionId', async (c) => {
|
||||
const sessionId = parseIntQuery(c.req.param('sessionId'), 0);
|
||||
if (sessionId <= 0) return c.body(null, 400);
|
||||
await tracker.deleteSession(sessionId);
|
||||
return c.json({ ok: true });
|
||||
});
|
||||
|
||||
app.delete('/api/stats/media/:videoId', async (c) => {
|
||||
const videoId = parseIntQuery(c.req.param('videoId'), 0);
|
||||
if (videoId <= 0) return c.body(null, 400);
|
||||
await tracker.deleteVideo(videoId);
|
||||
return c.json({ ok: true });
|
||||
});
|
||||
|
||||
app.get('/api/stats/anilist/search', async (c) => {
|
||||
const query = (c.req.query('q') ?? '').trim();
|
||||
if (!query) return c.json([]);
|
||||
try {
|
||||
const res = await fetch('https://graphql.anilist.co', {
|
||||
method: 'POST',
|
||||
headers: { 'Content-Type': 'application/json' },
|
||||
body: JSON.stringify({
|
||||
query: `query ($search: String!) {
|
||||
Page(perPage: 10) {
|
||||
media(search: $search, type: ANIME) {
|
||||
id
|
||||
episodes
|
||||
season
|
||||
seasonYear
|
||||
description(asHtml: false)
|
||||
coverImage { large medium }
|
||||
title { romaji english native }
|
||||
}
|
||||
}
|
||||
}`,
|
||||
variables: { search: query },
|
||||
}),
|
||||
});
|
||||
const json = await res.json() as { data?: { Page?: { media?: unknown[] } } };
|
||||
return c.json(json.data?.Page?.media ?? []);
|
||||
} catch {
|
||||
return c.json([]);
|
||||
}
|
||||
});
|
||||
|
||||
app.get('/api/stats/known-words', (c) => {
|
||||
const cachePath = options?.knownWordCachePath;
|
||||
if (!cachePath || !existsSync(cachePath)) return c.json([]);
|
||||
try {
|
||||
const raw = JSON.parse(readFileSync(cachePath, 'utf-8')) as { version?: number; words?: string[] };
|
||||
if (raw.version === 1 && Array.isArray(raw.words)) return c.json(raw.words);
|
||||
} catch { /* ignore */ }
|
||||
return c.json([]);
|
||||
});
|
||||
|
||||
// PATCH /api/stats/anime/:animeId/anilist — re-link an anime to another AniList entry.
// The JSON body is untrusted, so it is validated and reduced to the known fields before it
// reaches the tracker. Responds 400 for a bad anime id, a non-object body, or an `anilistId`
// that is not a positive integer (a numeric string is accepted).
app.patch('/api/stats/anime/:animeId/anilist', async (c) => {
  const animeId = parseIntQuery(c.req.param('animeId'), 0);
  if (animeId <= 0) return c.body(null, 400);

  const body: unknown = await c.req.json().catch(() => null);
  if (!body || typeof body !== 'object') return c.body(null, 400);
  const fields = body as Record<string, unknown>;

  const rawAnilistId = fields.anilistId;
  const anilistId =
    typeof rawAnilistId === 'string' && rawAnilistId.trim() !== ''
      ? Number(rawAnilistId)
      : rawAnilistId;
  if (typeof anilistId !== 'number' || !Number.isInteger(anilistId) || anilistId <= 0) {
    return c.body(null, 400);
  }

  const stringOrNull = (value: unknown): string | null =>
    typeof value === 'string' ? value : null;

  // The cover URL is fetched server-side, so only plain http(s) URLs are passed through.
  let coverUrl: string | null = null;
  const rawCoverUrl = stringOrNull(fields.coverUrl);
  if (rawCoverUrl) {
    try {
      const protocol = new URL(rawCoverUrl).protocol;
      if (protocol === 'https:' || protocol === 'http:') coverUrl = rawCoverUrl;
    } catch {
      // Not a parseable URL: treat as absent.
    }
  }

  const rawEpisodesTotal = fields.episodesTotal;
  await tracker.reassignAnimeAnilist(animeId, {
    anilistId,
    titleRomaji: stringOrNull(fields.titleRomaji),
    titleEnglish: stringOrNull(fields.titleEnglish),
    titleNative: stringOrNull(fields.titleNative),
    episodesTotal:
      typeof rawEpisodesTotal === 'number' && Number.isInteger(rawEpisodesTotal)
        ? rawEpisodesTotal
        : null,
    description: stringOrNull(fields.description),
    coverUrl,
  });
  return c.json({ ok: true });
});
|
||||
|
||||
app.get('/api/stats/anime/:animeId/cover', async (c) => {
|
||||
const animeId = parseIntQuery(c.req.param('animeId'), 0);
|
||||
if (animeId <= 0) return c.body(null, 404);
|
||||
@@ -363,7 +428,7 @@ export function createStatsApp(
|
||||
}
|
||||
|
||||
export function startStatsServer(config: StatsServerConfig): { close: () => void } {
|
||||
const app = createStatsApp(config.tracker, { staticDir: config.staticDir });
|
||||
const app = createStatsApp(config.tracker, { staticDir: config.staticDir, knownWordCachePath: config.knownWordCachePath });
|
||||
|
||||
const server = serve({
|
||||
fetch: app.fetch,
|
||||
|
||||
@@ -55,10 +55,13 @@ export function buildStatsWindowOptions(options: {
|
||||
};
|
||||
}
|
||||
|
||||
/**
 * Builds the options object passed to the stats window's `loadFile` call.
 *
 * The query always carries `overlay: '1'`. When a non-empty `apiBaseUrl` is supplied it is
 * added as `apiBase`.
 *
 * @param apiBaseUrl - Optional stats API base URL to forward as a query parameter.
 * @returns An object whose `query` map holds the URL query parameters.
 */
export function buildStatsWindowLoadFileOptions(apiBaseUrl?: string): {
  query: Record<string, string>;
} {
  const query: Record<string, string> = { overlay: '1' };
  if (apiBaseUrl) {
    query.apiBase = apiBaseUrl;
  }
  return { query };
}
|
||||
|
||||
@@ -140,3 +140,12 @@ test('buildStatsWindowLoadFileOptions enables overlay rendering mode', () => {
|
||||
},
|
||||
});
|
||||
});
|
||||
|
||||
test('buildStatsWindowLoadFileOptions includes provided stats API base URL', () => {
|
||||
assert.deepEqual(buildStatsWindowLoadFileOptions('http://127.0.0.1:6123'), {
|
||||
query: {
|
||||
overlay: '1',
|
||||
apiBase: 'http://127.0.0.1:6123',
|
||||
},
|
||||
});
|
||||
});
|
||||
|
||||
@@ -16,6 +16,8 @@ export interface StatsWindowOptions {
|
||||
staticDir: string;
|
||||
/** Absolute path to the compiled preload-stats.js */
|
||||
preloadPath: string;
|
||||
/** Resolve the active stats API base URL */
|
||||
getApiBaseUrl?: () => string;
|
||||
/** Resolve the active stats toggle key from config */
|
||||
getToggleKey: () => string;
|
||||
/** Resolve the tracked overlay/mpv bounds */
|
||||
@@ -46,7 +48,7 @@ export function toggleStatsOverlay(options: StatsWindowOptions): void {
|
||||
);
|
||||
|
||||
const indexPath = path.join(options.staticDir, 'index.html');
|
||||
statsWindow.loadFile(indexPath, buildStatsWindowLoadFileOptions());
|
||||
statsWindow.loadFile(indexPath, buildStatsWindowLoadFileOptions(options.getApiBaseUrl?.()));
|
||||
|
||||
statsWindow.on('closed', () => {
|
||||
statsWindow = null;
|
||||
|
||||
@@ -706,6 +706,240 @@ test('tokenizeSubtitle prefers Yomitan frequency from highest-priority dictionar
|
||||
assert.equal(result.tokens?.[0]?.frequencyRank, 100);
|
||||
});
|
||||
|
||||
test('tokenizeSubtitle ignores occurrence-based Yomitan frequencies for inflected terms', async () => {
|
||||
const result = await tokenizeSubtitle(
|
||||
'潜み',
|
||||
makeDeps({
|
||||
getFrequencyDictionaryEnabled: () => true,
|
||||
getYomitanExt: () => ({ id: 'dummy-ext' }) as any,
|
||||
getYomitanParserWindow: () =>
|
||||
({
|
||||
isDestroyed: () => false,
|
||||
webContents: {
|
||||
executeJavaScript: async (script: string) => {
|
||||
if (script.includes('getTermFrequencies')) {
|
||||
return [
|
||||
{
|
||||
term: '潜む',
|
||||
reading: 'ひそ',
|
||||
dictionary: 'CC100',
|
||||
frequency: 118121,
|
||||
displayValue: null,
|
||||
displayValueParsed: false,
|
||||
},
|
||||
];
|
||||
}
|
||||
|
||||
if (script.includes('optionsGetFull')) {
|
||||
return {
|
||||
profileCurrent: 0,
|
||||
profileIndex: 0,
|
||||
scanLength: 40,
|
||||
dictionaries: ['CC100'],
|
||||
dictionaryPriorityByName: { CC100: 0 },
|
||||
dictionaryFrequencyModeByName: { CC100: 'occurrence-based' },
|
||||
profiles: [
|
||||
{
|
||||
options: {
|
||||
scanning: { length: 40 },
|
||||
dictionaries: [{ name: 'CC100', enabled: true, id: 0 }],
|
||||
},
|
||||
},
|
||||
],
|
||||
};
|
||||
}
|
||||
|
||||
return [
|
||||
{
|
||||
surface: '潜み',
|
||||
reading: 'ひそ',
|
||||
headword: '潜む',
|
||||
startPos: 0,
|
||||
endPos: 2,
|
||||
},
|
||||
];
|
||||
},
|
||||
},
|
||||
}) as unknown as Electron.BrowserWindow,
|
||||
}),
|
||||
);
|
||||
|
||||
assert.equal(result.tokens?.length, 1);
|
||||
assert.equal(result.tokens?.[0]?.frequencyRank, undefined);
|
||||
});
|
||||
|
||||
test('tokenizeSubtitle falls back to raw term-only Yomitan rank when no scan-derived rank exists', async () => {
|
||||
const result = await tokenizeSubtitle(
|
||||
'潜み',
|
||||
makeDeps({
|
||||
getFrequencyDictionaryEnabled: () => true,
|
||||
getYomitanExt: () => ({ id: 'dummy-ext' }) as any,
|
||||
getYomitanParserWindow: () =>
|
||||
({
|
||||
isDestroyed: () => false,
|
||||
webContents: {
|
||||
executeJavaScript: async (script: string) => {
|
||||
if (script.includes('getTermFrequencies')) {
|
||||
return [
|
||||
{
|
||||
term: '潜む',
|
||||
reading: 'ひそ',
|
||||
hasReading: false,
|
||||
dictionary: 'CC100',
|
||||
frequency: 118121,
|
||||
displayValue: null,
|
||||
displayValueParsed: false,
|
||||
},
|
||||
];
|
||||
}
|
||||
|
||||
if (script.includes('optionsGetFull')) {
|
||||
return {
|
||||
profileCurrent: 0,
|
||||
profileIndex: 0,
|
||||
scanLength: 40,
|
||||
dictionaries: ['CC100'],
|
||||
dictionaryPriorityByName: { CC100: 0 },
|
||||
dictionaryFrequencyModeByName: { CC100: 'rank-based' },
|
||||
profiles: [
|
||||
{
|
||||
options: {
|
||||
scanning: { length: 40 },
|
||||
dictionaries: [{ name: 'CC100', enabled: true, id: 0 }],
|
||||
},
|
||||
},
|
||||
],
|
||||
};
|
||||
}
|
||||
|
||||
return [
|
||||
{
|
||||
surface: '潜み',
|
||||
reading: 'ひそ',
|
||||
headword: '潜む',
|
||||
startPos: 0,
|
||||
endPos: 2,
|
||||
},
|
||||
];
|
||||
},
|
||||
},
|
||||
}) as unknown as Electron.BrowserWindow,
|
||||
}),
|
||||
);
|
||||
|
||||
assert.equal(result.tokens?.length, 1);
|
||||
assert.equal(result.tokens?.[0]?.frequencyRank, 118121);
|
||||
});
|
||||
|
||||
test('tokenizeSubtitle keeps parsed display rank for term-only inflected headword fallback', async () => {
|
||||
const result = await tokenizeSubtitle(
|
||||
'潜み',
|
||||
makeDeps({
|
||||
getFrequencyDictionaryEnabled: () => true,
|
||||
getYomitanExt: () => ({ id: 'dummy-ext' }) as any,
|
||||
getYomitanParserWindow: () =>
|
||||
({
|
||||
isDestroyed: () => false,
|
||||
webContents: {
|
||||
executeJavaScript: async (script: string) => {
|
||||
if (script.includes('getTermFrequencies')) {
|
||||
return [
|
||||
{
|
||||
term: '潜む',
|
||||
reading: 'ひそ',
|
||||
hasReading: false,
|
||||
dictionary: 'CC100',
|
||||
frequency: 118121,
|
||||
displayValue: '118,121',
|
||||
displayValueParsed: false,
|
||||
},
|
||||
];
|
||||
}
|
||||
|
||||
if (script.includes('optionsGetFull')) {
|
||||
return {
|
||||
profileCurrent: 0,
|
||||
profileIndex: 0,
|
||||
scanLength: 40,
|
||||
dictionaries: ['CC100'],
|
||||
dictionaryPriorityByName: { CC100: 0 },
|
||||
dictionaryFrequencyModeByName: { CC100: 'rank-based' },
|
||||
profiles: [
|
||||
{
|
||||
options: {
|
||||
scanning: { length: 40 },
|
||||
dictionaries: [{ name: 'CC100', enabled: true, id: 0 }],
|
||||
},
|
||||
},
|
||||
],
|
||||
};
|
||||
}
|
||||
|
||||
return [
|
||||
{
|
||||
surface: '潜み',
|
||||
reading: 'ひそ',
|
||||
headword: '潜む',
|
||||
startPos: 0,
|
||||
endPos: 2,
|
||||
},
|
||||
];
|
||||
},
|
||||
},
|
||||
}) as unknown as Electron.BrowserWindow,
|
||||
}),
|
||||
);
|
||||
|
||||
assert.equal(result.tokens?.length, 1);
|
||||
assert.equal(result.tokens?.[0]?.frequencyRank, 118);
|
||||
});
|
||||
|
||||
test('tokenizeSubtitle preserves scan-derived rank over lower-priority Yomitan fallback', async () => {
|
||||
const result = await tokenizeSubtitle(
|
||||
'潜み',
|
||||
makeDeps({
|
||||
getFrequencyDictionaryEnabled: () => true,
|
||||
getYomitanExt: () => ({ id: 'dummy-ext' }) as any,
|
||||
getYomitanParserWindow: () =>
|
||||
({
|
||||
isDestroyed: () => false,
|
||||
webContents: {
|
||||
executeJavaScript: async (script: string) => {
|
||||
if (script.includes('getTermFrequencies')) {
|
||||
return [
|
||||
{
|
||||
term: '潜む',
|
||||
reading: 'ひそ',
|
||||
hasReading: false,
|
||||
dictionary: 'CC100',
|
||||
dictionaryPriority: 2,
|
||||
frequency: 118121,
|
||||
displayValue: null,
|
||||
displayValueParsed: false,
|
||||
},
|
||||
];
|
||||
}
|
||||
|
||||
return [
|
||||
{
|
||||
surface: '潜み',
|
||||
reading: 'ひそむ',
|
||||
headword: '潜む',
|
||||
startPos: 0,
|
||||
endPos: 2,
|
||||
frequencyRank: 4073,
|
||||
},
|
||||
];
|
||||
},
|
||||
},
|
||||
}) as unknown as Electron.BrowserWindow,
|
||||
}),
|
||||
);
|
||||
|
||||
assert.equal(result.tokens?.length, 1);
|
||||
assert.equal(result.tokens?.[0]?.frequencyRank, 4073);
|
||||
});
|
||||
|
||||
test('tokenizeSubtitle uses only selected Yomitan headword for frequency lookup', async () => {
|
||||
const result = await tokenizeSubtitle(
|
||||
'猫です',
|
||||
@@ -836,6 +1070,69 @@ test('tokenizeSubtitle prefers exact headword frequency over surface/reading whe
|
||||
assert.equal(result.tokens?.[0]?.frequencyRank, 8);
|
||||
});
|
||||
|
||||
test('tokenizeSubtitle falls back to exact surface frequency when merged headword lookup misses', async () => {
|
||||
const frequencyScripts: string[] = [];
|
||||
const result = await tokenizeSubtitle(
|
||||
'陰に',
|
||||
makeDeps({
|
||||
getFrequencyDictionaryEnabled: () => true,
|
||||
getYomitanExt: () => ({ id: 'dummy-ext' }) as any,
|
||||
getYomitanParserWindow: () =>
|
||||
({
|
||||
isDestroyed: () => false,
|
||||
webContents: {
|
||||
executeJavaScript: async (script: string) => {
|
||||
if (script.includes('getTermFrequencies')) {
|
||||
frequencyScripts.push(script);
|
||||
return script.includes('"term":"陰に","reading":"いんに"')
|
||||
? [
|
||||
{
|
||||
term: '陰に',
|
||||
reading: 'いんに',
|
||||
dictionary: 'freq-dict',
|
||||
frequency: 5702,
|
||||
displayValue: '5702',
|
||||
displayValueParsed: true,
|
||||
},
|
||||
]
|
||||
: [];
|
||||
}
|
||||
|
||||
return [
|
||||
{
|
||||
source: 'scanning-parser',
|
||||
index: 0,
|
||||
content: [
|
||||
[
|
||||
{
|
||||
text: '陰に',
|
||||
reading: 'いんに',
|
||||
headwords: [[{ term: '陰' }]],
|
||||
},
|
||||
],
|
||||
],
|
||||
},
|
||||
];
|
||||
},
|
||||
},
|
||||
}) as unknown as Electron.BrowserWindow,
|
||||
}),
|
||||
);
|
||||
|
||||
assert.equal(result.tokens?.length, 1);
|
||||
assert.equal(result.tokens?.[0]?.surface, '陰に');
|
||||
assert.equal(result.tokens?.[0]?.headword, '陰');
|
||||
assert.equal(result.tokens?.[0]?.frequencyRank, 5702);
|
||||
assert.equal(
|
||||
frequencyScripts.some((script) => script.includes('"term":"陰","reading":"いんに"')),
|
||||
true,
|
||||
);
|
||||
assert.equal(
|
||||
frequencyScripts.some((script) => script.includes('"term":"陰に","reading":"いんに"')),
|
||||
true,
|
||||
);
|
||||
});
|
||||
|
||||
test('tokenizeSubtitle keeps no frequency when only reading matches and headword misses', async () => {
|
||||
const result = await tokenizeSubtitle(
|
||||
'猫です',
|
||||
@@ -2287,6 +2584,131 @@ test('tokenizeSubtitle keeps correct MeCab pos1 enrichment when Yomitan offsets
|
||||
assert.equal(targets[0]?.surface, '仮面');
|
||||
});
|
||||
|
||||
test('tokenizeSubtitle preserves merged token frequency when MeCab positions cross a newline gap', async () => {
|
||||
const parserWindow = {
|
||||
isDestroyed: () => false,
|
||||
webContents: {
|
||||
executeJavaScript: async (script: string) => {
|
||||
if (script.includes('getTermFrequencies')) {
|
||||
return script.includes('"term":"陰に","reading":"いんに"')
|
||||
? [
|
||||
{
|
||||
term: '陰に',
|
||||
reading: 'いんに',
|
||||
dictionary: 'JPDBv2㋕',
|
||||
frequency: 5702,
|
||||
displayValue: '5702',
|
||||
displayValueParsed: false,
|
||||
},
|
||||
]
|
||||
: [];
|
||||
}
|
||||
|
||||
return [
|
||||
{
|
||||
surface: 'X',
|
||||
reading: 'えっくす',
|
||||
headword: 'X',
|
||||
startPos: 0,
|
||||
endPos: 1,
|
||||
},
|
||||
{
|
||||
surface: '陰に',
|
||||
reading: 'いんに',
|
||||
headword: '陰に',
|
||||
startPos: 2,
|
||||
endPos: 4,
|
||||
},
|
||||
{
|
||||
surface: '潜み',
|
||||
reading: 'ひそ',
|
||||
headword: '潜む',
|
||||
startPos: 4,
|
||||
endPos: 6,
|
||||
},
|
||||
];
|
||||
},
|
||||
},
|
||||
} as unknown as Electron.BrowserWindow;
|
||||
|
||||
const deps = createTokenizerDepsRuntime({
|
||||
getYomitanExt: () => ({ id: 'dummy-ext' }) as any,
|
||||
getYomitanParserWindow: () => parserWindow,
|
||||
setYomitanParserWindow: () => {},
|
||||
getYomitanParserReadyPromise: () => null,
|
||||
setYomitanParserReadyPromise: () => {},
|
||||
getYomitanParserInitPromise: () => null,
|
||||
setYomitanParserInitPromise: () => {},
|
||||
isKnownWord: () => false,
|
||||
getKnownWordMatchMode: () => 'headword',
|
||||
getJlptLevel: () => null,
|
||||
getFrequencyDictionaryEnabled: () => true,
|
||||
getMecabTokenizer: () => ({
|
||||
tokenize: async () => [
|
||||
{
|
||||
word: 'X',
|
||||
partOfSpeech: PartOfSpeech.noun,
|
||||
pos1: '名詞',
|
||||
pos2: '一般',
|
||||
pos3: '',
|
||||
pos4: '',
|
||||
inflectionType: '',
|
||||
inflectionForm: '',
|
||||
headword: 'X',
|
||||
katakanaReading: 'エックス',
|
||||
pronunciation: 'エックス',
|
||||
},
|
||||
{
|
||||
word: '陰',
|
||||
partOfSpeech: PartOfSpeech.noun,
|
||||
pos1: '名詞',
|
||||
pos2: '一般',
|
||||
pos3: '',
|
||||
pos4: '',
|
||||
inflectionType: '',
|
||||
inflectionForm: '',
|
||||
headword: '陰',
|
||||
katakanaReading: 'カゲ',
|
||||
pronunciation: 'カゲ',
|
||||
},
|
||||
{
|
||||
word: 'に',
|
||||
partOfSpeech: PartOfSpeech.particle,
|
||||
pos1: '助詞',
|
||||
pos2: '格助詞',
|
||||
pos3: '一般',
|
||||
pos4: '',
|
||||
inflectionType: '',
|
||||
inflectionForm: '',
|
||||
headword: 'に',
|
||||
katakanaReading: 'ニ',
|
||||
pronunciation: 'ニ',
|
||||
},
|
||||
{
|
||||
word: '潜み',
|
||||
partOfSpeech: PartOfSpeech.verb,
|
||||
pos1: '動詞',
|
||||
pos2: '自立',
|
||||
pos3: '',
|
||||
pos4: '',
|
||||
inflectionType: '五段・マ行',
|
||||
inflectionForm: '連用形',
|
||||
headword: '潜む',
|
||||
katakanaReading: 'ヒソミ',
|
||||
pronunciation: 'ヒソミ',
|
||||
},
|
||||
],
|
||||
}),
|
||||
});
|
||||
|
||||
const result = await tokenizeSubtitle('X\n陰に潜み', deps);
|
||||
|
||||
assert.equal(result.tokens?.[1]?.surface, '陰に');
|
||||
assert.equal(result.tokens?.[1]?.pos1, '名詞|助詞');
|
||||
assert.equal(result.tokens?.[1]?.pos2, '一般|格助詞');
|
||||
assert.equal(result.tokens?.[1]?.frequencyRank, 5702);
|
||||
});
|
||||
|
||||
test('tokenizeSubtitle does not color 1-2 word sentences by default', async () => {
|
||||
const result = await tokenizeSubtitle(
|
||||
'猫です',
|
||||
|
||||
@@ -23,6 +23,7 @@ import {
|
||||
requestYomitanScanTokens,
|
||||
requestYomitanTermFrequencies,
|
||||
} from './tokenizer/yomitan-parser-runtime';
|
||||
import type { YomitanTermFrequency } from './tokenizer/yomitan-parser-runtime';
|
||||
|
||||
const logger = createLogger('main:tokenizer');
|
||||
|
||||
@@ -225,7 +226,13 @@ export function createTokenizerDepsRuntime(
|
||||
return null;
|
||||
}
|
||||
|
||||
return mergeTokens(rawTokens, options.isKnownWord, options.getKnownWordMatchMode(), false);
|
||||
return mergeTokens(
|
||||
rawTokens,
|
||||
options.isKnownWord,
|
||||
options.getKnownWordMatchMode(),
|
||||
false,
|
||||
text,
|
||||
);
|
||||
},
|
||||
enrichTokensWithMecab: async (tokens, mecabTokens) =>
|
||||
enrichTokensWithMecabAsync(tokens, mecabTokens),
|
||||
@@ -336,56 +343,154 @@ function resolveFrequencyLookupText(
|
||||
return token.surface;
|
||||
}
|
||||
|
||||
/**
 * Lists the texts to try, in order, when looking up a token's Yomitan
 * frequency.
 *
 * The first candidate is always the trimmed result of
 * `resolveFrequencyLookupText`. In `'headword'` mode the trimmed surface is
 * appended as a second candidate when it is non-empty and differs from both
 * the headword and the first candidate.
 *
 * @returns An empty array when the primary lookup text is blank.
 */
function resolveYomitanFrequencyLookupTexts(
  token: MergedToken,
  matchMode: FrequencyDictionaryMatchMode,
): string[] {
  const primary = resolveFrequencyLookupText(token, matchMode).trim();
  if (primary.length === 0) {
    return [];
  }
  if (matchMode !== 'headword') {
    return [primary];
  }

  const headword = token.headword.trim();
  const surface = token.surface.trim();
  const surfaceAddsCandidate =
    headword.length > 0 && surface.length > 0 && surface !== headword && surface !== primary;

  return surfaceAddsCandidate ? [primary, surface] : [primary];
}
|
||||
|
||||
/**
 * Builds the `{ term, reading }` pairs sent to Yomitan's frequency lookup.
 *
 * Each token contributes one pair per candidate text returned by
 * `resolveYomitanFrequencyLookupTexts`, all sharing the token's trimmed
 * reading (or `null` when the reading is missing or blank). Tokens without
 * any candidate text contribute nothing.
 *
 * Candidates come only from `resolveYomitanFrequencyLookupTexts`, so the
 * primary term is emitted once per token rather than once by a separate
 * single-term path and again by the candidate loop.
 */
function buildYomitanFrequencyTermReadingList(
  tokens: MergedToken[],
  matchMode: FrequencyDictionaryMatchMode,
): Array<{ term: string; reading: string | null }> {
  const termReadingList: Array<{ term: string; reading: string | null }> = [];
  for (const token of tokens) {
    const readingRaw =
      token.reading && token.reading.trim().length > 0 ? token.reading.trim() : null;
    for (const term of resolveYomitanFrequencyLookupTexts(token, matchMode)) {
      termReadingList.push({ term, reading: readingRaw });
    }
  }

  return termReadingList;
}
|
||||
|
||||
function buildYomitanFrequencyRankMap(
|
||||
frequencies: ReadonlyArray<{ term: string; frequency: number; dictionaryPriority?: number }>,
|
||||
): Map<string, number> {
|
||||
const rankByTerm = new Map<string, { rank: number; dictionaryPriority: number }>();
|
||||
/**
 * Builds the map key for a (term, reading) pair: the term, a NUL separator,
 * then the reading (empty when the reading is absent).
 */
function makeYomitanFrequencyPairKey(term: string, reading: string | null): string {
  const readingPart = reading ?? '';
  return term + '\u0000' + readingPart;
}
|
||||
|
||||
/**
 * A Yomitan frequency entry whose `reading` and `frequency` fields have been
 * narrowed for use in a frequency index.
 */
interface NormalizedYomitanTermFrequency extends YomitanTermFrequency {
  /** Reading for the entry, or `null` when none is available. */
  reading: string | null;
  /** Frequency value stored for the entry. */
  frequency: number;
}
|
||||
|
||||
/** Lookup tables over normalized Yomitan frequency entries. */
interface YomitanFrequencyIndex {
  /** Entries grouped under a pair key. */
  byPair: Map<string, NormalizedYomitanTermFrequency[]>;
  /** Entries grouped under a term key. */
  byTerm: Map<string, NormalizedYomitanTermFrequency[]>;
}
|
||||
|
||||
/**
 * Adds `entry` to the list stored under `key`, creating the list on first use.
 * Mutates `map` in place.
 */
function appendYomitanFrequencyEntry(
  map: Map<string, NormalizedYomitanTermFrequency[]>,
  key: string,
  entry: NormalizedYomitanTermFrequency,
): void {
  const bucket = map.get(key);
  if (!bucket) {
    map.set(key, [entry]);
  } else {
    bucket.push(entry);
  }
}
|
||||
|
||||
/**
 * Indexes Yomitan frequency entries by (term, reading) pair and by term alone.
 *
 * Entries whose trimmed term is empty, or whose frequency is rejected by
 * `normalizePositiveFrequencyRank`, are skipped. Each stored entry carries the
 * trimmed term, a trimmed reading (`null` when missing or blank), the
 * normalized rank in `frequency`, and a sanitised `dictionaryPriority`.
 *
 * @param frequencies - Raw entries returned by the Yomitan frequency lookup.
 * @returns Maps from pair key and from term to the matching entries, in input
 *   order.
 */
function buildYomitanFrequencyIndex(
  frequencies: ReadonlyArray<YomitanTermFrequency>,
): YomitanFrequencyIndex {
  const byPair = new Map<string, NormalizedYomitanTermFrequency[]>();
  const byTerm = new Map<string, NormalizedYomitanTermFrequency[]>();

  for (const frequency of frequencies) {
    const term = frequency.term.trim();
    const rank = normalizePositiveFrequencyRank(frequency.frequency);
    if (!term || rank === null) {
      continue;
    }

    // A missing or non-finite priority is pushed to the end of the ordering so
    // later priority comparisons never see undefined or NaN.
    const dictionaryPriority =
      typeof frequency.dictionaryPriority === 'number' &&
      Number.isFinite(frequency.dictionaryPriority)
        ? Math.max(0, Math.floor(frequency.dictionaryPriority))
        : Number.MAX_SAFE_INTEGER;

    const reading =
      typeof frequency.reading === 'string' && frequency.reading.trim().length > 0
        ? frequency.reading.trim()
        : null;

    const normalizedEntry: NormalizedYomitanTermFrequency = {
      ...frequency,
      term,
      reading,
      frequency: rank,
      dictionaryPriority,
    };
    appendYomitanFrequencyEntry(byPair, makeYomitanFrequencyPairKey(term, reading), normalizedEntry);
    appendYomitanFrequencyEntry(byTerm, term, normalizedEntry);
  }

  return { byPair, byTerm };
}
|
||||
|
||||
/**
 * Picks the rank to report from a set of candidate frequency entries.
 *
 * The entry with the lowest `dictionaryPriority` wins. Among entries sharing
 * that priority, the lowest `frequency` wins. On a full tie the earliest
 * entry is kept.
 *
 * @returns The winning entry's `frequency`, or `null` when `entries` is empty.
 */
function selectBestYomitanFrequencyRank(
  entries: ReadonlyArray<NormalizedYomitanTermFrequency>,
): number | null {
  let bestEntry: NormalizedYomitanTermFrequency | null = null;
  for (const entry of entries) {
    if (
      bestEntry === null ||
      entry.dictionaryPriority < bestEntry.dictionaryPriority ||
      (entry.dictionaryPriority === bestEntry.dictionaryPriority &&
        entry.frequency < bestEntry.frequency)
    ) {
      bestEntry = entry;
    }
  }

  return bestEntry?.frequency ?? null;
}
|
||||
|
||||
/**
 * Resolves the Yomitan frequency rank for one candidate lookup text of a token.
 *
 * Entries matching both the candidate text and the token's trimmed reading
 * are preferred. When there are none, entries matching the candidate text
 * alone are used. The winner is chosen by `selectBestYomitanFrequencyRank`.
 *
 * @param token - Token being annotated; only its `reading` is consulted here.
 * @param candidateText - Text to look up; surrounding whitespace is ignored.
 * @param matchMode - Accepted for signature compatibility; it does not affect
 *   the result.
 * @param frequencyIndex - Index to search.
 * @returns The selected rank, or `null` when the candidate is blank or has no
 *   entries.
 */
function getYomitanFrequencyRank(
  token: MergedToken,
  candidateText: string,
  matchMode: FrequencyDictionaryMatchMode,
  frequencyIndex: YomitanFrequencyIndex,
): number | null {
  const normalizedCandidateText = candidateText.trim();
  if (!normalizedCandidateText) {
    return null;
  }

  const reading =
    typeof token.reading === 'string' && token.reading.trim().length > 0
      ? token.reading.trim()
      : null;
  const pairEntries =
    frequencyIndex.byPair.get(makeYomitanFrequencyPairKey(normalizedCandidateText, reading)) ?? [];
  // Fall back to term-only matches only when nothing matches term + reading.
  const candidateEntries =
    pairEntries.length > 0 ? pairEntries : (frequencyIndex.byTerm.get(normalizedCandidateText) ?? []);
  if (candidateEntries.length === 0) {
    return null;
  }

  return selectBestYomitanFrequencyRank(candidateEntries);
}
|
||||
|
||||
function getLocalFrequencyRank(
|
||||
@@ -416,7 +521,7 @@ function getLocalFrequencyRank(
|
||||
function applyFrequencyRanks(
|
||||
tokens: MergedToken[],
|
||||
matchMode: FrequencyDictionaryMatchMode,
|
||||
yomitanRankByTerm: Map<string, number>,
|
||||
yomitanFrequencyIndex: YomitanFrequencyIndex,
|
||||
getFrequencyRank: FrequencyDictionaryLookup | undefined,
|
||||
): MergedToken[] {
|
||||
if (tokens.length === 0) {
|
||||
@@ -441,12 +546,19 @@ function applyFrequencyRanks(
|
||||
};
|
||||
}
|
||||
|
||||
const yomitanRank = yomitanRankByTerm.get(lookupText);
|
||||
if (yomitanRank !== undefined) {
|
||||
return {
|
||||
...token,
|
||||
frequencyRank: yomitanRank,
|
||||
};
|
||||
for (const candidateText of resolveYomitanFrequencyLookupTexts(token, matchMode)) {
|
||||
const yomitanRank = getYomitanFrequencyRank(
|
||||
token,
|
||||
candidateText,
|
||||
matchMode,
|
||||
yomitanFrequencyIndex,
|
||||
);
|
||||
if (yomitanRank !== null) {
|
||||
return {
|
||||
...token,
|
||||
frequencyRank: yomitanRank,
|
||||
};
|
||||
}
|
||||
}
|
||||
|
||||
if (!getFrequencyRank) {
|
||||
@@ -501,6 +613,7 @@ async function parseWithYomitanInternalParser(
|
||||
isKnown: false,
|
||||
isNPlusOneTarget: false,
|
||||
isNameMatch: token.isNameMatch ?? false,
|
||||
frequencyRank: token.frequencyRank,
|
||||
}),
|
||||
),
|
||||
);
|
||||
@@ -510,7 +623,7 @@ async function parseWithYomitanInternalParser(
|
||||
}
|
||||
deps.onTokenizationReady?.(text);
|
||||
|
||||
const frequencyRankPromise: Promise<Map<string, number>> = options.frequencyEnabled
|
||||
const frequencyRankPromise: Promise<YomitanFrequencyIndex> = options.frequencyEnabled
|
||||
? (async () => {
|
||||
const frequencyMatchMode = options.frequencyMatchMode;
|
||||
const termReadingList = buildYomitanFrequencyTermReadingList(
|
||||
@@ -522,9 +635,9 @@ async function parseWithYomitanInternalParser(
|
||||
deps,
|
||||
logger,
|
||||
);
|
||||
return buildYomitanFrequencyRankMap(yomitanFrequencies);
|
||||
return buildYomitanFrequencyIndex(yomitanFrequencies);
|
||||
})()
|
||||
: Promise.resolve(new Map<string, number>());
|
||||
: Promise.resolve({ byPair: new Map(), byTerm: new Map() });
|
||||
|
||||
const mecabEnrichmentPromise: Promise<MergedToken[]> = needsMecabPosEnrichment(options)
|
||||
? (async () => {
|
||||
@@ -545,7 +658,7 @@ async function parseWithYomitanInternalParser(
|
||||
})()
|
||||
: Promise.resolve(normalizedSelectedTokens);
|
||||
|
||||
const [yomitanRankByTerm, enrichedTokens] = await Promise.all([
|
||||
const [yomitanFrequencyIndex, enrichedTokens] = await Promise.all([
|
||||
frequencyRankPromise,
|
||||
mecabEnrichmentPromise,
|
||||
]);
|
||||
@@ -554,7 +667,7 @@ async function parseWithYomitanInternalParser(
|
||||
return applyFrequencyRanks(
|
||||
enrichedTokens,
|
||||
options.frequencyMatchMode,
|
||||
yomitanRankByTerm,
|
||||
yomitanFrequencyIndex,
|
||||
deps.getFrequencyRank,
|
||||
);
|
||||
}
|
||||
|
||||
@@ -293,6 +293,29 @@ test('annotateTokens excludes default non-independent pos2 from frequency and N+
|
||||
assert.equal(result[0]?.isNPlusOneTarget, false);
|
||||
});
|
||||
|
||||
test('annotateTokens keeps frequency for kanji noun tokens even when mecab marks them non-independent', () => {
|
||||
const tokens = [
|
||||
makeToken({
|
||||
surface: '者',
|
||||
reading: 'もの',
|
||||
headword: '者',
|
||||
partOfSpeech: PartOfSpeech.other,
|
||||
pos1: '名詞',
|
||||
pos2: '非自立',
|
||||
pos3: '一般',
|
||||
startPos: 0,
|
||||
endPos: 1,
|
||||
frequencyRank: 475,
|
||||
}),
|
||||
];
|
||||
|
||||
const result = annotateTokens(tokens, makeDeps(), {
|
||||
minSentenceWordsForNPlusOne: 1,
|
||||
});
|
||||
|
||||
assert.equal(result[0]?.frequencyRank, 475);
|
||||
});
|
||||
|
||||
test('annotateTokens excludes likely kana SFX tokens from frequency when POS tags are missing', () => {
|
||||
const tokens = [
|
||||
makeToken({
|
||||
|
||||
@@ -89,6 +89,23 @@ function normalizePos2Tag(pos2: string | undefined): string {
|
||||
return typeof pos2 === 'string' ? pos2.trim() : '';
|
||||
}
|
||||
|
||||
/**
 * Reports whether `text` contains at least one CJK ideograph.
 *
 * The string is walked by code point, so characters outside the Basic
 * Multilingual Plane are tested as single units. Besides the BMP blocks
 * (Extension A, Unified Ideographs, Compatibility Ideographs), the
 * supplementary ideograph blocks are recognised too, so kanji such as
 * U+20B9F (𠮟) count.
 *
 * @param text - Text to inspect; an empty string yields `false`.
 */
function hasKanjiChar(text: string): boolean {
  for (const char of text) {
    const code = char.codePointAt(0);
    if (code === undefined) {
      continue;
    }
    if (
      (code >= 0x3400 && code <= 0x4dbf) || // CJK Unified Ideographs Extension A
      (code >= 0x4e00 && code <= 0x9fff) || // CJK Unified Ideographs
      (code >= 0xf900 && code <= 0xfaff) || // CJK Compatibility Ideographs
      (code >= 0x20000 && code <= 0x2fa1f) || // Extension B onward + Compatibility Supplement
      (code >= 0x30000 && code <= 0x323af) // Extensions G-H
    ) {
      return true;
    }
  }
  return false;
}
|
||||
|
||||
function isExcludedComponent(
|
||||
pos1: string | undefined,
|
||||
pos2: string | undefined,
|
||||
@@ -169,6 +186,34 @@ function isFrequencyExcludedByPos(
|
||||
);
|
||||
}
|
||||
|
||||
/**
 * Decides whether a token should keep its frequency rank despite being
 * tagged as a non-independent noun.
 *
 * Returns `true` only when all of the following hold:
 * - `名詞` is not in `pos1Exclusions`;
 * - the token has a finite numeric `frequencyRank`;
 * - its pos1 and pos2 each normalise to exactly one tag, `名詞` and `非自立`;
 * - its surface or headword contains a kanji character.
 */
function shouldKeepFrequencyForNonIndependentKanjiNoun(
  token: MergedToken,
  pos1Exclusions: ReadonlySet<string>,
): boolean {
  if (pos1Exclusions.has('名詞')) {
    return false;
  }

  const hasUsableRank =
    typeof token.frequencyRank === 'number' && Number.isFinite(token.frequencyRank);
  if (!hasUsableRank) {
    return false;
  }

  const pos1Parts = splitNormalizedTagParts(normalizePos1Tag(token.pos1));
  const pos2Parts = splitNormalizedTagParts(normalizePos2Tag(token.pos2));
  const isLoneNonIndependentNoun =
    pos1Parts.length === 1 &&
    pos2Parts.length === 1 &&
    pos1Parts[0] === '名詞' &&
    pos2Parts[0] === '非自立';
  if (!isLoneNonIndependentNoun) {
    return false;
  }

  return [token.surface, token.headword].some((text) => hasKanjiChar(text));
}
|
||||
|
||||
export function shouldExcludeTokenFromVocabularyPersistence(
|
||||
token: MergedToken,
|
||||
options: Pick<AnnotationStageOptions, 'pos1Exclusions' | 'pos2Exclusions'> = {},
|
||||
@@ -454,7 +499,10 @@ function filterTokenFrequencyRank(
|
||||
pos1Exclusions: ReadonlySet<string>,
|
||||
pos2Exclusions: ReadonlySet<string>,
|
||||
): number | undefined {
|
||||
if (isFrequencyExcludedByPos(token, pos1Exclusions, pos2Exclusions)) {
|
||||
if (
|
||||
isFrequencyExcludedByPos(token, pos1Exclusions, pos2Exclusions) &&
|
||||
!shouldKeepFrequencyForNonIndependentKanjiNoun(token, pos1Exclusions)
|
||||
) {
|
||||
return undefined;
|
||||
}
|
||||
|
||||
|
||||
@@ -188,6 +188,7 @@ test('requestYomitanTermFrequencies returns normalized frequency entries', async
|
||||
{
|
||||
term: '猫',
|
||||
reading: 'ねこ',
|
||||
hasReading: true,
|
||||
dictionary: 'freq-dict',
|
||||
dictionaryPriority: 0,
|
||||
frequency: 77,
|
||||
@@ -197,6 +198,7 @@ test('requestYomitanTermFrequencies returns normalized frequency entries', async
|
||||
{
|
||||
term: '鍛える',
|
||||
reading: 'きたえる',
|
||||
hasReading: false,
|
||||
dictionary: 'freq-dict',
|
||||
dictionaryPriority: 1,
|
||||
frequency: 46961,
|
||||
@@ -217,9 +219,11 @@ test('requestYomitanTermFrequencies returns normalized frequency entries', async
|
||||
|
||||
assert.equal(result.length, 2);
|
||||
assert.equal(result[0]?.term, '猫');
|
||||
assert.equal(result[0]?.hasReading, true);
|
||||
assert.equal(result[0]?.frequency, 77);
|
||||
assert.equal(result[0]?.dictionaryPriority, 0);
|
||||
assert.equal(result[1]?.term, '鍛える');
|
||||
assert.equal(result[1]?.hasReading, false);
|
||||
assert.equal(result[1]?.frequency, 2847);
|
||||
assert.match(scriptValue, /getTermFrequencies/);
|
||||
assert.match(scriptValue, /optionsGetFull/);
|
||||
@@ -247,6 +251,96 @@ test('requestYomitanTermFrequencies prefers primary rank from displayValue array
|
||||
assert.equal(result[0]?.frequency, 7141);
|
||||
});
|
||||
|
||||
test('requestYomitanTermFrequencies prefers primary rank from displayValue string pair when raw frequency matches trailing count', async () => {
|
||||
const deps = createDeps(async () => [
|
||||
{
|
||||
term: '潜む',
|
||||
reading: 'ひそむ',
|
||||
dictionary: 'freq-dict',
|
||||
dictionaryPriority: 0,
|
||||
frequency: 121,
|
||||
displayValue: '118,121',
|
||||
displayValueParsed: false,
|
||||
},
|
||||
]);
|
||||
|
||||
const result = await requestYomitanTermFrequencies([{ term: '潜む', reading: 'ひそむ' }], deps, {
|
||||
error: () => undefined,
|
||||
});
|
||||
|
||||
assert.equal(result.length, 1);
|
||||
assert.equal(result[0]?.term, '潜む');
|
||||
assert.equal(result[0]?.frequency, 118);
|
||||
});
|
||||
|
||||
test('requestYomitanTermFrequencies uses leading display digits for displayValue strings', async () => {
|
||||
const deps = createDeps(async () => [
|
||||
{
|
||||
term: '例',
|
||||
reading: 'れい',
|
||||
dictionary: 'freq-dict',
|
||||
dictionaryPriority: 0,
|
||||
frequency: 1234,
|
||||
displayValue: '1,234',
|
||||
displayValueParsed: false,
|
||||
},
|
||||
]);
|
||||
|
||||
const result = await requestYomitanTermFrequencies([{ term: '例', reading: 'れい' }], deps, {
|
||||
error: () => undefined,
|
||||
});
|
||||
|
||||
assert.equal(result.length, 1);
|
||||
assert.equal(result[0]?.term, '例');
|
||||
assert.equal(result[0]?.frequency, 1);
|
||||
});
|
||||
|
||||
test('requestYomitanTermFrequencies ignores occurrence-based dictionaries for rank tagging', async () => {
|
||||
let metadataScript = '';
|
||||
const deps = createDeps(async (script) => {
|
||||
if (script.includes('getTermFrequencies')) {
|
||||
return [
|
||||
{
|
||||
term: '潜む',
|
||||
reading: 'ひそむ',
|
||||
dictionary: 'CC100',
|
||||
frequency: 118121,
|
||||
displayValue: null,
|
||||
displayValueParsed: false,
|
||||
},
|
||||
];
|
||||
}
|
||||
|
||||
if (script.includes('optionsGetFull')) {
|
||||
metadataScript = script;
|
||||
return {
|
||||
profileCurrent: 0,
|
||||
profileIndex: 0,
|
||||
scanLength: 40,
|
||||
dictionaries: ['CC100'],
|
||||
dictionaryPriorityByName: { CC100: 0 },
|
||||
dictionaryFrequencyModeByName: { CC100: 'occurrence-based' },
|
||||
profiles: [
|
||||
{
|
||||
options: {
|
||||
scanning: { length: 40 },
|
||||
dictionaries: [{ name: 'CC100', enabled: true, id: 0 }],
|
||||
},
|
||||
},
|
||||
],
|
||||
};
|
||||
}
|
||||
return [];
|
||||
});
|
||||
|
||||
const result = await requestYomitanTermFrequencies([{ term: '潜む', reading: 'ひそむ' }], deps, {
|
||||
error: () => undefined,
|
||||
});
|
||||
|
||||
assert.deepEqual(result, []);
|
||||
assert.match(metadataScript, /getDictionaryInfo/);
|
||||
});
|
||||
|
||||
test('requestYomitanTermFrequencies requests term-only fallback only after reading miss', async () => {
|
||||
const frequencyScripts: string[] = [];
|
||||
const deps = createDeps(async (script) => {
|
||||
@@ -485,6 +579,317 @@ test('requestYomitanScanTokens uses left-to-right termsFind scanning instead of
|
||||
assert.match(scannerScript ?? '', /deinflect:\s*true/);
|
||||
});
|
||||
|
||||
test('requestYomitanScanTokens extracts best frequency rank from selected termsFind entry', async () => {
  // Three enabled rank-based frequency dictionaries; list position doubles as priority and id.
  const dictionaryNames = ['JPDBv2㋕', 'Jiten', 'CC100'];

  // Payload handed back to the optionsGetFull (profile metadata) script.
  const buildProfileMetadata = () => ({
    profileCurrent: 0,
    profileIndex: 0,
    scanLength: 40,
    dictionaries: [...dictionaryNames],
    dictionaryPriorityByName: Object.fromEntries(
      dictionaryNames.map((name, index) => [name, index]),
    ),
    dictionaryFrequencyModeByName: Object.fromEntries(
      dictionaryNames.map((name) => [name, 'rank-based']),
    ),
    profiles: [
      {
        options: {
          scanning: { length: 40 },
          dictionaries: dictionaryNames.map((name, id) => ({ name, enabled: true, id })),
        },
      },
    ],
  });

  // termsFind payload: a single exact primary match that carries one frequency per dictionary.
  const buildTermsFindResult = () => ({
    originalTextLength: 2,
    dictionaryEntries: [
      {
        headwords: [
          {
            term: '潜む',
            reading: 'ひそむ',
            sources: [{ originalText: '潜み', isPrimary: true, matchType: 'exact' }],
          },
        ],
        frequencies: [
          { headwordIndex: 0, dictionary: 'JPDBv2㋕', frequency: 20181, displayValue: '4073,20181句' },
          { headwordIndex: 0, dictionary: 'Jiten', frequency: 28594, displayValue: '4592,28594句' },
          { headwordIndex: 0, dictionary: 'CC100', frequency: 118121, displayValue: null },
        ],
      },
    ],
  });

  // First pass: capture the scanner script that would be injected into the Yomitan page.
  let capturedScannerScript = '';
  const deps = createDeps(async (script) => {
    if (script.includes('termsFind')) {
      capturedScannerScript = script;
      return [];
    }
    return script.includes('optionsGetFull') ? buildProfileMetadata() : null;
  });

  await requestYomitanScanTokens('潜み', deps, { error: () => undefined });

  // Second pass: execute the captured script against a fake termsFind backend.
  const tokens = await runInjectedYomitanScript(capturedScannerScript, (action, params) => {
    if (action !== 'termsFind') {
      throw new Error(`unexpected action: ${action}`);
    }

    const requestedText = (params as { text?: string } | undefined)?.text ?? '';
    return requestedText.startsWith('潜み')
      ? buildTermsFindResult()
      : { originalTextLength: 0, dictionaryEntries: [] };
  });

  // The priority-0 dictionary supplies the rank, read from the leading digits of its display value.
  assert.deepEqual(tokens, [
    {
      surface: '潜み',
      reading: 'ひそ',
      headword: '潜む',
      startPos: 0,
      endPos: 2,
      isNameMatch: false,
      frequencyRank: 4073,
    },
  ]);
});
|
||||
|
||||
test('requestYomitanScanTokens uses frequency from later exact-match entry when first exact entry has none', async () => {
  // Three enabled rank-based frequency dictionaries; list position doubles as priority and id.
  const dictionaryNames = ['JPDBv2㋕', 'Jiten', 'CC100'];

  // Payload handed back to the optionsGetFull (profile metadata) script.
  const buildProfileMetadata = () => ({
    profileCurrent: 0,
    profileIndex: 0,
    scanLength: 40,
    dictionaries: [...dictionaryNames],
    dictionaryPriorityByName: Object.fromEntries(
      dictionaryNames.map((name, index) => [name, index]),
    ),
    dictionaryFrequencyModeByName: Object.fromEntries(
      dictionaryNames.map((name) => [name, 'rank-based']),
    ),
    profiles: [
      {
        options: {
          scanning: { length: 40 },
          dictionaries: dictionaryNames.map((name, id) => ({ name, enabled: true, id })),
        },
      },
    ],
  });

  // Builds one dictionary entry for the same headword with an exact primary source.
  const buildEntry = (frequencies: unknown[]) => ({
    headwords: [
      {
        term: '者',
        reading: 'もの',
        sources: [{ originalText: '者', isPrimary: true, matchType: 'exact' }],
      },
    ],
    frequencies,
  });

  // termsFind payload: the first entry has no frequency data, the second one does.
  const buildTermsFindResult = () => ({
    originalTextLength: 1,
    dictionaryEntries: [
      buildEntry([]),
      buildEntry([
        { headwordIndex: 0, dictionary: 'JPDBv2㋕', frequency: 79601, displayValue: '475,79601句' },
        { headwordIndex: 0, dictionary: 'Jiten', frequency: 338, displayValue: '338' },
      ]),
    ],
  });

  // First pass: capture the scanner script that would be injected into the Yomitan page.
  let capturedScannerScript = '';
  const deps = createDeps(async (script) => {
    if (script.includes('termsFind')) {
      capturedScannerScript = script;
      return [];
    }
    return script.includes('optionsGetFull') ? buildProfileMetadata() : null;
  });

  await requestYomitanScanTokens('者', deps, { error: () => undefined });

  // Second pass: execute the captured script against a fake termsFind backend.
  const tokens = await runInjectedYomitanScript(capturedScannerScript, (action, params) => {
    if (action !== 'termsFind') {
      throw new Error(`unexpected action: ${action}`);
    }

    const requestedText = (params as { text?: string } | undefined)?.text ?? '';
    return requestedText.startsWith('者')
      ? buildTermsFindResult()
      : { originalTextLength: 0, dictionaryEntries: [] };
  });

  // The rank comes from the second entry; the priority-0 dictionary (475) beats the lower Jiten value.
  assert.deepEqual(tokens, [
    {
      surface: '者',
      reading: 'もの',
      headword: '者',
      startPos: 0,
      endPos: 1,
      isNameMatch: false,
      frequencyRank: 475,
    },
  ]);
});
|
||||
|
||||
test('requestYomitanScanTokens can use frequency from later exact secondary-match entry', async () => {
  // Three enabled rank-based frequency dictionaries; list position doubles as priority and id.
  const dictionaryNames = ['JPDBv2㋕', 'Jiten', 'CC100'];

  // Payload handed back to the optionsGetFull (profile metadata) script.
  const buildProfileMetadata = () => ({
    profileCurrent: 0,
    profileIndex: 0,
    scanLength: 40,
    dictionaries: [...dictionaryNames],
    dictionaryPriorityByName: Object.fromEntries(
      dictionaryNames.map((name, index) => [name, index]),
    ),
    dictionaryFrequencyModeByName: Object.fromEntries(
      dictionaryNames.map((name) => [name, 'rank-based']),
    ),
    profiles: [
      {
        options: {
          scanning: { length: 40 },
          dictionaries: dictionaryNames.map((name, id) => ({ name, enabled: true, id })),
        },
      },
    ],
  });

  // Builds one dictionary entry for the same headword; only the primary flag and frequencies vary.
  const buildEntry = (isPrimary: boolean, frequencies: unknown[]) => ({
    headwords: [
      {
        term: '者',
        reading: 'もの',
        sources: [{ originalText: '者', isPrimary, matchType: 'exact' }],
      },
    ],
    frequencies,
  });

  // termsFind payload: a primary entry without frequencies, then a non-primary entry that has one.
  const buildTermsFindResult = () => ({
    originalTextLength: 1,
    dictionaryEntries: [
      buildEntry(true, []),
      buildEntry(false, [
        { headwordIndex: 0, dictionary: 'JPDBv2㋕', frequency: 79601, displayValue: '475,79601句' },
      ]),
    ],
  });

  // First pass: capture the scanner script that would be injected into the Yomitan page.
  let capturedScannerScript = '';
  const deps = createDeps(async (script) => {
    if (script.includes('termsFind')) {
      capturedScannerScript = script;
      return [];
    }
    return script.includes('optionsGetFull') ? buildProfileMetadata() : null;
  });

  await requestYomitanScanTokens('者', deps, { error: () => undefined });

  // Second pass: execute the captured script against a fake termsFind backend.
  const tokens = await runInjectedYomitanScript(capturedScannerScript, (action, params) => {
    if (action !== 'termsFind') {
      throw new Error(`unexpected action: ${action}`);
    }

    const requestedText = (params as { text?: string } | undefined)?.text ?? '';
    return requestedText.startsWith('者')
      ? buildTermsFindResult()
      : { originalTextLength: 0, dictionaryEntries: [] };
  });

  // The headword comes from the primary entry, the rank from the secondary exact match.
  assert.deepEqual(tokens, [
    {
      surface: '者',
      reading: 'もの',
      headword: '者',
      startPos: 0,
      endPos: 1,
      isNameMatch: false,
      frequencyRank: 475,
    },
  ]);
});
|
||||
|
||||
test('requestYomitanScanTokens marks tokens backed by SubMiner character dictionary entries', async () => {
|
||||
const deps = createDeps(async (script) => {
|
||||
if (script.includes('optionsGetFull')) {
|
||||
|
||||
@@ -20,19 +20,24 @@ interface YomitanParserRuntimeDeps {
|
||||
createYomitanExtensionWindow?: (pageName: string) => Promise<BrowserWindow | null>;
|
||||
}
|
||||
|
||||
/**
 * Frequency mode reported for a Yomitan dictionary.
 * Elsewhere in this file, frequency entries from dictionaries whose mode is
 * 'occurrence-based' are skipped when frequencies are normalized or ranked.
 */
type YomitanFrequencyMode = 'occurrence-based' | 'rank-based';
|
||||
|
||||
/** Summary of one installed Yomitan dictionary, as returned by getYomitanDictionaryInfo. */
export interface YomitanDictionaryInfo {
|
||||
/** Dictionary title; getYomitanDictionaryInfo trims it and drops entries whose title is empty. */
title: string;
|
||||
/** Dictionary revision; only string or number values are kept, anything else becomes undefined. */
revision?: string | number;
|
||||
/** Frequency mode; set only when the raw value is one of the two recognised modes. */
frequencyMode?: YomitanFrequencyMode;
|
||||
}
|
||||
|
||||
/** One normalized frequency record for a term, produced by toYomitanTermFrequency. */
export interface YomitanTermFrequency {
|
||||
/** The looked-up term. */
term: string;
|
||||
/** Reading attached to the record; null when the raw reading is not a string. */
reading: string | null;
|
||||
/** False when the raw record says hasReading === false; otherwise true exactly when reading is non-null. */
hasReading: boolean;
|
||||
/** Name of the dictionary that supplied the frequency. */
dictionary: string;
|
||||
/** Priority of that dictionary; NOTE(review): lower appears to mean preferred — confirm against the consumer. */
dictionaryPriority: number;
|
||||
/** Positive frequency value: the number parsed from displayValue when available, else the raw frequency. */
frequency: number;
|
||||
/** Raw display value when it is a string, otherwise null. */
displayValue: string | null;
|
||||
/** True only when the raw record carries displayValueParsed === true. */
displayValueParsed: boolean;
|
||||
/** True when `frequency` was taken from the parsed display value rather than the raw frequency field. */
frequencyDerivedFromDisplayValue: boolean;
|
||||
}
|
||||
|
||||
export interface YomitanTermReadingPair {
|
||||
@@ -47,6 +52,7 @@ export interface YomitanScanToken {
|
||||
startPos: number;
|
||||
endPos: number;
|
||||
isNameMatch?: boolean;
|
||||
frequencyRank?: number;
|
||||
}
|
||||
|
||||
interface YomitanProfileMetadata {
|
||||
@@ -54,6 +60,7 @@ interface YomitanProfileMetadata {
|
||||
scanLength: number;
|
||||
dictionaries: string[];
|
||||
dictionaryPriorityByName: Record<string, number>;
|
||||
dictionaryFrequencyModeByName: Partial<Record<string, YomitanFrequencyMode>>;
|
||||
}
|
||||
|
||||
// Default Yomitan scan length.
// NOTE(review): the code consuming this constant is outside this view — presumably it is the
// fallback when profile metadata carries no usable scanLength; confirm against the usage sites.
const DEFAULT_YOMITAN_SCAN_LENGTH = 40;
|
||||
@@ -78,7 +85,8 @@ function isScanTokenArray(value: unknown): value is YomitanScanToken[] {
|
||||
typeof entry.headword === 'string' &&
|
||||
typeof entry.startPos === 'number' &&
|
||||
typeof entry.endPos === 'number' &&
|
||||
(entry.isNameMatch === undefined || typeof entry.isNameMatch === 'boolean'),
|
||||
(entry.isNameMatch === undefined || typeof entry.isNameMatch === 'boolean') &&
|
||||
(entry.frequencyRank === undefined || typeof entry.frequencyRank === 'number'),
|
||||
)
|
||||
);
|
||||
}
|
||||
@@ -117,24 +125,22 @@ function parsePositiveFrequencyString(value: string): number | null {
|
||||
return null;
|
||||
}
|
||||
|
||||
const numericPrefix = trimmed.match(/^\d[\d,]*/)?.[0];
|
||||
if (!numericPrefix) {
|
||||
const numericMatch = trimmed.match(/[+-]?(\d+(\.\d*)?|\.\d+)([eE][+-]?\d+)?/)?.[0];
|
||||
if (!numericMatch) {
|
||||
return null;
|
||||
}
|
||||
|
||||
const chunks = numericPrefix.split(',');
|
||||
const normalizedNumber =
|
||||
chunks.length <= 1
|
||||
? (chunks[0] ?? '')
|
||||
: chunks.slice(1).every((chunk) => /^\d{3}$/.test(chunk))
|
||||
? chunks.join('')
|
||||
: (chunks[0] ?? '');
|
||||
const parsed = Number.parseInt(normalizedNumber, 10);
|
||||
const parsed = Number.parseFloat(numericMatch);
|
||||
if (!Number.isFinite(parsed) || parsed <= 0) {
|
||||
return null;
|
||||
}
|
||||
|
||||
return parsed;
|
||||
const normalized = Math.floor(parsed);
|
||||
if (!Number.isFinite(normalized) || normalized <= 0) {
|
||||
return null;
|
||||
}
|
||||
|
||||
return normalized;
|
||||
}
|
||||
|
||||
function parsePositiveFrequencyValue(value: unknown): number | null {
|
||||
@@ -159,6 +165,19 @@ function parsePositiveFrequencyValue(value: unknown): number | null {
|
||||
return null;
|
||||
}
|
||||
|
||||
/**
 * Parses the frequency encoded in a dictionary display value.
 *
 * String input is trimmed and only its leading run of ASCII digits is read, so
 * '4073,20181句' yields 4073; a string that does not start with a digit yields null.
 * Any non-string input is delegated unchanged to parsePositiveFrequencyValue.
 *
 * @param value Raw display value of unknown shape.
 * @returns A positive finite integer, or null when nothing usable is found.
 */
function parseDisplayFrequencyValue(value: unknown): number | null {
  if (typeof value !== 'string') {
    return parsePositiveFrequencyValue(value);
  }

  const digitRun = /^\d+/.exec(value.trim());
  if (digitRun === null) {
    return null;
  }

  const rank = Number.parseInt(digitRun[0], 10);
  if (!Number.isFinite(rank) || rank <= 0) {
    return null;
  }
  return rank;
}
|
||||
|
||||
function toYomitanTermFrequency(value: unknown): YomitanTermFrequency | null {
|
||||
if (!isObject(value)) {
|
||||
return null;
|
||||
@@ -169,9 +188,7 @@ function toYomitanTermFrequency(value: unknown): YomitanTermFrequency | null {
|
||||
const rawFrequency = parsePositiveFrequencyValue(value.frequency);
|
||||
const displayValueRaw = value.displayValue;
|
||||
const parsedDisplayFrequency =
|
||||
displayValueRaw !== null && displayValueRaw !== undefined
|
||||
? parsePositiveFrequencyValue(displayValueRaw)
|
||||
: null;
|
||||
displayValueRaw !== null && displayValueRaw !== undefined ? parseDisplayFrequencyValue(displayValueRaw) : null;
|
||||
const frequency = parsedDisplayFrequency ?? rawFrequency;
|
||||
if (!term || !dictionary || frequency === null) {
|
||||
return null;
|
||||
@@ -184,17 +201,20 @@ function toYomitanTermFrequency(value: unknown): YomitanTermFrequency | null {
|
||||
|
||||
const reading =
|
||||
value.reading === null ? null : typeof value.reading === 'string' ? value.reading : null;
|
||||
const hasReading = value.hasReading === false ? false : reading !== null;
|
||||
const displayValue = typeof displayValueRaw === 'string' ? displayValueRaw : null;
|
||||
const displayValueParsed = value.displayValueParsed === true;
|
||||
|
||||
return {
|
||||
term,
|
||||
reading,
|
||||
hasReading,
|
||||
dictionary,
|
||||
dictionaryPriority,
|
||||
frequency,
|
||||
displayValue,
|
||||
displayValueParsed,
|
||||
frequencyDerivedFromDisplayValue: parsedDisplayFrequency !== null,
|
||||
};
|
||||
}
|
||||
|
||||
@@ -300,17 +320,34 @@ function toYomitanProfileMetadata(value: unknown): YomitanProfileMetadata | null
|
||||
}
|
||||
}
|
||||
|
||||
const dictionaryFrequencyModeByNameRaw = value.dictionaryFrequencyModeByName;
|
||||
const dictionaryFrequencyModeByName: Partial<Record<string, YomitanFrequencyMode>> = {};
|
||||
if (isObject(dictionaryFrequencyModeByNameRaw)) {
|
||||
for (const [name, frequencyModeRaw] of Object.entries(dictionaryFrequencyModeByNameRaw)) {
|
||||
const normalizedName = name.trim();
|
||||
if (!normalizedName) {
|
||||
continue;
|
||||
}
|
||||
if (frequencyModeRaw !== 'occurrence-based' && frequencyModeRaw !== 'rank-based') {
|
||||
continue;
|
||||
}
|
||||
dictionaryFrequencyModeByName[normalizedName] = frequencyModeRaw;
|
||||
}
|
||||
}
|
||||
|
||||
return {
|
||||
profileIndex,
|
||||
scanLength,
|
||||
dictionaries,
|
||||
dictionaryPriorityByName,
|
||||
dictionaryFrequencyModeByName,
|
||||
};
|
||||
}
|
||||
|
||||
function normalizeFrequencyEntriesWithPriority(
|
||||
rawResult: unknown[],
|
||||
dictionaryPriorityByName: Record<string, number>,
|
||||
dictionaryFrequencyModeByName: Partial<Record<string, YomitanFrequencyMode>>,
|
||||
): YomitanTermFrequency[] {
|
||||
const normalized: YomitanTermFrequency[] = [];
|
||||
for (const entry of rawResult) {
|
||||
@@ -319,6 +356,10 @@ function normalizeFrequencyEntriesWithPriority(
|
||||
continue;
|
||||
}
|
||||
|
||||
if (dictionaryFrequencyModeByName[frequency.dictionary] === 'occurrence-based') {
|
||||
continue;
|
||||
}
|
||||
|
||||
const dictionaryPriority = dictionaryPriorityByName[frequency.dictionary];
|
||||
normalized.push({
|
||||
...frequency,
|
||||
@@ -425,8 +466,34 @@ async function requestYomitanProfileMetadata(
|
||||
acc[entry.name] = index;
|
||||
return acc;
|
||||
}, {});
|
||||
let dictionaryFrequencyModeByName = {};
|
||||
try {
|
||||
const dictionaryInfo = await invoke("getDictionaryInfo", undefined);
|
||||
dictionaryFrequencyModeByName = Array.isArray(dictionaryInfo)
|
||||
? dictionaryInfo.reduce((acc, entry) => {
|
||||
if (!entry || typeof entry !== "object" || typeof entry.title !== "string") {
|
||||
return acc;
|
||||
}
|
||||
if (
|
||||
entry.frequencyMode === "occurrence-based" ||
|
||||
entry.frequencyMode === "rank-based"
|
||||
) {
|
||||
acc[entry.title] = entry.frequencyMode;
|
||||
}
|
||||
return acc;
|
||||
}, {})
|
||||
: {};
|
||||
} catch {
|
||||
dictionaryFrequencyModeByName = {};
|
||||
}
|
||||
|
||||
return { profileIndex, scanLength, dictionaries, dictionaryPriorityByName };
|
||||
return {
|
||||
profileIndex,
|
||||
scanLength,
|
||||
dictionaries,
|
||||
dictionaryPriorityByName,
|
||||
dictionaryFrequencyModeByName
|
||||
};
|
||||
})();
|
||||
`;
|
||||
|
||||
@@ -774,7 +841,133 @@ const YOMITAN_SCANNING_HELPERS = String.raw`
|
||||
}
|
||||
return segments;
|
||||
}
|
||||
function getPreferredHeadword(dictionaryEntries, token) {
|
||||
/**
 * Coerces a raw frequency value into a positive integer.
 *
 * - number: must be finite and greater than zero; floored, but never below 1.
 * - string: the first numeric substring (optional sign, decimals, exponent) is parsed
 *   and then treated like a number.
 * - array: the first element that parses successfully wins (checked recursively).
 * Anything else, or a value that is not positive, gives null.
 */
function parsePositiveFrequencyNumber(value) {
  const toPositiveInteger = (numeric) =>
    Number.isFinite(numeric) && numeric > 0 ? Math.max(1, Math.floor(numeric)) : null;

  switch (typeof value) {
    case 'number':
      return toPositiveInteger(value);
    case 'string': {
      const numericText = /[+-]?(\d+(\.\d*)?|\.\d+)([eE][+-]?\d+)?/.exec(value.trim());
      return numericText === null ? null : toPositiveInteger(Number.parseFloat(numericText[0]));
    }
    default:
      break;
  }

  if (!Array.isArray(value)) {
    return null;
  }
  for (const candidate of value) {
    const fromCandidate = parsePositiveFrequencyNumber(candidate);
    if (fromCandidate !== null) {
      return fromCandidate;
    }
  }
  return null;
}
|
||||
/**
 * Reads a frequency from a display value.
 * A string contributes only its leading digit run (after trimming), so a value such as
 * '4073,20181句' yields 4073 and a string not starting with a digit yields null.
 * Non-string input is handed to parsePositiveFrequencyNumber unchanged.
 */
function parseDisplayFrequencyNumber(value) {
  if (typeof value !== 'string') {
    return parsePositiveFrequencyNumber(value);
  }

  const digitRun = /^\d+/.exec(value.trim());
  if (digitRun === null) {
    return null;
  }

  const rank = Number.parseInt(digitRun[0], 10);
  if (!Number.isFinite(rank) || rank <= 0) {
    return null;
  }
  return rank;
}
|
||||
/**
 * Finds the dictionary name on a frequency record.
 * Several property names are tried in a fixed order; the first one holding a string that is
 * non-empty after trimming is returned (trimmed). Returns null when none qualifies.
 */
function getFrequencyDictionaryName(frequency) {
  const fieldsInLookupOrder = [
    'dictionary',
    'dictionaryName',
    'name',
    'title',
    'dictionaryTitle',
    'dictionaryAlias'
  ];
  for (const field of fieldsInLookupOrder) {
    const raw = frequency?.[field];
    if (typeof raw !== 'string') { continue; }
    const trimmed = raw.trim();
    if (trimmed.length > 0) {
      return trimmed;
    }
  }
  return null;
}
|
||||
/**
 * Picks the frequency rank to use for one headword of a dictionary entry.
 *
 * A frequency record is considered only when:
 * - it is an object;
 * - its numeric headwordIndex equals the requested index, or it has no numeric headwordIndex
 *   and the entry has at most one headword;
 * - it names a dictionary that is not marked 'occurrence-based';
 * - a positive rank can be read from displayValue, or failing that from frequency.
 *
 * Among the remaining records the smallest priority number wins, and ties are broken by the
 * smaller rank. The priority comes from dictionaryPriorityByName, then from the record's
 * dictionaryIndex, and otherwise defaults to Number.MAX_SAFE_INTEGER.
 * Returns the winning rank, or null when no record qualifies.
 */
function getBestFrequencyRank(dictionaryEntry, headwordIndex, dictionaryPriorityByName, dictionaryFrequencyModeByName) {
  const entryHasSeveralHeadwords =
    Array.isArray(dictionaryEntry?.headwords) && dictionaryEntry.headwords.length > 1;
  const toPriority = (raw) =>
    typeof raw === 'number' && Number.isFinite(raw) ? Math.max(0, Math.floor(raw)) : null;

  let winner = null;
  for (const frequency of dictionaryEntry?.frequencies || []) {
    if (!frequency || typeof frequency !== 'object') { continue; }

    // Records without a numeric headword index are only trusted for single-headword entries.
    const belongsToHeadword =
      typeof frequency.headwordIndex === 'number'
        ? frequency.headwordIndex === headwordIndex
        : !entryHasSeveralHeadwords;
    if (!belongsToHeadword) { continue; }

    const dictionary = getFrequencyDictionaryName(frequency);
    if (!dictionary || dictionaryFrequencyModeByName[dictionary] === 'occurrence-based') { continue; }

    const rank =
      parseDisplayFrequencyNumber(frequency.displayValue) ??
      parsePositiveFrequencyNumber(frequency.frequency);
    if (rank === null) { continue; }

    const priority =
      toPriority(dictionaryPriorityByName[dictionary]) ??
      toPriority(frequency.dictionaryIndex) ??
      Number.MAX_SAFE_INTEGER;

    const improves =
      winner === null ||
      priority < winner.priority ||
      (priority === winner.priority && rank < winner.rank);
    if (improves) {
      winner = { priority, rank };
    }
  }
  return winner === null ? null : winner.rank;
}
|
||||
/**
 * Tells whether a headword was produced by an exact match on the given token.
 *
 * A source qualifies when its originalText equals the token, its matchType is 'exact' and,
 * if requirePrimary is set, it is flagged as primary.
 *
 * The headword and its sources come from an external termsFind response, so a missing
 * headword, a non-array sources value, or a null source is treated as "no match" rather
 * than raising a TypeError in the middle of a scan.
 */
function hasExactSource(headword, token, requirePrimary) {
  const sources = headword?.sources;
  if (!Array.isArray(sources)) {
    return false;
  }
  return sources.some((src) =>
    src !== null &&
    typeof src === 'object' &&
    src.originalText === token &&
    (!requirePrimary || Boolean(src.isPrimary)) &&
    src.matchType === 'exact'
  );
}
|
||||
/**
 * Lists every headword, across all dictionary entries, that hasExactSource accepts for the token.
 * Each result records the owning entry, the headword and the headword's index inside that
 * entry. Results keep entry order, then headword order. Entries whose headwords property is
 * not an array contribute nothing.
 */
function collectExactHeadwordMatches(dictionaryEntries, token, requirePrimary) {
  const found = [];
  for (const dictionaryEntry of dictionaryEntries || []) {
    if (!Array.isArray(dictionaryEntry?.headwords)) { continue; }
    for (const [headwordIndex, headword] of dictionaryEntry.headwords.entries()) {
      if (hasExactSource(headword, token, requirePrimary)) {
        found.push({ dictionaryEntry, headword, headwordIndex });
      }
    }
  }
  return found;
}
|
||||
/**
 * Tells whether two matches refer to the same headword: identical term and identical reading.
 * A reading that is not a string is compared as the empty string.
 * Returns false when either match is missing.
 */
function sameHeadword(match, preferredMatch) {
  const readingOf = (candidate) => {
    const reading = candidate.headword?.reading;
    return typeof reading === 'string' ? reading : '';
  };

  return (
    Boolean(match && preferredMatch) &&
    match.headword?.term === preferredMatch.headword?.term &&
    readingOf(match) === readingOf(preferredMatch)
  );
}
|
||||
/**
 * Returns the smallest rank that getBestFrequencyRank yields across the given matches,
 * or null when no match yields a rank. Dictionary priority is applied inside each
 * match only; across matches the numerically lowest rank is kept.
 */
function getBestFrequencyRankForMatches(matches, dictionaryPriorityByName, dictionaryFrequencyModeByName) {
  let lowest = null;
  for (const { dictionaryEntry, headwordIndex } of matches) {
    const rank = getBestFrequencyRank(
      dictionaryEntry,
      headwordIndex,
      dictionaryPriorityByName,
      dictionaryFrequencyModeByName
    );
    if (rank !== null && (lowest === null || rank < lowest)) {
      lowest = rank;
    }
  }
  return lowest;
}
|
||||
function getPreferredHeadword(dictionaryEntries, token, dictionaryPriorityByName, dictionaryFrequencyModeByName) {
|
||||
function appendDictionaryNames(target, value) {
|
||||
if (!value || typeof value !== 'object') {
|
||||
return;
|
||||
@@ -813,36 +1006,33 @@ const YOMITAN_SCANNING_HELPERS = String.raw`
|
||||
}
|
||||
return getDictionaryEntryNames(entry).some((name) => name.startsWith("SubMiner Character Dictionary"));
|
||||
}
|
||||
function hasExactPrimarySource(headword, token) {
|
||||
for (const src of headword.sources || []) {
|
||||
if (src.originalText !== token) { continue; }
|
||||
if (!src.isPrimary) { continue; }
|
||||
if (src.matchType !== 'exact') { continue; }
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
const exactPrimaryMatches = collectExactHeadwordMatches(dictionaryEntries, token, true);
|
||||
let matchedNameDictionary = false;
|
||||
if (includeNameMatchMetadata) {
|
||||
for (const dictionaryEntry of dictionaryEntries || []) {
|
||||
if (!isNameDictionaryEntry(dictionaryEntry)) { continue; }
|
||||
for (const headword of dictionaryEntry.headwords || []) {
|
||||
if (!hasExactPrimarySource(headword, token)) { continue; }
|
||||
for (const match of exactPrimaryMatches) {
|
||||
if (match.dictionaryEntry !== dictionaryEntry) { continue; }
|
||||
matchedNameDictionary = true;
|
||||
break;
|
||||
}
|
||||
if (matchedNameDictionary) { break; }
|
||||
}
|
||||
}
|
||||
for (const dictionaryEntry of dictionaryEntries || []) {
|
||||
for (const headword of dictionaryEntry.headwords || []) {
|
||||
if (!hasExactPrimarySource(headword, token)) { continue; }
|
||||
return {
|
||||
term: headword.term,
|
||||
reading: headword.reading,
|
||||
isNameMatch: matchedNameDictionary || isNameDictionaryEntry(dictionaryEntry)
|
||||
};
|
||||
}
|
||||
const preferredMatch = exactPrimaryMatches[0];
|
||||
if (preferredMatch) {
|
||||
const exactFrequencyMatches = collectExactHeadwordMatches(dictionaryEntries, token, false)
|
||||
.filter((match) => sameHeadword(match, preferredMatch));
|
||||
return {
|
||||
term: preferredMatch.headword.term,
|
||||
reading: preferredMatch.headword.reading,
|
||||
isNameMatch: matchedNameDictionary || isNameDictionaryEntry(preferredMatch.dictionaryEntry),
|
||||
frequencyRank: getBestFrequencyRankForMatches(
|
||||
exactFrequencyMatches.length > 0 ? exactFrequencyMatches : exactPrimaryMatches,
|
||||
dictionaryPriorityByName,
|
||||
dictionaryFrequencyModeByName
|
||||
)
|
||||
};
|
||||
}
|
||||
return null;
|
||||
}
|
||||
@@ -853,6 +1043,8 @@ function buildYomitanScanningScript(
|
||||
profileIndex: number,
|
||||
scanLength: number,
|
||||
includeNameMatchMetadata: boolean,
|
||||
dictionaryPriorityByName: Record<string, number>,
|
||||
dictionaryFrequencyModeByName: Partial<Record<string, YomitanFrequencyMode>>,
|
||||
): string {
|
||||
return `
|
||||
(async () => {
|
||||
@@ -876,6 +1068,8 @@ function buildYomitanScanningScript(
|
||||
});
|
||||
${YOMITAN_SCANNING_HELPERS}
|
||||
const includeNameMatchMetadata = ${includeNameMatchMetadata ? 'true' : 'false'};
|
||||
const dictionaryPriorityByName = ${JSON.stringify(dictionaryPriorityByName)};
|
||||
const dictionaryFrequencyModeByName = ${JSON.stringify(dictionaryFrequencyModeByName)};
|
||||
const text = ${JSON.stringify(text)};
|
||||
const details = {matchType: "exact", deinflect: true};
|
||||
const tokens = [];
|
||||
@@ -889,7 +1083,12 @@ ${YOMITAN_SCANNING_HELPERS}
|
||||
const originalTextLength = typeof result?.originalTextLength === "number" ? result.originalTextLength : 0;
|
||||
if (dictionaryEntries.length > 0 && originalTextLength > 0 && (originalTextLength !== character.length || isCodePointJapanese(codePoint))) {
|
||||
const source = substring.substring(0, originalTextLength);
|
||||
const preferredHeadword = getPreferredHeadword(dictionaryEntries, source);
|
||||
const preferredHeadword = getPreferredHeadword(
|
||||
dictionaryEntries,
|
||||
source,
|
||||
dictionaryPriorityByName,
|
||||
dictionaryFrequencyModeByName
|
||||
);
|
||||
if (preferredHeadword && typeof preferredHeadword.term === "string") {
|
||||
const reading = typeof preferredHeadword.reading === "string" ? preferredHeadword.reading : "";
|
||||
const segments = distributeFuriganaInflected(preferredHeadword.term, reading, source);
|
||||
@@ -900,6 +1099,10 @@ ${YOMITAN_SCANNING_HELPERS}
|
||||
startPos: i,
|
||||
endPos: i + originalTextLength,
|
||||
isNameMatch: includeNameMatchMetadata && preferredHeadword.isNameMatch === true,
|
||||
frequencyRank:
|
||||
typeof preferredHeadword.frequencyRank === "number" && Number.isFinite(preferredHeadword.frequencyRank)
|
||||
? Math.max(1, Math.floor(preferredHeadword.frequencyRank))
|
||||
: undefined,
|
||||
});
|
||||
i += originalTextLength;
|
||||
continue;
|
||||
@@ -1036,6 +1239,8 @@ export async function requestYomitanScanTokens(
|
||||
profileIndex,
|
||||
scanLength,
|
||||
options?.includeNameMatchMetadata === true,
|
||||
metadata?.dictionaryPriorityByName ?? {},
|
||||
metadata?.dictionaryFrequencyModeByName ?? {},
|
||||
),
|
||||
true,
|
||||
);
|
||||
@@ -1099,7 +1304,11 @@ async function fetchYomitanTermFrequencies(
|
||||
try {
|
||||
const rawResult = await parserWindow.webContents.executeJavaScript(script, true);
|
||||
return Array.isArray(rawResult)
|
||||
? normalizeFrequencyEntriesWithPriority(rawResult, metadata.dictionaryPriorityByName)
|
||||
? normalizeFrequencyEntriesWithPriority(
|
||||
rawResult,
|
||||
metadata.dictionaryPriorityByName,
|
||||
metadata.dictionaryFrequencyModeByName,
|
||||
)
|
||||
: [];
|
||||
} catch (err) {
|
||||
logger.error('Yomitan term frequency request failed:', (err as Error).message);
|
||||
@@ -1541,10 +1750,15 @@ export async function getYomitanDictionaryInfo(
|
||||
.map((entry) => {
|
||||
const title = typeof entry.title === 'string' ? entry.title.trim() : '';
|
||||
const revision = entry.revision;
|
||||
const frequencyMode: YomitanFrequencyMode | undefined =
|
||||
entry.frequencyMode === 'occurrence-based' || entry.frequencyMode === 'rank-based'
|
||||
? entry.frequencyMode
|
||||
: undefined;
|
||||
return {
|
||||
title,
|
||||
revision:
|
||||
typeof revision === 'string' || typeof revision === 'number' ? revision : undefined,
|
||||
frequencyMode,
|
||||
};
|
||||
})
|
||||
.filter((entry) => entry.title.length > 0);
|
||||
|
||||
Reference in New Issue
Block a user