Mirror of https://github.com/ksyasuda/SubMiner.git (synced 2026-05-04 00:41:33 -07:00)
Persist stats exclusions in DB and fix word metrics filtering
- Stats vocabulary exclusions stored in `imm_stats_excluded_words` (schema v18); seeded from localStorage on first load
- Session, overview, trends, and library word metrics use filtered persisted occurrences with raw fallback
- Session known-word % chart uses filtered persisted totals as denominator for both known and total
- JLPT subtitle styling changed to underline-only; no longer overrides text color
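The new exclusion list rides on two stats-server routes exercised in the tests below. A minimal round-trip sketch (the base URL is an assumption, not a documented default; routes and payload shape come from the tests):

```ts
const BASE = 'http://localhost:3000'; // placeholder base URL

// PUT replaces the entire list (not a delta); 400 if any row is not three strings.
await fetch(`${BASE}/api/stats/excluded-words`, {
  method: 'PUT',
  headers: { 'Content-Type': 'application/json' },
  body: JSON.stringify({ words: [{ headword: '猫', word: '猫', reading: 'ねこ' }] }),
}); // -> 200 with { ok: true }

// GET returns the persisted rows, ordered by headword/word/reading.
const words = await (await fetch(`${BASE}/api/stats/excluded-words`)).json();
```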
@@ -277,6 +277,8 @@ function createMockTracker(
   getSessionTimeline: async () => [],
   getSessionEvents: async () => [],
   getVocabularyStats: async () => VOCABULARY_STATS,
+  getStatsExcludedWords: async () => [],
+  replaceStatsExcludedWords: async () => {},
   getKanjiStats: async () => KANJI_STATS,
   getWordOccurrences: async () => OCCURRENCES,
   getKanjiOccurrences: async () => OCCURRENCES,

@@ -362,7 +364,7 @@ describe('stats server API routes', () => {
     assert.ok(Array.isArray(body));
   });

-  it('GET /api/stats/sessions enriches each session with known-word metrics when cache exists', async () => {
+  it('GET /api/stats/sessions enriches known-word metrics using filtered persisted totals', async () => {
     await withTempDir(async (dir) => {
       const cachePath = path.join(dir, 'known-words.json');
       fs.writeFileSync(

@@ -391,7 +393,7 @@ describe('stats server API routes', () => {
       const body = await res.json();
       const first = body[0];
       assert.equal(first.knownWordsSeen, 2);
-      assert.equal(first.knownWordRate, 2.5);
+      assert.equal(first.knownWordRate, 66.7);
     });
   });

@@ -436,7 +438,7 @@ describe('stats server API routes', () => {
     assert.equal(seenLimit, undefined);
   });

-  it('GET /api/stats/sessions/:id/known-words-timeline preserves line positions and counts known occurrences', async () => {
+  it('GET /api/stats/sessions/:id/known-words-timeline preserves line positions and counts filtered totals', async () => {
     await withTempDir(async (dir) => {
       const cachePath = path.join(dir, 'known-words.json');
       fs.writeFileSync(

@@ -461,8 +463,8 @@ describe('stats server API routes', () => {
       const res = await app.request('/api/stats/sessions/1/known-words-timeline');
       assert.equal(res.status, 200);
       assert.deepEqual(await res.json(), [
-        { linesSeen: 1, knownWordsSeen: 2 },
-        { linesSeen: 3, knownWordsSeen: 3 },
+        { linesSeen: 1, knownWordsSeen: 2, totalWordsSeen: 2 },
+        { linesSeen: 3, knownWordsSeen: 3, totalWordsSeen: 7 },
       ]);
     });
   });

@@ -730,6 +732,65 @@ describe('stats server API routes', () => {
     assert.equal(body[0].pos3, null);
   });

+  it('GET /api/stats/excluded-words returns tracker exclusion rows', async () => {
+    const app = createStatsApp(
+      createMockTracker({
+        getStatsExcludedWords: async () => [
+          { headword: '猫', word: '猫', reading: 'ねこ' },
+          { headword: 'する', word: 'する', reading: 'する' },
+        ],
+      }),
+    );
+
+    const res = await app.request('/api/stats/excluded-words');
+    assert.equal(res.status, 200);
+    assert.deepEqual(await res.json(), [
+      { headword: '猫', word: '猫', reading: 'ねこ' },
+      { headword: 'する', word: 'する', reading: 'する' },
+    ]);
+  });
+
+  it('PUT /api/stats/excluded-words replaces tracker exclusion rows', async () => {
+    let seenWords: unknown = null;
+    const app = createStatsApp(
+      createMockTracker({
+        replaceStatsExcludedWords: async (words: unknown) => {
+          seenWords = words;
+        },
+      }),
+    );
+
+    const res = await app.request('/api/stats/excluded-words', {
+      method: 'PUT',
+      headers: { 'Content-Type': 'application/json' },
+      body: JSON.stringify({
+        words: [
+          { headword: '猫', word: '猫', reading: 'ねこ' },
+          { headword: 'する', word: 'する', reading: 'する' },
+        ],
+      }),
+    });
+
+    assert.equal(res.status, 200);
+    assert.deepEqual(await res.json(), { ok: true });
+    assert.deepEqual(seenWords, [
+      { headword: '猫', word: '猫', reading: 'ねこ' },
+      { headword: 'する', word: 'する', reading: 'する' },
+    ]);
+  });
+
+  it('PUT /api/stats/excluded-words rejects malformed rows', async () => {
+    const app = createStatsApp(createMockTracker());
+
+    const res = await app.request('/api/stats/excluded-words', {
+      method: 'PUT',
+      headers: { 'Content-Type': 'application/json' },
+      body: JSON.stringify({ words: [{ headword: '猫', word: 7, reading: 'ねこ' }] }),
+    });
+
+    assert.equal(res.status, 400);
+  });
+
   it('GET /api/stats/anime returns anime library', async () => {
     const app = createStatsApp(createMockTracker());
     const res = await app.request('/api/stats/anime');

@@ -52,7 +52,9 @@ import {
   getKanjiWords,
   getSessionEvents,
   getSimilarWords,
+  getStatsExcludedWords,
   getVocabularyStats,
+  replaceStatsExcludedWords,
   getWordAnimeAppearances,
   getWordDetail,
   getWordOccurrences,

@@ -151,6 +153,7 @@ import {
   type SessionSummaryQueryRow,
   type SessionTimelineRow,
   type SimilarWordRow,
+  type StatsExcludedWordRow,
   type StreakCalendarRow,
   type VocabularyCleanupSummary,
   type WatchTimePerAnimeRow,

@@ -289,6 +292,7 @@ export type {
   SessionSummaryQueryRow,
   SessionTimelineRow,
   SimilarWordRow,
+  StatsExcludedWordRow,
   StreakCalendarRow,
   WatchTimePerAnimeRow,
   WordAnimeAppearanceRow,

@@ -498,6 +502,14 @@ export class ImmersionTrackerService {
     return getVocabularyStats(this.db, limit, excludePos);
   }

+  async getStatsExcludedWords(): Promise<StatsExcludedWordRow[]> {
+    return getStatsExcludedWords(this.db);
+  }
+
+  async replaceStatsExcludedWords(words: StatsExcludedWordRow[]): Promise<void> {
+    replaceStatsExcludedWords(this.db, words);
+  }
+
   async cleanupVocabularyStats(): Promise<VocabularyCleanupSummary> {
     return cleanupVocabularyStats(this.db, {
       resolveLegacyPos: this.resolveLegacyVocabularyPos,

@@ -86,6 +86,77 @@ function cleanupDbPath(dbPath: string): void {
   }
 }

+function insertFilteredWordOccurrence(
+  db: InstanceType<typeof Database>,
+  options: {
+    sessionId: number;
+    videoId: number;
+    animeId?: number | null;
+    lineIndex?: number;
+    occurrenceCount: number;
+    startedAtMs: number;
+    headword?: string;
+    word?: string;
+    reading?: string;
+    partOfSpeech?: string;
+    pos1?: string;
+    pos2?: string;
+    pos3?: string;
+  },
+): void {
+  const headword = options.headword ?? options.word ?? '猫';
+  const word = options.word ?? headword;
+  const lineId = Number(
+    db
+      .prepare(
+        `INSERT INTO imm_subtitle_lines (
+          session_id, event_id, video_id, anime_id, line_index,
+          segment_start_ms, segment_end_ms, text, CREATED_DATE, LAST_UPDATE_DATE
+        ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)`,
+      )
+      .run(
+        options.sessionId,
+        null,
+        options.videoId,
+        options.animeId ?? null,
+        options.lineIndex ?? 1,
+        0,
+        1000,
+        word,
+        options.startedAtMs,
+        options.startedAtMs,
+      ).lastInsertRowid,
+  );
+  const wordRow = db
+    .prepare(
+      `INSERT INTO imm_words (
+        headword, word, reading, pos1, pos2, pos3, part_of_speech,
+        first_seen, last_seen, frequency
+      ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
+      ON CONFLICT(headword, word, reading) DO UPDATE SET
+        frequency = imm_words.frequency + excluded.frequency,
+        last_seen = excluded.last_seen
+      RETURNING id`,
+    )
+    .get(
+      headword,
+      word,
+      options.reading ?? '',
+      options.pos1 ?? '名詞',
+      options.pos2 ?? '一般',
+      options.pos3 ?? '',
+      options.partOfSpeech ?? 'noun',
+      Math.floor(options.startedAtMs / 1000),
+      Math.floor(options.startedAtMs / 1000),
+      options.occurrenceCount,
+    ) as { id: number };
+  const wordId = Number(wordRow.id);
+
+  db.prepare(
+    `INSERT INTO imm_word_line_occurrences (line_id, word_id, occurrence_count)
+     VALUES (?, ?, ?)`,
+  ).run(lineId, wordId, options.occurrenceCount);
+}
+
 function withMockNowMs<T>(fixedDateMs: string | number, run: () => T): T {
   const previousNowMs = globalThis.__subminerTestNowMs;
   globalThis.__subminerTestNowMs = fixedDateMs;

@@ -1236,6 +1307,89 @@ test('getQueryHints computes weekly new-word cutoff from calendar midnights', ()
   });
 });

+test('word-count read models use filtered persisted occurrences with raw fallback', () => {
+  const dbPath = makeDbPath();
+  const db = new Database(dbPath);
+
+  try {
+    ensureSchema(db);
+    const videoId = getOrCreateVideoRecord(db, 'local:/tmp/filtered-word-metrics.mkv', {
+      canonicalTitle: 'Filtered Word Metrics',
+      sourcePath: '/tmp/filtered-word-metrics.mkv',
+      sourceUrl: null,
+      sourceType: SOURCE_TYPE_LOCAL,
+    });
+
+    const startedAtMs = 1_700_000_000_000;
+    const withOccurrences = startSessionRecord(db, videoId, startedAtMs);
+    const fallbackOnly = startSessionRecord(db, videoId, startedAtMs + 60_000);
+
+    db.prepare(
+      `
+      UPDATE imm_sessions
+      SET ended_at_ms = ?, status = 2, active_watched_ms = ?, tokens_seen = ?, yomitan_lookup_count = ?
+      WHERE session_id = ?
+      `,
+    ).run(startedAtMs + 30_000, 2, 5, 1, withOccurrences.sessionId);
+    db.prepare(
+      `
+      UPDATE imm_sessions
+      SET ended_at_ms = ?, status = 2, active_watched_ms = ?, tokens_seen = ?, yomitan_lookup_count = ?
+      WHERE session_id = ?
+      `,
+    ).run(startedAtMs + 90_000, 2, 7, 2, fallbackOnly.sessionId);
+
+    insertFilteredWordOccurrence(db, {
+      sessionId: withOccurrences.sessionId,
+      videoId,
+      occurrenceCount: 2,
+      startedAtMs,
+    });
+    insertFilteredWordOccurrence(db, {
+      sessionId: withOccurrences.sessionId,
+      videoId,
+      lineIndex: 2,
+      occurrenceCount: 3,
+      startedAtMs,
+      headword: 'じゃない',
+      word: 'じゃない',
+      partOfSpeech: 'i_adjective',
+      pos1: '形容詞',
+      pos2: '*|自立',
+      pos3: '*',
+    });
+
+    db.prepare(
+      `
+      INSERT INTO imm_daily_rollups (
+        rollup_day, video_id, total_sessions, total_active_min, total_lines_seen,
+        total_tokens_seen, total_cards
+      ) VALUES (?, ?, ?, ?, ?, ?, ?)
+      `,
+    ).run(Math.floor(startedAtMs / 86_400_000), videoId, 2, 1, 2, 12, 0);
+
+    const summaries = getSessionSummaries(db, 10);
+    assert.equal(
+      summaries.find((session) => session.sessionId === withOccurrences.sessionId)?.tokensSeen,
+      2,
+    );
+    assert.equal(
+      summaries.find((session) => session.sessionId === fallbackOnly.sessionId)?.tokensSeen,
+      7,
+    );
+
+    const hints = getQueryHints(db);
+    assert.equal(hints.totalTokensSeen, 9);
+
+    const rollup = getDailyRollups(db, 1)[0]!;
+    assert.equal(rollup.totalTokensSeen, 9);
+    assert.equal(rollup.tokensPerMin, 9);
+  } finally {
+    db.close();
+    cleanupDbPath(dbPath);
+  }
+});
+
 test('getQueryHints counts new words by distinct headword first-seen time', () => {
   const dbPath = makeDbPath();
   const db = new Database(dbPath);

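The fallback rule this test pins down: a session's displayed word count prefers the filtered persisted occurrences and only falls back to the raw `tokens_seen` counter when no subtitle lines were persisted for that session. A plain-TypeScript restatement of the SQL `CASE` in `sessionDisplayWordsExpr` (a sketch, not the shipped code):

```ts
// Mirrors the SQL CASE: the filtered count wins whenever any lines were persisted.
function displayWords(persistedLineCount: number, filteredWordsSeen: number, rawTokensSeen: number): number {
  return persistedLineCount > 0 ? filteredWordsSeen : rawTokensSeen;
}

// From the test: the first session persisted two lines but only 2 visible
// occurrences (じゃない is filtered out); the second persisted nothing.
displayWords(2, 2, 5); // 2
displayWords(0, 0, 7); // 7 -> total of 9 matches the hints and rollup asserts
```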
@@ -1430,6 +1584,61 @@ test('getVocabularyStats returns rows ordered by frequency descending', () => {
   }
 });

+test('getVocabularyStats filters rows that fail tokenizer vocabulary rules', () => {
+  const dbPath = makeDbPath();
+  const db = new Database(dbPath);
+
+  try {
+    ensureSchema(db);
+    const stmts = createTrackerPreparedStatements(db);
+
+    stmts.wordUpsertStmt.run(
+      'どうしても',
+      'どうしてもって',
+      'どうしてもって',
+      'other',
+      '副詞|助詞',
+      '一般|格助詞',
+      '',
+      1_000,
+      1_000,
+    );
+    stmts.wordUpsertStmt.run(
+      'じゃない',
+      'じゃない',
+      '',
+      'i_adjective',
+      '形容詞',
+      '*|自立',
+      '*',
+      1_100,
+      1_100,
+    );
+    stmts.wordUpsertStmt.run(
+      '何か',
+      '何か',
+      'なにか',
+      'other',
+      '名詞|助詞',
+      '代名詞|副助詞/並立助詞/終助詞',
+      '一般|*',
+      1_200,
+      1_200,
+    );
+    stmts.wordUpsertStmt.run('猫', '猫', 'ねこ', 'noun', '名詞', '一般', '', 1_500, 1_500);
+
+    const rows = getVocabularyStats(db, 10);
+
+    assert.deepEqual(
+      rows.map((row) => row.headword),
+      ['猫'],
+    );
+  } finally {
+    db.close();
+    cleanupDbPath(dbPath);
+  }
+});
+
 test('getVocabularyStats returns empty array when no words exist', () => {
   const dbPath = makeDbPath();
   const db = new Database(dbPath);

@@ -1475,6 +1684,22 @@ test('cleanupVocabularyStats repairs stored POS metadata and removes excluded im
       headword, word, reading, part_of_speech, pos1, pos2, pos3, first_seen, last_seen, frequency
     ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)`,
   ).run('未解決', '未解決', '', '', '', '', '', 901, 951, 1);
+  db.prepare(
+    `INSERT INTO imm_words (
+      headword, word, reading, part_of_speech, pos1, pos2, pos3, first_seen, last_seen, frequency
+    ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)`,
+  ).run(
+    'どうしても',
+    'どうしてもって',
+    'どうしてもって',
+    'other',
+    '副詞|助詞',
+    '一般|格助詞',
+    '',
+    1_110,
+    1_610,
+    7,
+  );

   const result = await cleanupVocabularyStats(db, {
     resolveLegacyPos: async (row) => {

@@ -1517,7 +1742,7 @@ test('cleanupVocabularyStats repairs stored POS metadata and removes excluded im
     pos2: string;
   }>;

-  assert.deepEqual(result, { scanned: 5, kept: 3, deleted: 2, repaired: 2 });
+  assert.deepEqual(result, { scanned: 6, kept: 3, deleted: 3, repaired: 2 });
   assert.deepEqual(
     rows.map((row) => ({ headword: row.headword, frequency: row.frequency })),
     [

@@ -2226,6 +2451,31 @@ test('getSessionWordsByLine joins word occurrences through imm_words.id', () =>
     `INSERT INTO imm_word_line_occurrences (line_id, word_id, occurrence_count)
      VALUES (?, ?, ?)`,
   ).run(lineId, wordId, 1);
+  const excludedWordId = Number(
+    db
+      .prepare(
+        `INSERT INTO imm_words (
+          headword, word, reading, pos1, pos2, pos3, part_of_speech, first_seen, last_seen, frequency
+        ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)`,
+      )
+      .run(
+        'じゃない',
+        'じゃない',
+        '',
+        '形容詞',
+        '*|自立',
+        '*',
+        'i_adjective',
+        startedAtMs,
+        startedAtMs,
+        1,
+      ).lastInsertRowid,
+  );
+
+  db.prepare(
+    `INSERT INTO imm_word_line_occurrences (line_id, word_id, occurrence_count)
+     VALUES (?, ?, ?)`,
+  ).run(lineId, excludedWordId, 3);

   assert.deepEqual(getSessionWordsByLine(db, sessionId), [
     { lineIndex: 0, headword: '猫', occurrenceCount: 1 },

@@ -3959,6 +4209,121 @@ test('getTrendsDashboard librarySummary returns null lookupsPerHundred when word
   }
 });

+test('getTrendsDashboard word metrics use filtered persisted occurrences', () => {
+  const dbPath = makeDbPath();
+  const db = new Database(dbPath);
+
+  try {
+    ensureSchema(db);
+    const stmts = createTrackerPreparedStatements(db);
+    const videoId = getOrCreateVideoRecord(db, 'local:/tmp/filtered-trends.mkv', {
+      canonicalTitle: 'Filtered Trends Episode',
+      sourcePath: '/tmp/filtered-trends.mkv',
+      sourceUrl: null,
+      sourceType: SOURCE_TYPE_LOCAL,
+    });
+    const animeId = getOrCreateAnimeRecord(db, {
+      parsedTitle: 'Filtered Trends Anime',
+      canonicalTitle: 'Filtered Trends Anime',
+      anilistId: null,
+      titleRomaji: null,
+      titleEnglish: null,
+      titleNative: null,
+      metadataJson: null,
+    });
+    linkVideoToAnimeRecord(db, videoId, {
+      animeId,
+      parsedBasename: 'filtered-trends.mkv',
+      parsedTitle: 'Filtered Trends Anime',
+      parsedSeason: 1,
+      parsedEpisode: 1,
+      parserSource: 'test',
+      parserConfidence: 1,
+      parseMetadataJson: null,
+    });
+
+    const dayOneStart = 1_700_000_000_000;
+    const dayTwoStart = dayOneStart + 86_400_000;
+    const rows = [
+      { start: dayOneStart, rawWords: 10, filteredWords: 2, lookups: 4 },
+      { start: dayTwoStart, rawWords: 20, filteredWords: 3, lookups: 6 },
+    ];
+
+    for (const [index, row] of rows.entries()) {
+      const session = startSessionRecord(db, videoId, row.start);
+      stmts.telemetryInsertStmt.run(
+        session.sessionId,
+        `${row.start + 60_000}`,
+        10 * 60_000,
+        10 * 60_000,
+        1,
+        row.rawWords,
+        0,
+        0,
+        0,
+        row.lookups,
+        0,
+        0,
+        0,
+        0,
+        `${row.start + 60_000}`,
+        `${row.start + 60_000}`,
+      );
+      db.prepare(
+        `
+        UPDATE imm_sessions
+        SET ended_at_ms = ?, total_watched_ms = ?, active_watched_ms = ?,
+            lines_seen = ?, tokens_seen = ?, cards_mined = ?, yomitan_lookup_count = ?
+        WHERE session_id = ?
+        `,
+      ).run(
+        `${row.start + 60_000}`,
+        10 * 60_000,
+        10 * 60_000,
+        1,
+        row.rawWords,
+        0,
+        row.lookups,
+        session.sessionId,
+      );
+      insertFilteredWordOccurrence(db, {
+        sessionId: session.sessionId,
+        videoId,
+        animeId,
+        lineIndex: index + 1,
+        occurrenceCount: row.filteredWords,
+        startedAtMs: row.start,
+        headword: `単語${index}`,
+      });
+      db.prepare(
+        `
+        INSERT INTO imm_daily_rollups (
+          rollup_day, video_id, total_sessions, total_active_min, total_lines_seen,
+          total_tokens_seen, total_cards
+        ) VALUES (?, ?, ?, ?, ?, ?, ?)
+        `,
+      ).run(Math.floor(row.start / 86_400_000), videoId, 1, 10, 1, row.rawWords, 0);
+    }
+
+    const dashboard = getTrendsDashboard(db, 'all', 'day');
+    assert.deepEqual(
+      dashboard.activity.words.map((point) => point.value),
+      [2, 3],
+    );
+    assert.deepEqual(
+      dashboard.progress.words.map((point) => point.value),
+      [2, 5],
+    );
+    assert.equal(dashboard.ratios.lookupsPerHundred[0]?.value, 200);
+    assert.equal(dashboard.librarySummary[0]?.words, 5);
+    assert.equal(dashboard.librarySummary[0]?.lookupsPerHundred, 200);
+    assert.equal(dashboard.animeCumulative.words.at(-1)?.value, 5);
+  } finally {
+    db.close();
+    cleanupDbPath(dbPath);
+  }
+});
+
 test('getTrendsDashboard librarySummary is empty when no rollups exist', () => {
   const dbPath = makeDbPath();
   const db = new Database(dbPath);

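Worked numbers for the dashboard assertions above: day one persists 2 filtered occurrences against 4 lookups, day two 3 against 6, so:

```ts
// Sketch of the expected series; the raw rawWords counters (10, 20) never surface.
const days = [
  { filtered: 2, lookups: 4 },
  { filtered: 3, lookups: 6 },
];
const activity = days.map((d) => d.filtered); // [2, 3]
const progress = activity.map((_, i) => activity.slice(0, i + 1).reduce((a, b) => a + b, 0)); // [2, 5]
const lookupsPerHundred = days.map((d) => (d.lookups / d.filtered) * 100); // [200, 200]
```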
@@ -1,4 +1,6 @@
 import type { DatabaseSync } from './sqlite';
+import { PartOfSpeech, type MergedToken } from '../../../types';
+import { shouldExcludeTokenFromVocabularyPersistence } from '../tokenizer/annotation-stage';
 import type {
   KanjiAnimeAppearanceRow,
   KanjiDetailRow,

@@ -7,18 +9,55 @@ import type {
   KanjiWordRow,
   SessionEventRow,
   SimilarWordRow,
+  StatsExcludedWordRow,
   VocabularyStatsRow,
   WordAnimeAppearanceRow,
   WordDetailRow,
   WordOccurrenceRow,
 } from './types';
-import { fromDbTimestamp } from './query-shared';
+import { fromDbTimestamp, toDbTimestamp } from './query-shared';
+import { nowMs } from './time';
+
+const VOCABULARY_STATS_FILTER_OVERSAMPLE_FACTOR = 4;
+const VOCABULARY_STATS_FILTER_OVERSAMPLE_MIN = 100;
+
+function toVocabularyToken(row: VocabularyStatsRow): MergedToken {
+  const partOfSpeech =
+    row.partOfSpeech && Object.values(PartOfSpeech).includes(row.partOfSpeech as PartOfSpeech)
+      ? (row.partOfSpeech as PartOfSpeech)
+      : PartOfSpeech.other;
+
+  return {
+    surface: row.word,
+    reading: row.reading ?? '',
+    headword: row.headword,
+    startPos: 0,
+    endPos: row.word.length,
+    partOfSpeech,
+    pos1: row.pos1 ?? '',
+    pos2: row.pos2 ?? '',
+    pos3: row.pos3 ?? '',
+    frequencyRank: row.frequencyRank ?? undefined,
+    isMerged: false,
+    isKnown: false,
+    isNPlusOneTarget: false,
+  };
+}
+
+function isVocabularyStatsRowVisible(row: VocabularyStatsRow): boolean {
+  return !shouldExcludeTokenFromVocabularyPersistence(toVocabularyToken(row));
+}

 export function getVocabularyStats(
   db: DatabaseSync,
   limit = 100,
   excludePos?: string[],
 ): VocabularyStatsRow[] {
+  const queryLimit = Math.max(
+    limit,
+    limit * VOCABULARY_STATS_FILTER_OVERSAMPLE_FACTOR,
+    limit + VOCABULARY_STATS_FILTER_OVERSAMPLE_MIN,
+  );
   const hasExclude = excludePos && excludePos.length > 0;
   const placeholders = hasExclude ? excludePos.map(() => '?').join(', ') : '';
   const whereClause = hasExclude

@@ -37,8 +76,48 @@ export function getVocabularyStats(
     GROUP BY w.id
     ORDER BY w.frequency DESC LIMIT ?
   `);
-  const params = hasExclude ? [...excludePos, limit] : [limit];
-  return stmt.all(...params) as VocabularyStatsRow[];
+  const params = hasExclude ? [...excludePos, queryLimit] : [queryLimit];
+  return (stmt.all(...params) as VocabularyStatsRow[])
+    .filter(isVocabularyStatsRowVisible)
+    .slice(0, limit);
 }
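The oversampling guards against the post-query filter leaving a page short: the SQL `LIMIT` is inflated before `isVocabularyStatsRowVisible` prunes rows, then the result is sliced back down. For the default `limit = 100`:

```ts
// max(100, 100 * 4, 100 + 100) = 400 rows fetched, filtered, then sliced to 100.
const queryLimit = Math.max(100, 100 * 4, 100 + 100); // 400
```

If more than 300 of those 400 rows were filtered out, the page could still come back short; the factor is a heuristic, not a guarantee.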
+
+export function getStatsExcludedWords(db: DatabaseSync): StatsExcludedWordRow[] {
+  return db
+    .prepare(
+      `
+      SELECT headword, word, reading
+      FROM imm_stats_excluded_words
+      ORDER BY headword COLLATE NOCASE, word COLLATE NOCASE, reading COLLATE NOCASE
+      `,
+    )
+    .all() as StatsExcludedWordRow[];
+}
+
+export function replaceStatsExcludedWords(db: DatabaseSync, words: StatsExcludedWordRow[]): void {
+  const now = toDbTimestamp(nowMs());
+  const insertStmt = db.prepare(`
+    INSERT OR IGNORE INTO imm_stats_excluded_words(
+      headword,
+      word,
+      reading,
+      CREATED_DATE,
+      LAST_UPDATE_DATE
+    )
+    VALUES (?, ?, ?, ?, ?)
+  `);
+
+  db.exec('BEGIN IMMEDIATE');
+  try {
+    db.prepare('DELETE FROM imm_stats_excluded_words').run();
+    for (const word of words) {
+      insertStmt.run(word.headword, word.word, word.reading, now, now);
+    }
+    db.exec('COMMIT');
+  } catch (error) {
+    db.exec('ROLLBACK');
+    throw error;
+  }
+}
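`replaceStatsExcludedWords` is deliberately replace-all: the delete and re-insert happen inside one `BEGIN IMMEDIATE` transaction, so a failed insert rolls the list back to its previous state and readers never observe a half-written list. Callers therefore always send the complete list:

```ts
// Hypothetical caller: always pass the full list, never a delta.
replaceStatsExcludedWords(db, [{ headword: '犬', word: '犬', reading: 'いぬ' }]);
// Any rows stored before the call (e.g. 猫) are gone afterwards.
```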
+
 export function getKanjiStats(db: DatabaseSync, limit = 100): KanjiStatsRow[] {

@@ -16,12 +16,31 @@ import type {
   StreakCalendarRow,
   WatchTimePerAnimeRow,
 } from './types';
-import { ACTIVE_SESSION_METRICS_CTE, fromDbTimestamp, resolvedCoverBlobExpr } from './query-shared';
+import {
+  ACTIVE_SESSION_METRICS_CTE,
+  SESSION_WORD_COUNTS_CTE,
+  SESSION_WORD_COUNTS_SELECT,
+  fromDbTimestamp,
+  resolvedCoverBlobExpr,
+  sessionDisplayWordsExpr,
+  visibleWordSql,
+} from './query-shared';

 export function getAnimeLibrary(db: DatabaseSync): AnimeLibraryRow[] {
+  const wordsExpr = sessionDisplayWordsExpr('s', 'swc');
   const rows = db
     .prepare(
       `
+      ${SESSION_WORD_COUNTS_CTE},
+      anime_word_counts AS (
+        SELECT v.anime_id AS animeId, SUM(${wordsExpr}) AS totalTokensSeen
+        FROM imm_sessions s
+        JOIN imm_videos v ON v.video_id = s.video_id
+        LEFT JOIN session_word_counts swc ON swc.sessionId = s.session_id
+        WHERE s.ended_at_ms IS NOT NULL
+          AND v.anime_id IS NOT NULL
+        GROUP BY v.anime_id
+      )
       SELECT
         a.anime_id AS animeId,
         a.canonical_title AS canonicalTitle,

@@ -29,13 +48,14 @@ export function getAnimeLibrary(db: DatabaseSync): AnimeLibraryRow[] {
         COALESCE(lm.total_sessions, 0) AS totalSessions,
         COALESCE(lm.total_active_ms, 0) AS totalActiveMs,
         COALESCE(lm.total_cards, 0) AS totalCards,
-        COALESCE(lm.total_tokens_seen, 0) AS totalTokensSeen,
+        COALESCE(awc.totalTokensSeen, lm.total_tokens_seen, 0) AS totalTokensSeen,
         COUNT(DISTINCT v.video_id) AS episodeCount,
         a.episodes_total AS episodesTotal,
         COALESCE(lm.last_watched_ms, 0) AS lastWatchedMs
       FROM imm_anime a
       JOIN imm_lifetime_anime lm ON lm.anime_id = a.anime_id
       JOIN imm_videos v ON v.anime_id = a.anime_id
+      LEFT JOIN anime_word_counts awc ON awc.animeId = a.anime_id
       GROUP BY a.anime_id
       ORDER BY totalActiveMs DESC, lm.last_watched_ms DESC, canonicalTitle ASC
      `,

@@ -48,6 +68,7 @@ export function getAnimeLibrary(db: DatabaseSync): AnimeLibraryRow[] {
 }

 export function getAnimeDetail(db: DatabaseSync, animeId: number): AnimeDetailRow | null {
+  const wordsExpr = sessionDisplayWordsExpr('s', 'swc', 'COALESCE(asm.tokensSeen, s.tokens_seen)');
   const row = db
     .prepare(
       `

@@ -63,7 +84,10 @@ export function getAnimeDetail(db: DatabaseSync, animeId: number): AnimeDetailRo
         COALESCE(lm.total_sessions, 0) AS totalSessions,
         COALESCE(lm.total_active_ms, 0) AS totalActiveMs,
         COALESCE(lm.total_cards, 0) AS totalCards,
-        COALESCE(lm.total_tokens_seen, 0) AS totalTokensSeen,
+        CASE
+          WHEN COUNT(s.session_id) > 0 THEN COALESCE(SUM(${wordsExpr}), 0)
+          ELSE COALESCE(lm.total_tokens_seen, 0)
+        END AS totalTokensSeen,
         COALESCE(lm.total_lines_seen, 0) AS totalLinesSeen,
         COALESCE(SUM(COALESCE(asm.lookupCount, s.lookup_count, 0)), 0) AS totalLookupCount,
         COALESCE(SUM(COALESCE(asm.lookupHits, s.lookup_hits, 0)), 0) AS totalLookupHits,

@@ -75,6 +99,7 @@ export function getAnimeDetail(db: DatabaseSync, animeId: number): AnimeDetailRo
       JOIN imm_videos v ON v.anime_id = a.anime_id
       LEFT JOIN imm_sessions s ON s.video_id = v.video_id
       LEFT JOIN active_session_metrics asm ON asm.sessionId = s.session_id
+      LEFT JOIN session_word_counts swc ON swc.sessionId = s.session_id
       WHERE a.anime_id = ?
       GROUP BY a.anime_id
      `,

@@ -108,6 +133,7 @@ export function getAnimeAnilistEntries(db: DatabaseSync, animeId: number): Anime
 }

 export function getAnimeEpisodes(db: DatabaseSync, animeId: number): AnimeEpisodeRow[] {
+  const wordsExpr = sessionDisplayWordsExpr('s', 'swc', 'COALESCE(asm.tokensSeen, s.tokens_seen)');
   const rows = db
     .prepare(
       `

@@ -162,12 +188,13 @@ export function getAnimeEpisodes(db: DatabaseSync, animeId: number): AnimeEpisod
         COUNT(DISTINCT s.session_id) AS totalSessions,
         COALESCE(SUM(COALESCE(asm.activeWatchedMs, s.active_watched_ms, 0)), 0) AS totalActiveMs,
         COALESCE(SUM(COALESCE(asm.cardsMined, s.cards_mined, 0)), 0) AS totalCards,
-        COALESCE(SUM(COALESCE(asm.tokensSeen, s.tokens_seen, 0)), 0) AS totalTokensSeen,
+        COALESCE(SUM(${wordsExpr}), 0) AS totalTokensSeen,
         COALESCE(SUM(COALESCE(asm.yomitanLookupCount, s.yomitan_lookup_count, 0)), 0) AS totalYomitanLookupCount,
         MAX(s.started_at_ms) AS lastWatchedMs
       FROM imm_videos v
       LEFT JOIN imm_sessions s ON s.video_id = v.video_id
       LEFT JOIN active_session_metrics asm ON asm.sessionId = s.session_id
+      LEFT JOIN session_word_counts swc ON swc.sessionId = s.session_id
       WHERE v.anime_id = ?
       GROUP BY v.video_id
       ORDER BY

@@ -192,16 +219,25 @@ export function getAnimeEpisodes(db: DatabaseSync, animeId: number): AnimeEpisod
 }

 export function getMediaLibrary(db: DatabaseSync): MediaLibraryRow[] {
+  const wordsExpr = sessionDisplayWordsExpr('s', 'swc');
   const rows = db
     .prepare(
       `
+      ${SESSION_WORD_COUNTS_CTE},
+      media_word_counts AS (
+        SELECT s.video_id AS videoId, SUM(${wordsExpr}) AS totalTokensSeen
+        FROM imm_sessions s
+        LEFT JOIN session_word_counts swc ON swc.sessionId = s.session_id
+        WHERE s.ended_at_ms IS NOT NULL
+        GROUP BY s.video_id
+      )
       SELECT
         v.video_id AS videoId,
         v.canonical_title AS canonicalTitle,
         COALESCE(lm.total_sessions, 0) AS totalSessions,
         COALESCE(lm.total_active_ms, 0) AS totalActiveMs,
         COALESCE(lm.total_cards, 0) AS totalCards,
-        COALESCE(lm.total_tokens_seen, 0) AS totalTokensSeen,
+        COALESCE(mwc.totalTokensSeen, lm.total_tokens_seen, 0) AS totalTokensSeen,
         COALESCE(lm.last_watched_ms, 0) AS lastWatchedMs,
         yv.youtube_video_id AS youtubeVideoId,
         yv.video_url AS videoUrl,

@@ -220,6 +256,7 @@ export function getMediaLibrary(db: DatabaseSync): MediaLibraryRow[] {
         END AS hasCoverArt
       FROM imm_videos v
       JOIN imm_lifetime_media lm ON lm.video_id = v.video_id
+      LEFT JOIN media_word_counts mwc ON mwc.videoId = v.video_id
       LEFT JOIN imm_media_art ma ON ma.video_id = v.video_id
       LEFT JOIN imm_youtube_videos yv ON yv.video_id = v.video_id
       ORDER BY lm.last_watched_ms DESC

@@ -233,6 +270,7 @@ export function getMediaLibrary(db: DatabaseSync): MediaLibraryRow[] {
 }

 export function getMediaDetail(db: DatabaseSync, videoId: number): MediaDetailRow | null {
+  const wordsExpr = sessionDisplayWordsExpr('s', 'swc', 'COALESCE(asm.tokensSeen, s.tokens_seen)');
   return db
     .prepare(
       `

@@ -244,7 +282,10 @@ export function getMediaDetail(db: DatabaseSync, videoId: number): MediaDetailRo
         COALESCE(lm.total_sessions, 0) AS totalSessions,
         COALESCE(lm.total_active_ms, 0) AS totalActiveMs,
         COALESCE(lm.total_cards, 0) AS totalCards,
-        COALESCE(lm.total_tokens_seen, 0) AS totalTokensSeen,
+        CASE
+          WHEN COUNT(s.session_id) > 0 THEN COALESCE(SUM(${wordsExpr}), 0)
+          ELSE COALESCE(lm.total_tokens_seen, 0)
+        END AS totalTokensSeen,
         COALESCE(lm.total_lines_seen, 0) AS totalLinesSeen,
         COALESCE(SUM(COALESCE(asm.lookupCount, s.lookup_count, 0)), 0) AS totalLookupCount,
         COALESCE(SUM(COALESCE(asm.lookupHits, s.lookup_hits, 0)), 0) AS totalLookupHits,

@@ -265,6 +306,7 @@ export function getMediaDetail(db: DatabaseSync, videoId: number): MediaDetailRo
       LEFT JOIN imm_youtube_videos yv ON yv.video_id = v.video_id
       LEFT JOIN imm_sessions s ON s.video_id = v.video_id
       LEFT JOIN active_session_metrics asm ON asm.sessionId = s.session_id
+      LEFT JOIN session_word_counts swc ON swc.sessionId = s.session_id
       WHERE v.video_id = ?
       GROUP BY v.video_id
      `,

@@ -277,6 +319,7 @@ export function getMediaSessions(
   videoId: number,
   limit = 100,
 ): SessionSummaryQueryRow[] {
+  const wordsExpr = sessionDisplayWordsExpr('s', 'swc', 'COALESCE(asm.tokensSeen, s.tokens_seen)');
   const rows = db
     .prepare(
       `

@@ -290,13 +333,14 @@ export function getMediaSessions(
         COALESCE(asm.totalWatchedMs, s.total_watched_ms, 0) AS totalWatchedMs,
         COALESCE(asm.activeWatchedMs, s.active_watched_ms, 0) AS activeWatchedMs,
         COALESCE(asm.linesSeen, s.lines_seen, 0) AS linesSeen,
-        COALESCE(asm.tokensSeen, s.tokens_seen, 0) AS tokensSeen,
+        ${wordsExpr} AS tokensSeen,
         COALESCE(asm.cardsMined, s.cards_mined, 0) AS cardsMined,
         COALESCE(asm.lookupCount, s.lookup_count, 0) AS lookupCount,
         COALESCE(asm.lookupHits, s.lookup_hits, 0) AS lookupHits,
         COALESCE(asm.yomitanLookupCount, s.yomitan_lookup_count, 0) AS yomitanLookupCount
       FROM imm_sessions s
       LEFT JOIN active_session_metrics asm ON asm.sessionId = s.session_id
+      LEFT JOIN session_word_counts swc ON swc.sessionId = s.session_id
       LEFT JOIN imm_videos v ON v.video_id = s.video_id
       WHERE s.video_id = ?
       ORDER BY s.started_at_ms DESC

@@ -321,10 +365,27 @@ export function getMediaDailyRollups(
   videoId: number,
   limit = 90,
 ): ImmersionSessionRollupRow[] {
+  const wordsExpr = sessionDisplayWordsExpr('s', 'swc');
   return db
     .prepare(
       `
-      WITH recent_days AS (
+      WITH session_word_counts AS (
+        ${SESSION_WORD_COUNTS_SELECT}
+      ),
+      daily_word_counts AS (
+        SELECT
+          CAST(
+            julianday(CAST(s.started_at_ms AS REAL) / 1000, 'unixepoch', 'localtime') - 2440587.5
+            AS INTEGER
+          ) AS rollupDay,
+          s.video_id AS videoId,
+          SUM(${wordsExpr}) AS totalTokensSeen
+        FROM imm_sessions s
+        LEFT JOIN session_word_counts swc ON swc.sessionId = s.session_id
+        WHERE s.ended_at_ms IS NOT NULL
+        GROUP BY rollupDay, s.video_id
+      ),
+      recent_days AS (
        SELECT DISTINCT rollup_day
        FROM imm_daily_rollups
        WHERE video_id = ?

@@ -337,12 +398,18 @@ export function getMediaDailyRollups(
         total_sessions AS totalSessions,
         total_active_min AS totalActiveMin,
         total_lines_seen AS totalLinesSeen,
-        total_tokens_seen AS totalTokensSeen,
+        COALESCE(dwc.totalTokensSeen, total_tokens_seen) AS totalTokensSeen,
         total_cards AS totalCards,
         cards_per_hour AS cardsPerHour,
-        tokens_per_min AS tokensPerMin,
+        CASE
+          WHEN total_active_min > 0 THEN COALESCE(dwc.totalTokensSeen, total_tokens_seen) * 1.0 / total_active_min
+          ELSE NULL
+        END AS tokensPerMin,
         lookup_hit_rate AS lookupHitRate
       FROM imm_daily_rollups
+      LEFT JOIN daily_word_counts dwc
+        ON dwc.rollupDay = rollup_day
+        AND dwc.videoId = video_id
       WHERE video_id = ?
         AND rollup_day IN (SELECT rollup_day FROM recent_days)
       ORDER BY rollup_day DESC, video_id DESC

@@ -356,10 +423,27 @@ export function getAnimeDailyRollups(
   animeId: number,
   limit = 90,
 ): ImmersionSessionRollupRow[] {
+  const wordsExpr = sessionDisplayWordsExpr('s', 'swc');
   return db
     .prepare(
       `
-      WITH recent_days AS (
+      WITH session_word_counts AS (
+        ${SESSION_WORD_COUNTS_SELECT}
+      ),
+      daily_word_counts AS (
+        SELECT
+          CAST(
+            julianday(CAST(s.started_at_ms AS REAL) / 1000, 'unixepoch', 'localtime') - 2440587.5
+            AS INTEGER
+          ) AS rollupDay,
+          s.video_id AS videoId,
+          SUM(${wordsExpr}) AS totalTokensSeen
+        FROM imm_sessions s
+        LEFT JOIN session_word_counts swc ON swc.sessionId = s.session_id
+        WHERE s.ended_at_ms IS NOT NULL
+        GROUP BY rollupDay, s.video_id
+      ),
+      recent_days AS (
        SELECT DISTINCT r.rollup_day
        FROM imm_daily_rollups r
        JOIN imm_videos v ON v.video_id = r.video_id

@@ -370,11 +454,19 @@ export function getAnimeDailyRollups(
       SELECT r.rollup_day AS rollupDayOrMonth, r.video_id AS videoId,
         r.total_sessions AS totalSessions, r.total_active_min AS totalActiveMin,
         r.total_lines_seen AS totalLinesSeen,
-        r.total_tokens_seen AS totalTokensSeen, r.total_cards AS totalCards,
-        r.cards_per_hour AS cardsPerHour, r.tokens_per_min AS tokensPerMin,
+        COALESCE(dwc.totalTokensSeen, r.total_tokens_seen) AS totalTokensSeen,
+        r.total_cards AS totalCards,
+        r.cards_per_hour AS cardsPerHour,
+        CASE
+          WHEN r.total_active_min > 0 THEN COALESCE(dwc.totalTokensSeen, r.total_tokens_seen) * 1.0 / r.total_active_min
+          ELSE NULL
+        END AS tokensPerMin,
         r.lookup_hit_rate AS lookupHitRate
       FROM imm_daily_rollups r
       JOIN imm_videos v ON v.video_id = r.video_id
+      LEFT JOIN daily_word_counts dwc
+        ON dwc.rollupDay = r.rollup_day
+        AND dwc.videoId = r.video_id
       WHERE v.anime_id = ?
         AND r.rollup_day IN (SELECT rollup_day FROM recent_days)
       ORDER BY r.rollup_day DESC, r.video_id DESC

@@ -470,7 +562,7 @@ export function getAnimeWords(db: DatabaseSync, animeId: number, limit = 50): An
       FROM imm_word_line_occurrences o
       JOIN imm_subtitle_lines sl ON sl.line_id = o.line_id
       JOIN imm_words w ON w.id = o.word_id
-      WHERE sl.anime_id = ?
+      WHERE sl.anime_id = ? AND ${visibleWordSql('w')}
       GROUP BY w.id
       ORDER BY frequency DESC
       LIMIT ?

@@ -556,6 +648,7 @@ export function getEpisodeWords(db: DatabaseSync, videoId: number, limit = 50):
 }

 export function getEpisodeSessions(db: DatabaseSync, videoId: number): SessionSummaryQueryRow[] {
+  const wordsExpr = sessionDisplayWordsExpr('s', 'swc', 'COALESCE(asm.tokensSeen, s.tokens_seen)');
   const rows = db
     .prepare(
       `

@@ -567,7 +660,7 @@ export function getEpisodeSessions(db: DatabaseSync, videoId: number): SessionSu
         COALESCE(asm.totalWatchedMs, s.total_watched_ms, 0) AS totalWatchedMs,
         COALESCE(asm.activeWatchedMs, s.active_watched_ms, 0) AS activeWatchedMs,
         COALESCE(asm.linesSeen, s.lines_seen, 0) AS linesSeen,
-        COALESCE(asm.tokensSeen, s.tokens_seen, 0) AS tokensSeen,
+        ${wordsExpr} AS tokensSeen,
         COALESCE(asm.cardsMined, s.cards_mined, 0) AS cardsMined,
         COALESCE(asm.lookupCount, s.lookup_count, 0) AS lookupCount,
         COALESCE(asm.lookupHits, s.lookup_hits, 0) AS lookupHits,

@@ -575,6 +668,7 @@ export function getEpisodeSessions(db: DatabaseSync, videoId: number): SessionSu
       FROM imm_sessions s
       JOIN imm_videos v ON v.video_id = s.video_id
       LEFT JOIN active_session_metrics asm ON asm.sessionId = s.session_id
+      LEFT JOIN session_word_counts swc ON swc.sessionId = s.session_id
       WHERE s.video_id = ?
       ORDER BY s.started_at_ms DESC
      `,

@@ -6,14 +6,18 @@ import type {
 } from './types';
 import {
   ACTIVE_SESSION_METRICS_CTE,
+  SESSION_WORD_COUNTS_CTE,
+  SESSION_WORD_COUNTS_SELECT,
   currentDbTimestamp,
   fromDbTimestamp,
   getLocalEpochDay,
   getShiftedLocalDaySec,
   toDbTimestamp,
+  sessionDisplayWordsExpr,
+  visibleWordSql,
 } from './query-shared';

 export function getSessionSummaries(db: DatabaseSync, limit = 50): SessionSummaryQueryRow[] {
+  const wordsExpr = sessionDisplayWordsExpr('s', 'swc', 'COALESCE(asm.tokensSeen, s.tokens_seen)');
   const prepared = db.prepare(`
     ${ACTIVE_SESSION_METRICS_CTE}
     SELECT

@@ -27,13 +31,14 @@ export function getSessionSummaries(db: DatabaseSync, limit = 50): SessionSummar
       COALESCE(asm.totalWatchedMs, s.total_watched_ms, 0) AS totalWatchedMs,
       COALESCE(asm.activeWatchedMs, s.active_watched_ms, 0) AS activeWatchedMs,
       COALESCE(asm.linesSeen, s.lines_seen, 0) AS linesSeen,
-      COALESCE(asm.tokensSeen, s.tokens_seen, 0) AS tokensSeen,
+      ${wordsExpr} AS tokensSeen,
      COALESCE(asm.cardsMined, s.cards_mined, 0) AS cardsMined,
      COALESCE(asm.lookupCount, s.lookup_count, 0) AS lookupCount,
      COALESCE(asm.lookupHits, s.lookup_hits, 0) AS lookupHits,
      COALESCE(asm.yomitanLookupCount, s.yomitan_lookup_count, 0) AS yomitanLookupCount
    FROM imm_sessions s
    LEFT JOIN active_session_metrics asm ON asm.sessionId = s.session_id
+    LEFT JOIN session_word_counts swc ON swc.sessionId = s.session_id
    LEFT JOIN imm_videos v ON v.video_id = s.video_id
    LEFT JOIN imm_anime a ON a.anime_id = v.anime_id
    ORDER BY s.started_at_ms DESC
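For reference, the `sessionDisplayWordsExpr('s', 'swc', 'COALESCE(asm.tokensSeen, s.tokens_seen)')` call above expands to roughly the following SQL (reconstructed from its definition in query-shared further down):

```ts
const expanded = `CASE
  WHEN COALESCE(swc.persistedLineCount, 0) > 0 THEN COALESCE(swc.filteredWordsSeen, 0)
  ELSE COALESCE(COALESCE(asm.tokensSeen, s.tokens_seen), 0)
END`;
```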
@@ -94,7 +99,9 @@ export function getSessionTimeline(

 /** Returns all distinct headwords in the vocabulary table (global). */
 export function getAllDistinctHeadwords(db: DatabaseSync): string[] {
-  const rows = db.prepare('SELECT DISTINCT headword FROM imm_words').all() as Array<{
+  const rows = db
+    .prepare(`SELECT DISTINCT headword FROM imm_words w WHERE ${visibleWordSql('w')}`)
+    .all() as Array<{
     headword: string;
   }>;
   return rows.map((r) => r.headword);

@@ -109,7 +116,7 @@ export function getAnimeDistinctHeadwords(db: DatabaseSync, animeId: number): st
       FROM imm_word_line_occurrences o
       JOIN imm_subtitle_lines sl ON sl.line_id = o.line_id
       JOIN imm_words w ON w.id = o.word_id
-      WHERE sl.anime_id = ?
+      WHERE sl.anime_id = ? AND ${visibleWordSql('w')}
      `,
    )
    .all(animeId) as Array<{ headword: string }>;

@@ -125,7 +132,7 @@ export function getMediaDistinctHeadwords(db: DatabaseSync, videoId: number): st
       FROM imm_word_line_occurrences o
       JOIN imm_subtitle_lines sl ON sl.line_id = o.line_id
       JOIN imm_words w ON w.id = o.word_id
-      WHERE sl.video_id = ?
+      WHERE sl.video_id = ? AND ${visibleWordSql('w')}
      `,
    )
    .all(videoId) as Array<{ headword: string }>;

@@ -148,7 +155,7 @@ export function getSessionWordsByLine(
     FROM imm_subtitle_lines sl
     JOIN imm_word_line_occurrences wlo ON wlo.line_id = sl.line_id
     JOIN imm_words w ON w.id = wlo.word_id
-    WHERE sl.session_id = ?
+    WHERE sl.session_id = ? AND ${visibleWordSql('w')}
     ORDER BY sl.line_index ASC
   `);
   return stmt.all(sessionId) as Array<{

@@ -290,11 +297,17 @@ export function getQueryHints(db: DatabaseSync): {
   const totalCards = Number(lifetime?.totalCards ?? 0);
   const activeDays = Number(lifetime?.activeDays ?? 0);

+  const lookupWordsExpr = sessionDisplayWordsExpr(
+    's',
+    'swc',
+    'COALESCE(t.tokens_seen, s.tokens_seen)',
+  );
   const lookupTotals = db
     .prepare(
       `
+      ${SESSION_WORD_COUNTS_CTE}
       SELECT
-        COALESCE(SUM(COALESCE(t.tokens_seen, s.tokens_seen, 0)), 0) AS totalTokensSeen,
+        COALESCE(SUM(${lookupWordsExpr}), 0) AS totalTokensSeen,
         COALESCE(SUM(COALESCE(t.lookup_count, s.lookup_count, 0)), 0) AS totalLookupCount,
         COALESCE(SUM(COALESCE(t.lookup_hits, s.lookup_hits, 0)), 0) AS totalLookupHits,
         COALESCE(SUM(COALESCE(t.yomitan_lookup_count, s.yomitan_lookup_count, 0)), 0) AS totalYomitanLookupCount

@@ -309,6 +322,7 @@ export function getQueryHints(db: DatabaseSync): {
         FROM imm_session_telemetry
         GROUP BY session_id
       ) t ON t.session_id = s.session_id
+      LEFT JOIN session_word_counts swc ON swc.sessionId = s.session_id
      WHERE s.ended_at_ms IS NOT NULL
      `,
    )

@@ -338,8 +352,25 @@ export function getQueryHints(db: DatabaseSync): {
 }

 export function getDailyRollups(db: DatabaseSync, limit = 60): ImmersionSessionRollupRow[] {
+  const wordsExpr = sessionDisplayWordsExpr('s', 'swc');
   const prepared = db.prepare(`
-    WITH recent_days AS (
+    WITH session_word_counts AS (
+      ${SESSION_WORD_COUNTS_SELECT}
+    ),
+    daily_word_counts AS (
+      SELECT
+        CAST(
+          julianday(CAST(s.started_at_ms AS REAL) / 1000, 'unixepoch', 'localtime') - 2440587.5
+          AS INTEGER
+        ) AS rollupDay,
+        s.video_id AS videoId,
+        SUM(${wordsExpr}) AS totalTokensSeen
+      FROM imm_sessions s
+      LEFT JOIN session_word_counts swc ON swc.sessionId = s.session_id
+      WHERE s.ended_at_ms IS NOT NULL
+      GROUP BY rollupDay, s.video_id
+    ),
+    recent_days AS (
      SELECT DISTINCT rollup_day
      FROM imm_daily_rollups
      ORDER BY rollup_day DESC

@@ -351,12 +382,21 @@ export function getDailyRollups(db: DatabaseSync, limit = 60): ImmersionSessionR
       r.total_sessions AS totalSessions,
       r.total_active_min AS totalActiveMin,
       r.total_lines_seen AS totalLinesSeen,
-      r.total_tokens_seen AS totalTokensSeen,
+      COALESCE(dwc.totalTokensSeen, r.total_tokens_seen) AS totalTokensSeen,
       r.total_cards AS totalCards,
       r.cards_per_hour AS cardsPerHour,
-      r.tokens_per_min AS tokensPerMin,
+      CASE
+        WHEN r.total_active_min > 0 THEN COALESCE(dwc.totalTokensSeen, r.total_tokens_seen) * 1.0 / r.total_active_min
+        ELSE NULL
+      END AS tokensPerMin,
       r.lookup_hit_rate AS lookupHitRate
     FROM imm_daily_rollups r
+    LEFT JOIN daily_word_counts dwc
+      ON dwc.rollupDay = r.rollup_day
+      AND (
+        (dwc.videoId IS NULL AND r.video_id IS NULL)
+        OR dwc.videoId = r.video_id
+      )
     WHERE r.rollup_day IN (SELECT rollup_day FROM recent_days)
     ORDER BY r.rollup_day DESC, r.video_id DESC
   `);

@@ -365,33 +405,53 @@ export function getDailyRollups(db: DatabaseSync, limit = 60): ImmersionSessionR
 }

 export function getMonthlyRollups(db: DatabaseSync, limit = 24): ImmersionSessionRollupRow[] {
+  const wordsExpr = sessionDisplayWordsExpr('s', 'swc');
   const prepared = db.prepare(`
-    WITH recent_months AS (
+    WITH session_word_counts AS (
+      ${SESSION_WORD_COUNTS_SELECT}
+    ),
+    monthly_word_counts AS (
+      SELECT
+        CAST(strftime('%Y%m', CAST(s.started_at_ms AS REAL) / 1000, 'unixepoch', 'localtime') AS INTEGER) AS rollupMonth,
+        s.video_id AS videoId,
+        SUM(${wordsExpr}) AS totalTokensSeen
+      FROM imm_sessions s
+      LEFT JOIN session_word_counts swc ON swc.sessionId = s.session_id
+      WHERE s.ended_at_ms IS NOT NULL
+      GROUP BY rollupMonth, s.video_id
+    ),
+    recent_months AS (
      SELECT DISTINCT rollup_month
      FROM imm_monthly_rollups
      ORDER BY rollup_month DESC
      LIMIT ?
    )
    SELECT
-      rollup_month AS rollupDayOrMonth,
-      video_id AS videoId,
-      total_sessions AS totalSessions,
-      total_active_min AS totalActiveMin,
-      total_lines_seen AS totalLinesSeen,
-      total_tokens_seen AS totalTokensSeen,
-      total_cards AS totalCards,
+      r.rollup_month AS rollupDayOrMonth,
+      r.video_id AS videoId,
+      r.total_sessions AS totalSessions,
+      r.total_active_min AS totalActiveMin,
+      r.total_lines_seen AS totalLinesSeen,
+      COALESCE(mwc.totalTokensSeen, r.total_tokens_seen) AS totalTokensSeen,
+      r.total_cards AS totalCards,
      CASE
-        WHEN total_active_min > 0 THEN (total_cards * 60.0) / total_active_min
+        WHEN r.total_active_min > 0 THEN (r.total_cards * 60.0) / r.total_active_min
        ELSE NULL
      END AS cardsPerHour,
      CASE
-        WHEN total_active_min > 0 THEN total_tokens_seen * 1.0 / total_active_min
+        WHEN r.total_active_min > 0 THEN COALESCE(mwc.totalTokensSeen, r.total_tokens_seen) * 1.0 / r.total_active_min
        ELSE NULL
      END AS tokensPerMin,
      NULL AS lookupHitRate
-    FROM imm_monthly_rollups
-    WHERE rollup_month IN (SELECT rollup_month FROM recent_months)
-    ORDER BY rollup_month DESC, video_id DESC
+    FROM imm_monthly_rollups r
+    LEFT JOIN monthly_word_counts mwc
+      ON mwc.rollupMonth = r.rollup_month
+      AND (
+        (mwc.videoId IS NULL AND r.video_id IS NULL)
+        OR mwc.videoId = r.video_id
+      )
+    WHERE r.rollup_month IN (SELECT rollup_month FROM recent_months)
+    ORDER BY r.rollup_month DESC, r.video_id DESC
  `);
  return prepared.all(limit) as unknown as ImmersionSessionRollupRow[];
 }

@@ -1,6 +1,42 @@
 import type { DatabaseSync } from './sqlite';
+import { SUBTITLE_ANNOTATION_EXCLUDED_TERMS } from '../tokenizer/subtitle-annotation-filter';
 import { nowMs } from './time';

+function quoteSqlString(value: string): string {
+  return `'${value.replaceAll("'", "''")}'`;
+}
+
+const SQL_EXCLUDED_VOCABULARY_TERMS = [...SUBTITLE_ANNOTATION_EXCLUDED_TERMS].map(quoteSqlString);
+const SQL_EXCLUDED_VOCABULARY_TERMS_LIST =
+  SQL_EXCLUDED_VOCABULARY_TERMS.length > 0 ? SQL_EXCLUDED_VOCABULARY_TERMS.join(', ') : "''";
+
+export function visibleWordSql(wordAlias: string): string {
+  return `(
+    TRIM(COALESCE(${wordAlias}.word, '')) NOT IN (${SQL_EXCLUDED_VOCABULARY_TERMS_LIST})
+    AND TRIM(COALESCE(${wordAlias}.headword, '')) NOT IN (${SQL_EXCLUDED_VOCABULARY_TERMS_LIST})
+    AND TRIM(COALESCE(${wordAlias}.reading, '')) NOT IN (${SQL_EXCLUDED_VOCABULARY_TERMS_LIST})
+  )`;
+}
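As an illustration, if `SUBTITLE_ANNOTATION_EXCLUDED_TERMS` contained just the two terms `♪` and `〜` (hypothetical values; the real set lives in subtitle-annotation-filter), `visibleWordSql('w')` would render as:

```ts
const fragment = `(
  TRIM(COALESCE(w.word, '')) NOT IN ('♪', '〜')
  AND TRIM(COALESCE(w.headword, '')) NOT IN ('♪', '〜')
  AND TRIM(COALESCE(w.reading, '')) NOT IN ('♪', '〜')
)`;
```

`quoteSqlString` doubles any embedded single quotes, so the inlined list stays valid SQL even for terms containing `'`.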
+
+export function filteredWordOccurrenceCountSql(occurrenceAlias: string, wordAlias: string): string {
+  return `CASE
+    WHEN ${occurrenceAlias}.word_id IS NOT NULL AND ${visibleWordSql(wordAlias)}
+    THEN ${occurrenceAlias}.occurrence_count
+    ELSE 0
+  END`;
+}
+
+export const SESSION_WORD_COUNTS_SELECT = `
+  SELECT
+    sl.session_id AS sessionId,
+    COUNT(DISTINCT sl.line_id) AS persistedLineCount,
+    COALESCE(SUM(${filteredWordOccurrenceCountSql('wlo', 'w')}), 0) AS filteredWordsSeen
+  FROM imm_subtitle_lines sl
+  LEFT JOIN imm_word_line_occurrences wlo ON wlo.line_id = sl.line_id
+  LEFT JOIN imm_words w ON w.id = wlo.word_id
+  GROUP BY sl.session_id
+`;
+
 export const ACTIVE_SESSION_METRICS_CTE = `
   WITH active_session_metrics AS (
     SELECT

@@ -17,9 +53,29 @@ export const ACTIVE_SESSION_METRICS_CTE = `
     JOIN imm_sessions s ON s.session_id = t.session_id
     WHERE s.ended_at_ms IS NULL
     GROUP BY t.session_id
   ),
+  session_word_counts AS (
+    ${SESSION_WORD_COUNTS_SELECT}
+  )
 `;

+export const SESSION_WORD_COUNTS_CTE = `
+  WITH session_word_counts AS (
+    ${SESSION_WORD_COUNTS_SELECT}
+  )
+`;
+
+export function sessionDisplayWordsExpr(
+  sessionAlias: string,
+  wordCountAlias: string,
+  rawTokensExpr = `${sessionAlias}.tokens_seen`,
+): string {
+  return `CASE
+    WHEN COALESCE(${wordCountAlias}.persistedLineCount, 0) > 0 THEN COALESCE(${wordCountAlias}.filteredWordsSeen, 0)
+    ELSE COALESCE(${rawTokensExpr}, 0)
+  END`;
+}
+
 export function makePlaceholders(values: number[]): string {
   return values.map(() => '?').join(',');
 }

@@ -9,6 +9,7 @@ import {
   getLocalMonthKey,
   getShiftedLocalDayTimestamp,
   makePlaceholders,
+  sessionDisplayWordsExpr,
   toDbTimestamp,
 } from './query-shared';
 import { getDailyRollups, getMonthlyRollups } from './query-sessions';

@@ -560,6 +561,7 @@ function getTrendSessionMetrics(
   db: DatabaseSync,
   cutoffMs: string | null,
 ): TrendSessionMetricRow[] {
+  const wordsExpr = sessionDisplayWordsExpr('s', 'swc', 'COALESCE(asm.tokensSeen, s.tokens_seen)');
   const whereClause = cutoffMs === null ? '' : 'WHERE s.started_at_ms >= ?';
   const cutoffValue = cutoffMs === null ? null : toDbTimestamp(cutoffMs);
   const prepared = db.prepare(`

@@ -570,11 +572,12 @@ function getTrendSessionMetrics(
       v.canonical_title AS canonicalTitle,
       a.canonical_title AS animeTitle,
       COALESCE(asm.activeWatchedMs, s.active_watched_ms, 0) AS activeWatchedMs,
-      COALESCE(asm.tokensSeen, s.tokens_seen, 0) AS tokensSeen,
+      ${wordsExpr} AS tokensSeen,
      COALESCE(asm.cardsMined, s.cards_mined, 0) AS cardsMined,
      COALESCE(asm.yomitanLookupCount, s.yomitan_lookup_count, 0) AS yomitanLookupCount
    FROM imm_sessions s
    LEFT JOIN active_session_metrics asm ON asm.sessionId = s.session_id
+    LEFT JOIN session_word_counts swc ON swc.sessionId = s.session_id
    LEFT JOIN imm_videos v ON v.video_id = s.video_id
    LEFT JOIN imm_anime a ON a.anime_id = v.anime_id
    ${whereClause}

@@ -4,6 +4,7 @@ import os from 'node:os';
 import path from 'node:path';
 import test from 'node:test';
 import { Database } from './sqlite';
+import { getStatsExcludedWords, replaceStatsExcludedWords } from './query-lexical';
 import { finalizeSessionRecord, startSessionRecord } from './session';
 import {
   applyPragmas,

@@ -113,6 +114,7 @@ test('ensureSchema creates immersion core tables', () => {
   assert.ok(tableNames.has('imm_rollup_state'));
   assert.ok(tableNames.has('imm_cover_art_blobs'));
   assert.ok(tableNames.has('imm_youtube_videos'));
+  assert.ok(tableNames.has('imm_stats_excluded_words'));

   const videoColumns = new Set(
     (

@@ -153,6 +155,32 @@ test('ensureSchema creates immersion core tables', () => {
   }
 });

+test('stats excluded words are replaced and read from sqlite storage', () => {
+  const dbPath = makeDbPath();
+  const db = new Database(dbPath);
+
+  try {
+    ensureSchema(db);
+
+    replaceStatsExcludedWords(db, [
+      { headword: '猫', word: '猫', reading: 'ねこ' },
+      { headword: 'する', word: 'する', reading: 'する' },
+    ]);
+    assert.deepEqual(getStatsExcludedWords(db), [
+      { headword: 'する', word: 'する', reading: 'する' },
+      { headword: '猫', word: '猫', reading: 'ねこ' },
+    ]);
+
+    replaceStatsExcludedWords(db, [{ headword: '犬', word: '犬', reading: 'いぬ' }]);
+    assert.deepEqual(getStatsExcludedWords(db), [
+      { headword: '犬', word: '犬', reading: 'いぬ' },
+    ]);
+  } finally {
+    db.close();
+    cleanupDbPath(dbPath);
+  }
+});
+
 test('ensureSchema adds youtube metadata table to existing schema version 15 databases', () => {
   const dbPath = makeDbPath();
   const db = new Database(dbPath);

@@ -464,6 +464,19 @@ function ensureLifetimeSummaryTables(db: DatabaseSync): void {
   `);
 }

+function ensureStatsExcludedWordsTable(db: DatabaseSync): void {
+  db.exec(`
+    CREATE TABLE IF NOT EXISTS imm_stats_excluded_words(
+      headword TEXT NOT NULL,
+      word TEXT NOT NULL,
+      reading TEXT NOT NULL,
+      CREATED_DATE TEXT,
+      LAST_UPDATE_DATE TEXT,
+      PRIMARY KEY(headword, word, reading)
+    )
+  `);
+}
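`CREATE TABLE IF NOT EXISTS` makes this helper idempotent, which is why `ensureSchema` below can call it on both the fast path (schema version already current) and the full migration path. The composite primary key is also what lets `replaceStatsExcludedWords` use `INSERT OR IGNORE` safely (timestamp literals here are placeholders):

```ts
// Sketch: running the same INSERT OR IGNORE twice leaves a single row,
// thanks to PRIMARY KEY(headword, word, reading).
const insert = `
  INSERT OR IGNORE INTO imm_stats_excluded_words(headword, word, reading, CREATED_DATE, LAST_UPDATE_DATE)
  VALUES ('猫', '猫', 'ねこ', '2026-01-01T00:00:00Z', '2026-01-01T00:00:00Z')
`;
db.exec(insert);
db.exec(insert); // ignored: duplicate key
```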
+
 export function getOrCreateAnimeRecord(db: DatabaseSync, input: AnimeRecordInput): number {
   const normalizedTitleKey = normalizeAnimeIdentityKey(input.parsedTitle);
   if (!normalizedTitleKey) {

@@ -678,6 +691,7 @@ export function ensureSchema(db: DatabaseSync): void {
     .get() as { schema_version: number } | null;
   if (currentVersion?.schema_version === SCHEMA_VERSION) {
     ensureLifetimeSummaryTables(db);
+    ensureStatsExcludedWordsTable(db);
     return;
   }

@@ -1221,6 +1235,7 @@ export function ensureSchema(db: DatabaseSync): void {
   migrateSessionEventTimestampsToText(db);

   ensureLifetimeSummaryTables(db);
+  ensureStatsExcludedWordsTable(db);

   db.exec(`
     CREATE INDEX IF NOT EXISTS idx_anime_normalized_title

@@ -1,4 +1,4 @@
-export const SCHEMA_VERSION = 17;
+export const SCHEMA_VERSION = 18;
 export const DEFAULT_QUEUE_CAP = 1_000;
 export const DEFAULT_BATCH_SIZE = 25;
 export const DEFAULT_FLUSH_INTERVAL_MS = 500;

@@ -301,6 +301,12 @@ export interface VocabularyStatsRow {
   lastSeen: number;
 }

+export interface StatsExcludedWordRow {
+  headword: string;
+  word: string;
+  reading: string;
+}
+
 export interface VocabularyCleanupSummary {
   scanned: number;
   kept: number;

@@ -20,6 +20,12 @@ type StatsServerNoteInfo = {
|
||||
fields: Record<string, { value: string }>;
|
||||
};
|
||||
|
||||
type StatsExcludedWordPayload = {
|
||||
headword: string;
|
||||
word: string;
|
||||
reading: string;
|
||||
};
|
||||
|
||||
function parseIntQuery(raw: string | undefined, fallback: number, maxLimit?: number): number {
|
||||
if (raw === undefined) return fallback;
|
||||
const n = Number(raw);
|
||||
@@ -49,6 +55,23 @@ function parseEventTypesQuery(raw: string | undefined): number[] | undefined {
|
||||
return parsed.length > 0 ? parsed : undefined;
|
||||
}
|
||||
|
||||
function parseExcludedWordsBody(body: unknown): StatsExcludedWordPayload[] | null {
|
||||
if (!body || typeof body !== 'object' || !Array.isArray((body as { words?: unknown }).words)) {
|
||||
return null;
|
||||
}
|
||||
|
||||
const words: StatsExcludedWordPayload[] = [];
|
||||
for (const row of (body as { words: unknown[] }).words) {
|
||||
if (!row || typeof row !== 'object') return null;
|
||||
const { headword, word, reading } = row as Record<string, unknown>;
|
||||
if (typeof headword !== 'string' || typeof word !== 'string' || typeof reading !== 'string') {
|
||||
return null;
|
||||
}
|
||||
words.push({ headword, word, reading });
|
||||
}
|
||||
return words;
|
||||
}
|
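
For reference, a payload the parser above accepts versus ones it rejects (a usage sketch; the endpoint wiring appears further down in this diff):

	parseExcludedWordsBody({ words: [{ headword: '猫', word: '猫', reading: 'ねこ' }] });
	// -> [{ headword: '猫', word: '猫', reading: 'ねこ' }]
	parseExcludedWordsBody({ words: [{ headword: '猫' }] }); // -> null (missing fields)
	parseExcludedWordsBody({ words: 'nope' });               // -> null (words must be an array)
	parseExcludedWordsBody(null);                            // -> null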

function resolveStatsNoteFieldName(
	noteInfo: StatsServerNoteInfo,
	...preferredNames: (string | undefined)[]
@@ -161,6 +184,21 @@ function toKnownWordRate(knownWordsSeen: number, tokensSeen: number): number {
	return Number(((knownWordsSeen / tokensSeen) * 100).toFixed(1));
}

function summarizeFilteredWordOccurrences(
	wordsByLine: Array<{ lineIndex: number; headword: string; occurrenceCount: number }>,
	knownWordsSet: Set<string>,
): { knownWordsSeen: number; totalWordsSeen: number } {
	let knownWordsSeen = 0;
	let totalWordsSeen = 0;
	for (const row of wordsByLine) {
		totalWordsSeen += row.occurrenceCount;
		if (knownWordsSet.has(row.headword)) {
			knownWordsSeen += row.occurrenceCount;
		}
	}
	return { knownWordsSeen, totalWordsSeen };
}
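
A worked example of the summary feeding the rate, using hypothetical occurrence rows:

	const summary = summarizeFilteredWordOccurrences(
		[
			{ lineIndex: 0, headword: '猫', occurrenceCount: 2 }, // known
			{ lineIndex: 1, headword: '犬', occurrenceCount: 1 }, // unknown
		],
		new Set(['猫']),
	);
	// summary = { knownWordsSeen: 2, totalWordsSeen: 3 }
	toKnownWordRate(summary.knownWordsSeen, summary.totalWordsSeen); // 66.7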

async function enrichSessionsWithKnownWordMetrics(
	tracker: ImmersionTrackerService,
	sessions: Array<{
@@ -188,21 +226,21 @@ async function enrichSessionsWithKnownWordMetrics(
	const enriched = await Promise.all(
		sessions.map(async (session) => {
			let knownWordsSeen = 0;
			let totalWordsSeen = 0;
			try {
				const wordsByLine = await tracker.getSessionWordsByLine(session.sessionId);
				for (const row of wordsByLine) {
					if (knownWordsSet.has(row.headword)) {
						knownWordsSeen += row.occurrenceCount;
					}
				}
				const summary = summarizeFilteredWordOccurrences(wordsByLine, knownWordsSet);
				knownWordsSeen = summary.knownWordsSeen;
				totalWordsSeen = summary.totalWordsSeen;
			} catch {
				knownWordsSeen = 0;
				totalWordsSeen = 0;
			}

			return {
				...session,
				knownWordsSeen,
				knownWordRate: toKnownWordRate(knownWordsSeen, session.tokensSeen),
				knownWordRate: toKnownWordRate(knownWordsSeen, totalWordsSeen),
			};
		}),
	);
@@ -391,32 +429,45 @@ export function createStatsApp(
		const id = parseIntQuery(c.req.param('id'), 0);
		if (id <= 0) return c.json([], 400);

		const knownWordsSet = loadKnownWordsSet(options?.knownWordCachePath);
		if (!knownWordsSet) return c.json([]);
		const knownWordsSet = loadKnownWordsSet(options?.knownWordCachePath) ?? new Set<string>();

		// Get per-line word occurrences for the session.
		const wordsByLine = await tracker.getSessionWordsByLine(id);

		// Build cumulative known-word occurrence count per recorded line index.
		// Build cumulative filtered occurrence counts per recorded line index.
		// The stats UI uses line-count progress to align this series with the session
		// timeline, so preserve the stored line position rather than compressing gaps.
		const lineGroups = new Map<number, number>();
		const totalLineGroups = new Map<number, number>();
		const knownLineGroups = new Map<number, number>();
		for (const row of wordsByLine) {
			if (!knownWordsSet.has(row.headword)) {
				continue;
			totalLineGroups.set(
				row.lineIndex,
				(totalLineGroups.get(row.lineIndex) ?? 0) + row.occurrenceCount,
			);
			if (knownWordsSet.has(row.headword)) {
				knownLineGroups.set(
					row.lineIndex,
					(knownLineGroups.get(row.lineIndex) ?? 0) + row.occurrenceCount,
				);
			}
			lineGroups.set(row.lineIndex, (lineGroups.get(row.lineIndex) ?? 0) + row.occurrenceCount);
		}

		const sortedLineIndices = [...lineGroups.keys()].sort((a, b) => a - b);
		const sortedLineIndices = [...totalLineGroups.keys()].sort((a, b) => a - b);
		let knownWordsSeen = 0;
		const knownByLinesSeen: Array<{ linesSeen: number; knownWordsSeen: number }> = [];
		let totalWordsSeen = 0;
		const knownByLinesSeen: Array<{
			linesSeen: number;
			knownWordsSeen: number;
			totalWordsSeen: number;
		}> = [];

		for (const lineIdx of sortedLineIndices) {
			knownWordsSeen += knownLineGroups.get(lineIdx) ?? 0;
			totalWordsSeen += totalLineGroups.get(lineIdx)!;
			knownByLinesSeen.push({
				linesSeen: lineIdx,
				knownWordsSeen,
				totalWordsSeen,
			});
		}
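
Tracing the loop above with a hypothetical session (line 1: a known word seen twice; line 3: that word once more plus an unknown word four times; nothing recorded for line 2) gives:

	// The stored line index is preserved, so the gap at line 2 is kept.
	[
		{ linesSeen: 1, knownWordsSeen: 2, totalWordsSeen: 2 },
		{ linesSeen: 3, knownWordsSeen: 3, totalWordsSeen: 7 },
	]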

@@ -430,6 +481,18 @@ export function createStatsApp(
		return c.json(vocab);
	});

	app.get('/api/stats/excluded-words', async (c) => {
		return c.json(await tracker.getStatsExcludedWords());
	});

	app.put('/api/stats/excluded-words', async (c) => {
		const body = await c.req.json().catch(() => null);
		const words = parseExcludedWordsBody(body);
		if (!words) return c.body(null, 400);
		await tracker.replaceStatsExcludedWords(words);
		return c.json({ ok: true });
	});
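
A usage sketch for the two routes above (fetch-based; the stats UI's actual client code is not part of this diff):

	// Replace the persisted exclusion set, then read it back.
	await fetch('/api/stats/excluded-words', {
		method: 'PUT',
		headers: { 'Content-Type': 'application/json' },
		body: JSON.stringify({ words: [{ headword: '猫', word: '猫', reading: 'ねこ' }] }),
	}); // -> { ok: true }, or HTTP 400 if the body fails parseExcludedWordsBody

	const excluded = await (await fetch('/api/stats/excluded-words')).json();
	// -> [{ headword: '猫', word: '猫', reading: 'ねこ' }]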

	app.get('/api/stats/vocabulary/occurrences', async (c) => {
		const headword = (c.req.query('headword') ?? '').trim();
		const word = (c.req.query('word') ?? '').trim();

@@ -5,6 +5,7 @@ import {
	annotateTokens,
	AnnotationStageDeps,
	shouldExcludeTokenFromSubtitleAnnotations,
	shouldExcludeTokenFromVocabularyPersistence,
	stripSubtitleAnnotationMetadata,
} from './annotation-stage';

@@ -366,6 +367,87 @@ test('shouldExcludeTokenFromSubtitleAnnotations excludes kana-only non-independe
	assert.equal(shouldExcludeTokenFromSubtitleAnnotations(token), true);
});

test('shouldExcludeTokenFromVocabularyPersistence mirrors subtitle annotation grammar filters', () => {
	const tokens = [
		makeToken({
			surface: 'どうしてもって',
			headword: 'どうしても',
			reading: 'ドウシテモッテ',
			partOfSpeech: PartOfSpeech.other,
			pos1: '副詞|助詞',
			pos2: '一般|格助詞',
		}),
		makeToken({
			surface: 'そうだ',
			headword: 'そう',
			reading: 'ソウダ',
			partOfSpeech: PartOfSpeech.noun,
			pos1: '名詞|助動詞',
			pos2: '一般|',
			pos3: '助動詞語幹|',
		}),
	];

	for (const token of tokens) {
		assert.equal(shouldExcludeTokenFromSubtitleAnnotations(token), true, token.surface);
		assert.equal(shouldExcludeTokenFromVocabularyPersistence(token), true, token.surface);
	}
});

test('shouldExcludeTokenFromVocabularyPersistence excludes common frequency stop terms', () => {
	const tokens = [
		makeToken({
			surface: 'じゃない',
			headword: 'じゃない',
			reading: '',
			partOfSpeech: PartOfSpeech.i_adjective,
			pos1: '形容詞',
			pos2: '*|自立',
			pos3: '*',
		}),
		makeToken({
			surface: 'である',
			headword: 'である',
			reading: '',
			partOfSpeech: PartOfSpeech.verb,
			pos1: '動詞',
			pos2: '*',
			pos3: '*',
		}),
		makeToken({
			surface: '何か',
			headword: '何か',
			reading: 'なにか',
			partOfSpeech: PartOfSpeech.other,
			pos1: '名詞|助詞',
			pos2: '代名詞|副助詞/並立助詞/終助詞',
			pos3: '一般|*',
		}),
		makeToken({
			surface: '確かに',
			headword: '確かに',
			reading: 'たしかに',
			partOfSpeech: PartOfSpeech.other,
			pos1: '名詞|助詞',
			pos2: '形容動詞語幹|副詞化',
			pos3: '*',
		}),
		makeToken({
			surface: 'あなた',
			headword: '貴方',
			reading: 'あなた',
			partOfSpeech: PartOfSpeech.noun,
			pos1: '名詞',
			pos2: '代名詞',
			pos3: '一般',
		}),
	];

	for (const token of tokens) {
		assert.equal(shouldExcludeTokenFromVocabularyPersistence(token), true, token.surface);
	}
});

test('stripSubtitleAnnotationMetadata keeps token hover data while clearing annotation fields', () => {
	const token = makeToken({
		surface: 'は',

@@ -328,10 +328,12 @@ export function shouldExcludeTokenFromVocabularyPersistence(
	token: MergedToken,
	options: Pick<AnnotationStageOptions, 'pos1Exclusions' | 'pos2Exclusions'> = {},
): boolean {
	return isFrequencyExcludedByPos(
		token,
		resolvePos1Exclusions(options),
		resolvePos2Exclusions(options),
	const pos1Exclusions = resolvePos1Exclusions(options);
	const pos2Exclusions = resolvePos2Exclusions(options);

	return (
		sharedShouldExcludeTokenFromSubtitleAnnotations(token, { pos1Exclusions, pos2Exclusions }) ||
		isFrequencyExcludedByPos(token, pos1Exclusions, pos2Exclusions)
	);
}
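
The rewrite above widens vocabulary persistence from the frequency POS filter alone to the union of both filters, which is what the 'mirrors subtitle annotation grammar filters' test earlier in this diff exercises. An illustrative call, reusing a token from that test (makeToken and its defaults belong to the test helper, not shown here):

	// Excluded if either filter rejects it; the grammar filter catches this
	// mixed adverb+particle token, as asserted in the test above.
	shouldExcludeTokenFromVocabularyPersistence(
		makeToken({ surface: 'どうしてもって', headword: 'どうしても', pos1: '副詞|助詞', pos2: '一般|格助詞' }),
	); // -> true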

@@ -13,17 +13,40 @@ const KATAKANA_TO_HIRAGANA_OFFSET = 0x60;
const KATAKANA_CODEPOINT_START = 0x30a1;
const KATAKANA_CODEPOINT_END = 0x30f6;

const SUBTITLE_ANNOTATION_EXCLUDED_TERMS = new Set([
export const SUBTITLE_ANNOTATION_EXCLUDED_TERMS = new Set([
	'あ',
	'ああ',
	'あなた',
	'あんた',
	'ええ',
	'うう',
	'おお',
	'おい',
	'お前',
	'こいつ',
	'こっち',
	'じゃない',
	'そうだ',
	'たち',
	'である',
	'どこか',
	'なんか',
	'べき',
	'はあ',
	'はは',
	'へえ',
	'ふう',
	'ほう',
	'やはり',
	'って',
	'何か',
	'何だ',
	'何も',
	'如何した',
	'様',
	'確かに',
	'誰も',
	'貴方',
]);
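
The kana constants above suggest lookups happen on normalized hiragana; a minimal sketch (not the repository's actual helper) of folding katakana into hiragana with those constants before checking the set:

	function katakanaToHiragana(text: string): string {
		let out = '';
		for (const ch of text) {
			const cp = ch.codePointAt(0)!;
			out +=
				cp >= KATAKANA_CODEPOINT_START && cp <= KATAKANA_CODEPOINT_END
					? String.fromCodePoint(cp - KATAKANA_TO_HIRAGANA_OFFSET)
					: ch;
		}
		return out;
	}

	SUBTITLE_ANNOTATION_EXCLUDED_TERMS.has(katakanaToHiragana('ッテ')); // -> true ('って')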
const SUBTITLE_ANNOTATION_EXCLUDED_EXPLANATORY_ENDING_PREFIXES = ['ん', 'の', 'なん', 'なの'];
const SUBTITLE_ANNOTATION_EXCLUDED_EXPLANATORY_ENDING_CORES = [