Persist stats exclusions in DB and fix word metrics filtering

- Stats vocabulary exclusions stored in `imm_stats_excluded_words` (schema v18); seeded from localStorage on first load
- Session, overview, trends, and library word metrics use filtered persisted occurrences with raw fallback
- Session known-word % chart uses filtered persisted totals as denominator for both known and total
- JLPT subtitle styling changed to underline-only; no longer overrides text color
This commit is contained in:
2026-05-03 19:40:54 -07:00
parent db30c61327
commit 25d0aa47db
32 changed files with 1541 additions and 211 deletions
@@ -277,6 +277,8 @@ function createMockTracker(
getSessionTimeline: async () => [],
getSessionEvents: async () => [],
getVocabularyStats: async () => VOCABULARY_STATS,
getStatsExcludedWords: async () => [],
replaceStatsExcludedWords: async () => {},
getKanjiStats: async () => KANJI_STATS,
getWordOccurrences: async () => OCCURRENCES,
getKanjiOccurrences: async () => OCCURRENCES,
@@ -362,7 +364,7 @@ describe('stats server API routes', () => {
assert.ok(Array.isArray(body));
});
it('GET /api/stats/sessions enriches each session with known-word metrics when cache exists', async () => {
it('GET /api/stats/sessions enriches known-word metrics using filtered persisted totals', async () => {
await withTempDir(async (dir) => {
const cachePath = path.join(dir, 'known-words.json');
fs.writeFileSync(
@@ -391,7 +393,7 @@ describe('stats server API routes', () => {
const body = await res.json();
const first = body[0];
assert.equal(first.knownWordsSeen, 2);
assert.equal(first.knownWordRate, 2.5);
assert.equal(first.knownWordRate, 66.7);
});
});
@@ -436,7 +438,7 @@ describe('stats server API routes', () => {
assert.equal(seenLimit, undefined);
});
it('GET /api/stats/sessions/:id/known-words-timeline preserves line positions and counts known occurrences', async () => {
it('GET /api/stats/sessions/:id/known-words-timeline preserves line positions and counts filtered totals', async () => {
await withTempDir(async (dir) => {
const cachePath = path.join(dir, 'known-words.json');
fs.writeFileSync(
@@ -461,8 +463,8 @@ describe('stats server API routes', () => {
const res = await app.request('/api/stats/sessions/1/known-words-timeline');
assert.equal(res.status, 200);
assert.deepEqual(await res.json(), [
{ linesSeen: 1, knownWordsSeen: 2 },
{ linesSeen: 3, knownWordsSeen: 3 },
{ linesSeen: 1, knownWordsSeen: 2, totalWordsSeen: 2 },
{ linesSeen: 3, knownWordsSeen: 3, totalWordsSeen: 7 },
]);
});
});
@@ -730,6 +732,65 @@ describe('stats server API routes', () => {
assert.equal(body[0].pos3, null);
});
it('GET /api/stats/excluded-words returns tracker exclusion rows', async () => {
  // Rows the mock tracker hands back; the route must pass them through unchanged.
  const exclusionRows = [
    { headword: '猫', word: '猫', reading: 'ねこ' },
    { headword: 'する', word: 'する', reading: 'する' },
  ];
  const tracker = createMockTracker({
    getStatsExcludedWords: async () => exclusionRows,
  });
  const app = createStatsApp(tracker);
  const res = await app.request('/api/stats/excluded-words');
  assert.equal(res.status, 200);
  assert.deepEqual(await res.json(), exclusionRows);
});
it('PUT /api/stats/excluded-words replaces tracker exclusion rows', async () => {
  // Capture whatever payload the route forwards to the tracker.
  let forwarded: unknown = null;
  const app = createStatsApp(
    createMockTracker({
      replaceStatsExcludedWords: async (words: unknown) => {
        forwarded = words;
      },
    }),
  );
  const replacementRows = [
    { headword: '猫', word: '猫', reading: 'ねこ' },
    { headword: 'する', word: 'する', reading: 'する' },
  ];
  const res = await app.request('/api/stats/excluded-words', {
    method: 'PUT',
    headers: { 'Content-Type': 'application/json' },
    body: JSON.stringify({ words: replacementRows }),
  });
  assert.equal(res.status, 200);
  assert.deepEqual(await res.json(), { ok: true });
  // The tracker must receive exactly the rows from the request body.
  assert.deepEqual(forwarded, replacementRows);
});
it('PUT /api/stats/excluded-words rejects malformed rows', async () => {
  const app = createStatsApp(createMockTracker());
  // `word` must be a string; a numeric value should fail validation with 400.
  const malformedBody = JSON.stringify({
    words: [{ headword: '猫', word: 7, reading: 'ねこ' }],
  });
  const res = await app.request('/api/stats/excluded-words', {
    method: 'PUT',
    headers: { 'Content-Type': 'application/json' },
    body: malformedBody,
  });
  assert.equal(res.status, 400);
});
it('GET /api/stats/anime returns anime library', async () => {
const app = createStatsApp(createMockTracker());
const res = await app.request('/api/stats/anime');
@@ -52,7 +52,9 @@ import {
getKanjiWords,
getSessionEvents,
getSimilarWords,
getStatsExcludedWords,
getVocabularyStats,
replaceStatsExcludedWords,
getWordAnimeAppearances,
getWordDetail,
getWordOccurrences,
@@ -151,6 +153,7 @@ import {
type SessionSummaryQueryRow,
type SessionTimelineRow,
type SimilarWordRow,
type StatsExcludedWordRow,
type StreakCalendarRow,
type VocabularyCleanupSummary,
type WatchTimePerAnimeRow,
@@ -289,6 +292,7 @@ export type {
SessionSummaryQueryRow,
SessionTimelineRow,
SimilarWordRow,
StatsExcludedWordRow,
StreakCalendarRow,
WatchTimePerAnimeRow,
WordAnimeAppearanceRow,
@@ -498,6 +502,14 @@ export class ImmersionTrackerService {
return getVocabularyStats(this.db, limit, excludePos);
}
/** Returns the persisted stats-exclusion rows (headword/word/reading) from the tracker DB. */
async getStatsExcludedWords(): Promise<StatsExcludedWordRow[]> {
  return getStatsExcludedWords(this.db);
}
/**
 * Replaces the entire stats-exclusion set with `words`.
 * Delegates to the synchronous query helper; async only to match the service API surface,
 * so the helper completes before the returned promise resolves.
 */
async replaceStatsExcludedWords(words: StatsExcludedWordRow[]): Promise<void> {
  replaceStatsExcludedWords(this.db, words);
}
async cleanupVocabularyStats(): Promise<VocabularyCleanupSummary> {
return cleanupVocabularyStats(this.db, {
resolveLegacyPos: this.resolveLegacyVocabularyPos,
@@ -86,6 +86,77 @@ function cleanupDbPath(dbPath: string): void {
}
}
/**
 * Test fixture helper: inserts a subtitle line, upserts the word it contains,
 * and links them through imm_word_line_occurrences with `occurrenceCount`.
 *
 * Defaults produce a plain visible noun ('猫') so callers that only care about
 * counts don't have to spell out word identity or POS metadata.
 */
function insertFilteredWordOccurrence(
  db: InstanceType<typeof Database>,
  options: {
    sessionId: number;
    videoId: number;
    animeId?: number | null;
    lineIndex?: number;
    occurrenceCount: number;
    startedAtMs: number;
    headword?: string;
    word?: string;
    reading?: string;
    partOfSpeech?: string;
    pos1?: string;
    pos2?: string;
    pos3?: string;
  },
): void {
  const headword = options.headword ?? options.word ?? '猫';
  const word = options.word ?? headword;
  // 1) Insert the subtitle line that anchors the occurrence; keep its rowid.
  const lineId = Number(
    db
      .prepare(
        `INSERT INTO imm_subtitle_lines (
          session_id, event_id, video_id, anime_id, line_index,
          segment_start_ms, segment_end_ms, text, CREATED_DATE, LAST_UPDATE_DATE
        ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)`,
      )
      .run(
        options.sessionId,
        null,
        options.videoId,
        options.animeId ?? null,
        options.lineIndex ?? 1,
        0,
        1000,
        word,
        options.startedAtMs,
        options.startedAtMs,
      ).lastInsertRowid,
  );
  // 2) Upsert the word; frequency accumulates on conflict of (headword, word, reading).
  const wordRow = db
    .prepare(
      `INSERT INTO imm_words (
        headword, word, reading, pos1, pos2, pos3, part_of_speech,
        first_seen, last_seen, frequency
      ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
      ON CONFLICT(headword, word, reading) DO UPDATE SET
        frequency = imm_words.frequency + excluded.frequency,
        last_seen = excluded.last_seen
      RETURNING id`,
    )
    .get(
      // FIX: the statement has 10 placeholders but the original call bound only 9 —
      // `headword` was never passed, shifting every later bind one position left
      // and making the statement fail at runtime with a parameter-count error.
      headword,
      word,
      options.reading ?? '',
      options.pos1 ?? '名詞',
      options.pos2 ?? '一般',
      options.pos3 ?? '',
      options.partOfSpeech ?? 'noun',
      Math.floor(options.startedAtMs / 1000),
      Math.floor(options.startedAtMs / 1000),
      options.occurrenceCount,
    ) as { id: number };
  const wordId = Number(wordRow.id);
  // 3) Link line and word with the occurrence count.
  db.prepare(
    `INSERT INTO imm_word_line_occurrences (line_id, word_id, occurrence_count)
    VALUES (?, ?, ?)`,
  ).run(lineId, wordId, options.occurrenceCount);
}
function withMockNowMs<T>(fixedDateMs: string | number, run: () => T): T {
const previousNowMs = globalThis.__subminerTestNowMs;
globalThis.__subminerTestNowMs = fixedDateMs;
@@ -1236,6 +1307,89 @@ test('getQueryHints computes weekly new-word cutoff from calendar midnights', ()
});
});
test('word-count read models use filtered persisted occurrences with raw fallback', () => {
  const dbPath = makeDbPath();
  const db = new Database(dbPath);
  try {
    ensureSchema(db);
    const videoId = getOrCreateVideoRecord(db, 'local:/tmp/filtered-word-metrics.mkv', {
      canonicalTitle: 'Filtered Word Metrics',
      sourcePath: '/tmp/filtered-word-metrics.mkv',
      sourceUrl: null,
      sourceType: SOURCE_TYPE_LOCAL,
    });
    const startedAtMs = 1_700_000_000_000;
    // One session with persisted occurrences, one that must fall back to raw tokens_seen.
    const withOccurrences = startSessionRecord(db, videoId, startedAtMs);
    const fallbackOnly = startSessionRecord(db, videoId, startedAtMs + 60_000);
    // Both sessions close through the same UPDATE shape; prepare it once.
    const finishSession = db.prepare(
      `
      UPDATE imm_sessions
      SET ended_at_ms = ?, status = 2, active_watched_ms = ?, tokens_seen = ?, yomitan_lookup_count = ?
      WHERE session_id = ?
    `,
    );
    finishSession.run(startedAtMs + 30_000, 2, 5, 1, withOccurrences.sessionId);
    finishSession.run(startedAtMs + 90_000, 2, 7, 2, fallbackOnly.sessionId);
    // Visible noun (counts) plus excluded i-adjective (filtered out): filtered total = 2.
    insertFilteredWordOccurrence(db, {
      sessionId: withOccurrences.sessionId,
      videoId,
      occurrenceCount: 2,
      startedAtMs,
    });
    insertFilteredWordOccurrence(db, {
      sessionId: withOccurrences.sessionId,
      videoId,
      lineIndex: 2,
      occurrenceCount: 3,
      startedAtMs,
      headword: 'じゃない',
      word: 'じゃない',
      partOfSpeech: 'i_adjective',
      pos1: '形容詞',
      pos2: '*|自立',
      pos3: '*',
    });
    db.prepare(
      `
      INSERT INTO imm_daily_rollups (
        rollup_day, video_id, total_sessions, total_active_min, total_lines_seen,
        total_tokens_seen, total_cards
      ) VALUES (?, ?, ?, ?, ?, ?, ?)
    `,
    ).run(Math.floor(startedAtMs / 86_400_000), videoId, 2, 1, 2, 12, 0);
    const summaries = getSessionSummaries(db, 10);
    const tokensFor = (sessionId: number) =>
      summaries.find((session) => session.sessionId === sessionId)?.tokensSeen;
    // Filtered persisted total wins when occurrences exist; raw count otherwise.
    assert.equal(tokensFor(withOccurrences.sessionId), 2);
    assert.equal(tokensFor(fallbackOnly.sessionId), 7);
    // Aggregates combine the filtered (2) and fallback (7) totals.
    assert.equal(getQueryHints(db).totalTokensSeen, 9);
    const rollup = getDailyRollups(db, 1)[0]!;
    assert.equal(rollup.totalTokensSeen, 9);
    assert.equal(rollup.tokensPerMin, 9);
  } finally {
    db.close();
    cleanupDbPath(dbPath);
  }
});
test('getQueryHints counts new words by distinct headword first-seen time', () => {
const dbPath = makeDbPath();
const db = new Database(dbPath);
@@ -1430,6 +1584,61 @@ test('getVocabularyStats returns rows ordered by frequency descending', () => {
}
});
test('getVocabularyStats filters rows that fail tokenizer vocabulary rules', () => {
  const dbPath = makeDbPath();
  const db = new Database(dbPath);
  try {
    ensureSchema(db);
    const stmts = createTrackerPreparedStatements(db);
    // Seed rows: [headword, word, reading, partOfSpeech, pos1, pos2, pos3, firstSeen, lastSeen].
    // Only the plain noun should survive the tokenizer visibility rules.
    const seeded: Array<[string, string, string, string, string, string, string, number, number]> =
      [
        ['どうしても', 'どうしてもって', 'どうしてもって', 'other', '副詞|助詞', '一般|格助詞', '', 1_000, 1_000],
        ['じゃない', 'じゃない', '', 'i_adjective', '形容詞', '*|自立', '*', 1_100, 1_100],
        ['何か', '何か', 'なにか', 'other', '名詞|助詞', '代名詞|副助詞/並立助詞/終助詞', '一般|*', 1_200, 1_200],
        ['猫', '猫', 'ねこ', 'noun', '名詞', '一般', '', 1_500, 1_500],
      ];
    for (const row of seeded) {
      stmts.wordUpsertStmt.run(...row);
    }
    assert.deepEqual(
      getVocabularyStats(db, 10).map((row) => row.headword),
      ['猫'],
    );
  } finally {
    db.close();
    cleanupDbPath(dbPath);
  }
});
test('getVocabularyStats returns empty array when no words exist', () => {
const dbPath = makeDbPath();
const db = new Database(dbPath);
@@ -1475,6 +1684,22 @@ test('cleanupVocabularyStats repairs stored POS metadata and removes excluded im
headword, word, reading, part_of_speech, pos1, pos2, pos3, first_seen, last_seen, frequency
) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)`,
).run('未解決', '未解決', '', '', '', '', '', 901, 951, 1);
db.prepare(
`INSERT INTO imm_words (
headword, word, reading, part_of_speech, pos1, pos2, pos3, first_seen, last_seen, frequency
) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)`,
).run(
'どうしても',
'どうしてもって',
'どうしてもって',
'other',
'副詞|助詞',
'一般|格助詞',
'',
1_110,
1_610,
7,
);
const result = await cleanupVocabularyStats(db, {
resolveLegacyPos: async (row) => {
@@ -1517,7 +1742,7 @@ test('cleanupVocabularyStats repairs stored POS metadata and removes excluded im
pos2: string;
}>;
assert.deepEqual(result, { scanned: 5, kept: 3, deleted: 2, repaired: 2 });
assert.deepEqual(result, { scanned: 6, kept: 3, deleted: 3, repaired: 2 });
assert.deepEqual(
rows.map((row) => ({ headword: row.headword, frequency: row.frequency })),
[
@@ -2226,6 +2451,31 @@ test('getSessionWordsByLine joins word occurrences through imm_words.id', () =>
`INSERT INTO imm_word_line_occurrences (line_id, word_id, occurrence_count)
VALUES (?, ?, ?)`,
).run(lineId, wordId, 1);
const excludedWordId = Number(
db
.prepare(
`INSERT INTO imm_words (
headword, word, reading, pos1, pos2, pos3, part_of_speech, first_seen, last_seen, frequency
) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)`,
)
.run(
'じゃない',
'じゃない',
'',
'形容詞',
'*|自立',
'*',
'i_adjective',
startedAtMs,
startedAtMs,
1,
).lastInsertRowid,
);
db.prepare(
`INSERT INTO imm_word_line_occurrences (line_id, word_id, occurrence_count)
VALUES (?, ?, ?)`,
).run(lineId, excludedWordId, 3);
assert.deepEqual(getSessionWordsByLine(db, sessionId), [
{ lineIndex: 0, headword: '猫', occurrenceCount: 1 },
@@ -3959,6 +4209,121 @@ test('getTrendsDashboard librarySummary returns null lookupsPerHundred when word
}
});
test('getTrendsDashboard word metrics use filtered persisted occurrences', () => {
  const dbPath = makeDbPath();
  const db = new Database(dbPath);
  try {
    ensureSchema(db);
    const stmts = createTrackerPreparedStatements(db);
    // Fixture: one anime with a single linked episode video.
    const videoId = getOrCreateVideoRecord(db, 'local:/tmp/filtered-trends.mkv', {
      canonicalTitle: 'Filtered Trends Episode',
      sourcePath: '/tmp/filtered-trends.mkv',
      sourceUrl: null,
      sourceType: SOURCE_TYPE_LOCAL,
    });
    const animeId = getOrCreateAnimeRecord(db, {
      parsedTitle: 'Filtered Trends Anime',
      canonicalTitle: 'Filtered Trends Anime',
      anilistId: null,
      titleRomaji: null,
      titleEnglish: null,
      titleNative: null,
      metadataJson: null,
    });
    linkVideoToAnimeRecord(db, videoId, {
      animeId,
      parsedBasename: 'filtered-trends.mkv',
      parsedTitle: 'Filtered Trends Anime',
      parsedSeason: 1,
      parsedEpisode: 1,
      parserSource: 'test',
      parserConfidence: 1,
      parseMetadataJson: null,
    });
    const dayOneStart = 1_700_000_000_000;
    const dayTwoStart = dayOneStart + 86_400_000;
    // Two days of sessions: raw token counts (10/20) diverge from the persisted
    // filtered occurrence counts (2/3) so the assertions can tell them apart.
    const rows = [
      { start: dayOneStart, rawWords: 10, filteredWords: 2, lookups: 4 },
      { start: dayTwoStart, rawWords: 20, filteredWords: 3, lookups: 6 },
    ];
    for (const [index, row] of rows.entries()) {
      const session = startSessionRecord(db, videoId, row.start);
      // Telemetry row mirrors the raw (unfiltered) word count and lookup totals.
      stmts.telemetryInsertStmt.run(
        session.sessionId,
        `${row.start + 60_000}`,
        10 * 60_000,
        10 * 60_000,
        1,
        row.rawWords,
        0,
        0,
        0,
        row.lookups,
        0,
        0,
        0,
        0,
        `${row.start + 60_000}`,
        `${row.start + 60_000}`,
      );
      // Close the session with the same raw totals.
      db.prepare(
        `
        UPDATE imm_sessions
        SET ended_at_ms = ?, total_watched_ms = ?, active_watched_ms = ?,
            lines_seen = ?, tokens_seen = ?, cards_mined = ?, yomitan_lookup_count = ?
        WHERE session_id = ?
      `,
      ).run(
        `${row.start + 60_000}`,
        10 * 60_000,
        10 * 60_000,
        1,
        row.rawWords,
        0,
        row.lookups,
        session.sessionId,
      );
      // Persist the filtered occurrence the dashboard should prefer over raw counts.
      insertFilteredWordOccurrence(db, {
        sessionId: session.sessionId,
        videoId,
        animeId,
        lineIndex: index + 1,
        occurrenceCount: row.filteredWords,
        startedAtMs: row.start,
        headword: `単語${index}`,
      });
      // Daily rollup still stores the RAW total; the dashboard must override it.
      db.prepare(
        `
        INSERT INTO imm_daily_rollups (
          rollup_day, video_id, total_sessions, total_active_min, total_lines_seen,
          total_tokens_seen, total_cards
        ) VALUES (?, ?, ?, ?, ?, ?, ?)
      `,
      ).run(Math.floor(row.start / 86_400_000), videoId, 1, 10, 1, row.rawWords, 0);
    }
    const dashboard = getTrendsDashboard(db, 'all', 'day');
    // Per-day activity uses the filtered counts (2, 3), not the raw 10/20.
    assert.deepEqual(
      dashboard.activity.words.map((point) => point.value),
      [2, 3],
    );
    // Progress is the running sum of filtered counts.
    assert.deepEqual(
      dashboard.progress.words.map((point) => point.value),
      [2, 5],
    );
    // Day one: 4 lookups / 2 filtered words * 100 = 200.
    assert.equal(dashboard.ratios.lookupsPerHundred[0]?.value, 200);
    assert.equal(dashboard.librarySummary[0]?.words, 5);
    assert.equal(dashboard.librarySummary[0]?.lookupsPerHundred, 200);
    assert.equal(dashboard.animeCumulative.words.at(-1)?.value, 5);
  } finally {
    db.close();
    cleanupDbPath(dbPath);
  }
});
test('getTrendsDashboard librarySummary is empty when no rollups exist', () => {
const dbPath = makeDbPath();
const db = new Database(dbPath);
@@ -1,4 +1,6 @@
import type { DatabaseSync } from './sqlite';
import { PartOfSpeech, type MergedToken } from '../../../types';
import { shouldExcludeTokenFromVocabularyPersistence } from '../tokenizer/annotation-stage';
import type {
KanjiAnimeAppearanceRow,
KanjiDetailRow,
@@ -7,18 +9,55 @@ import type {
KanjiWordRow,
SessionEventRow,
SimilarWordRow,
StatsExcludedWordRow,
VocabularyStatsRow,
WordAnimeAppearanceRow,
WordDetailRow,
WordOccurrenceRow,
} from './types';
import { fromDbTimestamp } from './query-shared';
import { fromDbTimestamp, toDbTimestamp } from './query-shared';
import { nowMs } from './time';
const VOCABULARY_STATS_FILTER_OVERSAMPLE_FACTOR = 4;
const VOCABULARY_STATS_FILTER_OVERSAMPLE_MIN = 100;
/**
 * Rebuilds a MergedToken from a persisted vocabulary row so the stored word can
 * be re-evaluated against the tokenizer's persistence filter.
 */
function toVocabularyToken(row: VocabularyStatsRow): MergedToken {
  // Map the stored POS string back onto the enum; anything unrecognized degrades to `other`.
  let partOfSpeech: PartOfSpeech = PartOfSpeech.other;
  if (row.partOfSpeech && Object.values(PartOfSpeech).includes(row.partOfSpeech as PartOfSpeech)) {
    partOfSpeech = row.partOfSpeech as PartOfSpeech;
  }
  return {
    surface: row.word,
    reading: row.reading ?? '',
    headword: row.headword,
    startPos: 0,
    endPos: row.word.length,
    partOfSpeech,
    pos1: row.pos1 ?? '',
    pos2: row.pos2 ?? '',
    pos3: row.pos3 ?? '',
    frequencyRank: row.frequencyRank ?? undefined,
    isMerged: false,
    isKnown: false,
    isNPlusOneTarget: false,
  };
}
/** A persisted row is visible when the tokenizer would not have excluded it. */
function isVocabularyStatsRowVisible(row: VocabularyStatsRow): boolean {
  const token = toVocabularyToken(row);
  return shouldExcludeTokenFromVocabularyPersistence(token) === false;
}
export function getVocabularyStats(
db: DatabaseSync,
limit = 100,
excludePos?: string[],
): VocabularyStatsRow[] {
const queryLimit = Math.max(
limit,
limit * VOCABULARY_STATS_FILTER_OVERSAMPLE_FACTOR,
limit + VOCABULARY_STATS_FILTER_OVERSAMPLE_MIN,
);
const hasExclude = excludePos && excludePos.length > 0;
const placeholders = hasExclude ? excludePos.map(() => '?').join(', ') : '';
const whereClause = hasExclude
@@ -37,8 +76,48 @@ export function getVocabularyStats(
GROUP BY w.id
ORDER BY w.frequency DESC LIMIT ?
`);
const params = hasExclude ? [...excludePos, limit] : [limit];
return stmt.all(...params) as VocabularyStatsRow[];
const params = hasExclude ? [...excludePos, queryLimit] : [queryLimit];
return (stmt.all(...params) as VocabularyStatsRow[])
.filter(isVocabularyStatsRowVisible)
.slice(0, limit);
}
/**
 * Reads all persisted stats-exclusion rows, ordered case-insensitively by
 * headword, then word, then reading.
 */
export function getStatsExcludedWords(db: DatabaseSync): StatsExcludedWordRow[] {
  const stmt = db.prepare(
    `
    SELECT headword, word, reading
    FROM imm_stats_excluded_words
    ORDER BY headword COLLATE NOCASE, word COLLATE NOCASE, reading COLLATE NOCASE
  `,
  );
  return stmt.all() as StatsExcludedWordRow[];
}
/**
 * Atomically replaces the contents of imm_stats_excluded_words with `words`.
 * Runs inside a BEGIN IMMEDIATE transaction so readers never observe a
 * half-replaced set; duplicate input rows collapse via INSERT OR IGNORE.
 */
export function replaceStatsExcludedWords(db: DatabaseSync, words: StatsExcludedWordRow[]): void {
  const now = toDbTimestamp(nowMs());
  const deleteStmt = db.prepare('DELETE FROM imm_stats_excluded_words');
  const insertStmt = db.prepare(`
    INSERT OR IGNORE INTO imm_stats_excluded_words(
      headword,
      word,
      reading,
      CREATED_DATE,
      LAST_UPDATE_DATE
    )
    VALUES (?, ?, ?, ?, ?)
  `);
  db.exec('BEGIN IMMEDIATE');
  try {
    deleteStmt.run();
    words.forEach((word) => {
      insertStmt.run(word.headword, word.word, word.reading, now, now);
    });
    db.exec('COMMIT');
  } catch (error) {
    // Roll back on any failure so the previous exclusion set survives intact.
    db.exec('ROLLBACK');
    throw error;
  }
}
export function getKanjiStats(db: DatabaseSync, limit = 100): KanjiStatsRow[] {
@@ -16,12 +16,31 @@ import type {
StreakCalendarRow,
WatchTimePerAnimeRow,
} from './types';
import { ACTIVE_SESSION_METRICS_CTE, fromDbTimestamp, resolvedCoverBlobExpr } from './query-shared';
import {
ACTIVE_SESSION_METRICS_CTE,
SESSION_WORD_COUNTS_CTE,
SESSION_WORD_COUNTS_SELECT,
fromDbTimestamp,
resolvedCoverBlobExpr,
sessionDisplayWordsExpr,
visibleWordSql,
} from './query-shared';
export function getAnimeLibrary(db: DatabaseSync): AnimeLibraryRow[] {
const wordsExpr = sessionDisplayWordsExpr('s', 'swc');
const rows = db
.prepare(
`
${SESSION_WORD_COUNTS_CTE},
anime_word_counts AS (
SELECT v.anime_id AS animeId, SUM(${wordsExpr}) AS totalTokensSeen
FROM imm_sessions s
JOIN imm_videos v ON v.video_id = s.video_id
LEFT JOIN session_word_counts swc ON swc.sessionId = s.session_id
WHERE s.ended_at_ms IS NOT NULL
AND v.anime_id IS NOT NULL
GROUP BY v.anime_id
)
SELECT
a.anime_id AS animeId,
a.canonical_title AS canonicalTitle,
@@ -29,13 +48,14 @@ export function getAnimeLibrary(db: DatabaseSync): AnimeLibraryRow[] {
COALESCE(lm.total_sessions, 0) AS totalSessions,
COALESCE(lm.total_active_ms, 0) AS totalActiveMs,
COALESCE(lm.total_cards, 0) AS totalCards,
COALESCE(lm.total_tokens_seen, 0) AS totalTokensSeen,
COALESCE(awc.totalTokensSeen, lm.total_tokens_seen, 0) AS totalTokensSeen,
COUNT(DISTINCT v.video_id) AS episodeCount,
a.episodes_total AS episodesTotal,
COALESCE(lm.last_watched_ms, 0) AS lastWatchedMs
FROM imm_anime a
JOIN imm_lifetime_anime lm ON lm.anime_id = a.anime_id
JOIN imm_videos v ON v.anime_id = a.anime_id
LEFT JOIN anime_word_counts awc ON awc.animeId = a.anime_id
GROUP BY a.anime_id
ORDER BY totalActiveMs DESC, lm.last_watched_ms DESC, canonicalTitle ASC
`,
@@ -48,6 +68,7 @@ export function getAnimeLibrary(db: DatabaseSync): AnimeLibraryRow[] {
}
export function getAnimeDetail(db: DatabaseSync, animeId: number): AnimeDetailRow | null {
const wordsExpr = sessionDisplayWordsExpr('s', 'swc', 'COALESCE(asm.tokensSeen, s.tokens_seen)');
const row = db
.prepare(
`
@@ -63,7 +84,10 @@ export function getAnimeDetail(db: DatabaseSync, animeId: number): AnimeDetailRo
COALESCE(lm.total_sessions, 0) AS totalSessions,
COALESCE(lm.total_active_ms, 0) AS totalActiveMs,
COALESCE(lm.total_cards, 0) AS totalCards,
COALESCE(lm.total_tokens_seen, 0) AS totalTokensSeen,
CASE
WHEN COUNT(s.session_id) > 0 THEN COALESCE(SUM(${wordsExpr}), 0)
ELSE COALESCE(lm.total_tokens_seen, 0)
END AS totalTokensSeen,
COALESCE(lm.total_lines_seen, 0) AS totalLinesSeen,
COALESCE(SUM(COALESCE(asm.lookupCount, s.lookup_count, 0)), 0) AS totalLookupCount,
COALESCE(SUM(COALESCE(asm.lookupHits, s.lookup_hits, 0)), 0) AS totalLookupHits,
@@ -75,6 +99,7 @@ export function getAnimeDetail(db: DatabaseSync, animeId: number): AnimeDetailRo
JOIN imm_videos v ON v.anime_id = a.anime_id
LEFT JOIN imm_sessions s ON s.video_id = v.video_id
LEFT JOIN active_session_metrics asm ON asm.sessionId = s.session_id
LEFT JOIN session_word_counts swc ON swc.sessionId = s.session_id
WHERE a.anime_id = ?
GROUP BY a.anime_id
`,
@@ -108,6 +133,7 @@ export function getAnimeAnilistEntries(db: DatabaseSync, animeId: number): Anime
}
export function getAnimeEpisodes(db: DatabaseSync, animeId: number): AnimeEpisodeRow[] {
const wordsExpr = sessionDisplayWordsExpr('s', 'swc', 'COALESCE(asm.tokensSeen, s.tokens_seen)');
const rows = db
.prepare(
`
@@ -162,12 +188,13 @@ export function getAnimeEpisodes(db: DatabaseSync, animeId: number): AnimeEpisod
COUNT(DISTINCT s.session_id) AS totalSessions,
COALESCE(SUM(COALESCE(asm.activeWatchedMs, s.active_watched_ms, 0)), 0) AS totalActiveMs,
COALESCE(SUM(COALESCE(asm.cardsMined, s.cards_mined, 0)), 0) AS totalCards,
COALESCE(SUM(COALESCE(asm.tokensSeen, s.tokens_seen, 0)), 0) AS totalTokensSeen,
COALESCE(SUM(${wordsExpr}), 0) AS totalTokensSeen,
COALESCE(SUM(COALESCE(asm.yomitanLookupCount, s.yomitan_lookup_count, 0)), 0) AS totalYomitanLookupCount,
MAX(s.started_at_ms) AS lastWatchedMs
FROM imm_videos v
LEFT JOIN imm_sessions s ON s.video_id = v.video_id
LEFT JOIN active_session_metrics asm ON asm.sessionId = s.session_id
LEFT JOIN session_word_counts swc ON swc.sessionId = s.session_id
WHERE v.anime_id = ?
GROUP BY v.video_id
ORDER BY
@@ -192,16 +219,25 @@ export function getAnimeEpisodes(db: DatabaseSync, animeId: number): AnimeEpisod
}
export function getMediaLibrary(db: DatabaseSync): MediaLibraryRow[] {
const wordsExpr = sessionDisplayWordsExpr('s', 'swc');
const rows = db
.prepare(
`
${SESSION_WORD_COUNTS_CTE},
media_word_counts AS (
SELECT s.video_id AS videoId, SUM(${wordsExpr}) AS totalTokensSeen
FROM imm_sessions s
LEFT JOIN session_word_counts swc ON swc.sessionId = s.session_id
WHERE s.ended_at_ms IS NOT NULL
GROUP BY s.video_id
)
SELECT
v.video_id AS videoId,
v.canonical_title AS canonicalTitle,
COALESCE(lm.total_sessions, 0) AS totalSessions,
COALESCE(lm.total_active_ms, 0) AS totalActiveMs,
COALESCE(lm.total_cards, 0) AS totalCards,
COALESCE(lm.total_tokens_seen, 0) AS totalTokensSeen,
COALESCE(mwc.totalTokensSeen, lm.total_tokens_seen, 0) AS totalTokensSeen,
COALESCE(lm.last_watched_ms, 0) AS lastWatchedMs,
yv.youtube_video_id AS youtubeVideoId,
yv.video_url AS videoUrl,
@@ -220,6 +256,7 @@ export function getMediaLibrary(db: DatabaseSync): MediaLibraryRow[] {
END AS hasCoverArt
FROM imm_videos v
JOIN imm_lifetime_media lm ON lm.video_id = v.video_id
LEFT JOIN media_word_counts mwc ON mwc.videoId = v.video_id
LEFT JOIN imm_media_art ma ON ma.video_id = v.video_id
LEFT JOIN imm_youtube_videos yv ON yv.video_id = v.video_id
ORDER BY lm.last_watched_ms DESC
@@ -233,6 +270,7 @@ export function getMediaLibrary(db: DatabaseSync): MediaLibraryRow[] {
}
export function getMediaDetail(db: DatabaseSync, videoId: number): MediaDetailRow | null {
const wordsExpr = sessionDisplayWordsExpr('s', 'swc', 'COALESCE(asm.tokensSeen, s.tokens_seen)');
return db
.prepare(
`
@@ -244,7 +282,10 @@ export function getMediaDetail(db: DatabaseSync, videoId: number): MediaDetailRo
COALESCE(lm.total_sessions, 0) AS totalSessions,
COALESCE(lm.total_active_ms, 0) AS totalActiveMs,
COALESCE(lm.total_cards, 0) AS totalCards,
COALESCE(lm.total_tokens_seen, 0) AS totalTokensSeen,
CASE
WHEN COUNT(s.session_id) > 0 THEN COALESCE(SUM(${wordsExpr}), 0)
ELSE COALESCE(lm.total_tokens_seen, 0)
END AS totalTokensSeen,
COALESCE(lm.total_lines_seen, 0) AS totalLinesSeen,
COALESCE(SUM(COALESCE(asm.lookupCount, s.lookup_count, 0)), 0) AS totalLookupCount,
COALESCE(SUM(COALESCE(asm.lookupHits, s.lookup_hits, 0)), 0) AS totalLookupHits,
@@ -265,6 +306,7 @@ export function getMediaDetail(db: DatabaseSync, videoId: number): MediaDetailRo
LEFT JOIN imm_youtube_videos yv ON yv.video_id = v.video_id
LEFT JOIN imm_sessions s ON s.video_id = v.video_id
LEFT JOIN active_session_metrics asm ON asm.sessionId = s.session_id
LEFT JOIN session_word_counts swc ON swc.sessionId = s.session_id
WHERE v.video_id = ?
GROUP BY v.video_id
`,
@@ -277,6 +319,7 @@ export function getMediaSessions(
videoId: number,
limit = 100,
): SessionSummaryQueryRow[] {
const wordsExpr = sessionDisplayWordsExpr('s', 'swc', 'COALESCE(asm.tokensSeen, s.tokens_seen)');
const rows = db
.prepare(
`
@@ -290,13 +333,14 @@ export function getMediaSessions(
COALESCE(asm.totalWatchedMs, s.total_watched_ms, 0) AS totalWatchedMs,
COALESCE(asm.activeWatchedMs, s.active_watched_ms, 0) AS activeWatchedMs,
COALESCE(asm.linesSeen, s.lines_seen, 0) AS linesSeen,
COALESCE(asm.tokensSeen, s.tokens_seen, 0) AS tokensSeen,
${wordsExpr} AS tokensSeen,
COALESCE(asm.cardsMined, s.cards_mined, 0) AS cardsMined,
COALESCE(asm.lookupCount, s.lookup_count, 0) AS lookupCount,
COALESCE(asm.lookupHits, s.lookup_hits, 0) AS lookupHits,
COALESCE(asm.yomitanLookupCount, s.yomitan_lookup_count, 0) AS yomitanLookupCount
FROM imm_sessions s
LEFT JOIN active_session_metrics asm ON asm.sessionId = s.session_id
LEFT JOIN session_word_counts swc ON swc.sessionId = s.session_id
LEFT JOIN imm_videos v ON v.video_id = s.video_id
WHERE s.video_id = ?
ORDER BY s.started_at_ms DESC
@@ -321,10 +365,27 @@ export function getMediaDailyRollups(
videoId: number,
limit = 90,
): ImmersionSessionRollupRow[] {
const wordsExpr = sessionDisplayWordsExpr('s', 'swc');
return db
.prepare(
`
WITH recent_days AS (
WITH session_word_counts AS (
${SESSION_WORD_COUNTS_SELECT}
),
daily_word_counts AS (
SELECT
CAST(
julianday(CAST(s.started_at_ms AS REAL) / 1000, 'unixepoch', 'localtime') - 2440587.5
AS INTEGER
) AS rollupDay,
s.video_id AS videoId,
SUM(${wordsExpr}) AS totalTokensSeen
FROM imm_sessions s
LEFT JOIN session_word_counts swc ON swc.sessionId = s.session_id
WHERE s.ended_at_ms IS NOT NULL
GROUP BY rollupDay, s.video_id
),
recent_days AS (
SELECT DISTINCT rollup_day
FROM imm_daily_rollups
WHERE video_id = ?
@@ -337,12 +398,18 @@ export function getMediaDailyRollups(
total_sessions AS totalSessions,
total_active_min AS totalActiveMin,
total_lines_seen AS totalLinesSeen,
total_tokens_seen AS totalTokensSeen,
COALESCE(dwc.totalTokensSeen, total_tokens_seen) AS totalTokensSeen,
total_cards AS totalCards,
cards_per_hour AS cardsPerHour,
tokens_per_min AS tokensPerMin,
CASE
WHEN total_active_min > 0 THEN COALESCE(dwc.totalTokensSeen, total_tokens_seen) * 1.0 / total_active_min
ELSE NULL
END AS tokensPerMin,
lookup_hit_rate AS lookupHitRate
FROM imm_daily_rollups
LEFT JOIN daily_word_counts dwc
ON dwc.rollupDay = rollup_day
AND dwc.videoId = video_id
WHERE video_id = ?
AND rollup_day IN (SELECT rollup_day FROM recent_days)
ORDER BY rollup_day DESC, video_id DESC
@@ -356,10 +423,27 @@ export function getAnimeDailyRollups(
animeId: number,
limit = 90,
): ImmersionSessionRollupRow[] {
const wordsExpr = sessionDisplayWordsExpr('s', 'swc');
return db
.prepare(
`
WITH recent_days AS (
WITH session_word_counts AS (
${SESSION_WORD_COUNTS_SELECT}
),
daily_word_counts AS (
SELECT
CAST(
julianday(CAST(s.started_at_ms AS REAL) / 1000, 'unixepoch', 'localtime') - 2440587.5
AS INTEGER
) AS rollupDay,
s.video_id AS videoId,
SUM(${wordsExpr}) AS totalTokensSeen
FROM imm_sessions s
LEFT JOIN session_word_counts swc ON swc.sessionId = s.session_id
WHERE s.ended_at_ms IS NOT NULL
GROUP BY rollupDay, s.video_id
),
recent_days AS (
SELECT DISTINCT r.rollup_day
FROM imm_daily_rollups r
JOIN imm_videos v ON v.video_id = r.video_id
@@ -370,11 +454,19 @@ export function getAnimeDailyRollups(
SELECT r.rollup_day AS rollupDayOrMonth, r.video_id AS videoId,
r.total_sessions AS totalSessions, r.total_active_min AS totalActiveMin,
r.total_lines_seen AS totalLinesSeen,
r.total_tokens_seen AS totalTokensSeen, r.total_cards AS totalCards,
r.cards_per_hour AS cardsPerHour, r.tokens_per_min AS tokensPerMin,
COALESCE(dwc.totalTokensSeen, r.total_tokens_seen) AS totalTokensSeen,
r.total_cards AS totalCards,
r.cards_per_hour AS cardsPerHour,
CASE
WHEN r.total_active_min > 0 THEN COALESCE(dwc.totalTokensSeen, r.total_tokens_seen) * 1.0 / r.total_active_min
ELSE NULL
END AS tokensPerMin,
r.lookup_hit_rate AS lookupHitRate
FROM imm_daily_rollups r
JOIN imm_videos v ON v.video_id = r.video_id
LEFT JOIN daily_word_counts dwc
ON dwc.rollupDay = r.rollup_day
AND dwc.videoId = r.video_id
WHERE v.anime_id = ?
AND r.rollup_day IN (SELECT rollup_day FROM recent_days)
ORDER BY r.rollup_day DESC, r.video_id DESC
@@ -470,7 +562,7 @@ export function getAnimeWords(db: DatabaseSync, animeId: number, limit = 50): An
FROM imm_word_line_occurrences o
JOIN imm_subtitle_lines sl ON sl.line_id = o.line_id
JOIN imm_words w ON w.id = o.word_id
WHERE sl.anime_id = ?
WHERE sl.anime_id = ? AND ${visibleWordSql('w')}
GROUP BY w.id
ORDER BY frequency DESC
LIMIT ?
@@ -556,6 +648,7 @@ export function getEpisodeWords(db: DatabaseSync, videoId: number, limit = 50):
}
export function getEpisodeSessions(db: DatabaseSync, videoId: number): SessionSummaryQueryRow[] {
const wordsExpr = sessionDisplayWordsExpr('s', 'swc', 'COALESCE(asm.tokensSeen, s.tokens_seen)');
const rows = db
.prepare(
`
@@ -567,7 +660,7 @@ export function getEpisodeSessions(db: DatabaseSync, videoId: number): SessionSu
COALESCE(asm.totalWatchedMs, s.total_watched_ms, 0) AS totalWatchedMs,
COALESCE(asm.activeWatchedMs, s.active_watched_ms, 0) AS activeWatchedMs,
COALESCE(asm.linesSeen, s.lines_seen, 0) AS linesSeen,
COALESCE(asm.tokensSeen, s.tokens_seen, 0) AS tokensSeen,
${wordsExpr} AS tokensSeen,
COALESCE(asm.cardsMined, s.cards_mined, 0) AS cardsMined,
COALESCE(asm.lookupCount, s.lookup_count, 0) AS lookupCount,
COALESCE(asm.lookupHits, s.lookup_hits, 0) AS lookupHits,
@@ -575,6 +668,7 @@ export function getEpisodeSessions(db: DatabaseSync, videoId: number): SessionSu
FROM imm_sessions s
JOIN imm_videos v ON v.video_id = s.video_id
LEFT JOIN active_session_metrics asm ON asm.sessionId = s.session_id
LEFT JOIN session_word_counts swc ON swc.sessionId = s.session_id
WHERE s.video_id = ?
ORDER BY s.started_at_ms DESC
`,
@@ -6,14 +6,18 @@ import type {
} from './types';
import {
ACTIVE_SESSION_METRICS_CTE,
SESSION_WORD_COUNTS_CTE,
SESSION_WORD_COUNTS_SELECT,
currentDbTimestamp,
fromDbTimestamp,
getLocalEpochDay,
getShiftedLocalDaySec,
toDbTimestamp,
sessionDisplayWordsExpr,
visibleWordSql,
} from './query-shared';
export function getSessionSummaries(db: DatabaseSync, limit = 50): SessionSummaryQueryRow[] {
const wordsExpr = sessionDisplayWordsExpr('s', 'swc', 'COALESCE(asm.tokensSeen, s.tokens_seen)');
const prepared = db.prepare(`
${ACTIVE_SESSION_METRICS_CTE}
SELECT
@@ -27,13 +31,14 @@ export function getSessionSummaries(db: DatabaseSync, limit = 50): SessionSummar
COALESCE(asm.totalWatchedMs, s.total_watched_ms, 0) AS totalWatchedMs,
COALESCE(asm.activeWatchedMs, s.active_watched_ms, 0) AS activeWatchedMs,
COALESCE(asm.linesSeen, s.lines_seen, 0) AS linesSeen,
COALESCE(asm.tokensSeen, s.tokens_seen, 0) AS tokensSeen,
${wordsExpr} AS tokensSeen,
COALESCE(asm.cardsMined, s.cards_mined, 0) AS cardsMined,
COALESCE(asm.lookupCount, s.lookup_count, 0) AS lookupCount,
COALESCE(asm.lookupHits, s.lookup_hits, 0) AS lookupHits,
COALESCE(asm.yomitanLookupCount, s.yomitan_lookup_count, 0) AS yomitanLookupCount
FROM imm_sessions s
LEFT JOIN active_session_metrics asm ON asm.sessionId = s.session_id
LEFT JOIN session_word_counts swc ON swc.sessionId = s.session_id
LEFT JOIN imm_videos v ON v.video_id = s.video_id
LEFT JOIN imm_anime a ON a.anime_id = v.anime_id
ORDER BY s.started_at_ms DESC
@@ -94,7 +99,9 @@ export function getSessionTimeline(
/** Returns all distinct headwords in the vocabulary table (global). */
export function getAllDistinctHeadwords(db: DatabaseSync): string[] {
const rows = db.prepare('SELECT DISTINCT headword FROM imm_words').all() as Array<{
const rows = db
.prepare(`SELECT DISTINCT headword FROM imm_words w WHERE ${visibleWordSql('w')}`)
.all() as Array<{
headword: string;
}>;
return rows.map((r) => r.headword);
@@ -109,7 +116,7 @@ export function getAnimeDistinctHeadwords(db: DatabaseSync, animeId: number): st
FROM imm_word_line_occurrences o
JOIN imm_subtitle_lines sl ON sl.line_id = o.line_id
JOIN imm_words w ON w.id = o.word_id
WHERE sl.anime_id = ?
WHERE sl.anime_id = ? AND ${visibleWordSql('w')}
`,
)
.all(animeId) as Array<{ headword: string }>;
@@ -125,7 +132,7 @@ export function getMediaDistinctHeadwords(db: DatabaseSync, videoId: number): st
FROM imm_word_line_occurrences o
JOIN imm_subtitle_lines sl ON sl.line_id = o.line_id
JOIN imm_words w ON w.id = o.word_id
WHERE sl.video_id = ?
WHERE sl.video_id = ? AND ${visibleWordSql('w')}
`,
)
.all(videoId) as Array<{ headword: string }>;
@@ -148,7 +155,7 @@ export function getSessionWordsByLine(
FROM imm_subtitle_lines sl
JOIN imm_word_line_occurrences wlo ON wlo.line_id = sl.line_id
JOIN imm_words w ON w.id = wlo.word_id
WHERE sl.session_id = ?
WHERE sl.session_id = ? AND ${visibleWordSql('w')}
ORDER BY sl.line_index ASC
`);
return stmt.all(sessionId) as Array<{
@@ -290,11 +297,17 @@ export function getQueryHints(db: DatabaseSync): {
const totalCards = Number(lifetime?.totalCards ?? 0);
const activeDays = Number(lifetime?.activeDays ?? 0);
const lookupWordsExpr = sessionDisplayWordsExpr(
's',
'swc',
'COALESCE(t.tokens_seen, s.tokens_seen)',
);
const lookupTotals = db
.prepare(
`
${SESSION_WORD_COUNTS_CTE}
SELECT
COALESCE(SUM(COALESCE(t.tokens_seen, s.tokens_seen, 0)), 0) AS totalTokensSeen,
COALESCE(SUM(${lookupWordsExpr}), 0) AS totalTokensSeen,
COALESCE(SUM(COALESCE(t.lookup_count, s.lookup_count, 0)), 0) AS totalLookupCount,
COALESCE(SUM(COALESCE(t.lookup_hits, s.lookup_hits, 0)), 0) AS totalLookupHits,
COALESCE(SUM(COALESCE(t.yomitan_lookup_count, s.yomitan_lookup_count, 0)), 0) AS totalYomitanLookupCount
@@ -309,6 +322,7 @@ export function getQueryHints(db: DatabaseSync): {
FROM imm_session_telemetry
GROUP BY session_id
) t ON t.session_id = s.session_id
LEFT JOIN session_word_counts swc ON swc.sessionId = s.session_id
WHERE s.ended_at_ms IS NOT NULL
`,
)
@@ -338,8 +352,25 @@ export function getQueryHints(db: DatabaseSync): {
}
export function getDailyRollups(db: DatabaseSync, limit = 60): ImmersionSessionRollupRow[] {
const wordsExpr = sessionDisplayWordsExpr('s', 'swc');
const prepared = db.prepare(`
WITH recent_days AS (
WITH session_word_counts AS (
${SESSION_WORD_COUNTS_SELECT}
),
daily_word_counts AS (
SELECT
CAST(
julianday(CAST(s.started_at_ms AS REAL) / 1000, 'unixepoch', 'localtime') - 2440587.5
AS INTEGER
) AS rollupDay,
s.video_id AS videoId,
SUM(${wordsExpr}) AS totalTokensSeen
FROM imm_sessions s
LEFT JOIN session_word_counts swc ON swc.sessionId = s.session_id
WHERE s.ended_at_ms IS NOT NULL
GROUP BY rollupDay, s.video_id
),
recent_days AS (
SELECT DISTINCT rollup_day
FROM imm_daily_rollups
ORDER BY rollup_day DESC
@@ -351,12 +382,21 @@ export function getDailyRollups(db: DatabaseSync, limit = 60): ImmersionSessionR
r.total_sessions AS totalSessions,
r.total_active_min AS totalActiveMin,
r.total_lines_seen AS totalLinesSeen,
r.total_tokens_seen AS totalTokensSeen,
COALESCE(dwc.totalTokensSeen, r.total_tokens_seen) AS totalTokensSeen,
r.total_cards AS totalCards,
r.cards_per_hour AS cardsPerHour,
r.tokens_per_min AS tokensPerMin,
CASE
WHEN r.total_active_min > 0 THEN COALESCE(dwc.totalTokensSeen, r.total_tokens_seen) * 1.0 / r.total_active_min
ELSE NULL
END AS tokensPerMin,
r.lookup_hit_rate AS lookupHitRate
FROM imm_daily_rollups r
LEFT JOIN daily_word_counts dwc
ON dwc.rollupDay = r.rollup_day
AND (
(dwc.videoId IS NULL AND r.video_id IS NULL)
OR dwc.videoId = r.video_id
)
WHERE r.rollup_day IN (SELECT rollup_day FROM recent_days)
ORDER BY r.rollup_day DESC, r.video_id DESC
`);
@@ -365,33 +405,53 @@ export function getDailyRollups(db: DatabaseSync, limit = 60): ImmersionSessionR
}
export function getMonthlyRollups(db: DatabaseSync, limit = 24): ImmersionSessionRollupRow[] {
const wordsExpr = sessionDisplayWordsExpr('s', 'swc');
const prepared = db.prepare(`
WITH recent_months AS (
WITH session_word_counts AS (
${SESSION_WORD_COUNTS_SELECT}
),
monthly_word_counts AS (
SELECT
CAST(strftime('%Y%m', CAST(s.started_at_ms AS REAL) / 1000, 'unixepoch', 'localtime') AS INTEGER) AS rollupMonth,
s.video_id AS videoId,
SUM(${wordsExpr}) AS totalTokensSeen
FROM imm_sessions s
LEFT JOIN session_word_counts swc ON swc.sessionId = s.session_id
WHERE s.ended_at_ms IS NOT NULL
GROUP BY rollupMonth, s.video_id
),
recent_months AS (
SELECT DISTINCT rollup_month
FROM imm_monthly_rollups
ORDER BY rollup_month DESC
LIMIT ?
)
SELECT
rollup_month AS rollupDayOrMonth,
video_id AS videoId,
total_sessions AS totalSessions,
total_active_min AS totalActiveMin,
total_lines_seen AS totalLinesSeen,
total_tokens_seen AS totalTokensSeen,
total_cards AS totalCards,
r.rollup_month AS rollupDayOrMonth,
r.video_id AS videoId,
r.total_sessions AS totalSessions,
r.total_active_min AS totalActiveMin,
r.total_lines_seen AS totalLinesSeen,
COALESCE(mwc.totalTokensSeen, r.total_tokens_seen) AS totalTokensSeen,
r.total_cards AS totalCards,
CASE
WHEN total_active_min > 0 THEN (total_cards * 60.0) / total_active_min
WHEN r.total_active_min > 0 THEN (r.total_cards * 60.0) / r.total_active_min
ELSE NULL
END AS cardsPerHour,
CASE
WHEN total_active_min > 0 THEN total_tokens_seen * 1.0 / total_active_min
WHEN r.total_active_min > 0 THEN COALESCE(mwc.totalTokensSeen, r.total_tokens_seen) * 1.0 / r.total_active_min
ELSE NULL
END AS tokensPerMin,
NULL AS lookupHitRate
FROM imm_monthly_rollups
WHERE rollup_month IN (SELECT rollup_month FROM recent_months)
ORDER BY rollup_month DESC, video_id DESC
FROM imm_monthly_rollups r
LEFT JOIN monthly_word_counts mwc
ON mwc.rollupMonth = r.rollup_month
AND (
(mwc.videoId IS NULL AND r.video_id IS NULL)
OR mwc.videoId = r.video_id
)
WHERE r.rollup_month IN (SELECT rollup_month FROM recent_months)
ORDER BY r.rollup_month DESC, r.video_id DESC
`);
return prepared.all(limit) as unknown as ImmersionSessionRollupRow[];
}
@@ -1,6 +1,42 @@
import type { DatabaseSync } from './sqlite';
import { SUBTITLE_ANNOTATION_EXCLUDED_TERMS } from '../tokenizer/subtitle-annotation-filter';
import { nowMs } from './time';
/**
 * Escapes a value for inline use as a single-quoted SQL string literal.
 * Embedded single quotes are doubled per standard SQL quoting rules.
 */
function quoteSqlString(value: string): string {
  const escaped = value.replace(/'/g, "''");
  return `'${escaped}'`;
}
// Pre-quoted copies of the annotation-excluded terms, ready for a SQL IN (...) list.
const SQL_EXCLUDED_VOCABULARY_TERMS = Array.from(
  SUBTITLE_ANNOTATION_EXCLUDED_TERMS,
  quoteSqlString,
);
// A never-empty IN-list body: an empty term set degrades to the harmless literal ''.
const SQL_EXCLUDED_VOCABULARY_TERMS_LIST =
  SQL_EXCLUDED_VOCABULARY_TERMS.length === 0 ? "''" : SQL_EXCLUDED_VOCABULARY_TERMS.join(', ');
/**
 * Builds a SQL predicate hiding vocabulary rows whose word, headword, or
 * reading (trimmed) matches one of the baked-in annotation-excluded terms.
 *
 * @param wordAlias Alias of the `imm_words` table in the enclosing query.
 * @returns A parenthesized boolean expression, safe to AND into a WHERE clause.
 */
export function visibleWordSql(wordAlias: string): string {
return `(
TRIM(COALESCE(${wordAlias}.word, '')) NOT IN (${SQL_EXCLUDED_VOCABULARY_TERMS_LIST})
AND TRIM(COALESCE(${wordAlias}.headword, '')) NOT IN (${SQL_EXCLUDED_VOCABULARY_TERMS_LIST})
AND TRIM(COALESCE(${wordAlias}.reading, '')) NOT IN (${SQL_EXCLUDED_VOCABULARY_TERMS_LIST})
)`;
}
/**
 * SQL expression yielding an occurrence row's count, or 0 when the joined
 * word row is missing (LEFT JOIN miss) or filtered out by `visibleWordSql`.
 * Intended to be wrapped in SUM() so excluded terms contribute nothing.
 *
 * @param occurrenceAlias Alias of `imm_word_line_occurrences` in the query.
 * @param wordAlias Alias of the joined `imm_words` table.
 */
export function filteredWordOccurrenceCountSql(occurrenceAlias: string, wordAlias: string): string {
return `CASE
WHEN ${occurrenceAlias}.word_id IS NOT NULL AND ${visibleWordSql(wordAlias)}
THEN ${occurrenceAlias}.occurrence_count
ELSE 0
END`;
}
/**
 * Per-session persisted word totals: counts distinct persisted subtitle lines
 * and sums filtered word occurrences (excluded terms contribute 0 via
 * `filteredWordOccurrenceCountSql`). Meant to be wrapped in a
 * `session_word_counts` CTE; `persistedLineCount > 0` signals that filtered
 * totals exist for the session.
 */
export const SESSION_WORD_COUNTS_SELECT = `
SELECT
sl.session_id AS sessionId,
COUNT(DISTINCT sl.line_id) AS persistedLineCount,
COALESCE(SUM(${filteredWordOccurrenceCountSql('wlo', 'w')}), 0) AS filteredWordsSeen
FROM imm_subtitle_lines sl
LEFT JOIN imm_word_line_occurrences wlo ON wlo.line_id = sl.line_id
LEFT JOIN imm_words w ON w.id = wlo.word_id
GROUP BY sl.session_id
`;
export const ACTIVE_SESSION_METRICS_CTE = `
WITH active_session_metrics AS (
SELECT
@@ -17,9 +53,29 @@ export const ACTIVE_SESSION_METRICS_CTE = `
JOIN imm_sessions s ON s.session_id = t.session_id
WHERE s.ended_at_ms IS NULL
GROUP BY t.session_id
),
session_word_counts AS (
${SESSION_WORD_COUNTS_SELECT}
)
`;
/**
 * Standalone `WITH session_word_counts AS (...)` prefix for queries that need
 * the filtered per-session word totals but not the active-session metrics CTE.
 */
export const SESSION_WORD_COUNTS_CTE = `
WITH session_word_counts AS (
${SESSION_WORD_COUNTS_SELECT}
)
`;
/**
 * SQL expression for a session's displayed word count: prefers the filtered
 * persisted occurrence total whenever the session has any persisted subtitle
 * lines, otherwise falls back to the raw tokens-seen expression.
 *
 * @param sessionAlias Alias of `imm_sessions` in the enclosing query.
 * @param wordCountAlias Alias of the joined `session_word_counts` CTE.
 * @param rawTokensExpr SQL fallback used when no lines were persisted.
 */
export function sessionDisplayWordsExpr(
sessionAlias: string,
wordCountAlias: string,
rawTokensExpr = `${sessionAlias}.tokens_seen`,
): string {
return `CASE
WHEN COALESCE(${wordCountAlias}.persistedLineCount, 0) > 0 THEN COALESCE(${wordCountAlias}.filteredWordsSeen, 0)
ELSE COALESCE(${rawTokensExpr}, 0)
END`;
}
/**
 * Builds a comma-separated SQL placeholder list ("?,?,?") with one "?" per
 * input element, for binding a variable-length parameter list.
 *
 * Generalized: only the array's length is used, so any readonly array of
 * values (numbers, strings, ids) is accepted. Returns '' for an empty array.
 */
export function makePlaceholders(values: readonly unknown[]): string {
  return values.map(() => '?').join(',');
}
@@ -9,6 +9,7 @@ import {
getLocalMonthKey,
getShiftedLocalDayTimestamp,
makePlaceholders,
sessionDisplayWordsExpr,
toDbTimestamp,
} from './query-shared';
import { getDailyRollups, getMonthlyRollups } from './query-sessions';
@@ -560,6 +561,7 @@ function getTrendSessionMetrics(
db: DatabaseSync,
cutoffMs: string | null,
): TrendSessionMetricRow[] {
const wordsExpr = sessionDisplayWordsExpr('s', 'swc', 'COALESCE(asm.tokensSeen, s.tokens_seen)');
const whereClause = cutoffMs === null ? '' : 'WHERE s.started_at_ms >= ?';
const cutoffValue = cutoffMs === null ? null : toDbTimestamp(cutoffMs);
const prepared = db.prepare(`
@@ -570,11 +572,12 @@ function getTrendSessionMetrics(
v.canonical_title AS canonicalTitle,
a.canonical_title AS animeTitle,
COALESCE(asm.activeWatchedMs, s.active_watched_ms, 0) AS activeWatchedMs,
COALESCE(asm.tokensSeen, s.tokens_seen, 0) AS tokensSeen,
${wordsExpr} AS tokensSeen,
COALESCE(asm.cardsMined, s.cards_mined, 0) AS cardsMined,
COALESCE(asm.yomitanLookupCount, s.yomitan_lookup_count, 0) AS yomitanLookupCount
FROM imm_sessions s
LEFT JOIN active_session_metrics asm ON asm.sessionId = s.session_id
LEFT JOIN session_word_counts swc ON swc.sessionId = s.session_id
LEFT JOIN imm_videos v ON v.video_id = s.video_id
LEFT JOIN imm_anime a ON a.anime_id = v.anime_id
${whereClause}
@@ -4,6 +4,7 @@ import os from 'node:os';
import path from 'node:path';
import test from 'node:test';
import { Database } from './sqlite';
import { getStatsExcludedWords, replaceStatsExcludedWords } from './query-lexical';
import { finalizeSessionRecord, startSessionRecord } from './session';
import {
applyPragmas,
@@ -113,6 +114,7 @@ test('ensureSchema creates immersion core tables', () => {
assert.ok(tableNames.has('imm_rollup_state'));
assert.ok(tableNames.has('imm_cover_art_blobs'));
assert.ok(tableNames.has('imm_youtube_videos'));
assert.ok(tableNames.has('imm_stats_excluded_words'));
const videoColumns = new Set(
(
@@ -153,6 +155,32 @@ test('ensureSchema creates immersion core tables', () => {
}
});
// Round-trips the stats exclusion list through sqlite: replaceStatsExcludedWords
// swaps the whole table contents, and getStatsExcludedWords reads them back.
test('stats excluded words are replaced and read from sqlite storage', () => {
const dbPath = makeDbPath();
const db = new Database(dbPath);
try {
ensureSchema(db);
replaceStatsExcludedWords(db, [
{ headword: '猫', word: '猫', reading: 'ねこ' },
{ headword: 'する', word: 'する', reading: 'する' },
]);
// Read-back order differs from insertion order (する before 猫), so the
// query appears to return rows in a deterministic sorted order.
assert.deepEqual(getStatsExcludedWords(db), [
{ headword: 'する', word: 'する', reading: 'する' },
{ headword: '猫', word: '猫', reading: 'ねこ' },
]);
// Replace semantics: the previous two rows are dropped, only 犬 remains.
replaceStatsExcludedWords(db, [{ headword: '犬', word: '犬', reading: 'いぬ' }]);
assert.deepEqual(getStatsExcludedWords(db), [
{ headword: '犬', word: '犬', reading: 'いぬ' },
]);
} finally {
db.close();
cleanupDbPath(dbPath);
}
});
test('ensureSchema adds youtube metadata table to existing schema version 15 databases', () => {
const dbPath = makeDbPath();
const db = new Database(dbPath);
@@ -464,6 +464,19 @@ function ensureLifetimeSummaryTables(db: DatabaseSync): void {
`);
}
/**
 * Creates the `imm_stats_excluded_words` table that persists the stats
 * vocabulary exclusion list, keyed by (headword, word, reading).
 * Idempotent via CREATE TABLE IF NOT EXISTS, so safe to run on every startup.
 */
function ensureStatsExcludedWordsTable(db: DatabaseSync): void {
// NOTE(review): CREATED_DATE / LAST_UPDATE_DATE are uppercase, unlike the
// snake_case column names elsewhere in this schema — confirm intentional.
db.exec(`
CREATE TABLE IF NOT EXISTS imm_stats_excluded_words(
headword TEXT NOT NULL,
word TEXT NOT NULL,
reading TEXT NOT NULL,
CREATED_DATE TEXT,
LAST_UPDATE_DATE TEXT,
PRIMARY KEY(headword, word, reading)
)
`);
}
export function getOrCreateAnimeRecord(db: DatabaseSync, input: AnimeRecordInput): number {
const normalizedTitleKey = normalizeAnimeIdentityKey(input.parsedTitle);
if (!normalizedTitleKey) {
@@ -678,6 +691,7 @@ export function ensureSchema(db: DatabaseSync): void {
.get() as { schema_version: number } | null;
if (currentVersion?.schema_version === SCHEMA_VERSION) {
ensureLifetimeSummaryTables(db);
ensureStatsExcludedWordsTable(db);
return;
}
@@ -1221,6 +1235,7 @@ export function ensureSchema(db: DatabaseSync): void {
migrateSessionEventTimestampsToText(db);
ensureLifetimeSummaryTables(db);
ensureStatsExcludedWordsTable(db);
db.exec(`
CREATE INDEX IF NOT EXISTS idx_anime_normalized_title
+7 -1
View File
@@ -1,4 +1,4 @@
export const SCHEMA_VERSION = 17;
export const SCHEMA_VERSION = 18;
// Pipeline defaults — their consumers are not visible in this chunk, so the
// names are the only evidence of intent: presumably queue capacity, write
// batch size, and flush cadence in ms. Verify against the queue/flush code.
export const DEFAULT_QUEUE_CAP = 1_000;
export const DEFAULT_BATCH_SIZE = 25;
export const DEFAULT_FLUSH_INTERVAL_MS = 500;
@@ -301,6 +301,12 @@ export interface VocabularyStatsRow {
lastSeen: number;
}
/**
 * One persisted stats-exclusion entry, mirroring the key columns of the
 * `imm_stats_excluded_words` table.
 */
export interface StatsExcludedWordRow {
headword: string;
word: string;
reading: string;
}
export interface VocabularyCleanupSummary {
scanned: number;
kept: number;
+79 -16
View File
@@ -20,6 +20,12 @@ type StatsServerNoteInfo = {
fields: Record<string, { value: string }>;
};
/** Shape of one entry in the PUT /api/stats/excluded-words request body. */
type StatsExcludedWordPayload = {
headword: string;
word: string;
reading: string;
};
function parseIntQuery(raw: string | undefined, fallback: number, maxLimit?: number): number {
if (raw === undefined) return fallback;
const n = Number(raw);
@@ -49,6 +55,23 @@ function parseEventTypesQuery(raw: string | undefined): number[] | undefined {
return parsed.length > 0 ? parsed : undefined;
}
/**
 * Validates a PUT excluded-words body of the form `{ words: [...] }`.
 * Returns the normalized payload list, or null if the body or any entry is
 * malformed (missing `words` array, non-object entry, or non-string field).
 */
function parseExcludedWordsBody(body: unknown): StatsExcludedWordPayload[] | null {
  if (!body || typeof body !== 'object') return null;
  const rawWords = (body as { words?: unknown }).words;
  if (!Array.isArray(rawWords)) return null;
  const parsed: StatsExcludedWordPayload[] = [];
  for (const entry of rawWords) {
    if (!entry || typeof entry !== 'object') return null;
    const candidate = entry as Record<string, unknown>;
    const headword = candidate.headword;
    const word = candidate.word;
    const reading = candidate.reading;
    if (typeof headword !== 'string' || typeof word !== 'string' || typeof reading !== 'string') {
      return null;
    }
    // Keep only the three known fields; extra properties are dropped.
    parsed.push({ headword, word, reading });
  }
  return parsed;
}
function resolveStatsNoteFieldName(
noteInfo: StatsServerNoteInfo,
...preferredNames: (string | undefined)[]
@@ -161,6 +184,21 @@ function toKnownWordRate(knownWordsSeen: number, tokensSeen: number): number {
return Number(((knownWordsSeen / tokensSeen) * 100).toFixed(1));
}
/**
 * Totals word occurrences for a session: every row's count goes into
 * `totalWordsSeen`, and rows whose headword is in `knownWordsSet` also add to
 * `knownWordsSeen`. Line indices are ignored here — only counts matter.
 */
function summarizeFilteredWordOccurrences(
  wordsByLine: Array<{ lineIndex: number; headword: string; occurrenceCount: number }>,
  knownWordsSet: Set<string>,
): { knownWordsSeen: number; totalWordsSeen: number } {
  return wordsByLine.reduce(
    (totals, { headword, occurrenceCount }) => {
      totals.totalWordsSeen += occurrenceCount;
      if (knownWordsSet.has(headword)) {
        totals.knownWordsSeen += occurrenceCount;
      }
      return totals;
    },
    { knownWordsSeen: 0, totalWordsSeen: 0 },
  );
}
async function enrichSessionsWithKnownWordMetrics(
tracker: ImmersionTrackerService,
sessions: Array<{
@@ -188,21 +226,21 @@ async function enrichSessionsWithKnownWordMetrics(
const enriched = await Promise.all(
sessions.map(async (session) => {
let knownWordsSeen = 0;
let totalWordsSeen = 0;
try {
const wordsByLine = await tracker.getSessionWordsByLine(session.sessionId);
for (const row of wordsByLine) {
if (knownWordsSet.has(row.headword)) {
knownWordsSeen += row.occurrenceCount;
}
}
const summary = summarizeFilteredWordOccurrences(wordsByLine, knownWordsSet);
knownWordsSeen = summary.knownWordsSeen;
totalWordsSeen = summary.totalWordsSeen;
} catch {
knownWordsSeen = 0;
totalWordsSeen = 0;
}
return {
...session,
knownWordsSeen,
knownWordRate: toKnownWordRate(knownWordsSeen, session.tokensSeen),
knownWordRate: toKnownWordRate(knownWordsSeen, totalWordsSeen),
};
}),
);
@@ -391,32 +429,45 @@ export function createStatsApp(
const id = parseIntQuery(c.req.param('id'), 0);
if (id <= 0) return c.json([], 400);
const knownWordsSet = loadKnownWordsSet(options?.knownWordCachePath);
if (!knownWordsSet) return c.json([]);
const knownWordsSet = loadKnownWordsSet(options?.knownWordCachePath) ?? new Set<string>();
// Get per-line word occurrences for the session.
const wordsByLine = await tracker.getSessionWordsByLine(id);
// Build cumulative known-word occurrence count per recorded line index.
// Build cumulative filtered occurrence counts per recorded line index.
// The stats UI uses line-count progress to align this series with the session
// timeline, so preserve the stored line position rather than compressing gaps.
const lineGroups = new Map<number, number>();
const totalLineGroups = new Map<number, number>();
const knownLineGroups = new Map<number, number>();
for (const row of wordsByLine) {
if (!knownWordsSet.has(row.headword)) {
continue;
totalLineGroups.set(
row.lineIndex,
(totalLineGroups.get(row.lineIndex) ?? 0) + row.occurrenceCount,
);
if (knownWordsSet.has(row.headword)) {
knownLineGroups.set(
row.lineIndex,
(knownLineGroups.get(row.lineIndex) ?? 0) + row.occurrenceCount,
);
}
lineGroups.set(row.lineIndex, (lineGroups.get(row.lineIndex) ?? 0) + row.occurrenceCount);
}
const sortedLineIndices = [...lineGroups.keys()].sort((a, b) => a - b);
const sortedLineIndices = [...totalLineGroups.keys()].sort((a, b) => a - b);
let knownWordsSeen = 0;
const knownByLinesSeen: Array<{ linesSeen: number; knownWordsSeen: number }> = [];
let totalWordsSeen = 0;
const knownByLinesSeen: Array<{
linesSeen: number;
knownWordsSeen: number;
totalWordsSeen: number;
}> = [];
for (const lineIdx of sortedLineIndices) {
knownWordsSeen += lineGroups.get(lineIdx)!;
knownWordsSeen += knownLineGroups.get(lineIdx) ?? 0;
totalWordsSeen += totalLineGroups.get(lineIdx)!;
knownByLinesSeen.push({
linesSeen: lineIdx,
knownWordsSeen,
totalWordsSeen,
});
}
@@ -430,6 +481,18 @@ export function createStatsApp(
return c.json(vocab);
});
// Returns the persisted stats vocabulary exclusion list.
app.get('/api/stats/excluded-words', async (c) => {
return c.json(await tracker.getStatsExcludedWords());
});
// Replaces the exclusion list wholesale; responds 400 on a malformed body.
app.put('/api/stats/excluded-words', async (c) => {
// Unparsable JSON is treated the same as an invalid payload (-> null -> 400).
const body = await c.req.json().catch(() => null);
const words = parseExcludedWordsBody(body);
if (!words) return c.body(null, 400);
await tracker.replaceStatsExcludedWords(words);
return c.json({ ok: true });
});
app.get('/api/stats/vocabulary/occurrences', async (c) => {
const headword = (c.req.query('headword') ?? '').trim();
const word = (c.req.query('word') ?? '').trim();
@@ -5,6 +5,7 @@ import {
annotateTokens,
AnnotationStageDeps,
shouldExcludeTokenFromSubtitleAnnotations,
shouldExcludeTokenFromVocabularyPersistence,
stripSubtitleAnnotationMetadata,
} from './annotation-stage';
@@ -366,6 +367,87 @@ test('shouldExcludeTokenFromSubtitleAnnotations excludes kana-only non-independe
assert.equal(shouldExcludeTokenFromSubtitleAnnotations(token), true);
});
// The persistence filter must be a superset of the annotation filter: any
// grammar-pattern token hidden from subtitle annotations must also be kept
// out of persisted vocabulary. Both predicates are asserted per token.
test('shouldExcludeTokenFromVocabularyPersistence mirrors subtitle annotation grammar filters', () => {
const tokens = [
makeToken({
surface: 'どうしてもって',
headword: 'どうしても',
reading: 'ドウシテモッテ',
partOfSpeech: PartOfSpeech.other,
pos1: '副詞|助詞',
pos2: '一般|格助詞',
}),
makeToken({
surface: 'そうだ',
headword: 'そう',
reading: 'ソウダ',
partOfSpeech: PartOfSpeech.noun,
pos1: '名詞|助動詞',
pos2: '一般|',
pos3: '助動詞語幹|',
}),
];
for (const token of tokens) {
// token.surface as the assert message pinpoints the failing fixture.
assert.equal(shouldExcludeTokenFromSubtitleAnnotations(token), true, token.surface);
assert.equal(shouldExcludeTokenFromVocabularyPersistence(token), true, token.surface);
}
});
// High-frequency stop terms (じゃない, である, 何か, 確かに, 貴方) must be
// excluded from vocabulary persistence regardless of their varied
// part-of-speech metadata — each fixture exercises a different POS shape.
test('shouldExcludeTokenFromVocabularyPersistence excludes common frequency stop terms', () => {
const tokens = [
makeToken({
surface: 'じゃない',
headword: 'じゃない',
reading: '',
partOfSpeech: PartOfSpeech.i_adjective,
pos1: '形容詞',
pos2: '*|自立',
pos3: '*',
}),
makeToken({
surface: 'である',
headword: 'である',
reading: '',
partOfSpeech: PartOfSpeech.verb,
pos1: '動詞',
pos2: '*',
pos3: '*',
}),
makeToken({
surface: '何か',
headword: '何か',
reading: 'なにか',
partOfSpeech: PartOfSpeech.other,
pos1: '名詞|助詞',
pos2: '代名詞|副助詞/並立助詞/終助詞',
pos3: '一般|*',
}),
makeToken({
surface: '確かに',
headword: '確かに',
reading: 'たしかに',
partOfSpeech: PartOfSpeech.other,
pos1: '名詞|助詞',
pos2: '形容動詞語幹|副詞化',
pos3: '*',
}),
makeToken({
// 貴方 is the headword form; the kana surface あなた must still match.
surface: 'あなた',
headword: '貴方',
reading: 'あなた',
partOfSpeech: PartOfSpeech.noun,
pos1: '名詞',
pos2: '代名詞',
pos3: '一般',
}),
];
for (const token of tokens) {
assert.equal(shouldExcludeTokenFromVocabularyPersistence(token), true, token.surface);
}
});
test('stripSubtitleAnnotationMetadata keeps token hover data while clearing annotation fields', () => {
const token = makeToken({
surface: 'は',
@@ -328,10 +328,12 @@ export function shouldExcludeTokenFromVocabularyPersistence(
token: MergedToken,
options: Pick<AnnotationStageOptions, 'pos1Exclusions' | 'pos2Exclusions'> = {},
): boolean {
return isFrequencyExcludedByPos(
token,
resolvePos1Exclusions(options),
resolvePos2Exclusions(options),
const pos1Exclusions = resolvePos1Exclusions(options);
const pos2Exclusions = resolvePos2Exclusions(options);
return (
sharedShouldExcludeTokenFromSubtitleAnnotations(token, { pos1Exclusions, pos2Exclusions }) ||
isFrequencyExcludedByPos(token, pos1Exclusions, pos2Exclusions)
);
}
@@ -13,17 +13,40 @@ const KATAKANA_TO_HIRAGANA_OFFSET = 0x60;
const KATAKANA_CODEPOINT_START = 0x30a1;
const KATAKANA_CODEPOINT_END = 0x30f6;
const SUBTITLE_ANNOTATION_EXCLUDED_TERMS = new Set([
/**
 * Terms never surfaced as subtitle annotations; this set is also quoted into
 * the SQL vocabulary-visibility filter, matched against trimmed
 * word/headword/reading columns. Entries appear to mix surface forms,
 * headwords, and kana — keep them verbatim; insertion order feeds the
 * generated SQL IN-list.
 */
export const SUBTITLE_ANNOTATION_EXCLUDED_TERMS = new Set([
'あ',
'ああ',
'あなた',
'あんた',
'ええ',
'うう',
'おお',
'おい',
'お前',
'こいつ',
'こっち',
'じゃない',
'そうだ',
'たち',
'である',
'どこか',
'なんか',
'べき',
'はあ',
'はは',
'へえ',
'ふう',
'ほう',
'やはり',
'って',
'何か',
'何だ',
'何も',
'如何した',
'様',
'確かに',
'誰も',
'貴方',
]);
const SUBTITLE_ANNOTATION_EXCLUDED_EXPLANATORY_ENDING_PREFIXES = ['ん', 'の', 'なん', 'なの'];
const SUBTITLE_ANNOTATION_EXCLUDED_EXPLANATORY_ENDING_CORES = [