From 05805a316971245859f21b9e19ac3930e8e29132 Mon Sep 17 00:00:00 2001 From: sudacode Date: Sun, 1 Mar 2026 00:02:29 -0800 Subject: [PATCH] feat: add v3 immersion vocabulary tables --- docs/immersion-tracking.md | 9 ++- .../immersion-tracker-service.test.ts | 4 +- .../services/immersion-tracker-service.ts | 26 ++++++- .../immersion-tracker/reducer.test.ts | 22 ++++++ .../services/immersion-tracker/reducer.ts | 47 +++++++++++++ .../immersion-tracker/storage-session.test.ts | 46 +++++++++++++ .../services/immersion-tracker/storage.ts | 68 +++++++++++++++++++ src/core/services/immersion-tracker/types.ts | 37 +++++++++- 8 files changed, 252 insertions(+), 7 deletions(-) create mode 100644 src/core/services/immersion-tracker/reducer.test.ts diff --git a/docs/immersion-tracking.md b/docs/immersion-tracking.md index 69ae1c2..5573493 100644 --- a/docs/immersion-tracking.md +++ b/docs/immersion-tracking.md @@ -6,11 +6,12 @@ SubMiner stores immersion analytics in local SQLite (`immersion.sqlite`) by defa - Write path is asynchronous and queue-backed. - Hot paths (subtitle parsing/render/token flows) enqueue telemetry/events and never await SQLite writes. +- Background line processing also upserts to `imm_words` and `imm_kanji`. - Queue overflow policy is deterministic: drop oldest queued writes, keep newest. - Flush policy defaults to `25` writes or `500ms` max delay. - SQLite pragmas: `journal_mode=WAL`, `synchronous=NORMAL`, `foreign_keys=ON`, `busy_timeout=2500`. 
-## Schema (v2) +## Schema (v3) Schema versioning table: @@ -28,6 +29,12 @@ Rollups: - `imm_daily_rollups`: includes `CREATED_DATE`/`LAST_UPDATE_DATE` - `imm_monthly_rollups`: includes `CREATED_DATE`/`LAST_UPDATE_DATE` +Vocabulary: + +- `imm_words(id, headword, word, reading, first_seen, last_seen, frequency)` +- `imm_kanji(id, kanji, first_seen, last_seen, frequency)` +- `first_seen`/`last_seen` store Unix timestamps in fractional seconds (SQLite `REAL`) and are upserted with line ingestion + Primary index coverage: - session-by-video/time: `idx_sessions_video_started` diff --git a/src/core/services/immersion-tracker-service.test.ts b/src/core/services/immersion-tracker-service.test.ts index 2ba06d8..2c2e21c 100644 --- a/src/core/services/immersion-tracker-service.test.ts +++ b/src/core/services/immersion-tracker-service.test.ts @@ -74,8 +74,8 @@ test('seam: enqueueWrite drops oldest entries once capacity is exceeded', () => const result = enqueueWrite(queue, incoming, 2); assert.equal(result.dropped, 1); assert.equal(queue.length, 2); - assert.equal(queue[0]!.eventType, 2); - assert.equal(queue[1]!.eventType, 3); + assert.equal((queue[0] as Extract).eventType, 2); + assert.equal((queue[1] as Extract).eventType, 3); }); test('seam: toMonthKey uses UTC calendar month', () => { diff --git a/src/core/services/immersion-tracker-service.ts b/src/core/services/immersion-tracker-service.ts index a61124f..576d814 100644 --- a/src/core/services/immersion-tracker-service.ts +++ b/src/core/services/immersion-tracker-service.ts @@ -25,6 +25,7 @@ import { import { buildVideoKey, calculateTextMetrics, + extractLineVocabulary, deriveCanonicalTitle, isRemoteSource, normalizeMediaPath, @@ -268,18 +269,41 @@ export class ImmersionTrackerService { if (!this.sessionState || !text.trim()) return; const cleaned = normalizeText(text); if (!cleaned) return; + const nowMs = Date.now(); + const nowSec = nowMs / 1000; const metrics = calculateTextMetrics(cleaned); + const extractedVocabulary = extractLineVocabulary(cleaned); 
this.sessionState.currentLineIndex += 1; this.sessionState.linesSeen += 1; this.sessionState.wordsSeen += metrics.words; this.sessionState.tokensSeen += metrics.tokens; this.sessionState.pendingTelemetry = true; + for (const { headword, word, reading } of extractedVocabulary.words) { + this.recordWrite({ + kind: 'word', + headword, + word, + reading, + firstSeen: nowSec, + lastSeen: nowSec, + }); + } + + for (const kanji of extractedVocabulary.kanji) { + this.recordWrite({ + kind: 'kanji', + kanji, + firstSeen: nowSec, + lastSeen: nowSec, + }); + } + this.recordWrite({ kind: 'event', sessionId: this.sessionState.sessionId, - sampleMs: Date.now(), + sampleMs: nowMs, lineIndex: this.sessionState.currentLineIndex, segmentStartMs: secToMs(startSec), segmentEndMs: secToMs(endSec), diff --git a/src/core/services/immersion-tracker/reducer.test.ts b/src/core/services/immersion-tracker/reducer.test.ts new file mode 100644 index 0000000..27949ad --- /dev/null +++ b/src/core/services/immersion-tracker/reducer.test.ts @@ -0,0 +1,22 @@ +import test from 'node:test'; +import assert from 'node:assert/strict'; +import { extractLineVocabulary, isKanji } from './reducer'; + +test('isKanji follows canonical CJK ranges', () => { + assert.ok(isKanji('日')); + assert.ok(isKanji('𠀀')); + assert.ok(!isKanji('あ')); + assert.ok(!isKanji('a')); +}); + +test('extractLineVocabulary returns words and unique kanji', () => { + const result = extractLineVocabulary('hello 你好 猫'); + + assert.equal(result.words.length, 3); + assert.deepEqual( + new Set(result.words.map((entry) => `${entry.headword}/${entry.word}`)), + new Set(['hello/hello', '你好/你好', '猫/猫']), + ); + assert.equal(result.words.every((entry) => entry.reading === ''), true); + assert.deepEqual(new Set(result.kanji), new Set(['你', '好', '猫'])); +}); diff --git a/src/core/services/immersion-tracker/reducer.ts b/src/core/services/immersion-tracker/reducer.ts index f12deb7..2b6a90d 100644 --- a/src/core/services/immersion-tracker/reducer.ts 
+++ b/src/core/services/immersion-tracker/reducer.ts @@ -76,6 +76,53 @@ export function normalizeText(value: string | null | undefined): string { return value.trim().replace(/\s+/g, ' '); } +export interface ExtractedLineVocabulary { + words: Array<{ headword: string; word: string; reading: string }>; + kanji: string[]; +} + +export function isKanji(char: string): boolean { + if (!char) return false; + const code = char.codePointAt(0); + if (code === undefined) return false; + return ( + (code >= 0x4e00 && code <= 0x9fff) || + (code >= 0x3400 && code <= 0x4dbf) || + (code >= 0x20000 && code <= 0x2a6df) + ); +} + +export function extractLineVocabulary(value: string): ExtractedLineVocabulary { + const cleaned = normalizeText(value); + if (!cleaned) return { words: [], kanji: [] }; + + const wordSet = new Set<string>(); + const tokenPattern = /[A-Za-z0-9']+|[\u3040-\u30ff]+|[\u3400-\u4dbf\u4e00-\u9fff\u{20000}-\u{2a6df}]+/gu; + const rawWords = cleaned.match(tokenPattern) ?? []; + for (const rawWord of rawWords) { + const normalizedWord = normalizeText(rawWord.toLowerCase()); + if (!normalizedWord) continue; + wordSet.add(normalizedWord); + } + + const kanji = new Set<string>(); + for (const char of cleaned) { + if (isKanji(char)) { + kanji.add(char); + } + } + + const words = Array.from(wordSet).map((word) => ({ + headword: word, + word, + reading: '', + })); + return { + words, + kanji: Array.from(kanji), + }; +} + export function buildVideoKey(mediaPath: string, sourceType: number): string { if (sourceType === SOURCE_TYPE_REMOTE) { return `remote:${mediaPath}`; diff --git a/src/core/services/immersion-tracker/storage-session.test.ts b/src/core/services/immersion-tracker/storage-session.test.ts index 5d89008..1fd586b 100644 --- a/src/core/services/immersion-tracker/storage-session.test.ts +++ b/src/core/services/immersion-tracker/storage-session.test.ts @@ -54,6 +54,8 @@ testIfSqlite('ensureSchema creates immersion core tables', () => { assert.ok(tableNames.has('imm_session_events')); 
assert.ok(tableNames.has('imm_daily_rollups')); assert.ok(tableNames.has('imm_monthly_rollups')); + assert.ok(tableNames.has('imm_words')); + assert.ok(tableNames.has('imm_kanji')); } finally { db.close(); cleanupDbPath(dbPath); @@ -160,3 +162,47 @@ testIfSqlite('executeQueuedWrite inserts event and telemetry rows', () => { cleanupDbPath(dbPath); } }); + +testIfSqlite('executeQueuedWrite inserts and upserts word and kanji rows', () => { + const dbPath = makeDbPath(); + const db = new DatabaseSync!(dbPath); + + try { + ensureSchema(db); + const stmts = createTrackerPreparedStatements(db); + + stmts.wordUpsertStmt.run('猫', '猫', '', 10.0, 10.0); + stmts.wordUpsertStmt.run('猫', '猫', '', 5.0, 15.0); + stmts.kanjiUpsertStmt.run('日', 9.0, 9.0); + stmts.kanjiUpsertStmt.run('日', 8.0, 11.0); + + const wordRow = db + .prepare('SELECT headword, frequency, first_seen, last_seen FROM imm_words WHERE headword = ?') + .get('猫') as { + headword: string; + frequency: number; + first_seen: number; + last_seen: number; + } | null; + const kanjiRow = db + .prepare('SELECT kanji, frequency, first_seen, last_seen FROM imm_kanji WHERE kanji = ?') + .get('日') as { + kanji: string; + frequency: number; + first_seen: number; + last_seen: number; + } | null; + + assert.ok(wordRow); + assert.ok(kanjiRow); + assert.equal(wordRow?.frequency, 2); + assert.equal(kanjiRow?.frequency, 2); + assert.equal(wordRow?.first_seen, 5); + assert.equal(wordRow?.last_seen, 15); + assert.equal(kanjiRow?.first_seen, 8); + assert.equal(kanjiRow?.last_seen, 11); + } finally { + db.close(); + cleanupDbPath(dbPath); + } +}); diff --git a/src/core/services/immersion-tracker/storage.ts b/src/core/services/immersion-tracker/storage.ts index 1e42078..fb822c8 100644 --- a/src/core/services/immersion-tracker/storage.ts +++ b/src/core/services/immersion-tracker/storage.ts @@ -5,6 +5,8 @@ import type { QueuedWrite, VideoMetadata } from './types'; export interface TrackerPreparedStatements { telemetryInsertStmt: ReturnType; 
eventInsertStmt: ReturnType; + wordUpsertStmt: ReturnType; + kanjiUpsertStmt: ReturnType; } function hasColumn(db: DatabaseSync, tableName: string, columnName: string): boolean { @@ -154,6 +156,28 @@ export function ensureSchema(db: DatabaseSync): void { PRIMARY KEY (rollup_month, video_id) ); `); + db.exec(` + CREATE TABLE IF NOT EXISTS imm_words( + id INTEGER PRIMARY KEY AUTOINCREMENT, + headword TEXT, + word TEXT, + reading TEXT, + first_seen REAL, + last_seen REAL, + frequency INTEGER, + UNIQUE(headword, word, reading) + ); + `); + db.exec(` + CREATE TABLE IF NOT EXISTS imm_kanji( + id INTEGER PRIMARY KEY AUTOINCREMENT, + kanji TEXT, + first_seen REAL, + last_seen REAL, + frequency INTEGER, + UNIQUE(kanji) + ); + `); db.exec(` CREATE INDEX IF NOT EXISTS idx_sessions_video_started @@ -183,6 +207,14 @@ export function ensureSchema(db: DatabaseSync): void { CREATE INDEX IF NOT EXISTS idx_rollups_month_video ON imm_monthly_rollups(rollup_month, video_id) `); + db.exec(` + CREATE INDEX IF NOT EXISTS idx_words_headword_word_reading + ON imm_words(headword, word, reading) + `); + db.exec(` + CREATE INDEX IF NOT EXISTS idx_kanji_kanji + ON imm_kanji(kanji) + `); if (currentVersion?.schema_version === 1) { addColumnIfMissing(db, 'imm_videos', 'CREATED_DATE'); @@ -283,6 +315,28 @@ export function createTrackerPreparedStatements(db: DatabaseSync): TrackerPrepar ?, ?, ?, ?, ?, ?, ?, ?, ?, ? 
) `), + wordUpsertStmt: db.prepare(` + INSERT INTO imm_words ( + headword, word, reading, first_seen, last_seen, frequency + ) VALUES ( + ?, ?, ?, ?, ?, 1 + ) + ON CONFLICT(headword, word, reading) DO UPDATE SET + frequency = COALESCE(frequency, 0) + 1, + first_seen = MIN(COALESCE(first_seen, excluded.first_seen), excluded.first_seen), + last_seen = MAX(COALESCE(last_seen, excluded.last_seen), excluded.last_seen) + `), + kanjiUpsertStmt: db.prepare(` + INSERT INTO imm_kanji ( + kanji, first_seen, last_seen, frequency + ) VALUES ( + ?, ?, ?, 1 + ) + ON CONFLICT(kanji) DO UPDATE SET + frequency = COALESCE(frequency, 0) + 1, + first_seen = MIN(COALESCE(first_seen, excluded.first_seen), excluded.first_seen), + last_seen = MAX(COALESCE(last_seen, excluded.last_seen), excluded.last_seen) + `), }; } @@ -309,6 +363,20 @@ export function executeQueuedWrite(write: QueuedWrite, stmts: TrackerPreparedSta ); return; } + if (write.kind === 'word') { + stmts.wordUpsertStmt.run( + write.headword, + write.word, + write.reading, + write.firstSeen, + write.lastSeen, + ); + return; + } + if (write.kind === 'kanji') { + stmts.kanjiUpsertStmt.run(write.kanji, write.firstSeen, write.lastSeen); + return; + } stmts.eventInsertStmt.run( write.sessionId, diff --git a/src/core/services/immersion-tracker/types.ts b/src/core/services/immersion-tracker/types.ts index 72e43af..975481c 100644 --- a/src/core/services/immersion-tracker/types.ts +++ b/src/core/services/immersion-tracker/types.ts @@ -1,4 +1,4 @@ -export const SCHEMA_VERSION = 2; +export const SCHEMA_VERSION = 3; export const DEFAULT_QUEUE_CAP = 1_000; export const DEFAULT_BATCH_SIZE = 25; export const DEFAULT_FLUSH_INTERVAL_MS = 500; @@ -74,8 +74,8 @@ export interface SessionState extends TelemetryAccumulator { pendingTelemetry: boolean; } -export interface QueuedWrite { - kind: 'telemetry' | 'event'; +interface QueuedTelemetryWrite { + kind: 'telemetry'; sessionId: number; sampleMs?: number; totalWatchedMs?: number; @@ -100,6 +100,37 
@@ export interface QueuedWrite { payloadJson?: string | null; } +interface QueuedEventWrite { + kind: 'event'; + sessionId: number; + sampleMs?: number; + eventType?: number; + lineIndex?: number | null; + segmentStartMs?: number | null; + segmentEndMs?: number | null; + wordsDelta?: number; + cardsDelta?: number; + payloadJson?: string | null; +} + +interface QueuedWordWrite { + kind: 'word'; + headword: string; + word: string; + reading: string; + firstSeen: number; + lastSeen: number; +} + +interface QueuedKanjiWrite { + kind: 'kanji'; + kanji: string; + firstSeen: number; + lastSeen: number; +} + +export type QueuedWrite = QueuedTelemetryWrite | QueuedEventWrite | QueuedWordWrite | QueuedKanjiWrite; + export interface VideoMetadata { sourceType: number; canonicalTitle: string;