mirror of
https://github.com/ksyasuda/SubMiner.git
synced 2026-03-02 06:22:42 -08:00
feat: add v3 immersion vocabulary tables
This commit is contained in:
@@ -6,11 +6,12 @@ SubMiner stores immersion analytics in local SQLite (`immersion.sqlite`) by defa
|
|||||||
|
|
||||||
- Write path is asynchronous and queue-backed.
|
- Write path is asynchronous and queue-backed.
|
||||||
- Hot paths (subtitle parsing/render/token flows) enqueue telemetry/events and never await SQLite writes.
|
- Hot paths (subtitle parsing/render/token flows) enqueue telemetry/events and never await SQLite writes.
|
||||||
|
- Background line processing also upserts to `imm_words` and `imm_kanji`.
|
||||||
- Queue overflow policy is deterministic: drop oldest queued writes, keep newest.
|
- Queue overflow policy is deterministic: drop oldest queued writes, keep newest.
|
||||||
- Flush policy defaults to `25` writes or `500ms` max delay.
|
- Flush policy defaults to `25` writes or `500ms` max delay.
|
||||||
- SQLite pragmas: `journal_mode=WAL`, `synchronous=NORMAL`, `foreign_keys=ON`, `busy_timeout=2500`.
|
- SQLite pragmas: `journal_mode=WAL`, `synchronous=NORMAL`, `foreign_keys=ON`, `busy_timeout=2500`.
|
||||||
|
|
||||||
## Schema (v2)
|
## Schema (v3)
|
||||||
|
|
||||||
Schema versioning table:
|
Schema versioning table:
|
||||||
|
|
||||||
@@ -28,6 +29,12 @@ Rollups:
|
|||||||
- `imm_daily_rollups`: includes `CREATED_DATE`/`LAST_UPDATE_DATE`
|
- `imm_daily_rollups`: includes `CREATED_DATE`/`LAST_UPDATE_DATE`
|
||||||
- `imm_monthly_rollups`: includes `CREATED_DATE`/`LAST_UPDATE_DATE`
|
- `imm_monthly_rollups`: includes `CREATED_DATE`/`LAST_UPDATE_DATE`
|
||||||
|
|
||||||
|
Vocabulary:
|
||||||
|
|
||||||
|
- `imm_words(id, headword, word, reading, first_seen, last_seen, frequency)`
|
||||||
|
- `imm_kanji(id, kanji, first_seen, last_seen, frequency)`
|
||||||
|
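- `first_seen`/`last_seen` store Unix timestamps in seconds (fractional, `REAL` columns) and are upserted during line ingestion: `first_seen` keeps the earliest value, `last_seen` the latest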
|
||||||
|
|
||||||
Primary index coverage:
|
Primary index coverage:
|
||||||
|
|
||||||
- session-by-video/time: `idx_sessions_video_started`
|
- session-by-video/time: `idx_sessions_video_started`
|
||||||
|
|||||||
@@ -74,8 +74,8 @@ test('seam: enqueueWrite drops oldest entries once capacity is exceeded', () =>
|
|||||||
const result = enqueueWrite(queue, incoming, 2);
|
const result = enqueueWrite(queue, incoming, 2);
|
||||||
assert.equal(result.dropped, 1);
|
assert.equal(result.dropped, 1);
|
||||||
assert.equal(queue.length, 2);
|
assert.equal(queue.length, 2);
|
||||||
assert.equal(queue[0]!.eventType, 2);
|
assert.equal((queue[0] as Extract<QueuedWrite, { kind: 'event' }>).eventType, 2);
|
||||||
assert.equal(queue[1]!.eventType, 3);
|
assert.equal((queue[1] as Extract<QueuedWrite, { kind: 'event' }>).eventType, 3);
|
||||||
});
|
});
|
||||||
|
|
||||||
test('seam: toMonthKey uses UTC calendar month', () => {
|
test('seam: toMonthKey uses UTC calendar month', () => {
|
||||||
|
|||||||
@@ -25,6 +25,7 @@ import {
|
|||||||
import {
|
import {
|
||||||
buildVideoKey,
|
buildVideoKey,
|
||||||
calculateTextMetrics,
|
calculateTextMetrics,
|
||||||
|
extractLineVocabulary,
|
||||||
deriveCanonicalTitle,
|
deriveCanonicalTitle,
|
||||||
isRemoteSource,
|
isRemoteSource,
|
||||||
normalizeMediaPath,
|
normalizeMediaPath,
|
||||||
@@ -268,18 +269,41 @@ export class ImmersionTrackerService {
|
|||||||
if (!this.sessionState || !text.trim()) return;
|
if (!this.sessionState || !text.trim()) return;
|
||||||
const cleaned = normalizeText(text);
|
const cleaned = normalizeText(text);
|
||||||
if (!cleaned) return;
|
if (!cleaned) return;
|
||||||
|
const nowMs = Date.now();
|
||||||
|
const nowSec = nowMs / 1000;
|
||||||
|
|
||||||
const metrics = calculateTextMetrics(cleaned);
|
const metrics = calculateTextMetrics(cleaned);
|
||||||
|
const extractedVocabulary = extractLineVocabulary(cleaned);
|
||||||
this.sessionState.currentLineIndex += 1;
|
this.sessionState.currentLineIndex += 1;
|
||||||
this.sessionState.linesSeen += 1;
|
this.sessionState.linesSeen += 1;
|
||||||
this.sessionState.wordsSeen += metrics.words;
|
this.sessionState.wordsSeen += metrics.words;
|
||||||
this.sessionState.tokensSeen += metrics.tokens;
|
this.sessionState.tokensSeen += metrics.tokens;
|
||||||
this.sessionState.pendingTelemetry = true;
|
this.sessionState.pendingTelemetry = true;
|
||||||
|
|
||||||
|
for (const { headword, word, reading } of extractedVocabulary.words) {
|
||||||
|
this.recordWrite({
|
||||||
|
kind: 'word',
|
||||||
|
headword,
|
||||||
|
word,
|
||||||
|
reading,
|
||||||
|
firstSeen: nowSec,
|
||||||
|
lastSeen: nowSec,
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
for (const kanji of extractedVocabulary.kanji) {
|
||||||
|
this.recordWrite({
|
||||||
|
kind: 'kanji',
|
||||||
|
kanji,
|
||||||
|
firstSeen: nowSec,
|
||||||
|
lastSeen: nowSec,
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
this.recordWrite({
|
this.recordWrite({
|
||||||
kind: 'event',
|
kind: 'event',
|
||||||
sessionId: this.sessionState.sessionId,
|
sessionId: this.sessionState.sessionId,
|
||||||
sampleMs: Date.now(),
|
sampleMs: nowMs,
|
||||||
lineIndex: this.sessionState.currentLineIndex,
|
lineIndex: this.sessionState.currentLineIndex,
|
||||||
segmentStartMs: secToMs(startSec),
|
segmentStartMs: secToMs(startSec),
|
||||||
segmentEndMs: secToMs(endSec),
|
segmentEndMs: secToMs(endSec),
|
||||||
|
|||||||
22
src/core/services/immersion-tracker/reducer.test.ts
Normal file
22
src/core/services/immersion-tracker/reducer.test.ts
Normal file
@@ -0,0 +1,22 @@
|
|||||||
|
import test from 'node:test';
|
||||||
|
import assert from 'node:assert/strict';
|
||||||
|
import { extractLineVocabulary, isKanji } from './reducer';
|
||||||
|
|
||||||
|
test('isKanji follows canonical CJK ranges', () => {
  // '日' is in the basic CJK Unified Ideographs block; '𠀀' (U+20000) is a
  // supplementary-plane ideograph encoded as a surrogate pair.
  const ideographs = ['日', '𠀀'];
  // Hiragana and ASCII letters must be rejected.
  const nonIdeographs = ['あ', 'a'];

  for (const sample of ideographs) {
    assert.ok(isKanji(sample));
  }
  for (const sample of nonIdeographs) {
    assert.ok(!isKanji(sample));
  }
});
|
||||||
|
|
||||||
|
test('extractLineVocabulary returns words and unique kanji', () => {
  const { words, kanji } = extractLineVocabulary('hello 你好 猫');

  // One entry per distinct token: the ASCII word and the two ideograph runs.
  assert.equal(words.length, 3);

  // Order is not part of the contract being checked, so compare as sets.
  const headwordWordPairs = new Set(words.map(({ headword, word }) => `${headword}/${word}`));
  assert.deepEqual(headwordWordPairs, new Set(['hello/hello', '你好/你好', '猫/猫']));

  // Every extracted entry carries an empty reading.
  const allReadingsEmpty = words.every(({ reading }) => reading === '');
  assert.equal(allReadingsEmpty, true);

  // Kanji are reported per character, not per token.
  assert.deepEqual(new Set(kanji), new Set(['你', '好', '猫']));
});
|
||||||
@@ -76,6 +76,53 @@ export function normalizeText(value: string | null | undefined): string {
|
|||||||
return value.trim().replace(/\s+/g, ' ');
|
return value.trim().replace(/\s+/g, ' ');
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Vocabulary pulled out of a single line of text; this is the return shape of
 * `extractLineVocabulary`.
 */
export interface ExtractedLineVocabulary {
|
||||||
|
/**
 * Distinct word tokens found in the line. `extractLineVocabulary` currently
 * fills `headword` and `word` with the same lower-cased token and leaves
 * `reading` as an empty string.
 */
words: Array<{ headword: string; word: string; reading: string }>;
|
||||||
|
/** Distinct characters of the line for which `isKanji` returned true. */
kanji: string[];
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
 * Reports whether the first code point of `char` is a CJK ideograph.
 *
 * Recognised ranges:
 * - U+3400–U+4DBF   (CJK Unified Ideographs Extension A)
 * - U+4E00–U+9FFF   (CJK Unified Ideographs)
 * - U+20000–U+2A6DF (CJK Unified Ideographs Extension B)
 *
 * Only the first code point is inspected, so a longer string is classified by
 * its leading character. An empty string yields `false`.
 *
 * @param char String whose leading code point is tested.
 * @returns `true` when that code point lies in one of the ranges above.
 */
export function isKanji(char: string): boolean {
  if (!char) return false;

  const codePoint = char.codePointAt(0);
  if (codePoint === undefined) return false;

  // The ranges are disjoint and ascending, so walk them from lowest to highest.
  if (codePoint < 0x3400) return false;
  if (codePoint <= 0x4dbf) return true;
  if (codePoint < 0x4e00) return false;
  if (codePoint <= 0x9fff) return true;
  return codePoint >= 0x20000 && codePoint <= 0x2a6df;
}
|
||||||
|
|
||||||
|
/**
 * Extracts the distinct word tokens and distinct kanji from one line of text.
 *
 * The line is first passed through `normalizeText`. A token is a maximal run of
 * one of:
 * - ASCII letters, digits and apostrophes,
 * - kana (U+3040–U+30FF),
 * - CJK ideographs (U+3400–U+4DBF, U+4E00–U+9FFF, U+20000–U+2A6DF).
 *
 * Tokens are lower-cased and de-duplicated. Kanji are collected per character
 * from the whole normalized line using `isKanji`, independently of tokenizing.
 *
 * @param value Raw line text.
 * @returns `words` (one entry per distinct token, with `headword` and `word`
 *   both set to the token and `reading` set to `''`) and `kanji` (distinct
 *   ideographs in first-seen order). Both arrays are empty when the normalized
 *   line is empty.
 */
export function extractLineVocabulary(value: string): ExtractedLineVocabulary {
  const cleaned = normalizeText(value);
  if (!cleaned) return { words: [], kanji: [] };

  // The `u` flag and `\u{…}` escapes are required for the supplementary-plane
  // range. Without them `\u20000-\u2a6df` is read as `\u2000`, the range
  // `0-\u2a6d` and a literal `f`, which matches punctuation and most non-CJK
  // scripts while never matching U+20000–U+2A6DF.
  const tokenPattern =
    /[A-Za-z0-9']+|[\u3040-\u30ff]+|[\u3400-\u4dbf\u4e00-\u9fff\u{20000}-\u{2a6df}]+/gu;

  const wordSet = new Set<string>();
  for (const rawWord of cleaned.match(tokenPattern) ?? []) {
    const normalizedWord = normalizeText(rawWord.toLowerCase());
    if (normalizedWord) wordSet.add(normalizedWord);
  }

  // Iterating a string with for..of yields whole code points, so surrogate
  // pairs reach isKanji intact.
  const kanji = new Set<string>();
  for (const char of cleaned) {
    if (isKanji(char)) kanji.add(char);
  }

  return {
    words: Array.from(wordSet, (word) => ({ headword: word, word, reading: '' })),
    kanji: Array.from(kanji),
  };
}
|
||||||
|
|
||||||
export function buildVideoKey(mediaPath: string, sourceType: number): string {
|
export function buildVideoKey(mediaPath: string, sourceType: number): string {
|
||||||
if (sourceType === SOURCE_TYPE_REMOTE) {
|
if (sourceType === SOURCE_TYPE_REMOTE) {
|
||||||
return `remote:${mediaPath}`;
|
return `remote:${mediaPath}`;
|
||||||
|
|||||||
@@ -54,6 +54,8 @@ testIfSqlite('ensureSchema creates immersion core tables', () => {
|
|||||||
assert.ok(tableNames.has('imm_session_events'));
|
assert.ok(tableNames.has('imm_session_events'));
|
||||||
assert.ok(tableNames.has('imm_daily_rollups'));
|
assert.ok(tableNames.has('imm_daily_rollups'));
|
||||||
assert.ok(tableNames.has('imm_monthly_rollups'));
|
assert.ok(tableNames.has('imm_monthly_rollups'));
|
||||||
|
assert.ok(tableNames.has('imm_words'));
|
||||||
|
assert.ok(tableNames.has('imm_kanji'));
|
||||||
} finally {
|
} finally {
|
||||||
db.close();
|
db.close();
|
||||||
cleanupDbPath(dbPath);
|
cleanupDbPath(dbPath);
|
||||||
@@ -160,3 +162,47 @@ testIfSqlite('executeQueuedWrite inserts event and telemetry rows', () => {
|
|||||||
cleanupDbPath(dbPath);
|
cleanupDbPath(dbPath);
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
|
|
||||||
|
testIfSqlite('executeQueuedWrite inserts and upserts word and kanji rows', () => {
  // Columns shared by both vocabulary tables that the assertions inspect.
  type SeenColumns = { frequency: number; first_seen: number; last_seen: number };

  const dbPath = makeDbPath();
  const db = new DatabaseSync!(dbPath);

  try {
    ensureSchema(db);
    const stmts = createTrackerPreparedStatements(db);

    // The second run of each pair hits the unique key, so it takes the upsert
    // path with an earlier first-seen and a later last-seen than the insert.
    stmts.wordUpsertStmt.run('猫', '猫', '', 10.0, 10.0);
    stmts.wordUpsertStmt.run('猫', '猫', '', 5.0, 15.0);
    stmts.kanjiUpsertStmt.run('日', 9.0, 9.0);
    stmts.kanjiUpsertStmt.run('日', 8.0, 11.0);

    const wordQuery = db.prepare(
      'SELECT headword, frequency, first_seen, last_seen FROM imm_words WHERE headword = ?',
    );
    const wordRow = wordQuery.get('猫') as ({ headword: string } & SeenColumns) | null;

    const kanjiQuery = db.prepare(
      'SELECT kanji, frequency, first_seen, last_seen FROM imm_kanji WHERE kanji = ?',
    );
    const kanjiRow = kanjiQuery.get('日') as ({ kanji: string } & SeenColumns) | null;

    assert.ok(wordRow);
    assert.ok(kanjiRow);
    // Two runs per key: frequency counts both, first_seen keeps the minimum
    // and last_seen keeps the maximum.
    assert.equal(wordRow?.frequency, 2);
    assert.equal(kanjiRow?.frequency, 2);
    assert.equal(wordRow?.first_seen, 5);
    assert.equal(wordRow?.last_seen, 15);
    assert.equal(kanjiRow?.first_seen, 8);
    assert.equal(kanjiRow?.last_seen, 11);
  } finally {
    db.close();
    cleanupDbPath(dbPath);
  }
});
|
||||||
|
|||||||
@@ -5,6 +5,8 @@ import type { QueuedWrite, VideoMetadata } from './types';
|
|||||||
export interface TrackerPreparedStatements {
|
export interface TrackerPreparedStatements {
|
||||||
telemetryInsertStmt: ReturnType<DatabaseSync['prepare']>;
|
telemetryInsertStmt: ReturnType<DatabaseSync['prepare']>;
|
||||||
eventInsertStmt: ReturnType<DatabaseSync['prepare']>;
|
eventInsertStmt: ReturnType<DatabaseSync['prepare']>;
|
||||||
|
wordUpsertStmt: ReturnType<DatabaseSync['prepare']>;
|
||||||
|
kanjiUpsertStmt: ReturnType<DatabaseSync['prepare']>;
|
||||||
}
|
}
|
||||||
|
|
||||||
function hasColumn(db: DatabaseSync, tableName: string, columnName: string): boolean {
|
function hasColumn(db: DatabaseSync, tableName: string, columnName: string): boolean {
|
||||||
@@ -154,6 +156,28 @@ export function ensureSchema(db: DatabaseSync): void {
|
|||||||
PRIMARY KEY (rollup_month, video_id)
|
PRIMARY KEY (rollup_month, video_id)
|
||||||
);
|
);
|
||||||
`);
|
`);
|
||||||
|
db.exec(`
|
||||||
|
CREATE TABLE IF NOT EXISTS imm_words(
|
||||||
|
id INTEGER PRIMARY KEY AUTOINCREMENT,
|
||||||
|
headword TEXT,
|
||||||
|
word TEXT,
|
||||||
|
reading TEXT,
|
||||||
|
first_seen REAL,
|
||||||
|
last_seen REAL,
|
||||||
|
frequency INTEGER,
|
||||||
|
UNIQUE(headword, word, reading)
|
||||||
|
);
|
||||||
|
`);
|
||||||
|
db.exec(`
|
||||||
|
CREATE TABLE IF NOT EXISTS imm_kanji(
|
||||||
|
id INTEGER PRIMARY KEY AUTOINCREMENT,
|
||||||
|
kanji TEXT,
|
||||||
|
first_seen REAL,
|
||||||
|
last_seen REAL,
|
||||||
|
frequency INTEGER,
|
||||||
|
UNIQUE(kanji)
|
||||||
|
);
|
||||||
|
`);
|
||||||
|
|
||||||
db.exec(`
|
db.exec(`
|
||||||
CREATE INDEX IF NOT EXISTS idx_sessions_video_started
|
CREATE INDEX IF NOT EXISTS idx_sessions_video_started
|
||||||
@@ -183,6 +207,14 @@ export function ensureSchema(db: DatabaseSync): void {
|
|||||||
CREATE INDEX IF NOT EXISTS idx_rollups_month_video
|
CREATE INDEX IF NOT EXISTS idx_rollups_month_video
|
||||||
ON imm_monthly_rollups(rollup_month, video_id)
|
ON imm_monthly_rollups(rollup_month, video_id)
|
||||||
`);
|
`);
|
||||||
|
db.exec(`
|
||||||
|
CREATE INDEX IF NOT EXISTS idx_words_headword_word_reading
|
||||||
|
ON imm_words(headword, word, reading)
|
||||||
|
`);
|
||||||
|
db.exec(`
|
||||||
|
CREATE INDEX IF NOT EXISTS idx_kanji_kanji
|
||||||
|
ON imm_kanji(kanji)
|
||||||
|
`);
|
||||||
|
|
||||||
if (currentVersion?.schema_version === 1) {
|
if (currentVersion?.schema_version === 1) {
|
||||||
addColumnIfMissing(db, 'imm_videos', 'CREATED_DATE');
|
addColumnIfMissing(db, 'imm_videos', 'CREATED_DATE');
|
||||||
@@ -283,6 +315,28 @@ export function createTrackerPreparedStatements(db: DatabaseSync): TrackerPrepar
|
|||||||
?, ?, ?, ?, ?, ?, ?, ?, ?, ?
|
?, ?, ?, ?, ?, ?, ?, ?, ?, ?
|
||||||
)
|
)
|
||||||
`),
|
`),
|
||||||
|
wordUpsertStmt: db.prepare(`
|
||||||
|
INSERT INTO imm_words (
|
||||||
|
headword, word, reading, first_seen, last_seen, frequency
|
||||||
|
) VALUES (
|
||||||
|
?, ?, ?, ?, ?, 1
|
||||||
|
)
|
||||||
|
ON CONFLICT(headword, word, reading) DO UPDATE SET
|
||||||
|
frequency = COALESCE(frequency, 0) + 1,
|
||||||
|
first_seen = MIN(COALESCE(first_seen, excluded.first_seen), excluded.first_seen),
|
||||||
|
last_seen = MAX(COALESCE(last_seen, excluded.last_seen), excluded.last_seen)
|
||||||
|
`),
|
||||||
|
kanjiUpsertStmt: db.prepare(`
|
||||||
|
INSERT INTO imm_kanji (
|
||||||
|
kanji, first_seen, last_seen, frequency
|
||||||
|
) VALUES (
|
||||||
|
?, ?, ?, 1
|
||||||
|
)
|
||||||
|
ON CONFLICT(kanji) DO UPDATE SET
|
||||||
|
frequency = COALESCE(frequency, 0) + 1,
|
||||||
|
first_seen = MIN(COALESCE(first_seen, excluded.first_seen), excluded.first_seen),
|
||||||
|
last_seen = MAX(COALESCE(last_seen, excluded.last_seen), excluded.last_seen)
|
||||||
|
`),
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -309,6 +363,20 @@ export function executeQueuedWrite(write: QueuedWrite, stmts: TrackerPreparedSta
|
|||||||
);
|
);
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
if (write.kind === 'word') {
|
||||||
|
stmts.wordUpsertStmt.run(
|
||||||
|
write.headword,
|
||||||
|
write.word,
|
||||||
|
write.reading,
|
||||||
|
write.firstSeen,
|
||||||
|
write.lastSeen,
|
||||||
|
);
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
if (write.kind === 'kanji') {
|
||||||
|
stmts.kanjiUpsertStmt.run(write.kanji, write.firstSeen, write.lastSeen);
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
stmts.eventInsertStmt.run(
|
stmts.eventInsertStmt.run(
|
||||||
write.sessionId,
|
write.sessionId,
|
||||||
|
|||||||
@@ -1,4 +1,4 @@
|
|||||||
export const SCHEMA_VERSION = 2;
|
export const SCHEMA_VERSION = 3;
|
||||||
export const DEFAULT_QUEUE_CAP = 1_000;
|
export const DEFAULT_QUEUE_CAP = 1_000;
|
||||||
export const DEFAULT_BATCH_SIZE = 25;
|
export const DEFAULT_BATCH_SIZE = 25;
|
||||||
export const DEFAULT_FLUSH_INTERVAL_MS = 500;
|
export const DEFAULT_FLUSH_INTERVAL_MS = 500;
|
||||||
@@ -74,8 +74,8 @@ export interface SessionState extends TelemetryAccumulator {
|
|||||||
pendingTelemetry: boolean;
|
pendingTelemetry: boolean;
|
||||||
}
|
}
|
||||||
|
|
||||||
export interface QueuedWrite {
|
interface QueuedTelemetryWrite {
|
||||||
kind: 'telemetry' | 'event';
|
kind: 'telemetry';
|
||||||
sessionId: number;
|
sessionId: number;
|
||||||
sampleMs?: number;
|
sampleMs?: number;
|
||||||
totalWatchedMs?: number;
|
totalWatchedMs?: number;
|
||||||
@@ -100,6 +100,37 @@ export interface QueuedWrite {
|
|||||||
payloadJson?: string | null;
|
payloadJson?: string | null;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Queued write describing one session event. `kind: 'event'` is the
 * discriminant within the `QueuedWrite` union.
 */
interface QueuedEventWrite {
|
||||||
|
kind: 'event';
|
||||||
|
/** Id of the session the event belongs to. */
sessionId: number;
|
||||||
|
/** Sample time in milliseconds; the line-ingestion path passes `Date.now()`. */
sampleMs?: number;
|
||||||
|
eventType?: number;
|
||||||
|
/** Index of the line within the session; the line-ingestion path passes the session's running line counter. */
lineIndex?: number | null;
|
||||||
|
/** Segment start in milliseconds (converted from seconds by the line-ingestion path). */
segmentStartMs?: number | null;
|
||||||
|
/** Segment end in milliseconds (converted from seconds by the line-ingestion path). */
segmentEndMs?: number | null;
|
||||||
|
wordsDelta?: number;
|
||||||
|
cardsDelta?: number;
|
||||||
|
payloadJson?: string | null;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
 * Queued upsert of one word into `imm_words`. `kind: 'word'` is the
 * discriminant within the `QueuedWrite` union. `executeQueuedWrite` binds the
 * fields to `wordUpsertStmt` in the order headword, word, reading, firstSeen,
 * lastSeen.
 */
interface QueuedWordWrite {
|
||||||
|
kind: 'word';
|
||||||
|
/** Together with `word` and `reading`, forms the unique key of the row. */
headword: string;
|
||||||
|
word: string;
|
||||||
|
reading: string;
|
||||||
|
/** Unix timestamp in seconds; on conflict the stored `first_seen` keeps the smaller value. */
firstSeen: number;
|
||||||
|
/** Unix timestamp in seconds; on conflict the stored `last_seen` keeps the larger value. */
lastSeen: number;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
 * Queued upsert of one kanji into `imm_kanji`. `kind: 'kanji'` is the
 * discriminant within the `QueuedWrite` union. `executeQueuedWrite` binds the
 * fields to `kanjiUpsertStmt` in the order kanji, firstSeen, lastSeen.
 */
interface QueuedKanjiWrite {
|
||||||
|
kind: 'kanji';
|
||||||
|
/** The character itself; it is the unique key of the row. */
kanji: string;
|
||||||
|
/** Unix timestamp in seconds; on conflict the stored `first_seen` keeps the smaller value. */
firstSeen: number;
|
||||||
|
/** Unix timestamp in seconds; on conflict the stored `last_seen` keeps the larger value. */
lastSeen: number;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
 * Any write that can sit in the tracker's write queue. Discriminated by
 * `kind`, so consumers narrow with `write.kind === '…'` or
 * `Extract<QueuedWrite, { kind: '…' }>` before reading variant-specific fields.
 */
export type QueuedWrite = QueuedTelemetryWrite | QueuedEventWrite | QueuedWordWrite | QueuedKanjiWrite;
|
||||||
|
|
||||||
export interface VideoMetadata {
|
export interface VideoMetadata {
|
||||||
sourceType: number;
|
sourceType: number;
|
||||||
canonicalTitle: string;
|
canonicalTitle: string;
|
||||||
|
|||||||
Reference in New Issue
Block a user