Files
SubMiner/src/core/services/immersion-tracker/query-maintenance.ts
sudacode 854179b9c1 Add backlog tasks and launcher time helper tests
- Track follow-up cleanup work in Backlog.md
- Replace Date.now usage with shared nowMs helper
- Add launcher args/parser and core regression tests
2026-03-27 02:01:36 -07:00

548 lines
16 KiB
TypeScript

import { createHash } from 'node:crypto';
import type { DatabaseSync } from './sqlite';
import { buildCoverBlobReference, normalizeCoverBlobBytes } from './storage';
import { rebuildLifetimeSummariesInTransaction } from './lifetime';
import { rebuildRollupsInTransaction } from './maintenance';
import { nowMs } from './time';
import { PartOfSpeech, type MergedToken } from '../../../types';
import { shouldExcludeTokenFromVocabularyPersistence } from '../tokenizer/annotation-stage';
import { deriveStoredPartOfSpeech } from '../tokenizer/part-of-speech';
import {
cleanupUnusedCoverArtBlobHash,
deleteSessionsByIds,
findSharedCoverBlobHash,
getAffectedKanjiIdsForSessions,
getAffectedKanjiIdsForVideo,
getAffectedWordIdsForSessions,
getAffectedWordIdsForVideo,
refreshLexicalAggregates,
} from './query-shared.js';
// Raw row shape returned by the cleanup scan over `imm_words`.
// Nullable columns reflect legacy rows that may predate the structured
// part-of-speech schema (pos1..pos3) or that were inserted without timestamps.
type CleanupVocabularyRow = {
  id: number;
  word: string; // surface form as originally tokenized
  headword: string; // dictionary/lemma form; may equal `word` on legacy rows
  reading: string | null;
  part_of_speech: string | null;
  pos1: string | null;
  pos2: string | null;
  pos3: string | null;
  first_seen: number | null; // epoch ms, null on legacy rows
  last_seen: number | null; // epoch ms, null on legacy rows
  frequency: number | null;
};
// Normalized (trimmed, non-null) metadata for a vocabulary row after the
// stored columns and/or the legacy resolver have been consulted.
type ResolvedVocabularyPos = {
  headword: string;
  reading: string;
  // True when at least one POS column carried a non-empty value; used to decide
  // whether POS fields should be written back during repair.
  hasPosMetadata: boolean;
  partOfSpeech: PartOfSpeech;
  pos1: string;
  pos2: string;
  pos3: string;
};
// Options for cleanupVocabularyStats.
// `resolveLegacyPos` is an optional async hook that re-derives metadata for a
// row whose stored POS data is missing or incomplete; returning null means
// "no better data available" and the stored values are kept.
type CleanupVocabularyStatsOptions = {
  resolveLegacyPos?: (row: CleanupVocabularyRow) => Promise<{
    headword: string;
    reading: string;
    partOfSpeech: string;
    pos1: string;
    pos2: string;
    pos3: string;
  } | null>;
};
/**
 * Adapt stored vocabulary columns into a minimal MergedToken so the shared
 * tokenizer exclusion filter can evaluate a DB row.
 *
 * Position fields and reading are not persisted, so they are zeroed/blank;
 * `word` and `headword` fall back to each other when one is empty.
 */
function toStoredWordToken(row: {
  word: string;
  headword: string;
  part_of_speech: string | null;
  pos1: string | null;
  pos2: string | null;
  pos3: string | null;
}): MergedToken {
  const surface = row.word || row.headword || '';
  const headword = row.headword || row.word || '';
  const partOfSpeech = deriveStoredPartOfSpeech({
    partOfSpeech: row.part_of_speech,
    pos1: row.pos1,
  });
  return {
    surface,
    reading: '',
    headword,
    startPos: 0,
    endPos: 0,
    partOfSpeech,
    pos1: row.pos1 ?? '',
    pos2: row.pos2 ?? '',
    pos3: row.pos3 ?? '',
    isMerged: true,
    isKnown: false,
    isNPlusOneTarget: false,
  };
}
/**
 * Coerce a possibly-missing POS column value to a trimmed string.
 * Non-string input (null/undefined) becomes the empty string.
 */
function normalizePosField(value: string | null | undefined): string {
  if (typeof value !== 'string') {
    return '';
  }
  return value.trim();
}
/**
 * Normalize the POS metadata already stored on a vocabulary row.
 *
 * Returns null when every relevant column is empty (nothing stored at all);
 * otherwise returns trimmed values, falling back to the surface `word` when
 * the headword column is blank.
 */
function resolveStoredVocabularyPos(row: CleanupVocabularyRow): ResolvedVocabularyPos | null {
  const headword = normalizePosField(row.headword);
  const reading = normalizePosField(row.reading);
  const partOfSpeechRaw = normalizePosField(row.part_of_speech);
  const pos1 = normalizePosField(row.pos1);
  const pos2 = normalizePosField(row.pos2);
  const pos3 = normalizePosField(row.pos3);
  const hasAnything = [headword, reading, partOfSpeechRaw, pos1, pos2, pos3].some(Boolean);
  if (!hasAnything) {
    return null;
  }
  const partOfSpeech = deriveStoredPartOfSpeech({
    partOfSpeech: partOfSpeechRaw,
    pos1,
  });
  return {
    headword: headword || normalizePosField(row.word),
    reading,
    hasPosMetadata: Boolean(partOfSpeechRaw || pos1 || pos2 || pos3),
    partOfSpeech,
    pos1,
    pos2,
    pos3,
  };
}
/** True when a resolved entry carries at least one non-empty structured POS field. */
function hasStructuredPos(pos: ResolvedVocabularyPos | null): boolean {
  if (!pos || !pos.hasPosMetadata) {
    return false;
  }
  return Boolean(pos.pos1 || pos.pos2 || pos.pos3 || pos.partOfSpeech);
}
/**
 * Decide whether a row's stored metadata is incomplete enough to warrant
 * asking the legacy resolver for better data.
 *
 * Repair is needed when nothing is stored, the structured POS fields are all
 * empty, the reading or headword is blank, or the headword merely echoes the
 * raw surface form (suggesting the lemma was never resolved).
 */
function needsLegacyVocabularyMetadataRepair(
  row: CleanupVocabularyRow,
  stored: ResolvedVocabularyPos | null,
): boolean {
  if (!stored || !hasStructuredPos(stored)) {
    return true;
  }
  if (!stored.reading || !stored.headword) {
    return true;
  }
  return stored.headword === normalizePosField(row.word);
}
/**
 * True when writing `next` back to the row would actually change something.
 * POS columns are only compared when `next` carries structured metadata, so a
 * metadata-free resolution never clobbers existing POS values.
 */
function shouldUpdateStoredVocabularyPos(
  row: CleanupVocabularyRow,
  next: ResolvedVocabularyPos,
): boolean {
  if (normalizePosField(row.headword) !== next.headword) {
    return true;
  }
  if (normalizePosField(row.reading) !== next.reading) {
    return true;
  }
  if (!next.hasPosMetadata) {
    return false;
  }
  return (
    normalizePosField(row.part_of_speech) !== next.partOfSpeech ||
    normalizePosField(row.pos1) !== next.pos1 ||
    normalizePosField(row.pos2) !== next.pos2 ||
    normalizePosField(row.pos3) !== next.pos3
  );
}
/**
 * Pick the part-of-speech to keep when merging a duplicate row:
 * a specific existing value beats an incoming generic `other`; in every other
 * case the incoming value wins.
 */
function chooseMergedPartOfSpeech(
  current: string | null | undefined,
  incoming: ResolvedVocabularyPos,
): string {
  const existing = normalizePosField(current);
  const existingIsSpecific = Boolean(existing) && existing !== PartOfSpeech.other;
  const incomingIsGeneric = incoming.partOfSpeech === PartOfSpeech.other;
  if (existingIsSpecific && incomingIsGeneric) {
    return existing;
  }
  return incoming.partOfSpeech;
}
/**
 * Resolve the best-known metadata for a row.
 *
 * Starts from the stored columns; when they look incomplete and a legacy
 * resolver was provided, asks it for better data. A null resolver result
 * falls back to the stored values unchanged.
 */
async function maybeResolveLegacyVocabularyPos(
  row: CleanupVocabularyRow,
  options: CleanupVocabularyStatsOptions,
): Promise<ResolvedVocabularyPos | null> {
  const stored = resolveStoredVocabularyPos(row);
  const resolver = options.resolveLegacyPos;
  if (!resolver || !needsLegacyVocabularyMetadataRepair(row, stored)) {
    return stored;
  }
  const legacy = await resolver(row);
  if (!legacy) {
    return stored;
  }
  return {
    headword: normalizePosField(legacy.headword) || normalizePosField(row.word),
    reading: normalizePosField(legacy.reading),
    // Resolver output always counts as structured metadata.
    hasPosMetadata: true,
    partOfSpeech: deriveStoredPartOfSpeech({
      partOfSpeech: legacy.partOfSpeech,
      pos1: legacy.pos1,
    }),
    pos1: normalizePosField(legacy.pos1),
    pos2: normalizePosField(legacy.pos2),
    pos3: normalizePosField(legacy.pos3),
  };
}
/**
 * One-shot maintenance pass over the `imm_words` vocabulary table.
 *
 * For every stored word the pass:
 *  1. resolves the best-known headword/reading/POS metadata, optionally
 *     consulting `options.resolveLegacyPos` for rows with incomplete data;
 *  2. if the repaired identity (headword, word, reading) collides with another
 *     row, merges frequency, first/last-seen timestamps and per-line
 *     occurrences into the surviving row and deletes this one;
 *  3. otherwise writes the repaired metadata in place;
 *  4. deletes rows that still carry no POS metadata at all, or that the shared
 *     tokenizer filter says should never have been persisted.
 *
 * NOTE(review): statements run individually, not inside one transaction —
 * confirm callers wrap this in a transaction if atomicity matters.
 *
 * @param db open database handle
 * @param options optional async resolver for legacy rows
 * @returns counts of rows scanned / kept / deleted / repaired
 */
export async function cleanupVocabularyStats(
  db: DatabaseSync,
  options: CleanupVocabularyStatsOptions = {},
): Promise<{ scanned: number; kept: number; deleted: number; repaired: number }> {
  const rows = db
    .prepare(
      `SELECT id, word, headword, reading, part_of_speech, pos1, pos2, pos3, first_seen, last_seen, frequency
       FROM imm_words`,
    )
    .all() as CleanupVocabularyRow[];
  // Finds a different row matching the repaired identity.
  // NOTE(review): `reading = ?` never matches rows whose reading is SQL NULL
  // (NULL <> '') — confirm legacy rows store '' rather than NULL.
  const findDuplicateStmt = db.prepare(
    `SELECT id, part_of_speech, pos1, pos2, pos3, first_seen, last_seen, frequency
     FROM imm_words
     WHERE headword = ? AND word = ? AND reading = ? AND id != ?`,
  );
  const deleteStmt = db.prepare('DELETE FROM imm_words WHERE id = ?');
  // In-place metadata repair for rows without a duplicate partner.
  const updateStmt = db.prepare(
    `UPDATE imm_words
     SET headword = ?, reading = ?, part_of_speech = ?, pos1 = ?, pos2 = ?, pos3 = ?
     WHERE id = ?`,
  );
  // Folds a duplicate row's counters into the surviving row: frequency adds,
  // first_seen takes the MIN and last_seen the MAX, with COALESCE supplying
  // the incoming value when the surviving column is NULL.
  const mergeWordStmt = db.prepare(
    `UPDATE imm_words
     SET
       frequency = COALESCE(frequency, 0) + ?,
       part_of_speech = ?,
       pos1 = ?,
       pos2 = ?,
       pos3 = ?,
       first_seen = MIN(COALESCE(first_seen, ?), ?),
       last_seen = MAX(COALESCE(last_seen, ?), ?)
     WHERE id = ?`,
  );
  // Re-points line occurrences from the doomed row to the survivor, summing
  // counts when the survivor already has an occurrence on the same line.
  const moveOccurrencesStmt = db.prepare(
    `INSERT INTO imm_word_line_occurrences (line_id, word_id, occurrence_count)
     SELECT line_id, ?, occurrence_count
     FROM imm_word_line_occurrences
     WHERE word_id = ?
     ON CONFLICT(line_id, word_id) DO UPDATE SET
       occurrence_count = imm_word_line_occurrences.occurrence_count + excluded.occurrence_count`,
  );
  const deleteOccurrencesStmt = db.prepare(
    'DELETE FROM imm_word_line_occurrences WHERE word_id = ?',
  );
  let kept = 0;
  let deleted = 0;
  let repaired = 0;
  for (const row of rows) {
    // Best-known metadata (may consult the legacy resolver).
    const resolvedPos = await maybeResolveLegacyVocabularyPos(row, options);
    const shouldRepair = Boolean(resolvedPos && shouldUpdateStoredVocabularyPos(row, resolvedPos));
    if (resolvedPos && shouldRepair) {
      // The repaired identity may now collide with an already-correct row.
      const duplicate = findDuplicateStmt.get(
        resolvedPos.headword,
        row.word,
        resolvedPos.reading,
        row.id,
      ) as {
        id: number;
        part_of_speech: string | null;
        pos1: string | null;
        pos2: string | null;
        pos3: string | null;
        first_seen: number | null;
        last_seen: number | null;
        frequency: number | null;
      } | null;
      if (duplicate) {
        // Merge this row into the duplicate, then drop this row entirely.
        moveOccurrencesStmt.run(duplicate.id, row.id);
        deleteOccurrencesStmt.run(row.id);
        mergeWordStmt.run(
          row.frequency ?? 0,
          chooseMergedPartOfSpeech(duplicate.part_of_speech, resolvedPos),
          // Existing non-empty POS columns on the survivor win over incoming.
          normalizePosField(duplicate.pos1) || resolvedPos.pos1,
          normalizePosField(duplicate.pos2) || resolvedPos.pos2,
          normalizePosField(duplicate.pos3) || resolvedPos.pos3,
          // Same value bound twice: once as the COALESCE fallback, once as
          // the MIN/MAX comparand (see mergeWordStmt SQL above).
          row.first_seen ?? duplicate.first_seen ?? 0,
          row.first_seen ?? duplicate.first_seen ?? 0,
          row.last_seen ?? duplicate.last_seen ?? 0,
          row.last_seen ?? duplicate.last_seen ?? 0,
          duplicate.id,
        );
        deleteStmt.run(row.id);
        repaired += 1;
        deleted += 1;
        continue;
      }
      // No collision: repair the row in place.
      updateStmt.run(
        resolvedPos.headword,
        resolvedPos.reading,
        resolvedPos.partOfSpeech,
        resolvedPos.pos1,
        resolvedPos.pos2,
        resolvedPos.pos3,
        row.id,
      );
      repaired += 1;
    }
    // The row as it now stands on disk (post-repair), used for the keep/delete
    // decision below. POS columns only reflect resolvedPos when it actually
    // carried structured metadata.
    const effectiveRow = {
      ...row,
      headword: resolvedPos?.headword ?? row.headword,
      reading: resolvedPos?.reading ?? row.reading,
      part_of_speech: resolvedPos?.hasPosMetadata ? resolvedPos.partOfSpeech : row.part_of_speech,
      pos1: resolvedPos?.pos1 ?? row.pos1,
      pos2: resolvedPos?.pos2 ?? row.pos2,
      pos3: resolvedPos?.pos3 ?? row.pos3,
    };
    const missingPos =
      !normalizePosField(effectiveRow.part_of_speech) &&
      !normalizePosField(effectiveRow.pos1) &&
      !normalizePosField(effectiveRow.pos2) &&
      !normalizePosField(effectiveRow.pos3);
    // Drop rows with no POS data at all, and rows the tokenizer-side filter
    // would never have persisted in the first place.
    if (
      missingPos ||
      shouldExcludeTokenFromVocabularyPersistence(toStoredWordToken(effectiveRow))
    ) {
      deleteStmt.run(row.id);
      deleted += 1;
      continue;
    }
    kept += 1;
  }
  return {
    scanned: rows.length,
    kept,
    deleted,
    repaired,
  };
}
/**
 * Insert or refresh the cover-art record for a video.
 *
 * Cover image bytes are deduplicated through `imm_cover_art_blobs`: the
 * `imm_media_art.cover_blob` column stores either a hash reference (via
 * buildCoverBlobReference) or the raw bytes when no hash could be derived.
 * Hash resolution order:
 *  1. a blob already shared by another video with matching AniList id /
 *     cover URL (findSharedCoverBlobHash);
 *  2. sha-256 of the freshly supplied bytes;
 *  3. the hash previously stored for this video, when no new bytes arrived.
 * When the stored hash changes, the previously referenced blob is
 * garbage-collected if nothing else still points at it.
 */
export function upsertCoverArt(
  db: DatabaseSync,
  videoId: number,
  art: {
    anilistId: number | null;
    coverUrl: string | null;
    coverBlob: ArrayBuffer | Uint8Array | Buffer | null;
    titleRomaji: string | null;
    titleEnglish: string | null;
    episodesTotal: number | null;
  },
): void {
  // Previous hash for this video, needed for the cleanup check at the end.
  const existing = db
    .prepare(
      `
      SELECT cover_blob_hash AS coverBlobHash
      FROM imm_media_art
      WHERE video_id = ?
      `,
    )
    .get(videoId) as { coverBlobHash: string | null } | undefined;
  const sharedCoverBlobHash = findSharedCoverBlobHash(db, videoId, art.anilistId, art.coverUrl);
  const fetchedAtMs = toDbMs(nowMs());
  const coverBlob = normalizeCoverBlobBytes(art.coverBlob);
  // Hash cascade (see function comment): shared -> fresh bytes -> previous.
  let coverBlobHash = sharedCoverBlobHash ?? null;
  if (!coverBlobHash && coverBlob && coverBlob.length > 0) {
    coverBlobHash = createHash('sha256').update(coverBlob).digest('hex');
  }
  if (!coverBlobHash && (!coverBlob || coverBlob.length === 0)) {
    coverBlobHash = existing?.coverBlobHash ?? null;
  }
  // Persist the blob bytes only when they are new to the blob table; a shared
  // hash means another video already stored identical bytes.
  if (coverBlobHash && coverBlob && coverBlob.length > 0 && !sharedCoverBlobHash) {
    db.prepare(
      `
      INSERT INTO imm_cover_art_blobs (blob_hash, cover_blob, CREATED_DATE, LAST_UPDATE_DATE)
      VALUES (?, ?, ?, ?)
      ON CONFLICT(blob_hash) DO UPDATE SET
        LAST_UPDATE_DATE = excluded.LAST_UPDATE_DATE
      `,
    ).run(coverBlobHash, coverBlob, fetchedAtMs, fetchedAtMs);
  }
  db.prepare(
    `
    INSERT INTO imm_media_art (
      video_id, anilist_id, cover_url, cover_blob, cover_blob_hash,
      title_romaji, title_english, episodes_total,
      fetched_at_ms, CREATED_DATE, LAST_UPDATE_DATE
    ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
    ON CONFLICT(video_id) DO UPDATE SET
      anilist_id = excluded.anilist_id,
      cover_url = excluded.cover_url,
      cover_blob = excluded.cover_blob,
      cover_blob_hash = excluded.cover_blob_hash,
      title_romaji = excluded.title_romaji,
      title_english = excluded.title_english,
      episodes_total = excluded.episodes_total,
      fetched_at_ms = excluded.fetched_at_ms,
      LAST_UPDATE_DATE = excluded.LAST_UPDATE_DATE
    `,
  ).run(
    videoId,
    art.anilistId,
    art.coverUrl,
    // Store a hash reference when deduplicated; raw bytes (or null) otherwise.
    coverBlobHash ? buildCoverBlobReference(coverBlobHash) : coverBlob,
    coverBlobHash,
    art.titleRomaji,
    art.titleEnglish,
    art.episodesTotal,
    fetchedAtMs,
    fetchedAtMs,
    fetchedAtMs,
  );
  // If this video stopped referencing its old blob, reap it when unused.
  if (existing?.coverBlobHash !== coverBlobHash) {
    cleanupUnusedCoverArtBlobHash(db, existing?.coverBlobHash ?? null);
  }
}
/**
 * Merge AniList metadata into the anime row linked to the given video.
 *
 * No-op when the video has no associated anime. Each COALESCE keeps the
 * existing column value whenever the incoming field is null, so partial
 * AniList responses never erase stored data.
 */
export function updateAnimeAnilistInfo(
  db: DatabaseSync,
  videoId: number,
  info: {
    anilistId: number;
    titleRomaji: string | null;
    titleEnglish: string | null;
    titleNative: string | null;
    episodesTotal: number | null;
  },
): void {
  const lookup = db.prepare('SELECT anime_id FROM imm_videos WHERE video_id = ?').get(videoId) as {
    anime_id: number | null;
  } | null;
  const animeId = lookup?.anime_id;
  if (!animeId) {
    return;
  }
  const updateStmt = db.prepare(
    `
    UPDATE imm_anime
    SET
      anilist_id = COALESCE(?, anilist_id),
      title_romaji = COALESCE(?, title_romaji),
      title_english = COALESCE(?, title_english),
      title_native = COALESCE(?, title_native),
      episodes_total = COALESCE(?, episodes_total),
      LAST_UPDATE_DATE = ?
    WHERE anime_id = ?
    `,
  );
  updateStmt.run(
    info.anilistId,
    info.titleRomaji,
    info.titleEnglish,
    info.titleNative,
    info.episodesTotal,
    toDbMs(nowMs()),
    animeId,
  );
}
/** Set or clear a video's watched flag, bumping LAST_UPDATE_DATE. */
export function markVideoWatched(db: DatabaseSync, videoId: number, watched: boolean): void {
  const watchedFlag = watched ? 1 : 0;
  const stmt = db.prepare(
    'UPDATE imm_videos SET watched = ?, LAST_UPDATE_DATE = ? WHERE video_id = ?',
  );
  stmt.run(watchedFlag, toDbMs(nowMs()), videoId);
}
/** Duration of a video in milliseconds; 0 when the row is missing. */
export function getVideoDurationMs(db: DatabaseSync, videoId: number): number {
  const result = db
    .prepare('SELECT duration_ms FROM imm_videos WHERE video_id = ?')
    .get(videoId) as { duration_ms: number } | null;
  if (!result) {
    return 0;
  }
  return result.duration_ms ?? 0;
}
/** Whether the video row exists and its watched flag is exactly 1. */
export function isVideoWatched(db: DatabaseSync, videoId: number): boolean {
  const result = db
    .prepare('SELECT watched FROM imm_videos WHERE video_id = ?')
    .get(videoId) as { watched: number } | null;
  return Boolean(result && result.watched === 1);
}
export function deleteSession(db: DatabaseSync, sessionId: number): void {
const sessionIds = [sessionId];
const affectedWordIds = getAffectedWordIdsForSessions(db, sessionIds);
const affectedKanjiIds = getAffectedKanjiIdsForSessions(db, sessionIds);
db.exec('BEGIN IMMEDIATE');
try {
deleteSessionsByIds(db, sessionIds);
refreshLexicalAggregates(db, affectedWordIds, affectedKanjiIds);
rebuildLifetimeSummariesInTransaction(db);
rebuildRollupsInTransaction(db);
db.exec('COMMIT');
} catch (error) {
db.exec('ROLLBACK');
throw error;
}
}
/**
 * Delete a batch of immersion sessions and rebuild all derived aggregates
 * (lexical stats, lifetime summaries, rollups) inside one IMMEDIATE
 * transaction; any failure rolls everything back. No-op for an empty batch.
 */
export function deleteSessions(db: DatabaseSync, sessionIds: number[]): void {
  if (sessionIds.length === 0) {
    return;
  }
  // Snapshot affected lexical rows before the deletes sever the linkage.
  const wordIds = getAffectedWordIdsForSessions(db, sessionIds);
  const kanjiIds = getAffectedKanjiIdsForSessions(db, sessionIds);
  db.exec('BEGIN IMMEDIATE');
  try {
    deleteSessionsByIds(db, sessionIds);
    refreshLexicalAggregates(db, wordIds, kanjiIds);
    rebuildLifetimeSummariesInTransaction(db);
    rebuildRollupsInTransaction(db);
    db.exec('COMMIT');
  } catch (error) {
    db.exec('ROLLBACK');
    throw error;
  }
}
/**
 * Delete a video and everything hanging off it: its sessions, subtitle lines,
 * rollup rows, cover-art record (reaping the blob if now unused), and finally
 * the video row itself — then rebuild derived aggregates. All mutations run in
 * one IMMEDIATE transaction; any failure rolls the whole cascade back.
 */
export function deleteVideo(db: DatabaseSync, videoId: number): void {
  // Capture the cover blob hash before the media-art row is deleted.
  const artRow = db
    .prepare(
      `
      SELECT cover_blob_hash AS coverBlobHash
      FROM imm_media_art
      WHERE video_id = ?
      `,
    )
    .get(videoId) as { coverBlobHash: string | null } | undefined;
  // Snapshot affected lexical rows before the deletes sever the linkage.
  const affectedWordIds = getAffectedWordIdsForVideo(db, videoId);
  const affectedKanjiIds = getAffectedKanjiIdsForVideo(db, videoId);
  const sessions = db
    .prepare('SELECT session_id FROM imm_sessions WHERE video_id = ?')
    .all(videoId) as Array<{ session_id: number }>;
  db.exec('BEGIN IMMEDIATE');
  try {
    deleteSessionsByIds(
      db,
      sessions.map((session) => session.session_id),
    );
    db.prepare('DELETE FROM imm_subtitle_lines WHERE video_id = ?').run(videoId);
    db.prepare('DELETE FROM imm_daily_rollups WHERE video_id = ?').run(videoId);
    db.prepare('DELETE FROM imm_monthly_rollups WHERE video_id = ?').run(videoId);
    db.prepare('DELETE FROM imm_media_art WHERE video_id = ?').run(videoId);
    // Reap the blob this video referenced, if no other row still uses it.
    cleanupUnusedCoverArtBlobHash(db, artRow?.coverBlobHash ?? null);
    db.prepare('DELETE FROM imm_videos WHERE video_id = ?').run(videoId);
    refreshLexicalAggregates(db, affectedWordIds, affectedKanjiIds);
    rebuildLifetimeSummariesInTransaction(db);
    rebuildRollupsInTransaction(db);
    db.exec('COMMIT');
  } catch (error) {
    db.exec('ROLLBACK');
    throw error;
  }
}
/**
 * Normalize a millisecond timestamp to the BigInt form stored in SQLite
 * INTEGER columns.
 *
 * Fractional `number` input is truncated toward zero. `bigint` input is
 * returned unchanged — the previous implementation round-tripped it through
 * `Number()`, silently losing precision for magnitudes above 2^53 - 1.
 *
 * @param ms millisecond timestamp as number or bigint
 * @returns the timestamp as a bigint
 * @throws RangeError for non-finite numbers (NaN/Infinity), as BigInt()
 *   rejects them — unchanged from the original behavior, now documented.
 */
function toDbMs(ms: number | bigint): bigint {
  if (typeof ms === 'bigint') {
    return ms;
  }
  return BigInt(Math.trunc(ms));
}