/*
 * Mirror of https://github.com/ksyasuda/SubMiner.git
 * Synced 2026-03-27 06:12:05 -07:00.
 *
 * Latest change notes:
 *   - Track follow-up cleanup work in Backlog.md
 *   - Replace Date.now usage with the shared nowMs helper
 *   - Add launcher args/parser and core regression tests
 *
 * 548 lines, 16 KiB, TypeScript.
 */
import { createHash } from 'node:crypto';
|
|
import type { DatabaseSync } from './sqlite';
|
|
import { buildCoverBlobReference, normalizeCoverBlobBytes } from './storage';
|
|
import { rebuildLifetimeSummariesInTransaction } from './lifetime';
|
|
import { rebuildRollupsInTransaction } from './maintenance';
|
|
import { nowMs } from './time';
|
|
import { PartOfSpeech, type MergedToken } from '../../../types';
|
|
import { shouldExcludeTokenFromVocabularyPersistence } from '../tokenizer/annotation-stage';
|
|
import { deriveStoredPartOfSpeech } from '../tokenizer/part-of-speech';
|
|
import {
|
|
cleanupUnusedCoverArtBlobHash,
|
|
deleteSessionsByIds,
|
|
findSharedCoverBlobHash,
|
|
getAffectedKanjiIdsForSessions,
|
|
getAffectedKanjiIdsForVideo,
|
|
getAffectedWordIdsForSessions,
|
|
getAffectedWordIdsForVideo,
|
|
refreshLexicalAggregates,
|
|
} from './query-shared.js';
|
|
|
|
/**
 * One `imm_words` row as loaded by the vocabulary cleanup pass.
 *
 * Field names mirror the column names selected in `cleanupVocabularyStats`.
 * Most columns are typed nullable because older rows may not have them
 * populated.
 */
type CleanupVocabularyRow = {
  /** Primary key of the word row. */
  id: number;
  /** Surface form as stored. */
  word: string;
  /** Dictionary form as stored. */
  headword: string;
  reading: string | null;
  part_of_speech: string | null;
  pos1: string | null;
  pos2: string | null;
  pos3: string | null;
  first_seen: number | null;
  last_seen: number | null;
  frequency: number | null;
};
|
|
|
|
/**
 * Normalized vocabulary metadata for a word row, either read back from the
 * stored columns or supplied by a legacy resolver.
 *
 * All string fields are trimmed; an absent value is represented as `''`.
 */
type ResolvedVocabularyPos = {
  headword: string;
  reading: string;
  /**
   * True when at least one part-of-speech column carried a value (always true
   * for resolver-supplied results).
   */
  hasPosMetadata: boolean;
  partOfSpeech: PartOfSpeech;
  pos1: string;
  pos2: string;
  pos3: string;
};
|
|
|
|
/** Optional hooks accepted by `cleanupVocabularyStats`. */
type CleanupVocabularyStatsOptions = {
  /**
   * Asynchronous resolver consulted for rows whose stored metadata looks
   * incomplete. Resolving to `null` means "nothing better available", in
   * which case the stored values are used as-is.
   */
  resolveLegacyPos?: (row: CleanupVocabularyRow) => Promise<{
    headword: string;
    reading: string;
    partOfSpeech: string;
    pos1: string;
    pos2: string;
    pos3: string;
  } | null>;
};
|
|
|
|
/**
 * Wraps stored vocabulary columns in a `MergedToken` so the row can be passed
 * to token-based filters.
 *
 * Only the lexical fields are meaningful: positions are zeroed, the reading is
 * left empty, and the known / N+1 flags are fixed to `false`.
 *
 * @param row Stored word columns; surface and headword fall back to each other
 *   when one of them is empty.
 * @returns A synthetic token describing the stored word.
 */
function toStoredWordToken(row: {
  word: string;
  headword: string;
  part_of_speech: string | null;
  pos1: string | null;
  pos2: string | null;
  pos3: string | null;
}): MergedToken {
  return {
    // `||` (not `??`) is deliberate: an empty string must fall through too.
    surface: row.word || row.headword || '',
    reading: '',
    headword: row.headword || row.word || '',
    startPos: 0,
    endPos: 0,
    partOfSpeech: deriveStoredPartOfSpeech({
      partOfSpeech: row.part_of_speech,
      pos1: row.pos1,
    }),
    pos1: row.pos1 ?? '',
    pos2: row.pos2 ?? '',
    pos3: row.pos3 ?? '',
    isMerged: true,
    isKnown: false,
    isNPlusOneTarget: false,
  };
}
|
|
|
|
/**
 * Normalizes a nullable text column to a trimmed string.
 *
 * @param value Raw column value.
 * @returns The trimmed string, or `''` when the value is not a string.
 */
function normalizePosField(value: string | null | undefined): string {
  // Runtime guard rather than `?.`: values come from untyped database rows.
  if (typeof value !== 'string') {
    return '';
  }
  return value.trim();
}
|
|
|
|
/**
 * Reads the vocabulary metadata already stored on a word row.
 *
 * @param row Word row to inspect.
 * @returns The normalized stored metadata, or `null` when headword, reading
 *   and every part-of-speech column are all empty.
 */
function resolveStoredVocabularyPos(row: CleanupVocabularyRow): ResolvedVocabularyPos | null {
  const headword = normalizePosField(row.headword);
  const reading = normalizePosField(row.reading);
  const partOfSpeechRaw = normalizePosField(row.part_of_speech);
  const pos1 = normalizePosField(row.pos1);
  const pos2 = normalizePosField(row.pos2);
  const pos3 = normalizePosField(row.pos3);

  const hasPosMetadata = Boolean(partOfSpeechRaw || pos1 || pos2 || pos3);
  if (!headword && !reading && !hasPosMetadata) {
    return null;
  }

  return {
    // Fall back to the surface form when no headword was stored.
    headword: headword || normalizePosField(row.word),
    reading,
    hasPosMetadata,
    partOfSpeech: deriveStoredPartOfSpeech({
      partOfSpeech: partOfSpeechRaw,
      pos1,
    }),
    pos1,
    pos2,
    pos3,
  };
}
|
|
|
|
/**
 * Reports whether resolved metadata carries usable part-of-speech data.
 *
 * @param pos Resolved metadata, or `null` when nothing was stored.
 * @returns `true` only when `hasPosMetadata` is set and at least one of
 *   `pos1`, `pos2`, `pos3` or `partOfSpeech` is non-empty.
 */
function hasStructuredPos(pos: ResolvedVocabularyPos | null): boolean {
  if (!pos?.hasPosMetadata) {
    return false;
  }
  return Boolean(pos.pos1 || pos.pos2 || pos.pos3 || pos.partOfSpeech);
}
|
|
|
|
/**
 * Decides whether a row's stored metadata should be re-resolved.
 *
 * A row needs repair when nothing is stored, when it has no structured
 * part-of-speech data, when reading or headword is empty, or when the headword
 * is identical to the trimmed surface form.
 *
 * @param row Word row being cleaned up (only `word` is read).
 * @param stored Metadata read back from the row, or `null` if none.
 * @returns `true` when the legacy resolver should be consulted.
 */
function needsLegacyVocabularyMetadataRepair(
  row: CleanupVocabularyRow,
  stored: ResolvedVocabularyPos | null,
): boolean {
  if (!stored || !hasStructuredPos(stored) || !stored.reading || !stored.headword) {
    return true;
  }

  // NOTE(review): presumably legacy rows copied the surface form into
  // headword, so an identical pair is treated as suspect; confirm intent.
  return stored.headword === normalizePosField(row.word);
}
|
|
|
|
/**
 * Checks whether resolved metadata differs from what the row currently stores.
 *
 * Headword and reading are always compared. The part-of-speech columns are
 * compared only when `next.hasPosMetadata` is set, so metadata-less results
 * never count as a POS change.
 *
 * @param row Word row as stored.
 * @param next Resolved metadata that would be written.
 * @returns `true` when writing `next` would change the row.
 */
function shouldUpdateStoredVocabularyPos(
  row: CleanupVocabularyRow,
  next: ResolvedVocabularyPos,
): boolean {
  if (normalizePosField(row.headword) !== next.headword) {
    return true;
  }
  if (normalizePosField(row.reading) !== next.reading) {
    return true;
  }
  if (!next.hasPosMetadata) {
    return false;
  }
  return (
    normalizePosField(row.part_of_speech) !== next.partOfSpeech ||
    normalizePosField(row.pos1) !== next.pos1 ||
    normalizePosField(row.pos2) !== next.pos2 ||
    normalizePosField(row.pos3) !== next.pos3
  );
}
|
|
|
|
/**
 * Picks the part of speech to keep when merging a row into a duplicate.
 *
 * The incoming value wins unless it is `PartOfSpeech.other` while the current
 * value is something more specific; in that case the current value is kept.
 *
 * @param current Part of speech stored on the surviving row.
 * @param incoming Resolved metadata of the row being merged in.
 * @returns The part-of-speech value to store on the surviving row.
 */
function chooseMergedPartOfSpeech(
  current: string | null | undefined,
  incoming: ResolvedVocabularyPos,
): string {
  const normalizedCurrent = normalizePosField(current);
  const currentIsSpecific = normalizedCurrent !== '' && normalizedCurrent !== PartOfSpeech.other;

  if (currentIsSpecific && incoming.partOfSpeech === PartOfSpeech.other) {
    return normalizedCurrent;
  }
  return incoming.partOfSpeech;
}
|
|
|
|
/**
 * Resolves the metadata to use for a row, consulting the legacy resolver only
 * when the stored metadata needs repair and a resolver was supplied.
 *
 * @param row Word row being cleaned up.
 * @param options Cleanup options; `resolveLegacyPos` is optional.
 * @returns The resolver's normalized result when it produced one (always
 *   flagged with `hasPosMetadata: true`), otherwise the stored metadata, which
 *   may be `null`.
 */
async function maybeResolveLegacyVocabularyPos(
  row: CleanupVocabularyRow,
  options: CleanupVocabularyStatsOptions,
): Promise<ResolvedVocabularyPos | null> {
  const stored = resolveStoredVocabularyPos(row);
  if (!needsLegacyVocabularyMetadataRepair(row, stored) || !options.resolveLegacyPos) {
    return stored;
  }

  const resolved = await options.resolveLegacyPos(row);
  if (!resolved) {
    // The resolver had nothing better; keep whatever was stored.
    return stored;
  }

  return {
    headword: normalizePosField(resolved.headword) || normalizePosField(row.word),
    reading: normalizePosField(resolved.reading),
    hasPosMetadata: true,
    partOfSpeech: deriveStoredPartOfSpeech({
      partOfSpeech: resolved.partOfSpeech,
      pos1: resolved.pos1,
    }),
    pos1: normalizePosField(resolved.pos1),
    pos2: normalizePosField(resolved.pos2),
    pos3: normalizePosField(resolved.pos3),
  };
}
|
|
|
|
/**
 * Repairs and prunes every row of `imm_words`.
 *
 * For each row the function:
 *  1. resolves its vocabulary metadata (optionally via
 *     `options.resolveLegacyPos`);
 *  2. if the resolved values differ from the stored ones, either merges the
 *     row into an existing row with the same (headword, word, reading), moving
 *     its line occurrences, adding its frequency and widening first/last seen,
 *     or updates the row in place;
 *  3. deletes the row when it has no part-of-speech data at all or when
 *     `shouldExcludeTokenFromVocabularyPersistence` rejects it.
 *
 * NOTE(review): the statements below are not wrapped in a transaction here;
 * confirm whether callers provide one.
 * NOTE(review): rows are read once up front, so `row.frequency`,
 * `row.first_seen` and `row.last_seen` may be stale for a row that received a
 * merge earlier in the same pass; verify whether that can matter.
 *
 * @param db Database handle.
 * @param options Optional legacy metadata resolver.
 * @returns Counters for the pass. A merged row counts as both `repaired` and
 *   `deleted`; `kept + deleted` equals `scanned`.
 */
export async function cleanupVocabularyStats(
  db: DatabaseSync,
  options: CleanupVocabularyStatsOptions = {},
): Promise<{ scanned: number; kept: number; deleted: number; repaired: number }> {
  type DuplicateWordRow = Omit<CleanupVocabularyRow, 'word' | 'headword' | 'reading'>;

  const rows = db
    .prepare(
      `SELECT id, word, headword, reading, part_of_speech, pos1, pos2, pos3, first_seen, last_seen, frequency
       FROM imm_words`,
    )
    .all() as CleanupVocabularyRow[];
  const findDuplicateStmt = db.prepare(
    `SELECT id, part_of_speech, pos1, pos2, pos3, first_seen, last_seen, frequency
     FROM imm_words
     WHERE headword = ? AND word = ? AND reading = ? AND id != ?`,
  );
  const deleteStmt = db.prepare('DELETE FROM imm_words WHERE id = ?');
  const updateStmt = db.prepare(
    `UPDATE imm_words
     SET headword = ?, reading = ?, part_of_speech = ?, pos1 = ?, pos2 = ?, pos3 = ?
     WHERE id = ?`,
  );
  const mergeWordStmt = db.prepare(
    `UPDATE imm_words
     SET
       frequency = COALESCE(frequency, 0) + ?,
       part_of_speech = ?,
       pos1 = ?,
       pos2 = ?,
       pos3 = ?,
       first_seen = MIN(COALESCE(first_seen, ?), ?),
       last_seen = MAX(COALESCE(last_seen, ?), ?)
     WHERE id = ?`,
  );
  const moveOccurrencesStmt = db.prepare(
    `INSERT INTO imm_word_line_occurrences (line_id, word_id, occurrence_count)
     SELECT line_id, ?, occurrence_count
     FROM imm_word_line_occurrences
     WHERE word_id = ?
     ON CONFLICT(line_id, word_id) DO UPDATE SET
       occurrence_count = imm_word_line_occurrences.occurrence_count + excluded.occurrence_count`,
  );
  const deleteOccurrencesStmt = db.prepare(
    'DELETE FROM imm_word_line_occurrences WHERE word_id = ?',
  );

  let kept = 0;
  let deleted = 0;
  let repaired = 0;

  for (const row of rows) {
    const resolvedPos = await maybeResolveLegacyVocabularyPos(row, options);

    if (resolvedPos && shouldUpdateStoredVocabularyPos(row, resolvedPos)) {
      // Writing the repaired values could collide with a row that already has
      // them; look that row up (excluding this row itself) before updating.
      const duplicate = findDuplicateStmt.get(
        resolvedPos.headword,
        row.word,
        resolvedPos.reading,
        row.id,
      ) as DuplicateWordRow | null | undefined;

      if (duplicate) {
        // Fold this row into the duplicate: occurrences first, then counters.
        moveOccurrencesStmt.run(duplicate.id, row.id);
        deleteOccurrencesStmt.run(row.id);

        // The same value feeds both the COALESCE fallback and the MIN/MAX
        // operand of each seen-timestamp expression.
        const firstSeen = row.first_seen ?? duplicate.first_seen ?? 0;
        const lastSeen = row.last_seen ?? duplicate.last_seen ?? 0;
        mergeWordStmt.run(
          row.frequency ?? 0,
          chooseMergedPartOfSpeech(duplicate.part_of_speech, resolvedPos),
          // Existing POS detail on the duplicate wins over the incoming one.
          normalizePosField(duplicate.pos1) || resolvedPos.pos1,
          normalizePosField(duplicate.pos2) || resolvedPos.pos2,
          normalizePosField(duplicate.pos3) || resolvedPos.pos3,
          firstSeen,
          firstSeen,
          lastSeen,
          lastSeen,
          duplicate.id,
        );
        deleteStmt.run(row.id);
        repaired += 1;
        deleted += 1;
        continue;
      }

      updateStmt.run(
        resolvedPos.headword,
        resolvedPos.reading,
        resolvedPos.partOfSpeech,
        resolvedPos.pos1,
        resolvedPos.pos2,
        resolvedPos.pos3,
        row.id,
      );
      repaired += 1;
    }

    // View of the row as it looks after any repair, used for the keep/delete
    // decision below.
    const effectiveRow = {
      ...row,
      headword: resolvedPos?.headword ?? row.headword,
      reading: resolvedPos?.reading ?? row.reading,
      part_of_speech: resolvedPos?.hasPosMetadata ? resolvedPos.partOfSpeech : row.part_of_speech,
      pos1: resolvedPos?.pos1 ?? row.pos1,
      pos2: resolvedPos?.pos2 ?? row.pos2,
      pos3: resolvedPos?.pos3 ?? row.pos3,
    };
    const missingPos =
      !normalizePosField(effectiveRow.part_of_speech) &&
      !normalizePosField(effectiveRow.pos1) &&
      !normalizePosField(effectiveRow.pos2) &&
      !normalizePosField(effectiveRow.pos3);

    if (
      missingPos ||
      shouldExcludeTokenFromVocabularyPersistence(toStoredWordToken(effectiveRow))
    ) {
      deleteStmt.run(row.id);
      deleted += 1;
      continue;
    }
    kept += 1;
  }

  return {
    scanned: rows.length,
    kept,
    deleted,
    repaired,
  };
}
|
|
|
|
/**
 * Inserts or updates the `imm_media_art` row for a video.
 *
 * The cover blob hash is chosen in this order:
 *  1. a hash returned by `findSharedCoverBlobHash`;
 *  2. otherwise the SHA-256 of the supplied bytes, which are then upserted
 *     into `imm_cover_art_blobs`;
 *  3. otherwise the hash already stored for this video, if any.
 *
 * When a hash is available the art row stores
 * `buildCoverBlobReference(hash)` in `cover_blob`; otherwise it stores the
 * normalized bytes themselves. If the hash changed,
 * `cleanupUnusedCoverArtBlobHash` is called with the previous hash.
 *
 * @param db Database handle.
 * @param videoId Video the art belongs to.
 * @param art Cover art metadata and optional image bytes.
 */
export function upsertCoverArt(
  db: DatabaseSync,
  videoId: number,
  art: {
    anilistId: number | null;
    coverUrl: string | null;
    coverBlob: ArrayBuffer | Uint8Array | Buffer | null;
    titleRomaji: string | null;
    titleEnglish: string | null;
    episodesTotal: number | null;
  },
): void {
  const existing = db
    .prepare(
      `
      SELECT cover_blob_hash AS coverBlobHash
      FROM imm_media_art
      WHERE video_id = ?
      `,
    )
    .get(videoId) as { coverBlobHash: string | null } | undefined;
  const sharedCoverBlobHash = findSharedCoverBlobHash(db, videoId, art.anilistId, art.coverUrl);
  const fetchedAtMs = toDbMs(nowMs());
  const coverBlob = normalizeCoverBlobBytes(art.coverBlob);
  const hasCoverBytes = Boolean(coverBlob && coverBlob.length > 0);

  let coverBlobHash: string | null;
  if (sharedCoverBlobHash) {
    // A shared blob was found; reuse it and store no new bytes.
    coverBlobHash = sharedCoverBlobHash;
  } else if (coverBlob && hasCoverBytes) {
    coverBlobHash = createHash('sha256').update(coverBlob).digest('hex');
    db.prepare(
      `
      INSERT INTO imm_cover_art_blobs (blob_hash, cover_blob, CREATED_DATE, LAST_UPDATE_DATE)
      VALUES (?, ?, ?, ?)
      ON CONFLICT(blob_hash) DO UPDATE SET
        LAST_UPDATE_DATE = excluded.LAST_UPDATE_DATE
      `,
    ).run(coverBlobHash, coverBlob, fetchedAtMs, fetchedAtMs);
  } else {
    // No new bytes: keep pointing at whatever this video already had.
    coverBlobHash = existing?.coverBlobHash ?? null;
  }

  db.prepare(
    `
    INSERT INTO imm_media_art (
      video_id, anilist_id, cover_url, cover_blob, cover_blob_hash,
      title_romaji, title_english, episodes_total,
      fetched_at_ms, CREATED_DATE, LAST_UPDATE_DATE
    ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
    ON CONFLICT(video_id) DO UPDATE SET
      anilist_id = excluded.anilist_id,
      cover_url = excluded.cover_url,
      cover_blob = excluded.cover_blob,
      cover_blob_hash = excluded.cover_blob_hash,
      title_romaji = excluded.title_romaji,
      title_english = excluded.title_english,
      episodes_total = excluded.episodes_total,
      fetched_at_ms = excluded.fetched_at_ms,
      LAST_UPDATE_DATE = excluded.LAST_UPDATE_DATE
    `,
  ).run(
    videoId,
    art.anilistId,
    art.coverUrl,
    coverBlobHash ? buildCoverBlobReference(coverBlobHash) : coverBlob,
    coverBlobHash,
    art.titleRomaji,
    art.titleEnglish,
    art.episodesTotal,
    fetchedAtMs,
    fetchedAtMs,
    fetchedAtMs,
  );

  if (existing?.coverBlobHash !== coverBlobHash) {
    cleanupUnusedCoverArtBlobHash(db, existing?.coverBlobHash ?? null);
  }
}
|
|
|
|
/**
 * Copies AniList metadata onto the `imm_anime` row linked to a video.
 *
 * Does nothing when the video does not exist or has no `anime_id`. Each
 * column is updated through `COALESCE(?, column)`, so a `null` field in
 * `info` leaves the stored value untouched.
 *
 * @param db Database handle.
 * @param videoId Video whose linked anime row should be updated.
 * @param info AniList metadata to apply.
 */
export function updateAnimeAnilistInfo(
  db: DatabaseSync,
  videoId: number,
  info: {
    anilistId: number;
    titleRomaji: string | null;
    titleEnglish: string | null;
    titleNative: string | null;
    episodesTotal: number | null;
  },
): void {
  const video = db.prepare('SELECT anime_id FROM imm_videos WHERE video_id = ?').get(videoId) as {
    anime_id: number | null;
  } | null;
  const animeId = video?.anime_id;
  if (!animeId) return;

  db.prepare(
    `
    UPDATE imm_anime
    SET
      anilist_id = COALESCE(?, anilist_id),
      title_romaji = COALESCE(?, title_romaji),
      title_english = COALESCE(?, title_english),
      title_native = COALESCE(?, title_native),
      episodes_total = COALESCE(?, episodes_total),
      LAST_UPDATE_DATE = ?
    WHERE anime_id = ?
    `,
  ).run(
    info.anilistId,
    info.titleRomaji,
    info.titleEnglish,
    info.titleNative,
    info.episodesTotal,
    toDbMs(nowMs()),
    animeId,
  );
}
|
|
|
|
/**
 * Sets the `watched` flag of a video and refreshes its `LAST_UPDATE_DATE`.
 *
 * @param db Database handle.
 * @param videoId Video to update.
 * @param watched New watched state; stored as `1` or `0`.
 */
export function markVideoWatched(db: DatabaseSync, videoId: number, watched: boolean): void {
  const watchedFlag = watched ? 1 : 0;
  const updatedAtMs = toDbMs(nowMs());
  db.prepare('UPDATE imm_videos SET watched = ?, LAST_UPDATE_DATE = ? WHERE video_id = ?').run(
    watchedFlag,
    updatedAtMs,
    videoId,
  );
}
|
|
|
|
/**
 * Looks up the stored duration of a video.
 *
 * @param db Database handle.
 * @param videoId Video to look up.
 * @returns The `duration_ms` column, or `0` when the video is missing or the
 *   column is null.
 */
export function getVideoDurationMs(db: DatabaseSync, videoId: number): number {
  const video = db.prepare('SELECT duration_ms FROM imm_videos WHERE video_id = ?').get(videoId) as {
    duration_ms: number;
  } | null;
  return video?.duration_ms ?? 0;
}
|
|
|
|
/**
 * Reports whether a video is flagged as watched.
 *
 * @param db Database handle.
 * @param videoId Video to look up.
 * @returns `true` only when the row exists and its `watched` column is `1`.
 */
export function isVideoWatched(db: DatabaseSync, videoId: number): boolean {
  const video = db.prepare('SELECT watched FROM imm_videos WHERE video_id = ?').get(videoId) as {
    watched: number;
  } | null;
  return video?.watched === 1;
}
|
|
|
|
/**
 * Deletes a single session and rebuilds the aggregates derived from it.
 *
 * Thin wrapper over `deleteSessions`, which performs the identical
 * transactional sequence for any number of session ids.
 *
 * @param db Database handle.
 * @param sessionId Session to delete.
 * @throws Rethrows any error raised while deleting or rebuilding.
 */
export function deleteSession(db: DatabaseSync, sessionId: number): void {
  deleteSessions(db, [sessionId]);
}
|
|
|
|
/**
 * Deletes a batch of sessions and rebuilds the aggregates derived from them.
 *
 * The affected word and kanji ids are collected before the transaction
 * starts. Inside one `BEGIN IMMEDIATE` transaction the sessions are deleted,
 * lexical aggregates refreshed, and lifetime summaries and rollups rebuilt.
 * On failure the transaction is rolled back and the error rethrown.
 *
 * @param db Database handle.
 * @param sessionIds Sessions to delete; an empty array is a no-op.
 * @throws Rethrows any error raised while deleting or rebuilding.
 */
export function deleteSessions(db: DatabaseSync, sessionIds: number[]): void {
  if (sessionIds.length === 0) return;

  const affectedWordIds = getAffectedWordIdsForSessions(db, sessionIds);
  const affectedKanjiIds = getAffectedKanjiIdsForSessions(db, sessionIds);

  db.exec('BEGIN IMMEDIATE');
  try {
    deleteSessionsByIds(db, sessionIds);
    refreshLexicalAggregates(db, affectedWordIds, affectedKanjiIds);
    rebuildLifetimeSummariesInTransaction(db);
    rebuildRollupsInTransaction(db);
    db.exec('COMMIT');
  } catch (error) {
    try {
      db.exec('ROLLBACK');
    } catch {
      // SQLite may already have rolled the transaction back on its own, in
      // which case ROLLBACK itself fails; the original error is what matters.
    }
    throw error;
  }
}
|
|
|
|
/**
 * Deletes a video together with its dependent rows.
 *
 * Before the transaction starts, the video's cover blob hash, affected word
 * and kanji ids, and session ids are collected. Inside one `BEGIN IMMEDIATE`
 * transaction the function removes the sessions, subtitle lines, daily and
 * monthly rollups, media art (then calls `cleanupUnusedCoverArtBlobHash` with
 * the old hash) and the video row itself, and finally refreshes lexical
 * aggregates and rebuilds lifetime summaries and rollups. On failure the
 * transaction is rolled back and the error rethrown.
 *
 * @param db Database handle.
 * @param videoId Video to delete.
 * @throws Rethrows any error raised while deleting or rebuilding.
 */
export function deleteVideo(db: DatabaseSync, videoId: number): void {
  const artRow = db
    .prepare(
      `
      SELECT cover_blob_hash AS coverBlobHash
      FROM imm_media_art
      WHERE video_id = ?
      `,
    )
    .get(videoId) as { coverBlobHash: string | null } | undefined;
  const affectedWordIds = getAffectedWordIdsForVideo(db, videoId);
  const affectedKanjiIds = getAffectedKanjiIdsForVideo(db, videoId);
  const sessions = db
    .prepare('SELECT session_id FROM imm_sessions WHERE video_id = ?')
    .all(videoId) as Array<{ session_id: number }>;

  db.exec('BEGIN IMMEDIATE');
  try {
    deleteSessionsByIds(
      db,
      sessions.map((session) => session.session_id),
    );
    db.prepare('DELETE FROM imm_subtitle_lines WHERE video_id = ?').run(videoId);
    db.prepare('DELETE FROM imm_daily_rollups WHERE video_id = ?').run(videoId);
    db.prepare('DELETE FROM imm_monthly_rollups WHERE video_id = ?').run(videoId);
    db.prepare('DELETE FROM imm_media_art WHERE video_id = ?').run(videoId);
    // Runs after the art row is gone so this video no longer references it.
    cleanupUnusedCoverArtBlobHash(db, artRow?.coverBlobHash ?? null);
    db.prepare('DELETE FROM imm_videos WHERE video_id = ?').run(videoId);
    refreshLexicalAggregates(db, affectedWordIds, affectedKanjiIds);
    rebuildLifetimeSummariesInTransaction(db);
    rebuildRollupsInTransaction(db);
    db.exec('COMMIT');
  } catch (error) {
    try {
      db.exec('ROLLBACK');
    } catch {
      // SQLite may already have rolled the transaction back on its own, in
      // which case ROLLBACK itself fails; the original error is what matters.
    }
    throw error;
  }
}
|
|
/**
 * Converts a millisecond timestamp to the `bigint` form bound into SQL.
 *
 * @param ms Timestamp as a number (fractional part is truncated toward zero)
 *   or as a bigint (returned unchanged, so values beyond
 *   `Number.MAX_SAFE_INTEGER` keep full precision).
 * @returns The timestamp as a bigint.
 * @throws RangeError when `ms` is `NaN` or infinite.
 */
function toDbMs(ms: number | bigint): bigint {
  if (typeof ms === 'bigint') {
    // Already exact; a round trip through Number would lose precision.
    return ms;
  }
  return BigInt(Math.trunc(ms));
}
|