From f005f542a3726f53d8d314a015b6f7d5efa77f18 Mon Sep 17 00:00:00 2001 From: sudacode Date: Sat, 14 Mar 2026 22:13:42 -0700 Subject: [PATCH] feat(immersion): add anime metadata, occurrence tracking, and schema upgrades - Add imm_anime table with AniList integration - Add imm_subtitle_lines, imm_word_line_occurrences, imm_kanji_line_occurrences - Add POS fields (part_of_speech, pos1, pos2, pos3) to imm_words - Add anime metadata parsing with guessit fallback - Add video duration tracking and watched status - Add episode, streak, trend, and word/kanji detail queries - Deduplicate subtitle line recording within sessions - Pass Anki note IDs through card mining callback chain --- .../services/anilist/anilist-updater.test.ts | 4 + src/core/services/anilist/anilist-updater.ts | 9 +- .../anilist/cover-art-fetcher.test.ts | 239 ++++ .../services/anilist/cover-art-fetcher.ts | 405 +++++++ .../immersion-tracker-service.test.ts | 331 +++++ .../services/immersion-tracker-service.ts | 445 ++++++- .../immersion-tracker/__tests__/query.test.ts | 976 +++++++++++++++ .../legacy-vocabulary-pos.ts | 71 ++ .../services/immersion-tracker/maintenance.ts | 72 +- .../immersion-tracker/metadata.test.ts | 78 +- .../services/immersion-tracker/metadata.ts | 74 +- src/core/services/immersion-tracker/query.ts | 1063 ++++++++++++++++- .../services/immersion-tracker/reducer.ts | 1 + .../immersion-tracker/storage-session.test.ts | 531 +++++++- .../services/immersion-tracker/storage.ts | 638 +++++++++- src/core/services/immersion-tracker/types.ts | 321 ++++- .../services/tokenizer/annotation-stage.ts | 11 + src/core/services/tokenizer/part-of-speech.ts | 56 + src/mecab-tokenizer.ts | 28 +- 19 files changed, 5231 insertions(+), 122 deletions(-) create mode 100644 src/core/services/anilist/cover-art-fetcher.test.ts create mode 100644 src/core/services/anilist/cover-art-fetcher.ts create mode 100644 src/core/services/immersion-tracker/__tests__/query.test.ts create mode 100644 
src/core/services/immersion-tracker/legacy-vocabulary-pos.ts create mode 100644 src/core/services/tokenizer/part-of-speech.ts diff --git a/src/core/services/anilist/anilist-updater.test.ts b/src/core/services/anilist/anilist-updater.test.ts index e42bcff..37c5c5a 100644 --- a/src/core/services/anilist/anilist-updater.test.ts +++ b/src/core/services/anilist/anilist-updater.test.ts @@ -16,6 +16,7 @@ test('guessAnilistMediaInfo uses guessit output when available', async () => { }); assert.deepEqual(result, { title: 'Guessit Title', + season: null, episode: 7, source: 'guessit', }); @@ -29,6 +30,7 @@ test('guessAnilistMediaInfo falls back to parser when guessit fails', async () = }); assert.deepEqual(result, { title: 'My Anime', + season: 1, episode: 3, source: 'fallback', }); @@ -52,6 +54,7 @@ test('guessAnilistMediaInfo uses basename for guessit input', async () => { ]); assert.deepEqual(result, { title: 'Rascal Does Not Dream of Bunny Girl Senpai', + season: null, episode: 1, source: 'guessit', }); @@ -67,6 +70,7 @@ test('guessAnilistMediaInfo joins multi-part guessit titles', async () => { }); assert.deepEqual(result, { title: 'Rascal Does not Dream of Bunny Girl Senpai', + season: null, episode: 1, source: 'guessit', }); diff --git a/src/core/services/anilist/anilist-updater.ts b/src/core/services/anilist/anilist-updater.ts index 849c5a6..601f041 100644 --- a/src/core/services/anilist/anilist-updater.ts +++ b/src/core/services/anilist/anilist-updater.ts @@ -7,6 +7,7 @@ const ANILIST_GRAPHQL_URL = 'https://graphql.anilist.co'; export interface AnilistMediaGuess { title: string; + season: number | null; episode: number | null; source: 'guessit' | 'fallback'; } @@ -56,7 +57,7 @@ interface AnilistSaveEntryData { }; } -function runGuessit(target: string): Promise { +export function runGuessit(target: string): Promise { return new Promise((resolve, reject) => { childProcess.execFile( 'guessit', @@ -73,7 +74,7 @@ function runGuessit(target: string): Promise { }); } -type 
GuessAnilistMediaInfoDeps = { +export interface GuessAnilistMediaInfoDeps { runGuessit: (target: string) => Promise; }; @@ -215,8 +216,9 @@ export async function guessAnilistMediaInfo( const parsed = JSON.parse(stdout) as Record; const title = readGuessitTitle(parsed.title); const episode = firstPositiveInteger(parsed.episode); + const season = firstPositiveInteger(parsed.season); if (title) { - return { title, episode, source: 'guessit' }; + return { title, season, episode, source: 'guessit' }; } } catch { // Ignore guessit failures and fall back to internal parser. @@ -230,6 +232,7 @@ export async function guessAnilistMediaInfo( } return { title: parsed.title.trim(), + season: parsed.season, episode: parsed.episode, source: 'fallback', }; diff --git a/src/core/services/anilist/cover-art-fetcher.test.ts b/src/core/services/anilist/cover-art-fetcher.test.ts new file mode 100644 index 0000000..5ab134d --- /dev/null +++ b/src/core/services/anilist/cover-art-fetcher.test.ts @@ -0,0 +1,239 @@ +import assert from 'node:assert/strict'; +import fs from 'node:fs'; +import os from 'node:os'; +import path from 'node:path'; +import test from 'node:test'; +import { createCoverArtFetcher, stripFilenameTags } from './cover-art-fetcher.js'; +import { Database } from '../immersion-tracker/sqlite.js'; +import { ensureSchema, getOrCreateVideoRecord } from '../immersion-tracker/storage.js'; +import { getCoverArt, upsertCoverArt } from '../immersion-tracker/query.js'; +import { SOURCE_TYPE_LOCAL } from '../immersion-tracker/types.js'; + +function makeDbPath(): string { + const dir = fs.mkdtempSync(path.join(os.tmpdir(), 'subminer-cover-art-test-')); + return path.join(dir, 'immersion.sqlite'); +} + +function cleanupDbPath(dbPath: string): void { + fs.rmSync(path.dirname(dbPath), { recursive: true, force: true }); +} + +test('stripFilenameTags normalizes common media-title formats', () => { + assert.equal( + stripFilenameTags('[Jellyfin/direct] The Eminence in Shadow S01E05 I Am...'), 
+ 'The Eminence in Shadow', + ); + assert.equal( + stripFilenameTags( + '[Foxtrot] Kono Subarashii Sekai ni Shukufuku wo! S2 - 05: Servitude for this Masked Knight!', + ), + 'Kono Subarashii Sekai ni Shukufuku wo!', + ); + assert.equal( + stripFilenameTags('Kono Subarashii Sekai ni Shukufuku wo! E03: A Panty Treasure'), + 'Kono Subarashii Sekai ni Shukufuku wo!', + ); + assert.equal( + stripFilenameTags( + 'Little Witch Academia (2017) - S01E05 - 005 - Pact of the Dragon [Bluray-1080p][10bit][h265][FLAC 2.0][JA]-FumeiRaws.mkv', + ), + 'Little Witch Academia', + ); +}); + +test('fetchIfMissing backfills a missing blob from an existing cover URL', async () => { + const dbPath = makeDbPath(); + const db = new Database(dbPath); + ensureSchema(db); + const videoId = getOrCreateVideoRecord(db, 'local:/tmp/cover-fetcher-test.mkv', { + canonicalTitle: 'Cover Fetcher Test', + sourcePath: '/tmp/cover-fetcher-test.mkv', + sourceUrl: null, + sourceType: SOURCE_TYPE_LOCAL, + }); + upsertCoverArt(db, videoId, { + anilistId: 7, + coverUrl: 'https://images.test/cover.jpg', + coverBlob: null, + titleRomaji: 'Test Title', + titleEnglish: 'Test Title', + episodesTotal: 12, + }); + + const fetchCalls: string[] = []; + const originalFetch = globalThis.fetch; + globalThis.fetch = (async (input: RequestInfo | URL) => { + const url = String(input); + fetchCalls.push(url); + assert.equal(url, 'https://images.test/cover.jpg'); + return new Response(new Uint8Array([1, 2, 3, 4]), { + status: 200, + headers: { 'Content-Type': 'image/jpeg' }, + }); + }) as typeof fetch; + + try { + const fetcher = createCoverArtFetcher( + { + acquire: async () => {}, + recordResponse: () => {}, + }, + console, + ); + + const fetched = await fetcher.fetchIfMissing( + db, + videoId, + '[Jellyfin] Little Witch Academia S02E05 - 025 - Pact of the Dragon (2020) [1080p].mkv', + ); + const stored = getCoverArt(db, videoId); + + assert.equal(fetched, true); + assert.equal(fetchCalls.length, 1); + 
assert.equal(stored?.coverBlob?.length, 4); + assert.equal(stored?.titleEnglish, 'Test Title'); + } finally { + globalThis.fetch = originalFetch; + db.close(); + cleanupDbPath(dbPath); + } +}); + +function createJsonResponse(payload: unknown): Response { + return new Response(JSON.stringify(payload), { + status: 200, + headers: { 'content-type': 'application/json' }, + }); +} + +test('fetchIfMissing uses guessit primary title and season when available', async () => { + const dbPath = makeDbPath(); + const db = new Database(dbPath); + ensureSchema(db); + const videoId = getOrCreateVideoRecord(db, 'local:/tmp/cover-fetcher-season-test.mkv', { + canonicalTitle: '[Jellyfin] Little Witch Academia S02E05 - 025 - Pact of the Dragon (2020) [1080p].mkv', + sourcePath: '/tmp/cover-fetcher-season-test.mkv', + sourceUrl: null, + sourceType: SOURCE_TYPE_LOCAL, + }); + + const searchCalls: Array<{ search: string }> = []; + const originalFetch = globalThis.fetch; + globalThis.fetch = ((input: RequestInfo | URL, init?: RequestInit) => { + const raw = (init?.body as string | undefined) ?? 
''; + const payload = JSON.parse(raw) as { variables: { search: string } }; + const search = payload.variables.search; + searchCalls.push({ search }); + + if (search.includes('Season 2')) { + return Promise.resolve(createJsonResponse({ data: { Page: { media: [] } } })); + } + + return Promise.resolve( + createJsonResponse({ + data: { + Page: { + media: [ + { + id: 19, + episodes: 24, + coverImage: { large: 'https://images.test/cover.jpg', medium: null }, + title: { romaji: 'Little Witch Academia', english: 'Little Witch Academia', native: null }, + }, + ], + }, + }, + }), + ); + }) as typeof fetch; + + try { + const fetcher = createCoverArtFetcher( + { + acquire: async () => {}, + recordResponse: () => {}, + }, + console, + { + runGuessit: async () => + JSON.stringify({ title: 'Little Witch Academia', season: 2, episode: 5 }), + }, + ); + + const fetched = await fetcher.fetchIfMissing(db, videoId, 'School Vlog S01E01'); + const stored = getCoverArt(db, videoId); + + assert.equal(fetched, true); + assert.equal(searchCalls.length, 2); + assert.equal(searchCalls[0]!.search, 'Little Witch Academia Season 2'); + assert.equal(stored?.anilistId, 19); + } finally { + globalThis.fetch = originalFetch; + db.close(); + cleanupDbPath(dbPath); + } +}); + +test('fetchIfMissing falls back to internal parser when guessit throws', async () => { + const dbPath = makeDbPath(); + const db = new Database(dbPath); + ensureSchema(db); + const videoId = getOrCreateVideoRecord(db, 'local:/tmp/cover-fetcher-fallback-test.mkv', { + canonicalTitle: 'School Vlog S01E01', + sourcePath: '/tmp/cover-fetcher-fallback-test.mkv', + sourceUrl: null, + sourceType: SOURCE_TYPE_LOCAL, + }); + + let requestCount = 0; + const originalFetch = globalThis.fetch; + globalThis.fetch = ((input: RequestInfo | URL, init?: RequestInit) => { + requestCount += 1; + const raw = (init?.body as string | undefined) ?? 
''; + const payload = JSON.parse(raw) as { variables: { search: string } }; + assert.equal(payload.variables.search, 'School Vlog'); + + return Promise.resolve( + createJsonResponse({ + data: { + Page: { + media: [ + { + id: 21, + episodes: 12, + coverImage: { large: 'https://images.test/fallback-cover.jpg', medium: null }, + title: { romaji: 'School Vlog', english: 'School Vlog', native: null }, + }, + ], + }, + }, + }), + ); + }) as typeof fetch; + + try { + const fetcher = createCoverArtFetcher( + { + acquire: async () => {}, + recordResponse: () => {}, + }, + console, + { + runGuessit: async () => { + throw new Error('guessit unavailable'); + }, + }, + ); + + const fetched = await fetcher.fetchIfMissing(db, videoId, 'Ignored Title'); + const stored = getCoverArt(db, videoId); + + assert.equal(fetched, true); + assert.equal(requestCount, 1); + assert.equal(stored?.anilistId, 21); + } finally { + globalThis.fetch = originalFetch; + db.close(); + cleanupDbPath(dbPath); + } +}); diff --git a/src/core/services/anilist/cover-art-fetcher.ts b/src/core/services/anilist/cover-art-fetcher.ts new file mode 100644 index 0000000..599dec7 --- /dev/null +++ b/src/core/services/anilist/cover-art-fetcher.ts @@ -0,0 +1,405 @@ +import type { AnilistRateLimiter } from './rate-limiter'; +import type { DatabaseSync } from '../immersion-tracker/sqlite'; +import { getCoverArt, upsertCoverArt, updateAnimeAnilistInfo } from '../immersion-tracker/query'; +import { guessAnilistMediaInfo, runGuessit, type GuessAnilistMediaInfoDeps } from './anilist-updater'; + +const ANILIST_GRAPHQL_URL = 'https://graphql.anilist.co'; +const NO_MATCH_RETRY_MS = 5 * 60 * 1000; + +const SEARCH_QUERY = ` +query ($search: String!) 
{ + Page(perPage: 5) { + media(search: $search, type: ANIME) { + id + episodes + season + seasonYear + coverImage { large medium } + title { romaji english native } + } + } +} +`; + +interface AnilistMedia { + id: number; + episodes: number | null; + season: string | null; + seasonYear: number | null; + coverImage: { large: string | null; medium: string | null } | null; + title: { romaji: string | null; english: string | null; native: string | null } | null; +} + +interface AnilistSearchResponse { + data?: { + Page?: { + media?: AnilistMedia[]; + }; + }; + errors?: Array<{ message?: string }>; +} + +export interface CoverArtFetcher { + fetchIfMissing(db: DatabaseSync, videoId: number, canonicalTitle: string): Promise; +} + +interface Logger { + info(msg: string, ...args: unknown[]): void; + warn(msg: string, ...args: unknown[]): void; + error(msg: string, ...args: unknown[]): void; +} + +interface CoverArtCandidate { + title: string; + source: 'guessit' | 'fallback'; + season: number | null; + episode: number | null; +} + +interface CoverArtFetcherOptions { + runGuessit?: GuessAnilistMediaInfoDeps['runGuessit']; +} + +export function stripFilenameTags(raw: string): string { + let title = raw.replace(/\.[A-Za-z0-9]{2,4}$/, ''); + + title = title.replace(/^(?:\s*\[[^\]]*\]\s*)+/, ''); + title = title.replace(/[._]+/g, ' '); + + // Remove everything from " - S##E##" or " - ###" onward (season/episode markers) + title = title.replace(/\s+-\s+S\d+E\d+.*$/i, ''); + title = title.replace(/\s+-\s+\d{2,}(\s+-\s+\d+)?(\s+-.+)?$/, ''); + title = title.replace(/\s+S\d+E\d+.*$/i, ''); + title = title.replace(/\s+S\d+\s*[- ]\s*\d+[: -].*$/i, ''); + title = title.replace(/\s+E\d+[: -].*$/i, ''); + title = title.replace(/^S\d+E\d+\s*[- ]\s*/i, ''); + + // Remove bracketed/parenthesized tags: [WEBDL-1080p], (2022), etc. 
+ title = title.replace(/\s*\[[^\]]*\]\s*/g, ' '); + title = title.replace(/\s*\([^)]*\d{4}[^)]*\)\s*/g, ' '); + + // Remove common codec/source tags that may appear without brackets + title = title.replace( + /\b(WEBDL|WEBRip|BluRay|BDRip|HDTV|DVDRip|x264|x265|H\.?264|H\.?265|AV1|AAC|FLAC|Opus|10bit|8bit|1080p|720p|480p|2160p|4K)\b[-.\w]*/gi, + '', + ); + + // Remove trailing dashes and group tags like "-Retr0" + title = title.replace(/\s*-\s*[\w]+$/, ''); + + return title.trim().replace(/\s{2,}/g, ' '); +} + +function removeSeasonHint(title: string): string { + return title.replace(/\bseason\s*\d+\b/gi, '').replace(/\s{2,}/g, ' ').trim(); +} + +function normalizeTitle(text: string): string { + return text.trim().toLowerCase().replace(/\s+/g, ' '); +} + +function extractCandidateSeasonHints(text: string): Set { + const normalized = normalizeTitle(text); + const matches = [ + ...normalized.matchAll(/\bseason\s*(\d{1,2})\b/gi), + ...normalized.matchAll(/\bs(\d{1,2})(?:\b|\D)/gi), + ]; + const values = new Set(); + for (const match of matches) { + const value = Number.parseInt(match[1]!, 10); + if (Number.isInteger(value)) { + values.add(value); + } + } + return values; +} + +function isSeasonMentioned(titles: string[], season: number | null): boolean { + if (!season) { + return false; + } + const hints = titles.flatMap((title) => [...extractCandidateSeasonHints(title)]); + return hints.includes(season); +} + +function pickBestSearchResult( + title: string, + episode: number | null, + season: number | null, + media: AnilistMedia[], +): { id: number; title: string } | null { + const cleanedTitle = removeSeasonHint(title); + const targets = [title, cleanedTitle] + .map(normalizeTitle) + .map((value) => value.trim()) + .filter((value, index, all) => value.length > 0 && all.indexOf(value) === index); + + const filtered = episode === null + ? 
media + : media.filter((item) => { + const total = item.episodes; + return total === null || total >= episode; + }); + const candidates = filtered.length > 0 ? filtered : media; + if (candidates.length === 0) { + return null; + } + + const scored = candidates.map((item) => { + const candidateTitles = [ + item.title?.romaji, + item.title?.english, + item.title?.native, + ] + .filter((value): value is string => typeof value === 'string') + .map((value) => normalizeTitle(value)); + + let score = 0; + + for (const target of targets) { + if (candidateTitles.includes(target)) { + score += 120; + continue; + } + if (candidateTitles.some((itemTitle) => itemTitle.includes(target))) { + score += 30; + } + if (candidateTitles.some((itemTitle) => target.includes(itemTitle))) { + score += 10; + } + } + + if (episode !== null && item.episodes === episode) { + score += 20; + } + + if (season !== null && isSeasonMentioned(candidateTitles, season)) { + score += 15; + } + + return { item, score }; + }); + + scored.sort((a, b) => { + if (b.score !== a.score) return b.score - a.score; + return b.item.id - a.item.id; + }); + + const selected = scored[0]!; + const selectedTitle = selected.item.title?.english ?? selected.item.title?.romaji ?? selected.item.title?.native ?? title; + return { id: selected.item.id, title: selectedTitle }; +} + +function buildSearchCandidates(parsed: CoverArtCandidate): string[] { + const candidateTitles = [ + parsed.title, + ...(parsed.source === 'guessit' && parsed.season !== null && parsed.season > 1 + ? 
[`${parsed.title} Season ${parsed.season}`] + : []), + ]; + return candidateTitles + .map((title) => title.trim()) + .filter((title, index, all) => title.length > 0 && all.indexOf(title) === index); +} + +async function searchAnilist( + rateLimiter: AnilistRateLimiter, + title: string, +): Promise<{ media: AnilistMedia[]; rateLimited: boolean }> { + await rateLimiter.acquire(); + + const res = await fetch(ANILIST_GRAPHQL_URL, { + method: 'POST', + headers: { 'Content-Type': 'application/json', Accept: 'application/json' }, + body: JSON.stringify({ query: SEARCH_QUERY, variables: { search: title } }), + }); + + rateLimiter.recordResponse(res.headers); + + if (res.status === 429) { + return { media: [], rateLimited: true }; + } + + if (!res.ok) { + throw new Error(`Anilist search failed: ${res.status} ${res.statusText}`); + } + + const json = (await res.json()) as AnilistSearchResponse; + const mediaList = json.data?.Page?.media; + if (!mediaList || mediaList.length === 0) { + return { media: [], rateLimited: false }; + } + + return { media: mediaList, rateLimited: false }; +} + +async function downloadImage(url: string): Promise { + try { + const res = await fetch(url); + if (!res.ok) return null; + const arrayBuf = await res.arrayBuffer(); + return Buffer.from(arrayBuf); + } catch { + return null; + } +} + +export function createCoverArtFetcher( + rateLimiter: AnilistRateLimiter, + logger: Logger, + options: CoverArtFetcherOptions = {}, +): CoverArtFetcher { + const resolveMediaInfo = async (canonicalTitle: string): Promise => { + const parsed = await guessAnilistMediaInfo(null, canonicalTitle, { + runGuessit: options.runGuessit ?? 
runGuessit, + }); + if (!parsed) { + return null; + } + return { + title: parsed.title, + season: parsed.season, + episode: parsed.episode, + source: parsed.source, + }; + }; + + return { + async fetchIfMissing(db, videoId, canonicalTitle): Promise { + const existing = getCoverArt(db, videoId); + if (existing?.coverBlob) { + return true; + } + + if (existing?.coverUrl) { + const coverBlob = await downloadImage(existing.coverUrl); + if (coverBlob) { + upsertCoverArt(db, videoId, { + anilistId: existing.anilistId, + coverUrl: existing.coverUrl, + coverBlob, + titleRomaji: existing.titleRomaji, + titleEnglish: existing.titleEnglish, + episodesTotal: existing.episodesTotal, + }); + return true; + } + } + + if ( + existing && + existing.coverUrl === null && + existing.anilistId === null && + Date.now() - existing.fetchedAtMs < NO_MATCH_RETRY_MS + ) { + return false; + } + + const cleaned = stripFilenameTags(canonicalTitle); + if (!cleaned) { + logger.warn('cover-art: empty title after stripping tags for videoId=%d', videoId); + upsertCoverArt(db, videoId, { + anilistId: null, + coverUrl: null, + coverBlob: null, + titleRomaji: null, + titleEnglish: null, + episodesTotal: null, + }); + return false; + } + + const parsedInfo = await resolveMediaInfo(canonicalTitle); + const searchBase = parsedInfo?.title ?? cleaned; + const searchCandidates = parsedInfo + ? buildSearchCandidates(parsedInfo) + : [cleaned]; + + const effectiveCandidates = searchCandidates.includes(cleaned) + ? searchCandidates + : [...searchCandidates, cleaned]; + + let selected: AnilistMedia | null = null; + let rateLimited = false; + + for (const candidate of effectiveCandidates) { + logger.info('cover-art: searching Anilist for "%s" (videoId=%d)', candidate, videoId); + + try { + const result = await searchAnilist(rateLimiter, candidate); + rateLimited = result.rateLimited; + if (result.media.length === 0) { + continue; + } + + const picked = pickBestSearchResult( + searchBase, + parsedInfo?.episode ?? 
null, + parsedInfo?.season ?? null, + result.media, + ); + if (picked) { + const match = result.media.find((media) => media.id === picked.id); + if (match) { + selected = match; + break; + } + } + } catch (err) { + logger.error('cover-art: Anilist search error for "%s": %s', candidate, err); + return false; + } + } + + if (rateLimited) { + logger.warn('cover-art: rate-limited by Anilist, skipping videoId=%d', videoId); + return false; + } + + if (!selected) { + logger.info('cover-art: no Anilist results for "%s", caching no-match', searchBase); + upsertCoverArt(db, videoId, { + anilistId: null, + coverUrl: null, + coverBlob: null, + titleRomaji: null, + titleEnglish: null, + episodesTotal: null, + }); + return false; + } + + const coverUrl = selected.coverImage?.large ?? selected.coverImage?.medium ?? null; + let coverBlob: Buffer | null = null; + if (coverUrl) { + coverBlob = await downloadImage(coverUrl); + } + + upsertCoverArt(db, videoId, { + anilistId: selected.id, + coverUrl, + coverBlob, + titleRomaji: selected.title?.romaji ?? null, + titleEnglish: selected.title?.english ?? null, + episodesTotal: selected.episodes ?? null, + }); + + updateAnimeAnilistInfo(db, videoId, { + anilistId: selected.id, + titleRomaji: selected.title?.romaji ?? null, + titleEnglish: selected.title?.english ?? null, + titleNative: selected.title?.native ?? null, + episodesTotal: selected.episodes ?? null, + }); + + logger.info( + 'cover-art: cached art for videoId=%d anilistId=%d title="%s"', + videoId, + selected.id, + selected.title?.romaji ?? 
searchBase, + ); + + return true; + }, + }; +} diff --git a/src/core/services/immersion-tracker-service.test.ts b/src/core/services/immersion-tracker-service.test.ts index d5bad4e..2c55d41 100644 --- a/src/core/services/immersion-tracker-service.test.ts +++ b/src/core/services/immersion-tracker-service.test.ts @@ -12,6 +12,7 @@ import { resolveBoundedInt, } from './immersion-tracker/reducer'; import type { QueuedWrite } from './immersion-tracker/types'; +import { PartOfSpeech, type MergedToken } from '../../types'; type ImmersionTrackerService = import('./immersion-tracker-service').ImmersionTrackerService; type ImmersionTrackerServiceCtor = @@ -26,6 +27,34 @@ async function loadTrackerCtor(): Promise { return trackerCtor; } +async function waitForPendingAnimeMetadata(tracker: ImmersionTrackerService): Promise { + const privateApi = tracker as unknown as { + sessionState: { videoId: number } | null; + pendingAnimeMetadataUpdates?: Map>; + }; + const videoId = privateApi.sessionState?.videoId; + if (!videoId) return; + await privateApi.pendingAnimeMetadataUpdates?.get(videoId); +} + +function makeMergedToken(overrides: Partial): MergedToken { + return { + surface: '', + reading: '', + headword: '', + startPos: 0, + endPos: 0, + partOfSpeech: PartOfSpeech.other, + pos1: '', + pos2: '', + pos3: '', + isMerged: true, + isKnown: false, + isNPlusOneTarget: false, + ...overrides, + }; +} + function makeDbPath(): string { const dir = fs.mkdtempSync(path.join(os.tmpdir(), 'subminer-immersion-test-')); return path.join(dir, 'immersion.sqlite'); @@ -222,6 +251,308 @@ test('persists and retrieves minimum immersion tracking fields', async () => { } }); +test('recordSubtitleLine persists counted allowed tokenized vocabulary rows and subtitle-line occurrences', async () => { + const dbPath = makeDbPath(); + let tracker: ImmersionTrackerService | null = null; + + try { + const Ctor = await loadTrackerCtor(); + tracker = new Ctor({ dbPath }); + + 
tracker.handleMediaChange('/tmp/Little Witch Academia S02E04.mkv', 'Episode 4'); + await waitForPendingAnimeMetadata(tracker); + tracker.recordSubtitleLine('猫 猫 日 日 は 知っている', 0, 1, [ + makeMergedToken({ + surface: '猫', + headword: '猫', + reading: 'ねこ', + partOfSpeech: PartOfSpeech.noun, + pos1: '名詞', + pos2: '一般', + }), + makeMergedToken({ + surface: '猫', + headword: '猫', + reading: 'ねこ', + partOfSpeech: PartOfSpeech.noun, + pos1: '名詞', + pos2: '一般', + }), + makeMergedToken({ + surface: 'は', + headword: 'は', + reading: 'は', + partOfSpeech: PartOfSpeech.particle, + pos1: '助詞', + pos2: '係助詞', + }), + makeMergedToken({ + surface: '知っている', + headword: '知る', + reading: 'しっている', + partOfSpeech: PartOfSpeech.other, + pos1: '動詞', + pos2: '自立', + }), + ]); + + const privateApi = tracker as unknown as { + flushTelemetry: (force?: boolean) => void; + flushNow: () => void; + }; + privateApi.flushTelemetry(true); + privateApi.flushNow(); + + const db = new Database(dbPath); + const rows = db + .prepare( + `SELECT headword, word, reading, part_of_speech, pos1, pos2, frequency + FROM imm_words + ORDER BY id ASC`, + ) + .all() as Array<{ + headword: string; + word: string; + reading: string; + part_of_speech: string; + pos1: string; + pos2: string; + frequency: number; + }>; + const lineRows = db + .prepare( + `SELECT video_id, anime_id, line_index, segment_start_ms, segment_end_ms, text + FROM imm_subtitle_lines + ORDER BY line_id ASC`, + ) + .all() as Array<{ + video_id: number; + anime_id: number | null; + line_index: number; + segment_start_ms: number | null; + segment_end_ms: number | null; + text: string; + }>; + const wordOccurrenceRows = db + .prepare( + `SELECT o.occurrence_count, w.headword, w.word, w.reading + FROM imm_word_line_occurrences o + JOIN imm_words w ON w.id = o.word_id + ORDER BY o.line_id ASC, o.word_id ASC`, + ) + .all() as Array<{ + occurrence_count: number; + headword: string; + word: string; + reading: string; + }>; + const kanjiOccurrenceRows = db + 
.prepare( + `SELECT o.occurrence_count, k.kanji + FROM imm_kanji_line_occurrences o + JOIN imm_kanji k ON k.id = o.kanji_id + ORDER BY o.line_id ASC, k.kanji ASC`, + ) + .all() as Array<{ + occurrence_count: number; + kanji: string; + }>; + db.close(); + + assert.deepEqual(rows, [ + { + headword: '猫', + word: '猫', + reading: 'ねこ', + part_of_speech: PartOfSpeech.noun, + pos1: '名詞', + pos2: '一般', + frequency: 2, + }, + { + headword: '知る', + word: '知っている', + reading: 'しっている', + part_of_speech: PartOfSpeech.verb, + pos1: '動詞', + pos2: '自立', + frequency: 1, + }, + ]); + assert.equal(lineRows.length, 1); + assert.equal(lineRows[0]?.line_index, 1); + assert.equal(lineRows[0]?.segment_start_ms, 0); + assert.equal(lineRows[0]?.segment_end_ms, 1000); + assert.equal(lineRows[0]?.text, '猫 猫 日 日 は 知っている'); + assert.ok(lineRows[0]?.video_id); + assert.ok(lineRows[0]?.anime_id); + assert.deepEqual(wordOccurrenceRows, [ + { + occurrence_count: 2, + headword: '猫', + word: '猫', + reading: 'ねこ', + }, + { + occurrence_count: 1, + headword: '知る', + word: '知っている', + reading: 'しっている', + }, + ]); + assert.deepEqual(kanjiOccurrenceRows, [ + { + occurrence_count: 2, + kanji: '日', + }, + { + occurrence_count: 2, + kanji: '猫', + }, + { + occurrence_count: 1, + kanji: '知', + }, + ]); + } finally { + tracker?.destroy(); + cleanupDbPath(dbPath); + } +}); + +test('handleMediaChange links parsed anime metadata on the active video row', async () => { + const dbPath = makeDbPath(); + let tracker: ImmersionTrackerService | null = null; + + try { + const Ctor = await loadTrackerCtor(); + tracker = new Ctor({ dbPath }); + + tracker.handleMediaChange('/tmp/Little Witch Academia S02E05.mkv', 'Episode 5'); + await waitForPendingAnimeMetadata(tracker); + + const privateApi = tracker as unknown as { + db: DatabaseSync; + sessionState: { videoId: number } | null; + }; + const videoId = privateApi.sessionState?.videoId; + assert.ok(videoId); + + const row = privateApi.db + .prepare( + ` + SELECT + v.anime_id, 
+ v.parsed_basename, + v.parsed_title, + v.parsed_season, + v.parsed_episode, + v.parser_source, + a.canonical_title AS anime_title, + a.anilist_id + FROM imm_videos v + LEFT JOIN imm_anime a ON a.anime_id = v.anime_id + WHERE v.video_id = ? + `, + ) + .get(videoId) as { + anime_id: number | null; + parsed_basename: string | null; + parsed_title: string | null; + parsed_season: number | null; + parsed_episode: number | null; + parser_source: string | null; + anime_title: string | null; + anilist_id: number | null; + } | null; + + assert.ok(row); + assert.ok(row?.anime_id); + assert.equal(row?.parsed_basename, 'Little Witch Academia S02E05.mkv'); + assert.equal(row?.parsed_title, 'Little Witch Academia'); + assert.equal(row?.parsed_season, 2); + assert.equal(row?.parsed_episode, 5); + assert.ok(row?.parser_source === 'guessit' || row?.parser_source === 'fallback'); + assert.equal(row?.anime_title, 'Little Witch Academia'); + assert.equal(row?.anilist_id, null); + } finally { + tracker?.destroy(); + cleanupDbPath(dbPath); + } +}); + +test('handleMediaChange reuses the same provisional anime row across matching files', async () => { + const dbPath = makeDbPath(); + let tracker: ImmersionTrackerService | null = null; + + try { + const Ctor = await loadTrackerCtor(); + tracker = new Ctor({ dbPath }); + + tracker.handleMediaChange('/tmp/Little Witch Academia S02E05.mkv', 'Episode 5'); + await waitForPendingAnimeMetadata(tracker); + + tracker.handleMediaChange('/tmp/Little Witch Academia S02E06.mkv', 'Episode 6'); + await waitForPendingAnimeMetadata(tracker); + + const privateApi = tracker as unknown as { + db: DatabaseSync; + }; + const rows = privateApi.db + .prepare( + ` + SELECT + v.source_path, + v.anime_id, + v.parsed_episode, + a.canonical_title AS anime_title, + a.anilist_id + FROM imm_videos v + LEFT JOIN imm_anime a ON a.anime_id = v.anime_id + WHERE v.source_path IN (?, ?) 
+ ORDER BY v.source_path + `, + ) + .all('/tmp/Little Witch Academia S02E05.mkv', '/tmp/Little Witch Academia S02E06.mkv') as + Array<{ + source_path: string | null; + anime_id: number | null; + parsed_episode: number | null; + anime_title: string | null; + anilist_id: number | null; + }>; + + assert.equal(rows.length, 2); + assert.ok(rows[0]?.anime_id); + assert.equal(rows[0]?.anime_id, rows[1]?.anime_id); + assert.deepEqual( + rows.map((row) => ({ + sourcePath: row.source_path, + parsedEpisode: row.parsed_episode, + animeTitle: row.anime_title, + anilistId: row.anilist_id, + })), + [ + { + sourcePath: '/tmp/Little Witch Academia S02E05.mkv', + parsedEpisode: 5, + animeTitle: 'Little Witch Academia', + anilistId: null, + }, + { + sourcePath: '/tmp/Little Witch Academia S02E06.mkv', + parsedEpisode: 6, + animeTitle: 'Little Witch Academia', + anilistId: null, + }, + ], + ); + } finally { + tracker?.destroy(); + cleanupDbPath(dbPath); + } +}); + test('applies configurable queue, flush, and retention policy', async () => { const dbPath = makeDbPath(); let tracker: ImmersionTrackerService | null = null; diff --git a/src/core/services/immersion-tracker-service.ts b/src/core/services/immersion-tracker-service.ts index ff02283..dccf784 100644 --- a/src/core/services/immersion-tracker-service.ts +++ b/src/core/services/immersion-tracker-service.ts @@ -1,7 +1,8 @@ import path from 'node:path'; import * as fs from 'node:fs'; import { createLogger } from '../../logger'; -import { getLocalVideoMetadata } from './immersion-tracker/metadata'; +import type { CoverArtFetcher } from './anilist/cover-art-fetcher'; +import { getLocalVideoMetadata, guessAnimeVideoMetadata } from './immersion-tracker/metadata'; import { pruneRetention, runRollupMaintenance } from './immersion-tracker/maintenance'; import { Database, type DatabaseSync } from './immersion-tracker/sqlite'; import { finalizeSessionRecord, startSessionRecord } from './immersion-tracker/session'; @@ -10,23 +11,58 @@ import 
{ createTrackerPreparedStatements, ensureSchema, executeQueuedWrite, + getOrCreateAnimeRecord, getOrCreateVideoRecord, + linkVideoToAnimeRecord, type TrackerPreparedStatements, updateVideoMetadataRecord, updateVideoTitleRecord, } from './immersion-tracker/storage'; import { + cleanupVocabularyStats, + getAnimeCoverArt, + getAnimeDailyRollups, + getAnimeAnilistEntries, + getAnimeDetail, + getAnimeEpisodes, + getAnimeLibrary, + getAnimeWords, + getEpisodeCardEvents, + getEpisodeSessions, + getEpisodeWords, + getCoverArt, getDailyRollups, + getEpisodesPerDay, + getKanjiAnimeAppearances, + getKanjiDetail, + getKanjiWords, + getNewAnimePerDay, + getSimilarWords, + getStreakCalendar, + getKanjiOccurrences, + getKanjiStats, + getMediaDailyRollups, + getMediaDetail, + getMediaLibrary, + getMediaSessions, getMonthlyRollups, getQueryHints, + getSessionEvents, getSessionSummaries, getSessionTimeline, + getVocabularyStats, + getWatchTimePerAnime, + getWordAnimeAppearances, + getWordDetail, + getWordOccurrences, + getVideoDurationMs, + markVideoWatched, } from './immersion-tracker/query'; import { buildVideoKey, calculateTextMetrics, - extractLineVocabulary, deriveCanonicalTitle, + isKanji, isRemoteSource, normalizeMediaPath, normalizeText, @@ -57,19 +93,73 @@ import { SOURCE_TYPE_LOCAL, SOURCE_TYPE_REMOTE, type ImmersionSessionRollupRow, + type EpisodeCardEventRow, + type EpisodesPerDayRow, type ImmersionTrackerOptions, + type KanjiAnimeAppearanceRow, + type KanjiDetailRow, + type KanjiOccurrenceRow, + type KanjiStatsRow, + type KanjiWordRow, + type LegacyVocabularyPosResolution, + type LegacyVocabularyPosRow, + type AnimeAnilistEntryRow, + type AnimeDetailRow, + type AnimeEpisodeRow, + type AnimeLibraryRow, + type AnimeWordRow, + type MediaArtRow, + type MediaDetailRow, + type MediaLibraryRow, + type NewAnimePerDayRow, type QueuedWrite, + type SessionEventRow, type SessionState, type SessionSummaryQueryRow, type SessionTimelineRow, + type SimilarWordRow, + type 
StreakCalendarRow, + type VocabularyCleanupSummary, + type WatchTimePerAnimeRow, + type WordAnimeAppearanceRow, + type WordDetailRow, + type WordOccurrenceRow, + type VocabularyStatsRow, } from './immersion-tracker/types'; +import type { MergedToken } from '../../types'; +import { shouldExcludeTokenFromVocabularyPersistence } from './tokenizer/annotation-stage'; +import { deriveStoredPartOfSpeech } from './tokenizer/part-of-speech'; export type { + AnimeAnilistEntryRow, + AnimeDetailRow, + AnimeEpisodeRow, + AnimeLibraryRow, + AnimeWordRow, + EpisodeCardEventRow, + EpisodesPerDayRow, ImmersionSessionRollupRow, ImmersionTrackerOptions, ImmersionTrackerPolicy, + KanjiAnimeAppearanceRow, + KanjiDetailRow, + KanjiOccurrenceRow, + KanjiStatsRow, + KanjiWordRow, + MediaArtRow, + MediaDetailRow, + MediaLibraryRow, + NewAnimePerDayRow, + SessionEventRow, SessionSummaryQueryRow, SessionTimelineRow, + SimilarWordRow, + StreakCalendarRow, + WatchTimePerAnimeRow, + WordAnimeAppearanceRow, + WordDetailRow, + WordOccurrenceRow, + VocabularyStatsRow, } from './immersion-tracker/types'; export class ImmersionTrackerService { @@ -98,9 +188,17 @@ export class ImmersionTrackerService { private currentVideoKey = ''; private currentMediaPathOrUrl = ''; private readonly preparedStatements: TrackerPreparedStatements; + private coverArtFetcher: CoverArtFetcher | null = null; + private readonly pendingCoverFetches = new Map>(); + private readonly recordedSubtitleKeys = new Set(); + private readonly pendingAnimeMetadataUpdates = new Map>(); + private readonly resolveLegacyVocabularyPos: + | ((row: LegacyVocabularyPosRow) => Promise) + | undefined; constructor(options: ImmersionTrackerOptions) { this.dbPath = options.dbPath; + this.resolveLegacyVocabularyPos = options.resolveLegacyVocabularyPos; const parentDir = path.dirname(this.dbPath); if (!fs.existsSync(parentDir)) { fs.mkdirSync(parentDir, { recursive: true }); @@ -198,6 +296,8 @@ export class ImmersionTrackerService { async 
getQueryHints(): Promise<{ totalSessions: number; activeSessions: number; + episodesToday: number; + activeAnimeCount: number; }> { return getQueryHints(this.db); } @@ -210,6 +310,180 @@ export class ImmersionTrackerService { return getMonthlyRollups(this.db, limit); } + async getVocabularyStats(limit = 100, excludePos?: string[]): Promise { + return getVocabularyStats(this.db, limit, excludePos); + } + + async cleanupVocabularyStats(): Promise { + return cleanupVocabularyStats(this.db, { + resolveLegacyPos: this.resolveLegacyVocabularyPos, + }); + } + + async getKanjiStats(limit = 100): Promise { + return getKanjiStats(this.db, limit); + } + + async getWordOccurrences( + headword: string, + word: string, + reading: string, + limit = 100, + offset = 0, + ): Promise { + return getWordOccurrences(this.db, headword, word, reading, limit, offset); + } + + async getKanjiOccurrences( + kanji: string, + limit = 100, + offset = 0, + ): Promise { + return getKanjiOccurrences(this.db, kanji, limit, offset); + } + + async getSessionEvents(sessionId: number, limit = 500): Promise { + return getSessionEvents(this.db, sessionId, limit); + } + + async getMediaLibrary(): Promise { + return getMediaLibrary(this.db); + } + + async getMediaDetail(videoId: number): Promise { + return getMediaDetail(this.db, videoId); + } + + async getMediaSessions(videoId: number, limit = 100): Promise { + return getMediaSessions(this.db, videoId, limit); + } + + async getMediaDailyRollups(videoId: number, limit = 90): Promise { + return getMediaDailyRollups(this.db, videoId, limit); + } + + async getCoverArt(videoId: number): Promise { + return getCoverArt(this.db, videoId); + } + + async getAnimeLibrary(): Promise { + return getAnimeLibrary(this.db); + } + + async getAnimeDetail(animeId: number): Promise { + return getAnimeDetail(this.db, animeId); + } + + async getAnimeEpisodes(animeId: number): Promise { + return getAnimeEpisodes(this.db, animeId); + } + + async getAnimeAnilistEntries(animeId: 
number): Promise { + return getAnimeAnilistEntries(this.db, animeId); + } + + async getAnimeCoverArt(animeId: number): Promise { + return getAnimeCoverArt(this.db, animeId); + } + + async getAnimeWords(animeId: number, limit = 50): Promise { + return getAnimeWords(this.db, animeId, limit); + } + + async getEpisodeWords(videoId: number, limit = 50): Promise { + return getEpisodeWords(this.db, videoId, limit); + } + + async getEpisodeSessions(videoId: number): Promise { + return getEpisodeSessions(this.db, videoId); + } + + async setVideoWatched(videoId: number, watched: boolean): Promise { + markVideoWatched(this.db, videoId, watched); + } + + async getEpisodeCardEvents(videoId: number): Promise { + return getEpisodeCardEvents(this.db, videoId); + } + + async getAnimeDailyRollups(animeId: number, limit = 90): Promise { + return getAnimeDailyRollups(this.db, animeId, limit); + } + + async getStreakCalendar(days = 90): Promise { + return getStreakCalendar(this.db, days); + } + + async getEpisodesPerDay(limit = 90): Promise { + return getEpisodesPerDay(this.db, limit); + } + + async getNewAnimePerDay(limit = 90): Promise { + return getNewAnimePerDay(this.db, limit); + } + + async getWatchTimePerAnime(limit = 90): Promise { + return getWatchTimePerAnime(this.db, limit); + } + + async getWordDetail(wordId: number): Promise { + return getWordDetail(this.db, wordId); + } + + async getWordAnimeAppearances(wordId: number): Promise { + return getWordAnimeAppearances(this.db, wordId); + } + + async getSimilarWords(wordId: number, limit = 10): Promise { + return getSimilarWords(this.db, wordId, limit); + } + + async getKanjiDetail(kanjiId: number): Promise { + return getKanjiDetail(this.db, kanjiId); + } + + async getKanjiAnimeAppearances(kanjiId: number): Promise { + return getKanjiAnimeAppearances(this.db, kanjiId); + } + + async getKanjiWords(kanjiId: number, limit = 20): Promise { + return getKanjiWords(this.db, kanjiId, limit); + } + + setCoverArtFetcher(fetcher: 
CoverArtFetcher | null): void { + this.coverArtFetcher = fetcher; + } + + async ensureCoverArt(videoId: number): Promise { + const existing = getCoverArt(this.db, videoId); + if (existing?.coverBlob) { + return true; + } + if (!this.coverArtFetcher) { + return false; + } + const inFlight = this.pendingCoverFetches.get(videoId); + if (inFlight) { + return await inFlight; + } + + const fetchPromise = (async () => { + const detail = getMediaDetail(this.db, videoId); + const canonicalTitle = detail?.canonicalTitle?.trim(); + if (!canonicalTitle) { + return false; + } + return await this.coverArtFetcher!.fetchIfMissing(this.db, videoId, canonicalTitle); + })(); + + this.pendingCoverFetches.set(videoId, fetchPromise); + try { + return await fetchPromise; + } finally { + this.pendingCoverFetches.delete(videoId); + } + } + handleMediaChange(mediaPath: string | null, mediaTitle: string | null): void { const normalizedPath = normalizeMediaPath(mediaPath); const normalizedTitle = normalizeText(mediaTitle); @@ -254,6 +528,7 @@ export class ImmersionTrackerService { `Starting immersion session for path=${normalizedPath} videoId=${sessionInfo.videoId}`, ); this.startSession(sessionInfo.videoId, sessionInfo.startedAtMs); + this.captureAnimeMetadataAsync(sessionInfo.videoId, normalizedPath, normalizedTitle || null); this.captureVideoMetadataAsync(sessionInfo.videoId, sourceType, normalizedPath); } @@ -265,41 +540,111 @@ export class ImmersionTrackerService { this.updateVideoTitleForActiveSession(normalizedTitle); } - recordSubtitleLine(text: string, startSec: number, endSec: number): void { + recordSubtitleLine( + text: string, + startSec: number, + endSec: number, + tokens?: MergedToken[] | null, + ): void { if (!this.sessionState || !text.trim()) return; const cleaned = normalizeText(text); if (!cleaned) return; + + if (!endSec || endSec <= 0) { + return; + } + + const startMs = secToMs(startSec); + const subtitleKey = `${startMs}:${cleaned}`; + if 
(this.recordedSubtitleKeys.has(subtitleKey)) { + return; + } + this.recordedSubtitleKeys.add(subtitleKey); + const nowMs = Date.now(); const nowSec = nowMs / 1000; const metrics = calculateTextMetrics(cleaned); - const extractedVocabulary = extractLineVocabulary(cleaned); this.sessionState.currentLineIndex += 1; this.sessionState.linesSeen += 1; this.sessionState.wordsSeen += metrics.words; this.sessionState.tokensSeen += metrics.tokens; this.sessionState.pendingTelemetry = true; - for (const { headword, word, reading } of extractedVocabulary.words) { - this.recordWrite({ - kind: 'word', + const wordOccurrences = new Map< + string, + { + headword: string; + word: string; + reading: string; + partOfSpeech: string; + pos1: string; + pos2: string; + pos3: string; + occurrenceCount: number; + } + >(); + for (const token of tokens ?? []) { + if (shouldExcludeTokenFromVocabularyPersistence(token)) { + continue; + } + const headword = normalizeText(token.headword || token.surface); + const word = normalizeText(token.surface || token.headword); + const reading = normalizeText(token.reading); + if (!headword || !word) { + continue; + } + const wordKey = [ headword, word, reading, - firstSeen: nowSec, - lastSeen: nowSec, + ].join('\u0000'); + const storedPartOfSpeech = deriveStoredPartOfSpeech({ + partOfSpeech: token.partOfSpeech, + pos1: token.pos1 ?? '', + }); + const existing = wordOccurrences.get(wordKey); + if (existing) { + existing.occurrenceCount += 1; + continue; + } + wordOccurrences.set(wordKey, { + headword, + word, + reading, + partOfSpeech: storedPartOfSpeech, + pos1: token.pos1 ?? '', + pos2: token.pos2 ?? '', + pos3: token.pos3 ?? '', + occurrenceCount: 1, }); } - for (const kanji of extractedVocabulary.kanji) { - this.recordWrite({ - kind: 'kanji', - kanji, - firstSeen: nowSec, - lastSeen: nowSec, - }); + const kanjiCounts = new Map(); + for (const char of cleaned) { + if (!isKanji(char)) { + continue; + } + kanjiCounts.set(char, (kanjiCounts.get(char) ?? 
0) + 1); } + this.recordWrite({ + kind: 'subtitleLine', + sessionId: this.sessionState.sessionId, + videoId: this.sessionState.videoId, + lineIndex: this.sessionState.currentLineIndex, + segmentStartMs: secToMs(startSec), + segmentEndMs: secToMs(endSec), + text: cleaned, + wordOccurrences: Array.from(wordOccurrences.values()), + kanjiOccurrences: Array.from(kanjiCounts.entries()).map(([kanji, occurrenceCount]) => ({ + kanji, + occurrenceCount, + })), + firstSeen: nowSec, + lastSeen: nowSec, + }); + this.recordWrite({ kind: 'event', sessionId: this.sessionState.sessionId, @@ -321,6 +666,16 @@ export class ImmersionTrackerService { }); } + recordMediaDuration(durationSec: number): void { + if (!this.sessionState || !Number.isFinite(durationSec) || durationSec <= 0) return; + const durationMs = Math.round(durationSec * 1000); + const current = getVideoDurationMs(this.db, this.sessionState.videoId); + if (current === 0 || Math.abs(current - durationMs) > 1000) { + this.db.prepare('UPDATE imm_videos SET duration_ms = ?, LAST_UPDATE_DATE = ? 
WHERE video_id = ?') + .run(durationMs, Date.now(), this.sessionState.videoId); + } + } + recordPlaybackPosition(mediaTimeSec: number | null): void { if (!this.sessionState || mediaTimeSec === null || !Number.isFinite(mediaTimeSec)) { return; @@ -391,6 +746,14 @@ export class ImmersionTrackerService { this.sessionState.lastWallClockMs = nowMs; this.sessionState.lastMediaMs = mediaMs; this.sessionState.pendingTelemetry = true; + + if (!this.sessionState.markedWatched) { + const durationMs = getVideoDurationMs(this.db, this.sessionState.videoId); + if (durationMs > 0 && mediaMs >= durationMs * 0.98) { + markVideoWatched(this.db, this.sessionState.videoId, true); + this.sessionState.markedWatched = true; + } + } } recordPauseState(isPaused: boolean): void { @@ -454,7 +817,7 @@ export class ImmersionTrackerService { }); } - recordCardsMined(count = 1): void { + recordCardsMined(count = 1, noteIds?: number[]): void { if (!this.sessionState) return; this.sessionState.cardsMined += count; this.sessionState.pendingTelemetry = true; @@ -465,7 +828,10 @@ export class ImmersionTrackerService { eventType: EVENT_CARD_MINED, wordsDelta: 0, cardsDelta: count, - payloadJson: sanitizePayload({ cardsMined: count }, this.maxPayloadBytes), + payloadJson: sanitizePayload( + { cardsMined: count, ...(noteIds?.length ? 
{ noteIds } : {}) }, + this.maxPayloadBytes, + ), }); } @@ -615,6 +981,7 @@ export class ImmersionTrackerService { private startSession(videoId: number, startedAtMs?: number): void { const { sessionId, state } = startSessionRecord(this.db, videoId, startedAtMs); this.sessionState = state; + this.recordedSubtitleKeys.clear(); this.recordWrite({ kind: 'telemetry', sessionId, @@ -673,6 +1040,48 @@ export class ImmersionTrackerService { })(); } + private captureAnimeMetadataAsync( + videoId: number, + mediaPath: string | null, + mediaTitle: string | null, + ): void { + const updatePromise = (async () => { + try { + const parsed = await guessAnimeVideoMetadata(mediaPath, mediaTitle); + if (this.isDestroyed || !parsed?.parsedTitle.trim()) { + return; + } + + const animeId = getOrCreateAnimeRecord(this.db, { + parsedTitle: parsed.parsedTitle, + canonicalTitle: parsed.parsedTitle, + anilistId: null, + titleRomaji: null, + titleEnglish: null, + titleNative: null, + metadataJson: parsed.parseMetadataJson, + }); + linkVideoToAnimeRecord(this.db, videoId, { + animeId, + parsedBasename: parsed.parsedBasename, + parsedTitle: parsed.parsedTitle, + parsedSeason: parsed.parsedSeason, + parsedEpisode: parsed.parsedEpisode, + parserSource: parsed.parserSource, + parserConfidence: parsed.parserConfidence, + parseMetadataJson: parsed.parseMetadataJson, + }); + } catch (error) { + this.logger.warn('Unable to capture anime metadata', (error as Error).message); + } + })(); + + this.pendingAnimeMetadataUpdates.set(videoId, updatePromise); + void updatePromise.finally(() => { + this.pendingAnimeMetadataUpdates.delete(videoId); + }); + } + private updateVideoTitleForActiveSession(canonicalTitle: string): void { if (!this.sessionState) return; updateVideoTitleRecord(this.db, this.sessionState.videoId, canonicalTitle); diff --git a/src/core/services/immersion-tracker/__tests__/query.test.ts b/src/core/services/immersion-tracker/__tests__/query.test.ts new file mode 100644 index 
0000000..468800d --- /dev/null +++ b/src/core/services/immersion-tracker/__tests__/query.test.ts @@ -0,0 +1,976 @@ +import assert from 'node:assert/strict'; +import fs from 'node:fs'; +import os from 'node:os'; +import path from 'node:path'; +import test from 'node:test'; +import { Database } from '../sqlite.js'; +import { + createTrackerPreparedStatements, + ensureSchema, + getOrCreateAnimeRecord, + getOrCreateVideoRecord, + linkVideoToAnimeRecord, +} from '../storage.js'; +import { startSessionRecord } from '../session.js'; +import { + cleanupVocabularyStats, + getAnimeDetail, + getAnimeEpisodes, + getAnimeLibrary, + getKanjiOccurrences, + getSessionSummaries, + getVocabularyStats, + getKanjiStats, + getSessionEvents, + getWordOccurrences, +} from '../query.js'; +import { SOURCE_TYPE_LOCAL, EVENT_SUBTITLE_LINE } from '../types.js'; + +function makeDbPath(): string { + const dir = fs.mkdtempSync(path.join(os.tmpdir(), 'subminer-imm-query-test-')); + return path.join(dir, 'immersion.sqlite'); +} + +function cleanupDbPath(dbPath: string): void { + const dir = path.dirname(dbPath); + if (!fs.existsSync(dir)) { + return; + } + + const bunRuntime = globalThis as typeof globalThis & { + Bun?: { + gc?: (force?: boolean) => void; + }; + }; + let lastError: NodeJS.ErrnoException | null = null; + for (let attempt = 0; attempt < 3; attempt += 1) { + try { + fs.rmSync(dir, { recursive: true, force: true }); + return; + } catch (error) { + const err = error as NodeJS.ErrnoException; + lastError = err; + if (process.platform !== 'win32' || err.code !== 'EBUSY') { + throw error; + } + bunRuntime.Bun?.gc?.(true); + Atomics.wait(new Int32Array(new SharedArrayBuffer(4)), 0, 0, 25); + } + } + if (lastError) { + throw lastError; + } +} + +test('getSessionSummaries returns sessionId and canonicalTitle', () => { + const dbPath = makeDbPath(); + const db = new Database(dbPath); + + try { + ensureSchema(db); + const stmts = createTrackerPreparedStatements(db); + + const videoId = 
getOrCreateVideoRecord(db, 'local:/tmp/query-test.mkv', { + canonicalTitle: 'Query Test Episode', + sourcePath: '/tmp/query-test.mkv', + sourceUrl: null, + sourceType: SOURCE_TYPE_LOCAL, + }); + + const startedAtMs = 1_000_000; + const { sessionId } = startSessionRecord(db, videoId, startedAtMs); + + stmts.telemetryInsertStmt.run( + sessionId, + startedAtMs + 1_000, + 3_000, + 2_500, + 5, + 10, + 10, + 1, + 2, + 1, + 0, + 0, + 0, + 0, + 0, + startedAtMs + 1_000, + startedAtMs + 1_000, + ); + + const rows = getSessionSummaries(db, 10); + + assert.ok(rows.length >= 1); + const row = rows.find((r) => r.sessionId === sessionId); + assert.ok(row, 'expected to find a row for the created session'); + assert.equal(typeof row.sessionId, 'number'); + assert.equal(row.sessionId, sessionId); + assert.equal(row.canonicalTitle, 'Query Test Episode'); + assert.equal(row.videoId, videoId); + assert.ok(row.linesSeen >= 5); + } finally { + db.close(); + cleanupDbPath(dbPath); + } +}); + +test('getSessionSummaries with no telemetry returns zero aggregates', () => { + const dbPath = makeDbPath(); + const db = new Database(dbPath); + + try { + ensureSchema(db); + + const videoId = getOrCreateVideoRecord(db, 'local:/tmp/no-telemetry.mkv', { + canonicalTitle: 'No Telemetry', + sourcePath: '/tmp/no-telemetry.mkv', + sourceUrl: null, + sourceType: SOURCE_TYPE_LOCAL, + }); + + const { sessionId } = startSessionRecord(db, videoId, 3_000_000); + + const rows = getSessionSummaries(db, 10); + const row = rows.find((r) => r.sessionId === sessionId); + assert.ok(row, 'expected to find the session with no telemetry'); + assert.equal(row.canonicalTitle, 'No Telemetry'); + assert.equal(row.totalWatchedMs, 0); + assert.equal(row.linesSeen, 0); + assert.equal(row.cardsMined, 0); + } finally { + db.close(); + cleanupDbPath(dbPath); + } +}); + +test('getVocabularyStats returns rows ordered by frequency descending', () => { + const dbPath = makeDbPath(); + const db = new Database(dbPath); + + try { + 
ensureSchema(db); + const stmts = createTrackerPreparedStatements(db); + + // Insert words: 猫 twice, 犬 once + stmts.wordUpsertStmt.run('猫', '猫', 'ねこ', 'noun', '名詞', '一般', '', 1_000, 2_000); + stmts.wordUpsertStmt.run('猫', '猫', 'ねこ', 'noun', '名詞', '一般', '', 1_000, 3_000); + stmts.wordUpsertStmt.run('犬', '犬', 'いぬ', 'noun', '名詞', '一般', '', 1_500, 1_500); + + const rows = getVocabularyStats(db, 10); + + assert.ok(rows.length >= 2); + // First row should be 猫 (frequency 2) + const nekRow = rows.find((r) => r.headword === '猫'); + const inuRow = rows.find((r) => r.headword === '犬'); + assert.ok(nekRow, 'expected 猫 row'); + assert.ok(inuRow, 'expected 犬 row'); + assert.equal(nekRow.headword, '猫'); + assert.equal(nekRow.word, '猫'); + assert.equal(nekRow.reading, 'ねこ'); + assert.equal(nekRow.frequency, 2); + assert.equal(typeof nekRow.firstSeen, 'number'); + assert.equal(typeof nekRow.lastSeen, 'number'); + // Higher frequency should come first + const nekIdx = rows.indexOf(nekRow); + const inuIdx = rows.indexOf(inuRow); + assert.ok(nekIdx < inuIdx, 'higher frequency word should appear first'); + } finally { + db.close(); + cleanupDbPath(dbPath); + } +}); + +test('getVocabularyStats returns empty array when no words exist', () => { + const dbPath = makeDbPath(); + const db = new Database(dbPath); + + try { + ensureSchema(db); + const rows = getVocabularyStats(db, 10); + assert.deepEqual(rows, []); + } finally { + db.close(); + cleanupDbPath(dbPath); + } +}); + +test('cleanupVocabularyStats repairs stored POS metadata and removes excluded imm_words rows', async () => { + const dbPath = makeDbPath(); + const db = new Database(dbPath); + + try { + ensureSchema(db); + db.prepare( + `INSERT INTO imm_words ( + headword, word, reading, part_of_speech, pos1, pos2, pos3, first_seen, last_seen, frequency + ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)`, + ).run('猫', '猫', 'ねこ', 'noun', '名詞', '一般', '', 1_000, 1_500, 3); + db.prepare( + `INSERT INTO imm_words ( + headword, word, reading, 
part_of_speech, pos1, pos2, pos3, first_seen, last_seen, frequency + ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)`, + ).run('知っている', '知っている', '', 'other', '動詞', '自立', '', 1_025, 1_525, 4); + db.prepare( + `INSERT INTO imm_words ( + headword, word, reading, part_of_speech, pos1, pos2, pos3, first_seen, last_seen, frequency + ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)`, + ).run('は', 'は', 'は', 'particle', '助詞', '係助詞', '', 1_100, 1_600, 9); + db.prepare( + `INSERT INTO imm_words ( + headword, word, reading, part_of_speech, pos1, pos2, pos3, first_seen, last_seen, frequency + ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)`, + ).run('旧', '旧', '', '', '', '', '', 900, 950, 1); + db.prepare( + `INSERT INTO imm_words ( + headword, word, reading, part_of_speech, pos1, pos2, pos3, first_seen, last_seen, frequency + ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)`, + ).run('未解決', '未解決', '', '', '', '', '', 901, 951, 1); + + const result = await cleanupVocabularyStats(db, { + resolveLegacyPos: async (row) => { + if (row.headword === '旧') { + return { + partOfSpeech: 'noun', + headword: '旧', + reading: 'きゅう', + pos1: '名詞', + pos2: '一般', + pos3: '', + }; + } + if (row.headword === '知っている') { + return { + partOfSpeech: 'verb', + headword: '知る', + reading: 'しっている', + pos1: '動詞', + pos2: '自立', + pos3: '', + }; + } + return null; + }, + }); + const rows = getVocabularyStats(db, 10); + const repairedRows = db + .prepare( + `SELECT headword, word, reading, part_of_speech, pos1, pos2 + FROM imm_words + ORDER BY headword ASC, word ASC`, + ) + .all() as Array<{ + headword: string; + word: string; + reading: string; + part_of_speech: string; + pos1: string; + pos2: string; + }>; + + assert.deepEqual(result, { scanned: 5, kept: 3, deleted: 2, repaired: 2 }); + assert.deepEqual( + rows.map((row) => ({ headword: row.headword, frequency: row.frequency })), + [ + { headword: '知る', frequency: 4 }, + { headword: '猫', frequency: 3 }, + { headword: '旧', frequency: 1 }, + ], + ); + assert.deepEqual( + repairedRows, + [ + 
{ + headword: '旧', + word: '旧', + reading: 'きゅう', + part_of_speech: 'noun', + pos1: '名詞', + pos2: '一般', + }, + { + headword: '猫', + word: '猫', + reading: 'ねこ', + part_of_speech: 'noun', + pos1: '名詞', + pos2: '一般', + }, + { + headword: '知る', + word: '知っている', + reading: 'しっている', + part_of_speech: 'verb', + pos1: '動詞', + pos2: '自立', + }, + ], + ); + } finally { + db.close(); + cleanupDbPath(dbPath); + } +}); + +test('cleanupVocabularyStats merges repaired duplicates instead of violating the imm_words unique key', async () => { + const dbPath = makeDbPath(); + const db = new Database(dbPath); + + try { + ensureSchema(db); + const videoId = getOrCreateVideoRecord(db, 'local:/tmp/cleanup-merge.mkv', { + canonicalTitle: 'Cleanup Merge', + sourcePath: '/tmp/cleanup-merge.mkv', + sourceUrl: null, + sourceType: SOURCE_TYPE_LOCAL, + }); + const { sessionId } = startSessionRecord(db, videoId, 2_000_000); + const duplicateResult = db + .prepare( + `INSERT INTO imm_words ( + headword, word, reading, part_of_speech, pos1, pos2, pos3, first_seen, last_seen, frequency + ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)`, + ) + .run('知る', '知っている', 'しっている', 'verb', '動詞', '自立', '', 2_000, 2_500, 3); + const legacyResult = db + .prepare( + `INSERT INTO imm_words ( + headword, word, reading, part_of_speech, pos1, pos2, pos3, first_seen, last_seen, frequency + ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)`, + ) + .run('知っている', '知っている', '', 'other', '動詞', '自立', '', 1_000, 3_000, 4); + const lineResult = db + .prepare( + `INSERT INTO imm_subtitle_lines ( + session_id, event_id, video_id, anime_id, line_index, segment_start_ms, segment_end_ms, text, CREATED_DATE, LAST_UPDATE_DATE + ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)`, + ) + .run(sessionId, null, videoId, null, 1, 0, 1000, '知っている', 2_000, 2_000); + const lineId = Number(lineResult.lastInsertRowid); + const duplicateId = Number(duplicateResult.lastInsertRowid); + const legacyId = Number(legacyResult.lastInsertRowid); + db.prepare( + `INSERT INTO 
imm_word_line_occurrences (line_id, word_id, occurrence_count) + VALUES (?, ?, ?)`, + ).run(lineId, duplicateId, 2); + db.prepare( + `INSERT INTO imm_word_line_occurrences (line_id, word_id, occurrence_count) + VALUES (?, ?, ?)`, + ).run(lineId, legacyId, 1); + + const result = await cleanupVocabularyStats(db, { + resolveLegacyPos: async (row) => { + if (row.id !== legacyId) { + return null; + } + return { + partOfSpeech: 'verb', + headword: '知る', + reading: 'しっている', + pos1: '動詞', + pos2: '自立', + pos3: '', + }; + }, + }); + + const rows = db + .prepare( + `SELECT id, headword, word, reading, frequency, first_seen, last_seen + FROM imm_words + ORDER BY id ASC`, + ) + .all() as Array<{ + id: number; + headword: string; + word: string; + reading: string; + frequency: number; + first_seen: number; + last_seen: number; + }>; + const occurrences = getWordOccurrences(db, '知る', '知っている', 'しっている', 10); + + assert.deepEqual(result, { scanned: 2, kept: 1, deleted: 1, repaired: 1 }); + assert.deepEqual(rows, [ + { + id: duplicateId, + headword: '知る', + word: '知っている', + reading: 'しっている', + frequency: 7, + first_seen: 1_000, + last_seen: 3_000, + }, + ]); + assert.deepEqual(occurrences, [ + { + animeId: null, + animeTitle: null, + videoId, + videoTitle: 'Cleanup Merge', + sessionId, + lineIndex: 1, + segmentStartMs: 0, + segmentEndMs: 1000, + text: '知っている', + occurrenceCount: 3, + }, + ]); + } finally { + db.close(); + cleanupDbPath(dbPath); + } +}); + +test('getKanjiStats returns rows ordered by frequency descending', () => { + const dbPath = makeDbPath(); + const db = new Database(dbPath); + + try { + ensureSchema(db); + const stmts = createTrackerPreparedStatements(db); + + // Insert kanji: 日 twice, 月 once + stmts.kanjiUpsertStmt.run('日', 1_000, 2_000); + stmts.kanjiUpsertStmt.run('日', 1_000, 3_000); + stmts.kanjiUpsertStmt.run('月', 1_500, 1_500); + + const rows = getKanjiStats(db, 10); + + assert.ok(rows.length >= 2); + const nichiRow = rows.find((r) => r.kanji === '日'); + 
const tsukiRow = rows.find((r) => r.kanji === '月'); + assert.ok(nichiRow, 'expected 日 row'); + assert.ok(tsukiRow, 'expected 月 row'); + assert.equal(nichiRow.kanji, '日'); + assert.equal(nichiRow.frequency, 2); + assert.equal(typeof nichiRow.firstSeen, 'number'); + assert.equal(typeof nichiRow.lastSeen, 'number'); + // Higher frequency should come first + const nichiIdx = rows.indexOf(nichiRow); + const tsukiIdx = rows.indexOf(tsukiRow); + assert.ok(nichiIdx < tsukiIdx, 'higher frequency kanji should appear first'); + } finally { + db.close(); + cleanupDbPath(dbPath); + } +}); + +test('getKanjiStats returns empty array when no kanji exist', () => { + const dbPath = makeDbPath(); + const db = new Database(dbPath); + + try { + ensureSchema(db); + const rows = getKanjiStats(db, 10); + assert.deepEqual(rows, []); + } finally { + db.close(); + cleanupDbPath(dbPath); + } +}); + +test('getSessionEvents returns events ordered by ts_ms ascending', () => { + const dbPath = makeDbPath(); + const db = new Database(dbPath); + + try { + ensureSchema(db); + const stmts = createTrackerPreparedStatements(db); + + const videoId = getOrCreateVideoRecord(db, 'local:/tmp/events-test.mkv', { + canonicalTitle: 'Events Test', + sourcePath: '/tmp/events-test.mkv', + sourceUrl: null, + sourceType: SOURCE_TYPE_LOCAL, + }); + + const startedAtMs = 5_000_000; + const { sessionId } = startSessionRecord(db, videoId, startedAtMs); + + // Insert two events at different timestamps + stmts.eventInsertStmt.run( + sessionId, + startedAtMs + 2_000, + EVENT_SUBTITLE_LINE, + 1, + 0, + 800, + 2, + 0, + '{"line":"second"}', + startedAtMs + 2_000, + startedAtMs + 2_000, + ); + stmts.eventInsertStmt.run( + sessionId, + startedAtMs + 1_000, + EVENT_SUBTITLE_LINE, + 0, + 0, + 600, + 3, + 0, + '{"line":"first"}', + startedAtMs + 1_000, + startedAtMs + 1_000, + ); + + const events = getSessionEvents(db, sessionId, 50); + + assert.equal(events.length, 2); + // Should be ordered ASC by ts_ms + 
assert.equal(events[0]!.tsMs, startedAtMs + 1_000); + assert.equal(events[1]!.tsMs, startedAtMs + 2_000); + assert.equal(events[0]!.eventType, EVENT_SUBTITLE_LINE); + assert.equal(events[0]!.payload, '{"line":"first"}'); + assert.equal(events[1]!.payload, '{"line":"second"}'); + } finally { + db.close(); + cleanupDbPath(dbPath); + } +}); + +test('getSessionEvents returns empty array for session with no events', () => { + const dbPath = makeDbPath(); + const db = new Database(dbPath); + + try { + ensureSchema(db); + const events = getSessionEvents(db, 9999, 50); + assert.deepEqual(events, []); + } finally { + db.close(); + cleanupDbPath(dbPath); + } +}); + +test('getSessionEvents respects limit parameter', () => { + const dbPath = makeDbPath(); + const db = new Database(dbPath); + + try { + ensureSchema(db); + const stmts = createTrackerPreparedStatements(db); + + const videoId = getOrCreateVideoRecord(db, 'local:/tmp/events-limit.mkv', { + canonicalTitle: 'Events Limit Test', + sourcePath: '/tmp/events-limit.mkv', + sourceUrl: null, + sourceType: SOURCE_TYPE_LOCAL, + }); + + const startedAtMs = 7_000_000; + const { sessionId } = startSessionRecord(db, videoId, startedAtMs); + + // Insert 5 events + for (let i = 0; i < 5; i += 1) { + stmts.eventInsertStmt.run( + sessionId, + startedAtMs + i * 1_000, + EVENT_SUBTITLE_LINE, + i, + 0, + 500, + 1, + 0, + null, + startedAtMs + i * 1_000, + startedAtMs + i * 1_000, + ); + } + + const limited = getSessionEvents(db, sessionId, 3); + assert.equal(limited.length, 3); + } finally { + db.close(); + cleanupDbPath(dbPath); + } +}); + +test('anime-level queries group by anime_id and preserve episode-level rows', () => { + const dbPath = makeDbPath(); + const db = new Database(dbPath); + + try { + ensureSchema(db); + const stmts = createTrackerPreparedStatements(db); + + const lwaAnimeId = getOrCreateAnimeRecord(db, { + parsedTitle: 'Little Witch Academia', + canonicalTitle: 'Little Witch Academia', + anilistId: 33_435, + 
titleRomaji: 'Little Witch Academia', + titleEnglish: 'Little Witch Academia', + titleNative: 'リトルウィッチアカデミア', + metadataJson: '{"source":"anilist"}', + }); + const frierenAnimeId = getOrCreateAnimeRecord(db, { + parsedTitle: 'Frieren', + canonicalTitle: 'Frieren', + anilistId: 52_921, + titleRomaji: 'Sousou no Frieren', + titleEnglish: 'Frieren: Beyond Journey\'s End', + titleNative: '葬送のフリーレン', + metadataJson: '{"source":"anilist"}', + }); + + const lwaEpisode5 = getOrCreateVideoRecord(db, 'local:/tmp/lwa-s02e05.mkv', { + canonicalTitle: 'Episode 5', + sourcePath: '/tmp/Little Witch Academia S02E05.mkv', + sourceUrl: null, + sourceType: SOURCE_TYPE_LOCAL, + }); + const lwaEpisode6 = getOrCreateVideoRecord(db, 'local:/tmp/lwa-s02e06.mkv', { + canonicalTitle: 'Episode 6', + sourcePath: '/tmp/Little Witch Academia S02E06.mkv', + sourceUrl: null, + sourceType: SOURCE_TYPE_LOCAL, + }); + const frierenEpisode3 = getOrCreateVideoRecord(db, 'local:/tmp/frieren-03.mkv', { + canonicalTitle: 'Episode 3', + sourcePath: '/tmp/[SubsPlease] Frieren - 03 - Departure.mkv', + sourceUrl: null, + sourceType: SOURCE_TYPE_LOCAL, + }); + + linkVideoToAnimeRecord(db, lwaEpisode5, { + animeId: lwaAnimeId, + parsedBasename: 'Little Witch Academia S02E05.mkv', + parsedTitle: 'Little Witch Academia', + parsedSeason: 2, + parsedEpisode: 5, + parserSource: 'fallback', + parserConfidence: 1, + parseMetadataJson: '{"episode":5}', + }); + linkVideoToAnimeRecord(db, lwaEpisode6, { + animeId: lwaAnimeId, + parsedBasename: 'Little Witch Academia S02E06.mkv', + parsedTitle: 'Little Witch Academia', + parsedSeason: 2, + parsedEpisode: 6, + parserSource: 'fallback', + parserConfidence: 1, + parseMetadataJson: '{"episode":6}', + }); + linkVideoToAnimeRecord(db, frierenEpisode3, { + animeId: frierenAnimeId, + parsedBasename: '[SubsPlease] Frieren - 03 - Departure.mkv', + parsedTitle: 'Frieren', + parsedSeason: 1, + parsedEpisode: 3, + parserSource: 'fallback', + parserConfidence: 0.6, + 
parseMetadataJson: '{"episode":3}', + }); + + const sessionA = startSessionRecord(db, lwaEpisode5, 1_000_000); + const sessionB = startSessionRecord(db, lwaEpisode5, 1_010_000); + const sessionC = startSessionRecord(db, lwaEpisode6, 1_020_000); + const sessionD = startSessionRecord(db, frierenEpisode3, 1_030_000); + + stmts.telemetryInsertStmt.run( + sessionA.sessionId, + 1_001_000, + 4_000, + 3_000, + 10, + 25, + 25, + 1, + 3, + 2, + 0, + 0, + 0, + 0, + 0, + 1_001_000, + 1_001_000, + ); + stmts.telemetryInsertStmt.run( + sessionB.sessionId, + 1_011_000, + 5_000, + 4_000, + 11, + 27, + 27, + 2, + 4, + 2, + 0, + 0, + 0, + 0, + 0, + 1_011_000, + 1_011_000, + ); + stmts.telemetryInsertStmt.run( + sessionC.sessionId, + 1_021_000, + 6_000, + 5_000, + 12, + 28, + 28, + 3, + 5, + 4, + 0, + 0, + 0, + 0, + 0, + 1_021_000, + 1_021_000, + ); + stmts.telemetryInsertStmt.run( + sessionD.sessionId, + 1_031_000, + 4_000, + 3_500, + 8, + 20, + 20, + 1, + 2, + 1, + 0, + 0, + 0, + 0, + 0, + 1_031_000, + 1_031_000, + ); + + const animeLibrary = getAnimeLibrary(db); + assert.equal(animeLibrary.length, 2); + assert.deepEqual( + animeLibrary.map((row) => ({ + animeId: row.animeId, + canonicalTitle: row.canonicalTitle, + totalSessions: row.totalSessions, + totalActiveMs: row.totalActiveMs, + totalCards: row.totalCards, + episodeCount: row.episodeCount, + })), + [ + { + animeId: lwaAnimeId, + canonicalTitle: 'Little Witch Academia', + totalSessions: 3, + totalActiveMs: 12_000, + totalCards: 6, + episodeCount: 2, + }, + { + animeId: frierenAnimeId, + canonicalTitle: 'Frieren', + totalSessions: 1, + totalActiveMs: 3_500, + totalCards: 1, + episodeCount: 1, + }, + ], + ); + + const animeDetail = getAnimeDetail(db, lwaAnimeId); + assert.ok(animeDetail); + assert.equal(animeDetail?.animeId, lwaAnimeId); + assert.equal(animeDetail?.canonicalTitle, 'Little Witch Academia'); + assert.equal(animeDetail?.anilistId, 33_435); + assert.equal(animeDetail?.totalSessions, 3); + 
assert.equal(animeDetail?.totalActiveMs, 12_000); + assert.equal(animeDetail?.totalCards, 6); + assert.equal(animeDetail?.totalWordsSeen, 80); + assert.equal(animeDetail?.totalLinesSeen, 33); + assert.equal(animeDetail?.totalLookupCount, 12); + assert.equal(animeDetail?.totalLookupHits, 8); + assert.equal(animeDetail?.episodeCount, 2); + + const episodes = getAnimeEpisodes(db, lwaAnimeId); + assert.deepEqual( + episodes.map((row) => ({ + videoId: row.videoId, + season: row.season, + episode: row.episode, + totalSessions: row.totalSessions, + totalActiveMs: row.totalActiveMs, + totalCards: row.totalCards, + })), + [ + { + videoId: lwaEpisode5, + season: 2, + episode: 5, + totalSessions: 2, + totalActiveMs: 7_000, + totalCards: 3, + }, + { + videoId: lwaEpisode6, + season: 2, + episode: 6, + totalSessions: 1, + totalActiveMs: 5_000, + totalCards: 3, + }, + ], + ); + } finally { + db.close(); + cleanupDbPath(dbPath); + } +}); + +test('getWordOccurrences maps a normalized word back to anime, video, and subtitle line context', () => { + const dbPath = makeDbPath(); + const db = new Database(dbPath); + + try { + ensureSchema(db); + const animeId = getOrCreateAnimeRecord(db, { + parsedTitle: 'Little Witch Academia', + canonicalTitle: 'Little Witch Academia', + anilistId: null, + titleRomaji: null, + titleEnglish: null, + titleNative: null, + metadataJson: '{"source":"test"}', + }); + const videoId = getOrCreateVideoRecord(db, 'local:/tmp/lwa-s02e04.mkv', { + canonicalTitle: 'Episode 4', + sourcePath: '/tmp/Little Witch Academia S02E04.mkv', + sourceUrl: null, + sourceType: SOURCE_TYPE_LOCAL, + }); + linkVideoToAnimeRecord(db, videoId, { + animeId, + parsedBasename: 'Little Witch Academia S02E04.mkv', + parsedTitle: 'Little Witch Academia', + parsedSeason: 2, + parsedEpisode: 4, + parserSource: 'fallback', + parserConfidence: 1, + parseMetadataJson: '{"episode":4}', + }); + const { sessionId } = startSessionRecord(db, videoId, 1_000_000); + const wordResult = db + 
.prepare( + `INSERT INTO imm_words ( + headword, word, reading, part_of_speech, pos1, pos2, pos3, first_seen, last_seen, frequency + ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)`, + ) + .run('猫', '猫', 'ねこ', 'noun', '名詞', '一般', '', 1_000, 1_500, 4); + const lineResult = db + .prepare( + `INSERT INTO imm_subtitle_lines ( + session_id, event_id, video_id, anime_id, line_index, segment_start_ms, segment_end_ms, text, CREATED_DATE, LAST_UPDATE_DATE + ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)`, + ) + .run(sessionId, null, videoId, animeId, 1, 0, 1000, '猫 猫 日 日 は', 1_000, 1_000); + db.prepare( + `INSERT INTO imm_word_line_occurrences (line_id, word_id, occurrence_count) + VALUES (?, ?, ?)`, + ).run(Number(lineResult.lastInsertRowid), Number(wordResult.lastInsertRowid), 2); + + const rows = getWordOccurrences(db, '猫', '猫', 'ねこ', 10); + + assert.deepEqual(rows, [ + { + animeId, + animeTitle: 'Little Witch Academia', + videoId, + videoTitle: 'Episode 4', + sessionId, + lineIndex: 1, + segmentStartMs: 0, + segmentEndMs: 1000, + text: '猫 猫 日 日 は', + occurrenceCount: 2, + }, + ]); + } finally { + db.close(); + cleanupDbPath(dbPath); + } +}); + +test('getKanjiOccurrences maps a kanji back to anime, video, and subtitle line context', () => { + const dbPath = makeDbPath(); + const db = new Database(dbPath); + + try { + ensureSchema(db); + const animeId = getOrCreateAnimeRecord(db, { + parsedTitle: 'Frieren', + canonicalTitle: 'Frieren', + anilistId: null, + titleRomaji: null, + titleEnglish: null, + titleNative: null, + metadataJson: '{"source":"test"}', + }); + const videoId = getOrCreateVideoRecord(db, 'local:/tmp/frieren-03.mkv', { + canonicalTitle: 'Episode 3', + sourcePath: '/tmp/[SubsPlease] Frieren - 03 - Departure.mkv', + sourceUrl: null, + sourceType: SOURCE_TYPE_LOCAL, + }); + linkVideoToAnimeRecord(db, videoId, { + animeId, + parsedBasename: '[SubsPlease] Frieren - 03 - Departure.mkv', + parsedTitle: 'Frieren', + parsedSeason: 1, + parsedEpisode: 3, + parserSource: 'fallback', 
+ parserConfidence: 1, + parseMetadataJson: '{"episode":3}', + }); + const { sessionId } = startSessionRecord(db, videoId, 2_000_000); + const kanjiResult = db + .prepare( + `INSERT INTO imm_kanji ( + kanji, first_seen, last_seen, frequency + ) VALUES (?, ?, ?, ?)`, + ) + .run('日', 2_000, 2_500, 8); + const lineResult = db + .prepare( + `INSERT INTO imm_subtitle_lines ( + session_id, event_id, video_id, anime_id, line_index, segment_start_ms, segment_end_ms, text, CREATED_DATE, LAST_UPDATE_DATE + ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)`, + ) + .run(sessionId, null, videoId, animeId, 3, 5000, 6500, '今日は日曜', 2_000, 2_000); + db.prepare( + `INSERT INTO imm_kanji_line_occurrences (line_id, kanji_id, occurrence_count) + VALUES (?, ?, ?)`, + ).run(Number(lineResult.lastInsertRowid), Number(kanjiResult.lastInsertRowid), 2); + + const rows = getKanjiOccurrences(db, '日', 10); + + assert.deepEqual(rows, [ + { + animeId, + animeTitle: 'Frieren', + videoId, + videoTitle: 'Episode 3', + sessionId, + lineIndex: 3, + segmentStartMs: 5000, + segmentEndMs: 6500, + text: '今日は日曜', + occurrenceCount: 2, + }, + ]); + } finally { + db.close(); + cleanupDbPath(dbPath); + } +}); diff --git a/src/core/services/immersion-tracker/legacy-vocabulary-pos.ts b/src/core/services/immersion-tracker/legacy-vocabulary-pos.ts new file mode 100644 index 0000000..8c66ab3 --- /dev/null +++ b/src/core/services/immersion-tracker/legacy-vocabulary-pos.ts @@ -0,0 +1,71 @@ +import type { Token } from '../../../types'; +import type { LegacyVocabularyPosResolution } from './types'; +import { deriveStoredPartOfSpeech } from '../tokenizer/part-of-speech'; + +const KATAKANA_TO_HIRAGANA_OFFSET = 0x60; +const KATAKANA_CODEPOINT_START = 0x30a1; +const KATAKANA_CODEPOINT_END = 0x30f6; + +function normalizeLookupText(value: string | null | undefined): string { + return typeof value === 'string' ? 
value.trim() : ''; +} + +function katakanaToHiragana(text: string): string { + let normalized = ''; + for (const char of text) { + const code = char.codePointAt(0); + if (code === undefined) { + continue; + } + if (code >= KATAKANA_CODEPOINT_START && code <= KATAKANA_CODEPOINT_END) { + normalized += String.fromCodePoint(code - KATAKANA_TO_HIRAGANA_OFFSET); + continue; + } + normalized += char; + } + return normalized; +} + +function toResolution(token: Token): LegacyVocabularyPosResolution { + return { + headword: normalizeLookupText(token.headword) || normalizeLookupText(token.word), + reading: katakanaToHiragana(normalizeLookupText(token.katakanaReading)), + partOfSpeech: deriveStoredPartOfSpeech({ + partOfSpeech: token.partOfSpeech, + pos1: token.pos1, + }), + pos1: normalizeLookupText(token.pos1), + pos2: normalizeLookupText(token.pos2), + pos3: normalizeLookupText(token.pos3), + }; +} + +export function resolveLegacyVocabularyPosFromTokens( + lookupText: string, + tokens: Token[] | null, +): LegacyVocabularyPosResolution | null { + const normalizedLookup = normalizeLookupText(lookupText); + if (!normalizedLookup || !tokens || tokens.length === 0) { + return null; + } + + const exactSurfaceMatches = tokens.filter( + (token) => normalizeLookupText(token.word) === normalizedLookup, + ); + if (exactSurfaceMatches.length === 1) { + return toResolution(exactSurfaceMatches[0]!); + } + + const exactHeadwordMatches = tokens.filter( + (token) => normalizeLookupText(token.headword) === normalizedLookup, + ); + if (exactHeadwordMatches.length === 1) { + return toResolution(exactHeadwordMatches[0]!); + } + + if (tokens.length === 1) { + return toResolution(tokens[0]!); + } + + return null; +} diff --git a/src/core/services/immersion-tracker/maintenance.ts b/src/core/services/immersion-tracker/maintenance.ts index 11d6430..7810f7f 100644 --- a/src/core/services/immersion-tracker/maintenance.ts +++ b/src/core/services/immersion-tracker/maintenance.ts @@ -112,35 +112,46 @@ 
function upsertDailyRollupsForGroups( words_per_min, lookup_hit_rate, CREATED_DATE, LAST_UPDATE_DATE ) SELECT - CAST(s.started_at_ms / 86400000 AS INTEGER) AS rollup_day, + CAST(julianday(s.started_at_ms / 1000, 'unixepoch', 'localtime') - 2440587.5 AS INTEGER) AS rollup_day, s.video_id AS video_id, COUNT(DISTINCT s.session_id) AS total_sessions, - COALESCE(SUM(t.active_watched_ms), 0) / 60000.0 AS total_active_min, - COALESCE(SUM(t.lines_seen), 0) AS total_lines_seen, - COALESCE(SUM(t.words_seen), 0) AS total_words_seen, - COALESCE(SUM(t.tokens_seen), 0) AS total_tokens_seen, - COALESCE(SUM(t.cards_mined), 0) AS total_cards, + COALESCE(SUM(sm.max_active_ms), 0) / 60000.0 AS total_active_min, + COALESCE(SUM(sm.max_lines), 0) AS total_lines_seen, + COALESCE(SUM(sm.max_words), 0) AS total_words_seen, + COALESCE(SUM(sm.max_tokens), 0) AS total_tokens_seen, + COALESCE(SUM(sm.max_cards), 0) AS total_cards, CASE - WHEN COALESCE(SUM(t.active_watched_ms), 0) > 0 - THEN (COALESCE(SUM(t.cards_mined), 0) * 60.0) / (COALESCE(SUM(t.active_watched_ms), 0) / 60000.0) + WHEN COALESCE(SUM(sm.max_active_ms), 0) > 0 + THEN (COALESCE(SUM(sm.max_cards), 0) * 60.0) / (COALESCE(SUM(sm.max_active_ms), 0) / 60000.0) ELSE NULL END AS cards_per_hour, CASE - WHEN COALESCE(SUM(t.active_watched_ms), 0) > 0 - THEN COALESCE(SUM(t.words_seen), 0) / (COALESCE(SUM(t.active_watched_ms), 0) / 60000.0) + WHEN COALESCE(SUM(sm.max_active_ms), 0) > 0 + THEN COALESCE(SUM(sm.max_words), 0) / (COALESCE(SUM(sm.max_active_ms), 0) / 60000.0) ELSE NULL END AS words_per_min, CASE - WHEN COALESCE(SUM(t.lookup_count), 0) > 0 - THEN CAST(COALESCE(SUM(t.lookup_hits), 0) AS REAL) / CAST(SUM(t.lookup_count) AS REAL) + WHEN COALESCE(SUM(sm.max_lookups), 0) > 0 + THEN CAST(COALESCE(SUM(sm.max_hits), 0) AS REAL) / CAST(SUM(sm.max_lookups) AS REAL) ELSE NULL END AS lookup_hit_rate, ? AS CREATED_DATE, ? 
AS LAST_UPDATE_DATE FROM imm_sessions s - JOIN imm_session_telemetry t - ON t.session_id = s.session_id - WHERE CAST(s.started_at_ms / 86400000 AS INTEGER) = ? AND s.video_id = ? + JOIN ( + SELECT + t.session_id, + MAX(t.active_watched_ms) AS max_active_ms, + MAX(t.lines_seen) AS max_lines, + MAX(t.words_seen) AS max_words, + MAX(t.tokens_seen) AS max_tokens, + MAX(t.cards_mined) AS max_cards, + MAX(t.lookup_count) AS max_lookups, + MAX(t.lookup_hits) AS max_hits + FROM imm_session_telemetry t + GROUP BY t.session_id + ) sm ON s.session_id = sm.session_id + WHERE CAST(julianday(s.started_at_ms / 1000, 'unixepoch', 'localtime') - 2440587.5 AS INTEGER) = ? AND s.video_id = ? GROUP BY rollup_day, s.video_id ON CONFLICT (rollup_day, video_id) DO UPDATE SET total_sessions = excluded.total_sessions, @@ -176,20 +187,29 @@ function upsertMonthlyRollupsForGroups( total_words_seen, total_tokens_seen, total_cards, CREATED_DATE, LAST_UPDATE_DATE ) SELECT - CAST(strftime('%Y%m', s.started_at_ms / 1000, 'unixepoch') AS INTEGER) AS rollup_month, + CAST(strftime('%Y%m', s.started_at_ms / 1000, 'unixepoch', 'localtime') AS INTEGER) AS rollup_month, s.video_id AS video_id, COUNT(DISTINCT s.session_id) AS total_sessions, - COALESCE(SUM(t.active_watched_ms), 0) / 60000.0 AS total_active_min, - COALESCE(SUM(t.lines_seen), 0) AS total_lines_seen, - COALESCE(SUM(t.words_seen), 0) AS total_words_seen, - COALESCE(SUM(t.tokens_seen), 0) AS total_tokens_seen, - COALESCE(SUM(t.cards_mined), 0) AS total_cards, + COALESCE(SUM(sm.max_active_ms), 0) / 60000.0 AS total_active_min, + COALESCE(SUM(sm.max_lines), 0) AS total_lines_seen, + COALESCE(SUM(sm.max_words), 0) AS total_words_seen, + COALESCE(SUM(sm.max_tokens), 0) AS total_tokens_seen, + COALESCE(SUM(sm.max_cards), 0) AS total_cards, ? AS CREATED_DATE, ? AS LAST_UPDATE_DATE FROM imm_sessions s - JOIN imm_session_telemetry t - ON t.session_id = s.session_id - WHERE CAST(strftime('%Y%m', s.started_at_ms / 1000, 'unixepoch') AS INTEGER) = ? 
AND s.video_id = ? + JOIN ( + SELECT + t.session_id, + MAX(t.active_watched_ms) AS max_active_ms, + MAX(t.lines_seen) AS max_lines, + MAX(t.words_seen) AS max_words, + MAX(t.tokens_seen) AS max_tokens, + MAX(t.cards_mined) AS max_cards + FROM imm_session_telemetry t + GROUP BY t.session_id + ) sm ON s.session_id = sm.session_id + WHERE CAST(strftime('%Y%m', s.started_at_ms / 1000, 'unixepoch', 'localtime') AS INTEGER) = ? AND s.video_id = ? GROUP BY rollup_month, s.video_id ON CONFLICT (rollup_month, video_id) DO UPDATE SET total_sessions = excluded.total_sessions, @@ -216,8 +236,8 @@ function getAffectedRollupGroups( .prepare( ` SELECT DISTINCT - CAST(s.started_at_ms / 86400000 AS INTEGER) AS rollup_day, - CAST(strftime('%Y%m', s.started_at_ms / 1000, 'unixepoch') AS INTEGER) AS rollup_month, + CAST(julianday(s.started_at_ms / 1000, 'unixepoch', 'localtime') - 2440587.5 AS INTEGER) AS rollup_day, + CAST(strftime('%Y%m', s.started_at_ms / 1000, 'unixepoch', 'localtime') AS INTEGER) AS rollup_month, s.video_id AS video_id FROM imm_session_telemetry t JOIN imm_sessions s diff --git a/src/core/services/immersion-tracker/metadata.test.ts b/src/core/services/immersion-tracker/metadata.test.ts index b9da9d4..f6a23c8 100644 --- a/src/core/services/immersion-tracker/metadata.test.ts +++ b/src/core/services/immersion-tracker/metadata.test.ts @@ -4,7 +4,7 @@ import { EventEmitter } from 'node:events'; import test from 'node:test'; import type { spawn as spawnFn } from 'node:child_process'; import { SOURCE_TYPE_LOCAL } from './types'; -import { getLocalVideoMetadata, runFfprobe } from './metadata'; +import { getLocalVideoMetadata, guessAnimeVideoMetadata, runFfprobe } from './metadata'; type Spawn = typeof spawnFn; @@ -146,3 +146,79 @@ test('getLocalVideoMetadata derives title and falls back to null hash on read er assert.equal(hashFallbackMetadata.canonicalTitle, 'Episode 02'); assert.equal(hashFallbackMetadata.hashSha256, null); }); + +test('guessAnimeVideoMetadata uses 
guessit basename output first when available', async () => { + const seenTargets: string[] = []; + const parsed = await guessAnimeVideoMetadata('/tmp/Little Witch Academia S02E05.mkv', 'Episode 5', { + runGuessit: async (target) => { + seenTargets.push(target); + return JSON.stringify({ + title: 'Little Witch Academia', + season: 2, + episode: 5, + }); + }, + }); + + assert.deepEqual(seenTargets, ['Little Witch Academia S02E05.mkv']); + assert.deepEqual(parsed, { + parsedBasename: 'Little Witch Academia S02E05.mkv', + parsedTitle: 'Little Witch Academia', + parsedSeason: 2, + parsedEpisode: 5, + parserSource: 'guessit', + parserConfidence: 1, + parseMetadataJson: JSON.stringify({ + filename: 'Little Witch Academia S02E05.mkv', + source: 'guessit', + }), + }); +}); + +test('guessAnimeVideoMetadata falls back to parser when guessit throws', async () => { + const parsed = await guessAnimeVideoMetadata('/tmp/Little Witch Academia S02E05.mkv', 'Episode 5', { + runGuessit: async () => { + throw new Error('guessit unavailable'); + }, + }); + + assert.deepEqual(parsed, { + parsedBasename: 'Little Witch Academia S02E05.mkv', + parsedTitle: 'Little Witch Academia', + parsedSeason: 2, + parsedEpisode: 5, + parserSource: 'fallback', + parserConfidence: 1, + parseMetadataJson: JSON.stringify({ + confidence: 'high', + filename: 'Little Witch Academia S02E05.mkv', + rawTitle: 'Little Witch Academia S02E05', + source: 'fallback', + }), + }); +}); + +test('guessAnimeVideoMetadata falls back when guessit output is incomplete', async () => { + const parsed = await guessAnimeVideoMetadata( + '/tmp/[SubsPlease] Frieren - 03 (1080p).mkv', + null, + { + runGuessit: async () => JSON.stringify({ episode: 3 }), + }, + ); + + assert.deepEqual(parsed, { + parsedBasename: '[SubsPlease] Frieren - 03 (1080p).mkv', + parsedTitle: 'Frieren - 03 (1080p)', + parsedSeason: null, + parsedEpisode: null, + parserSource: 'fallback', + parserConfidence: 0.2, + parseMetadataJson: JSON.stringify({ + 
confidence: 'low', + filename: '[SubsPlease] Frieren - 03 (1080p).mkv', + rawTitle: 'Frieren - 03 (1080p)', + source: 'fallback', + }), + }); +}); diff --git a/src/core/services/immersion-tracker/metadata.ts b/src/core/services/immersion-tracker/metadata.ts index 394da91..3b09ce0 100644 --- a/src/core/services/immersion-tracker/metadata.ts +++ b/src/core/services/immersion-tracker/metadata.ts @@ -1,6 +1,13 @@ import crypto from 'node:crypto'; import { spawn as nodeSpawn } from 'node:child_process'; import * as fs from 'node:fs'; +import path from 'node:path'; +import { parseMediaInfo } from '../../../jimaku/utils'; +import { + guessAnilistMediaInfo, + runGuessit, + type GuessAnilistMediaInfoDeps, +} from '../anilist/anilist-updater'; import { deriveCanonicalTitle, emptyMetadata, @@ -8,7 +15,12 @@ import { parseFps, toNullableInt, } from './reducer'; -import { SOURCE_TYPE_LOCAL, type ProbeMetadata, type VideoMetadata } from './types'; +import { + SOURCE_TYPE_LOCAL, + type ParsedAnimeVideoGuess, + type ProbeMetadata, + type VideoMetadata, +} from './types'; type SpawnFn = typeof nodeSpawn; @@ -24,6 +36,21 @@ interface MetadataDeps { fs?: FsDeps; } +interface GuessAnimeVideoMetadataDeps { + runGuessit?: GuessAnilistMediaInfoDeps['runGuessit']; +} + +function mapParserConfidenceToScore(confidence: 'high' | 'medium' | 'low'): number { + switch (confidence) { + case 'high': + return 1; + case 'medium': + return 0.6; + default: + return 0.2; + } +} + export async function computeSha256( mediaPath: string, deps: MetadataDeps = {}, @@ -151,3 +178,48 @@ export async function getLocalVideoMetadata( metadataJson: null, }; } + +export async function guessAnimeVideoMetadata( + mediaPath: string | null, + mediaTitle: string | null, + deps: GuessAnimeVideoMetadataDeps = {}, +): Promise { + const parsed = await guessAnilistMediaInfo(mediaPath, mediaTitle, { + runGuessit: deps.runGuessit ?? runGuessit, + }); + if (!parsed) { + return null; + } + + const parsedBasename = mediaPath ? 
path.basename(mediaPath) : null; + if (parsed.source === 'guessit') { + return { + parsedBasename, + parsedTitle: parsed.title, + parsedSeason: parsed.season, + parsedEpisode: parsed.episode, + parserSource: 'guessit', + parserConfidence: 1, + parseMetadataJson: JSON.stringify({ + filename: parsedBasename, + source: 'guessit', + }), + }; + } + + const fallbackInfo = parseMediaInfo(mediaPath ?? mediaTitle); + return { + parsedBasename: parsedBasename ?? fallbackInfo.filename ?? null, + parsedTitle: parsed.title, + parsedSeason: parsed.season, + parsedEpisode: parsed.episode, + parserSource: 'fallback', + parserConfidence: mapParserConfidenceToScore(fallbackInfo.confidence), + parseMetadataJson: JSON.stringify({ + confidence: fallbackInfo.confidence, + filename: fallbackInfo.filename, + rawTitle: fallbackInfo.rawTitle, + source: 'fallback', + }), + }; +} diff --git a/src/core/services/immersion-tracker/query.ts b/src/core/services/immersion-tracker/query.ts index a734852..2c0a989 100644 --- a/src/core/services/immersion-tracker/query.ts +++ b/src/core/services/immersion-tracker/query.ts @@ -1,26 +1,95 @@ import type { DatabaseSync } from './sqlite'; import type { + AnimeAnilistEntryRow, + AnimeDetailRow, + AnimeEpisodeRow, + AnimeLibraryRow, + AnimeWordRow, + EpisodeCardEventRow, + EpisodesPerDayRow, ImmersionSessionRollupRow, + KanjiAnimeAppearanceRow, + KanjiDetailRow, + KanjiOccurrenceRow, + KanjiStatsRow, + KanjiWordRow, + MediaArtRow, + MediaDetailRow, + MediaLibraryRow, + NewAnimePerDayRow, + SessionEventRow, SessionSummaryQueryRow, SessionTimelineRow, + SimilarWordRow, + StreakCalendarRow, + VocabularyCleanupSummary, + WatchTimePerAnimeRow, + WordAnimeAppearanceRow, + WordDetailRow, + WordOccurrenceRow, + VocabularyStatsRow, } from './types'; +import { PartOfSpeech, type MergedToken } from '../../../types'; +import { shouldExcludeTokenFromVocabularyPersistence } from '../tokenizer/annotation-stage'; +import { deriveStoredPartOfSpeech } from 
'../tokenizer/part-of-speech'; + +type CleanupVocabularyRow = { + id: number; + word: string; + headword: string; + reading: string | null; + part_of_speech: string | null; + pos1: string | null; + pos2: string | null; + pos3: string | null; + first_seen: number | null; + last_seen: number | null; + frequency: number | null; +}; + +type ResolvedVocabularyPos = { + headword: string; + reading: string; + hasPosMetadata: boolean; + partOfSpeech: PartOfSpeech; + pos1: string; + pos2: string; + pos3: string; +}; + +type CleanupVocabularyStatsOptions = { + resolveLegacyPos?: (row: CleanupVocabularyRow) => Promise<{ + headword: string; + reading: string; + partOfSpeech: string; + pos1: string; + pos2: string; + pos3: string; + } | null>; +}; export function getSessionSummaries(db: DatabaseSync, limit = 50): SessionSummaryQueryRow[] { const prepared = db.prepare(` SELECT + s.session_id AS sessionId, s.video_id AS videoId, + v.canonical_title AS canonicalTitle, + v.anime_id AS animeId, + a.canonical_title AS animeTitle, s.started_at_ms AS startedAtMs, s.ended_at_ms AS endedAtMs, - COALESCE(SUM(t.total_watched_ms), 0) AS totalWatchedMs, - COALESCE(SUM(t.active_watched_ms), 0) AS activeWatchedMs, - COALESCE(SUM(t.lines_seen), 0) AS linesSeen, - COALESCE(SUM(t.words_seen), 0) AS wordsSeen, - COALESCE(SUM(t.tokens_seen), 0) AS tokensSeen, - COALESCE(SUM(t.cards_mined), 0) AS cardsMined, - COALESCE(SUM(t.lookup_count), 0) AS lookupCount, - COALESCE(SUM(t.lookup_hits), 0) AS lookupHits + COALESCE(MAX(t.total_watched_ms), 0) AS totalWatchedMs, + COALESCE(MAX(t.active_watched_ms), 0) AS activeWatchedMs, + COALESCE(MAX(t.lines_seen), 0) AS linesSeen, + COALESCE(MAX(t.words_seen), 0) AS wordsSeen, + COALESCE(MAX(t.tokens_seen), 0) AS tokensSeen, + COALESCE(MAX(t.cards_mined), 0) AS cardsMined, + COALESCE(MAX(t.lookup_count), 0) AS lookupCount, + COALESCE(MAX(t.lookup_hits), 0) AS lookupHits FROM imm_sessions s LEFT JOIN imm_session_telemetry t ON t.session_id = s.session_id + LEFT 
JOIN imm_videos v ON v.video_id = s.video_id + LEFT JOIN imm_anime a ON a.anime_id = v.anime_id GROUP BY s.session_id ORDER BY s.started_at_ms DESC LIMIT ? @@ -53,12 +122,32 @@ export function getSessionTimeline( export function getQueryHints(db: DatabaseSync): { totalSessions: number; activeSessions: number; + episodesToday: number; + activeAnimeCount: number; } { const sessions = db.prepare('SELECT COUNT(*) AS total FROM imm_sessions'); const active = db.prepare('SELECT COUNT(*) AS total FROM imm_sessions WHERE ended_at_ms IS NULL'); const totalSessions = Number((sessions.get() as { total?: number } | null)?.total ?? 0); const activeSessions = Number((active.get() as { total?: number } | null)?.total ?? 0); - return { totalSessions, activeSessions }; + + const now = new Date(); + const todayLocal = Math.floor((now.getTime() - now.getTimezoneOffset() * 60_000) / 86_400_000); + const episodesToday = (db.prepare(` + SELECT COUNT(DISTINCT s.video_id) AS count + FROM imm_sessions s + WHERE CAST(julianday(s.started_at_ms / 1000, 'unixepoch', 'localtime') - 2440587.5 AS INTEGER) = ? + `).get(todayLocal) as { count: number })?.count ?? 0; + + const thirtyDaysAgoMs = Date.now() - 30 * 86400000; + const activeAnimeCount = (db.prepare(` + SELECT COUNT(DISTINCT v.anime_id) AS count + FROM imm_sessions s + JOIN imm_videos v ON v.video_id = s.video_id + WHERE v.anime_id IS NOT NULL + AND s.started_at_ms >= ? + `).get(thirtyDaysAgoMs) as { count: number })?.count ?? 
0; + + return { totalSessions, activeSessions, episodesToday, activeAnimeCount }; } export function getDailyRollups(db: DatabaseSync, limit = 60): ImmersionSessionRollupRow[] { @@ -102,3 +191,959 @@ export function getMonthlyRollups(db: DatabaseSync, limit = 24): ImmersionSessio `); return prepared.all(limit) as unknown as ImmersionSessionRollupRow[]; } + +export function getVocabularyStats( + db: DatabaseSync, + limit = 100, + excludePos?: string[], +): VocabularyStatsRow[] { + const hasExclude = excludePos && excludePos.length > 0; + const placeholders = hasExclude ? excludePos.map(() => '?').join(', ') : ''; + const whereClause = hasExclude + ? `WHERE (part_of_speech IS NULL OR part_of_speech NOT IN (${placeholders}))` + : ''; + const stmt = db.prepare(` + SELECT id AS wordId, headword, word, reading, + part_of_speech AS partOfSpeech, pos1, pos2, pos3, + frequency, first_seen AS firstSeen, last_seen AS lastSeen + FROM imm_words ${whereClause} ORDER BY frequency DESC LIMIT ? + `); + const params = hasExclude ? [...excludePos, limit] : [limit]; + return stmt.all(...params) as VocabularyStatsRow[]; +} + +function toStoredWordToken(row: { + word: string; + headword: string; + part_of_speech: string | null; + pos1: string | null; + pos2: string | null; + pos3: string | null; +}): MergedToken { + return { + surface: row.word || row.headword || '', + reading: '', + headword: row.headword || row.word || '', + startPos: 0, + endPos: 0, + partOfSpeech: deriveStoredPartOfSpeech({ + partOfSpeech: row.part_of_speech, + pos1: row.pos1, + }), + pos1: row.pos1 ?? '', + pos2: row.pos2 ?? '', + pos3: row.pos3 ?? '', + isMerged: true, + isKnown: false, + isNPlusOneTarget: false, + }; +} + +function normalizePosField(value: string | null | undefined): string { + return typeof value === 'string' ? 
value.trim() : ''; +} + +function resolveStoredVocabularyPos(row: CleanupVocabularyRow): ResolvedVocabularyPos | null { + const headword = normalizePosField(row.headword); + const reading = normalizePosField(row.reading); + const partOfSpeechRaw = typeof row.part_of_speech === 'string' ? row.part_of_speech.trim() : ''; + const pos1 = normalizePosField(row.pos1); + const pos2 = normalizePosField(row.pos2); + const pos3 = normalizePosField(row.pos3); + + if (!headword && !reading && !partOfSpeechRaw && !pos1 && !pos2 && !pos3) { + return null; + } + + return { + headword: headword || normalizePosField(row.word), + reading, + hasPosMetadata: Boolean(partOfSpeechRaw || pos1 || pos2 || pos3), + partOfSpeech: deriveStoredPartOfSpeech({ + partOfSpeech: partOfSpeechRaw, + pos1, + }), + pos1, + pos2, + pos3, + }; +} + +function hasStructuredPos(pos: ResolvedVocabularyPos | null): boolean { + return Boolean(pos?.hasPosMetadata && (pos.pos1 || pos.pos2 || pos.pos3 || pos.partOfSpeech)); +} + +function needsLegacyVocabularyMetadataRepair( + row: CleanupVocabularyRow, + stored: ResolvedVocabularyPos | null, +): boolean { + if (!stored) { + return true; + } + + if (!hasStructuredPos(stored)) { + return true; + } + + if (!stored.reading) { + return true; + } + + if (!stored.headword) { + return true; + } + + return stored.headword === normalizePosField(row.word); +} + +function shouldUpdateStoredVocabularyPos( + row: CleanupVocabularyRow, + next: ResolvedVocabularyPos, +): boolean { + return ( + normalizePosField(row.headword) !== next.headword || + normalizePosField(row.reading) !== next.reading || + (next.hasPosMetadata && + (normalizePosField(row.part_of_speech) !== next.partOfSpeech || + normalizePosField(row.pos1) !== next.pos1 || + normalizePosField(row.pos2) !== next.pos2 || + normalizePosField(row.pos3) !== next.pos3)) + ); +} + +function chooseMergedPartOfSpeech( + current: string | null | undefined, + incoming: ResolvedVocabularyPos, +): string { + const 
normalizedCurrent = normalizePosField(current); + if ( + normalizedCurrent && + normalizedCurrent !== PartOfSpeech.other && + incoming.partOfSpeech === PartOfSpeech.other + ) { + return normalizedCurrent; + } + return incoming.partOfSpeech; +} + +async function maybeResolveLegacyVocabularyPos( + row: CleanupVocabularyRow, + options: CleanupVocabularyStatsOptions, +): Promise { + const stored = resolveStoredVocabularyPos(row); + if (!needsLegacyVocabularyMetadataRepair(row, stored) || !options.resolveLegacyPos) { + return stored; + } + + const resolved = await options.resolveLegacyPos(row); + if (resolved) { + return { + headword: normalizePosField(resolved.headword) || normalizePosField(row.word), + reading: normalizePosField(resolved.reading), + hasPosMetadata: true, + partOfSpeech: deriveStoredPartOfSpeech({ + partOfSpeech: resolved.partOfSpeech, + pos1: resolved.pos1, + }), + pos1: normalizePosField(resolved.pos1), + pos2: normalizePosField(resolved.pos2), + pos3: normalizePosField(resolved.pos3), + }; + } + + return stored; +} + +export async function cleanupVocabularyStats( + db: DatabaseSync, + options: CleanupVocabularyStatsOptions = {}, +): Promise { + const rows = db + .prepare( + `SELECT id, word, headword, reading, part_of_speech, pos1, pos2, pos3, first_seen, last_seen, frequency + FROM imm_words`, + ) + .all() as CleanupVocabularyRow[]; + const findDuplicateStmt = db.prepare( + `SELECT id, part_of_speech, pos1, pos2, pos3, first_seen, last_seen, frequency + FROM imm_words + WHERE headword = ? AND word = ? AND reading = ? AND id != ?`, + ); + const deleteStmt = db.prepare('DELETE FROM imm_words WHERE id = ?'); + const updateStmt = db.prepare( + `UPDATE imm_words + SET headword = ?, reading = ?, part_of_speech = ?, pos1 = ?, pos2 = ?, pos3 = ? 
+ WHERE id = ?`, + ); + const mergeWordStmt = db.prepare( + `UPDATE imm_words + SET + frequency = COALESCE(frequency, 0) + ?, + part_of_speech = ?, + pos1 = ?, + pos2 = ?, + pos3 = ?, + first_seen = MIN(COALESCE(first_seen, ?), ?), + last_seen = MAX(COALESCE(last_seen, ?), ?) + WHERE id = ?`, + ); + const moveOccurrencesStmt = db.prepare( + `INSERT INTO imm_word_line_occurrences (line_id, word_id, occurrence_count) + SELECT line_id, ?, occurrence_count + FROM imm_word_line_occurrences + WHERE word_id = ? + ON CONFLICT(line_id, word_id) DO UPDATE SET + occurrence_count = imm_word_line_occurrences.occurrence_count + excluded.occurrence_count`, + ); + const deleteOccurrencesStmt = db.prepare('DELETE FROM imm_word_line_occurrences WHERE word_id = ?'); + let kept = 0; + let deleted = 0; + let repaired = 0; + + for (const row of rows) { + const resolvedPos = await maybeResolveLegacyVocabularyPos(row, options); + const shouldRepair = Boolean(resolvedPos && shouldUpdateStoredVocabularyPos(row, resolvedPos)); + if (resolvedPos && shouldRepair) { + const duplicate = findDuplicateStmt.get( + resolvedPos.headword, + row.word, + resolvedPos.reading, + row.id, + ) as + | { + id: number; + part_of_speech: string | null; + pos1: string | null; + pos2: string | null; + pos3: string | null; + first_seen: number | null; + last_seen: number | null; + frequency: number | null; + } + | null; + if (duplicate) { + moveOccurrencesStmt.run(duplicate.id, row.id); + deleteOccurrencesStmt.run(row.id); + mergeWordStmt.run( + row.frequency ?? 0, + chooseMergedPartOfSpeech(duplicate.part_of_speech, resolvedPos), + normalizePosField(duplicate.pos1) || resolvedPos.pos1, + normalizePosField(duplicate.pos2) || resolvedPos.pos2, + normalizePosField(duplicate.pos3) || resolvedPos.pos3, + row.first_seen ?? duplicate.first_seen ?? 0, + row.first_seen ?? duplicate.first_seen ?? 0, + row.last_seen ?? duplicate.last_seen ?? 0, + row.last_seen ?? duplicate.last_seen ?? 
0, + duplicate.id, + ); + deleteStmt.run(row.id); + repaired += 1; + deleted += 1; + continue; + } + + updateStmt.run( + resolvedPos.headword, + resolvedPos.reading, + resolvedPos.partOfSpeech, + resolvedPos.pos1, + resolvedPos.pos2, + resolvedPos.pos3, + row.id, + ); + repaired += 1; + } + + const effectiveRow = { + ...row, + headword: resolvedPos?.headword ?? row.headword, + reading: resolvedPos?.reading ?? row.reading, + part_of_speech: resolvedPos?.hasPosMetadata ? resolvedPos.partOfSpeech : row.part_of_speech, + pos1: resolvedPos?.pos1 ?? row.pos1, + pos2: resolvedPos?.pos2 ?? row.pos2, + pos3: resolvedPos?.pos3 ?? row.pos3, + }; + const missingPos = + !normalizePosField(effectiveRow.part_of_speech) && + !normalizePosField(effectiveRow.pos1) && + !normalizePosField(effectiveRow.pos2) && + !normalizePosField(effectiveRow.pos3); + if (missingPos || shouldExcludeTokenFromVocabularyPersistence(toStoredWordToken(effectiveRow))) { + deleteStmt.run(row.id); + deleted += 1; + continue; + } + kept += 1; + } + + return { + scanned: rows.length, + kept, + deleted, + repaired, + }; +} + +export function getKanjiStats(db: DatabaseSync, limit = 100): KanjiStatsRow[] { + const stmt = db.prepare(` + SELECT id AS kanjiId, kanji, frequency, + first_seen AS firstSeen, last_seen AS lastSeen + FROM imm_kanji ORDER BY frequency DESC LIMIT ? 
+ `); + return stmt.all(limit) as KanjiStatsRow[]; +} + +export function getWordOccurrences( + db: DatabaseSync, + headword: string, + word: string, + reading: string, + limit = 100, + offset = 0, +): WordOccurrenceRow[] { + return db + .prepare( + ` + SELECT + l.anime_id AS animeId, + a.canonical_title AS animeTitle, + l.video_id AS videoId, + v.canonical_title AS videoTitle, + l.session_id AS sessionId, + l.line_index AS lineIndex, + l.segment_start_ms AS segmentStartMs, + l.segment_end_ms AS segmentEndMs, + l.text AS text, + o.occurrence_count AS occurrenceCount + FROM imm_word_line_occurrences o + JOIN imm_words w ON w.id = o.word_id + JOIN imm_subtitle_lines l ON l.line_id = o.line_id + JOIN imm_videos v ON v.video_id = l.video_id + LEFT JOIN imm_anime a ON a.anime_id = l.anime_id + WHERE w.headword = ? AND w.word = ? AND w.reading = ? + ORDER BY l.CREATED_DATE DESC, l.line_id DESC + LIMIT ? + OFFSET ? + `, + ) + .all(headword, word, reading, limit, offset) as unknown as WordOccurrenceRow[]; +} + +export function getKanjiOccurrences( + db: DatabaseSync, + kanji: string, + limit = 100, + offset = 0, +): KanjiOccurrenceRow[] { + return db + .prepare( + ` + SELECT + l.anime_id AS animeId, + a.canonical_title AS animeTitle, + l.video_id AS videoId, + v.canonical_title AS videoTitle, + l.session_id AS sessionId, + l.line_index AS lineIndex, + l.segment_start_ms AS segmentStartMs, + l.segment_end_ms AS segmentEndMs, + l.text AS text, + o.occurrence_count AS occurrenceCount + FROM imm_kanji_line_occurrences o + JOIN imm_kanji k ON k.id = o.kanji_id + JOIN imm_subtitle_lines l ON l.line_id = o.line_id + JOIN imm_videos v ON v.video_id = l.video_id + LEFT JOIN imm_anime a ON a.anime_id = l.anime_id + WHERE k.kanji = ? + ORDER BY l.CREATED_DATE DESC, l.line_id DESC + LIMIT ? + OFFSET ? 
+ `, + ) + .all(kanji, limit, offset) as unknown as KanjiOccurrenceRow[]; +} + +export function getSessionEvents( + db: DatabaseSync, + sessionId: number, + limit = 500, +): SessionEventRow[] { + const stmt = db.prepare(` + SELECT event_type AS eventType, ts_ms AS tsMs, payload_json AS payload + FROM imm_session_events WHERE session_id = ? ORDER BY ts_ms ASC LIMIT ? + `); + return stmt.all(sessionId, limit) as SessionEventRow[]; +} + +export function getAnimeLibrary(db: DatabaseSync): AnimeLibraryRow[] { + return db.prepare(` + SELECT + a.anime_id AS animeId, + a.canonical_title AS canonicalTitle, + a.anilist_id AS anilistId, + COUNT(DISTINCT s.session_id) AS totalSessions, + COALESCE(SUM(sm.max_active_ms), 0) AS totalActiveMs, + COALESCE(SUM(sm.max_cards), 0) AS totalCards, + COALESCE(SUM(sm.max_words), 0) AS totalWordsSeen, + COUNT(DISTINCT v.video_id) AS episodeCount, + a.episodes_total AS episodesTotal, + MAX(s.started_at_ms) AS lastWatchedMs + FROM imm_anime a + JOIN imm_videos v ON v.anime_id = a.anime_id + JOIN imm_sessions s ON s.video_id = v.video_id + LEFT JOIN ( + SELECT + t.session_id, + MAX(t.active_watched_ms) AS max_active_ms, + MAX(t.cards_mined) AS max_cards, + MAX(t.words_seen) AS max_words + FROM imm_session_telemetry t + GROUP BY t.session_id + ) sm ON sm.session_id = s.session_id + GROUP BY a.anime_id + ORDER BY totalActiveMs DESC, lastWatchedMs DESC, canonicalTitle ASC + `).all() as unknown as AnimeLibraryRow[]; +} + +export function getAnimeDetail(db: DatabaseSync, animeId: number): AnimeDetailRow | null { + return db.prepare(` + SELECT + a.anime_id AS animeId, + a.canonical_title AS canonicalTitle, + a.anilist_id AS anilistId, + a.title_romaji AS titleRomaji, + a.title_english AS titleEnglish, + a.title_native AS titleNative, + COUNT(DISTINCT s.session_id) AS totalSessions, + COALESCE(SUM(sm.max_active_ms), 0) AS totalActiveMs, + COALESCE(SUM(sm.max_cards), 0) AS totalCards, + COALESCE(SUM(sm.max_words), 0) AS totalWordsSeen, + 
COALESCE(SUM(sm.max_lines), 0) AS totalLinesSeen, + COALESCE(SUM(sm.max_lookups), 0) AS totalLookupCount, + COALESCE(SUM(sm.max_hits), 0) AS totalLookupHits, + COUNT(DISTINCT v.video_id) AS episodeCount, + MAX(s.started_at_ms) AS lastWatchedMs + FROM imm_anime a + JOIN imm_videos v ON v.anime_id = a.anime_id + JOIN imm_sessions s ON s.video_id = v.video_id + LEFT JOIN ( + SELECT + t.session_id, + MAX(t.active_watched_ms) AS max_active_ms, + MAX(t.cards_mined) AS max_cards, + MAX(t.words_seen) AS max_words, + MAX(t.lines_seen) AS max_lines, + MAX(t.lookup_count) AS max_lookups, + MAX(t.lookup_hits) AS max_hits + FROM imm_session_telemetry t + GROUP BY t.session_id + ) sm ON sm.session_id = s.session_id + WHERE a.anime_id = ? + GROUP BY a.anime_id + `).get(animeId) as unknown as AnimeDetailRow | null; +} + +export function getAnimeAnilistEntries(db: DatabaseSync, animeId: number): AnimeAnilistEntryRow[] { + return db.prepare(` + SELECT DISTINCT + m.anilist_id AS anilistId, + m.title_romaji AS titleRomaji, + m.title_english AS titleEnglish, + v.parsed_season AS season + FROM imm_videos v + JOIN imm_media_art m ON m.video_id = v.video_id + WHERE v.anime_id = ? 
+ AND m.anilist_id IS NOT NULL + ORDER BY v.parsed_season ASC + `).all(animeId) as unknown as AnimeAnilistEntryRow[]; +} + +export function getAnimeEpisodes(db: DatabaseSync, animeId: number): AnimeEpisodeRow[] { + return db.prepare(` + SELECT + v.anime_id AS animeId, + v.video_id AS videoId, + v.canonical_title AS canonicalTitle, + v.parsed_title AS parsedTitle, + v.parsed_season AS season, + v.parsed_episode AS episode, + v.duration_ms AS durationMs, + v.watched AS watched, + COUNT(DISTINCT s.session_id) AS totalSessions, + COALESCE(SUM(sm.max_active_ms), 0) AS totalActiveMs, + COALESCE(SUM(sm.max_cards), 0) AS totalCards, + COALESCE(SUM(sm.max_words), 0) AS totalWordsSeen, + MAX(s.started_at_ms) AS lastWatchedMs + FROM imm_videos v + JOIN imm_sessions s ON s.video_id = v.video_id + LEFT JOIN ( + SELECT + t.session_id, + MAX(t.active_watched_ms) AS max_active_ms, + MAX(t.cards_mined) AS max_cards, + MAX(t.words_seen) AS max_words + FROM imm_session_telemetry t + GROUP BY t.session_id + ) sm ON sm.session_id = s.session_id + WHERE v.anime_id = ? 
+ GROUP BY v.video_id + ORDER BY + CASE WHEN v.parsed_season IS NULL THEN 1 ELSE 0 END, + v.parsed_season ASC, + CASE WHEN v.parsed_episode IS NULL THEN 1 ELSE 0 END, + v.parsed_episode ASC, + v.video_id ASC + `).all(animeId) as unknown as AnimeEpisodeRow[]; +} + +export function getMediaLibrary(db: DatabaseSync): MediaLibraryRow[] { + return db.prepare(` + SELECT + v.video_id AS videoId, + v.canonical_title AS canonicalTitle, + COUNT(DISTINCT s.session_id) AS totalSessions, + COALESCE(SUM(sm.max_active_ms), 0) AS totalActiveMs, + COALESCE(SUM(sm.max_cards), 0) AS totalCards, + COALESCE(SUM(sm.max_words), 0) AS totalWordsSeen, + MAX(s.started_at_ms) AS lastWatchedMs, + CASE WHEN ma.cover_blob IS NOT NULL THEN 1 ELSE 0 END AS hasCoverArt + FROM imm_videos v + JOIN imm_sessions s ON s.video_id = v.video_id + LEFT JOIN ( + SELECT + t.session_id, + MAX(t.active_watched_ms) AS max_active_ms, + MAX(t.cards_mined) AS max_cards, + MAX(t.words_seen) AS max_words + FROM imm_session_telemetry t + GROUP BY t.session_id + ) sm ON sm.session_id = s.session_id + LEFT JOIN imm_media_art ma ON ma.video_id = v.video_id + GROUP BY v.video_id + ORDER BY lastWatchedMs DESC + `).all() as unknown as MediaLibraryRow[]; +} + +export function getMediaDetail(db: DatabaseSync, videoId: number): MediaDetailRow | null { + return db.prepare(` + SELECT + v.video_id AS videoId, + v.canonical_title AS canonicalTitle, + COUNT(DISTINCT s.session_id) AS totalSessions, + COALESCE(SUM(sm.max_active_ms), 0) AS totalActiveMs, + COALESCE(SUM(sm.max_cards), 0) AS totalCards, + COALESCE(SUM(sm.max_words), 0) AS totalWordsSeen, + COALESCE(SUM(sm.max_lines), 0) AS totalLinesSeen, + COALESCE(SUM(sm.max_lookups), 0) AS totalLookupCount, + COALESCE(SUM(sm.max_hits), 0) AS totalLookupHits + FROM imm_videos v + JOIN imm_sessions s ON s.video_id = v.video_id + LEFT JOIN ( + SELECT + t.session_id, + MAX(t.active_watched_ms) AS max_active_ms, + MAX(t.cards_mined) AS max_cards, + MAX(t.words_seen) AS max_words, + 
MAX(t.lines_seen) AS max_lines, + MAX(t.lookup_count) AS max_lookups, + MAX(t.lookup_hits) AS max_hits + FROM imm_session_telemetry t + GROUP BY t.session_id + ) sm ON sm.session_id = s.session_id + WHERE v.video_id = ? + GROUP BY v.video_id + `).get(videoId) as unknown as MediaDetailRow | null; +} + +export function getMediaSessions(db: DatabaseSync, videoId: number, limit = 100): SessionSummaryQueryRow[] { + return db.prepare(` + SELECT + s.session_id AS sessionId, + s.video_id AS videoId, + v.canonical_title AS canonicalTitle, + s.started_at_ms AS startedAtMs, + s.ended_at_ms AS endedAtMs, + COALESCE(MAX(t.total_watched_ms), 0) AS totalWatchedMs, + COALESCE(MAX(t.active_watched_ms), 0) AS activeWatchedMs, + COALESCE(MAX(t.lines_seen), 0) AS linesSeen, + COALESCE(MAX(t.words_seen), 0) AS wordsSeen, + COALESCE(MAX(t.tokens_seen), 0) AS tokensSeen, + COALESCE(MAX(t.cards_mined), 0) AS cardsMined, + COALESCE(MAX(t.lookup_count), 0) AS lookupCount, + COALESCE(MAX(t.lookup_hits), 0) AS lookupHits + FROM imm_sessions s + LEFT JOIN imm_session_telemetry t ON t.session_id = s.session_id + LEFT JOIN imm_videos v ON v.video_id = s.video_id + WHERE s.video_id = ? + GROUP BY s.session_id + ORDER BY s.started_at_ms DESC + LIMIT ? + `).all(videoId, limit) as unknown as SessionSummaryQueryRow[]; +} + +export function getMediaDailyRollups(db: DatabaseSync, videoId: number, limit = 90): ImmersionSessionRollupRow[] { + return db.prepare(` + SELECT + rollup_day AS rollupDayOrMonth, + video_id AS videoId, + total_sessions AS totalSessions, + total_active_min AS totalActiveMin, + total_lines_seen AS totalLinesSeen, + total_words_seen AS totalWordsSeen, + total_tokens_seen AS totalTokensSeen, + total_cards AS totalCards, + cards_per_hour AS cardsPerHour, + words_per_min AS wordsPerMin, + lookup_hit_rate AS lookupHitRate + FROM imm_daily_rollups + WHERE video_id = ? + ORDER BY rollup_day DESC + LIMIT ? 
+ `).all(videoId, limit) as unknown as ImmersionSessionRollupRow[]; +} + +export function getAnimeCoverArt(db: DatabaseSync, animeId: number): MediaArtRow | null { + return db.prepare(` + SELECT + a.video_id AS videoId, + a.anilist_id AS anilistId, + a.cover_url AS coverUrl, + a.cover_blob AS coverBlob, + a.title_romaji AS titleRomaji, + a.title_english AS titleEnglish, + a.episodes_total AS episodesTotal, + a.fetched_at_ms AS fetchedAtMs + FROM imm_media_art a + JOIN imm_videos v ON v.video_id = a.video_id + WHERE v.anime_id = ? + AND a.cover_blob IS NOT NULL + LIMIT 1 + `).get(animeId) as unknown as MediaArtRow | null; +} + +export function getCoverArt(db: DatabaseSync, videoId: number): MediaArtRow | null { + return db.prepare(` + SELECT + video_id AS videoId, + anilist_id AS anilistId, + cover_url AS coverUrl, + cover_blob AS coverBlob, + title_romaji AS titleRomaji, + title_english AS titleEnglish, + episodes_total AS episodesTotal, + fetched_at_ms AS fetchedAtMs + FROM imm_media_art + WHERE video_id = ? + `).get(videoId) as unknown as MediaArtRow | null; +} + +export function getStreakCalendar(db: DatabaseSync, days = 90): StreakCalendarRow[] { + const now = new Date(); + const localMidnight = new Date(now.getFullYear(), now.getMonth(), now.getDate()).getTime(); + const todayLocalDay = Math.floor(localMidnight / 86_400_000); + const cutoffDay = todayLocalDay - days; + return db.prepare(` + SELECT rollup_day AS epochDay, SUM(total_active_min) AS totalActiveMin + FROM imm_daily_rollups + WHERE rollup_day >= ? 
+ GROUP BY rollup_day + ORDER BY rollup_day ASC + `).all(cutoffDay) as StreakCalendarRow[]; +} + +export function getAnimeWords(db: DatabaseSync, animeId: number, limit = 50): AnimeWordRow[] { + return db.prepare(` + SELECT w.id AS wordId, w.headword, w.word, w.reading, w.part_of_speech AS partOfSpeech, + SUM(o.occurrence_count) AS frequency + FROM imm_word_line_occurrences o + JOIN imm_subtitle_lines sl ON sl.line_id = o.line_id + JOIN imm_words w ON w.id = o.word_id + WHERE sl.anime_id = ? + GROUP BY w.id + ORDER BY frequency DESC + LIMIT ? + `).all(animeId, limit) as unknown as AnimeWordRow[]; +} + +export function getAnimeDailyRollups(db: DatabaseSync, animeId: number, limit = 90): ImmersionSessionRollupRow[] { + return db.prepare(` + SELECT r.rollup_day AS rollupDayOrMonth, r.video_id AS videoId, + r.total_sessions AS totalSessions, r.total_active_min AS totalActiveMin, + r.total_lines_seen AS totalLinesSeen, r.total_words_seen AS totalWordsSeen, + r.total_tokens_seen AS totalTokensSeen, r.total_cards AS totalCards, + r.cards_per_hour AS cardsPerHour, r.words_per_min AS wordsPerMin, + r.lookup_hit_rate AS lookupHitRate + FROM imm_daily_rollups r + JOIN imm_videos v ON v.video_id = r.video_id + WHERE v.anime_id = ? + ORDER BY r.rollup_day DESC + LIMIT ? + `).all(animeId, limit) as unknown as ImmersionSessionRollupRow[]; +} + +export function getEpisodesPerDay(db: DatabaseSync, limit = 90): EpisodesPerDayRow[] { + return db.prepare(` + SELECT CAST(julianday(s.started_at_ms / 1000, 'unixepoch', 'localtime') - 2440587.5 AS INTEGER) AS epochDay, + COUNT(DISTINCT s.video_id) AS episodeCount + FROM imm_sessions s + GROUP BY epochDay + ORDER BY epochDay DESC + LIMIT ? 
+ `).all(limit) as EpisodesPerDayRow[]; +} + +export function getNewAnimePerDay(db: DatabaseSync, limit = 90): NewAnimePerDayRow[] { + return db.prepare(` + SELECT first_day AS epochDay, COUNT(*) AS newAnimeCount + FROM ( + SELECT CAST(julianday(MIN(s.started_at_ms) / 1000, 'unixepoch', 'localtime') - 2440587.5 AS INTEGER) AS first_day + FROM imm_sessions s + JOIN imm_videos v ON v.video_id = s.video_id + WHERE v.anime_id IS NOT NULL + GROUP BY v.anime_id + ) + GROUP BY first_day + ORDER BY first_day DESC + LIMIT ? + `).all(limit) as NewAnimePerDayRow[]; +} + +export function getWatchTimePerAnime(db: DatabaseSync, limit = 90): WatchTimePerAnimeRow[] { + const nowD = new Date(); + const cutoffDay = Math.floor(new Date(nowD.getFullYear(), nowD.getMonth(), nowD.getDate()).getTime() / 86_400_000) - limit; + return db.prepare(` + SELECT r.rollup_day AS epochDay, a.anime_id AS animeId, + a.canonical_title AS animeTitle, + SUM(r.total_active_min) AS totalActiveMin + FROM imm_daily_rollups r + JOIN imm_videos v ON v.video_id = r.video_id + JOIN imm_anime a ON a.anime_id = v.anime_id + WHERE r.rollup_day >= ? + GROUP BY r.rollup_day, a.anime_id + ORDER BY r.rollup_day ASC + `).all(cutoffDay) as WatchTimePerAnimeRow[]; +} + +export function getWordDetail(db: DatabaseSync, wordId: number): WordDetailRow | null { + return db.prepare(` + SELECT id AS wordId, headword, word, reading, + part_of_speech AS partOfSpeech, pos1, pos2, pos3, + frequency, first_seen AS firstSeen, last_seen AS lastSeen + FROM imm_words WHERE id = ? + `).get(wordId) as WordDetailRow | null; +} + +export function getWordAnimeAppearances(db: DatabaseSync, wordId: number): WordAnimeAppearanceRow[] { + return db.prepare(` + SELECT a.anime_id AS animeId, a.canonical_title AS animeTitle, + SUM(o.occurrence_count) AS occurrenceCount + FROM imm_word_line_occurrences o + JOIN imm_subtitle_lines sl ON sl.line_id = o.line_id + JOIN imm_anime a ON a.anime_id = sl.anime_id + WHERE o.word_id = ? 
AND sl.anime_id IS NOT NULL + GROUP BY a.anime_id + ORDER BY occurrenceCount DESC + `).all(wordId) as WordAnimeAppearanceRow[]; +} + +export function getSimilarWords(db: DatabaseSync, wordId: number, limit = 10): SimilarWordRow[] { + const word = db.prepare('SELECT headword, reading FROM imm_words WHERE id = ?').get(wordId) as { headword: string; reading: string } | null; + if (!word) return []; + return db.prepare(` + SELECT id AS wordId, headword, word, reading, frequency + FROM imm_words + WHERE id != ? + AND (reading = ? OR headword LIKE ? OR headword LIKE ?) + ORDER BY frequency DESC + LIMIT ? + `).all( + wordId, + word.reading, + `%${word.headword.charAt(0)}%`, + `%${word.headword.charAt(word.headword.length - 1)}%`, + limit, + ) as SimilarWordRow[]; +} + +export function getKanjiDetail(db: DatabaseSync, kanjiId: number): KanjiDetailRow | null { + return db.prepare(` + SELECT id AS kanjiId, kanji, frequency, first_seen AS firstSeen, last_seen AS lastSeen + FROM imm_kanji WHERE id = ? + `).get(kanjiId) as KanjiDetailRow | null; +} + +export function getKanjiAnimeAppearances(db: DatabaseSync, kanjiId: number): KanjiAnimeAppearanceRow[] { + return db.prepare(` + SELECT a.anime_id AS animeId, a.canonical_title AS animeTitle, + SUM(o.occurrence_count) AS occurrenceCount + FROM imm_kanji_line_occurrences o + JOIN imm_subtitle_lines sl ON sl.line_id = o.line_id + JOIN imm_anime a ON a.anime_id = sl.anime_id + WHERE o.kanji_id = ? AND sl.anime_id IS NOT NULL + GROUP BY a.anime_id + ORDER BY occurrenceCount DESC + `).all(kanjiId) as KanjiAnimeAppearanceRow[]; +} + +export function getKanjiWords(db: DatabaseSync, kanjiId: number, limit = 20): KanjiWordRow[] { + const kanjiRow = db.prepare('SELECT kanji FROM imm_kanji WHERE id = ?').get(kanjiId) as { kanji: string } | null; + if (!kanjiRow) return []; + return db.prepare(` + SELECT id AS wordId, headword, word, reading, frequency + FROM imm_words + WHERE headword LIKE ? + ORDER BY frequency DESC + LIMIT ? 
+ `).all(`%${kanjiRow.kanji}%`, limit) as KanjiWordRow[]; +} + +export function getEpisodeWords(db: DatabaseSync, videoId: number, limit = 50): AnimeWordRow[] { + return db.prepare(` + SELECT w.id AS wordId, w.headword, w.word, w.reading, w.part_of_speech AS partOfSpeech, + SUM(o.occurrence_count) AS frequency + FROM imm_word_line_occurrences o + JOIN imm_subtitle_lines sl ON sl.line_id = o.line_id + JOIN imm_words w ON w.id = o.word_id + WHERE sl.video_id = ? + GROUP BY w.id + ORDER BY frequency DESC + LIMIT ? + `).all(videoId, limit) as unknown as AnimeWordRow[]; +} + +export function getEpisodeSessions(db: DatabaseSync, videoId: number): SessionSummaryQueryRow[] { + return db.prepare(` + SELECT + s.session_id AS sessionId, s.video_id AS videoId, + v.canonical_title AS canonicalTitle, + s.started_at_ms AS startedAtMs, s.ended_at_ms AS endedAtMs, + COALESCE(MAX(t.total_watched_ms), 0) AS totalWatchedMs, + COALESCE(MAX(t.active_watched_ms), 0) AS activeWatchedMs, + COALESCE(MAX(t.lines_seen), 0) AS linesSeen, + COALESCE(MAX(t.words_seen), 0) AS wordsSeen, + COALESCE(MAX(t.tokens_seen), 0) AS tokensSeen, + COALESCE(MAX(t.cards_mined), 0) AS cardsMined, + COALESCE(MAX(t.lookup_count), 0) AS lookupCount, + COALESCE(MAX(t.lookup_hits), 0) AS lookupHits + FROM imm_sessions s + JOIN imm_videos v ON v.video_id = s.video_id + LEFT JOIN imm_session_telemetry t ON t.session_id = s.session_id + WHERE s.video_id = ? + GROUP BY s.session_id + ORDER BY s.started_at_ms DESC + `).all(videoId) as SessionSummaryQueryRow[]; +} + +export function getEpisodeCardEvents(db: DatabaseSync, videoId: number): EpisodeCardEventRow[] { + const rows = db.prepare(` + SELECT e.event_id AS eventId, e.session_id AS sessionId, + e.ts_ms AS tsMs, e.cards_delta AS cardsDelta, + e.payload_json AS payloadJson + FROM imm_session_events e + JOIN imm_sessions s ON s.session_id = e.session_id + WHERE s.video_id = ? 
AND e.event_type = 4 + ORDER BY e.ts_ms DESC + `).all(videoId) as Array<{ eventId: number; sessionId: number; tsMs: number; cardsDelta: number; payloadJson: string | null }>; + + return rows.map(row => { + let noteIds: number[] = []; + if (row.payloadJson) { + try { + const parsed = JSON.parse(row.payloadJson); + if (Array.isArray(parsed.noteIds)) noteIds = parsed.noteIds; + } catch {} + } + return { eventId: row.eventId, sessionId: row.sessionId, tsMs: row.tsMs, cardsDelta: row.cardsDelta, noteIds }; + }); +} + +export function upsertCoverArt( + db: DatabaseSync, + videoId: number, + art: { + anilistId: number | null; + coverUrl: string | null; + coverBlob: Buffer | null; + titleRomaji: string | null; + titleEnglish: string | null; + episodesTotal: number | null; + }, +): void { + const nowMs = Date.now(); + db.prepare(` + INSERT INTO imm_media_art ( + video_id, anilist_id, cover_url, cover_blob, + title_romaji, title_english, episodes_total, + fetched_at_ms, CREATED_DATE, LAST_UPDATE_DATE + ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?) 
+ ON CONFLICT(video_id) DO UPDATE SET + anilist_id = excluded.anilist_id, + cover_url = excluded.cover_url, + cover_blob = excluded.cover_blob, + title_romaji = excluded.title_romaji, + title_english = excluded.title_english, + episodes_total = excluded.episodes_total, + fetched_at_ms = excluded.fetched_at_ms, + LAST_UPDATE_DATE = excluded.LAST_UPDATE_DATE + `).run( + videoId, art.anilistId, art.coverUrl, art.coverBlob, + art.titleRomaji, art.titleEnglish, art.episodesTotal, + nowMs, nowMs, nowMs, + ); +} + +export function updateAnimeAnilistInfo( + db: DatabaseSync, + videoId: number, + info: { + anilistId: number; + titleRomaji: string | null; + titleEnglish: string | null; + titleNative: string | null; + episodesTotal: number | null; + }, +): void { + const row = db.prepare('SELECT anime_id FROM imm_videos WHERE video_id = ?').get(videoId) as { + anime_id: number | null; + } | null; + if (!row?.anime_id) return; + + db.prepare(` + UPDATE imm_anime + SET + anilist_id = COALESCE(?, anilist_id), + title_romaji = COALESCE(?, title_romaji), + title_english = COALESCE(?, title_english), + title_native = COALESCE(?, title_native), + episodes_total = COALESCE(?, episodes_total), + LAST_UPDATE_DATE = ? + WHERE anime_id = ? + `).run( + info.anilistId, + info.titleRomaji, + info.titleEnglish, + info.titleNative, + info.episodesTotal, + Date.now(), + row.anime_id, + ); +} + +export function markVideoWatched(db: DatabaseSync, videoId: number, watched: boolean): void { + db.prepare('UPDATE imm_videos SET watched = ?, LAST_UPDATE_DATE = ? WHERE video_id = ?') + .run(watched ? 1 : 0, Date.now(), videoId); +} + +export function getVideoDurationMs(db: DatabaseSync, videoId: number): number { + const row = db.prepare('SELECT duration_ms FROM imm_videos WHERE video_id = ?').get(videoId) as { + duration_ms: number; + } | null; + return row?.duration_ms ?? 
0; +} + +export function isVideoWatched(db: DatabaseSync, videoId: number): boolean { + const row = db.prepare('SELECT watched FROM imm_videos WHERE video_id = ?').get(videoId) as { + watched: number; + } | null; + return row?.watched === 1; +} diff --git a/src/core/services/immersion-tracker/reducer.ts b/src/core/services/immersion-tracker/reducer.ts index ae1a43f..5f4fa58 100644 --- a/src/core/services/immersion-tracker/reducer.ts +++ b/src/core/services/immersion-tracker/reducer.ts @@ -30,6 +30,7 @@ export function createInitialSessionState( lastPauseStartMs: null, isPaused: false, pendingTelemetry: true, + markedWatched: false, }; } diff --git a/src/core/services/immersion-tracker/storage-session.test.ts b/src/core/services/immersion-tracker/storage-session.test.ts index b07d5ec..ea4bec2 100644 --- a/src/core/services/immersion-tracker/storage-session.test.ts +++ b/src/core/services/immersion-tracker/storage-session.test.ts @@ -9,7 +9,9 @@ import { createTrackerPreparedStatements, ensureSchema, executeQueuedWrite, + getOrCreateAnimeRecord, getOrCreateVideoRecord, + linkVideoToAnimeRecord, } from './storage'; import { EVENT_SUBTITLE_LINE, SESSION_STATUS_ENDED, SOURCE_TYPE_LOCAL } from './types'; @@ -60,6 +62,7 @@ test('ensureSchema creates immersion core tables', () => { const tableNames = new Set(rows.map((row) => row.name)); assert.ok(tableNames.has('imm_videos')); + assert.ok(tableNames.has('imm_anime')); assert.ok(tableNames.has('imm_sessions')); assert.ok(tableNames.has('imm_session_telemetry')); assert.ok(tableNames.has('imm_session_events')); @@ -67,8 +70,28 @@ test('ensureSchema creates immersion core tables', () => { assert.ok(tableNames.has('imm_monthly_rollups')); assert.ok(tableNames.has('imm_words')); assert.ok(tableNames.has('imm_kanji')); + assert.ok(tableNames.has('imm_subtitle_lines')); + assert.ok(tableNames.has('imm_word_line_occurrences')); + assert.ok(tableNames.has('imm_kanji_line_occurrences')); 
assert.ok(tableNames.has('imm_rollup_state')); + const videoColumns = new Set( + ( + db.prepare('PRAGMA table_info(imm_videos)').all() as Array<{ + name: string; + }> + ).map((row) => row.name), + ); + + assert.ok(videoColumns.has('anime_id')); + assert.ok(videoColumns.has('parsed_basename')); + assert.ok(videoColumns.has('parsed_title')); + assert.ok(videoColumns.has('parsed_season')); + assert.ok(videoColumns.has('parsed_episode')); + assert.ok(videoColumns.has('parser_source')); + assert.ok(videoColumns.has('parser_confidence')); + assert.ok(videoColumns.has('parse_metadata_json')); + const rollupStateRow = db .prepare('SELECT state_value FROM imm_rollup_state WHERE state_key = ?') .get('last_rollup_sample_ms') as { @@ -82,6 +105,470 @@ test('ensureSchema creates immersion core tables', () => { } }); +test('ensureSchema migrates legacy videos and backfills anime metadata from filenames', () => { + const dbPath = makeDbPath(); + const db = new Database(dbPath); + + try { + db.exec(` + CREATE TABLE imm_schema_version ( + schema_version INTEGER PRIMARY KEY, + applied_at_ms INTEGER NOT NULL + ); + INSERT INTO imm_schema_version(schema_version, applied_at_ms) VALUES (4, 1); + + CREATE TABLE imm_videos( + video_id INTEGER PRIMARY KEY AUTOINCREMENT, + video_key TEXT NOT NULL UNIQUE, + canonical_title TEXT NOT NULL, + source_type INTEGER NOT NULL, + source_path TEXT, + source_url TEXT, + duration_ms INTEGER NOT NULL CHECK(duration_ms>=0), + file_size_bytes INTEGER CHECK(file_size_bytes>=0), + codec_id INTEGER, container_id INTEGER, + width_px INTEGER, height_px INTEGER, fps_x100 INTEGER, + bitrate_kbps INTEGER, audio_codec_id INTEGER, + hash_sha256 TEXT, screenshot_path TEXT, + metadata_json TEXT, + CREATED_DATE INTEGER, + LAST_UPDATE_DATE INTEGER + ); + `); + + const insertLegacyVideo = db.prepare(` + INSERT INTO imm_videos ( + video_key, canonical_title, source_type, source_path, source_url, + duration_ms, file_size_bytes, codec_id, container_id, width_px, height_px, 
+ fps_x100, bitrate_kbps, audio_codec_id, hash_sha256, screenshot_path, + metadata_json, CREATED_DATE, LAST_UPDATE_DATE + ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?) + `); + + insertLegacyVideo.run( + 'local:/library/Little Witch Academia S02E05.mkv', + 'Episode 5', + SOURCE_TYPE_LOCAL, + '/library/Little Witch Academia S02E05.mkv', + null, + 0, + null, + null, + null, + null, + null, + null, + null, + null, + null, + null, + null, + 1, + 1, + ); + insertLegacyVideo.run( + 'local:/library/Little Witch Academia S02E06.mkv', + 'Episode 6', + SOURCE_TYPE_LOCAL, + '/library/Little Witch Academia S02E06.mkv', + null, + 0, + null, + null, + null, + null, + null, + null, + null, + null, + null, + null, + null, + 1, + 1, + ); + insertLegacyVideo.run( + 'local:/library/[SubsPlease] Frieren - 03 - Departure.mkv', + 'Episode 3', + SOURCE_TYPE_LOCAL, + '/library/[SubsPlease] Frieren - 03 - Departure.mkv', + null, + 0, + null, + null, + null, + null, + null, + null, + null, + null, + null, + null, + null, + 1, + 1, + ); + + ensureSchema(db); + + const videoColumns = new Set( + ( + db.prepare('PRAGMA table_info(imm_videos)').all() as Array<{ + name: string; + }> + ).map((row) => row.name), + ); + assert.ok(videoColumns.has('anime_id')); + assert.ok(videoColumns.has('parsed_basename')); + assert.ok(videoColumns.has('parsed_title')); + assert.ok(videoColumns.has('parsed_season')); + assert.ok(videoColumns.has('parsed_episode')); + assert.ok(videoColumns.has('parser_source')); + assert.ok(videoColumns.has('parser_confidence')); + assert.ok(videoColumns.has('parse_metadata_json')); + + const animeRows = db + .prepare('SELECT canonical_title FROM imm_anime ORDER BY canonical_title') + .all() as Array<{ canonical_title: string }>; + assert.deepEqual( + animeRows.map((row) => row.canonical_title), + ['Frieren', 'Little Witch Academia'], + ); + + const littleWitchRows = db + .prepare( + ` + SELECT + a.canonical_title AS anime_title, + v.parsed_title, + 
v.parsed_basename, + v.parsed_season, + v.parsed_episode, + v.parser_source, + v.parser_confidence + FROM imm_videos v + JOIN imm_anime a ON a.anime_id = v.anime_id + WHERE v.video_key LIKE 'local:/library/Little Witch Academia%' + ORDER BY v.video_key + `, + ) + .all() as Array<{ + anime_title: string; + parsed_title: string | null; + parsed_basename: string | null; + parsed_season: number | null; + parsed_episode: number | null; + parser_source: string | null; + parser_confidence: number | null; + }>; + + assert.equal(littleWitchRows.length, 2); + assert.deepEqual( + littleWitchRows.map((row) => ({ + animeTitle: row.anime_title, + parsedTitle: row.parsed_title, + parsedBasename: row.parsed_basename, + parsedSeason: row.parsed_season, + parsedEpisode: row.parsed_episode, + parserSource: row.parser_source, + })), + [ + { + animeTitle: 'Little Witch Academia', + parsedTitle: 'Little Witch Academia', + parsedBasename: 'Little Witch Academia S02E05.mkv', + parsedSeason: 2, + parsedEpisode: 5, + parserSource: 'fallback', + }, + { + animeTitle: 'Little Witch Academia', + parsedTitle: 'Little Witch Academia', + parsedBasename: 'Little Witch Academia S02E06.mkv', + parsedSeason: 2, + parsedEpisode: 6, + parserSource: 'fallback', + }, + ], + ); + assert.ok( + littleWitchRows.every( + (row) => typeof row.parser_confidence === 'number' && row.parser_confidence > 0, + ), + ); + + const frierenRow = db + .prepare( + ` + SELECT + a.canonical_title AS anime_title, + v.parsed_title, + v.parsed_episode, + v.parser_source + FROM imm_videos v + JOIN imm_anime a ON a.anime_id = v.anime_id + WHERE v.video_key = ? 
+ `, + ) + .get('local:/library/[SubsPlease] Frieren - 03 - Departure.mkv') as { + anime_title: string; + parsed_title: string | null; + parsed_episode: number | null; + parser_source: string | null; + } | null; + + assert.ok(frierenRow); + assert.equal(frierenRow?.anime_title, 'Frieren'); + assert.equal(frierenRow?.parsed_title, 'Frieren'); + assert.equal(frierenRow?.parsed_episode, 3); + assert.equal(frierenRow?.parser_source, 'fallback'); + } finally { + db.close(); + cleanupDbPath(dbPath); + } +}); + +test('ensureSchema adds subtitle-line occurrence tables to schema version 6 databases', () => { + const dbPath = makeDbPath(); + const db = new Database(dbPath); + + try { + db.exec(` + CREATE TABLE imm_schema_version ( + schema_version INTEGER PRIMARY KEY, + applied_at_ms INTEGER NOT NULL + ); + INSERT INTO imm_schema_version(schema_version, applied_at_ms) VALUES (6, 1); + + CREATE TABLE imm_videos( + video_id INTEGER PRIMARY KEY AUTOINCREMENT, + video_key TEXT NOT NULL UNIQUE, + anime_id INTEGER, + canonical_title TEXT NOT NULL, + source_type INTEGER NOT NULL, + source_path TEXT, + source_url TEXT, + parsed_basename TEXT, + parsed_title TEXT, + parsed_season INTEGER, + parsed_episode INTEGER, + parser_source TEXT, + parser_confidence REAL, + parse_metadata_json TEXT, + duration_ms INTEGER NOT NULL CHECK(duration_ms>=0), + file_size_bytes INTEGER CHECK(file_size_bytes>=0), + codec_id INTEGER, container_id INTEGER, + width_px INTEGER, height_px INTEGER, fps_x100 INTEGER, + bitrate_kbps INTEGER, audio_codec_id INTEGER, + hash_sha256 TEXT, screenshot_path TEXT, + metadata_json TEXT, + CREATED_DATE INTEGER, + LAST_UPDATE_DATE INTEGER + ); + CREATE TABLE imm_sessions( + session_id INTEGER PRIMARY KEY AUTOINCREMENT, + session_uuid TEXT NOT NULL UNIQUE, + video_id INTEGER NOT NULL, + started_at_ms INTEGER NOT NULL, + ended_at_ms INTEGER, + status INTEGER NOT NULL, + locale_id INTEGER, + target_lang_id INTEGER, + difficulty_tier INTEGER, + subtitle_mode INTEGER, + 
CREATED_DATE INTEGER, + LAST_UPDATE_DATE INTEGER + ); + CREATE TABLE imm_session_events( + event_id INTEGER PRIMARY KEY AUTOINCREMENT, + session_id INTEGER NOT NULL, + ts_ms INTEGER NOT NULL, + event_type INTEGER NOT NULL, + line_index INTEGER, + segment_start_ms INTEGER, + segment_end_ms INTEGER, + words_delta INTEGER NOT NULL DEFAULT 0, + cards_delta INTEGER NOT NULL DEFAULT 0, + payload_json TEXT, + CREATED_DATE INTEGER, + LAST_UPDATE_DATE INTEGER + ); + CREATE TABLE imm_words( + id INTEGER PRIMARY KEY AUTOINCREMENT, + headword TEXT, + word TEXT, + reading TEXT, + part_of_speech TEXT, + pos1 TEXT, + pos2 TEXT, + pos3 TEXT, + first_seen REAL, + last_seen REAL, + frequency INTEGER, + UNIQUE(headword, word, reading) + ); + CREATE TABLE imm_kanji( + id INTEGER PRIMARY KEY AUTOINCREMENT, + kanji TEXT, + first_seen REAL, + last_seen REAL, + frequency INTEGER, + UNIQUE(kanji) + ); + CREATE TABLE imm_rollup_state( + state_key TEXT PRIMARY KEY, + state_value INTEGER NOT NULL + ); + `); + + ensureSchema(db); + + const tableNames = new Set( + ( + db.prepare(`SELECT name FROM sqlite_master WHERE type = 'table' AND name LIKE 'imm_%'`).all() as + Array<{ name: string }> + ).map((row) => row.name), + ); + + assert.ok(tableNames.has('imm_subtitle_lines')); + assert.ok(tableNames.has('imm_word_line_occurrences')); + assert.ok(tableNames.has('imm_kanji_line_occurrences')); + } finally { + db.close(); + cleanupDbPath(dbPath); + } +}); + +test('anime rows are reused by normalized parsed title and upgraded with AniList metadata', () => { + const dbPath = makeDbPath(); + const db = new Database(dbPath); + + try { + ensureSchema(db); + + const firstVideoId = getOrCreateVideoRecord(db, 'local:/tmp/lwa-s02e05.mkv', { + canonicalTitle: 'Episode 5', + sourcePath: '/tmp/Little Witch Academia S02E05.mkv', + sourceUrl: null, + sourceType: SOURCE_TYPE_LOCAL, + }); + const secondVideoId = getOrCreateVideoRecord(db, 'local:/tmp/lwa-s02e06.mkv', { + canonicalTitle: 'Episode 6', + sourcePath: 
'/tmp/Little Witch Academia S02E06.mkv', + sourceUrl: null, + sourceType: SOURCE_TYPE_LOCAL, + }); + + const provisionalAnimeId = getOrCreateAnimeRecord(db, { + parsedTitle: 'Little Witch Academia', + canonicalTitle: 'Little Witch Academia', + anilistId: null, + titleRomaji: null, + titleEnglish: null, + titleNative: null, + metadataJson: '{"source":"parsed"}', + }); + linkVideoToAnimeRecord(db, firstVideoId, { + animeId: provisionalAnimeId, + parsedBasename: 'Little Witch Academia S02E05.mkv', + parsedTitle: 'Little Witch Academia', + parsedSeason: 2, + parsedEpisode: 5, + parserSource: 'fallback', + parserConfidence: 0.6, + parseMetadataJson: '{"source":"parsed","episode":5}', + }); + + const reusedAnimeId = getOrCreateAnimeRecord(db, { + parsedTitle: ' little witch academia ', + canonicalTitle: 'Little Witch Academia', + anilistId: null, + titleRomaji: null, + titleEnglish: null, + titleNative: null, + metadataJson: '{"source":"parsed"}', + }); + linkVideoToAnimeRecord(db, secondVideoId, { + animeId: reusedAnimeId, + parsedBasename: 'Little Witch Academia S02E06.mkv', + parsedTitle: 'Little Witch Academia', + parsedSeason: 2, + parsedEpisode: 6, + parserSource: 'fallback', + parserConfidence: 0.6, + parseMetadataJson: '{"source":"parsed","episode":6}', + }); + + assert.equal(reusedAnimeId, provisionalAnimeId); + + const upgradedAnimeId = getOrCreateAnimeRecord(db, { + parsedTitle: 'Little Witch Academia', + canonicalTitle: 'Little Witch Academia TV', + anilistId: 33_435, + titleRomaji: 'Little Witch Academia', + titleEnglish: 'Little Witch Academia', + titleNative: 'リトルウィッチアカデミア', + metadataJson: '{"source":"anilist"}', + }); + + assert.equal(upgradedAnimeId, provisionalAnimeId); + + const animeRows = db.prepare('SELECT * FROM imm_anime').all() as Array<{ + anime_id: number; + normalized_title_key: string; + canonical_title: string; + anilist_id: number | null; + title_romaji: string | null; + title_english: string | null; + title_native: string | null; + 
metadata_json: string | null; + }>; + assert.equal(animeRows.length, 1); + assert.equal(animeRows[0]?.anime_id, provisionalAnimeId); + assert.equal(animeRows[0]?.normalized_title_key, 'little witch academia'); + assert.equal(animeRows[0]?.canonical_title, 'Little Witch Academia TV'); + assert.equal(animeRows[0]?.anilist_id, 33_435); + assert.equal(animeRows[0]?.title_romaji, 'Little Witch Academia'); + assert.equal(animeRows[0]?.title_english, 'Little Witch Academia'); + assert.equal(animeRows[0]?.title_native, 'リトルウィッチアカデミア'); + assert.equal(animeRows[0]?.metadata_json, '{"source":"anilist"}'); + + const linkedVideos = db + .prepare( + ` + SELECT anime_id, parsed_title, parsed_season, parsed_episode + FROM imm_videos + WHERE video_id IN (?, ?) + ORDER BY video_id + `, + ) + .all(firstVideoId, secondVideoId) as Array<{ + anime_id: number | null; + parsed_title: string | null; + parsed_season: number | null; + parsed_episode: number | null; + }>; + + assert.deepEqual(linkedVideos, [ + { + anime_id: provisionalAnimeId, + parsed_title: 'Little Witch Academia', + parsed_season: 2, + parsed_episode: 5, + }, + { + anime_id: provisionalAnimeId, + parsed_title: 'Little Witch Academia', + parsed_season: 2, + parsed_episode: 6, + }, + ]); + } finally { + db.close(); + cleanupDbPath(dbPath); + } +}); + test('start/finalize session updates ended_at and status', () => { const dbPath = makeDbPath(); const db = new Database(dbPath); @@ -191,18 +678,22 @@ test('executeQueuedWrite inserts and upserts word and kanji rows', () => { ensureSchema(db); const stmts = createTrackerPreparedStatements(db); - stmts.wordUpsertStmt.run('猫', '猫', '', 10.0, 10.0); - stmts.wordUpsertStmt.run('猫', '猫', '', 5.0, 15.0); + stmts.wordUpsertStmt.run('猫', '猫', '', 'noun', '名詞', '一般', '', 10.0, 10.0); + stmts.wordUpsertStmt.run('猫', '猫', '', 'noun', '名詞', '一般', '', 5.0, 15.0); stmts.kanjiUpsertStmt.run('日', 9.0, 9.0); stmts.kanjiUpsertStmt.run('日', 8.0, 11.0); const wordRow = db .prepare( - 'SELECT 
headword, frequency, first_seen, last_seen FROM imm_words WHERE headword = ?', + `SELECT headword, frequency, part_of_speech, pos1, pos2, first_seen, last_seen + FROM imm_words WHERE headword = ?`, ) .get('猫') as { headword: string; frequency: number; + part_of_speech: string; + pos1: string; + pos2: string; first_seen: number; last_seen: number; } | null; @@ -218,6 +709,9 @@ test('executeQueuedWrite inserts and upserts word and kanji rows', () => { assert.ok(wordRow); assert.ok(kanjiRow); assert.equal(wordRow?.frequency, 2); + assert.equal(wordRow?.part_of_speech, 'noun'); + assert.equal(wordRow?.pos1, '名詞'); + assert.equal(wordRow?.pos2, '一般'); assert.equal(kanjiRow?.frequency, 2); assert.equal(wordRow?.first_seen, 5); assert.equal(wordRow?.last_seen, 15); @@ -228,3 +722,34 @@ test('executeQueuedWrite inserts and upserts word and kanji rows', () => { cleanupDbPath(dbPath); } }); + +test('word upsert replaces legacy other part_of_speech when better POS metadata arrives later', () => { + const dbPath = makeDbPath(); + const db = new Database(dbPath); + + try { + ensureSchema(db); + const stmts = createTrackerPreparedStatements(db); + + stmts.wordUpsertStmt.run('知っている', '知っている', 'しっている', 'other', '動詞', '自立', '', 10, 10); + stmts.wordUpsertStmt.run('知っている', '知っている', 'しっている', 'verb', '動詞', '自立', '', 11, 12); + + const row = db + .prepare('SELECT frequency, part_of_speech, pos1, pos2 FROM imm_words WHERE headword = ?') + .get('知っている') as { + frequency: number; + part_of_speech: string; + pos1: string; + pos2: string; + } | null; + + assert.ok(row); + assert.equal(row?.frequency, 2); + assert.equal(row?.part_of_speech, 'verb'); + assert.equal(row?.pos1, '動詞'); + assert.equal(row?.pos2, '自立'); + } finally { + db.close(); + cleanupDbPath(dbPath); + } +}); diff --git a/src/core/services/immersion-tracker/storage.ts b/src/core/services/immersion-tracker/storage.ts index 2685da5..dfcbb83 100644 --- a/src/core/services/immersion-tracker/storage.ts +++ 
b/src/core/services/immersion-tracker/storage.ts @@ -1,3 +1,4 @@ +import { parseMediaInfo } from '../../../jimaku/utils'; import type { DatabaseSync } from './sqlite'; import { SCHEMA_VERSION } from './types'; import type { QueuedWrite, VideoMetadata } from './types'; @@ -7,6 +8,33 @@ export interface TrackerPreparedStatements { eventInsertStmt: ReturnType; wordUpsertStmt: ReturnType; kanjiUpsertStmt: ReturnType; + subtitleLineInsertStmt: ReturnType; + wordIdSelectStmt: ReturnType; + kanjiIdSelectStmt: ReturnType; + wordLineOccurrenceUpsertStmt: ReturnType; + kanjiLineOccurrenceUpsertStmt: ReturnType; + videoAnimeIdSelectStmt: ReturnType; +} + +export interface AnimeRecordInput { + parsedTitle: string; + canonicalTitle: string; + anilistId: number | null; + titleRomaji: string | null; + titleEnglish: string | null; + titleNative: string | null; + metadataJson: string | null; +} + +export interface VideoAnimeLinkInput { + animeId: number | null; + parsedBasename: string | null; + parsedTitle: string | null; + parsedSeason: number | null; + parsedEpisode: number | null; + parserSource: string | null; + parserConfidence: number | null; + parseMetadataJson: string | null; } function hasColumn(db: DatabaseSync, tableName: string, columnName: string): boolean { @@ -16,9 +44,14 @@ function hasColumn(db: DatabaseSync, tableName: string, columnName: string): boo .some((row: unknown) => (row as { name: string }).name === columnName); } -function addColumnIfMissing(db: DatabaseSync, tableName: string, columnName: string): void { +function addColumnIfMissing( + db: DatabaseSync, + tableName: string, + columnName: string, + columnType = 'INTEGER', +): void { if (!hasColumn(db, tableName, columnName)) { - db.exec(`ALTER TABLE ${tableName} ADD COLUMN ${columnName} INTEGER`); + db.exec(`ALTER TABLE ${tableName} ADD COLUMN ${columnName} ${columnType}`); } } @@ -35,6 +68,247 @@ export function applyPragmas(db: DatabaseSync): void { db.exec('PRAGMA busy_timeout = 2500'); } +export 
function normalizeAnimeIdentityKey(title: string): string { + return title + .normalize('NFKC') + .toLowerCase() + .replace(/[^\p{L}\p{N}]+/gu, ' ') + .trim() + .replace(/\s+/g, ' '); +} + +function looksLikeEpisodeOnlyTitle(title: string): boolean { + const normalized = title + .normalize('NFKC') + .toLowerCase() + .replace(/\s+/g, ' ') + .trim(); + return /^(episode|ep)\s*\d{1,3}$/.test(normalized) || /^第\s*\d{1,3}\s*話$/.test(normalized); +} + +function parserConfidenceToScore(confidence: 'high' | 'medium' | 'low'): number { + switch (confidence) { + case 'high': + return 1; + case 'medium': + return 0.6; + default: + return 0.2; + } +} + +function parseLegacyAnimeBackfillCandidate( + sourcePath: string | null, + canonicalTitle: string, +): { + basename: string | null; + title: string; + season: number | null; + episode: number | null; + source: 'fallback'; + confidenceScore: number; + metadataJson: string; +} | null { + const fromPath = + sourcePath && sourcePath.trim().length > 0 ? parseMediaInfo(sourcePath.trim()) : null; + if (fromPath?.title && !looksLikeEpisodeOnlyTitle(fromPath.title)) { + return { + basename: fromPath.filename || null, + title: fromPath.title, + season: fromPath.season, + episode: fromPath.episode, + source: 'fallback', + confidenceScore: parserConfidenceToScore(fromPath.confidence), + metadataJson: JSON.stringify({ + confidence: fromPath.confidence, + filename: fromPath.filename, + rawTitle: fromPath.rawTitle, + migrationSource: 'source_path', + }), + }; + } + + const fallbackTitle = canonicalTitle.trim(); + if (!fallbackTitle) return null; + const fromTitle = parseMediaInfo(fallbackTitle); + if (!fromTitle.title || looksLikeEpisodeOnlyTitle(fromTitle.title)) { + return null; + } + + return { + basename: null, + title: fromTitle.title, + season: fromTitle.season, + episode: fromTitle.episode, + source: 'fallback', + confidenceScore: parserConfidenceToScore(fromTitle.confidence), + metadataJson: JSON.stringify({ + confidence: 
fromTitle.confidence, + filename: fromTitle.filename, + rawTitle: fromTitle.rawTitle, + migrationSource: 'canonical_title', + }), + }; +} + +export function getOrCreateAnimeRecord(db: DatabaseSync, input: AnimeRecordInput): number { + const normalizedTitleKey = normalizeAnimeIdentityKey(input.parsedTitle); + if (!normalizedTitleKey) { + throw new Error('parsedTitle is required to create or update an anime record'); + } + + const byAnilistId = + input.anilistId !== null + ? (db.prepare('SELECT anime_id FROM imm_anime WHERE anilist_id = ?').get(input.anilistId) as { + anime_id: number; + } | null) + : null; + const byNormalizedTitle = db + .prepare('SELECT anime_id FROM imm_anime WHERE normalized_title_key = ?') + .get(normalizedTitleKey) as { anime_id: number } | null; + const existing = byAnilistId ?? byNormalizedTitle; + if (existing?.anime_id) { + db.prepare( + ` + UPDATE imm_anime + SET + canonical_title = COALESCE(NULLIF(?, ''), canonical_title), + anilist_id = COALESCE(?, anilist_id), + title_romaji = COALESCE(?, title_romaji), + title_english = COALESCE(?, title_english), + title_native = COALESCE(?, title_native), + metadata_json = COALESCE(?, metadata_json), + LAST_UPDATE_DATE = ? + WHERE anime_id = ? + `, + ).run( + input.canonicalTitle, + input.anilistId, + input.titleRomaji, + input.titleEnglish, + input.titleNative, + input.metadataJson, + Date.now(), + existing.anime_id, + ); + return existing.anime_id; + } + + const nowMs = Date.now(); + const result = db + .prepare( + ` + INSERT INTO imm_anime( + normalized_title_key, + canonical_title, + anilist_id, + title_romaji, + title_english, + title_native, + metadata_json, + CREATED_DATE, + LAST_UPDATE_DATE + ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?) 
+ `, + ) + .run( + normalizedTitleKey, + input.canonicalTitle, + input.anilistId, + input.titleRomaji, + input.titleEnglish, + input.titleNative, + input.metadataJson, + nowMs, + nowMs, + ); + return Number(result.lastInsertRowid); +} + +export function linkVideoToAnimeRecord( + db: DatabaseSync, + videoId: number, + input: VideoAnimeLinkInput, +): void { + db.prepare( + ` + UPDATE imm_videos + SET + anime_id = ?, + parsed_basename = ?, + parsed_title = ?, + parsed_season = ?, + parsed_episode = ?, + parser_source = ?, + parser_confidence = ?, + parse_metadata_json = ?, + LAST_UPDATE_DATE = ? + WHERE video_id = ? + `, + ).run( + input.animeId, + input.parsedBasename, + input.parsedTitle, + input.parsedSeason, + input.parsedEpisode, + input.parserSource, + input.parserConfidence, + input.parseMetadataJson, + Date.now(), + videoId, + ); +} + +function migrateLegacyAnimeMetadata(db: DatabaseSync): void { + addColumnIfMissing(db, 'imm_videos', 'anime_id', 'INTEGER REFERENCES imm_anime(anime_id)'); + addColumnIfMissing(db, 'imm_videos', 'parsed_basename', 'TEXT'); + addColumnIfMissing(db, 'imm_videos', 'parsed_title', 'TEXT'); + addColumnIfMissing(db, 'imm_videos', 'parsed_season', 'INTEGER'); + addColumnIfMissing(db, 'imm_videos', 'parsed_episode', 'INTEGER'); + addColumnIfMissing(db, 'imm_videos', 'parser_source', 'TEXT'); + addColumnIfMissing(db, 'imm_videos', 'parser_confidence', 'REAL'); + addColumnIfMissing(db, 'imm_videos', 'parse_metadata_json', 'TEXT'); + + const legacyRows = db + .prepare( + ` + SELECT video_id, source_path, canonical_title + FROM imm_videos + WHERE anime_id IS NULL + `, + ) + .all() as Array<{ + video_id: number; + source_path: string | null; + canonical_title: string; + }>; + + for (const row of legacyRows) { + const parsed = parseLegacyAnimeBackfillCandidate(row.source_path, row.canonical_title); + if (!parsed) continue; + + const animeId = getOrCreateAnimeRecord(db, { + parsedTitle: parsed.title, + canonicalTitle: parsed.title, + 
anilistId: null, + titleRomaji: null, + titleEnglish: null, + titleNative: null, + metadataJson: parsed.metadataJson, + }); + linkVideoToAnimeRecord(db, row.video_id, { + animeId, + parsedBasename: parsed.basename, + parsedTitle: parsed.title, + parsedSeason: parsed.season, + parsedEpisode: parsed.episode, + parserSource: parsed.source, + parserConfidence: parsed.confidenceScore, + parseMetadataJson: parsed.metadataJson, + }); + } +} + export function ensureSchema(db: DatabaseSync): void { db.exec(` CREATE TABLE IF NOT EXISTS imm_schema_version ( @@ -61,14 +335,38 @@ export function ensureSchema(db: DatabaseSync): void { return; } + db.exec(` + CREATE TABLE IF NOT EXISTS imm_anime( + anime_id INTEGER PRIMARY KEY AUTOINCREMENT, + normalized_title_key TEXT NOT NULL UNIQUE, + canonical_title TEXT NOT NULL, + anilist_id INTEGER UNIQUE, + title_romaji TEXT, + title_english TEXT, + title_native TEXT, + episodes_total INTEGER, + metadata_json TEXT, + CREATED_DATE INTEGER, + LAST_UPDATE_DATE INTEGER + ); + `); db.exec(` CREATE TABLE IF NOT EXISTS imm_videos( video_id INTEGER PRIMARY KEY AUTOINCREMENT, video_key TEXT NOT NULL UNIQUE, + anime_id INTEGER, canonical_title TEXT NOT NULL, source_type INTEGER NOT NULL, source_path TEXT, source_url TEXT, + parsed_basename TEXT, + parsed_title TEXT, + parsed_season INTEGER, + parsed_episode INTEGER, + parser_source TEXT, + parser_confidence REAL, + parse_metadata_json TEXT, + watched INTEGER NOT NULL DEFAULT 0, duration_ms INTEGER NOT NULL CHECK(duration_ms>=0), file_size_bytes INTEGER CHECK(file_size_bytes>=0), codec_id INTEGER, container_id INTEGER, @@ -77,7 +375,8 @@ export function ensureSchema(db: DatabaseSync): void { hash_sha256 TEXT, screenshot_path TEXT, metadata_json TEXT, CREATED_DATE INTEGER, - LAST_UPDATE_DATE INTEGER + LAST_UPDATE_DATE INTEGER, + FOREIGN KEY(anime_id) REFERENCES imm_anime(anime_id) ON DELETE SET NULL ); `); db.exec(` @@ -173,6 +472,10 @@ export function ensureSchema(db: DatabaseSync): void { headword 
TEXT, word TEXT, reading TEXT, + part_of_speech TEXT, + pos1 TEXT, + pos2 TEXT, + pos3 TEXT, first_seen REAL, last_seen REAL, frequency INTEGER, @@ -189,42 +492,59 @@ export function ensureSchema(db: DatabaseSync): void { UNIQUE(kanji) ); `); - db.exec(` - CREATE INDEX IF NOT EXISTS idx_sessions_video_started - ON imm_sessions(video_id, started_at_ms DESC) + CREATE TABLE IF NOT EXISTS imm_subtitle_lines( + line_id INTEGER PRIMARY KEY AUTOINCREMENT, + session_id INTEGER NOT NULL, + event_id INTEGER, + video_id INTEGER NOT NULL, + anime_id INTEGER, + line_index INTEGER NOT NULL, + segment_start_ms INTEGER, + segment_end_ms INTEGER, + text TEXT NOT NULL, + CREATED_DATE INTEGER, + LAST_UPDATE_DATE INTEGER, + FOREIGN KEY(session_id) REFERENCES imm_sessions(session_id) ON DELETE CASCADE, + FOREIGN KEY(event_id) REFERENCES imm_session_events(event_id) ON DELETE SET NULL, + FOREIGN KEY(video_id) REFERENCES imm_videos(video_id) ON DELETE CASCADE, + FOREIGN KEY(anime_id) REFERENCES imm_anime(anime_id) ON DELETE SET NULL + ); `); db.exec(` - CREATE INDEX IF NOT EXISTS idx_sessions_status_started - ON imm_sessions(status, started_at_ms DESC) + CREATE TABLE IF NOT EXISTS imm_word_line_occurrences( + line_id INTEGER NOT NULL, + word_id INTEGER NOT NULL, + occurrence_count INTEGER NOT NULL, + PRIMARY KEY(line_id, word_id), + FOREIGN KEY(line_id) REFERENCES imm_subtitle_lines(line_id) ON DELETE CASCADE, + FOREIGN KEY(word_id) REFERENCES imm_words(id) ON DELETE CASCADE + ); `); db.exec(` - CREATE INDEX IF NOT EXISTS idx_telemetry_session_sample - ON imm_session_telemetry(session_id, sample_ms DESC) + CREATE TABLE IF NOT EXISTS imm_kanji_line_occurrences( + line_id INTEGER NOT NULL, + kanji_id INTEGER NOT NULL, + occurrence_count INTEGER NOT NULL, + PRIMARY KEY(line_id, kanji_id), + FOREIGN KEY(line_id) REFERENCES imm_subtitle_lines(line_id) ON DELETE CASCADE, + FOREIGN KEY(kanji_id) REFERENCES imm_kanji(id) ON DELETE CASCADE + ); `); db.exec(` - CREATE INDEX IF NOT EXISTS 
idx_events_session_ts - ON imm_session_events(session_id, ts_ms DESC) - `); - db.exec(` - CREATE INDEX IF NOT EXISTS idx_events_type_ts - ON imm_session_events(event_type, ts_ms DESC) - `); - db.exec(` - CREATE INDEX IF NOT EXISTS idx_rollups_day_video - ON imm_daily_rollups(rollup_day, video_id) - `); - db.exec(` - CREATE INDEX IF NOT EXISTS idx_rollups_month_video - ON imm_monthly_rollups(rollup_month, video_id) - `); - db.exec(` - CREATE INDEX IF NOT EXISTS idx_words_headword_word_reading - ON imm_words(headword, word, reading) - `); - db.exec(` - CREATE INDEX IF NOT EXISTS idx_kanji_kanji - ON imm_kanji(kanji) + CREATE TABLE IF NOT EXISTS imm_media_art( + video_id INTEGER PRIMARY KEY, + anilist_id INTEGER, + cover_url TEXT, + cover_blob BLOB, + title_romaji TEXT, + title_english TEXT, + episodes_total INTEGER, + fetched_at_ms INTEGER NOT NULL, + CREATED_DATE INTEGER, + LAST_UPDATE_DATE INTEGER, + FOREIGN KEY(video_id) REFERENCES imm_videos(video_id) ON DELETE CASCADE + ); `); if (currentVersion?.schema_version === 1) { @@ -299,6 +619,134 @@ export function ensureSchema(db: DatabaseSync): void { dropColumnIfExists(db, 'imm_sessions', 'updated_at_ms'); } + if (currentVersion?.schema_version && currentVersion.schema_version < 5) { + migrateLegacyAnimeMetadata(db); + } + + if (currentVersion?.schema_version && currentVersion.schema_version < 6) { + addColumnIfMissing(db, 'imm_words', 'part_of_speech', 'TEXT'); + addColumnIfMissing(db, 'imm_words', 'pos1', 'TEXT'); + addColumnIfMissing(db, 'imm_words', 'pos2', 'TEXT'); + addColumnIfMissing(db, 'imm_words', 'pos3', 'TEXT'); + } + + if (currentVersion?.schema_version && currentVersion.schema_version < 7) { + db.exec(` + CREATE TABLE IF NOT EXISTS imm_subtitle_lines( + line_id INTEGER PRIMARY KEY AUTOINCREMENT, + session_id INTEGER NOT NULL, + event_id INTEGER, + video_id INTEGER NOT NULL, + anime_id INTEGER, + line_index INTEGER NOT NULL, + segment_start_ms INTEGER, + segment_end_ms INTEGER, + text TEXT NOT NULL, + 
CREATED_DATE INTEGER, + LAST_UPDATE_DATE INTEGER, + FOREIGN KEY(session_id) REFERENCES imm_sessions(session_id) ON DELETE CASCADE, + FOREIGN KEY(event_id) REFERENCES imm_session_events(event_id) ON DELETE SET NULL, + FOREIGN KEY(video_id) REFERENCES imm_videos(video_id) ON DELETE CASCADE, + FOREIGN KEY(anime_id) REFERENCES imm_anime(anime_id) ON DELETE SET NULL + ) + `); + db.exec(` + CREATE TABLE IF NOT EXISTS imm_word_line_occurrences( + line_id INTEGER NOT NULL, + word_id INTEGER NOT NULL, + occurrence_count INTEGER NOT NULL, + PRIMARY KEY(line_id, word_id), + FOREIGN KEY(line_id) REFERENCES imm_subtitle_lines(line_id) ON DELETE CASCADE, + FOREIGN KEY(word_id) REFERENCES imm_words(id) ON DELETE CASCADE + ) + `); + db.exec(` + CREATE TABLE IF NOT EXISTS imm_kanji_line_occurrences( + line_id INTEGER NOT NULL, + kanji_id INTEGER NOT NULL, + occurrence_count INTEGER NOT NULL, + PRIMARY KEY(line_id, kanji_id), + FOREIGN KEY(line_id) REFERENCES imm_subtitle_lines(line_id) ON DELETE CASCADE, + FOREIGN KEY(kanji_id) REFERENCES imm_kanji(id) ON DELETE CASCADE + ) + `); + } + + db.exec(` + CREATE INDEX IF NOT EXISTS idx_anime_normalized_title + ON imm_anime(normalized_title_key) + `); + db.exec(` + CREATE INDEX IF NOT EXISTS idx_anime_anilist_id + ON imm_anime(anilist_id) + `); + db.exec(` + CREATE INDEX IF NOT EXISTS idx_videos_anime_id + ON imm_videos(anime_id) + `); + db.exec(` + CREATE INDEX IF NOT EXISTS idx_sessions_video_started + ON imm_sessions(video_id, started_at_ms DESC) + `); + db.exec(` + CREATE INDEX IF NOT EXISTS idx_sessions_status_started + ON imm_sessions(status, started_at_ms DESC) + `); + db.exec(` + CREATE INDEX IF NOT EXISTS idx_telemetry_session_sample + ON imm_session_telemetry(session_id, sample_ms DESC) + `); + db.exec(` + CREATE INDEX IF NOT EXISTS idx_events_session_ts + ON imm_session_events(session_id, ts_ms DESC) + `); + db.exec(` + CREATE INDEX IF NOT EXISTS idx_events_type_ts + ON imm_session_events(event_type, ts_ms DESC) + `); + 
db.exec(` + CREATE INDEX IF NOT EXISTS idx_rollups_day_video + ON imm_daily_rollups(rollup_day, video_id) + `); + db.exec(` + CREATE INDEX IF NOT EXISTS idx_rollups_month_video + ON imm_monthly_rollups(rollup_month, video_id) + `); + db.exec(` + CREATE INDEX IF NOT EXISTS idx_words_headword_word_reading + ON imm_words(headword, word, reading) + `); + db.exec(` + CREATE INDEX IF NOT EXISTS idx_kanji_kanji + ON imm_kanji(kanji) + `); + db.exec(` + CREATE INDEX IF NOT EXISTS idx_subtitle_lines_session_line + ON imm_subtitle_lines(session_id, line_index) + `); + db.exec(` + CREATE INDEX IF NOT EXISTS idx_subtitle_lines_video_line + ON imm_subtitle_lines(video_id, line_index) + `); + db.exec(` + CREATE INDEX IF NOT EXISTS idx_subtitle_lines_anime_line + ON imm_subtitle_lines(anime_id, line_index) + `); + db.exec(` + CREATE INDEX IF NOT EXISTS idx_word_line_occurrences_word + ON imm_word_line_occurrences(word_id, line_id) + `); + db.exec(` + CREATE INDEX IF NOT EXISTS idx_kanji_line_occurrences_kanji + ON imm_kanji_line_occurrences(kanji_id, line_id) + `); + + if (currentVersion?.schema_version && currentVersion.schema_version < SCHEMA_VERSION) { + db.exec('DELETE FROM imm_daily_rollups'); + db.exec('DELETE FROM imm_monthly_rollups'); + db.exec(`UPDATE imm_rollup_state SET state_value = 0 WHERE state_key = 'last_rollup_sample_ms'`); + } + db.exec(` INSERT INTO imm_schema_version(schema_version, applied_at_ms) VALUES (${SCHEMA_VERSION}, ${Date.now()}) @@ -328,12 +776,21 @@ export function createTrackerPreparedStatements(db: DatabaseSync): TrackerPrepar `), wordUpsertStmt: db.prepare(` INSERT INTO imm_words ( - headword, word, reading, first_seen, last_seen, frequency + headword, word, reading, part_of_speech, pos1, pos2, pos3, first_seen, last_seen, frequency ) VALUES ( - ?, ?, ?, ?, ?, 1 + ?, ?, ?, ?, ?, ?, ?, ?, ?, 1 ) ON CONFLICT(headword, word, reading) DO UPDATE SET frequency = COALESCE(frequency, 0) + 1, + part_of_speech = CASE + WHEN 
COALESCE(NULLIF(imm_words.part_of_speech, ''), 'other') = 'other' + AND COALESCE(NULLIF(excluded.part_of_speech, ''), '') <> '' + THEN excluded.part_of_speech + ELSE imm_words.part_of_speech + END, + pos1 = COALESCE(NULLIF(imm_words.pos1, ''), excluded.pos1), + pos2 = COALESCE(NULLIF(imm_words.pos2, ''), excluded.pos2), + pos3 = COALESCE(NULLIF(imm_words.pos3, ''), excluded.pos3), first_seen = MIN(COALESCE(first_seen, excluded.first_seen), excluded.first_seen), last_seen = MAX(COALESCE(last_seen, excluded.last_seen), excluded.last_seen) `), @@ -348,9 +805,93 @@ export function createTrackerPreparedStatements(db: DatabaseSync): TrackerPrepar first_seen = MIN(COALESCE(first_seen, excluded.first_seen), excluded.first_seen), last_seen = MAX(COALESCE(last_seen, excluded.last_seen), excluded.last_seen) `), + subtitleLineInsertStmt: db.prepare(` + INSERT INTO imm_subtitle_lines ( + session_id, event_id, video_id, anime_id, line_index, segment_start_ms, + segment_end_ms, text, CREATED_DATE, LAST_UPDATE_DATE + ) VALUES ( + ?, ?, ?, ?, ?, ?, ?, ?, ?, ? + ) + `), + wordIdSelectStmt: db.prepare(` + SELECT id FROM imm_words + WHERE headword = ? AND word = ? AND reading = ? + `), + kanjiIdSelectStmt: db.prepare(` + SELECT id FROM imm_kanji + WHERE kanji = ? + `), + wordLineOccurrenceUpsertStmt: db.prepare(` + INSERT INTO imm_word_line_occurrences ( + line_id, word_id, occurrence_count + ) VALUES ( + ?, ?, ? + ) + ON CONFLICT(line_id, word_id) DO UPDATE SET + occurrence_count = imm_word_line_occurrences.occurrence_count + excluded.occurrence_count + `), + kanjiLineOccurrenceUpsertStmt: db.prepare(` + INSERT INTO imm_kanji_line_occurrences ( + line_id, kanji_id, occurrence_count + ) VALUES ( + ?, ?, ? + ) + ON CONFLICT(line_id, kanji_id) DO UPDATE SET + occurrence_count = imm_kanji_line_occurrences.occurrence_count + excluded.occurrence_count + `), + videoAnimeIdSelectStmt: db.prepare(` + SELECT anime_id FROM imm_videos + WHERE video_id = ? 
+ `), }; } +function incrementWordAggregate( + stmts: TrackerPreparedStatements, + occurrence: Extract['wordOccurrences'][number], + firstSeen: number, + lastSeen: number, +): number { + for (let i = 0; i < occurrence.occurrenceCount; i += 1) { + stmts.wordUpsertStmt.run( + occurrence.headword, + occurrence.word, + occurrence.reading, + occurrence.partOfSpeech, + occurrence.pos1, + occurrence.pos2, + occurrence.pos3, + firstSeen, + lastSeen, + ); + } + const row = stmts.wordIdSelectStmt.get( + occurrence.headword, + occurrence.word, + occurrence.reading, + ) as { id: number } | null; + if (!row?.id) { + throw new Error(`Failed to resolve imm_words id for ${occurrence.headword}`); + } + return row.id; +} + +function incrementKanjiAggregate( + stmts: TrackerPreparedStatements, + occurrence: Extract['kanjiOccurrences'][number], + firstSeen: number, + lastSeen: number, +): number { + for (let i = 0; i < occurrence.occurrenceCount; i += 1) { + stmts.kanjiUpsertStmt.run(occurrence.kanji, firstSeen, lastSeen); + } + const row = stmts.kanjiIdSelectStmt.get(occurrence.kanji) as { id: number } | null; + if (!row?.id) { + throw new Error(`Failed to resolve imm_kanji id for ${occurrence.kanji}`); + } + return row.id; +} + export function executeQueuedWrite(write: QueuedWrite, stmts: TrackerPreparedStatements): void { if (write.kind === 'telemetry') { stmts.telemetryInsertStmt.run( @@ -379,6 +920,10 @@ export function executeQueuedWrite(write: QueuedWrite, stmts: TrackerPreparedSta write.headword, write.word, write.reading, + write.partOfSpeech, + write.pos1, + write.pos2, + write.pos3, write.firstSeen, write.lastSeen, ); @@ -388,6 +933,31 @@ export function executeQueuedWrite(write: QueuedWrite, stmts: TrackerPreparedSta stmts.kanjiUpsertStmt.run(write.kanji, write.firstSeen, write.lastSeen); return; } + if (write.kind === 'subtitleLine') { + const animeRow = stmts.videoAnimeIdSelectStmt.get(write.videoId) as { anime_id: number | null } | null; + const lineResult = 
stmts.subtitleLineInsertStmt.run( + write.sessionId, + null, + write.videoId, + animeRow?.anime_id ?? null, + write.lineIndex, + write.segmentStartMs ?? null, + write.segmentEndMs ?? null, + write.text, + Date.now(), + Date.now(), + ); + const lineId = Number(lineResult.lastInsertRowid); + for (const occurrence of write.wordOccurrences) { + const wordId = incrementWordAggregate(stmts, occurrence, write.firstSeen, write.lastSeen); + stmts.wordLineOccurrenceUpsertStmt.run(lineId, wordId, occurrence.occurrenceCount); + } + for (const occurrence of write.kanjiOccurrences) { + const kanjiId = incrementKanjiAggregate(stmts, occurrence, write.firstSeen, write.lastSeen); + stmts.kanjiLineOccurrenceUpsertStmt.run(lineId, kanjiId, occurrence.occurrenceCount); + } + return; + } stmts.eventInsertStmt.run( write.sessionId, diff --git a/src/core/services/immersion-tracker/types.ts b/src/core/services/immersion-tracker/types.ts index e7810b1..674cad4 100644 --- a/src/core/services/immersion-tracker/types.ts +++ b/src/core/services/immersion-tracker/types.ts @@ -1,4 +1,4 @@ -export const SCHEMA_VERSION = 3; +export const SCHEMA_VERSION = 7; export const DEFAULT_QUEUE_CAP = 1_000; export const DEFAULT_BATCH_SIZE = 25; export const DEFAULT_FLUSH_INTERVAL_MS = 500; @@ -29,6 +29,9 @@ export const EVENT_PAUSE_END = 8; export interface ImmersionTrackerOptions { dbPath: string; policy?: ImmersionTrackerPolicy; + resolveLegacyVocabularyPos?: ( + row: LegacyVocabularyPosRow, + ) => Promise; } export interface ImmersionTrackerPolicy { @@ -72,6 +75,7 @@ export interface SessionState extends TelemetryAccumulator { lastPauseStartMs: number | null; isPaused: boolean; pendingTelemetry: boolean; + markedWatched: boolean; } interface QueuedTelemetryWrite { @@ -118,6 +122,10 @@ interface QueuedWordWrite { headword: string; word: string; reading: string; + partOfSpeech: string; + pos1: string; + pos2: string; + pos3: string; firstSeen: number; lastSeen: number; } @@ -129,11 +137,42 @@ interface 
QueuedKanjiWrite { lastSeen: number; } +export interface CountedWordOccurrence { + headword: string; + word: string; + reading: string; + partOfSpeech: string; + pos1: string; + pos2: string; + pos3: string; + occurrenceCount: number; +} + +export interface CountedKanjiOccurrence { + kanji: string; + occurrenceCount: number; +} + +interface QueuedSubtitleLineWrite { + kind: 'subtitleLine'; + sessionId: number; + videoId: number; + lineIndex: number; + segmentStartMs: number | null; + segmentEndMs: number | null; + text: string; + wordOccurrences: CountedWordOccurrence[]; + kanjiOccurrences: CountedKanjiOccurrence[]; + firstSeen: number; + lastSeen: number; +} + export type QueuedWrite = | QueuedTelemetryWrite | QueuedEventWrite | QueuedWordWrite - | QueuedKanjiWrite; + | QueuedKanjiWrite + | QueuedSubtitleLineWrite; export interface VideoMetadata { sourceType: number; @@ -152,8 +191,33 @@ export interface VideoMetadata { metadataJson: string | null; } +export interface ParsedAnimeVideoMetadata { + animeId: number | null; + parsedBasename: string | null; + parsedTitle: string | null; + parsedSeason: number | null; + parsedEpisode: number | null; + parserSource: string | null; + parserConfidence: number | null; + parseMetadataJson: string | null; +} + +export interface ParsedAnimeVideoGuess { + parsedBasename: string | null; + parsedTitle: string; + parsedSeason: number | null; + parsedEpisode: number | null; + parserSource: 'guessit' | 'fallback'; + parserConfidence: number; + parseMetadataJson: string; +} + export interface SessionSummaryQueryRow { + sessionId: number; videoId: number | null; + canonicalTitle: string | null; + animeId: number | null; + animeTitle: string | null; startedAtMs: number; endedAtMs: number | null; totalWatchedMs: number; @@ -166,6 +230,82 @@ export interface SessionSummaryQueryRow { lookupHits: number; } +export interface VocabularyStatsRow { + wordId: number; + headword: string; + word: string; + reading: string; + partOfSpeech: string 
| null; + pos1: string | null; + pos2: string | null; + pos3: string | null; + frequency: number; + firstSeen: number; + lastSeen: number; +} + +export interface VocabularyCleanupSummary { + scanned: number; + kept: number; + deleted: number; + repaired: number; +} + +export interface LegacyVocabularyPosRow { + headword: string; + word: string; + reading: string | null; +} + +export interface LegacyVocabularyPosResolution { + headword: string; + reading: string; + partOfSpeech: string; + pos1: string; + pos2: string; + pos3: string; +} + +export interface KanjiStatsRow { + kanjiId: number; + kanji: string; + frequency: number; + firstSeen: number; + lastSeen: number; +} + +export interface WordOccurrenceRow { + animeId: number | null; + animeTitle: string | null; + videoId: number; + videoTitle: string; + sessionId: number; + lineIndex: number; + segmentStartMs: number | null; + segmentEndMs: number | null; + text: string; + occurrenceCount: number; +} + +export interface KanjiOccurrenceRow { + animeId: number | null; + animeTitle: string | null; + videoId: number; + videoTitle: string; + sessionId: number; + lineIndex: number; + segmentStartMs: number | null; + segmentEndMs: number | null; + text: string; + occurrenceCount: number; +} + +export interface SessionEventRow { + eventType: number; + tsMs: number; + payload: string | null; +} + export interface SessionTimelineRow { sampleMs: number; totalWatchedMs: number; @@ -200,3 +340,180 @@ export interface ProbeMetadata { bitrateKbps: number | null; audioCodecId: number | null; } + +export interface MediaArtRow { + videoId: number; + anilistId: number | null; + coverUrl: string | null; + coverBlob: Buffer | null; + titleRomaji: string | null; + titleEnglish: string | null; + episodesTotal: number | null; + fetchedAtMs: number; +} + +export interface MediaLibraryRow { + videoId: number; + canonicalTitle: string; + totalSessions: number; + totalActiveMs: number; + totalCards: number; + totalWordsSeen: number; + 
lastWatchedMs: number; + hasCoverArt: number; +} + +export interface MediaDetailRow { + videoId: number; + canonicalTitle: string; + totalSessions: number; + totalActiveMs: number; + totalCards: number; + totalWordsSeen: number; + totalLinesSeen: number; + totalLookupCount: number; + totalLookupHits: number; +} + +export interface AnimeLibraryRow { + animeId: number; + canonicalTitle: string; + anilistId: number | null; + totalSessions: number; + totalActiveMs: number; + totalCards: number; + totalWordsSeen: number; + episodeCount: number; + episodesTotal: number | null; + lastWatchedMs: number; +} + +export interface AnimeDetailRow { + animeId: number; + canonicalTitle: string; + anilistId: number | null; + titleRomaji: string | null; + titleEnglish: string | null; + titleNative: string | null; + totalSessions: number; + totalActiveMs: number; + totalCards: number; + totalWordsSeen: number; + totalLinesSeen: number; + totalLookupCount: number; + totalLookupHits: number; + episodeCount: number; + lastWatchedMs: number; +} + +export interface AnimeAnilistEntryRow { + anilistId: number; + titleRomaji: string | null; + titleEnglish: string | null; + season: number | null; +} + +export interface AnimeEpisodeRow { + animeId: number; + videoId: number; + canonicalTitle: string; + parsedTitle: string | null; + season: number | null; + episode: number | null; + durationMs: number; + watched: number; + totalSessions: number; + totalActiveMs: number; + totalCards: number; + totalWordsSeen: number; + lastWatchedMs: number; +} + +export interface StreakCalendarRow { + epochDay: number; + totalActiveMin: number; +} + +export interface AnimeWordRow { + wordId: number; + headword: string; + word: string; + reading: string; + partOfSpeech: string | null; + frequency: number; +} + +export interface EpisodesPerDayRow { + epochDay: number; + episodeCount: number; +} + +export interface NewAnimePerDayRow { + epochDay: number; + newAnimeCount: number; +} + +export interface 
WatchTimePerAnimeRow { + epochDay: number; + animeId: number; + animeTitle: string; + totalActiveMin: number; +} + +export interface WordDetailRow { + wordId: number; + headword: string; + word: string; + reading: string; + partOfSpeech: string | null; + pos1: string | null; + pos2: string | null; + pos3: string | null; + frequency: number; + firstSeen: number; + lastSeen: number; +} + +export interface WordAnimeAppearanceRow { + animeId: number; + animeTitle: string; + occurrenceCount: number; +} + +export interface SimilarWordRow { + wordId: number; + headword: string; + word: string; + reading: string; + frequency: number; +} + +export interface KanjiDetailRow { + kanjiId: number; + kanji: string; + frequency: number; + firstSeen: number; + lastSeen: number; +} + +export interface KanjiAnimeAppearanceRow { + animeId: number; + animeTitle: string; + occurrenceCount: number; +} + +export interface KanjiWordRow { + wordId: number; + headword: string; + word: string; + reading: string; + frequency: number; +} + +export interface EpisodeCardEventRow { + eventId: number; + sessionId: number; + tsMs: number; + cardsDelta: number; + noteIds: number[]; +} diff --git a/src/core/services/tokenizer/annotation-stage.ts b/src/core/services/tokenizer/annotation-stage.ts index c263757..e957696 100644 --- a/src/core/services/tokenizer/annotation-stage.ts +++ b/src/core/services/tokenizer/annotation-stage.ts @@ -133,6 +133,17 @@ function isFrequencyExcludedByPos( ); } +export function shouldExcludeTokenFromVocabularyPersistence( + token: MergedToken, + options: Pick = {}, +): boolean { + return isFrequencyExcludedByPos( + token, + resolvePos1Exclusions(options), + resolvePos2Exclusions(options), + ); +} + function applyFrequencyMarking( tokens: MergedToken[], pos1Exclusions: ReadonlySet, diff --git a/src/core/services/tokenizer/part-of-speech.ts b/src/core/services/tokenizer/part-of-speech.ts new file mode 100644 index 0000000..b396e2a --- /dev/null +++ 
b/src/core/services/tokenizer/part-of-speech.ts @@ -0,0 +1,56 @@ +import { PartOfSpeech } from '../../../types'; + +function normalizePosTag(value: string | null | undefined): string { + return typeof value === 'string' ? value.trim() : ''; +} + +export function isPartOfSpeechValue(value: unknown): value is PartOfSpeech { + return typeof value === 'string' && Object.values(PartOfSpeech).includes(value as PartOfSpeech); +} + +export function mapMecabPos1ToPartOfSpeech(pos1: string | null | undefined): PartOfSpeech { + switch (normalizePosTag(pos1)) { + case '名詞': + return PartOfSpeech.noun; + case '動詞': + return PartOfSpeech.verb; + case '形容詞': + return PartOfSpeech.i_adjective; + case '形状詞': + case '形容動詞': + return PartOfSpeech.na_adjective; + case '助詞': + return PartOfSpeech.particle; + case '助動詞': + return PartOfSpeech.bound_auxiliary; + case '記号': + case '補助記号': + return PartOfSpeech.symbol; + default: + return PartOfSpeech.other; + } +} + +export function deriveStoredPartOfSpeech(input: { + partOfSpeech?: string | null; + pos1?: string | null; +}): PartOfSpeech { + const pos1Parts = normalizePosTag(input.pos1) + .split('|') + .map((part) => part.trim()) + .filter((part) => part.length > 0); + + if (pos1Parts.length > 0) { + const derivedParts = [...new Set(pos1Parts.map((part) => mapMecabPos1ToPartOfSpeech(part)))]; + if (derivedParts.length === 1) { + return derivedParts[0]!; + } + return PartOfSpeech.other; + } + + if (isPartOfSpeechValue(input.partOfSpeech)) { + return input.partOfSpeech; + } + + return PartOfSpeech.other; +} diff --git a/src/mecab-tokenizer.ts b/src/mecab-tokenizer.ts index 60bd5ca..d2e1bdb 100644 --- a/src/mecab-tokenizer.ts +++ b/src/mecab-tokenizer.ts @@ -19,34 +19,12 @@ import * as childProcess from 'child_process'; import { PartOfSpeech, Token, MecabStatus } from './types'; import { createLogger } from './logger'; +import { mapMecabPos1ToPartOfSpeech } from './core/services/tokenizer/part-of-speech'; export { PartOfSpeech }; const log 
= createLogger('mecab'); -function mapPartOfSpeech(pos1: string): PartOfSpeech { - switch (pos1) { - case '名詞': - return PartOfSpeech.noun; - case '動詞': - return PartOfSpeech.verb; - case '形容詞': - return PartOfSpeech.i_adjective; - case '形状詞': - case '形容動詞': - return PartOfSpeech.na_adjective; - case '助詞': - return PartOfSpeech.particle; - case '助動詞': - return PartOfSpeech.bound_auxiliary; - case '記号': - case '補助記号': - return PartOfSpeech.symbol; - default: - return PartOfSpeech.other; - } -} - export function parseMecabLine(line: string): Token | null { if (!line || line === 'EOS' || line.trim() === '') { return null; @@ -73,7 +51,7 @@ export function parseMecabLine(line: string): Token | null { return { word: surface, - partOfSpeech: mapPartOfSpeech(pos1), + partOfSpeech: mapMecabPos1ToPartOfSpeech(pos1), pos1, pos2, pos3, @@ -446,4 +424,4 @@ export class MecabTokenizer { } } -export { mapPartOfSpeech }; +export { mapMecabPos1ToPartOfSpeech as mapPartOfSpeech };