feat(immersion): add anime metadata, occurrence tracking, and schema upgrades

- Add imm_anime table with AniList integration
- Add imm_subtitle_lines, imm_word_line_occurrences, imm_kanji_line_occurrences
- Add POS fields (part_of_speech, pos1, pos2, pos3) to imm_words
- Add anime metadata parsing with guessit fallback
- Add video duration tracking and watched status
- Add episode, streak, trend, and word/kanji detail queries
- Deduplicate subtitle line recording within sessions
- Pass Anki note IDs through card mining callback chain
This commit is contained in:
2026-03-14 22:13:42 -07:00
parent ee95e86ad5
commit f005f542a3
19 changed files with 5231 additions and 122 deletions

View File

@@ -16,6 +16,7 @@ test('guessAnilistMediaInfo uses guessit output when available', async () => {
});
assert.deepEqual(result, {
title: 'Guessit Title',
season: null,
episode: 7,
source: 'guessit',
});
@@ -29,6 +30,7 @@ test('guessAnilistMediaInfo falls back to parser when guessit fails', async () =
});
assert.deepEqual(result, {
title: 'My Anime',
season: 1,
episode: 3,
source: 'fallback',
});
@@ -52,6 +54,7 @@ test('guessAnilistMediaInfo uses basename for guessit input', async () => {
]);
assert.deepEqual(result, {
title: 'Rascal Does Not Dream of Bunny Girl Senpai',
season: null,
episode: 1,
source: 'guessit',
});
@@ -67,6 +70,7 @@ test('guessAnilistMediaInfo joins multi-part guessit titles', async () => {
});
assert.deepEqual(result, {
title: 'Rascal Does not Dream of Bunny Girl Senpai',
season: null,
episode: 1,
source: 'guessit',
});

View File

@@ -7,6 +7,7 @@ const ANILIST_GRAPHQL_URL = 'https://graphql.anilist.co';
export interface AnilistMediaGuess {
title: string;
season: number | null;
episode: number | null;
source: 'guessit' | 'fallback';
}
@@ -56,7 +57,7 @@ interface AnilistSaveEntryData {
};
}
function runGuessit(target: string): Promise<string> {
export function runGuessit(target: string): Promise<string> {
return new Promise((resolve, reject) => {
childProcess.execFile(
'guessit',
@@ -73,7 +74,7 @@ function runGuessit(target: string): Promise<string> {
});
}
type GuessAnilistMediaInfoDeps = {
export interface GuessAnilistMediaInfoDeps {
runGuessit: (target: string) => Promise<string>;
};
@@ -215,8 +216,9 @@ export async function guessAnilistMediaInfo(
const parsed = JSON.parse(stdout) as Record<string, unknown>;
const title = readGuessitTitle(parsed.title);
const episode = firstPositiveInteger(parsed.episode);
const season = firstPositiveInteger(parsed.season);
if (title) {
return { title, episode, source: 'guessit' };
return { title, season, episode, source: 'guessit' };
}
} catch {
// Ignore guessit failures and fall back to internal parser.
@@ -230,6 +232,7 @@ export async function guessAnilistMediaInfo(
}
return {
title: parsed.title.trim(),
season: parsed.season,
episode: parsed.episode,
source: 'fallback',
};

View File

@@ -0,0 +1,239 @@
import assert from 'node:assert/strict';
import fs from 'node:fs';
import os from 'node:os';
import path from 'node:path';
import test from 'node:test';
import { createCoverArtFetcher, stripFilenameTags } from './cover-art-fetcher.js';
import { Database } from '../immersion-tracker/sqlite.js';
import { ensureSchema, getOrCreateVideoRecord } from '../immersion-tracker/storage.js';
import { getCoverArt, upsertCoverArt } from '../immersion-tracker/query.js';
import { SOURCE_TYPE_LOCAL } from '../immersion-tracker/types.js';
// Create a fresh temp directory and return a SQLite path inside it, so each
// test gets an isolated database file.
function makeDbPath(): string {
  const tempDir = fs.mkdtempSync(path.join(os.tmpdir(), 'subminer-cover-art-test-'));
  return path.join(tempDir, 'immersion.sqlite');
}
// Remove the temp directory containing the test database (best-effort).
function cleanupDbPath(dbPath: string): void {
  const containingDir = path.dirname(dbPath);
  fs.rmSync(containingDir, { recursive: true, force: true });
}
// Filename-normalization cases drawn from real Jellyfin/scene-release titles:
// leading "[Group]" tags, "S2 - 05:", "E03:", and trailing codec/year/group
// noise must all be stripped down to the bare series title.
test('stripFilenameTags normalizes common media-title formats', () => {
  assert.equal(
    stripFilenameTags('[Jellyfin/direct] The Eminence in Shadow S01E05 I Am...'),
    'The Eminence in Shadow',
  );
  assert.equal(
    stripFilenameTags(
      '[Foxtrot] Kono Subarashii Sekai ni Shukufuku wo! S2 - 05: Servitude for this Masked Knight!',
    ),
    'Kono Subarashii Sekai ni Shukufuku wo!',
  );
  assert.equal(
    stripFilenameTags('Kono Subarashii Sekai ni Shukufuku wo! E03: A Panty Treasure'),
    'Kono Subarashii Sekai ni Shukufuku wo!',
  );
  // Full Sonarr-style name: year parens, absolute episode number, bracketed
  // codec tags, and a "-Group" suffix all removed in one pass.
  assert.equal(
    stripFilenameTags(
      'Little Witch Academia (2017) - S01E05 - 005 - Pact of the Dragon [Bluray-1080p][10bit][h265][FLAC 2.0][JA]-FumeiRaws.mkv',
    ),
    'Little Witch Academia',
  );
});
// When a cover row already has a URL but no blob, fetchIfMissing must download
// the bytes from that URL (exactly one fetch, no AniList re-search) and keep
// the previously stored metadata intact.
test('fetchIfMissing backfills a missing blob from an existing cover URL', async () => {
  const dbPath = makeDbPath();
  const db = new Database(dbPath);
  ensureSchema(db);
  const videoId = getOrCreateVideoRecord(db, 'local:/tmp/cover-fetcher-test.mkv', {
    canonicalTitle: 'Cover Fetcher Test',
    sourcePath: '/tmp/cover-fetcher-test.mkv',
    sourceUrl: null,
    sourceType: SOURCE_TYPE_LOCAL,
  });
  // Seed a row with a URL but a null blob — the backfill target state.
  upsertCoverArt(db, videoId, {
    anilistId: 7,
    coverUrl: 'https://images.test/cover.jpg',
    coverBlob: null,
    titleRomaji: 'Test Title',
    titleEnglish: 'Test Title',
    episodesTotal: 12,
  });
  const fetchCalls: string[] = [];
  const originalFetch = globalThis.fetch;
  // Stub fetch: record every URL and serve 4 bytes of fake image data.
  globalThis.fetch = (async (input: RequestInfo | URL) => {
    const url = String(input);
    fetchCalls.push(url);
    assert.equal(url, 'https://images.test/cover.jpg');
    return new Response(new Uint8Array([1, 2, 3, 4]), {
      status: 200,
      headers: { 'Content-Type': 'image/jpeg' },
    });
  }) as typeof fetch;
  try {
    // No-op rate limiter: the backfill path must not need AniList at all.
    const fetcher = createCoverArtFetcher(
      {
        acquire: async () => {},
        recordResponse: () => {},
      },
      console,
    );
    const fetched = await fetcher.fetchIfMissing(
      db,
      videoId,
      '[Jellyfin] Little Witch Academia S02E05 - 025 - Pact of the Dragon (2020) [1080p].mkv',
    );
    const stored = getCoverArt(db, videoId);
    assert.equal(fetched, true);
    assert.equal(fetchCalls.length, 1);
    assert.equal(stored?.coverBlob?.length, 4);
    assert.equal(stored?.titleEnglish, 'Test Title');
  } finally {
    // Always restore the real fetch and remove the temp database.
    globalThis.fetch = originalFetch;
    db.close();
    cleanupDbPath(dbPath);
  }
});
// Wrap an arbitrary payload in a 200 JSON Response, mimicking an AniList reply.
function createJsonResponse(payload: unknown): Response {
  const body = JSON.stringify(payload);
  const init: ResponseInit = {
    status: 200,
    headers: { 'content-type': 'application/json' },
  };
  return new Response(body, init);
}
// guessit output should drive the AniList search: the "<title> Season <n>"
// candidate is tried first, and when it returns no media the plain-title
// candidate is used and its best match is persisted.
test('fetchIfMissing uses guessit primary title and season when available', async () => {
  const dbPath = makeDbPath();
  const db = new Database(dbPath);
  ensureSchema(db);
  const videoId = getOrCreateVideoRecord(db, 'local:/tmp/cover-fetcher-season-test.mkv', {
    canonicalTitle: '[Jellyfin] Little Witch Academia S02E05 - 025 - Pact of the Dragon (2020) [1080p].mkv',
    sourcePath: '/tmp/cover-fetcher-season-test.mkv',
    sourceUrl: null,
    sourceType: SOURCE_TYPE_LOCAL,
  });
  const searchCalls: Array<{ search: string }> = [];
  const originalFetch = globalThis.fetch;
  // Stub AniList GraphQL: an empty page for the "Season 2" candidate, one
  // media entry (id 19) for the plain title. The subsequent cover-image
  // download hits this stub with no JSON body and fails harmlessly.
  globalThis.fetch = ((input: RequestInfo | URL, init?: RequestInit) => {
    const raw = (init?.body as string | undefined) ?? '';
    const payload = JSON.parse(raw) as { variables: { search: string } };
    const search = payload.variables.search;
    searchCalls.push({ search });
    if (search.includes('Season 2')) {
      return Promise.resolve(createJsonResponse({ data: { Page: { media: [] } } }));
    }
    return Promise.resolve(
      createJsonResponse({
        data: {
          Page: {
            media: [
              {
                id: 19,
                episodes: 24,
                coverImage: { large: 'https://images.test/cover.jpg', medium: null },
                title: { romaji: 'Little Witch Academia', english: 'Little Witch Academia', native: null },
              },
            ],
          },
        },
      }),
    );
  }) as typeof fetch;
  try {
    const fetcher = createCoverArtFetcher(
      {
        acquire: async () => {},
        recordResponse: () => {},
      },
      console,
      {
        // Deterministic guessit stub — avoids spawning the real binary.
        runGuessit: async () =>
          JSON.stringify({ title: 'Little Witch Academia', season: 2, episode: 5 }),
      },
    );
    const fetched = await fetcher.fetchIfMissing(db, videoId, 'School Vlog S01E01');
    const stored = getCoverArt(db, videoId);
    assert.equal(fetched, true);
    assert.equal(searchCalls.length, 2);
    // The season-qualified candidate must be tried before the plain title.
    assert.equal(searchCalls[0]!.search, 'Little Witch Academia Season 2');
    assert.equal(stored?.anilistId, 19);
  } finally {
    globalThis.fetch = originalFetch;
    db.close();
    cleanupDbPath(dbPath);
  }
});
// When the guessit dependency throws, fetchIfMissing must fall back to the
// internal parser: exactly one AniList search using the parsed title
// ("School Vlog"), whose match is then persisted.
test('fetchIfMissing falls back to internal parser when guessit throws', async () => {
  const dbPath = makeDbPath();
  const db = new Database(dbPath);
  ensureSchema(db);
  const videoId = getOrCreateVideoRecord(db, 'local:/tmp/cover-fetcher-fallback-test.mkv', {
    canonicalTitle: 'School Vlog S01E01',
    sourcePath: '/tmp/cover-fetcher-fallback-test.mkv',
    sourceUrl: null,
    sourceType: SOURCE_TYPE_LOCAL,
  });
  let requestCount = 0;
  const originalFetch = globalThis.fetch;
  // Stub AniList GraphQL: asserts the fallback-parsed search term and returns
  // a single matching media entry (id 21).
  globalThis.fetch = ((input: RequestInfo | URL, init?: RequestInit) => {
    requestCount += 1;
    const raw = (init?.body as string | undefined) ?? '';
    const payload = JSON.parse(raw) as { variables: { search: string } };
    assert.equal(payload.variables.search, 'School Vlog');
    return Promise.resolve(
      createJsonResponse({
        data: {
          Page: {
            media: [
              {
                id: 21,
                episodes: 12,
                coverImage: { large: 'https://images.test/fallback-cover.jpg', medium: null },
                title: { romaji: 'School Vlog', english: 'School Vlog', native: null },
              },
            ],
          },
        },
      }),
    );
  }) as typeof fetch;
  try {
    const fetcher = createCoverArtFetcher(
      {
        acquire: async () => {},
        recordResponse: () => {},
      },
      console,
      {
        // Simulate an unavailable guessit binary to force the fallback path.
        runGuessit: async () => {
          throw new Error('guessit unavailable');
        },
      },
    );
    const fetched = await fetcher.fetchIfMissing(db, videoId, 'Ignored Title');
    const stored = getCoverArt(db, videoId);
    assert.equal(fetched, true);
    assert.equal(requestCount, 1);
    assert.equal(stored?.anilistId, 21);
  } finally {
    globalThis.fetch = originalFetch;
    db.close();
    cleanupDbPath(dbPath);
  }
});

View File

@@ -0,0 +1,405 @@
import type { AnilistRateLimiter } from './rate-limiter';
import type { DatabaseSync } from '../immersion-tracker/sqlite';
import { getCoverArt, upsertCoverArt, updateAnimeAnilistInfo } from '../immersion-tracker/query';
import { guessAnilistMediaInfo, runGuessit, type GuessAnilistMediaInfoDeps } from './anilist-updater';
// AniList GraphQL endpoint used for all cover-art searches.
const ANILIST_GRAPHQL_URL = 'https://graphql.anilist.co';
// How long a cached "no match" row suppresses re-querying AniList (5 min).
const NO_MATCH_RETRY_MS = 5 * 60 * 1000;
// Search query: top 5 anime matches with episode counts, season info, cover
// images, and all title variants.
const SEARCH_QUERY = `
query ($search: String!) {
Page(perPage: 5) {
media(search: $search, type: ANIME) {
id
episodes
season
seasonYear
coverImage { large medium }
title { romaji english native }
}
}
}
`;
// Shape of a single media entry returned by SEARCH_QUERY.
interface AnilistMedia {
  id: number;
  episodes: number | null;
  season: string | null;
  seasonYear: number | null;
  coverImage: { large: string | null; medium: string | null } | null;
  title: { romaji: string | null; english: string | null; native: string | null } | null;
}
// Envelope of an AniList GraphQL search response (data and/or errors).
interface AnilistSearchResponse {
  data?: {
    Page?: {
      media?: AnilistMedia[];
    };
  };
  errors?: Array<{ message?: string }>;
}
// Public surface of the fetcher created by createCoverArtFetcher.
export interface CoverArtFetcher {
  fetchIfMissing(db: DatabaseSync, videoId: number, canonicalTitle: string): Promise<boolean>;
}
// Minimal logging contract; satisfied by `console`.
interface Logger {
  info(msg: string, ...args: unknown[]): void;
  warn(msg: string, ...args: unknown[]): void;
  error(msg: string, ...args: unknown[]): void;
}
// Title/season/episode parsed from a filename, tagged with which parser won.
interface CoverArtCandidate {
  title: string;
  source: 'guessit' | 'fallback';
  season: number | null;
  episode: number | null;
}
// Optional guessit override, mainly for tests.
interface CoverArtFetcherOptions {
  runGuessit?: GuessAnilistMediaInfoDeps['runGuessit'];
}
/**
 * Reduce a raw media filename/title to a bare series title by stripping the
 * file extension, release-group tags, season/episode markers, bare codec
 * tags, and year parentheses.
 */
export function stripFilenameTags(raw: string): string {
  // Ordered (pattern, replacement) passes; order matters because later passes
  // assume earlier normalization (e.g. dots already converted to spaces).
  const passes: Array<[RegExp, string]> = [
    [/\.[A-Za-z0-9]{2,4}$/, ''], // trailing file extension
    [/^(?:\s*\[[^\]]*\]\s*)+/, ''], // leading [Group] tags
    [/[._]+/g, ' '], // dot/underscore word separators
    [/\s+-\s+S\d+E\d+.*$/i, ''], // " - S01E05 ..." onward
    [/\s+-\s+\d{2,}(\s+-\s+\d+)?(\s+-.+)?$/, ''], // " - 005 - ..." onward
    [/\s+S\d+E\d+.*$/i, ''], // " S01E05 ..." onward
    [/\s+S\d+\s*[- ]\s*\d+[: -].*$/i, ''], // " S2 - 05: ..." onward
    [/\s+E\d+[: -].*$/i, ''], // " E03: ..." onward
    [/^S\d+E\d+\s*[- ]\s*/i, ''], // leading "S01E05 - "
    [/\s*\[[^\]]*\]\s*/g, ' '], // remaining bracketed tags
    [/\s*\([^)]*\d{4}[^)]*\)\s*/g, ' '], // "(2017)"-style year parens
    [
      // Codec/source tags that can appear without brackets, plus any glued
      // suffix like "-1080p" or ".x264".
      /\b(WEBDL|WEBRip|BluRay|BDRip|HDTV|DVDRip|x264|x265|H\.?264|H\.?265|AV1|AAC|FLAC|Opus|10bit|8bit|1080p|720p|480p|2160p|4K)\b[-.\w]*/gi,
      '',
    ],
    [/\s*-\s*[\w]+$/, ''], // trailing "-GroupName"
  ];
  const cleaned = passes.reduce(
    (acc, [pattern, replacement]) => acc.replace(pattern, replacement),
    raw,
  );
  return cleaned.trim().replace(/\s{2,}/g, ' ');
}
// Drop any "Season N" phrase from a title, collapsing leftover double spaces.
function removeSeasonHint(title: string): string {
  const withoutSeason = title.replace(/\bseason\s*\d+\b/gi, '');
  return withoutSeason.replace(/\s{2,}/g, ' ').trim();
}
// Canonical comparison form: lowercase, trimmed, single-spaced.
function normalizeTitle(text: string): string {
  const collapsed = text.replace(/\s+/g, ' ');
  return collapsed.trim().toLowerCase();
}
// Collect every season number hinted at in a title, via "Season N" phrases
// or "sN"/"S01E05"-style markers.
function extractCandidateSeasonHints(text: string): Set<number> {
  // Inline normalization (trimmed, lowercased, single-spaced) so the patterns
  // below only need to handle lowercase input.
  const normalized = text.trim().toLowerCase().replace(/\s+/g, ' ');
  const seasons = new Set<number>();
  const patterns = [/\bseason\s*(\d{1,2})\b/gi, /\bs(\d{1,2})(?:\b|\D)/gi];
  for (const pattern of patterns) {
    for (const match of normalized.matchAll(pattern)) {
      const value = Number.parseInt(match[1]!, 10);
      if (Number.isInteger(value)) {
        seasons.add(value);
      }
    }
  }
  return seasons;
}
// True when any of the candidate titles hints at exactly this season number.
// Season 0/null never matches.
function isSeasonMentioned(titles: string[], season: number | null): boolean {
  if (!season) {
    return false;
  }
  return titles.some((title) => extractCandidateSeasonHints(title).has(season));
}
/**
 * Score AniList search results against the parsed title/episode/season and
 * return the best match (id plus a display title), or null when there are no
 * candidates at all.
 */
function pickBestSearchResult(
  title: string,
  episode: number | null,
  season: number | null,
  media: AnilistMedia[],
): { id: number; title: string } | null {
  // Compare against both the raw title and a season-hint-free variant,
  // deduplicated and normalized.
  const cleanedTitle = removeSeasonHint(title);
  const targets = [title, cleanedTitle]
    .map(normalizeTitle)
    .map((value) => value.trim())
    .filter((value, index, all) => value.length > 0 && all.indexOf(value) === index);
  // Prefer entries whose known episode count can contain the parsed episode;
  // fall back to the full list if that filter removes everything.
  const filtered = episode === null
    ? media
    : media.filter((item) => {
        const total = item.episodes;
        return total === null || total >= episode;
      });
  const candidates = filtered.length > 0 ? filtered : media;
  if (candidates.length === 0) {
    return null;
  }
  const scored = candidates.map((item) => {
    const candidateTitles = [
      item.title?.romaji,
      item.title?.english,
      item.title?.native,
    ]
      .filter((value): value is string => typeof value === 'string')
      .map((value) => normalizeTitle(value));
    let score = 0;
    for (const target of targets) {
      // Exact title match dominates; partial containment scores lower.
      if (candidateTitles.includes(target)) {
        score += 120;
        continue;
      }
      if (candidateTitles.some((itemTitle) => itemTitle.includes(target))) {
        score += 30;
      }
      if (candidateTitles.some((itemTitle) => target.includes(itemTitle))) {
        score += 10;
      }
    }
    // Small boosts for matching episode totals and season hints in titles.
    if (episode !== null && item.episodes === episode) {
      score += 20;
    }
    if (season !== null && isSeasonMentioned(candidateTitles, season)) {
      score += 15;
    }
    return { item, score };
  });
  scored.sort((a, b) => {
    if (b.score !== a.score) return b.score - a.score;
    // NOTE(review): ties prefer the HIGHER AniList id (typically the newer
    // entry) — confirm this is the intended tie-break direction.
    return b.item.id - a.item.id;
  });
  const selected = scored[0]!;
  const selectedTitle = selected.item.title?.english ?? selected.item.title?.romaji ?? selected.item.title?.native ?? title;
  return { id: selected.item.id, title: selectedTitle };
}
// Build the ordered, deduplicated list of AniList search strings for a parsed
// title. A "<title> Season <n>" variant is added only for guessit results with
// a season greater than 1 (season 1 is usually the unqualified title).
function buildSearchCandidates(parsed: CoverArtCandidate): string[] {
  const rawTitles: string[] = [parsed.title];
  if (parsed.source === 'guessit' && parsed.season !== null && parsed.season > 1) {
    rawTitles.push(`${parsed.title} Season ${parsed.season}`);
  }
  const seen = new Set<string>();
  const candidates: string[] = [];
  for (const raw of rawTitles) {
    const trimmed = raw.trim();
    if (trimmed.length > 0 && !seen.has(trimmed)) {
      seen.add(trimmed);
      candidates.push(trimmed);
    }
  }
  return candidates;
}
// Run one rate-limited AniList search. A 429 is reported via the rateLimited
// flag (empty media) rather than thrown; other non-OK statuses throw.
async function searchAnilist(
  rateLimiter: AnilistRateLimiter,
  title: string,
): Promise<{ media: AnilistMedia[]; rateLimited: boolean }> {
  // Wait for the shared limiter before every request, and feed the response
  // headers back so it can track remaining quota.
  await rateLimiter.acquire();
  const response = await fetch(ANILIST_GRAPHQL_URL, {
    method: 'POST',
    headers: { 'Content-Type': 'application/json', Accept: 'application/json' },
    body: JSON.stringify({ query: SEARCH_QUERY, variables: { search: title } }),
  });
  rateLimiter.recordResponse(response.headers);
  if (response.status === 429) {
    return { media: [], rateLimited: true };
  }
  if (!response.ok) {
    throw new Error(`Anilist search failed: ${response.status} ${response.statusText}`);
  }
  const body = (await response.json()) as AnilistSearchResponse;
  const mediaList = body.data?.Page?.media ?? [];
  return { media: mediaList, rateLimited: false };
}
// Best-effort image download: any network error or non-2xx status yields null.
async function downloadImage(url: string): Promise<Buffer | null> {
  try {
    const response = await fetch(url);
    if (!response.ok) {
      return null;
    }
    const bytes = await response.arrayBuffer();
    return Buffer.from(bytes);
  } catch {
    return null;
  }
}
/**
 * Build a CoverArtFetcher that resolves AniList cover art for a video.
 *
 * Resolution order:
 *  1. Return immediately when a cover blob is already cached.
 *  2. Backfill the blob when only a cover URL is cached.
 *  3. Otherwise parse the canonical title (guessit or the fallback parser),
 *     search AniList with up to two title candidates plus the tag-stripped
 *     title, download the art, and persist both the cover-art row and the
 *     anime AniList metadata.
 *
 * No-match results are cached and only retried after NO_MATCH_RETRY_MS.
 * A rate-limited search never caches a no-match.
 *
 * @param rateLimiter Shared AniList limiter consulted before every request.
 * @param logger Sink for info/warn/error diagnostics (e.g. `console`).
 * @param options Optional guessit override, used by tests.
 */
export function createCoverArtFetcher(
  rateLimiter: AnilistRateLimiter,
  logger: Logger,
  options: CoverArtFetcherOptions = {},
): CoverArtFetcher {
  // Parse the canonical title via guessit (or the injected stub) with the
  // internal parser as fallback; null when nothing usable was extracted.
  const resolveMediaInfo = async (canonicalTitle: string): Promise<CoverArtCandidate | null> => {
    const parsed = await guessAnilistMediaInfo(null, canonicalTitle, {
      runGuessit: options.runGuessit ?? runGuessit,
    });
    if (!parsed) {
      return null;
    }
    return {
      title: parsed.title,
      season: parsed.season,
      episode: parsed.episode,
      source: parsed.source,
    };
  };
  return {
    async fetchIfMissing(db, videoId, canonicalTitle): Promise<boolean> {
      const existing = getCoverArt(db, videoId);
      // Already have the image bytes: nothing to do.
      if (existing?.coverBlob) {
        return true;
      }
      // Have a URL but no bytes: backfill the blob without re-searching.
      if (existing?.coverUrl) {
        const coverBlob = await downloadImage(existing.coverUrl);
        if (coverBlob) {
          upsertCoverArt(db, videoId, {
            anilistId: existing.anilistId,
            coverUrl: existing.coverUrl,
            coverBlob,
            titleRomaji: existing.titleRomaji,
            titleEnglish: existing.titleEnglish,
            episodesTotal: existing.episodesTotal,
          });
          return true;
        }
      }
      // Recent cached no-match: skip AniList until the retry window passes.
      if (
        existing &&
        existing.coverUrl === null &&
        existing.anilistId === null &&
        Date.now() - existing.fetchedAtMs < NO_MATCH_RETRY_MS
      ) {
        return false;
      }
      const cleaned = stripFilenameTags(canonicalTitle);
      if (!cleaned) {
        logger.warn('cover-art: empty title after stripping tags for videoId=%d', videoId);
        // Cache the failure so we do not reprocess this title on every call.
        upsertCoverArt(db, videoId, {
          anilistId: null,
          coverUrl: null,
          coverBlob: null,
          titleRomaji: null,
          titleEnglish: null,
          episodesTotal: null,
        });
        return false;
      }
      const parsedInfo = await resolveMediaInfo(canonicalTitle);
      const searchBase = parsedInfo?.title ?? cleaned;
      const searchCandidates = parsedInfo
        ? buildSearchCandidates(parsedInfo)
        : [cleaned];
      // Always keep the tag-stripped title as a last-resort candidate.
      const effectiveCandidates = searchCandidates.includes(cleaned)
        ? searchCandidates
        : [...searchCandidates, cleaned];
      let selected: AnilistMedia | null = null;
      let rateLimited = false;
      for (const candidate of effectiveCandidates) {
        logger.info('cover-art: searching Anilist for "%s" (videoId=%d)', candidate, videoId);
        try {
          const result = await searchAnilist(rateLimiter, candidate);
          if (result.rateLimited) {
            // Fix: previously `rateLimited` was overwritten every iteration,
            // so a 429 on one candidate followed by an empty (non-throttled)
            // result on the next was cached as a permanent no-match. Latch
            // the flag and stop — further candidates would be throttled too.
            rateLimited = true;
            break;
          }
          if (result.media.length === 0) {
            continue;
          }
          const picked = pickBestSearchResult(
            searchBase,
            parsedInfo?.episode ?? null,
            parsedInfo?.season ?? null,
            result.media,
          );
          if (picked) {
            const match = result.media.find((media) => media.id === picked.id);
            if (match) {
              selected = match;
              break;
            }
          }
        } catch (err) {
          logger.error('cover-art: Anilist search error for "%s": %s', candidate, err);
          return false;
        }
      }
      if (rateLimited) {
        logger.warn('cover-art: rate-limited by Anilist, skipping videoId=%d', videoId);
        return false;
      }
      if (!selected) {
        logger.info('cover-art: no Anilist results for "%s", caching no-match', searchBase);
        upsertCoverArt(db, videoId, {
          anilistId: null,
          coverUrl: null,
          coverBlob: null,
          titleRomaji: null,
          titleEnglish: null,
          episodesTotal: null,
        });
        return false;
      }
      const coverUrl = selected.coverImage?.large ?? selected.coverImage?.medium ?? null;
      let coverBlob: Buffer | null = null;
      if (coverUrl) {
        coverBlob = await downloadImage(coverUrl);
      }
      upsertCoverArt(db, videoId, {
        anilistId: selected.id,
        coverUrl,
        coverBlob,
        titleRomaji: selected.title?.romaji ?? null,
        titleEnglish: selected.title?.english ?? null,
        episodesTotal: selected.episodes ?? null,
      });
      // Mirror the AniList identity onto the anime row for library queries.
      updateAnimeAnilistInfo(db, videoId, {
        anilistId: selected.id,
        titleRomaji: selected.title?.romaji ?? null,
        titleEnglish: selected.title?.english ?? null,
        titleNative: selected.title?.native ?? null,
        episodesTotal: selected.episodes ?? null,
      });
      logger.info(
        'cover-art: cached art for videoId=%d anilistId=%d title="%s"',
        videoId,
        selected.id,
        selected.title?.romaji ?? searchBase,
      );
      return true;
    },
  };
}

View File

@@ -12,6 +12,7 @@ import {
resolveBoundedInt,
} from './immersion-tracker/reducer';
import type { QueuedWrite } from './immersion-tracker/types';
import { PartOfSpeech, type MergedToken } from '../../types';
type ImmersionTrackerService = import('./immersion-tracker-service').ImmersionTrackerService;
type ImmersionTrackerServiceCtor =
@@ -26,6 +27,34 @@ async function loadTrackerCtor(): Promise<ImmersionTrackerServiceCtor> {
return trackerCtor;
}
// Await the tracker's in-flight anime-metadata update for the active video,
// reaching into private state for test purposes; no-op when no session exists.
async function waitForPendingAnimeMetadata(tracker: ImmersionTrackerService): Promise<void> {
  const internals = tracker as unknown as {
    sessionState: { videoId: number } | null;
    pendingAnimeMetadataUpdates?: Map<number, Promise<void>>;
  };
  const activeVideoId = internals.sessionState?.videoId;
  if (!activeVideoId) {
    return;
  }
  await internals.pendingAnimeMetadataUpdates?.get(activeVideoId);
}
// Build a MergedToken test fixture: a zero-span, unknown, generic-POS token
// with any fields overridden by the caller.
function makeMergedToken(overrides: Partial<MergedToken>): MergedToken {
  const defaults: MergedToken = {
    surface: '',
    reading: '',
    headword: '',
    startPos: 0,
    endPos: 0,
    partOfSpeech: PartOfSpeech.other,
    pos1: '',
    pos2: '',
    pos3: '',
    isMerged: true,
    isKnown: false,
    isNPlusOneTarget: false,
  };
  return { ...defaults, ...overrides };
}
function makeDbPath(): string {
const dir = fs.mkdtempSync(path.join(os.tmpdir(), 'subminer-immersion-test-'));
return path.join(dir, 'immersion.sqlite');
@@ -222,6 +251,308 @@ test('persists and retrieves minimum immersion tracking fields', async () => {
}
});
test('recordSubtitleLine persists counted allowed tokenized vocabulary rows and subtitle-line occurrences', async () => {
const dbPath = makeDbPath();
let tracker: ImmersionTrackerService | null = null;
try {
const Ctor = await loadTrackerCtor();
tracker = new Ctor({ dbPath });
tracker.handleMediaChange('/tmp/Little Witch Academia S02E04.mkv', 'Episode 4');
await waitForPendingAnimeMetadata(tracker);
tracker.recordSubtitleLine('猫 猫 日 日 は 知っている', 0, 1, [
makeMergedToken({
surface: '猫',
headword: '猫',
reading: 'ねこ',
partOfSpeech: PartOfSpeech.noun,
pos1: '名詞',
pos2: '一般',
}),
makeMergedToken({
surface: '猫',
headword: '猫',
reading: 'ねこ',
partOfSpeech: PartOfSpeech.noun,
pos1: '名詞',
pos2: '一般',
}),
makeMergedToken({
surface: 'は',
headword: 'は',
reading: 'は',
partOfSpeech: PartOfSpeech.particle,
pos1: '助詞',
pos2: '係助詞',
}),
makeMergedToken({
surface: '知っている',
headword: '知る',
reading: 'しっている',
partOfSpeech: PartOfSpeech.other,
pos1: '動詞',
pos2: '自立',
}),
]);
const privateApi = tracker as unknown as {
flushTelemetry: (force?: boolean) => void;
flushNow: () => void;
};
privateApi.flushTelemetry(true);
privateApi.flushNow();
const db = new Database(dbPath);
const rows = db
.prepare(
`SELECT headword, word, reading, part_of_speech, pos1, pos2, frequency
FROM imm_words
ORDER BY id ASC`,
)
.all() as Array<{
headword: string;
word: string;
reading: string;
part_of_speech: string;
pos1: string;
pos2: string;
frequency: number;
}>;
const lineRows = db
.prepare(
`SELECT video_id, anime_id, line_index, segment_start_ms, segment_end_ms, text
FROM imm_subtitle_lines
ORDER BY line_id ASC`,
)
.all() as Array<{
video_id: number;
anime_id: number | null;
line_index: number;
segment_start_ms: number | null;
segment_end_ms: number | null;
text: string;
}>;
const wordOccurrenceRows = db
.prepare(
`SELECT o.occurrence_count, w.headword, w.word, w.reading
FROM imm_word_line_occurrences o
JOIN imm_words w ON w.id = o.word_id
ORDER BY o.line_id ASC, o.word_id ASC`,
)
.all() as Array<{
occurrence_count: number;
headword: string;
word: string;
reading: string;
}>;
const kanjiOccurrenceRows = db
.prepare(
`SELECT o.occurrence_count, k.kanji
FROM imm_kanji_line_occurrences o
JOIN imm_kanji k ON k.id = o.kanji_id
ORDER BY o.line_id ASC, k.kanji ASC`,
)
.all() as Array<{
occurrence_count: number;
kanji: string;
}>;
db.close();
assert.deepEqual(rows, [
{
headword: '猫',
word: '猫',
reading: 'ねこ',
part_of_speech: PartOfSpeech.noun,
pos1: '名詞',
pos2: '一般',
frequency: 2,
},
{
headword: '知る',
word: '知っている',
reading: 'しっている',
part_of_speech: PartOfSpeech.verb,
pos1: '動詞',
pos2: '自立',
frequency: 1,
},
]);
assert.equal(lineRows.length, 1);
assert.equal(lineRows[0]?.line_index, 1);
assert.equal(lineRows[0]?.segment_start_ms, 0);
assert.equal(lineRows[0]?.segment_end_ms, 1000);
assert.equal(lineRows[0]?.text, '猫 猫 日 日 は 知っている');
assert.ok(lineRows[0]?.video_id);
assert.ok(lineRows[0]?.anime_id);
assert.deepEqual(wordOccurrenceRows, [
{
occurrence_count: 2,
headword: '猫',
word: '猫',
reading: 'ねこ',
},
{
occurrence_count: 1,
headword: '知る',
word: '知っている',
reading: 'しっている',
},
]);
assert.deepEqual(kanjiOccurrenceRows, [
{
occurrence_count: 2,
kanji: '日',
},
{
occurrence_count: 2,
kanji: '猫',
},
{
occurrence_count: 1,
kanji: '知',
},
]);
} finally {
tracker?.destroy();
cleanupDbPath(dbPath);
}
});
test('handleMediaChange links parsed anime metadata on the active video row', async () => {
const dbPath = makeDbPath();
let tracker: ImmersionTrackerService | null = null;
try {
const Ctor = await loadTrackerCtor();
tracker = new Ctor({ dbPath });
tracker.handleMediaChange('/tmp/Little Witch Academia S02E05.mkv', 'Episode 5');
await waitForPendingAnimeMetadata(tracker);
const privateApi = tracker as unknown as {
db: DatabaseSync;
sessionState: { videoId: number } | null;
};
const videoId = privateApi.sessionState?.videoId;
assert.ok(videoId);
const row = privateApi.db
.prepare(
`
SELECT
v.anime_id,
v.parsed_basename,
v.parsed_title,
v.parsed_season,
v.parsed_episode,
v.parser_source,
a.canonical_title AS anime_title,
a.anilist_id
FROM imm_videos v
LEFT JOIN imm_anime a ON a.anime_id = v.anime_id
WHERE v.video_id = ?
`,
)
.get(videoId) as {
anime_id: number | null;
parsed_basename: string | null;
parsed_title: string | null;
parsed_season: number | null;
parsed_episode: number | null;
parser_source: string | null;
anime_title: string | null;
anilist_id: number | null;
} | null;
assert.ok(row);
assert.ok(row?.anime_id);
assert.equal(row?.parsed_basename, 'Little Witch Academia S02E05.mkv');
assert.equal(row?.parsed_title, 'Little Witch Academia');
assert.equal(row?.parsed_season, 2);
assert.equal(row?.parsed_episode, 5);
assert.ok(row?.parser_source === 'guessit' || row?.parser_source === 'fallback');
assert.equal(row?.anime_title, 'Little Witch Academia');
assert.equal(row?.anilist_id, null);
} finally {
tracker?.destroy();
cleanupDbPath(dbPath);
}
});
test('handleMediaChange reuses the same provisional anime row across matching files', async () => {
const dbPath = makeDbPath();
let tracker: ImmersionTrackerService | null = null;
try {
const Ctor = await loadTrackerCtor();
tracker = new Ctor({ dbPath });
tracker.handleMediaChange('/tmp/Little Witch Academia S02E05.mkv', 'Episode 5');
await waitForPendingAnimeMetadata(tracker);
tracker.handleMediaChange('/tmp/Little Witch Academia S02E06.mkv', 'Episode 6');
await waitForPendingAnimeMetadata(tracker);
const privateApi = tracker as unknown as {
db: DatabaseSync;
};
const rows = privateApi.db
.prepare(
`
SELECT
v.source_path,
v.anime_id,
v.parsed_episode,
a.canonical_title AS anime_title,
a.anilist_id
FROM imm_videos v
LEFT JOIN imm_anime a ON a.anime_id = v.anime_id
WHERE v.source_path IN (?, ?)
ORDER BY v.source_path
`,
)
.all('/tmp/Little Witch Academia S02E05.mkv', '/tmp/Little Witch Academia S02E06.mkv') as
Array<{
source_path: string | null;
anime_id: number | null;
parsed_episode: number | null;
anime_title: string | null;
anilist_id: number | null;
}>;
assert.equal(rows.length, 2);
assert.ok(rows[0]?.anime_id);
assert.equal(rows[0]?.anime_id, rows[1]?.anime_id);
assert.deepEqual(
rows.map((row) => ({
sourcePath: row.source_path,
parsedEpisode: row.parsed_episode,
animeTitle: row.anime_title,
anilistId: row.anilist_id,
})),
[
{
sourcePath: '/tmp/Little Witch Academia S02E05.mkv',
parsedEpisode: 5,
animeTitle: 'Little Witch Academia',
anilistId: null,
},
{
sourcePath: '/tmp/Little Witch Academia S02E06.mkv',
parsedEpisode: 6,
animeTitle: 'Little Witch Academia',
anilistId: null,
},
],
);
} finally {
tracker?.destroy();
cleanupDbPath(dbPath);
}
});
test('applies configurable queue, flush, and retention policy', async () => {
const dbPath = makeDbPath();
let tracker: ImmersionTrackerService | null = null;

View File

@@ -1,7 +1,8 @@
import path from 'node:path';
import * as fs from 'node:fs';
import { createLogger } from '../../logger';
import { getLocalVideoMetadata } from './immersion-tracker/metadata';
import type { CoverArtFetcher } from './anilist/cover-art-fetcher';
import { getLocalVideoMetadata, guessAnimeVideoMetadata } from './immersion-tracker/metadata';
import { pruneRetention, runRollupMaintenance } from './immersion-tracker/maintenance';
import { Database, type DatabaseSync } from './immersion-tracker/sqlite';
import { finalizeSessionRecord, startSessionRecord } from './immersion-tracker/session';
@@ -10,23 +11,58 @@ import {
createTrackerPreparedStatements,
ensureSchema,
executeQueuedWrite,
getOrCreateAnimeRecord,
getOrCreateVideoRecord,
linkVideoToAnimeRecord,
type TrackerPreparedStatements,
updateVideoMetadataRecord,
updateVideoTitleRecord,
} from './immersion-tracker/storage';
import {
cleanupVocabularyStats,
getAnimeCoverArt,
getAnimeDailyRollups,
getAnimeAnilistEntries,
getAnimeDetail,
getAnimeEpisodes,
getAnimeLibrary,
getAnimeWords,
getEpisodeCardEvents,
getEpisodeSessions,
getEpisodeWords,
getCoverArt,
getDailyRollups,
getEpisodesPerDay,
getKanjiAnimeAppearances,
getKanjiDetail,
getKanjiWords,
getNewAnimePerDay,
getSimilarWords,
getStreakCalendar,
getKanjiOccurrences,
getKanjiStats,
getMediaDailyRollups,
getMediaDetail,
getMediaLibrary,
getMediaSessions,
getMonthlyRollups,
getQueryHints,
getSessionEvents,
getSessionSummaries,
getSessionTimeline,
getVocabularyStats,
getWatchTimePerAnime,
getWordAnimeAppearances,
getWordDetail,
getWordOccurrences,
getVideoDurationMs,
markVideoWatched,
} from './immersion-tracker/query';
import {
buildVideoKey,
calculateTextMetrics,
extractLineVocabulary,
deriveCanonicalTitle,
isKanji,
isRemoteSource,
normalizeMediaPath,
normalizeText,
@@ -57,19 +93,73 @@ import {
SOURCE_TYPE_LOCAL,
SOURCE_TYPE_REMOTE,
type ImmersionSessionRollupRow,
type EpisodeCardEventRow,
type EpisodesPerDayRow,
type ImmersionTrackerOptions,
type KanjiAnimeAppearanceRow,
type KanjiDetailRow,
type KanjiOccurrenceRow,
type KanjiStatsRow,
type KanjiWordRow,
type LegacyVocabularyPosResolution,
type LegacyVocabularyPosRow,
type AnimeAnilistEntryRow,
type AnimeDetailRow,
type AnimeEpisodeRow,
type AnimeLibraryRow,
type AnimeWordRow,
type MediaArtRow,
type MediaDetailRow,
type MediaLibraryRow,
type NewAnimePerDayRow,
type QueuedWrite,
type SessionEventRow,
type SessionState,
type SessionSummaryQueryRow,
type SessionTimelineRow,
type SimilarWordRow,
type StreakCalendarRow,
type VocabularyCleanupSummary,
type WatchTimePerAnimeRow,
type WordAnimeAppearanceRow,
type WordDetailRow,
type WordOccurrenceRow,
type VocabularyStatsRow,
} from './immersion-tracker/types';
import type { MergedToken } from '../../types';
import { shouldExcludeTokenFromVocabularyPersistence } from './tokenizer/annotation-stage';
import { deriveStoredPartOfSpeech } from './tokenizer/part-of-speech';
export type {
AnimeAnilistEntryRow,
AnimeDetailRow,
AnimeEpisodeRow,
AnimeLibraryRow,
AnimeWordRow,
EpisodeCardEventRow,
EpisodesPerDayRow,
ImmersionSessionRollupRow,
ImmersionTrackerOptions,
ImmersionTrackerPolicy,
KanjiAnimeAppearanceRow,
KanjiDetailRow,
KanjiOccurrenceRow,
KanjiStatsRow,
KanjiWordRow,
MediaArtRow,
MediaDetailRow,
MediaLibraryRow,
NewAnimePerDayRow,
SessionEventRow,
SessionSummaryQueryRow,
SessionTimelineRow,
SimilarWordRow,
StreakCalendarRow,
WatchTimePerAnimeRow,
WordAnimeAppearanceRow,
WordDetailRow,
WordOccurrenceRow,
VocabularyStatsRow,
} from './immersion-tracker/types';
export class ImmersionTrackerService {
@@ -98,9 +188,17 @@ export class ImmersionTrackerService {
private currentVideoKey = '';
private currentMediaPathOrUrl = '';
private readonly preparedStatements: TrackerPreparedStatements;
private coverArtFetcher: CoverArtFetcher | null = null;
private readonly pendingCoverFetches = new Map<number, Promise<boolean>>();
private readonly recordedSubtitleKeys = new Set<string>();
private readonly pendingAnimeMetadataUpdates = new Map<number, Promise<void>>();
private readonly resolveLegacyVocabularyPos:
| ((row: LegacyVocabularyPosRow) => Promise<LegacyVocabularyPosResolution | null>)
| undefined;
constructor(options: ImmersionTrackerOptions) {
this.dbPath = options.dbPath;
this.resolveLegacyVocabularyPos = options.resolveLegacyVocabularyPos;
const parentDir = path.dirname(this.dbPath);
if (!fs.existsSync(parentDir)) {
fs.mkdirSync(parentDir, { recursive: true });
@@ -198,6 +296,8 @@ export class ImmersionTrackerService {
/**
 * Returns summary counters — total/active session counts plus today's episode
 * count and the active-anime count — by delegating to the query module
 * against this service's database handle.
 */
async getQueryHints(): Promise<{
totalSessions: number;
activeSessions: number;
episodesToday: number;
activeAnimeCount: number;
}> {
return getQueryHints(this.db);
}
@@ -210,6 +310,180 @@ export class ImmersionTrackerService {
return getMonthlyRollups(this.db, limit);
}
/** Per-word frequency rows, optionally filtering out the given parts of speech. */
async getVocabularyStats(limit = 100, excludePos?: string[]): Promise<VocabularyStatsRow[]> {
return getVocabularyStats(this.db, limit, excludePos);
}
/**
 * Repairs or removes legacy imm_words rows. POS resolution for legacy rows is
 * delegated to the optional callback supplied via ImmersionTrackerOptions
 * (resolveLegacyVocabularyPos); see cleanupVocabularyStats in the query module.
 */
async cleanupVocabularyStats(): Promise<VocabularyCleanupSummary> {
return cleanupVocabularyStats(this.db, {
resolveLegacyPos: this.resolveLegacyVocabularyPos,
});
}
/** Per-kanji frequency rows. */
async getKanjiStats(limit = 100): Promise<KanjiStatsRow[]> {
return getKanjiStats(this.db, limit);
}
/** Subtitle-line occurrences for one (headword, word, reading) triple, paginated. */
async getWordOccurrences(
headword: string,
word: string,
reading: string,
limit = 100,
offset = 0,
): Promise<WordOccurrenceRow[]> {
return getWordOccurrences(this.db, headword, word, reading, limit, offset);
}
/** Subtitle-line occurrences for a single kanji character, paginated. */
async getKanjiOccurrences(
kanji: string,
limit = 100,
offset = 0,
): Promise<KanjiOccurrenceRow[]> {
return getKanjiOccurrences(this.db, kanji, limit, offset);
}
// Read-only query delegates: each method forwards to the corresponding helper
// in the query module with this service's database handle. Declared async for
// a uniform caller-facing API even though the underlying SQLite calls are
// synchronous.
/** Events for one session (subtitle lines, card mines, …), capped at `limit`. */
async getSessionEvents(sessionId: number, limit = 500): Promise<SessionEventRow[]> {
return getSessionEvents(this.db, sessionId, limit);
}
/** All known videos with aggregate stats. */
async getMediaLibrary(): Promise<MediaLibraryRow[]> {
return getMediaLibrary(this.db);
}
/** Detail row for one video, or null when unknown. */
async getMediaDetail(videoId: number): Promise<MediaDetailRow | null> {
return getMediaDetail(this.db, videoId);
}
async getMediaSessions(videoId: number, limit = 100): Promise<SessionSummaryQueryRow[]> {
return getMediaSessions(this.db, videoId, limit);
}
async getMediaDailyRollups(videoId: number, limit = 90): Promise<ImmersionSessionRollupRow[]> {
return getMediaDailyRollups(this.db, videoId, limit);
}
/** Stored cover art for a video; see ensureCoverArt for on-demand fetching. */
async getCoverArt(videoId: number): Promise<MediaArtRow | null> {
return getCoverArt(this.db, videoId);
}
/** All known anime with aggregate stats (grouped across their episodes). */
async getAnimeLibrary(): Promise<AnimeLibraryRow[]> {
return getAnimeLibrary(this.db);
}
async getAnimeDetail(animeId: number): Promise<AnimeDetailRow | null> {
return getAnimeDetail(this.db, animeId);
}
async getAnimeEpisodes(animeId: number): Promise<AnimeEpisodeRow[]> {
return getAnimeEpisodes(this.db, animeId);
}
/** AniList link rows recorded for an anime. */
async getAnimeAnilistEntries(animeId: number): Promise<AnimeAnilistEntryRow[]> {
return getAnimeAnilistEntries(this.db, animeId);
}
async getAnimeCoverArt(animeId: number): Promise<MediaArtRow | null> {
return getAnimeCoverArt(this.db, animeId);
}
/** Top words seen across an anime's episodes. */
async getAnimeWords(animeId: number, limit = 50): Promise<AnimeWordRow[]> {
return getAnimeWords(this.db, animeId, limit);
}
/** Top words seen in one episode (video). */
async getEpisodeWords(videoId: number, limit = 50): Promise<AnimeWordRow[]> {
return getEpisodeWords(this.db, videoId, limit);
}
async getEpisodeSessions(videoId: number): Promise<SessionSummaryQueryRow[]> {
return getEpisodeSessions(this.db, videoId);
}
/**
 * Manually sets a video's watched flag. Note: this is also set automatically
 * when playback passes ~98% of the known duration (see recordPlaybackPosition).
 */
async setVideoWatched(videoId: number, watched: boolean): Promise<void> {
markVideoWatched(this.db, videoId, watched);
}
/** Card-mined events recorded against one episode (video). */
async getEpisodeCardEvents(videoId: number): Promise<EpisodeCardEventRow[]> {
return getEpisodeCardEvents(this.db, videoId);
}
async getAnimeDailyRollups(animeId: number, limit = 90): Promise<ImmersionSessionRollupRow[]> {
return getAnimeDailyRollups(this.db, animeId, limit);
}
// Trend queries: per-day aggregates over the most recent `days`/`limit` days.
async getStreakCalendar(days = 90): Promise<StreakCalendarRow[]> {
return getStreakCalendar(this.db, days);
}
async getEpisodesPerDay(limit = 90): Promise<EpisodesPerDayRow[]> {
return getEpisodesPerDay(this.db, limit);
}
async getNewAnimePerDay(limit = 90): Promise<NewAnimePerDayRow[]> {
return getNewAnimePerDay(this.db, limit);
}
async getWatchTimePerAnime(limit = 90): Promise<WatchTimePerAnimeRow[]> {
return getWatchTimePerAnime(this.db, limit);
}
// Word/kanji drill-down queries keyed by imm_words / imm_kanji row ids.
async getWordDetail(wordId: number): Promise<WordDetailRow | null> {
return getWordDetail(this.db, wordId);
}
async getWordAnimeAppearances(wordId: number): Promise<WordAnimeAppearanceRow[]> {
return getWordAnimeAppearances(this.db, wordId);
}
async getSimilarWords(wordId: number, limit = 10): Promise<SimilarWordRow[]> {
return getSimilarWords(this.db, wordId, limit);
}
async getKanjiDetail(kanjiId: number): Promise<KanjiDetailRow | null> {
return getKanjiDetail(this.db, kanjiId);
}
async getKanjiAnimeAppearances(kanjiId: number): Promise<KanjiAnimeAppearanceRow[]> {
return getKanjiAnimeAppearances(this.db, kanjiId);
}
/** Words that contain the given kanji. */
async getKanjiWords(kanjiId: number, limit = 20): Promise<KanjiWordRow[]> {
return getKanjiWords(this.db, kanjiId, limit);
}
/** Installs (or clears, with null) the cover-art fetcher used by ensureCoverArt. */
setCoverArtFetcher(fetcher: CoverArtFetcher | null): void {
this.coverArtFetcher = fetcher;
}
/**
 * Ensures cover art exists for a video, fetching it on demand.
 *
 * Returns true when art is already stored or a fetch succeeds, false when no
 * fetcher is installed, the video has no canonical title to search by, or the
 * fetch reports failure. Concurrent calls for the same video are de-duplicated
 * through `pendingCoverFetches`.
 */
async ensureCoverArt(videoId: number): Promise<boolean> {
  const cached = getCoverArt(this.db, videoId);
  if (cached?.coverBlob) {
    return true;
  }
  // Capture the fetcher once. The previous code re-read `this.coverArtFetcher!`
  // inside the async closure; if setCoverArtFetcher(null) ran while the fetch
  // was pending, the non-null assertion would mask a TypeError at runtime.
  const fetcher = this.coverArtFetcher;
  if (!fetcher) {
    return false;
  }
  const inFlight = this.pendingCoverFetches.get(videoId);
  if (inFlight) {
    return await inFlight;
  }
  const fetchPromise = (async (): Promise<boolean> => {
    // Only attempt a fetch when we have a non-empty canonical title to search by.
    const canonicalTitle = getMediaDetail(this.db, videoId)?.canonicalTitle?.trim();
    if (!canonicalTitle) {
      return false;
    }
    return await fetcher.fetchIfMissing(this.db, videoId, canonicalTitle);
  })();
  this.pendingCoverFetches.set(videoId, fetchPromise);
  try {
    return await fetchPromise;
  } finally {
    this.pendingCoverFetches.delete(videoId);
  }
}
handleMediaChange(mediaPath: string | null, mediaTitle: string | null): void {
const normalizedPath = normalizeMediaPath(mediaPath);
const normalizedTitle = normalizeText(mediaTitle);
@@ -254,6 +528,7 @@ export class ImmersionTrackerService {
`Starting immersion session for path=${normalizedPath} videoId=${sessionInfo.videoId}`,
);
this.startSession(sessionInfo.videoId, sessionInfo.startedAtMs);
this.captureAnimeMetadataAsync(sessionInfo.videoId, normalizedPath, normalizedTitle || null);
this.captureVideoMetadataAsync(sessionInfo.videoId, sourceType, normalizedPath);
}
@@ -265,40 +540,110 @@ export class ImmersionTrackerService {
this.updateVideoTitleForActiveSession(normalizedTitle);
}
recordSubtitleLine(text: string, startSec: number, endSec: number): void {
recordSubtitleLine(
text: string,
startSec: number,
endSec: number,
tokens?: MergedToken[] | null,
): void {
if (!this.sessionState || !text.trim()) return;
const cleaned = normalizeText(text);
if (!cleaned) return;
if (!endSec || endSec <= 0) {
return;
}
const startMs = secToMs(startSec);
const subtitleKey = `${startMs}:${cleaned}`;
if (this.recordedSubtitleKeys.has(subtitleKey)) {
return;
}
this.recordedSubtitleKeys.add(subtitleKey);
const nowMs = Date.now();
const nowSec = nowMs / 1000;
const metrics = calculateTextMetrics(cleaned);
const extractedVocabulary = extractLineVocabulary(cleaned);
this.sessionState.currentLineIndex += 1;
this.sessionState.linesSeen += 1;
this.sessionState.wordsSeen += metrics.words;
this.sessionState.tokensSeen += metrics.tokens;
this.sessionState.pendingTelemetry = true;
for (const { headword, word, reading } of extractedVocabulary.words) {
this.recordWrite({
kind: 'word',
const wordOccurrences = new Map<
string,
{
headword: string;
word: string;
reading: string;
partOfSpeech: string;
pos1: string;
pos2: string;
pos3: string;
occurrenceCount: number;
}
>();
for (const token of tokens ?? []) {
if (shouldExcludeTokenFromVocabularyPersistence(token)) {
continue;
}
const headword = normalizeText(token.headword || token.surface);
const word = normalizeText(token.surface || token.headword);
const reading = normalizeText(token.reading);
if (!headword || !word) {
continue;
}
const wordKey = [
headword,
word,
reading,
firstSeen: nowSec,
lastSeen: nowSec,
].join('\u0000');
const storedPartOfSpeech = deriveStoredPartOfSpeech({
partOfSpeech: token.partOfSpeech,
pos1: token.pos1 ?? '',
});
const existing = wordOccurrences.get(wordKey);
if (existing) {
existing.occurrenceCount += 1;
continue;
}
wordOccurrences.set(wordKey, {
headword,
word,
reading,
partOfSpeech: storedPartOfSpeech,
pos1: token.pos1 ?? '',
pos2: token.pos2 ?? '',
pos3: token.pos3 ?? '',
occurrenceCount: 1,
});
}
for (const kanji of extractedVocabulary.kanji) {
const kanjiCounts = new Map<string, number>();
for (const char of cleaned) {
if (!isKanji(char)) {
continue;
}
kanjiCounts.set(char, (kanjiCounts.get(char) ?? 0) + 1);
}
this.recordWrite({
kind: 'kanji',
kind: 'subtitleLine',
sessionId: this.sessionState.sessionId,
videoId: this.sessionState.videoId,
lineIndex: this.sessionState.currentLineIndex,
segmentStartMs: secToMs(startSec),
segmentEndMs: secToMs(endSec),
text: cleaned,
wordOccurrences: Array.from(wordOccurrences.values()),
kanjiOccurrences: Array.from(kanjiCounts.entries()).map(([kanji, occurrenceCount]) => ({
kanji,
occurrenceCount,
})),
firstSeen: nowSec,
lastSeen: nowSec,
});
}
this.recordWrite({
kind: 'event',
@@ -321,6 +666,16 @@ export class ImmersionTrackerService {
});
}
/**
 * Persists the player-reported media duration (in seconds) for the active
 * session's video. No-ops when there is no active session or the duration is
 * non-finite / non-positive, and skips the UPDATE when the stored value is
 * already within one second of the incoming one.
 */
recordMediaDuration(durationSec: number): void {
  if (!this.sessionState) return;
  if (!Number.isFinite(durationSec) || durationSec <= 0) return;
  const { videoId } = this.sessionState;
  const nextDurationMs = Math.round(durationSec * 1000);
  const storedDurationMs = getVideoDurationMs(this.db, videoId);
  // Treat a stored zero as "unknown"; otherwise tolerate sub-second jitter.
  const closeEnough =
    storedDurationMs !== 0 && Math.abs(storedDurationMs - nextDurationMs) <= 1000;
  if (closeEnough) {
    return;
  }
  this.db
    .prepare('UPDATE imm_videos SET duration_ms = ?, LAST_UPDATE_DATE = ? WHERE video_id = ?')
    .run(nextDurationMs, Date.now(), videoId);
}
recordPlaybackPosition(mediaTimeSec: number | null): void {
if (!this.sessionState || mediaTimeSec === null || !Number.isFinite(mediaTimeSec)) {
return;
@@ -391,6 +746,14 @@ export class ImmersionTrackerService {
this.sessionState.lastWallClockMs = nowMs;
this.sessionState.lastMediaMs = mediaMs;
this.sessionState.pendingTelemetry = true;
if (!this.sessionState.markedWatched) {
const durationMs = getVideoDurationMs(this.db, this.sessionState.videoId);
if (durationMs > 0 && mediaMs >= durationMs * 0.98) {
markVideoWatched(this.db, this.sessionState.videoId, true);
this.sessionState.markedWatched = true;
}
}
}
recordPauseState(isPaused: boolean): void {
@@ -454,7 +817,7 @@ export class ImmersionTrackerService {
});
}
recordCardsMined(count = 1): void {
recordCardsMined(count = 1, noteIds?: number[]): void {
if (!this.sessionState) return;
this.sessionState.cardsMined += count;
this.sessionState.pendingTelemetry = true;
@@ -465,7 +828,10 @@ export class ImmersionTrackerService {
eventType: EVENT_CARD_MINED,
wordsDelta: 0,
cardsDelta: count,
payloadJson: sanitizePayload({ cardsMined: count }, this.maxPayloadBytes),
payloadJson: sanitizePayload(
{ cardsMined: count, ...(noteIds?.length ? { noteIds } : {}) },
this.maxPayloadBytes,
),
});
}
@@ -615,6 +981,7 @@ export class ImmersionTrackerService {
private startSession(videoId: number, startedAtMs?: number): void {
const { sessionId, state } = startSessionRecord(this.db, videoId, startedAtMs);
this.sessionState = state;
this.recordedSubtitleKeys.clear();
this.recordWrite({
kind: 'telemetry',
sessionId,
@@ -673,6 +1040,48 @@ export class ImmersionTrackerService {
})();
}
/**
 * Fire-and-forget: parses anime metadata (title/season/episode) for a newly
 * started video and links the video to a (possibly new) imm_anime record.
 * Failures are logged as warnings and never surfaced to the caller. The
 * in-flight promise is tracked in `pendingAnimeMetadataUpdates` keyed by
 * videoId and removed when it settles.
 */
private captureAnimeMetadataAsync(
  videoId: number,
  mediaPath: string | null,
  mediaTitle: string | null,
): void {
  const work = (async (): Promise<void> => {
    try {
      const guess = await guessAnimeVideoMetadata(mediaPath, mediaTitle);
      // Bail out if the service was torn down while parsing, or if the
      // parser produced no usable title.
      if (this.isDestroyed) return;
      if (!guess?.parsedTitle.trim()) return;
      const animeId = getOrCreateAnimeRecord(this.db, {
        parsedTitle: guess.parsedTitle,
        canonicalTitle: guess.parsedTitle,
        anilistId: null,
        titleRomaji: null,
        titleEnglish: null,
        titleNative: null,
        metadataJson: guess.parseMetadataJson,
      });
      linkVideoToAnimeRecord(this.db, videoId, {
        animeId,
        parsedBasename: guess.parsedBasename,
        parsedTitle: guess.parsedTitle,
        parsedSeason: guess.parsedSeason,
        parsedEpisode: guess.parsedEpisode,
        parserSource: guess.parserSource,
        parserConfidence: guess.parserConfidence,
        parseMetadataJson: guess.parseMetadataJson,
      });
    } catch (error) {
      this.logger.warn('Unable to capture anime metadata', (error as Error).message);
    }
  })();
  this.pendingAnimeMetadataUpdates.set(videoId, work);
  void work.finally(() => {
    this.pendingAnimeMetadataUpdates.delete(videoId);
  });
}
private updateVideoTitleForActiveSession(canonicalTitle: string): void {
if (!this.sessionState) return;
updateVideoTitleRecord(this.db, this.sessionState.videoId, canonicalTitle);

View File

@@ -0,0 +1,976 @@
import assert from 'node:assert/strict';
import fs from 'node:fs';
import os from 'node:os';
import path from 'node:path';
import test from 'node:test';
import { Database } from '../sqlite.js';
import {
createTrackerPreparedStatements,
ensureSchema,
getOrCreateAnimeRecord,
getOrCreateVideoRecord,
linkVideoToAnimeRecord,
} from '../storage.js';
import { startSessionRecord } from '../session.js';
import {
cleanupVocabularyStats,
getAnimeDetail,
getAnimeEpisodes,
getAnimeLibrary,
getKanjiOccurrences,
getSessionSummaries,
getVocabularyStats,
getKanjiStats,
getSessionEvents,
getWordOccurrences,
} from '../query.js';
import { SOURCE_TYPE_LOCAL, EVENT_SUBTITLE_LINE } from '../types.js';
// Returns the path of a database file inside a freshly created temp directory,
// so every test operates on an isolated SQLite file.
function makeDbPath(): string {
  const tempDir = fs.mkdtempSync(path.join(os.tmpdir(), 'subminer-imm-query-test-'));
  return path.join(tempDir, 'immersion.sqlite');
}
// Removes the temp directory that holds a test database. On Windows the
// sqlite file can still be held open (EBUSY) when the handle has not been
// finalized yet, so retry up to three times, nudging Bun's GC and blocking
// ~25ms between attempts via Atomics.wait (permitted on Node's main thread).
function cleanupDbPath(dbPath: string): void {
  const dir = path.dirname(dbPath);
  if (!fs.existsSync(dir)) {
    return;
  }
  const bunRuntime = globalThis as typeof globalThis & {
    Bun?: {
      gc?: (force?: boolean) => void;
    };
  };
  const maxAttempts = 3;
  let lastError: NodeJS.ErrnoException | null = null;
  for (let attempt = 0; attempt < maxAttempts; attempt += 1) {
    try {
      fs.rmSync(dir, { recursive: true, force: true });
      return;
    } catch (error) {
      const err = error as NodeJS.ErrnoException;
      lastError = err;
      const retryable = process.platform === 'win32' && err.code === 'EBUSY';
      if (!retryable) {
        throw error;
      }
      bunRuntime.Bun?.gc?.(true);
      Atomics.wait(new Int32Array(new SharedArrayBuffer(4)), 0, 0, 25);
    }
  }
  if (lastError) {
    throw lastError;
  }
}
// Seeds one video + session + a single telemetry row, then verifies that
// getSessionSummaries surfaces the session id and the video's canonical title.
test('getSessionSummaries returns sessionId and canonicalTitle', () => {
const dbPath = makeDbPath();
const db = new Database(dbPath);
try {
ensureSchema(db);
const stmts = createTrackerPreparedStatements(db);
const videoId = getOrCreateVideoRecord(db, 'local:/tmp/query-test.mkv', {
canonicalTitle: 'Query Test Episode',
sourcePath: '/tmp/query-test.mkv',
sourceUrl: null,
sourceType: SOURCE_TYPE_LOCAL,
});
const startedAtMs = 1_000_000;
const { sessionId } = startSessionRecord(db, videoId, startedAtMs);
// Positional telemetry columns; the 5th value (5) is linesSeen, asserted
// below. NOTE(review): remaining column meanings assumed to match
// telemetryInsertStmt's declaration — confirm against storage.ts.
stmts.telemetryInsertStmt.run(
sessionId,
startedAtMs + 1_000,
3_000,
2_500,
5,
10,
10,
1,
2,
1,
0,
0,
0,
0,
0,
startedAtMs + 1_000,
startedAtMs + 1_000,
);
const rows = getSessionSummaries(db, 10);
assert.ok(rows.length >= 1);
const row = rows.find((r) => r.sessionId === sessionId);
assert.ok(row, 'expected to find a row for the created session');
assert.equal(typeof row.sessionId, 'number');
assert.equal(row.sessionId, sessionId);
assert.equal(row.canonicalTitle, 'Query Test Episode');
assert.equal(row.videoId, videoId);
assert.ok(row.linesSeen >= 5);
} finally {
db.close();
cleanupDbPath(dbPath);
}
});
// A session that never produced telemetry must still be listed, with all
// aggregate counters reported as zero.
test('getSessionSummaries with no telemetry returns zero aggregates', () => {
  const dbPath = makeDbPath();
  const db = new Database(dbPath);
  try {
    ensureSchema(db);
    const videoId = getOrCreateVideoRecord(db, 'local:/tmp/no-telemetry.mkv', {
      canonicalTitle: 'No Telemetry',
      sourcePath: '/tmp/no-telemetry.mkv',
      sourceUrl: null,
      sourceType: SOURCE_TYPE_LOCAL,
    });
    const { sessionId } = startSessionRecord(db, videoId, 3_000_000);
    const summary = getSessionSummaries(db, 10).find((row) => row.sessionId === sessionId);
    assert.ok(summary, 'expected to find the session with no telemetry');
    assert.equal(summary.canonicalTitle, 'No Telemetry');
    assert.equal(summary.totalWatchedMs, 0);
    assert.equal(summary.linesSeen, 0);
    assert.equal(summary.cardsMined, 0);
  } finally {
    db.close();
    cleanupDbPath(dbPath);
  }
});
// Upserting 猫 twice and 犬 once should give them frequencies 2 and 1, and
// getVocabularyStats must order the higher-frequency word first.
test('getVocabularyStats returns rows ordered by frequency descending', () => {
  const dbPath = makeDbPath();
  const db = new Database(dbPath);
  try {
    ensureSchema(db);
    const stmts = createTrackerPreparedStatements(db);
    stmts.wordUpsertStmt.run('猫', '猫', 'ねこ', 'noun', '名詞', '一般', '', 1_000, 2_000);
    stmts.wordUpsertStmt.run('猫', '猫', 'ねこ', 'noun', '名詞', '一般', '', 1_000, 3_000);
    stmts.wordUpsertStmt.run('犬', '犬', 'いぬ', 'noun', '名詞', '一般', '', 1_500, 1_500);
    const rows = getVocabularyStats(db, 10);
    assert.ok(rows.length >= 2);
    const catRow = rows.find((row) => row.headword === '猫');
    const dogRow = rows.find((row) => row.headword === '犬');
    assert.ok(catRow, 'expected 猫 row');
    assert.ok(dogRow, 'expected 犬 row');
    assert.equal(catRow.headword, '猫');
    assert.equal(catRow.word, '猫');
    assert.equal(catRow.reading, 'ねこ');
    assert.equal(catRow.frequency, 2);
    assert.equal(typeof catRow.firstSeen, 'number');
    assert.equal(typeof catRow.lastSeen, 'number');
    assert.ok(
      rows.indexOf(catRow) < rows.indexOf(dogRow),
      'higher frequency word should appear first',
    );
  } finally {
    db.close();
    cleanupDbPath(dbPath);
  }
});
// With a fresh schema and no word rows, the query yields an empty list.
test('getVocabularyStats returns empty array when no words exist', () => {
  const dbPath = makeDbPath();
  const db = new Database(dbPath);
  try {
    ensureSchema(db);
    assert.deepEqual(getVocabularyStats(db, 10), []);
  } finally {
    db.close();
    cleanupDbPath(dbPath);
  }
});
test('cleanupVocabularyStats repairs stored POS metadata and removes excluded imm_words rows', async () => {
const dbPath = makeDbPath();
const db = new Database(dbPath);
try {
ensureSchema(db);
db.prepare(
`INSERT INTO imm_words (
headword, word, reading, part_of_speech, pos1, pos2, pos3, first_seen, last_seen, frequency
) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)`,
).run('猫', '猫', 'ねこ', 'noun', '名詞', '一般', '', 1_000, 1_500, 3);
db.prepare(
`INSERT INTO imm_words (
headword, word, reading, part_of_speech, pos1, pos2, pos3, first_seen, last_seen, frequency
) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)`,
).run('知っている', '知っている', '', 'other', '動詞', '自立', '', 1_025, 1_525, 4);
db.prepare(
`INSERT INTO imm_words (
headword, word, reading, part_of_speech, pos1, pos2, pos3, first_seen, last_seen, frequency
) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)`,
).run('は', 'は', 'は', 'particle', '助詞', '係助詞', '', 1_100, 1_600, 9);
db.prepare(
`INSERT INTO imm_words (
headword, word, reading, part_of_speech, pos1, pos2, pos3, first_seen, last_seen, frequency
) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)`,
).run('旧', '旧', '', '', '', '', '', 900, 950, 1);
db.prepare(
`INSERT INTO imm_words (
headword, word, reading, part_of_speech, pos1, pos2, pos3, first_seen, last_seen, frequency
) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)`,
).run('未解決', '未解決', '', '', '', '', '', 901, 951, 1);
const result = await cleanupVocabularyStats(db, {
resolveLegacyPos: async (row) => {
if (row.headword === '旧') {
return {
partOfSpeech: 'noun',
headword: '旧',
reading: 'きゅう',
pos1: '名詞',
pos2: '一般',
pos3: '',
};
}
if (row.headword === '知っている') {
return {
partOfSpeech: 'verb',
headword: '知る',
reading: 'しっている',
pos1: '動詞',
pos2: '自立',
pos3: '',
};
}
return null;
},
});
const rows = getVocabularyStats(db, 10);
const repairedRows = db
.prepare(
`SELECT headword, word, reading, part_of_speech, pos1, pos2
FROM imm_words
ORDER BY headword ASC, word ASC`,
)
.all() as Array<{
headword: string;
word: string;
reading: string;
part_of_speech: string;
pos1: string;
pos2: string;
}>;
assert.deepEqual(result, { scanned: 5, kept: 3, deleted: 2, repaired: 2 });
assert.deepEqual(
rows.map((row) => ({ headword: row.headword, frequency: row.frequency })),
[
{ headword: '知る', frequency: 4 },
{ headword: '猫', frequency: 3 },
{ headword: '旧', frequency: 1 },
],
);
assert.deepEqual(
repairedRows,
[
{
headword: '旧',
word: '旧',
reading: 'きゅう',
part_of_speech: 'noun',
pos1: '名詞',
pos2: '一般',
},
{
headword: '猫',
word: '猫',
reading: 'ねこ',
part_of_speech: 'noun',
pos1: '名詞',
pos2: '一般',
},
{
headword: '知る',
word: '知っている',
reading: 'しっている',
part_of_speech: 'verb',
pos1: '動詞',
pos2: '自立',
},
],
);
} finally {
db.close();
cleanupDbPath(dbPath);
}
});
test('cleanupVocabularyStats merges repaired duplicates instead of violating the imm_words unique key', async () => {
const dbPath = makeDbPath();
const db = new Database(dbPath);
try {
ensureSchema(db);
const videoId = getOrCreateVideoRecord(db, 'local:/tmp/cleanup-merge.mkv', {
canonicalTitle: 'Cleanup Merge',
sourcePath: '/tmp/cleanup-merge.mkv',
sourceUrl: null,
sourceType: SOURCE_TYPE_LOCAL,
});
const { sessionId } = startSessionRecord(db, videoId, 2_000_000);
const duplicateResult = db
.prepare(
`INSERT INTO imm_words (
headword, word, reading, part_of_speech, pos1, pos2, pos3, first_seen, last_seen, frequency
) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)`,
)
.run('知る', '知っている', 'しっている', 'verb', '動詞', '自立', '', 2_000, 2_500, 3);
const legacyResult = db
.prepare(
`INSERT INTO imm_words (
headword, word, reading, part_of_speech, pos1, pos2, pos3, first_seen, last_seen, frequency
) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)`,
)
.run('知っている', '知っている', '', 'other', '動詞', '自立', '', 1_000, 3_000, 4);
const lineResult = db
.prepare(
`INSERT INTO imm_subtitle_lines (
session_id, event_id, video_id, anime_id, line_index, segment_start_ms, segment_end_ms, text, CREATED_DATE, LAST_UPDATE_DATE
) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)`,
)
.run(sessionId, null, videoId, null, 1, 0, 1000, '知っている', 2_000, 2_000);
const lineId = Number(lineResult.lastInsertRowid);
const duplicateId = Number(duplicateResult.lastInsertRowid);
const legacyId = Number(legacyResult.lastInsertRowid);
db.prepare(
`INSERT INTO imm_word_line_occurrences (line_id, word_id, occurrence_count)
VALUES (?, ?, ?)`,
).run(lineId, duplicateId, 2);
db.prepare(
`INSERT INTO imm_word_line_occurrences (line_id, word_id, occurrence_count)
VALUES (?, ?, ?)`,
).run(lineId, legacyId, 1);
const result = await cleanupVocabularyStats(db, {
resolveLegacyPos: async (row) => {
if (row.id !== legacyId) {
return null;
}
return {
partOfSpeech: 'verb',
headword: '知る',
reading: 'しっている',
pos1: '動詞',
pos2: '自立',
pos3: '',
};
},
});
const rows = db
.prepare(
`SELECT id, headword, word, reading, frequency, first_seen, last_seen
FROM imm_words
ORDER BY id ASC`,
)
.all() as Array<{
id: number;
headword: string;
word: string;
reading: string;
frequency: number;
first_seen: number;
last_seen: number;
}>;
const occurrences = getWordOccurrences(db, '知る', '知っている', 'しっている', 10);
assert.deepEqual(result, { scanned: 2, kept: 1, deleted: 1, repaired: 1 });
assert.deepEqual(rows, [
{
id: duplicateId,
headword: '知る',
word: '知っている',
reading: 'しっている',
frequency: 7,
first_seen: 1_000,
last_seen: 3_000,
},
]);
assert.deepEqual(occurrences, [
{
animeId: null,
animeTitle: null,
videoId,
videoTitle: 'Cleanup Merge',
sessionId,
lineIndex: 1,
segmentStartMs: 0,
segmentEndMs: 1000,
text: '知っている',
occurrenceCount: 3,
},
]);
} finally {
db.close();
cleanupDbPath(dbPath);
}
});
// Upserting 日 twice and 月 once should give them frequencies 2 and 1, and
// getKanjiStats must order the higher-frequency kanji first.
test('getKanjiStats returns rows ordered by frequency descending', () => {
  const dbPath = makeDbPath();
  const db = new Database(dbPath);
  try {
    ensureSchema(db);
    const stmts = createTrackerPreparedStatements(db);
    stmts.kanjiUpsertStmt.run('日', 1_000, 2_000);
    stmts.kanjiUpsertStmt.run('日', 1_000, 3_000);
    stmts.kanjiUpsertStmt.run('月', 1_500, 1_500);
    const rows = getKanjiStats(db, 10);
    assert.ok(rows.length >= 2);
    const sunRow = rows.find((row) => row.kanji === '日');
    const moonRow = rows.find((row) => row.kanji === '月');
    assert.ok(sunRow, 'expected 日 row');
    assert.ok(moonRow, 'expected 月 row');
    assert.equal(sunRow.kanji, '日');
    assert.equal(sunRow.frequency, 2);
    assert.equal(typeof sunRow.firstSeen, 'number');
    assert.equal(typeof sunRow.lastSeen, 'number');
    assert.ok(
      rows.indexOf(sunRow) < rows.indexOf(moonRow),
      'higher frequency kanji should appear first',
    );
  } finally {
    db.close();
    cleanupDbPath(dbPath);
  }
});
// With a fresh schema and no kanji rows, the query yields an empty list.
test('getKanjiStats returns empty array when no kanji exist', () => {
  const dbPath = makeDbPath();
  const db = new Database(dbPath);
  try {
    ensureSchema(db);
    assert.deepEqual(getKanjiStats(db, 10), []);
  } finally {
    db.close();
    cleanupDbPath(dbPath);
  }
});
// Inserts two events in REVERSE timestamp order and verifies getSessionEvents
// returns them sorted ascending by ts_ms — proving the ordering comes from the
// query, not from insertion order.
test('getSessionEvents returns events ordered by ts_ms ascending', () => {
const dbPath = makeDbPath();
const db = new Database(dbPath);
try {
ensureSchema(db);
const stmts = createTrackerPreparedStatements(db);
const videoId = getOrCreateVideoRecord(db, 'local:/tmp/events-test.mkv', {
canonicalTitle: 'Events Test',
sourcePath: '/tmp/events-test.mkv',
sourceUrl: null,
sourceType: SOURCE_TYPE_LOCAL,
});
const startedAtMs = 5_000_000;
const { sessionId } = startSessionRecord(db, videoId, startedAtMs);
// Insert two events at different timestamps
// (positional event columns; NOTE(review): order assumed to match
// eventInsertStmt's declaration — confirm against storage.ts)
stmts.eventInsertStmt.run(
sessionId,
startedAtMs + 2_000,
EVENT_SUBTITLE_LINE,
1,
0,
800,
2,
0,
'{"line":"second"}',
startedAtMs + 2_000,
startedAtMs + 2_000,
);
stmts.eventInsertStmt.run(
sessionId,
startedAtMs + 1_000,
EVENT_SUBTITLE_LINE,
0,
0,
600,
3,
0,
'{"line":"first"}',
startedAtMs + 1_000,
startedAtMs + 1_000,
);
const events = getSessionEvents(db, sessionId, 50);
assert.equal(events.length, 2);
// Should be ordered ASC by ts_ms
assert.equal(events[0]!.tsMs, startedAtMs + 1_000);
assert.equal(events[1]!.tsMs, startedAtMs + 2_000);
assert.equal(events[0]!.eventType, EVENT_SUBTITLE_LINE);
assert.equal(events[0]!.payload, '{"line":"first"}');
assert.equal(events[1]!.payload, '{"line":"second"}');
} finally {
db.close();
cleanupDbPath(dbPath);
}
});
// Querying a session id that has no events (or does not exist) returns [].
test('getSessionEvents returns empty array for session with no events', () => {
  const dbPath = makeDbPath();
  const db = new Database(dbPath);
  try {
    ensureSchema(db);
    assert.deepEqual(getSessionEvents(db, 9999, 50), []);
  } finally {
    db.close();
    cleanupDbPath(dbPath);
  }
});
// Inserts five events and verifies that a limit of 3 caps the result set.
test('getSessionEvents respects limit parameter', () => {
  const dbPath = makeDbPath();
  const db = new Database(dbPath);
  try {
    ensureSchema(db);
    const stmts = createTrackerPreparedStatements(db);
    const videoId = getOrCreateVideoRecord(db, 'local:/tmp/events-limit.mkv', {
      canonicalTitle: 'Events Limit Test',
      sourcePath: '/tmp/events-limit.mkv',
      sourceUrl: null,
      sourceType: SOURCE_TYPE_LOCAL,
    });
    const baseTsMs = 7_000_000;
    const { sessionId } = startSessionRecord(db, videoId, baseTsMs);
    // Five subtitle-line events spaced one second apart.
    for (const offset of [0, 1, 2, 3, 4]) {
      const tsMs = baseTsMs + offset * 1_000;
      stmts.eventInsertStmt.run(
        sessionId,
        tsMs,
        EVENT_SUBTITLE_LINE,
        offset,
        0,
        500,
        1,
        0,
        null,
        tsMs,
        tsMs,
      );
    }
    assert.equal(getSessionEvents(db, sessionId, 3).length, 3);
  } finally {
    db.close();
    cleanupDbPath(dbPath);
  }
});
test('anime-level queries group by anime_id and preserve episode-level rows', () => {
const dbPath = makeDbPath();
const db = new Database(dbPath);
try {
ensureSchema(db);
const stmts = createTrackerPreparedStatements(db);
const lwaAnimeId = getOrCreateAnimeRecord(db, {
parsedTitle: 'Little Witch Academia',
canonicalTitle: 'Little Witch Academia',
anilistId: 33_435,
titleRomaji: 'Little Witch Academia',
titleEnglish: 'Little Witch Academia',
titleNative: 'リトルウィッチアカデミア',
metadataJson: '{"source":"anilist"}',
});
const frierenAnimeId = getOrCreateAnimeRecord(db, {
parsedTitle: 'Frieren',
canonicalTitle: 'Frieren',
anilistId: 52_921,
titleRomaji: 'Sousou no Frieren',
titleEnglish: 'Frieren: Beyond Journey\'s End',
titleNative: '葬送のフリーレン',
metadataJson: '{"source":"anilist"}',
});
const lwaEpisode5 = getOrCreateVideoRecord(db, 'local:/tmp/lwa-s02e05.mkv', {
canonicalTitle: 'Episode 5',
sourcePath: '/tmp/Little Witch Academia S02E05.mkv',
sourceUrl: null,
sourceType: SOURCE_TYPE_LOCAL,
});
const lwaEpisode6 = getOrCreateVideoRecord(db, 'local:/tmp/lwa-s02e06.mkv', {
canonicalTitle: 'Episode 6',
sourcePath: '/tmp/Little Witch Academia S02E06.mkv',
sourceUrl: null,
sourceType: SOURCE_TYPE_LOCAL,
});
const frierenEpisode3 = getOrCreateVideoRecord(db, 'local:/tmp/frieren-03.mkv', {
canonicalTitle: 'Episode 3',
sourcePath: '/tmp/[SubsPlease] Frieren - 03 - Departure.mkv',
sourceUrl: null,
sourceType: SOURCE_TYPE_LOCAL,
});
linkVideoToAnimeRecord(db, lwaEpisode5, {
animeId: lwaAnimeId,
parsedBasename: 'Little Witch Academia S02E05.mkv',
parsedTitle: 'Little Witch Academia',
parsedSeason: 2,
parsedEpisode: 5,
parserSource: 'fallback',
parserConfidence: 1,
parseMetadataJson: '{"episode":5}',
});
linkVideoToAnimeRecord(db, lwaEpisode6, {
animeId: lwaAnimeId,
parsedBasename: 'Little Witch Academia S02E06.mkv',
parsedTitle: 'Little Witch Academia',
parsedSeason: 2,
parsedEpisode: 6,
parserSource: 'fallback',
parserConfidence: 1,
parseMetadataJson: '{"episode":6}',
});
linkVideoToAnimeRecord(db, frierenEpisode3, {
animeId: frierenAnimeId,
parsedBasename: '[SubsPlease] Frieren - 03 - Departure.mkv',
parsedTitle: 'Frieren',
parsedSeason: 1,
parsedEpisode: 3,
parserSource: 'fallback',
parserConfidence: 0.6,
parseMetadataJson: '{"episode":3}',
});
const sessionA = startSessionRecord(db, lwaEpisode5, 1_000_000);
const sessionB = startSessionRecord(db, lwaEpisode5, 1_010_000);
const sessionC = startSessionRecord(db, lwaEpisode6, 1_020_000);
const sessionD = startSessionRecord(db, frierenEpisode3, 1_030_000);
stmts.telemetryInsertStmt.run(
sessionA.sessionId,
1_001_000,
4_000,
3_000,
10,
25,
25,
1,
3,
2,
0,
0,
0,
0,
0,
1_001_000,
1_001_000,
);
stmts.telemetryInsertStmt.run(
sessionB.sessionId,
1_011_000,
5_000,
4_000,
11,
27,
27,
2,
4,
2,
0,
0,
0,
0,
0,
1_011_000,
1_011_000,
);
stmts.telemetryInsertStmt.run(
sessionC.sessionId,
1_021_000,
6_000,
5_000,
12,
28,
28,
3,
5,
4,
0,
0,
0,
0,
0,
1_021_000,
1_021_000,
);
stmts.telemetryInsertStmt.run(
sessionD.sessionId,
1_031_000,
4_000,
3_500,
8,
20,
20,
1,
2,
1,
0,
0,
0,
0,
0,
1_031_000,
1_031_000,
);
const animeLibrary = getAnimeLibrary(db);
assert.equal(animeLibrary.length, 2);
assert.deepEqual(
animeLibrary.map((row) => ({
animeId: row.animeId,
canonicalTitle: row.canonicalTitle,
totalSessions: row.totalSessions,
totalActiveMs: row.totalActiveMs,
totalCards: row.totalCards,
episodeCount: row.episodeCount,
})),
[
{
animeId: lwaAnimeId,
canonicalTitle: 'Little Witch Academia',
totalSessions: 3,
totalActiveMs: 12_000,
totalCards: 6,
episodeCount: 2,
},
{
animeId: frierenAnimeId,
canonicalTitle: 'Frieren',
totalSessions: 1,
totalActiveMs: 3_500,
totalCards: 1,
episodeCount: 1,
},
],
);
const animeDetail = getAnimeDetail(db, lwaAnimeId);
assert.ok(animeDetail);
assert.equal(animeDetail?.animeId, lwaAnimeId);
assert.equal(animeDetail?.canonicalTitle, 'Little Witch Academia');
assert.equal(animeDetail?.anilistId, 33_435);
assert.equal(animeDetail?.totalSessions, 3);
assert.equal(animeDetail?.totalActiveMs, 12_000);
assert.equal(animeDetail?.totalCards, 6);
assert.equal(animeDetail?.totalWordsSeen, 80);
assert.equal(animeDetail?.totalLinesSeen, 33);
assert.equal(animeDetail?.totalLookupCount, 12);
assert.equal(animeDetail?.totalLookupHits, 8);
assert.equal(animeDetail?.episodeCount, 2);
const episodes = getAnimeEpisodes(db, lwaAnimeId);
assert.deepEqual(
episodes.map((row) => ({
videoId: row.videoId,
season: row.season,
episode: row.episode,
totalSessions: row.totalSessions,
totalActiveMs: row.totalActiveMs,
totalCards: row.totalCards,
})),
[
{
videoId: lwaEpisode5,
season: 2,
episode: 5,
totalSessions: 2,
totalActiveMs: 7_000,
totalCards: 3,
},
{
videoId: lwaEpisode6,
season: 2,
episode: 6,
totalSessions: 1,
totalActiveMs: 5_000,
totalCards: 3,
},
],
);
} finally {
db.close();
cleanupDbPath(dbPath);
}
});
// End-to-end fixture for getWordOccurrences: an anime linked to a video, one
// subtitle line, and an imm_word_line_occurrences row tying the word to the
// line with an explicit per-line count. The query must surface anime, video,
// session, and line context for the (headword, word, reading) triple.
test('getWordOccurrences maps a normalized word back to anime, video, and subtitle line context', () => {
const dbPath = makeDbPath();
const db = new Database(dbPath);
try {
ensureSchema(db);
const animeId = getOrCreateAnimeRecord(db, {
parsedTitle: 'Little Witch Academia',
canonicalTitle: 'Little Witch Academia',
anilistId: null,
titleRomaji: null,
titleEnglish: null,
titleNative: null,
metadataJson: '{"source":"test"}',
});
const videoId = getOrCreateVideoRecord(db, 'local:/tmp/lwa-s02e04.mkv', {
canonicalTitle: 'Episode 4',
sourcePath: '/tmp/Little Witch Academia S02E04.mkv',
sourceUrl: null,
sourceType: SOURCE_TYPE_LOCAL,
});
linkVideoToAnimeRecord(db, videoId, {
animeId,
parsedBasename: 'Little Witch Academia S02E04.mkv',
parsedTitle: 'Little Witch Academia',
parsedSeason: 2,
parsedEpisode: 4,
parserSource: 'fallback',
parserConfidence: 1,
parseMetadataJson: '{"episode":4}',
});
const { sessionId } = startSessionRecord(db, videoId, 1_000_000);
// Raw INSERTs bypass the tracker so the row ids can be linked explicitly.
const wordResult = db
.prepare(
`INSERT INTO imm_words (
headword, word, reading, part_of_speech, pos1, pos2, pos3, first_seen, last_seen, frequency
) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)`,
)
.run('猫', '猫', 'ねこ', 'noun', '名詞', '一般', '', 1_000, 1_500, 4);
const lineResult = db
.prepare(
`INSERT INTO imm_subtitle_lines (
session_id, event_id, video_id, anime_id, line_index, segment_start_ms, segment_end_ms, text, CREATED_DATE, LAST_UPDATE_DATE
) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)`,
)
.run(sessionId, null, videoId, animeId, 1, 0, 1000, '猫 猫 日 日 は', 1_000, 1_000);
// The occurrence row declares 猫 appears twice in this line; the query
// must report that count, not the word's global frequency (4).
db.prepare(
`INSERT INTO imm_word_line_occurrences (line_id, word_id, occurrence_count)
VALUES (?, ?, ?)`,
).run(Number(lineResult.lastInsertRowid), Number(wordResult.lastInsertRowid), 2);
const rows = getWordOccurrences(db, '猫', '猫', 'ねこ', 10);
assert.deepEqual(rows, [
{
animeId,
animeTitle: 'Little Witch Academia',
videoId,
videoTitle: 'Episode 4',
sessionId,
lineIndex: 1,
segmentStartMs: 0,
segmentEndMs: 1000,
text: '猫 猫 日 日 は',
occurrenceCount: 2,
},
]);
} finally {
db.close();
cleanupDbPath(dbPath);
}
});
// Mirror of the word-occurrence test for kanji: getKanjiOccurrences must join
// imm_kanji_line_occurrences back to the anime, video, session, and line text.
test('getKanjiOccurrences maps a kanji back to anime, video, and subtitle line context', () => {
  const dbPath = makeDbPath();
  const db = new Database(dbPath);
  try {
    ensureSchema(db);
    const animeId = getOrCreateAnimeRecord(db, {
      parsedTitle: 'Frieren',
      canonicalTitle: 'Frieren',
      anilistId: null,
      titleRomaji: null,
      titleEnglish: null,
      titleNative: null,
      metadataJson: '{"source":"test"}',
    });
    const videoId = getOrCreateVideoRecord(db, 'local:/tmp/frieren-03.mkv', {
      canonicalTitle: 'Episode 3',
      sourcePath: '/tmp/[SubsPlease] Frieren - 03 - Departure.mkv',
      sourceUrl: null,
      sourceType: SOURCE_TYPE_LOCAL,
    });
    linkVideoToAnimeRecord(db, videoId, {
      animeId,
      parsedBasename: '[SubsPlease] Frieren - 03 - Departure.mkv',
      parsedTitle: 'Frieren',
      parsedSeason: 1,
      parsedEpisode: 3,
      parserSource: 'fallback',
      parserConfidence: 1,
      parseMetadataJson: '{"episode":3}',
    });
    const { sessionId } = startSessionRecord(db, videoId, 2_000_000);
    // Seed kanji, subtitle line, and occurrence join row directly via SQL.
    const kanjiResult = db
      .prepare(
        `INSERT INTO imm_kanji (
kanji, first_seen, last_seen, frequency
) VALUES (?, ?, ?, ?)`,
      )
      .run('日', 2_000, 2_500, 8);
    const lineResult = db
      .prepare(
        `INSERT INTO imm_subtitle_lines (
session_id, event_id, video_id, anime_id, line_index, segment_start_ms, segment_end_ms, text, CREATED_DATE, LAST_UPDATE_DATE
) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)`,
      )
      .run(sessionId, null, videoId, animeId, 3, 5000, 6500, '今日は日曜', 2_000, 2_000);
    // '日' appears twice in the line text; occurrence_count records that.
    db.prepare(
      `INSERT INTO imm_kanji_line_occurrences (line_id, kanji_id, occurrence_count)
VALUES (?, ?, ?)`,
    ).run(Number(lineResult.lastInsertRowid), Number(kanjiResult.lastInsertRowid), 2);
    const rows = getKanjiOccurrences(db, '日', 10);
    assert.deepEqual(rows, [
      {
        animeId,
        animeTitle: 'Frieren',
        videoId,
        videoTitle: 'Episode 3',
        sessionId,
        lineIndex: 3,
        segmentStartMs: 5000,
        segmentEndMs: 6500,
        text: '今日は日曜',
        occurrenceCount: 2,
      },
    ]);
  } finally {
    db.close();
    cleanupDbPath(dbPath);
  }
});

View File

@@ -0,0 +1,71 @@
import type { Token } from '../../../types';
import type { LegacyVocabularyPosResolution } from './types';
import { deriveStoredPartOfSpeech } from '../tokenizer/part-of-speech';
// Distance between the katakana and hiragana Unicode blocks (ア U+30A2 → あ U+3042).
const KATAKANA_TO_HIRAGANA_OFFSET = 0x60;
// Inclusive range ァ (U+30A1) .. ヶ (U+30F6) that maps cleanly onto hiragana;
// deliberately excludes the prolonged sound mark ー (U+30FC).
const KATAKANA_CODEPOINT_START = 0x30a1;
const KATAKANA_CODEPOINT_END = 0x30f6;
/** Trim a lookup string; null/undefined (or any non-string) collapses to ''. */
function normalizeLookupText(value: string | null | undefined): string {
  if (typeof value !== 'string') {
    return '';
  }
  return value.trim();
}
/**
 * Convert full-width katakana (ァ U+30A1 .. ヶ U+30F6) to hiragana by shifting
 * each code point down 0x60. Every other character — including the prolonged
 * sound mark ー (U+30FC), which has no hiragana counterpart — passes through
 * unchanged. The range is entirely within the BMP, so charCodeAt is safe.
 */
function katakanaToHiragana(text: string): string {
  return text.replace(/[\u30a1-\u30f6]/gu, (kana) =>
    String.fromCharCode(kana.charCodeAt(0) - 0x60),
  );
}
/**
 * Project a tokenizer Token onto the legacy vocabulary POS shape: headword
 * falls back to the surface form when empty, the reading is folded from
 * katakana to hiragana, and POS fields are trimmed strings.
 */
function toResolution(token: Token): LegacyVocabularyPosResolution {
  const surface = normalizeLookupText(token.word);
  const headword = normalizeLookupText(token.headword);
  return {
    // Prefer the dictionary headword; an empty one falls back to the surface.
    headword: headword !== '' ? headword : surface,
    reading: katakanaToHiragana(normalizeLookupText(token.katakanaReading)),
    partOfSpeech: deriveStoredPartOfSpeech({
      partOfSpeech: token.partOfSpeech,
      pos1: token.pos1,
    }),
    pos1: normalizeLookupText(token.pos1),
    pos2: normalizeLookupText(token.pos2),
    pos3: normalizeLookupText(token.pos3),
  };
}
export function resolveLegacyVocabularyPosFromTokens(
lookupText: string,
tokens: Token[] | null,
): LegacyVocabularyPosResolution | null {
const normalizedLookup = normalizeLookupText(lookupText);
if (!normalizedLookup || !tokens || tokens.length === 0) {
return null;
}
const exactSurfaceMatches = tokens.filter(
(token) => normalizeLookupText(token.word) === normalizedLookup,
);
if (exactSurfaceMatches.length === 1) {
return toResolution(exactSurfaceMatches[0]!);
}
const exactHeadwordMatches = tokens.filter(
(token) => normalizeLookupText(token.headword) === normalizedLookup,
);
if (exactHeadwordMatches.length === 1) {
return toResolution(exactHeadwordMatches[0]!);
}
if (tokens.length === 1) {
return toResolution(tokens[0]!);
}
return null;
}

View File

@@ -112,35 +112,46 @@ function upsertDailyRollupsForGroups(
words_per_min, lookup_hit_rate, CREATED_DATE, LAST_UPDATE_DATE
)
SELECT
CAST(s.started_at_ms / 86400000 AS INTEGER) AS rollup_day,
CAST(julianday(s.started_at_ms / 1000, 'unixepoch', 'localtime') - 2440587.5 AS INTEGER) AS rollup_day,
s.video_id AS video_id,
COUNT(DISTINCT s.session_id) AS total_sessions,
COALESCE(SUM(t.active_watched_ms), 0) / 60000.0 AS total_active_min,
COALESCE(SUM(t.lines_seen), 0) AS total_lines_seen,
COALESCE(SUM(t.words_seen), 0) AS total_words_seen,
COALESCE(SUM(t.tokens_seen), 0) AS total_tokens_seen,
COALESCE(SUM(t.cards_mined), 0) AS total_cards,
COALESCE(SUM(sm.max_active_ms), 0) / 60000.0 AS total_active_min,
COALESCE(SUM(sm.max_lines), 0) AS total_lines_seen,
COALESCE(SUM(sm.max_words), 0) AS total_words_seen,
COALESCE(SUM(sm.max_tokens), 0) AS total_tokens_seen,
COALESCE(SUM(sm.max_cards), 0) AS total_cards,
CASE
WHEN COALESCE(SUM(t.active_watched_ms), 0) > 0
THEN (COALESCE(SUM(t.cards_mined), 0) * 60.0) / (COALESCE(SUM(t.active_watched_ms), 0) / 60000.0)
WHEN COALESCE(SUM(sm.max_active_ms), 0) > 0
THEN (COALESCE(SUM(sm.max_cards), 0) * 60.0) / (COALESCE(SUM(sm.max_active_ms), 0) / 60000.0)
ELSE NULL
END AS cards_per_hour,
CASE
WHEN COALESCE(SUM(t.active_watched_ms), 0) > 0
THEN COALESCE(SUM(t.words_seen), 0) / (COALESCE(SUM(t.active_watched_ms), 0) / 60000.0)
WHEN COALESCE(SUM(sm.max_active_ms), 0) > 0
THEN COALESCE(SUM(sm.max_words), 0) / (COALESCE(SUM(sm.max_active_ms), 0) / 60000.0)
ELSE NULL
END AS words_per_min,
CASE
WHEN COALESCE(SUM(t.lookup_count), 0) > 0
THEN CAST(COALESCE(SUM(t.lookup_hits), 0) AS REAL) / CAST(SUM(t.lookup_count) AS REAL)
WHEN COALESCE(SUM(sm.max_lookups), 0) > 0
THEN CAST(COALESCE(SUM(sm.max_hits), 0) AS REAL) / CAST(SUM(sm.max_lookups) AS REAL)
ELSE NULL
END AS lookup_hit_rate,
? AS CREATED_DATE,
? AS LAST_UPDATE_DATE
FROM imm_sessions s
JOIN imm_session_telemetry t
ON t.session_id = s.session_id
WHERE CAST(s.started_at_ms / 86400000 AS INTEGER) = ? AND s.video_id = ?
JOIN (
SELECT
t.session_id,
MAX(t.active_watched_ms) AS max_active_ms,
MAX(t.lines_seen) AS max_lines,
MAX(t.words_seen) AS max_words,
MAX(t.tokens_seen) AS max_tokens,
MAX(t.cards_mined) AS max_cards,
MAX(t.lookup_count) AS max_lookups,
MAX(t.lookup_hits) AS max_hits
FROM imm_session_telemetry t
GROUP BY t.session_id
) sm ON s.session_id = sm.session_id
WHERE CAST(julianday(s.started_at_ms / 1000, 'unixepoch', 'localtime') - 2440587.5 AS INTEGER) = ? AND s.video_id = ?
GROUP BY rollup_day, s.video_id
ON CONFLICT (rollup_day, video_id) DO UPDATE SET
total_sessions = excluded.total_sessions,
@@ -176,20 +187,29 @@ function upsertMonthlyRollupsForGroups(
total_words_seen, total_tokens_seen, total_cards, CREATED_DATE, LAST_UPDATE_DATE
)
SELECT
CAST(strftime('%Y%m', s.started_at_ms / 1000, 'unixepoch') AS INTEGER) AS rollup_month,
CAST(strftime('%Y%m', s.started_at_ms / 1000, 'unixepoch', 'localtime') AS INTEGER) AS rollup_month,
s.video_id AS video_id,
COUNT(DISTINCT s.session_id) AS total_sessions,
COALESCE(SUM(t.active_watched_ms), 0) / 60000.0 AS total_active_min,
COALESCE(SUM(t.lines_seen), 0) AS total_lines_seen,
COALESCE(SUM(t.words_seen), 0) AS total_words_seen,
COALESCE(SUM(t.tokens_seen), 0) AS total_tokens_seen,
COALESCE(SUM(t.cards_mined), 0) AS total_cards,
COALESCE(SUM(sm.max_active_ms), 0) / 60000.0 AS total_active_min,
COALESCE(SUM(sm.max_lines), 0) AS total_lines_seen,
COALESCE(SUM(sm.max_words), 0) AS total_words_seen,
COALESCE(SUM(sm.max_tokens), 0) AS total_tokens_seen,
COALESCE(SUM(sm.max_cards), 0) AS total_cards,
? AS CREATED_DATE,
? AS LAST_UPDATE_DATE
FROM imm_sessions s
JOIN imm_session_telemetry t
ON t.session_id = s.session_id
WHERE CAST(strftime('%Y%m', s.started_at_ms / 1000, 'unixepoch') AS INTEGER) = ? AND s.video_id = ?
JOIN (
SELECT
t.session_id,
MAX(t.active_watched_ms) AS max_active_ms,
MAX(t.lines_seen) AS max_lines,
MAX(t.words_seen) AS max_words,
MAX(t.tokens_seen) AS max_tokens,
MAX(t.cards_mined) AS max_cards
FROM imm_session_telemetry t
GROUP BY t.session_id
) sm ON s.session_id = sm.session_id
WHERE CAST(strftime('%Y%m', s.started_at_ms / 1000, 'unixepoch', 'localtime') AS INTEGER) = ? AND s.video_id = ?
GROUP BY rollup_month, s.video_id
ON CONFLICT (rollup_month, video_id) DO UPDATE SET
total_sessions = excluded.total_sessions,
@@ -216,8 +236,8 @@ function getAffectedRollupGroups(
.prepare(
`
SELECT DISTINCT
CAST(s.started_at_ms / 86400000 AS INTEGER) AS rollup_day,
CAST(strftime('%Y%m', s.started_at_ms / 1000, 'unixepoch') AS INTEGER) AS rollup_month,
CAST(julianday(s.started_at_ms / 1000, 'unixepoch', 'localtime') - 2440587.5 AS INTEGER) AS rollup_day,
CAST(strftime('%Y%m', s.started_at_ms / 1000, 'unixepoch', 'localtime') AS INTEGER) AS rollup_month,
s.video_id AS video_id
FROM imm_session_telemetry t
JOIN imm_sessions s

View File

@@ -4,7 +4,7 @@ import { EventEmitter } from 'node:events';
import test from 'node:test';
import type { spawn as spawnFn } from 'node:child_process';
import { SOURCE_TYPE_LOCAL } from './types';
import { getLocalVideoMetadata, runFfprobe } from './metadata';
import { getLocalVideoMetadata, guessAnimeVideoMetadata, runFfprobe } from './metadata';
type Spawn = typeof spawnFn;
@@ -146,3 +146,79 @@ test('getLocalVideoMetadata derives title and falls back to null hash on read er
assert.equal(hashFallbackMetadata.canonicalTitle, 'Episode 02');
assert.equal(hashFallbackMetadata.hashSha256, null);
});
// Happy path: when guessit returns a full parse, its title/season/episode are
// used verbatim, confidence is pinned to 1, and guessit receives only the
// basename (never the absolute path).
test('guessAnimeVideoMetadata uses guessit basename output first when available', async () => {
  const seenTargets: string[] = [];
  const parsed = await guessAnimeVideoMetadata('/tmp/Little Witch Academia S02E05.mkv', 'Episode 5', {
    runGuessit: async (target) => {
      seenTargets.push(target);
      return JSON.stringify({
        title: 'Little Witch Academia',
        season: 2,
        episode: 5,
      });
    },
  });
  // guessit must be invoked with the basename, not the full path.
  assert.deepEqual(seenTargets, ['Little Witch Academia S02E05.mkv']);
  assert.deepEqual(parsed, {
    parsedBasename: 'Little Witch Academia S02E05.mkv',
    parsedTitle: 'Little Witch Academia',
    parsedSeason: 2,
    parsedEpisode: 5,
    parserSource: 'guessit',
    parserConfidence: 1,
    parseMetadataJson: JSON.stringify({
      filename: 'Little Witch Academia S02E05.mkv',
      source: 'guessit',
    }),
  });
});
// When guessit rejects (binary missing, etc.) the regex fallback parser must
// still produce a full parse with parserSource 'fallback' and its own
// confidence mapped to a numeric score.
test('guessAnimeVideoMetadata falls back to parser when guessit throws', async () => {
  const parsed = await guessAnimeVideoMetadata('/tmp/Little Witch Academia S02E05.mkv', 'Episode 5', {
    runGuessit: async () => {
      throw new Error('guessit unavailable');
    },
  });
  assert.deepEqual(parsed, {
    parsedBasename: 'Little Witch Academia S02E05.mkv',
    parsedTitle: 'Little Witch Academia',
    parsedSeason: 2,
    parsedEpisode: 5,
    parserSource: 'fallback',
    parserConfidence: 1,
    parseMetadataJson: JSON.stringify({
      confidence: 'high',
      filename: 'Little Witch Academia S02E05.mkv',
      rawTitle: 'Little Witch Academia S02E05',
      source: 'fallback',
    }),
  });
});
// guessit output without a title is treated as unusable: the fallback parser
// takes over entirely. Bracketed-release filenames parse with low confidence
// (score 0.2) and no season/episode extracted.
test('guessAnimeVideoMetadata falls back when guessit output is incomplete', async () => {
  const parsed = await guessAnimeVideoMetadata(
    '/tmp/[SubsPlease] Frieren - 03 (1080p).mkv',
    null,
    {
      runGuessit: async () => JSON.stringify({ episode: 3 }),
    },
  );
  assert.deepEqual(parsed, {
    parsedBasename: '[SubsPlease] Frieren - 03 (1080p).mkv',
    parsedTitle: 'Frieren - 03 (1080p)',
    parsedSeason: null,
    parsedEpisode: null,
    parserSource: 'fallback',
    parserConfidence: 0.2,
    parseMetadataJson: JSON.stringify({
      confidence: 'low',
      filename: '[SubsPlease] Frieren - 03 (1080p).mkv',
      rawTitle: 'Frieren - 03 (1080p)',
      source: 'fallback',
    }),
  });
});

View File

@@ -1,6 +1,13 @@
import crypto from 'node:crypto';
import { spawn as nodeSpawn } from 'node:child_process';
import * as fs from 'node:fs';
import path from 'node:path';
import { parseMediaInfo } from '../../../jimaku/utils';
import {
guessAnilistMediaInfo,
runGuessit,
type GuessAnilistMediaInfoDeps,
} from '../anilist/anilist-updater';
import {
deriveCanonicalTitle,
emptyMetadata,
@@ -8,7 +15,12 @@ import {
parseFps,
toNullableInt,
} from './reducer';
import { SOURCE_TYPE_LOCAL, type ProbeMetadata, type VideoMetadata } from './types';
import {
SOURCE_TYPE_LOCAL,
type ParsedAnimeVideoGuess,
type ProbeMetadata,
type VideoMetadata,
} from './types';
type SpawnFn = typeof nodeSpawn;
@@ -24,6 +36,21 @@ interface MetadataDeps {
fs?: FsDeps;
}
interface GuessAnimeVideoMetadataDeps {
runGuessit?: GuessAnilistMediaInfoDeps['runGuessit'];
}
/**
 * Translate the fallback parser's qualitative confidence into the numeric
 * 0..1 score stored on imm_videos.parser_confidence.
 */
function mapParserConfidenceToScore(confidence: 'high' | 'medium' | 'low'): number {
  if (confidence === 'high') {
    return 1;
  }
  // Anything that is not 'high' or 'medium' scores as low confidence.
  return confidence === 'medium' ? 0.6 : 0.2;
}
export async function computeSha256(
mediaPath: string,
deps: MetadataDeps = {},
@@ -151,3 +178,48 @@ export async function getLocalVideoMetadata(
metadataJson: null,
};
}
/**
 * Derive anime parse metadata (title/season/episode plus parser provenance)
 * for a video. Delegates to guessAnilistMediaInfo, which prefers guessit and
 * falls back to the regex parser; returns null when neither can parse.
 *
 * @param mediaPath  absolute path of the media file, or null when unknown
 * @param mediaTitle display title used as a secondary parse source
 * @param deps       test seam for overriding the guessit runner
 */
export async function guessAnimeVideoMetadata(
  mediaPath: string | null,
  mediaTitle: string | null,
  deps: GuessAnimeVideoMetadataDeps = {},
): Promise<ParsedAnimeVideoGuess | null> {
  const guess = await guessAnilistMediaInfo(mediaPath, mediaTitle, {
    runGuessit: deps.runGuessit ?? runGuessit,
  });
  if (!guess) {
    return null;
  }
  const basename = mediaPath ? path.basename(mediaPath) : null;
  if (guess.source === 'guessit') {
    // guessit parses are trusted outright: confidence pinned to 1.
    return {
      parsedBasename: basename,
      parsedTitle: guess.title,
      parsedSeason: guess.season,
      parsedEpisode: guess.episode,
      parserSource: 'guessit',
      parserConfidence: 1,
      parseMetadataJson: JSON.stringify({
        filename: basename,
        source: 'guessit',
      }),
    };
  }
  // Fallback path: re-run the regex parser to recover its confidence grade
  // and raw-title details for the stored parse metadata.
  const fallback = parseMediaInfo(mediaPath ?? mediaTitle);
  return {
    parsedBasename: basename ?? fallback.filename ?? null,
    parsedTitle: guess.title,
    parsedSeason: guess.season,
    parsedEpisode: guess.episode,
    parserSource: 'fallback',
    parserConfidence: mapParserConfidenceToScore(fallback.confidence),
    parseMetadataJson: JSON.stringify({
      confidence: fallback.confidence,
      filename: fallback.filename,
      rawTitle: fallback.rawTitle,
      source: 'fallback',
    }),
  };
}

File diff suppressed because it is too large Load Diff

View File

@@ -30,6 +30,7 @@ export function createInitialSessionState(
lastPauseStartMs: null,
isPaused: false,
pendingTelemetry: true,
markedWatched: false,
};
}

View File

@@ -9,7 +9,9 @@ import {
createTrackerPreparedStatements,
ensureSchema,
executeQueuedWrite,
getOrCreateAnimeRecord,
getOrCreateVideoRecord,
linkVideoToAnimeRecord,
} from './storage';
import { EVENT_SUBTITLE_LINE, SESSION_STATUS_ENDED, SOURCE_TYPE_LOCAL } from './types';
@@ -60,6 +62,7 @@ test('ensureSchema creates immersion core tables', () => {
const tableNames = new Set(rows.map((row) => row.name));
assert.ok(tableNames.has('imm_videos'));
assert.ok(tableNames.has('imm_anime'));
assert.ok(tableNames.has('imm_sessions'));
assert.ok(tableNames.has('imm_session_telemetry'));
assert.ok(tableNames.has('imm_session_events'));
@@ -67,8 +70,28 @@ test('ensureSchema creates immersion core tables', () => {
assert.ok(tableNames.has('imm_monthly_rollups'));
assert.ok(tableNames.has('imm_words'));
assert.ok(tableNames.has('imm_kanji'));
assert.ok(tableNames.has('imm_subtitle_lines'));
assert.ok(tableNames.has('imm_word_line_occurrences'));
assert.ok(tableNames.has('imm_kanji_line_occurrences'));
assert.ok(tableNames.has('imm_rollup_state'));
const videoColumns = new Set(
(
db.prepare('PRAGMA table_info(imm_videos)').all() as Array<{
name: string;
}>
).map((row) => row.name),
);
assert.ok(videoColumns.has('anime_id'));
assert.ok(videoColumns.has('parsed_basename'));
assert.ok(videoColumns.has('parsed_title'));
assert.ok(videoColumns.has('parsed_season'));
assert.ok(videoColumns.has('parsed_episode'));
assert.ok(videoColumns.has('parser_source'));
assert.ok(videoColumns.has('parser_confidence'));
assert.ok(videoColumns.has('parse_metadata_json'));
const rollupStateRow = db
.prepare('SELECT state_value FROM imm_rollup_state WHERE state_key = ?')
.get('last_rollup_sample_ms') as {
@@ -82,6 +105,470 @@ test('ensureSchema creates immersion core tables', () => {
}
});
// Migration test: a hand-built schema-version-4 database (imm_videos without
// any anime columns) must be upgraded in place by ensureSchema, with one
// imm_anime row backfilled per distinct series parsed from the legacy videos'
// source-path filenames via the fallback parser.
test('ensureSchema migrates legacy videos and backfills anime metadata from filenames', () => {
  const dbPath = makeDbPath();
  const db = new Database(dbPath);
  try {
    // Recreate the legacy (v4) layout directly instead of calling ensureSchema.
    db.exec(`
CREATE TABLE imm_schema_version (
schema_version INTEGER PRIMARY KEY,
applied_at_ms INTEGER NOT NULL
);
INSERT INTO imm_schema_version(schema_version, applied_at_ms) VALUES (4, 1);
CREATE TABLE imm_videos(
video_id INTEGER PRIMARY KEY AUTOINCREMENT,
video_key TEXT NOT NULL UNIQUE,
canonical_title TEXT NOT NULL,
source_type INTEGER NOT NULL,
source_path TEXT,
source_url TEXT,
duration_ms INTEGER NOT NULL CHECK(duration_ms>=0),
file_size_bytes INTEGER CHECK(file_size_bytes>=0),
codec_id INTEGER, container_id INTEGER,
width_px INTEGER, height_px INTEGER, fps_x100 INTEGER,
bitrate_kbps INTEGER, audio_codec_id INTEGER,
hash_sha256 TEXT, screenshot_path TEXT,
metadata_json TEXT,
CREATED_DATE INTEGER,
LAST_UPDATE_DATE INTEGER
);
`);
    const insertLegacyVideo = db.prepare(`
INSERT INTO imm_videos (
video_key, canonical_title, source_type, source_path, source_url,
duration_ms, file_size_bytes, codec_id, container_id, width_px, height_px,
fps_x100, bitrate_kbps, audio_codec_id, hash_sha256, screenshot_path,
metadata_json, CREATED_DATE, LAST_UPDATE_DATE
) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
`);
    // Two episodes of one series plus one episode of another, so the backfill
    // must both group and separate correctly.
    insertLegacyVideo.run(
      'local:/library/Little Witch Academia S02E05.mkv',
      'Episode 5',
      SOURCE_TYPE_LOCAL,
      '/library/Little Witch Academia S02E05.mkv',
      null,
      0,
      null,
      null,
      null,
      null,
      null,
      null,
      null,
      null,
      null,
      null,
      null,
      1,
      1,
    );
    insertLegacyVideo.run(
      'local:/library/Little Witch Academia S02E06.mkv',
      'Episode 6',
      SOURCE_TYPE_LOCAL,
      '/library/Little Witch Academia S02E06.mkv',
      null,
      0,
      null,
      null,
      null,
      null,
      null,
      null,
      null,
      null,
      null,
      null,
      null,
      1,
      1,
    );
    insertLegacyVideo.run(
      'local:/library/[SubsPlease] Frieren - 03 - Departure.mkv',
      'Episode 3',
      SOURCE_TYPE_LOCAL,
      '/library/[SubsPlease] Frieren - 03 - Departure.mkv',
      null,
      0,
      null,
      null,
      null,
      null,
      null,
      null,
      null,
      null,
      null,
      null,
      null,
      1,
      1,
    );
    // Run the migration under test.
    ensureSchema(db);
    // The anime parse columns must now exist on imm_videos.
    const videoColumns = new Set(
      (
        db.prepare('PRAGMA table_info(imm_videos)').all() as Array<{
          name: string;
        }>
      ).map((row) => row.name),
    );
    assert.ok(videoColumns.has('anime_id'));
    assert.ok(videoColumns.has('parsed_basename'));
    assert.ok(videoColumns.has('parsed_title'));
    assert.ok(videoColumns.has('parsed_season'));
    assert.ok(videoColumns.has('parsed_episode'));
    assert.ok(videoColumns.has('parser_source'));
    assert.ok(videoColumns.has('parser_confidence'));
    assert.ok(videoColumns.has('parse_metadata_json'));
    // Exactly one anime row per distinct series title.
    const animeRows = db
      .prepare('SELECT canonical_title FROM imm_anime ORDER BY canonical_title')
      .all() as Array<{ canonical_title: string }>;
    assert.deepEqual(
      animeRows.map((row) => row.canonical_title),
      ['Frieren', 'Little Witch Academia'],
    );
    const littleWitchRows = db
      .prepare(
        `
SELECT
a.canonical_title AS anime_title,
v.parsed_title,
v.parsed_basename,
v.parsed_season,
v.parsed_episode,
v.parser_source,
v.parser_confidence
FROM imm_videos v
JOIN imm_anime a ON a.anime_id = v.anime_id
WHERE v.video_key LIKE 'local:/library/Little Witch Academia%'
ORDER BY v.video_key
`,
      )
      .all() as Array<{
        anime_title: string;
        parsed_title: string | null;
        parsed_basename: string | null;
        parsed_season: number | null;
        parsed_episode: number | null;
        parser_source: string | null;
        parser_confidence: number | null;
      }>;
    assert.equal(littleWitchRows.length, 2);
    // Both LWA episodes share one anime row but keep per-episode parse fields.
    assert.deepEqual(
      littleWitchRows.map((row) => ({
        animeTitle: row.anime_title,
        parsedTitle: row.parsed_title,
        parsedBasename: row.parsed_basename,
        parsedSeason: row.parsed_season,
        parsedEpisode: row.parsed_episode,
        parserSource: row.parser_source,
      })),
      [
        {
          animeTitle: 'Little Witch Academia',
          parsedTitle: 'Little Witch Academia',
          parsedBasename: 'Little Witch Academia S02E05.mkv',
          parsedSeason: 2,
          parsedEpisode: 5,
          parserSource: 'fallback',
        },
        {
          animeTitle: 'Little Witch Academia',
          parsedTitle: 'Little Witch Academia',
          parsedBasename: 'Little Witch Academia S02E06.mkv',
          parsedSeason: 2,
          parsedEpisode: 6,
          parserSource: 'fallback',
        },
      ],
    );
    // Confidence value is parser-dependent; assert only that it is a positive number.
    assert.ok(
      littleWitchRows.every(
        (row) => typeof row.parser_confidence === 'number' && row.parser_confidence > 0,
      ),
    );
    // Bracketed-release filename must still be parsed into its own series.
    const frierenRow = db
      .prepare(
        `
SELECT
a.canonical_title AS anime_title,
v.parsed_title,
v.parsed_episode,
v.parser_source
FROM imm_videos v
JOIN imm_anime a ON a.anime_id = v.anime_id
WHERE v.video_key = ?
`,
      )
      .get('local:/library/[SubsPlease] Frieren - 03 - Departure.mkv') as {
        anime_title: string;
        parsed_title: string | null;
        parsed_episode: number | null;
        parser_source: string | null;
      } | null;
    assert.ok(frierenRow);
    assert.equal(frierenRow?.anime_title, 'Frieren');
    assert.equal(frierenRow?.parsed_title, 'Frieren');
    assert.equal(frierenRow?.parsed_episode, 3);
    assert.equal(frierenRow?.parser_source, 'fallback');
  } finally {
    db.close();
    cleanupDbPath(dbPath);
  }
});
// Incremental-upgrade test: a schema-version-6 database (anime columns already
// present, but no per-line occurrence tracking) must gain the subtitle-line
// tables when ensureSchema runs again.
test('ensureSchema adds subtitle-line occurrence tables to schema version 6 databases', () => {
  const dbPath = makeDbPath();
  const db = new Database(dbPath);
  try {
    // Hand-build the v6 layout: videos/sessions/events/words/kanji exist,
    // the three subtitle-line occurrence tables do not.
    db.exec(`
CREATE TABLE imm_schema_version (
schema_version INTEGER PRIMARY KEY,
applied_at_ms INTEGER NOT NULL
);
INSERT INTO imm_schema_version(schema_version, applied_at_ms) VALUES (6, 1);
CREATE TABLE imm_videos(
video_id INTEGER PRIMARY KEY AUTOINCREMENT,
video_key TEXT NOT NULL UNIQUE,
anime_id INTEGER,
canonical_title TEXT NOT NULL,
source_type INTEGER NOT NULL,
source_path TEXT,
source_url TEXT,
parsed_basename TEXT,
parsed_title TEXT,
parsed_season INTEGER,
parsed_episode INTEGER,
parser_source TEXT,
parser_confidence REAL,
parse_metadata_json TEXT,
duration_ms INTEGER NOT NULL CHECK(duration_ms>=0),
file_size_bytes INTEGER CHECK(file_size_bytes>=0),
codec_id INTEGER, container_id INTEGER,
width_px INTEGER, height_px INTEGER, fps_x100 INTEGER,
bitrate_kbps INTEGER, audio_codec_id INTEGER,
hash_sha256 TEXT, screenshot_path TEXT,
metadata_json TEXT,
CREATED_DATE INTEGER,
LAST_UPDATE_DATE INTEGER
);
CREATE TABLE imm_sessions(
session_id INTEGER PRIMARY KEY AUTOINCREMENT,
session_uuid TEXT NOT NULL UNIQUE,
video_id INTEGER NOT NULL,
started_at_ms INTEGER NOT NULL,
ended_at_ms INTEGER,
status INTEGER NOT NULL,
locale_id INTEGER,
target_lang_id INTEGER,
difficulty_tier INTEGER,
subtitle_mode INTEGER,
CREATED_DATE INTEGER,
LAST_UPDATE_DATE INTEGER
);
CREATE TABLE imm_session_events(
event_id INTEGER PRIMARY KEY AUTOINCREMENT,
session_id INTEGER NOT NULL,
ts_ms INTEGER NOT NULL,
event_type INTEGER NOT NULL,
line_index INTEGER,
segment_start_ms INTEGER,
segment_end_ms INTEGER,
words_delta INTEGER NOT NULL DEFAULT 0,
cards_delta INTEGER NOT NULL DEFAULT 0,
payload_json TEXT,
CREATED_DATE INTEGER,
LAST_UPDATE_DATE INTEGER
);
CREATE TABLE imm_words(
id INTEGER PRIMARY KEY AUTOINCREMENT,
headword TEXT,
word TEXT,
reading TEXT,
part_of_speech TEXT,
pos1 TEXT,
pos2 TEXT,
pos3 TEXT,
first_seen REAL,
last_seen REAL,
frequency INTEGER,
UNIQUE(headword, word, reading)
);
CREATE TABLE imm_kanji(
id INTEGER PRIMARY KEY AUTOINCREMENT,
kanji TEXT,
first_seen REAL,
last_seen REAL,
frequency INTEGER,
UNIQUE(kanji)
);
CREATE TABLE imm_rollup_state(
state_key TEXT PRIMARY KEY,
state_value INTEGER NOT NULL
);
`);
    // Run the upgrade under test.
    ensureSchema(db);
    const tableNames = new Set(
      (
        db.prepare(`SELECT name FROM sqlite_master WHERE type = 'table' AND name LIKE 'imm_%'`).all() as
          Array<{ name: string }>
      ).map((row) => row.name),
    );
    assert.ok(tableNames.has('imm_subtitle_lines'));
    assert.ok(tableNames.has('imm_word_line_occurrences'));
    assert.ok(tableNames.has('imm_kanji_line_occurrences'));
  } finally {
    db.close();
    cleanupDbPath(dbPath);
  }
});
// Identity handling: parsed titles that normalize to the same key must reuse a
// single imm_anime row, and a later AniList-backed call upgrades that row's
// canonical metadata in place without creating a duplicate.
test('anime rows are reused by normalized parsed title and upgraded with AniList metadata', () => {
  const dbPath = makeDbPath();
  const db = new Database(dbPath);
  try {
    ensureSchema(db);
    const firstVideoId = getOrCreateVideoRecord(db, 'local:/tmp/lwa-s02e05.mkv', {
      canonicalTitle: 'Episode 5',
      sourcePath: '/tmp/Little Witch Academia S02E05.mkv',
      sourceUrl: null,
      sourceType: SOURCE_TYPE_LOCAL,
    });
    const secondVideoId = getOrCreateVideoRecord(db, 'local:/tmp/lwa-s02e06.mkv', {
      canonicalTitle: 'Episode 6',
      sourcePath: '/tmp/Little Witch Academia S02E06.mkv',
      sourceUrl: null,
      sourceType: SOURCE_TYPE_LOCAL,
    });
    // First sighting creates a provisional row with no AniList metadata.
    const provisionalAnimeId = getOrCreateAnimeRecord(db, {
      parsedTitle: 'Little Witch Academia',
      canonicalTitle: 'Little Witch Academia',
      anilistId: null,
      titleRomaji: null,
      titleEnglish: null,
      titleNative: null,
      metadataJson: '{"source":"parsed"}',
    });
    linkVideoToAnimeRecord(db, firstVideoId, {
      animeId: provisionalAnimeId,
      parsedBasename: 'Little Witch Academia S02E05.mkv',
      parsedTitle: 'Little Witch Academia',
      parsedSeason: 2,
      parsedEpisode: 5,
      parserSource: 'fallback',
      parserConfidence: 0.6,
      parseMetadataJson: '{"source":"parsed","episode":5}',
    });
    // Different whitespace/casing must normalize to the same identity key and
    // therefore return the same anime_id.
    const reusedAnimeId = getOrCreateAnimeRecord(db, {
      parsedTitle: ' little witch academia ',
      canonicalTitle: 'Little Witch Academia',
      anilistId: null,
      titleRomaji: null,
      titleEnglish: null,
      titleNative: null,
      metadataJson: '{"source":"parsed"}',
    });
    linkVideoToAnimeRecord(db, secondVideoId, {
      animeId: reusedAnimeId,
      parsedBasename: 'Little Witch Academia S02E06.mkv',
      parsedTitle: 'Little Witch Academia',
      parsedSeason: 2,
      parsedEpisode: 6,
      parserSource: 'fallback',
      parserConfidence: 0.6,
      parseMetadataJson: '{"source":"parsed","episode":6}',
    });
    assert.equal(reusedAnimeId, provisionalAnimeId);
    // AniList metadata arriving later upgrades the provisional row in place.
    const upgradedAnimeId = getOrCreateAnimeRecord(db, {
      parsedTitle: 'Little Witch Academia',
      canonicalTitle: 'Little Witch Academia TV',
      anilistId: 33_435,
      titleRomaji: 'Little Witch Academia',
      titleEnglish: 'Little Witch Academia',
      titleNative: 'リトルウィッチアカデミア',
      metadataJson: '{"source":"anilist"}',
    });
    assert.equal(upgradedAnimeId, provisionalAnimeId);
    // Exactly one row survives, carrying the upgraded AniList metadata.
    const animeRows = db.prepare('SELECT * FROM imm_anime').all() as Array<{
      anime_id: number;
      normalized_title_key: string;
      canonical_title: string;
      anilist_id: number | null;
      title_romaji: string | null;
      title_english: string | null;
      title_native: string | null;
      metadata_json: string | null;
    }>;
    assert.equal(animeRows.length, 1);
    assert.equal(animeRows[0]?.anime_id, provisionalAnimeId);
    assert.equal(animeRows[0]?.normalized_title_key, 'little witch academia');
    assert.equal(animeRows[0]?.canonical_title, 'Little Witch Academia TV');
    assert.equal(animeRows[0]?.anilist_id, 33_435);
    assert.equal(animeRows[0]?.title_romaji, 'Little Witch Academia');
    assert.equal(animeRows[0]?.title_english, 'Little Witch Academia');
    assert.equal(animeRows[0]?.title_native, 'リトルウィッチアカデミア');
    assert.equal(animeRows[0]?.metadata_json, '{"source":"anilist"}');
    // Both videos remain linked to the single surviving anime row.
    const linkedVideos = db
      .prepare(
        `
SELECT anime_id, parsed_title, parsed_season, parsed_episode
FROM imm_videos
WHERE video_id IN (?, ?)
ORDER BY video_id
`,
      )
      .all(firstVideoId, secondVideoId) as Array<{
        anime_id: number | null;
        parsed_title: string | null;
        parsed_season: number | null;
        parsed_episode: number | null;
      }>;
    assert.deepEqual(linkedVideos, [
      {
        anime_id: provisionalAnimeId,
        parsed_title: 'Little Witch Academia',
        parsed_season: 2,
        parsed_episode: 5,
      },
      {
        anime_id: provisionalAnimeId,
        parsed_title: 'Little Witch Academia',
        parsed_season: 2,
        parsed_episode: 6,
      },
    ]);
  } finally {
    db.close();
    cleanupDbPath(dbPath);
  }
});
test('start/finalize session updates ended_at and status', () => {
const dbPath = makeDbPath();
const db = new Database(dbPath);
@@ -191,18 +678,22 @@ test('executeQueuedWrite inserts and upserts word and kanji rows', () => {
ensureSchema(db);
const stmts = createTrackerPreparedStatements(db);
stmts.wordUpsertStmt.run('猫', '猫', '', 10.0, 10.0);
stmts.wordUpsertStmt.run('猫', '猫', '', 5.0, 15.0);
stmts.wordUpsertStmt.run('猫', '猫', '', 'noun', '名詞', '一般', '', 10.0, 10.0);
stmts.wordUpsertStmt.run('猫', '猫', '', 'noun', '名詞', '一般', '', 5.0, 15.0);
stmts.kanjiUpsertStmt.run('日', 9.0, 9.0);
stmts.kanjiUpsertStmt.run('日', 8.0, 11.0);
const wordRow = db
.prepare(
'SELECT headword, frequency, first_seen, last_seen FROM imm_words WHERE headword = ?',
`SELECT headword, frequency, part_of_speech, pos1, pos2, first_seen, last_seen
FROM imm_words WHERE headword = ?`,
)
.get('猫') as {
headword: string;
frequency: number;
part_of_speech: string;
pos1: string;
pos2: string;
first_seen: number;
last_seen: number;
} | null;
@@ -218,6 +709,9 @@ test('executeQueuedWrite inserts and upserts word and kanji rows', () => {
assert.ok(wordRow);
assert.ok(kanjiRow);
assert.equal(wordRow?.frequency, 2);
assert.equal(wordRow?.part_of_speech, 'noun');
assert.equal(wordRow?.pos1, '名詞');
assert.equal(wordRow?.pos2, '一般');
assert.equal(kanjiRow?.frequency, 2);
assert.equal(wordRow?.first_seen, 5);
assert.equal(wordRow?.last_seen, 15);
@@ -228,3 +722,34 @@ test('executeQueuedWrite inserts and upserts word and kanji rows', () => {
cleanupDbPath(dbPath);
}
});
// A word first stored with the placeholder POS 'other' must be upgraded in
// place when a later sighting carries a concrete part of speech, while the
// frequency counter still accumulates both sightings.
test('word upsert replaces legacy other part_of_speech when better POS metadata arrives later', () => {
  const dbPath = makeDbPath();
  const db = new Database(dbPath);
  try {
    ensureSchema(db);
    const stmts = createTrackerPreparedStatements(db);
    // First sighting: placeholder 'other'. Second sighting: concrete 'verb'.
    stmts.wordUpsertStmt.run('知っている', '知っている', 'しっている', 'other', '動詞', '自立', '', 10, 10);
    stmts.wordUpsertStmt.run('知っている', '知っている', 'しっている', 'verb', '動詞', '自立', '', 11, 12);
    const row = db
      .prepare('SELECT frequency, part_of_speech, pos1, pos2 FROM imm_words WHERE headword = ?')
      .get('知っている') as {
        frequency: number;
        part_of_speech: string;
        pos1: string;
        pos2: string;
      } | null;
    assert.ok(row);
    // Both sightings counted; POS upgraded from 'other' to 'verb'.
    assert.equal(row?.frequency, 2);
    assert.equal(row?.part_of_speech, 'verb');
    assert.equal(row?.pos1, '動詞');
    assert.equal(row?.pos2, '自立');
  } finally {
    db.close();
    cleanupDbPath(dbPath);
  }
});

View File

@@ -1,3 +1,4 @@
import { parseMediaInfo } from '../../../jimaku/utils';
import type { DatabaseSync } from './sqlite';
import { SCHEMA_VERSION } from './types';
import type { QueuedWrite, VideoMetadata } from './types';
@@ -7,6 +8,33 @@ export interface TrackerPreparedStatements {
eventInsertStmt: ReturnType<DatabaseSync['prepare']>;
wordUpsertStmt: ReturnType<DatabaseSync['prepare']>;
kanjiUpsertStmt: ReturnType<DatabaseSync['prepare']>;
subtitleLineInsertStmt: ReturnType<DatabaseSync['prepare']>;
wordIdSelectStmt: ReturnType<DatabaseSync['prepare']>;
kanjiIdSelectStmt: ReturnType<DatabaseSync['prepare']>;
wordLineOccurrenceUpsertStmt: ReturnType<DatabaseSync['prepare']>;
kanjiLineOccurrenceUpsertStmt: ReturnType<DatabaseSync['prepare']>;
videoAnimeIdSelectStmt: ReturnType<DatabaseSync['prepare']>;
}
/**
 * Input for creating or upgrading an imm_anime row. The parsed title supplies
 * the identity key (see normalizeAnimeIdentityKey); the AniList fields are
 * null until real metadata is fetched.
 */
export interface AnimeRecordInput {
  // Series title as parsed from the filename; used for identity matching.
  parsedTitle: string;
  canonicalTitle: string;
  anilistId: number | null;
  titleRomaji: string | null;
  titleEnglish: string | null;
  titleNative: string | null;
  metadataJson: string | null;
}

/**
 * Parse results written onto an imm_videos row when linking it to an anime.
 * All fields are nullable because a video may fail to parse entirely.
 */
export interface VideoAnimeLinkInput {
  animeId: number | null;
  parsedBasename: string | null;
  parsedTitle: string | null;
  parsedSeason: number | null;
  parsedEpisode: number | null;
  // NOTE(review): observed values are 'guessit' and 'fallback' — confirm no others.
  parserSource: string | null;
  // Presumably a 0..1 score (cf. parserConfidenceToScore) — verify at call sites.
  parserConfidence: number | null;
  parseMetadataJson: string | null;
}
function hasColumn(db: DatabaseSync, tableName: string, columnName: string): boolean {
@@ -16,9 +44,14 @@ function hasColumn(db: DatabaseSync, tableName: string, columnName: string): boo
.some((row: unknown) => (row as { name: string }).name === columnName);
}
function addColumnIfMissing(db: DatabaseSync, tableName: string, columnName: string): void {
function addColumnIfMissing(
db: DatabaseSync,
tableName: string,
columnName: string,
columnType = 'INTEGER',
): void {
if (!hasColumn(db, tableName, columnName)) {
db.exec(`ALTER TABLE ${tableName} ADD COLUMN ${columnName} INTEGER`);
db.exec(`ALTER TABLE ${tableName} ADD COLUMN ${columnName} ${columnType}`);
}
}
@@ -35,6 +68,247 @@ export function applyPragmas(db: DatabaseSync): void {
db.exec('PRAGMA busy_timeout = 2500');
}
/**
 * Collapse an anime title into a canonical identity key: NFKC-fold and
 * lowercase it, replace every run of non-letter/non-digit characters with a
 * single space, then trim and squeeze whitespace. Used as the UNIQUE lookup
 * key on imm_anime.normalized_title_key.
 */
export function normalizeAnimeIdentityKey(title: string): string {
  const folded = title.normalize('NFKC').toLowerCase();
  const alnumOnly = folded.replace(/[^\p{L}\p{N}]+/gu, ' ');
  return alnumOnly.trim().replace(/\s+/g, ' ');
}
/**
 * True when a parsed "title" is really just an episode label — e.g.
 * "Episode 3", "ep12", or "第3話" — rather than a usable series name.
 * Supports 1–3 digit episode numbers in both English and Japanese forms.
 */
function looksLikeEpisodeOnlyTitle(title: string): boolean {
  const squashed = title.normalize('NFKC').toLowerCase().replace(/\s+/g, ' ').trim();
  if (/^(episode|ep)\s*\d{1,3}$/.test(squashed)) {
    return true;
  }
  return /^第\s*\d{1,3}\s*話$/.test(squashed);
}
/**
 * Map a parser confidence label to the numeric score stored in
 * imm_videos.parser_confidence (high=1, medium=0.6, anything else=0.2).
 */
function parserConfidenceToScore(confidence: 'high' | 'medium' | 'low'): number {
  if (confidence === 'high') {
    return 1;
  }
  return confidence === 'medium' ? 0.6 : 0.2;
}
/**
 * Derive an anime backfill candidate for a legacy video row during the
 * schema-v5 migration. The on-disk source path is preferred; when it yields
 * no usable series title (missing, or an episode-only label like "Episode 3")
 * the stored canonical title is parsed instead. Returns null when neither
 * input produces a usable title.
 */
function parseLegacyAnimeBackfillCandidate(
  sourcePath: string | null,
  canonicalTitle: string,
): {
  basename: string | null;
  title: string;
  season: number | null;
  episode: number | null;
  source: 'fallback';
  confidenceScore: number;
  metadataJson: string;
} | null {
  const trimmedPath = sourcePath?.trim() ?? '';
  const fromPath = trimmedPath.length > 0 ? parseMediaInfo(trimmedPath) : null;
  if (fromPath?.title && !looksLikeEpisodeOnlyTitle(fromPath.title)) {
    return {
      basename: fromPath.filename || null,
      title: fromPath.title,
      season: fromPath.season,
      episode: fromPath.episode,
      source: 'fallback',
      confidenceScore: parserConfidenceToScore(fromPath.confidence),
      metadataJson: JSON.stringify({
        confidence: fromPath.confidence,
        filename: fromPath.filename,
        rawTitle: fromPath.rawTitle,
        migrationSource: 'source_path',
      }),
    };
  }
  const trimmedTitle = canonicalTitle.trim();
  if (trimmedTitle.length === 0) {
    return null;
  }
  const fromTitle = parseMediaInfo(trimmedTitle);
  if (!fromTitle.title || looksLikeEpisodeOnlyTitle(fromTitle.title)) {
    return null;
  }
  return {
    basename: null,
    title: fromTitle.title,
    season: fromTitle.season,
    episode: fromTitle.episode,
    source: 'fallback',
    confidenceScore: parserConfidenceToScore(fromTitle.confidence),
    metadataJson: JSON.stringify({
      confidence: fromTitle.confidence,
      filename: fromTitle.filename,
      rawTitle: fromTitle.rawTitle,
      migrationSource: 'canonical_title',
    }),
  };
}
/**
 * Resolve (or lazily create) the imm_anime row for `input` and return its id.
 *
 * Lookup prefers an exact anilist_id match; otherwise it falls back to the
 * normalized title key. On a hit, incoming non-null fields overwrite the
 * stored metadata (COALESCE keeps existing values; canonical_title is only
 * replaced when the incoming value is non-empty). On a miss a new row is
 * inserted with CREATED_DATE/LAST_UPDATE_DATE set to now.
 *
 * @throws Error when parsedTitle normalizes to an empty identity key.
 */
export function getOrCreateAnimeRecord(db: DatabaseSync, input: AnimeRecordInput): number {
  const normalizedTitleKey = normalizeAnimeIdentityKey(input.parsedTitle);
  if (!normalizedTitleKey) {
    throw new Error('parsedTitle is required to create or update an anime record');
  }
  // AniList id is the strongest identity signal; use it first when provided.
  const byAnilistId =
    input.anilistId !== null
      ? (db.prepare('SELECT anime_id FROM imm_anime WHERE anilist_id = ?').get(input.anilistId) as {
          anime_id: number;
        } | null)
      : null;
  // Fallback identity: the normalized-title key (UNIQUE column on imm_anime).
  const byNormalizedTitle = db
    .prepare('SELECT anime_id FROM imm_anime WHERE normalized_title_key = ?')
    .get(normalizedTitleKey) as { anime_id: number } | null;
  const existing = byAnilistId ?? byNormalizedTitle;
  if (existing?.anime_id) {
    // Merge-update: each COALESCE keeps the stored value unless the incoming
    // one is non-null (NULLIF additionally rejects an empty canonical_title).
    db.prepare(
      `
      UPDATE imm_anime
      SET
        canonical_title = COALESCE(NULLIF(?, ''), canonical_title),
        anilist_id = COALESCE(?, anilist_id),
        title_romaji = COALESCE(?, title_romaji),
        title_english = COALESCE(?, title_english),
        title_native = COALESCE(?, title_native),
        metadata_json = COALESCE(?, metadata_json),
        LAST_UPDATE_DATE = ?
      WHERE anime_id = ?
      `,
    ).run(
      input.canonicalTitle,
      input.anilistId,
      input.titleRomaji,
      input.titleEnglish,
      input.titleNative,
      input.metadataJson,
      Date.now(),
      existing.anime_id,
    );
    return existing.anime_id;
  }
  const nowMs = Date.now();
  const result = db
    .prepare(
      `
      INSERT INTO imm_anime(
        normalized_title_key,
        canonical_title,
        anilist_id,
        title_romaji,
        title_english,
        title_native,
        metadata_json,
        CREATED_DATE,
        LAST_UPDATE_DATE
      ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)
      `,
    )
    .run(
      normalizedTitleKey,
      input.canonicalTitle,
      input.anilistId,
      input.titleRomaji,
      input.titleEnglish,
      input.titleNative,
      input.metadataJson,
      nowMs,
      nowMs,
    );
  return Number(result.lastInsertRowid);
}
/**
 * Write parsed-filename metadata (and the resolved anime foreign key, when
 * any) onto an existing imm_videos row, refreshing its LAST_UPDATE_DATE.
 * Null fields in `input` clear the corresponding columns.
 */
export function linkVideoToAnimeRecord(
  db: DatabaseSync,
  videoId: number,
  input: VideoAnimeLinkInput,
): void {
  const updateStmt = db.prepare(
    `
    UPDATE imm_videos
    SET
      anime_id = ?,
      parsed_basename = ?,
      parsed_title = ?,
      parsed_season = ?,
      parsed_episode = ?,
      parser_source = ?,
      parser_confidence = ?,
      parse_metadata_json = ?,
      LAST_UPDATE_DATE = ?
    WHERE video_id = ?
    `,
  );
  updateStmt.run(
    input.animeId,
    input.parsedBasename,
    input.parsedTitle,
    input.parsedSeason,
    input.parsedEpisode,
    input.parserSource,
    input.parserConfidence,
    input.parseMetadataJson,
    Date.now(),
    videoId,
  );
}
/**
 * Schema-v5 backfill: add the anime-related columns to imm_videos, then parse
 * each not-yet-linked legacy row's source path / canonical title into an
 * imm_anime record and link the video to it. Rows that yield no usable
 * series title are left unlinked.
 */
function migrateLegacyAnimeMetadata(db: DatabaseSync): void {
  // New columns introduced by this migration, as (name, type) pairs.
  const animeColumns: ReadonlyArray<readonly [string, string]> = [
    ['anime_id', 'INTEGER REFERENCES imm_anime(anime_id)'],
    ['parsed_basename', 'TEXT'],
    ['parsed_title', 'TEXT'],
    ['parsed_season', 'INTEGER'],
    ['parsed_episode', 'INTEGER'],
    ['parser_source', 'TEXT'],
    ['parser_confidence', 'REAL'],
    ['parse_metadata_json', 'TEXT'],
  ];
  for (const [column, columnType] of animeColumns) {
    addColumnIfMissing(db, 'imm_videos', column, columnType);
  }
  const unlinkedVideos = db
    .prepare(
      `
      SELECT video_id, source_path, canonical_title
      FROM imm_videos
      WHERE anime_id IS NULL
      `,
    )
    .all() as Array<{
    video_id: number;
    source_path: string | null;
    canonical_title: string;
  }>;
  for (const video of unlinkedVideos) {
    const candidate = parseLegacyAnimeBackfillCandidate(video.source_path, video.canonical_title);
    if (candidate === null) {
      continue;
    }
    // The parsed title acts as both identity and display title; no AniList
    // data is available at migration time.
    const animeId = getOrCreateAnimeRecord(db, {
      parsedTitle: candidate.title,
      canonicalTitle: candidate.title,
      anilistId: null,
      titleRomaji: null,
      titleEnglish: null,
      titleNative: null,
      metadataJson: candidate.metadataJson,
    });
    linkVideoToAnimeRecord(db, video.video_id, {
      animeId,
      parsedBasename: candidate.basename,
      parsedTitle: candidate.title,
      parsedSeason: candidate.season,
      parsedEpisode: candidate.episode,
      parserSource: candidate.source,
      parserConfidence: candidate.confidenceScore,
      parseMetadataJson: candidate.metadataJson,
    });
  }
}
export function ensureSchema(db: DatabaseSync): void {
db.exec(`
CREATE TABLE IF NOT EXISTS imm_schema_version (
@@ -61,14 +335,38 @@ export function ensureSchema(db: DatabaseSync): void {
return;
}
db.exec(`
CREATE TABLE IF NOT EXISTS imm_anime(
anime_id INTEGER PRIMARY KEY AUTOINCREMENT,
normalized_title_key TEXT NOT NULL UNIQUE,
canonical_title TEXT NOT NULL,
anilist_id INTEGER UNIQUE,
title_romaji TEXT,
title_english TEXT,
title_native TEXT,
episodes_total INTEGER,
metadata_json TEXT,
CREATED_DATE INTEGER,
LAST_UPDATE_DATE INTEGER
);
`);
db.exec(`
CREATE TABLE IF NOT EXISTS imm_videos(
video_id INTEGER PRIMARY KEY AUTOINCREMENT,
video_key TEXT NOT NULL UNIQUE,
anime_id INTEGER,
canonical_title TEXT NOT NULL,
source_type INTEGER NOT NULL,
source_path TEXT,
source_url TEXT,
parsed_basename TEXT,
parsed_title TEXT,
parsed_season INTEGER,
parsed_episode INTEGER,
parser_source TEXT,
parser_confidence REAL,
parse_metadata_json TEXT,
watched INTEGER NOT NULL DEFAULT 0,
duration_ms INTEGER NOT NULL CHECK(duration_ms>=0),
file_size_bytes INTEGER CHECK(file_size_bytes>=0),
codec_id INTEGER, container_id INTEGER,
@@ -77,7 +375,8 @@ export function ensureSchema(db: DatabaseSync): void {
hash_sha256 TEXT, screenshot_path TEXT,
metadata_json TEXT,
CREATED_DATE INTEGER,
LAST_UPDATE_DATE INTEGER
LAST_UPDATE_DATE INTEGER,
FOREIGN KEY(anime_id) REFERENCES imm_anime(anime_id) ON DELETE SET NULL
);
`);
db.exec(`
@@ -173,6 +472,10 @@ export function ensureSchema(db: DatabaseSync): void {
headword TEXT,
word TEXT,
reading TEXT,
part_of_speech TEXT,
pos1 TEXT,
pos2 TEXT,
pos3 TEXT,
first_seen REAL,
last_seen REAL,
frequency INTEGER,
@@ -189,42 +492,59 @@ export function ensureSchema(db: DatabaseSync): void {
UNIQUE(kanji)
);
`);
db.exec(`
CREATE INDEX IF NOT EXISTS idx_sessions_video_started
ON imm_sessions(video_id, started_at_ms DESC)
CREATE TABLE IF NOT EXISTS imm_subtitle_lines(
line_id INTEGER PRIMARY KEY AUTOINCREMENT,
session_id INTEGER NOT NULL,
event_id INTEGER,
video_id INTEGER NOT NULL,
anime_id INTEGER,
line_index INTEGER NOT NULL,
segment_start_ms INTEGER,
segment_end_ms INTEGER,
text TEXT NOT NULL,
CREATED_DATE INTEGER,
LAST_UPDATE_DATE INTEGER,
FOREIGN KEY(session_id) REFERENCES imm_sessions(session_id) ON DELETE CASCADE,
FOREIGN KEY(event_id) REFERENCES imm_session_events(event_id) ON DELETE SET NULL,
FOREIGN KEY(video_id) REFERENCES imm_videos(video_id) ON DELETE CASCADE,
FOREIGN KEY(anime_id) REFERENCES imm_anime(anime_id) ON DELETE SET NULL
);
`);
db.exec(`
CREATE INDEX IF NOT EXISTS idx_sessions_status_started
ON imm_sessions(status, started_at_ms DESC)
CREATE TABLE IF NOT EXISTS imm_word_line_occurrences(
line_id INTEGER NOT NULL,
word_id INTEGER NOT NULL,
occurrence_count INTEGER NOT NULL,
PRIMARY KEY(line_id, word_id),
FOREIGN KEY(line_id) REFERENCES imm_subtitle_lines(line_id) ON DELETE CASCADE,
FOREIGN KEY(word_id) REFERENCES imm_words(id) ON DELETE CASCADE
);
`);
db.exec(`
CREATE INDEX IF NOT EXISTS idx_telemetry_session_sample
ON imm_session_telemetry(session_id, sample_ms DESC)
CREATE TABLE IF NOT EXISTS imm_kanji_line_occurrences(
line_id INTEGER NOT NULL,
kanji_id INTEGER NOT NULL,
occurrence_count INTEGER NOT NULL,
PRIMARY KEY(line_id, kanji_id),
FOREIGN KEY(line_id) REFERENCES imm_subtitle_lines(line_id) ON DELETE CASCADE,
FOREIGN KEY(kanji_id) REFERENCES imm_kanji(id) ON DELETE CASCADE
);
`);
db.exec(`
CREATE INDEX IF NOT EXISTS idx_events_session_ts
ON imm_session_events(session_id, ts_ms DESC)
`);
db.exec(`
CREATE INDEX IF NOT EXISTS idx_events_type_ts
ON imm_session_events(event_type, ts_ms DESC)
`);
db.exec(`
CREATE INDEX IF NOT EXISTS idx_rollups_day_video
ON imm_daily_rollups(rollup_day, video_id)
`);
db.exec(`
CREATE INDEX IF NOT EXISTS idx_rollups_month_video
ON imm_monthly_rollups(rollup_month, video_id)
`);
db.exec(`
CREATE INDEX IF NOT EXISTS idx_words_headword_word_reading
ON imm_words(headword, word, reading)
`);
db.exec(`
CREATE INDEX IF NOT EXISTS idx_kanji_kanji
ON imm_kanji(kanji)
CREATE TABLE IF NOT EXISTS imm_media_art(
video_id INTEGER PRIMARY KEY,
anilist_id INTEGER,
cover_url TEXT,
cover_blob BLOB,
title_romaji TEXT,
title_english TEXT,
episodes_total INTEGER,
fetched_at_ms INTEGER NOT NULL,
CREATED_DATE INTEGER,
LAST_UPDATE_DATE INTEGER,
FOREIGN KEY(video_id) REFERENCES imm_videos(video_id) ON DELETE CASCADE
);
`);
if (currentVersion?.schema_version === 1) {
@@ -299,6 +619,134 @@ export function ensureSchema(db: DatabaseSync): void {
dropColumnIfExists(db, 'imm_sessions', 'updated_at_ms');
}
if (currentVersion?.schema_version && currentVersion.schema_version < 5) {
migrateLegacyAnimeMetadata(db);
}
if (currentVersion?.schema_version && currentVersion.schema_version < 6) {
addColumnIfMissing(db, 'imm_words', 'part_of_speech', 'TEXT');
addColumnIfMissing(db, 'imm_words', 'pos1', 'TEXT');
addColumnIfMissing(db, 'imm_words', 'pos2', 'TEXT');
addColumnIfMissing(db, 'imm_words', 'pos3', 'TEXT');
}
if (currentVersion?.schema_version && currentVersion.schema_version < 7) {
db.exec(`
CREATE TABLE IF NOT EXISTS imm_subtitle_lines(
line_id INTEGER PRIMARY KEY AUTOINCREMENT,
session_id INTEGER NOT NULL,
event_id INTEGER,
video_id INTEGER NOT NULL,
anime_id INTEGER,
line_index INTEGER NOT NULL,
segment_start_ms INTEGER,
segment_end_ms INTEGER,
text TEXT NOT NULL,
CREATED_DATE INTEGER,
LAST_UPDATE_DATE INTEGER,
FOREIGN KEY(session_id) REFERENCES imm_sessions(session_id) ON DELETE CASCADE,
FOREIGN KEY(event_id) REFERENCES imm_session_events(event_id) ON DELETE SET NULL,
FOREIGN KEY(video_id) REFERENCES imm_videos(video_id) ON DELETE CASCADE,
FOREIGN KEY(anime_id) REFERENCES imm_anime(anime_id) ON DELETE SET NULL
)
`);
db.exec(`
CREATE TABLE IF NOT EXISTS imm_word_line_occurrences(
line_id INTEGER NOT NULL,
word_id INTEGER NOT NULL,
occurrence_count INTEGER NOT NULL,
PRIMARY KEY(line_id, word_id),
FOREIGN KEY(line_id) REFERENCES imm_subtitle_lines(line_id) ON DELETE CASCADE,
FOREIGN KEY(word_id) REFERENCES imm_words(id) ON DELETE CASCADE
)
`);
db.exec(`
CREATE TABLE IF NOT EXISTS imm_kanji_line_occurrences(
line_id INTEGER NOT NULL,
kanji_id INTEGER NOT NULL,
occurrence_count INTEGER NOT NULL,
PRIMARY KEY(line_id, kanji_id),
FOREIGN KEY(line_id) REFERENCES imm_subtitle_lines(line_id) ON DELETE CASCADE,
FOREIGN KEY(kanji_id) REFERENCES imm_kanji(id) ON DELETE CASCADE
)
`);
}
db.exec(`
CREATE INDEX IF NOT EXISTS idx_anime_normalized_title
ON imm_anime(normalized_title_key)
`);
db.exec(`
CREATE INDEX IF NOT EXISTS idx_anime_anilist_id
ON imm_anime(anilist_id)
`);
db.exec(`
CREATE INDEX IF NOT EXISTS idx_videos_anime_id
ON imm_videos(anime_id)
`);
db.exec(`
CREATE INDEX IF NOT EXISTS idx_sessions_video_started
ON imm_sessions(video_id, started_at_ms DESC)
`);
db.exec(`
CREATE INDEX IF NOT EXISTS idx_sessions_status_started
ON imm_sessions(status, started_at_ms DESC)
`);
db.exec(`
CREATE INDEX IF NOT EXISTS idx_telemetry_session_sample
ON imm_session_telemetry(session_id, sample_ms DESC)
`);
db.exec(`
CREATE INDEX IF NOT EXISTS idx_events_session_ts
ON imm_session_events(session_id, ts_ms DESC)
`);
db.exec(`
CREATE INDEX IF NOT EXISTS idx_events_type_ts
ON imm_session_events(event_type, ts_ms DESC)
`);
db.exec(`
CREATE INDEX IF NOT EXISTS idx_rollups_day_video
ON imm_daily_rollups(rollup_day, video_id)
`);
db.exec(`
CREATE INDEX IF NOT EXISTS idx_rollups_month_video
ON imm_monthly_rollups(rollup_month, video_id)
`);
db.exec(`
CREATE INDEX IF NOT EXISTS idx_words_headword_word_reading
ON imm_words(headword, word, reading)
`);
db.exec(`
CREATE INDEX IF NOT EXISTS idx_kanji_kanji
ON imm_kanji(kanji)
`);
db.exec(`
CREATE INDEX IF NOT EXISTS idx_subtitle_lines_session_line
ON imm_subtitle_lines(session_id, line_index)
`);
db.exec(`
CREATE INDEX IF NOT EXISTS idx_subtitle_lines_video_line
ON imm_subtitle_lines(video_id, line_index)
`);
db.exec(`
CREATE INDEX IF NOT EXISTS idx_subtitle_lines_anime_line
ON imm_subtitle_lines(anime_id, line_index)
`);
db.exec(`
CREATE INDEX IF NOT EXISTS idx_word_line_occurrences_word
ON imm_word_line_occurrences(word_id, line_id)
`);
db.exec(`
CREATE INDEX IF NOT EXISTS idx_kanji_line_occurrences_kanji
ON imm_kanji_line_occurrences(kanji_id, line_id)
`);
if (currentVersion?.schema_version && currentVersion.schema_version < SCHEMA_VERSION) {
db.exec('DELETE FROM imm_daily_rollups');
db.exec('DELETE FROM imm_monthly_rollups');
db.exec(`UPDATE imm_rollup_state SET state_value = 0 WHERE state_key = 'last_rollup_sample_ms'`);
}
db.exec(`
INSERT INTO imm_schema_version(schema_version, applied_at_ms)
VALUES (${SCHEMA_VERSION}, ${Date.now()})
@@ -328,12 +776,21 @@ export function createTrackerPreparedStatements(db: DatabaseSync): TrackerPrepar
`),
wordUpsertStmt: db.prepare(`
INSERT INTO imm_words (
headword, word, reading, first_seen, last_seen, frequency
headword, word, reading, part_of_speech, pos1, pos2, pos3, first_seen, last_seen, frequency
) VALUES (
?, ?, ?, ?, ?, 1
?, ?, ?, ?, ?, ?, ?, ?, ?, 1
)
ON CONFLICT(headword, word, reading) DO UPDATE SET
frequency = COALESCE(frequency, 0) + 1,
part_of_speech = CASE
WHEN COALESCE(NULLIF(imm_words.part_of_speech, ''), 'other') = 'other'
AND COALESCE(NULLIF(excluded.part_of_speech, ''), '') <> ''
THEN excluded.part_of_speech
ELSE imm_words.part_of_speech
END,
pos1 = COALESCE(NULLIF(imm_words.pos1, ''), excluded.pos1),
pos2 = COALESCE(NULLIF(imm_words.pos2, ''), excluded.pos2),
pos3 = COALESCE(NULLIF(imm_words.pos3, ''), excluded.pos3),
first_seen = MIN(COALESCE(first_seen, excluded.first_seen), excluded.first_seen),
last_seen = MAX(COALESCE(last_seen, excluded.last_seen), excluded.last_seen)
`),
@@ -348,9 +805,93 @@ export function createTrackerPreparedStatements(db: DatabaseSync): TrackerPrepar
first_seen = MIN(COALESCE(first_seen, excluded.first_seen), excluded.first_seen),
last_seen = MAX(COALESCE(last_seen, excluded.last_seen), excluded.last_seen)
`),
subtitleLineInsertStmt: db.prepare(`
INSERT INTO imm_subtitle_lines (
session_id, event_id, video_id, anime_id, line_index, segment_start_ms,
segment_end_ms, text, CREATED_DATE, LAST_UPDATE_DATE
) VALUES (
?, ?, ?, ?, ?, ?, ?, ?, ?, ?
)
`),
wordIdSelectStmt: db.prepare(`
SELECT id FROM imm_words
WHERE headword = ? AND word = ? AND reading = ?
`),
kanjiIdSelectStmt: db.prepare(`
SELECT id FROM imm_kanji
WHERE kanji = ?
`),
wordLineOccurrenceUpsertStmt: db.prepare(`
INSERT INTO imm_word_line_occurrences (
line_id, word_id, occurrence_count
) VALUES (
?, ?, ?
)
ON CONFLICT(line_id, word_id) DO UPDATE SET
occurrence_count = imm_word_line_occurrences.occurrence_count + excluded.occurrence_count
`),
kanjiLineOccurrenceUpsertStmt: db.prepare(`
INSERT INTO imm_kanji_line_occurrences (
line_id, kanji_id, occurrence_count
) VALUES (
?, ?, ?
)
ON CONFLICT(line_id, kanji_id) DO UPDATE SET
occurrence_count = imm_kanji_line_occurrences.occurrence_count + excluded.occurrence_count
`),
videoAnimeIdSelectStmt: db.prepare(`
SELECT anime_id FROM imm_videos
WHERE video_id = ?
`),
};
}
/**
 * Bump the aggregate imm_words row once per counted occurrence (so frequency
 * accumulates through the upsert's ON CONFLICT path), then resolve the row id
 * so line-level occurrence rows can reference it.
 *
 * @throws Error when the row cannot be found after the upserts.
 */
function incrementWordAggregate(
  stmts: TrackerPreparedStatements,
  occurrence: Extract<QueuedWrite, { kind: 'subtitleLine' }>['wordOccurrences'][number],
  firstSeen: number,
  lastSeen: number,
): number {
  const { headword, word, reading, partOfSpeech, pos1, pos2, pos3, occurrenceCount } = occurrence;
  let remaining = occurrenceCount;
  while (remaining > 0) {
    stmts.wordUpsertStmt.run(
      headword,
      word,
      reading,
      partOfSpeech,
      pos1,
      pos2,
      pos3,
      firstSeen,
      lastSeen,
    );
    remaining -= 1;
  }
  const resolved = stmts.wordIdSelectStmt.get(headword, word, reading) as { id: number } | null;
  if (!resolved?.id) {
    throw new Error(`Failed to resolve imm_words id for ${headword}`);
  }
  return resolved.id;
}
/**
 * Bump the aggregate imm_kanji row once per counted occurrence, then resolve
 * its row id for the line-occurrence table.
 *
 * @throws Error when the row cannot be found after the upserts.
 */
function incrementKanjiAggregate(
  stmts: TrackerPreparedStatements,
  occurrence: Extract<QueuedWrite, { kind: 'subtitleLine' }>['kanjiOccurrences'][number],
  firstSeen: number,
  lastSeen: number,
): number {
  const { kanji, occurrenceCount } = occurrence;
  let remaining = occurrenceCount;
  while (remaining > 0) {
    stmts.kanjiUpsertStmt.run(kanji, firstSeen, lastSeen);
    remaining -= 1;
  }
  const resolved = stmts.kanjiIdSelectStmt.get(kanji) as { id: number } | null;
  if (!resolved?.id) {
    throw new Error(`Failed to resolve imm_kanji id for ${kanji}`);
  }
  return resolved.id;
}
export function executeQueuedWrite(write: QueuedWrite, stmts: TrackerPreparedStatements): void {
if (write.kind === 'telemetry') {
stmts.telemetryInsertStmt.run(
@@ -379,6 +920,10 @@ export function executeQueuedWrite(write: QueuedWrite, stmts: TrackerPreparedSta
write.headword,
write.word,
write.reading,
write.partOfSpeech,
write.pos1,
write.pos2,
write.pos3,
write.firstSeen,
write.lastSeen,
);
@@ -388,6 +933,31 @@ export function executeQueuedWrite(write: QueuedWrite, stmts: TrackerPreparedSta
stmts.kanjiUpsertStmt.run(write.kanji, write.firstSeen, write.lastSeen);
return;
}
if (write.kind === 'subtitleLine') {
const animeRow = stmts.videoAnimeIdSelectStmt.get(write.videoId) as { anime_id: number | null } | null;
const lineResult = stmts.subtitleLineInsertStmt.run(
write.sessionId,
null,
write.videoId,
animeRow?.anime_id ?? null,
write.lineIndex,
write.segmentStartMs ?? null,
write.segmentEndMs ?? null,
write.text,
Date.now(),
Date.now(),
);
const lineId = Number(lineResult.lastInsertRowid);
for (const occurrence of write.wordOccurrences) {
const wordId = incrementWordAggregate(stmts, occurrence, write.firstSeen, write.lastSeen);
stmts.wordLineOccurrenceUpsertStmt.run(lineId, wordId, occurrence.occurrenceCount);
}
for (const occurrence of write.kanjiOccurrences) {
const kanjiId = incrementKanjiAggregate(stmts, occurrence, write.firstSeen, write.lastSeen);
stmts.kanjiLineOccurrenceUpsertStmt.run(lineId, kanjiId, occurrence.occurrenceCount);
}
return;
}
stmts.eventInsertStmt.run(
write.sessionId,

View File

@@ -1,4 +1,4 @@
export const SCHEMA_VERSION = 3;
export const SCHEMA_VERSION = 7;
export const DEFAULT_QUEUE_CAP = 1_000;
export const DEFAULT_BATCH_SIZE = 25;
export const DEFAULT_FLUSH_INTERVAL_MS = 500;
@@ -29,6 +29,9 @@ export const EVENT_PAUSE_END = 8;
export interface ImmersionTrackerOptions {
dbPath: string;
policy?: ImmersionTrackerPolicy;
resolveLegacyVocabularyPos?: (
row: LegacyVocabularyPosRow,
) => Promise<LegacyVocabularyPosResolution | null>;
}
export interface ImmersionTrackerPolicy {
@@ -72,6 +75,7 @@ export interface SessionState extends TelemetryAccumulator {
lastPauseStartMs: number | null;
isPaused: boolean;
pendingTelemetry: boolean;
markedWatched: boolean;
}
interface QueuedTelemetryWrite {
@@ -118,6 +122,10 @@ interface QueuedWordWrite {
headword: string;
word: string;
reading: string;
partOfSpeech: string;
pos1: string;
pos2: string;
pos3: string;
firstSeen: number;
lastSeen: number;
}
@@ -129,11 +137,42 @@ interface QueuedKanjiWrite {
lastSeen: number;
}
/** One distinct word token observed in a subtitle line, with its tally. */
export interface CountedWordOccurrence {
  headword: string;
  word: string;
  reading: string;
  partOfSpeech: string;
  pos1: string;
  pos2: string;
  pos3: string;
  /** Number of times this word appears in the line (>= 1). */
  occurrenceCount: number;
}

/** One distinct kanji character observed in a subtitle line, with its tally. */
export interface CountedKanjiOccurrence {
  kanji: string;
  occurrenceCount: number;
}

/**
 * Queued write for a subtitle line plus its per-line word/kanji occurrence
 * rows (persisted into imm_subtitle_lines and the *_line_occurrences tables).
 */
interface QueuedSubtitleLineWrite {
  kind: 'subtitleLine';
  sessionId: number;
  videoId: number;
  lineIndex: number;
  segmentStartMs: number | null;
  segmentEndMs: number | null;
  text: string;
  wordOccurrences: CountedWordOccurrence[];
  kanjiOccurrences: CountedKanjiOccurrence[];
  /** Timestamps fed into the word/kanji aggregate upserts for this line. */
  firstSeen: number;
  lastSeen: number;
}
export type QueuedWrite =
| QueuedTelemetryWrite
| QueuedEventWrite
| QueuedWordWrite
| QueuedKanjiWrite;
| QueuedKanjiWrite
| QueuedSubtitleLineWrite;
export interface VideoMetadata {
sourceType: number;
@@ -152,8 +191,33 @@ export interface VideoMetadata {
metadataJson: string | null;
}
/**
 * Filename-parse metadata stored on an imm_videos row; all-null when no parse
 * succeeded.
 */
export interface ParsedAnimeVideoMetadata {
  animeId: number | null;
  parsedBasename: string | null;
  parsedTitle: string | null;
  parsedSeason: number | null;
  parsedEpisode: number | null;
  parserSource: string | null;
  parserConfidence: number | null;
  parseMetadataJson: string | null;
}

/**
 * A successful filename parse before it is linked to an anime record —
 * parsedTitle is required and parserSource names the parser that produced it.
 */
export interface ParsedAnimeVideoGuess {
  parsedBasename: string | null;
  parsedTitle: string;
  parsedSeason: number | null;
  parsedEpisode: number | null;
  parserSource: 'guessit' | 'fallback';
  parserConfidence: number;
  parseMetadataJson: string;
}
export interface SessionSummaryQueryRow {
sessionId: number;
videoId: number | null;
canonicalTitle: string | null;
animeId: number | null;
animeTitle: string | null;
startedAtMs: number;
endedAtMs: number | null;
totalWatchedMs: number;
@@ -166,6 +230,82 @@ export interface SessionSummaryQueryRow {
lookupHits: number;
}
/** Aggregate stats for one imm_words row (vocabulary listing queries). */
export interface VocabularyStatsRow {
  wordId: number;
  headword: string;
  word: string;
  reading: string;
  partOfSpeech: string | null;
  pos1: string | null;
  pos2: string | null;
  pos3: string | null;
  frequency: number;
  firstSeen: number;
  lastSeen: number;
}

/** Result counts from a vocabulary cleanup pass. */
export interface VocabularyCleanupSummary {
  scanned: number;
  kept: number;
  deleted: number;
  repaired: number;
}

/** A legacy imm_words row missing POS data, handed to the resolver callback. */
export interface LegacyVocabularyPosRow {
  headword: string;
  word: string;
  reading: string | null;
}

/** Resolved POS metadata returned by the legacy-vocabulary resolver callback. */
export interface LegacyVocabularyPosResolution {
  headword: string;
  reading: string;
  partOfSpeech: string;
  pos1: string;
  pos2: string;
  pos3: string;
}

/** Aggregate stats for one imm_kanji row. */
export interface KanjiStatsRow {
  kanjiId: number;
  kanji: string;
  frequency: number;
  firstSeen: number;
  lastSeen: number;
}

/** One subtitle line in which a given word occurs, with media context. */
export interface WordOccurrenceRow {
  animeId: number | null;
  animeTitle: string | null;
  videoId: number;
  videoTitle: string;
  sessionId: number;
  lineIndex: number;
  segmentStartMs: number | null;
  segmentEndMs: number | null;
  text: string;
  occurrenceCount: number;
}

/** One subtitle line in which a given kanji occurs, with media context. */
export interface KanjiOccurrenceRow {
  animeId: number | null;
  animeTitle: string | null;
  videoId: number;
  videoTitle: string;
  sessionId: number;
  lineIndex: number;
  segmentStartMs: number | null;
  segmentEndMs: number | null;
  text: string;
  occurrenceCount: number;
}

/** A single imm_session_events row (payload is opaque JSON or null). */
export interface SessionEventRow {
  eventType: number;
  tsMs: number;
  payload: string | null;
}
export interface SessionTimelineRow {
sampleMs: number;
totalWatchedMs: number;
@@ -200,3 +340,180 @@ export interface ProbeMetadata {
bitrateKbps: number | null;
audioCodecId: number | null;
}
/** Cached AniList cover art + titles for one video (imm_media_art row). */
export interface MediaArtRow {
  videoId: number;
  anilistId: number | null;
  coverUrl: string | null;
  coverBlob: Buffer | null;
  titleRomaji: string | null;
  titleEnglish: string | null;
  episodesTotal: number | null;
  /** When the art/metadata was last fetched from AniList. */
  fetchedAtMs: number;
}

/** Per-video aggregate row for the media library listing. */
export interface MediaLibraryRow {
  videoId: number;
  canonicalTitle: string;
  totalSessions: number;
  totalActiveMs: number;
  totalCards: number;
  totalWordsSeen: number;
  lastWatchedMs: number;
  /** SQLite boolean-as-integer: 1 when a cover blob/url exists, else 0. */
  hasCoverArt: number;
}

/** Per-video aggregate row for the media detail view. */
export interface MediaDetailRow {
  videoId: number;
  canonicalTitle: string;
  totalSessions: number;
  totalActiveMs: number;
  totalCards: number;
  totalWordsSeen: number;
  totalLinesSeen: number;
  totalLookupCount: number;
  totalLookupHits: number;
}
/** Per-anime aggregate row for the anime library listing. */
export interface AnimeLibraryRow {
  animeId: number;
  canonicalTitle: string;
  anilistId: number | null;
  totalSessions: number;
  totalActiveMs: number;
  totalCards: number;
  totalWordsSeen: number;
  /** Distinct episodes (videos) tracked locally for this anime. */
  episodeCount: number;
  /** Total episodes per AniList metadata; null when unknown. */
  episodesTotal: number | null;
  lastWatchedMs: number;
}

/** Per-anime aggregate row for the anime detail view. */
export interface AnimeDetailRow {
  animeId: number;
  canonicalTitle: string;
  anilistId: number | null;
  titleRomaji: string | null;
  titleEnglish: string | null;
  titleNative: string | null;
  totalSessions: number;
  totalActiveMs: number;
  totalCards: number;
  totalWordsSeen: number;
  totalLinesSeen: number;
  totalLookupCount: number;
  totalLookupHits: number;
  episodeCount: number;
  lastWatchedMs: number;
}

/** Minimal AniList identity for an anime (used when refreshing metadata). */
export interface AnimeAnilistEntryRow {
  anilistId: number;
  titleRomaji: string | null;
  titleEnglish: string | null;
  season: number | null;
}

/** One tracked episode (video) of an anime, with its watch aggregates. */
export interface AnimeEpisodeRow {
  animeId: number;
  videoId: number;
  canonicalTitle: string;
  parsedTitle: string | null;
  season: number | null;
  episode: number | null;
  durationMs: number;
  /** SQLite boolean-as-integer watched flag. */
  watched: number;
  totalSessions: number;
  totalActiveMs: number;
  totalCards: number;
  totalWordsSeen: number;
  lastWatchedMs: number;
}
/** One day of active-watch totals for the streak calendar. */
export interface StreakCalendarRow {
  /** Days since the Unix epoch. */
  epochDay: number;
  totalActiveMin: number;
}

/** A word encountered in a given anime, for the anime detail view. */
export interface AnimeWordRow {
  wordId: number;
  headword: string;
  word: string;
  reading: string;
  partOfSpeech: string | null;
  frequency: number;
}

/** Episodes watched per day (trend chart datum). */
export interface EpisodesPerDayRow {
  epochDay: number;
  episodeCount: number;
}

/** Newly started anime per day (trend chart datum). */
export interface NewAnimePerDayRow {
  epochDay: number;
  newAnimeCount: number;
}

/** Active watch minutes per anime per day (trend chart datum). */
export interface WatchTimePerAnimeRow {
  epochDay: number;
  animeId: number;
  animeTitle: string;
  totalActiveMin: number;
}

/** Full detail for one word, including POS metadata and seen-range. */
export interface WordDetailRow {
  wordId: number;
  headword: string;
  word: string;
  reading: string;
  partOfSpeech: string | null;
  pos1: string | null;
  pos2: string | null;
  pos3: string | null;
  frequency: number;
  firstSeen: number;
  lastSeen: number;
}

/** How often a word appears within a given anime. */
export interface WordAnimeAppearanceRow {
  animeId: number;
  animeTitle: string;
  occurrenceCount: number;
}

/** A related word surfaced on the word detail page. */
export interface SimilarWordRow {
  wordId: number;
  headword: string;
  word: string;
  reading: string;
  frequency: number;
}

/** Full detail for one kanji, including seen-range. */
export interface KanjiDetailRow {
  kanjiId: number;
  kanji: string;
  frequency: number;
  firstSeen: number;
  lastSeen: number;
}

/** How often a kanji appears within a given anime. */
export interface KanjiAnimeAppearanceRow {
  animeId: number;
  animeTitle: string;
  occurrenceCount: number;
}

/** A word containing a given kanji, for the kanji detail page. */
export interface KanjiWordRow {
  wordId: number;
  headword: string;
  word: string;
  reading: string;
  frequency: number;
}

/** A card-mining event within an episode, with the Anki note ids created. */
export interface EpisodeCardEventRow {
  eventId: number;
  sessionId: number;
  tsMs: number;
  cardsDelta: number;
  noteIds: number[];
}

View File

@@ -133,6 +133,17 @@ function isFrequencyExcludedByPos(
);
}
/**
 * True when a merged token's pos1/pos2 tags match the configured exclusion
 * sets and it should therefore not be persisted as vocabulary. Reuses the
 * same predicate that drives frequency-marking exclusion, so the two stay
 * consistent.
 */
export function shouldExcludeTokenFromVocabularyPersistence(
  token: MergedToken,
  options: Pick<AnnotationStageOptions, 'pos1Exclusions' | 'pos2Exclusions'> = {},
): boolean {
  return isFrequencyExcludedByPos(
    token,
    resolvePos1Exclusions(options),
    resolvePos2Exclusions(options),
  );
}
function applyFrequencyMarking(
tokens: MergedToken[],
pos1Exclusions: ReadonlySet<string>,

View File

@@ -0,0 +1,56 @@
import { PartOfSpeech } from '../../../types';
// Trim a MeCab POS tag; null, undefined, and non-string inputs become ''.
function normalizePosTag(value: string | null | undefined): string {
  if (typeof value !== 'string') {
    return '';
  }
  return value.trim();
}
/** Type guard: narrow an arbitrary value to a known PartOfSpeech enum member. */
export function isPartOfSpeechValue(value: unknown): value is PartOfSpeech {
  return typeof value === 'string' && Object.values(PartOfSpeech).includes(value as PartOfSpeech);
}
/**
 * Map a MeCab pos1 tag to the coarse PartOfSpeech enum. Unrecognized or
 * empty tags map to PartOfSpeech.other.
 */
export function mapMecabPos1ToPartOfSpeech(pos1: string | null | undefined): PartOfSpeech {
  const tag = normalizePosTag(pos1);
  if (tag === '名詞') return PartOfSpeech.noun;
  if (tag === '動詞') return PartOfSpeech.verb;
  if (tag === '形容詞') return PartOfSpeech.i_adjective;
  if (tag === '形状詞' || tag === '形容動詞') return PartOfSpeech.na_adjective;
  if (tag === '助詞') return PartOfSpeech.particle;
  if (tag === '助動詞') return PartOfSpeech.bound_auxiliary;
  if (tag === '記号' || tag === '補助記号') return PartOfSpeech.symbol;
  return PartOfSpeech.other;
}
/**
 * Choose the PartOfSpeech to persist for a word row. A non-empty pos1 (which
 * may be a '|'-joined multi-tag) takes precedence: when every part maps to
 * the same coarse POS that value wins, otherwise 'other'. With no pos1, a
 * stored partOfSpeech that is a valid enum member is kept; anything else
 * falls back to 'other'.
 */
export function deriveStoredPartOfSpeech(input: {
  partOfSpeech?: string | null;
  pos1?: string | null;
}): PartOfSpeech {
  const tags = normalizePosTag(input.pos1)
    .split('|')
    .map((tag) => tag.trim())
    .filter((tag) => tag.length > 0);
  if (tags.length > 0) {
    const mapped = new Set(tags.map((tag) => mapMecabPos1ToPartOfSpeech(tag)));
    if (mapped.size === 1) {
      return [...mapped][0]!;
    }
    // Conflicting tags in a multi-tag pos1 collapse to the catch-all bucket.
    return PartOfSpeech.other;
  }
  if (isPartOfSpeechValue(input.partOfSpeech)) {
    return input.partOfSpeech;
  }
  return PartOfSpeech.other;
}

View File

@@ -19,34 +19,12 @@
import * as childProcess from 'child_process';
import { PartOfSpeech, Token, MecabStatus } from './types';
import { createLogger } from './logger';
import { mapMecabPos1ToPartOfSpeech } from './core/services/tokenizer/part-of-speech';
export { PartOfSpeech };
const log = createLogger('mecab');
function mapPartOfSpeech(pos1: string): PartOfSpeech {
switch (pos1) {
case '名詞':
return PartOfSpeech.noun;
case '動詞':
return PartOfSpeech.verb;
case '形容詞':
return PartOfSpeech.i_adjective;
case '形状詞':
case '形容動詞':
return PartOfSpeech.na_adjective;
case '助詞':
return PartOfSpeech.particle;
case '助動詞':
return PartOfSpeech.bound_auxiliary;
case '記号':
case '補助記号':
return PartOfSpeech.symbol;
default:
return PartOfSpeech.other;
}
}
export function parseMecabLine(line: string): Token | null {
if (!line || line === 'EOS' || line.trim() === '') {
return null;
@@ -73,7 +51,7 @@ export function parseMecabLine(line: string): Token | null {
return {
word: surface,
partOfSpeech: mapPartOfSpeech(pos1),
partOfSpeech: mapMecabPos1ToPartOfSpeech(pos1),
pos1,
pos2,
pos3,
@@ -446,4 +424,4 @@ export class MecabTokenizer {
}
}
export { mapPartOfSpeech };
export { mapMecabPos1ToPartOfSpeech as mapPartOfSpeech };