fix(subtitle): restore known and JLPT token annotations

2026-05-28 00:55:16 -07:00 · 2026-03-19 18:03:20 -07:00
parent 1b5f0c6999
commit 43a0d11446
16 changed files with 258 additions and 43 deletions
@@ -360,10 +360,7 @@ test('getTrendsDashboard returns chart-ready aggregated series', () => {
    assert.equal(dashboard.activity.watchTime[0]?.value, 30);
    assert.equal(dashboard.progress.watchTime[1]?.value, 75);
    assert.equal(dashboard.progress.lookups[1]?.value, 18);
-    assert.equal(
-      dashboard.ratios.lookupsPerHundred[0]?.value,
-      +((8 / 120) * 100).toFixed(1),
-    );
+    assert.equal(dashboard.ratios.lookupsPerHundred[0]?.value, +((8 / 120) * 100).toFixed(1));
    assert.equal(dashboard.animePerDay.watchTime[0]?.animeTitle, 'Trend Dashboard Anime');
    assert.equal(dashboard.animeCumulative.watchTime[1]?.value, 75);
    assert.equal(
@@ -2025,6 +2022,7 @@ test('anime/media detail and episode queries use ended-session metrics when tele
      SET
        ended_at_ms = ?,
        status = 2,
+        ended_media_ms = ?,
        active_watched_ms = ?,
        cards_mined = ?,
        tokens_seen = ?,
@@ -2034,9 +2032,9 @@ test('anime/media detail and episode queries use ended-session metrics when tele
      WHERE session_id = ?
      `,
    );
-    updateSession.run(1_001_000, 3_000, 1, 10, 4, 3, now, s1);
-    updateSession.run(1_011_000, 4_000, 2, 20, 5, 4, now, s2);
-    updateSession.run(1_021_000, 5_000, 3, 30, 6, 5, now, s3);
+    updateSession.run(1_001_000, 2_500, 3_000, 1, 10, 4, 3, now, s1);
+    updateSession.run(1_011_000, 6_000, 4_000, 2, 20, 5, 4, now, s2);
+    updateSession.run(1_021_000, 8_000, 5_000, 3, 30, 6, 5, now, s3);

    const animeDetail = getAnimeDetail(db, animeId);
    assert.ok(animeDetail);
@@ -2047,6 +2045,7 @@ test('anime/media detail and episode queries use ended-session metrics when tele
    assert.deepEqual(
      episodes.map((row) => ({
        videoId: row.videoId,
+        endedMediaMs: row.endedMediaMs,
        totalSessions: row.totalSessions,
        totalActiveMs: row.totalActiveMs,
        totalCards: row.totalCards,
@@ -2055,6 +2054,7 @@ test('anime/media detail and episode queries use ended-session metrics when tele
      [
        {
          videoId: episodeOne,
+          endedMediaMs: 6_000,
          totalSessions: 2,
          totalActiveMs: 7_000,
          totalCards: 3,
@@ -2062,6 +2062,7 @@ test('anime/media detail and episode queries use ended-session metrics when tele
        },
        {
          videoId: episodeTwo,
+          endedMediaMs: 8_000,
          totalSessions: 1,
          totalActiveMs: 5_000,
          totalCards: 3,
@@ -818,7 +818,10 @@ function accumulatePoints(points: TrendChartPoint[]): TrendChartPoint[] {
 }

 function buildAggregatedTrendRows(rollups: ImmersionSessionRollupRow[]) {
-  const byKey = new Map<number, { activeMin: number; cards: number; words: number; sessions: number }>();
+  const byKey = new Map<
+    number,
+    { activeMin: number; cards: number; words: number; sessions: number }
+  >();

  for (const rollup of rollups) {
    const existing = byKey.get(rollup.rollupDayOrMonth) ?? {
@@ -894,14 +897,8 @@ function buildLookupsPerHundredWords(sessions: TrendSessionMetricRow[]): TrendCh

  for (const session of sessions) {
    const epochDay = Math.floor(session.startedAtMs / 86_400_000);
-    lookupsByDay.set(
-      epochDay,
-      (lookupsByDay.get(epochDay) ?? 0) + session.yomitanLookupCount,
-    );
-    wordsByDay.set(
-      epochDay,
-      (wordsByDay.get(epochDay) ?? 0) + getTrendSessionWordCount(session),
-    );
+    lookupsByDay.set(epochDay, (lookupsByDay.get(epochDay) ?? 0) + session.yomitanLookupCount);
+    wordsByDay.set(epochDay, (wordsByDay.get(epochDay) ?? 0) + getTrendSessionWordCount(session));
  }

  return Array.from(lookupsByDay.entries())
@@ -1005,8 +1002,13 @@ function buildCumulativePerAnime(points: TrendPerAnimePoint[]): TrendPerAnimePoi
  return result;
 }

-function getVideoAnimeTitleMap(db: DatabaseSync, videoIds: Array<number | null>): Map<number, string> {
-  const uniqueIds = [...new Set(videoIds.filter((value): value is number => typeof value === 'number'))];
+function getVideoAnimeTitleMap(
+  db: DatabaseSync,
+  videoIds: Array<number | null>,
+): Map<number, string> {
+  const uniqueIds = [
+    ...new Set(videoIds.filter((value): value is number => typeof value === 'number')),
+  ];
  if (uniqueIds.length === 0) {
    return new Map();
  }
@@ -1027,7 +1029,10 @@ function getVideoAnimeTitleMap(db: DatabaseSync, videoIds: Array<number | null>)
  return new Map(rows.map((row) => [row.videoId, row.animeTitle]));
 }

-function resolveVideoAnimeTitle(videoId: number | null, titlesByVideoId: Map<number, string>): string {
+function resolveVideoAnimeTitle(
+  videoId: number | null,
+  titlesByVideoId: Map<number, string>,
+): string {
  if (videoId === null) {
    return 'Unknown';
  }
@@ -1087,7 +1092,9 @@ function buildEpisodesPerAnimeFromDailyRollups(
  return result;
 }

-function buildEpisodesPerDayFromDailyRollups(rollups: ImmersionSessionRollupRow[]): TrendChartPoint[] {
+function buildEpisodesPerDayFromDailyRollups(
+  rollups: ImmersionSessionRollupRow[],
+): TrendChartPoint[] {
  const byDay = new Map<number, Set<number>>();

  for (const rollup of rollups) {
@@ -1147,7 +1154,9 @@ function buildNewWordsPerDay(db: DatabaseSync, cutoffMs: number | null): TrendCh
    ORDER BY epochDay ASC
  `);

-  const rows = (cutoffMs === null ? prepared.all() : prepared.all(Math.floor(cutoffMs / 1000))) as Array<{
+  const rows = (
+    cutoffMs === null ? prepared.all() : prepared.all(Math.floor(cutoffMs / 1000))
+  ) as Array<{
    epochDay: number;
    wordCount: number;
  }>;
@@ -1186,10 +1195,8 @@ export function getTrendsDashboard(

  const animePerDay = {
    episodes: buildEpisodesPerAnimeFromDailyRollups(dailyRollups, titlesByVideoId),
-    watchTime: buildPerAnimeFromDailyRollups(
-      dailyRollups,
-      titlesByVideoId,
-      (rollup) => Math.round(rollup.totalActiveMin),
+    watchTime: buildPerAnimeFromDailyRollups(dailyRollups, titlesByVideoId, (rollup) =>
+      Math.round(rollup.totalActiveMin),
    ),
    cards: buildPerAnimeFromDailyRollups(
      dailyRollups,
@@ -1201,10 +1208,7 @@ export function getTrendsDashboard(
      titlesByVideoId,
      (rollup) => rollup.totalTokensSeen,
    ),
-    lookups: buildPerAnimeFromSessions(
-      sessions,
-      (session) => session.yomitanLookupCount,
-    ),
+    lookups: buildPerAnimeFromSessions(sessions, (session) => session.yomitanLookupCount),
    lookupsPerHundred: buildLookupsPerHundredPerAnime(sessions),
  };

@@ -1740,6 +1744,14 @@ export function getAnimeEpisodes(db: DatabaseSync, animeId: number): AnimeEpisod
      v.parsed_season AS season,
      v.parsed_episode AS episode,
      v.duration_ms AS durationMs,
+      (
+        SELECT s_recent.ended_media_ms
+        FROM imm_sessions s_recent
+        WHERE s_recent.video_id = v.video_id
+          AND s_recent.ended_at_ms IS NOT NULL
+        ORDER BY s_recent.ended_at_ms DESC, s_recent.session_id DESC
+        LIMIT 1
+      ) AS endedMediaMs,
      v.watched AS watched,
      COUNT(DISTINCT s.session_id) AS totalSessions,
      COALESCE(SUM(COALESCE(asm.activeWatchedMs, s.active_watched_ms, 0)), 0) AS totalActiveMs,
@@ -39,6 +39,7 @@ export function finalizeSessionRecord(
      SET
        ended_at_ms = ?,
        status = ?,
+        ended_media_ms = ?,
        total_watched_ms = ?,
        active_watched_ms = ?,
        lines_seen = ?,
@@ -58,6 +59,7 @@ export function finalizeSessionRecord(
  ).run(
    endedAtMs,
    SESSION_STATUS_ENDED,
+    sessionState.lastMediaMs,
    sessionState.totalWatchedMs,
    sessionState.activeWatchedMs,
    sessionState.linesSeen,
@@ -740,6 +740,39 @@ test('start/finalize session updates ended_at and status', () => {
  }
 });

+test('finalize session persists ended media position', () => { // Checks that finalizeSessionRecord stores state.lastMediaMs in imm_sessions.ended_media_ms.
+  const dbPath = makeDbPath();
+  const db = new Database(dbPath);
+
+  try {
+    ensureSchema(db);
+    const videoId = getOrCreateVideoRecord(db, 'local:/tmp/slice-a-ended-media.mkv', {
+      canonicalTitle: 'Slice A Ended Media',
+      sourcePath: '/tmp/slice-a-ended-media.mkv',
+      sourceUrl: null,
+      sourceType: SOURCE_TYPE_LOCAL,
+    });
+    const startedAtMs = 1_234_567_000;
+    const endedAtMs = startedAtMs + 8_500;
+    const { sessionId, state } = startSessionRecord(db, videoId, startedAtMs);
+    state.lastMediaMs = 91_000; // Differs from the 8_500 ms start-to-end span, so the assertion cannot pass by storing elapsed time.
+
+    finalizeSessionRecord(db, state, endedAtMs);
+
+    const row = db
+      .prepare('SELECT ended_media_ms FROM imm_sessions WHERE session_id = ?') // Read the column back directly instead of going through a query helper.
+      .get(sessionId) as {
+      ended_media_ms: number | null;
+    } | null;
+
+    assert.ok(row);
+    assert.equal(row?.ended_media_ms, 91_000); // Must equal the value assigned to state.lastMediaMs above.
+  } finally { // Runs even when an assertion above throws.
+    db.close();
+    cleanupDbPath(dbPath); // NOTE(review): presumably removes the temporary database file; confirm against the helper.
+  }
+});
+
 test('executeQueuedWrite inserts event and telemetry rows', () => {
  const dbPath = makeDbPath();
  const db = new Database(dbPath);
@@ -569,6 +569,7 @@ export function ensureSchema(db: DatabaseSync): void {
      status INTEGER NOT NULL,
      locale_id INTEGER, target_lang_id INTEGER,
      difficulty_tier INTEGER, subtitle_mode INTEGER,
+      ended_media_ms INTEGER,
      total_watched_ms INTEGER NOT NULL DEFAULT 0,
      active_watched_ms INTEGER NOT NULL DEFAULT 0,
      lines_seen INTEGER NOT NULL DEFAULT 0,
@@ -1026,6 +1027,10 @@ export function ensureSchema(db: DatabaseSync): void {
    `);
  }

+  if (currentVersion?.schema_version && currentVersion.schema_version < 15) {
+    addColumnIfMissing(db, 'imm_sessions', 'ended_media_ms', 'INTEGER');
+  }
+
  ensureLifetimeSummaryTables(db);

  db.exec(`
@@ -1,4 +1,4 @@
-export const SCHEMA_VERSION = 14;
+export const SCHEMA_VERSION = 15;
 export const DEFAULT_QUEUE_CAP = 1_000;
 export const DEFAULT_BATCH_SIZE = 25;
 export const DEFAULT_FLUSH_INTERVAL_MS = 500;
@@ -482,6 +482,7 @@ export interface AnimeEpisodeRow {
  season: number | null;
  episode: number | null;
  durationMs: number;
+  endedMediaMs: number | null;
  watched: number;
  totalSessions: number;
  totalActiveMs: number;
@@ -55,6 +55,29 @@ test('annotateTokens known-word match mode uses headword vs surface', () => {
  assert.equal(surfaceResult[0]?.isKnown, false);
 });

+test('annotateTokens falls back to reading for known-word matches when headword lookup misses', () => { // Token whose surface/headword is unknown but whose reading is known must still be annotated as known.
+  const tokens = [
+    makeToken({
+      surface: '大体',
+      headword: '大体',
+      reading: 'だいたい',
+      frequencyRank: 1895,
+    }),
+  ];
+
+  const result = annotateTokens(
+    tokens,
+    makeDeps({
+      isKnownWord: (text) => text === 'だいたい', // Only the kana reading counts as known; the kanji form does not.
+      getJlptLevel: (text) => (text === '大体' ? 'N4' : null), // JLPT level is available for the kanji form only, not for the reading.
+    }),
+  );
+
+  assert.equal(result[0]?.isKnown, true); // Can only be true if the reading was tried after the kanji lookup missed.
+  assert.equal(result[0]?.jlptLevel, 'N4'); // JLPT annotation is still resolved from the kanji form.
+  assert.equal(result[0]?.frequencyRank, 1895); // Same rank that was supplied on the input token.
+});
+
 test('annotateTokens excludes frequency for particle/bound_auxiliary and pos1 exclusions', () => {
  const tokens = [
    makeToken({
@@ -560,12 +560,7 @@ function isJlptEligibleToken(token: MergedToken): boolean {
    return false;
  }

-  const candidates = [
-    resolveJlptLookupText(token),
-    token.surface,
-    token.reading,
-    token.headword,
-  ].filter(
+  const candidates = [resolveJlptLookupText(token), token.surface, token.headword].filter(
    (candidate): candidate is string => typeof candidate === 'string' && candidate.length > 0,
  );

@@ -659,7 +654,16 @@ function computeTokenKnownStatus(
  knownWordMatchMode: NPlusOneMatchMode,
 ): boolean {
  const matchText = resolveKnownWordText(token.surface, token.headword, knownWordMatchMode);
-  return token.isKnown || (matchText ? isKnownWord(matchText) : false);
+  if (token.isKnown || (matchText ? isKnownWord(matchText) : false)) {
+    return true;
+  }
+
+  const normalizedReading = token.reading.trim();
+  if (!normalizedReading) {
+    return false;
+  }
+
+  return normalizedReading !== matchText.trim() && isKnownWord(normalizedReading);
 }

 function filterTokenFrequencyRank(