fix(subtitle): restore known and JLPT token annotations

2026-05-27 12:55:20 -07:00 · 2026-03-19 18:03:20 -07:00
parent 1b5f0c6999
commit 43a0d11446
16 changed files with 258 additions and 43 deletions
@@ -0,0 +1,70 @@
+---
+id: TASK-202
+title: Use ended session media position for anime episode progress
+status: Done
+assignee:
+  - Codex
+created_date: '2026-03-19 14:55'
+updated_date: '2026-03-19 17:36'
+labels:
+  - stats
+  - ui
+  - bug
+milestone: m-1
+dependencies: []
+references:
+  - stats/src/components/anime/EpisodeList.tsx
+  - stats/src/types/stats.ts
+  - src/core/services/immersion-tracker/session.ts
+  - src/core/services/immersion-tracker/query.ts
+  - src/core/services/immersion-tracker/storage.ts
+priority: medium
+ordinal: 105720
+---
+
+## Description
+
+<!-- SECTION:DESCRIPTION:BEGIN -->
+
+The anime episode list currently computes the `Progress` column from cumulative `totalActiveMs / durationMs`, which overstates how far into the episode the viewer actually got after rewatches or repeated sessions. Persist the playback position at the time a session ends and drive episode progress from that stored stop position instead.
+
+<!-- SECTION:DESCRIPTION:END -->
+
+## Acceptance Criteria
+
+<!-- AC:BEGIN -->
+
+- [x] #1 Session finalization persists the playback position reached when the session ended.
+- [x] #2 Anime episode queries expose the most recent ended-session media position for each episode.
+- [x] #3 Episode-list progress renders from ended media position instead of cumulative active watch time.
+- [x] #4 Regression coverage locks storage/query/UI behavior for the new progress source.
+<!-- AC:END -->
+
+## Implementation Plan
+
+<!-- SECTION:PLAN:BEGIN -->
+
+1. Add failing regression coverage for persisted ended media position and episode progress rendering.
+2. Add `ended_media_ms` to the immersion-session schema and persist `lastMediaMs` when ending a session.
+3. Thread the new field through episode queries/types and render episode progress from `endedMediaMs / durationMs`.
+4. Run targeted verification plus typecheck, then record the outcome.
+<!-- SECTION:PLAN:END -->
+
+## Outcome
+
+<!-- SECTION:OUTCOME:BEGIN -->
+
+Added nullable `ended_media_ms` storage to immersion sessions, persisted `lastMediaMs` when sessions finalize, and exposed the most recent ended-session media position through anime episode queries/types. The anime episode list now renders `Progress` from `endedMediaMs / durationMs` instead of cumulative active watch time, so rewatches no longer inflate the displayed percentage.
+
+Verification:
+
+- `bun test src/core/services/immersion-tracker/storage-session.test.ts`
+- `bun test src/core/services/immersion-tracker/__tests__/query.test.ts`
+- `bun test stats/src/lib/yomitan-lookup.test.tsx stats/src/lib/stats-ui-navigation.test.tsx`
+- `bun run typecheck`
+- `bun run changelog:lint`
+- `bun x prettier --check 'src/core/services/immersion-tracker/types.ts' 'src/core/services/immersion-tracker/storage.ts' 'src/core/services/immersion-tracker/session.ts' 'src/core/services/immersion-tracker/query.ts' 'src/core/services/immersion-tracker/storage-session.test.ts' 'src/core/services/immersion-tracker/__tests__/query.test.ts' 'stats/src/types/stats.ts' 'stats/src/components/anime/EpisodeList.tsx' 'stats/src/lib/yomitan-lookup.test.tsx' 'stats/src/lib/stats-ui-navigation.test.tsx' 'backlog/tasks/task-202 - Use-ended-session-media-position-for-anime-episode-progress.md' 'changes/2026-03-19-stats-ended-media-progress.md'`
+- `bash .agents/skills/subminer-change-verification/scripts/verify_subminer_change.sh --lane core 'src/core/services/immersion-tracker/types.ts' 'src/core/services/immersion-tracker/storage.ts' 'src/core/services/immersion-tracker/session.ts' 'src/core/services/immersion-tracker/query.ts' 'src/core/services/immersion-tracker/storage-session.test.ts' 'src/core/services/immersion-tracker/__tests__/query.test.ts' 'stats/src/types/stats.ts' 'stats/src/components/anime/EpisodeList.tsx' 'stats/src/lib/yomitan-lookup.test.tsx' 'stats/src/lib/stats-ui-navigation.test.tsx' 'backlog/tasks/task-202 - Use-ended-session-media-position-for-anime-episode-progress.md' 'changes/2026-03-19-stats-ended-media-progress.md'`
+- Verifier artifacts: `.tmp/skill-verification/subminer-verify-20260319-173511-AV7kUg/`
+
+<!-- SECTION:OUTCOME:END -->
@@ -0,0 +1,47 @@
+---
+id: TASK-203
+title: Restore known and JLPT annotation for reading-mismatch subtitle tokens
+status: Done
+assignee:
+  - Codex
+created_date: '2026-03-19 18:25'
+updated_date: '2026-03-19 18:25'
+labels:
+  - subtitle
+  - bug
+dependencies: []
+references:
+  - src/core/services/tokenizer/annotation-stage.ts
+  - src/core/services/tokenizer/annotation-stage.test.ts
+priority: medium
+ordinal: 105721
+---
+
+## Description
+
+<!-- SECTION:DESCRIPTION:BEGIN -->
+
+Some subtitle tokens lose both their known-word coloring and their JLPT underline even though the popup resolves a valid dictionary term. Repro example: `大体` in `大体 僕だって困ってたんですよ！` is known only through kana-only Anki data (`だいたい`), so it should be marked known via its reading, while the JLPT lookup should still resolve from the kanji surface/headword.
+
+<!-- SECTION:DESCRIPTION:END -->
+
+## Acceptance Criteria
+
+<!-- AC:BEGIN -->
+
+- [x] #1 Subtitle annotation can mark a token known via its reading when the configured headword/surface lookup misses.
+- [x] #2 JLPT eligibility no longer drops valid kanji terms just because their reading contains repeated kana patterns.
+- [x] #3 Regression coverage locks the combined known + JLPT case for `大体`.
+<!-- AC:END -->
+
+## Outcome
+
+<!-- SECTION:OUTCOME:BEGIN -->
+
+Known-word annotation now falls back to the token reading after the configured headword/surface lookup misses, so kana-only known-card entries still light up matching subtitle tokens. JLPT eligibility now ignores repeated-kana noise checks on the reading when a real surface/headword is present, which preserves JLPT tagging for words like `大体`.
+
+Verification:
+
+- `bun test src/core/services/tokenizer/annotation-stage.test.ts`
+
+<!-- SECTION:OUTCOME:END -->
@@ -0,0 +1,4 @@
+type: fixed
+area: subtitle
+
+- Restored known-word coloring and JLPT underlines for subtitle tokens like `大体` when the subtitle token is kanji but the known-word cache only matches the kana reading.
@@ -0,0 +1,4 @@
+type: fixed
+area: stats
+
+- Episode progress on the anime page now uses the playback position from the most recently ended session instead of cumulative active watch time, avoiding distorted percentages after rewatches or repeated sessions.
@@ -360,10 +360,7 @@ test('getTrendsDashboard returns chart-ready aggregated series', () => {
    assert.equal(dashboard.activity.watchTime[0]?.value, 30);
    assert.equal(dashboard.progress.watchTime[1]?.value, 75);
    assert.equal(dashboard.progress.lookups[1]?.value, 18);
-    assert.equal(
-      dashboard.ratios.lookupsPerHundred[0]?.value,
-      +((8 / 120) * 100).toFixed(1),
-    );
+    assert.equal(dashboard.ratios.lookupsPerHundred[0]?.value, +((8 / 120) * 100).toFixed(1));
    assert.equal(dashboard.animePerDay.watchTime[0]?.animeTitle, 'Trend Dashboard Anime');
    assert.equal(dashboard.animeCumulative.watchTime[1]?.value, 75);
    assert.equal(
@@ -2025,6 +2022,7 @@ test('anime/media detail and episode queries use ended-session metrics when tele
      SET
        ended_at_ms = ?,
        status = 2,
+        ended_media_ms = ?,
        active_watched_ms = ?,
        cards_mined = ?,
        tokens_seen = ?,
@@ -2034,9 +2032,9 @@ test('anime/media detail and episode queries use ended-session metrics when tele
      WHERE session_id = ?
      `,
    );
-    updateSession.run(1_001_000, 3_000, 1, 10, 4, 3, now, s1);
-    updateSession.run(1_011_000, 4_000, 2, 20, 5, 4, now, s2);
-    updateSession.run(1_021_000, 5_000, 3, 30, 6, 5, now, s3);
+    updateSession.run(1_001_000, 2_500, 3_000, 1, 10, 4, 3, now, s1);
+    updateSession.run(1_011_000, 6_000, 4_000, 2, 20, 5, 4, now, s2);
+    updateSession.run(1_021_000, 8_000, 5_000, 3, 30, 6, 5, now, s3);

    const animeDetail = getAnimeDetail(db, animeId);
    assert.ok(animeDetail);
@@ -2047,6 +2045,7 @@ test('anime/media detail and episode queries use ended-session metrics when tele
    assert.deepEqual(
      episodes.map((row) => ({
        videoId: row.videoId,
+        endedMediaMs: row.endedMediaMs,
        totalSessions: row.totalSessions,
        totalActiveMs: row.totalActiveMs,
        totalCards: row.totalCards,
@@ -2055,6 +2054,7 @@ test('anime/media detail and episode queries use ended-session metrics when tele
      [
        {
          videoId: episodeOne,
+          endedMediaMs: 6_000,
          totalSessions: 2,
          totalActiveMs: 7_000,
          totalCards: 3,
@@ -2062,6 +2062,7 @@ test('anime/media detail and episode queries use ended-session metrics when tele
        },
        {
          videoId: episodeTwo,
+          endedMediaMs: 8_000,
          totalSessions: 1,
          totalActiveMs: 5_000,
          totalCards: 3,
@@ -818,7 +818,10 @@ function accumulatePoints(points: TrendChartPoint[]): TrendChartPoint[] {
 }

 function buildAggregatedTrendRows(rollups: ImmersionSessionRollupRow[]) {
-  const byKey = new Map<number, { activeMin: number; cards: number; words: number; sessions: number }>();
+  const byKey = new Map<
+    number,
+    { activeMin: number; cards: number; words: number; sessions: number }
+  >();

  for (const rollup of rollups) {
    const existing = byKey.get(rollup.rollupDayOrMonth) ?? {
@@ -894,14 +897,8 @@ function buildLookupsPerHundredWords(sessions: TrendSessionMetricRow[]): TrendCh

  for (const session of sessions) {
    const epochDay = Math.floor(session.startedAtMs / 86_400_000);
-    lookupsByDay.set(
-      epochDay,
-      (lookupsByDay.get(epochDay) ?? 0) + session.yomitanLookupCount,
-    );
-    wordsByDay.set(
-      epochDay,
-      (wordsByDay.get(epochDay) ?? 0) + getTrendSessionWordCount(session),
-    );
+    lookupsByDay.set(epochDay, (lookupsByDay.get(epochDay) ?? 0) + session.yomitanLookupCount);
+    wordsByDay.set(epochDay, (wordsByDay.get(epochDay) ?? 0) + getTrendSessionWordCount(session));
  }

  return Array.from(lookupsByDay.entries())
@@ -1005,8 +1002,13 @@ function buildCumulativePerAnime(points: TrendPerAnimePoint[]): TrendPerAnimePoi
  return result;
 }

-function getVideoAnimeTitleMap(db: DatabaseSync, videoIds: Array<number | null>): Map<number, string> {
-  const uniqueIds = [...new Set(videoIds.filter((value): value is number => typeof value === 'number'))];
+function getVideoAnimeTitleMap(
+  db: DatabaseSync,
+  videoIds: Array<number | null>,
+): Map<number, string> {
+  const uniqueIds = [
+    ...new Set(videoIds.filter((value): value is number => typeof value === 'number')),
+  ];
  if (uniqueIds.length === 0) {
    return new Map();
  }
@@ -1027,7 +1029,10 @@ function getVideoAnimeTitleMap(db: DatabaseSync, videoIds: Array<number | null>)
  return new Map(rows.map((row) => [row.videoId, row.animeTitle]));
 }

-function resolveVideoAnimeTitle(videoId: number | null, titlesByVideoId: Map<number, string>): string {
+function resolveVideoAnimeTitle(
+  videoId: number | null,
+  titlesByVideoId: Map<number, string>,
+): string {
  if (videoId === null) {
    return 'Unknown';
  }
@@ -1087,7 +1092,9 @@ function buildEpisodesPerAnimeFromDailyRollups(
  return result;
 }

-function buildEpisodesPerDayFromDailyRollups(rollups: ImmersionSessionRollupRow[]): TrendChartPoint[] {
+function buildEpisodesPerDayFromDailyRollups(
+  rollups: ImmersionSessionRollupRow[],
+): TrendChartPoint[] {
  const byDay = new Map<number, Set<number>>();

  for (const rollup of rollups) {
@@ -1147,7 +1154,9 @@ function buildNewWordsPerDay(db: DatabaseSync, cutoffMs: number | null): TrendCh
    ORDER BY epochDay ASC
  `);

-  const rows = (cutoffMs === null ? prepared.all() : prepared.all(Math.floor(cutoffMs / 1000))) as Array<{
+  const rows = (
+    cutoffMs === null ? prepared.all() : prepared.all(Math.floor(cutoffMs / 1000))
+  ) as Array<{
    epochDay: number;
    wordCount: number;
  }>;
@@ -1186,10 +1195,8 @@ export function getTrendsDashboard(

  const animePerDay = {
    episodes: buildEpisodesPerAnimeFromDailyRollups(dailyRollups, titlesByVideoId),
-    watchTime: buildPerAnimeFromDailyRollups(
-      dailyRollups,
-      titlesByVideoId,
-      (rollup) => Math.round(rollup.totalActiveMin),
+    watchTime: buildPerAnimeFromDailyRollups(dailyRollups, titlesByVideoId, (rollup) =>
+      Math.round(rollup.totalActiveMin),
    ),
    cards: buildPerAnimeFromDailyRollups(
      dailyRollups,
@@ -1201,10 +1208,7 @@ export function getTrendsDashboard(
      titlesByVideoId,
      (rollup) => rollup.totalTokensSeen,
    ),
-    lookups: buildPerAnimeFromSessions(
-      sessions,
-      (session) => session.yomitanLookupCount,
-    ),
+    lookups: buildPerAnimeFromSessions(sessions, (session) => session.yomitanLookupCount),
    lookupsPerHundred: buildLookupsPerHundredPerAnime(sessions),
  };

@@ -1740,6 +1744,14 @@ export function getAnimeEpisodes(db: DatabaseSync, animeId: number): AnimeEpisod
      v.parsed_season AS season,
      v.parsed_episode AS episode,
      v.duration_ms AS durationMs,
+      (
+        SELECT s_recent.ended_media_ms
+        FROM imm_sessions s_recent
+        WHERE s_recent.video_id = v.video_id
+          AND s_recent.ended_at_ms IS NOT NULL
+        ORDER BY s_recent.ended_at_ms DESC, s_recent.session_id DESC
+        LIMIT 1
+      ) AS endedMediaMs,
      v.watched AS watched,
      COUNT(DISTINCT s.session_id) AS totalSessions,
      COALESCE(SUM(COALESCE(asm.activeWatchedMs, s.active_watched_ms, 0)), 0) AS totalActiveMs,
@@ -39,6 +39,7 @@ export function finalizeSessionRecord(
      SET
        ended_at_ms = ?,
        status = ?,
+        ended_media_ms = ?,
        total_watched_ms = ?,
        active_watched_ms = ?,
        lines_seen = ?,
@@ -58,6 +59,7 @@ export function finalizeSessionRecord(
  ).run(
    endedAtMs,
    SESSION_STATUS_ENDED,
+    sessionState.lastMediaMs,
    sessionState.totalWatchedMs,
    sessionState.activeWatchedMs,
    sessionState.linesSeen,
@@ -740,6 +740,39 @@ test('start/finalize session updates ended_at and status', () => {
  }
 });

+test('finalize session persists ended media position', () => {
+  const dbPath = makeDbPath();
+  const db = new Database(dbPath);
+
+  try {
+    ensureSchema(db);
+    const videoId = getOrCreateVideoRecord(db, 'local:/tmp/slice-a-ended-media.mkv', {
+      canonicalTitle: 'Slice A Ended Media',
+      sourcePath: '/tmp/slice-a-ended-media.mkv',
+      sourceUrl: null,
+      sourceType: SOURCE_TYPE_LOCAL,
+    });
+    const startedAtMs = 1_234_567_000;
+    const endedAtMs = startedAtMs + 8_500;
+    const { sessionId, state } = startSessionRecord(db, videoId, startedAtMs);
+    state.lastMediaMs = 91_000;
+
+    finalizeSessionRecord(db, state, endedAtMs);
+
+    const row = db
+      .prepare('SELECT ended_media_ms FROM imm_sessions WHERE session_id = ?')
+      .get(sessionId) as {
+      ended_media_ms: number | null;
+    } | null;
+
+    assert.ok(row);
+    assert.equal(row?.ended_media_ms, 91_000);
+  } finally {
+    db.close();
+    cleanupDbPath(dbPath);
+  }
+});
+
 test('executeQueuedWrite inserts event and telemetry rows', () => {
  const dbPath = makeDbPath();
  const db = new Database(dbPath);
@@ -569,6 +569,7 @@ export function ensureSchema(db: DatabaseSync): void {
      status INTEGER NOT NULL,
      locale_id INTEGER, target_lang_id INTEGER,
      difficulty_tier INTEGER, subtitle_mode INTEGER,
+      ended_media_ms INTEGER,
      total_watched_ms INTEGER NOT NULL DEFAULT 0,
      active_watched_ms INTEGER NOT NULL DEFAULT 0,
      lines_seen INTEGER NOT NULL DEFAULT 0,
@@ -1026,6 +1027,10 @@ export function ensureSchema(db: DatabaseSync): void {
    `);
  }

+  if (currentVersion?.schema_version && currentVersion.schema_version < 15) {
+    addColumnIfMissing(db, 'imm_sessions', 'ended_media_ms', 'INTEGER');
+  }
+
  ensureLifetimeSummaryTables(db);

  db.exec(`
@@ -1,4 +1,4 @@
-export const SCHEMA_VERSION = 14;
+export const SCHEMA_VERSION = 15;
 export const DEFAULT_QUEUE_CAP = 1_000;
 export const DEFAULT_BATCH_SIZE = 25;
 export const DEFAULT_FLUSH_INTERVAL_MS = 500;
@@ -482,6 +482,7 @@ export interface AnimeEpisodeRow {
  season: number | null;
  episode: number | null;
  durationMs: number;
+  endedMediaMs: number | null;
  watched: number;
  totalSessions: number;
  totalActiveMs: number;
@@ -55,6 +55,29 @@ test('annotateTokens known-word match mode uses headword vs surface', () => {
  assert.equal(surfaceResult[0]?.isKnown, false);
 });

+test('annotateTokens falls back to reading for known-word matches when headword lookup misses', () => {
+  const tokens = [
+    makeToken({
+      surface: '大体',
+      headword: '大体',
+      reading: 'だいたい',
+      frequencyRank: 1895,
+    }),
+  ];
+
+  const result = annotateTokens(
+    tokens,
+    makeDeps({
+      isKnownWord: (text) => text === 'だいたい',
+      getJlptLevel: (text) => (text === '大体' ? 'N4' : null),
+    }),
+  );
+
+  assert.equal(result[0]?.isKnown, true);
+  assert.equal(result[0]?.jlptLevel, 'N4');
+  assert.equal(result[0]?.frequencyRank, 1895);
+});
+
 test('annotateTokens excludes frequency for particle/bound_auxiliary and pos1 exclusions', () => {
  const tokens = [
    makeToken({
@@ -560,12 +560,7 @@ function isJlptEligibleToken(token: MergedToken): boolean {
    return false;
  }

-  const candidates = [
-    resolveJlptLookupText(token),
-    token.surface,
-    token.reading,
-    token.headword,
-  ].filter(
+  const candidates = [resolveJlptLookupText(token), token.surface, token.headword].filter(
    (candidate): candidate is string => typeof candidate === 'string' && candidate.length > 0,
  );

@@ -659,7 +654,16 @@ function computeTokenKnownStatus(
  knownWordMatchMode: NPlusOneMatchMode,
 ): boolean {
  const matchText = resolveKnownWordText(token.surface, token.headword, knownWordMatchMode);
-  return token.isKnown || (matchText ? isKnownWord(matchText) : false);
+  if (token.isKnown || (matchText ? isKnownWord(matchText) : false)) {
+    return true;
+  }
+
+  const normalizedReading = token.reading.trim();
+  if (!normalizedReading) {
+    return false;
+  }
+
+  return normalizedReading !== matchText.trim() && isKnownWord(normalizedReading);
 }

 function filterTokenFrequencyRank(
@@ -82,6 +82,10 @@ export function EpisodeList({
                ep.totalYomitanLookupCount,
                ep.totalTokensSeen,
              );
+              const progressPct =
+                ep.durationMs > 0 && ep.endedMediaMs != null
+                  ? Math.min(100, Math.round((ep.endedMediaMs / ep.durationMs) * 100))
+                  : null;

              return (
                <Fragment key={ep.videoId}>
@@ -99,17 +103,17 @@ export function EpisodeList({
                      {ep.canonicalTitle}
                    </td>
                    <td className="py-2 pr-3 text-right">
-                      {ep.durationMs > 0 ? (
+                      {progressPct != null ? (
                        <span
                          className={
-                            ep.totalActiveMs >= ep.durationMs * 0.85
+                            progressPct >= 85
                              ? 'text-ctp-green'
-                              : ep.totalActiveMs >= ep.durationMs * 0.5
+                              : progressPct >= 50
                                ? 'text-ctp-peach'
                                : 'text-ctp-overlay2'
                          }
                        >
-                          {Math.min(100, Math.round((ep.totalActiveMs / ep.durationMs) * 100))}%
+                          {progressPct}%
                        </span>
                      ) : (
                        <span className="text-ctp-overlay2">{'\u2014'}</span>
@@ -21,6 +21,7 @@ test('EpisodeList renders explicit episode detail button alongside quick peek ro
          episode: 9,
          season: 1,
          durationMs: 1,
+          endedMediaMs: null,
          watched: 0,
          canonicalTitle: 'Episode 9',
          totalSessions: 1,
@@ -87,11 +87,12 @@ test('EpisodeList renders per-episode Yomitan lookup rate', () => {
          videoId: 9,
          episode: 9,
          season: 1,
-          durationMs: 1,
+          durationMs: 100,
+          endedMediaMs: 6,
          watched: 0,
          canonicalTitle: 'Episode 9',
          totalSessions: 1,
-          totalActiveMs: 1,
+          totalActiveMs: 90,
          totalCards: 1,
          totalTokensSeen: 350,
          totalYomitanLookupCount: 7,
@@ -103,6 +104,8 @@ test('EpisodeList renders per-episode Yomitan lookup rate', () => {

  assert.match(markup, /Lookup Rate/);
  assert.match(markup, /2\.0 \/ 100 tokens/);
+  assert.match(markup, /6%/);
+  assert.doesNotMatch(markup, /90%/);
 });

 test('AnimeOverviewStats renders aggregate Yomitan lookup metrics', () => {
@@ -212,6 +212,7 @@ export interface AnimeEpisode {
  episode: number | null;
  season: number | null;
  durationMs: number;
+  endedMediaMs: number | null;
  watched: number;
  canonicalTitle: string;
  totalSessions: number;