diff --git a/backlog/tasks/task-325 - Fix-session-chart-known-word-percentage-denominator.md b/backlog/tasks/task-325 - Fix-session-chart-known-word-percentage-denominator.md new file mode 100644 index 00000000..48d39318 --- /dev/null +++ b/backlog/tasks/task-325 - Fix-session-chart-known-word-percentage-denominator.md @@ -0,0 +1,38 @@ +--- +id: TASK-325 +title: Fix session chart known-word percentage denominator +status: Done +assignee: [] +created_date: '2026-05-04 01:19' +updated_date: '2026-05-04 01:23' +labels: + - stats +dependencies: [] +priority: medium +--- + +## Description + + +Session detail known-word percentages should use the same filtered vocabulary occurrence rows for both known and total word counts. Current chart can divide known persisted word occurrences by raw token totals, causing excluded tokens to depress the known percentage. + + +## Acceptance Criteria + +- [x] #1 Session known-word timeline API exposes cumulative filtered total word counts alongside known counts. +- [x] #2 Session detail chart computes known/unknown areas from filtered totals, not raw timeline token counts, when known-word data is available. +- [x] #3 Session summary known-word rate uses filtered persisted word totals where available and preserves safe fallback behavior when known-word data is unavailable. +- [x] #4 Regression tests cover filtered denominator behavior for the API and chart data path. + + +## Implementation Notes + + +Implemented in-place fix using existing persisted word occurrence rows. `/api/stats/sessions/:id/known-words-timeline` now returns cumulative `totalWordsSeen` from filtered persisted occurrences, and session known-word rates divide by the same filtered total. Session detail chart builds known/unknown areas from `totalWordsSeen` instead of raw timeline `tokensSeen`. + + +## Final Summary + + +Known-word percentages on session charts now use filtered persisted word totals for both numerator and denominator. 
No migration/backfill required; data comes from existing `imm_word_line_occurrences`. Added regression coverage for the API response/rate and chart data builder. + diff --git a/backlog/tasks/task-326 - Make-stats-word-metrics-honor-filtering-rules.md b/backlog/tasks/task-326 - Make-stats-word-metrics-honor-filtering-rules.md new file mode 100644 index 00000000..66d33e7c --- /dev/null +++ b/backlog/tasks/task-326 - Make-stats-word-metrics-honor-filtering-rules.md @@ -0,0 +1,42 @@ +--- +id: TASK-326 +title: Make stats word metrics honor filtering rules +status: Done +assignee: [] +created_date: '2026-05-04 01:35' +updated_date: '2026-05-04 02:08' +labels: + - stats +dependencies: [] +priority: high +--- + +## Description + + +Audit stats app metrics that show or derive from word totals and make them use filtered persisted vocabulary occurrences where the UI concept is learned/seen words. Preserve raw telemetry only where it is intentionally playback/token telemetry. + + +## Acceptance Criteria + +- [x] #1 Stats UI word totals, word rates, lookup-per-word rates, and chart word series use filtered persisted word occurrences where available. +- [x] #2 Known-word metrics continue to use the same filtered denominator as known counts. +- [x] #3 Trend, overview, library, session, and episode surfaces are audited with regression coverage for changed data paths. +- [x] #4 Fallback behavior remains safe for sessions without persisted vocabulary occurrences. + + +## Implementation Notes + + +Audit finding: raw `tokensSeen` / `totalTokensSeen` still feeds overview hints, dashboard aggregation, trends activity/progress/anime cumulative/library summary, lookup-per-100-word rates, session rows/recent sessions/episode sessions, and library/anime/media headers. Vocabulary and known unique word summaries already use persisted filtered vocabulary rows. 
Recommended design: query-time filtered word totals from existing `imm_word_line_occurrences`, with raw-token fallback only when a session has no persisted occurrence rows. + +Implemented shared query-time filtered word counts. Session summaries, overview hints, daily/monthly rollups, anime/media library/detail rows, anime episode rows, episode/media sessions, trends activity/progress/anime cumulative, library summary, and lookup-per-100-word ratios now use filtered persisted word occurrences. Fallback remains raw token totals only for sessions with no persisted subtitle-line rows. + +Follow-up implemented: Vocab frequency tables now apply the same tokenizer vocabulary predicate at read time, because old `imm_words` rows can predate current tokenizer exclusion rules. Vocabulary persistence and cleanup also mirror the broader subtitle-annotation grammar filters. Added common frequency stop terms observed in the stats vocabulary list to the shared tokenizer exclusion set so those rows are filtered consistently across subtitle annotations, persistence, cleanup, stats reads, and SQL word-count aggregates. + + +## Final Summary + + +Stats word metrics now honor filtering rules through the read-model query layer. Existing persisted `imm_word_line_occurrences` provide the filtered denominator; no migration/backfill needed. Vocab tables filter stored rows on read using tokenizer vocabulary rules, so legacy noisy rows stop appearing without a migration. Added regressions for session/overview/rollup fallback behavior, trends/library lookup-rate behavior, vocabulary read filtering, cleanup filtering, and shared stop-term filtering. 
+ diff --git a/backlog/tasks/task-327 - Persist-stats-page-exclusion-list-in-database.md b/backlog/tasks/task-327 - Persist-stats-page-exclusion-list-in-database.md new file mode 100644 index 00000000..67ac01b0 --- /dev/null +++ b/backlog/tasks/task-327 - Persist-stats-page-exclusion-list-in-database.md @@ -0,0 +1,42 @@ +--- +id: TASK-327 +title: Persist stats page exclusion list in database +status: Done +assignee: [] +created_date: '2026-05-04 01:39' +updated_date: '2026-05-04 01:49' +labels: + - feature + - stats + - database +dependencies: [] +priority: medium +--- + +## Description + + +Add database-backed persistence for the stats page exclusion list. On first load with the new schema, seed the new table from the existing exclusion list source so existing user choices are preserved. After migration, update database rows whenever the exclusion list is changed or saved so it persists across browser sessions indefinitely. + + +## Acceptance Criteria + +- [x] #1 A new small database table stores stats page exclusion entries. +- [x] #2 First load with the new schema seeds the table from the existing exclusion list source. +- [x] #3 Subsequent exclusion list save/change operations update the database-backed list. +- [x] #4 Regression coverage verifies migration/seed behavior and persistence updates. + + +## Implementation Notes + + +Implemented DB-backed stats exclusion list using schema version 18 and new `imm_stats_excluded_words` table. Added read/replace query helpers, service methods, and `/api/stats/excluded-words` GET/PUT routes. Stats frontend now loads exclusions from DB, seeds the empty DB table from legacy `localStorage` on first load, and writes each toggle/restore/clear through the API while keeping localStorage in sync for compatibility. Added focused regression coverage for schema/read-replace, API routes, API client, and frontend bootstrap/update behavior. 
Verification: `bun run typecheck` passed; `bun test src/core/services/__tests__/stats-server.test.ts stats/src/lib/api-client.test.ts stats/src/hooks/useExcludedWords.test.ts` passed; `bun test src/core/services/immersion-tracker/storage-session.test.ts` passed; `bun run docs:test` passed; `bun run format:check:stats` passed; `bun run changelog:lint` passed. Blocked/unrelated: `bun run typecheck:stats` fails in existing stats files (`AnilistSelector.tsx`, `reading-utils*`, `session-grouping.test.ts`, `yomitan-lookup.test.tsx`); `bun run test:immersion:sqlite:src` fails existing `recordSubtitleLine counts exact Yomitan tokens for session metrics` expected 4 got 3; `bun run docs:build` fails missing `@catppuccin/vitepress/theme/macchiato/mauve.css` import. + +Added `src/core/services/__tests__/stats-server.test.ts` and `stats/src/hooks/useExcludedWords.test.ts` to the `test:core:src` allowlist so the new DB exclusion route/client/store regressions run in the maintained fast source lane. + + +## Final Summary + + +Persisted the stats vocabulary exclusion list in SQLite with new schema version 18 table `imm_stats_excluded_words`. Added backend read/replace helpers and `/api/stats/excluded-words` GET/PUT routes, then wired the stats frontend exclusion store to load DB rows, seed an empty DB from legacy browser localStorage on first load, and update the DB on toggle/restore/clear. Updated docs and added changelog fragment. Focused tests and root typecheck pass; broader stats/docs/sqlite gates are blocked by unrelated existing failures recorded in notes. 
+ diff --git a/backlog/tasks/task-329 - Keep-JLPT-subtitle-styling-underline-only.md b/backlog/tasks/task-329 - Keep-JLPT-subtitle-styling-underline-only.md new file mode 100644 index 00000000..7d1d9163 --- /dev/null +++ b/backlog/tasks/task-329 - Keep-JLPT-subtitle-styling-underline-only.md @@ -0,0 +1,43 @@ +--- +id: TASK-329 +title: Keep JLPT subtitle styling underline-only +status: Done +assignee: [] +created_date: '2026-05-04 02:13' +labels: + - bug + - renderer + - jlpt +dependencies: [] +references: + - src/renderer/style.css + - src/renderer/subtitle-render.test.ts +priority: medium +--- + +## Description + + +Fix subtitle token styling so JLPT metadata never changes token text color. JLPT should only render the level marker/underline affordance while known, n+1, name-match, and frequency colors retain priority. + + +## Acceptance Criteria + +- [x] #1 JLPT-only subtitle tokens do not set token text color. +- [x] #2 JLPT level marker/underline still uses configured JLPT color. +- [x] #3 Existing known, n+1, name-match, and frequency text colors remain unchanged. + + +## Final Summary + + +Changed subtitle JLPT styling from text color to underline decoration and updated renderer CSS regression coverage. + +Verification: +- `bun test src/renderer/subtitle-render.test.ts` +- `bunx prettier --check src/renderer/subtitle-render.test.ts src/renderer/style.css` +- `bun run typecheck` + +Blocked: +- `bun run test:fast` fails in existing dirty stats/session work: `recordSubtitleLine counts exact Yomitan tokens for session metrics` expects `tokensSeen` 4 but gets 3. + diff --git a/changes/327-stats-exclusion-db.md b/changes/327-stats-exclusion-db.md new file mode 100644 index 00000000..6fabe609 --- /dev/null +++ b/changes/327-stats-exclusion-db.md @@ -0,0 +1,4 @@ +type: changed +area: stats + +- Stats vocabulary exclusions now persist in the immersion database and import existing browser-local exclusions on first load. 
diff --git a/docs-site/immersion-tracking.md b/docs-site/immersion-tracking.md index 357ee3fe..549182b4 100644 --- a/docs-site/immersion-tracking.md +++ b/docs-site/immersion-tracking.md @@ -102,7 +102,7 @@ Secondary subtitle text (typically English translations) is stored alongside pri ### Word Exclusion List -The Vocabulary tab toolbar includes an **Exclusions** button for hiding words from all vocabulary views. Excluded words are stored in browser localStorage and can be managed (restored or cleared) from the exclusion modal. Exclusions affect stat cards, charts, the frequency rank table, and the word list. +The Vocabulary tab toolbar includes an **Exclusions** button for hiding words from all vocabulary views. Excluded words are stored in the immersion database, with older browser localStorage exclusions imported on first load after upgrade. They can be managed (restored or cleared) from the exclusion modal. Exclusions affect stat cards, charts, the frequency rank table, and the word list. 
## Retention Defaults diff --git a/package.json b/package.json index aafafe65..19b24877 100644 --- a/package.json +++ b/package.json @@ -48,7 +48,7 @@ "test:plugin:src": "lua scripts/test-plugin-lua-compat.lua && lua scripts/test-plugin-start-gate.lua && lua scripts/test-plugin-binary-windows.lua", "test:launcher:smoke:src": "bun test launcher/smoke.e2e.test.ts", "test:launcher:src": "bun test launcher/config.test.ts launcher/config-domain-parsers.test.ts launcher/config/cli-parser-builder.test.ts launcher/config/args-normalizer.test.ts launcher/mpv.test.ts launcher/picker.test.ts launcher/parse-args.test.ts launcher/main.test.ts launcher/commands/command-modules.test.ts launcher/smoke.e2e.test.ts && bun run test:plugin:src", - "test:core:src": "bun test src/cli/args.test.ts src/cli/help.test.ts src/shared/setup-state.test.ts src/core/services/cli-command.test.ts src/core/services/field-grouping-overlay.test.ts src/core/services/numeric-shortcut-session.test.ts src/core/services/secondary-subtitle.test.ts src/core/services/mpv-render-metrics.test.ts src/core/services/overlay-content-measurement.test.ts src/core/services/mpv-control.test.ts src/core/services/mpv.test.ts src/core/services/runtime-options-ipc.test.ts src/core/services/runtime-config.test.ts src/core/services/yomitan-extension-paths.test.ts src/core/services/config-hot-reload.test.ts src/core/services/discord-presence.test.ts src/core/services/tokenizer.test.ts src/core/services/tokenizer/annotation-stage.test.ts src/core/services/tokenizer/parser-selection-stage.test.ts src/core/services/tokenizer/parser-enrichment-stage.test.ts src/core/services/subsync.test.ts src/core/services/overlay-bridge.test.ts src/core/services/overlay-shortcut-handler.test.ts src/core/services/stats-window.test.ts src/main/runtime/stats-server-routing.test.ts src/core/services/mining.test.ts src/core/services/anki-jimaku.test.ts src/core/services/jimaku-download-path.test.ts src/core/services/jellyfin.test.ts 
src/core/services/jellyfin-remote.test.ts src/core/services/immersion-tracker-service.test.ts src/core/services/overlay-runtime-init.test.ts src/core/services/app-ready.test.ts src/core/services/startup-bootstrap.test.ts src/core/services/subtitle-processing-controller.test.ts src/core/services/anilist/anilist-update-queue.test.ts src/core/services/anilist/rate-limiter.test.ts src/core/services/jlpt-token-filter.test.ts src/core/services/subtitle-position.test.ts src/core/utils/shortcut-config.test.ts src/main/runtime/first-run-setup-plugin.test.ts src/main/runtime/first-run-setup-service.test.ts src/main/runtime/first-run-setup-window.test.ts src/main/runtime/tray-runtime.test.ts src/main/runtime/tray-main-actions.test.ts src/main/runtime/tray-main-deps.test.ts src/main/runtime/tray-runtime-handlers.test.ts src/main/runtime/cli-command-context-main-deps.test.ts src/main/runtime/app-ready-main-deps.test.ts src/renderer/error-recovery.test.ts src/renderer/subtitle-render.test.ts src/renderer/handlers/mouse.test.ts src/renderer/handlers/keyboard.test.ts src/renderer/modals/jimaku.test.ts src/subsync/utils.test.ts src/main/anilist-url-guard.test.ts src/window-trackers/hyprland-tracker.test.ts src/window-trackers/x11-tracker.test.ts src/window-trackers/windows-helper.test.ts src/window-trackers/windows-tracker.test.ts launcher/config.test.ts launcher/config-domain-parsers.test.ts launcher/config/cli-parser-builder.test.ts launcher/config/args-normalizer.test.ts launcher/parse-args.test.ts launcher/main.test.ts launcher/commands/command-modules.test.ts launcher/setup-gate.test.ts stats/src/lib/api-client.test.ts", + "test:core:src": "bun test src/cli/args.test.ts src/cli/help.test.ts src/shared/setup-state.test.ts src/core/services/cli-command.test.ts src/core/services/field-grouping-overlay.test.ts src/core/services/numeric-shortcut-session.test.ts src/core/services/secondary-subtitle.test.ts src/core/services/mpv-render-metrics.test.ts 
src/core/services/overlay-content-measurement.test.ts src/core/services/mpv-control.test.ts src/core/services/mpv.test.ts src/core/services/runtime-options-ipc.test.ts src/core/services/runtime-config.test.ts src/core/services/yomitan-extension-paths.test.ts src/core/services/config-hot-reload.test.ts src/core/services/discord-presence.test.ts src/core/services/tokenizer.test.ts src/core/services/tokenizer/annotation-stage.test.ts src/core/services/tokenizer/parser-selection-stage.test.ts src/core/services/tokenizer/parser-enrichment-stage.test.ts src/core/services/subsync.test.ts src/core/services/overlay-bridge.test.ts src/core/services/overlay-shortcut-handler.test.ts src/core/services/stats-window.test.ts src/core/services/__tests__/stats-server.test.ts src/main/runtime/stats-server-routing.test.ts src/core/services/mining.test.ts src/core/services/anki-jimaku.test.ts src/core/services/jimaku-download-path.test.ts src/core/services/jellyfin.test.ts src/core/services/jellyfin-remote.test.ts src/core/services/immersion-tracker-service.test.ts src/core/services/overlay-runtime-init.test.ts src/core/services/app-ready.test.ts src/core/services/startup-bootstrap.test.ts src/core/services/subtitle-processing-controller.test.ts src/core/services/anilist/anilist-update-queue.test.ts src/core/services/anilist/rate-limiter.test.ts src/core/services/jlpt-token-filter.test.ts src/core/services/subtitle-position.test.ts src/core/utils/shortcut-config.test.ts src/main/runtime/first-run-setup-plugin.test.ts src/main/runtime/first-run-setup-service.test.ts src/main/runtime/first-run-setup-window.test.ts src/main/runtime/tray-runtime.test.ts src/main/runtime/tray-main-actions.test.ts src/main/runtime/tray-main-deps.test.ts src/main/runtime/tray-runtime-handlers.test.ts src/main/runtime/cli-command-context-main-deps.test.ts src/main/runtime/app-ready-main-deps.test.ts src/renderer/error-recovery.test.ts src/renderer/subtitle-render.test.ts src/renderer/handlers/mouse.test.ts 
src/renderer/handlers/keyboard.test.ts src/renderer/modals/jimaku.test.ts src/subsync/utils.test.ts src/main/anilist-url-guard.test.ts src/window-trackers/hyprland-tracker.test.ts src/window-trackers/x11-tracker.test.ts src/window-trackers/windows-helper.test.ts src/window-trackers/windows-tracker.test.ts launcher/config.test.ts launcher/config-domain-parsers.test.ts launcher/config/cli-parser-builder.test.ts launcher/config/args-normalizer.test.ts launcher/parse-args.test.ts launcher/main.test.ts launcher/commands/command-modules.test.ts launcher/setup-gate.test.ts stats/src/lib/api-client.test.ts stats/src/hooks/useExcludedWords.test.ts", "test:core:dist": "bun test dist/cli/args.test.js dist/cli/help.test.js dist/core/services/cli-command.test.js dist/core/services/ipc.test.js dist/core/services/anki-jimaku-ipc.test.js dist/core/services/field-grouping-overlay.test.js dist/core/services/numeric-shortcut-session.test.js dist/core/services/secondary-subtitle.test.js dist/core/services/mpv-render-metrics.test.js dist/core/services/overlay-content-measurement.test.js dist/core/services/mpv-control.test.js dist/core/services/mpv.test.js dist/core/services/runtime-options-ipc.test.js dist/core/services/runtime-config.test.js dist/core/services/yomitan-extension-paths.test.js dist/core/services/config-hot-reload.test.js dist/core/services/discord-presence.test.js dist/core/services/tokenizer.test.js dist/core/services/tokenizer/annotation-stage.test.js dist/core/services/tokenizer/parser-selection-stage.test.js dist/core/services/tokenizer/parser-enrichment-stage.test.js dist/core/services/subsync.test.js dist/core/services/overlay-bridge.test.js dist/core/services/overlay-manager.test.js dist/core/services/overlay-shortcut-handler.test.js dist/core/services/mining.test.js dist/core/services/anki-jimaku.test.js dist/core/services/jimaku-download-path.test.js dist/core/services/jellyfin.test.js dist/core/services/jellyfin-remote.test.js 
dist/core/services/immersion-tracker-service.test.js dist/core/services/overlay-runtime-init.test.js dist/core/services/app-ready.test.js dist/core/services/startup-bootstrap.test.js dist/core/services/subtitle-processing-controller.test.js dist/core/services/anilist/anilist-token-store.test.js dist/core/services/anilist/anilist-update-queue.test.js dist/core/services/anilist/rate-limiter.test.js dist/core/services/jlpt-token-filter.test.js dist/core/services/subtitle-position.test.js dist/renderer/error-recovery.test.js dist/renderer/subtitle-render.test.js dist/renderer/handlers/mouse.test.js dist/renderer/handlers/keyboard.test.js dist/renderer/modals/jimaku.test.js dist/subsync/utils.test.js dist/main/anilist-url-guard.test.js dist/window-trackers/hyprland-tracker.test.js dist/window-trackers/x11-tracker.test.js dist/window-trackers/windows-helper.test.js dist/window-trackers/windows-tracker.test.js", "test:core:smoke:dist": "bun test dist/cli/help.test.js dist/core/services/runtime-config.test.js dist/core/services/ipc.test.js dist/core/services/overlay-manager.test.js dist/core/services/anilist/anilist-token-store.test.js dist/core/services/startup-bootstrap.test.js dist/renderer/error-recovery.test.js dist/main/anilist-url-guard.test.js dist/window-trackers/x11-tracker.test.js", "test:smoke:dist": "bun run test:config:smoke:dist && bun run test:core:smoke:dist", diff --git a/src/core/services/__tests__/stats-server.test.ts b/src/core/services/__tests__/stats-server.test.ts index 23190614..d88bdfa7 100644 --- a/src/core/services/__tests__/stats-server.test.ts +++ b/src/core/services/__tests__/stats-server.test.ts @@ -277,6 +277,8 @@ function createMockTracker( getSessionTimeline: async () => [], getSessionEvents: async () => [], getVocabularyStats: async () => VOCABULARY_STATS, + getStatsExcludedWords: async () => [], + replaceStatsExcludedWords: async () => {}, getKanjiStats: async () => KANJI_STATS, getWordOccurrences: async () => OCCURRENCES, 
getKanjiOccurrences: async () => OCCURRENCES, @@ -362,7 +364,7 @@ describe('stats server API routes', () => { assert.ok(Array.isArray(body)); }); - it('GET /api/stats/sessions enriches each session with known-word metrics when cache exists', async () => { + it('GET /api/stats/sessions enriches known-word metrics using filtered persisted totals', async () => { await withTempDir(async (dir) => { const cachePath = path.join(dir, 'known-words.json'); fs.writeFileSync( @@ -391,7 +393,7 @@ describe('stats server API routes', () => { const body = await res.json(); const first = body[0]; assert.equal(first.knownWordsSeen, 2); - assert.equal(first.knownWordRate, 2.5); + assert.equal(first.knownWordRate, 66.7); }); }); @@ -436,7 +438,7 @@ describe('stats server API routes', () => { assert.equal(seenLimit, undefined); }); - it('GET /api/stats/sessions/:id/known-words-timeline preserves line positions and counts known occurrences', async () => { + it('GET /api/stats/sessions/:id/known-words-timeline preserves line positions and counts filtered totals', async () => { await withTempDir(async (dir) => { const cachePath = path.join(dir, 'known-words.json'); fs.writeFileSync( @@ -461,8 +463,8 @@ describe('stats server API routes', () => { const res = await app.request('/api/stats/sessions/1/known-words-timeline'); assert.equal(res.status, 200); assert.deepEqual(await res.json(), [ - { linesSeen: 1, knownWordsSeen: 2 }, - { linesSeen: 3, knownWordsSeen: 3 }, + { linesSeen: 1, knownWordsSeen: 2, totalWordsSeen: 2 }, + { linesSeen: 3, knownWordsSeen: 3, totalWordsSeen: 7 }, ]); }); }); @@ -730,6 +732,65 @@ describe('stats server API routes', () => { assert.equal(body[0].pos3, null); }); + it('GET /api/stats/excluded-words returns tracker exclusion rows', async () => { + const app = createStatsApp( + createMockTracker({ + getStatsExcludedWords: async () => [ + { headword: '猫', word: '猫', reading: 'ねこ' }, + { headword: 'する', word: 'する', reading: 'する' }, + ], + }), + ); + + const res = 
await app.request('/api/stats/excluded-words'); + assert.equal(res.status, 200); + assert.deepEqual(await res.json(), [ + { headword: '猫', word: '猫', reading: 'ねこ' }, + { headword: 'する', word: 'する', reading: 'する' }, + ]); + }); + + it('PUT /api/stats/excluded-words replaces tracker exclusion rows', async () => { + let seenWords: unknown = null; + const app = createStatsApp( + createMockTracker({ + replaceStatsExcludedWords: async (words: unknown) => { + seenWords = words; + }, + }), + ); + + const res = await app.request('/api/stats/excluded-words', { + method: 'PUT', + headers: { 'Content-Type': 'application/json' }, + body: JSON.stringify({ + words: [ + { headword: '猫', word: '猫', reading: 'ねこ' }, + { headword: 'する', word: 'する', reading: 'する' }, + ], + }), + }); + + assert.equal(res.status, 200); + assert.deepEqual(await res.json(), { ok: true }); + assert.deepEqual(seenWords, [ + { headword: '猫', word: '猫', reading: 'ねこ' }, + { headword: 'する', word: 'する', reading: 'する' }, + ]); + }); + + it('PUT /api/stats/excluded-words rejects malformed rows', async () => { + const app = createStatsApp(createMockTracker()); + + const res = await app.request('/api/stats/excluded-words', { + method: 'PUT', + headers: { 'Content-Type': 'application/json' }, + body: JSON.stringify({ words: [{ headword: '猫', word: 7, reading: 'ねこ' }] }), + }); + + assert.equal(res.status, 400); + }); + it('GET /api/stats/anime returns anime library', async () => { const app = createStatsApp(createMockTracker()); const res = await app.request('/api/stats/anime'); diff --git a/src/core/services/immersion-tracker-service.ts b/src/core/services/immersion-tracker-service.ts index fe93e508..7aa393c7 100644 --- a/src/core/services/immersion-tracker-service.ts +++ b/src/core/services/immersion-tracker-service.ts @@ -52,7 +52,9 @@ import { getKanjiWords, getSessionEvents, getSimilarWords, + getStatsExcludedWords, getVocabularyStats, + replaceStatsExcludedWords, getWordAnimeAppearances, getWordDetail, 
getWordOccurrences, @@ -151,6 +153,7 @@ import { type SessionSummaryQueryRow, type SessionTimelineRow, type SimilarWordRow, + type StatsExcludedWordRow, type StreakCalendarRow, type VocabularyCleanupSummary, type WatchTimePerAnimeRow, @@ -289,6 +292,7 @@ export type { SessionSummaryQueryRow, SessionTimelineRow, SimilarWordRow, + StatsExcludedWordRow, StreakCalendarRow, WatchTimePerAnimeRow, WordAnimeAppearanceRow, @@ -498,6 +502,14 @@ export class ImmersionTrackerService { return getVocabularyStats(this.db, limit, excludePos); } + async getStatsExcludedWords(): Promise { + return getStatsExcludedWords(this.db); + } + + async replaceStatsExcludedWords(words: StatsExcludedWordRow[]): Promise { + replaceStatsExcludedWords(this.db, words); + } + async cleanupVocabularyStats(): Promise { return cleanupVocabularyStats(this.db, { resolveLegacyPos: this.resolveLegacyVocabularyPos, diff --git a/src/core/services/immersion-tracker/__tests__/query.test.ts b/src/core/services/immersion-tracker/__tests__/query.test.ts index fcc18a7f..bafe6bfa 100644 --- a/src/core/services/immersion-tracker/__tests__/query.test.ts +++ b/src/core/services/immersion-tracker/__tests__/query.test.ts @@ -86,6 +86,77 @@ function cleanupDbPath(dbPath: string): void { } } +function insertFilteredWordOccurrence( + db: InstanceType, + options: { + sessionId: number; + videoId: number; + animeId?: number | null; + lineIndex?: number; + occurrenceCount: number; + startedAtMs: number; + headword?: string; + word?: string; + reading?: string; + partOfSpeech?: string; + pos1?: string; + pos2?: string; + pos3?: string; + }, +): void { + const headword = options.headword ?? options.word ?? '猫'; + const word = options.word ?? 
headword; + const lineId = Number( + db + .prepare( + `INSERT INTO imm_subtitle_lines ( + session_id, event_id, video_id, anime_id, line_index, + segment_start_ms, segment_end_ms, text, CREATED_DATE, LAST_UPDATE_DATE + ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)`, + ) + .run( + options.sessionId, + null, + options.videoId, + options.animeId ?? null, + options.lineIndex ?? 1, + 0, + 1000, + word, + options.startedAtMs, + options.startedAtMs, + ).lastInsertRowid, + ); + const wordRow = db + .prepare( + `INSERT INTO imm_words ( + headword, word, reading, pos1, pos2, pos3, part_of_speech, + first_seen, last_seen, frequency + ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?) + ON CONFLICT(headword, word, reading) DO UPDATE SET + frequency = imm_words.frequency + excluded.frequency, + last_seen = excluded.last_seen + RETURNING id`, + ) + .get( + word, + options.reading ?? '', + options.pos1 ?? '名詞', + options.pos2 ?? '一般', + options.pos3 ?? '', + options.partOfSpeech ?? 'noun', + Math.floor(options.startedAtMs / 1000), + Math.floor(options.startedAtMs / 1000), + options.occurrenceCount, + ) as { id: number }; + const wordId = Number(wordRow.id); + + db.prepare( + `INSERT INTO imm_word_line_occurrences (line_id, word_id, occurrence_count) + VALUES (?, ?, ?)`, + ).run(lineId, wordId, options.occurrenceCount); +} + function withMockNowMs(fixedDateMs: string | number, run: () => T): T { const previousNowMs = globalThis.__subminerTestNowMs; globalThis.__subminerTestNowMs = fixedDateMs; @@ -1236,6 +1307,89 @@ test('getQueryHints computes weekly new-word cutoff from calendar midnights', () }); }); +test('word-count read models use filtered persisted occurrences with raw fallback', () => { + const dbPath = makeDbPath(); + const db = new Database(dbPath); + + try { + ensureSchema(db); + const videoId = getOrCreateVideoRecord(db, 'local:/tmp/filtered-word-metrics.mkv', { + canonicalTitle: 'Filtered Word Metrics', + sourcePath: '/tmp/filtered-word-metrics.mkv', + sourceUrl: null, + sourceType: 
SOURCE_TYPE_LOCAL, + }); + + const startedAtMs = 1_700_000_000_000; + const withOccurrences = startSessionRecord(db, videoId, startedAtMs); + const fallbackOnly = startSessionRecord(db, videoId, startedAtMs + 60_000); + + db.prepare( + ` + UPDATE imm_sessions + SET ended_at_ms = ?, status = 2, active_watched_ms = ?, tokens_seen = ?, yomitan_lookup_count = ? + WHERE session_id = ? + `, + ).run(startedAtMs + 30_000, 2, 5, 1, withOccurrences.sessionId); + db.prepare( + ` + UPDATE imm_sessions + SET ended_at_ms = ?, status = 2, active_watched_ms = ?, tokens_seen = ?, yomitan_lookup_count = ? + WHERE session_id = ? + `, + ).run(startedAtMs + 90_000, 2, 7, 2, fallbackOnly.sessionId); + + insertFilteredWordOccurrence(db, { + sessionId: withOccurrences.sessionId, + videoId, + occurrenceCount: 2, + startedAtMs, + }); + insertFilteredWordOccurrence(db, { + sessionId: withOccurrences.sessionId, + videoId, + lineIndex: 2, + occurrenceCount: 3, + startedAtMs, + headword: 'じゃない', + word: 'じゃない', + partOfSpeech: 'i_adjective', + pos1: '形容詞', + pos2: '*|自立', + pos3: '*', + }); + + db.prepare( + ` + INSERT INTO imm_daily_rollups ( + rollup_day, video_id, total_sessions, total_active_min, total_lines_seen, + total_tokens_seen, total_cards + ) VALUES (?, ?, ?, ?, ?, ?, ?) 
+ `, + ).run(Math.floor(startedAtMs / 86_400_000), videoId, 2, 1, 2, 12, 0); + + const summaries = getSessionSummaries(db, 10); + assert.equal( + summaries.find((session) => session.sessionId === withOccurrences.sessionId)?.tokensSeen, + 2, + ); + assert.equal( + summaries.find((session) => session.sessionId === fallbackOnly.sessionId)?.tokensSeen, + 7, + ); + + const hints = getQueryHints(db); + assert.equal(hints.totalTokensSeen, 9); + + const rollup = getDailyRollups(db, 1)[0]!; + assert.equal(rollup.totalTokensSeen, 9); + assert.equal(rollup.tokensPerMin, 9); + } finally { + db.close(); + cleanupDbPath(dbPath); + } +}); + test('getQueryHints counts new words by distinct headword first-seen time', () => { const dbPath = makeDbPath(); const db = new Database(dbPath); @@ -1430,6 +1584,61 @@ test('getVocabularyStats returns rows ordered by frequency descending', () => { } }); +test('getVocabularyStats filters rows that fail tokenizer vocabulary rules', () => { + const dbPath = makeDbPath(); + const db = new Database(dbPath); + + try { + ensureSchema(db); + const stmts = createTrackerPreparedStatements(db); + + stmts.wordUpsertStmt.run( + 'どうしても', + 'どうしてもって', + 'どうしてもって', + 'other', + '副詞|助詞', + '一般|格助詞', + '', + 1_000, + 1_000, + ); + stmts.wordUpsertStmt.run( + 'じゃない', + 'じゃない', + '', + 'i_adjective', + '形容詞', + '*|自立', + '*', + 1_100, + 1_100, + ); + stmts.wordUpsertStmt.run( + '何か', + '何か', + 'なにか', + 'other', + '名詞|助詞', + '代名詞|副助詞/並立助詞/終助詞', + '一般|*', + 1_200, + 1_200, + ); + stmts.wordUpsertStmt.run('猫', '猫', 'ねこ', 'noun', '名詞', '一般', '', 1_500, 1_500); + + const rows = getVocabularyStats(db, 10); + + assert.deepEqual( + rows.map((row) => row.headword), + ['猫'], + ); + } finally { + db.close(); + cleanupDbPath(dbPath); + } +}); + test('getVocabularyStats returns empty array when no words exist', () => { const dbPath = makeDbPath(); const db = new Database(dbPath); @@ -1475,6 +1684,22 @@ test('cleanupVocabularyStats repairs stored POS metadata and removes 
excluded im headword, word, reading, part_of_speech, pos1, pos2, pos3, first_seen, last_seen, frequency ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)`, ).run('未解決', '未解決', '', '', '', '', '', 901, 951, 1); + db.prepare( + `INSERT INTO imm_words ( + headword, word, reading, part_of_speech, pos1, pos2, pos3, first_seen, last_seen, frequency + ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)`, + ).run( + 'どうしても', + 'どうしてもって', + 'どうしてもって', + 'other', + '副詞|助詞', + '一般|格助詞', + '', + 1_110, + 1_610, + 7, + ); const result = await cleanupVocabularyStats(db, { resolveLegacyPos: async (row) => { @@ -1517,7 +1742,7 @@ test('cleanupVocabularyStats repairs stored POS metadata and removes excluded im pos2: string; }>; - assert.deepEqual(result, { scanned: 5, kept: 3, deleted: 2, repaired: 2 }); + assert.deepEqual(result, { scanned: 6, kept: 3, deleted: 3, repaired: 2 }); assert.deepEqual( rows.map((row) => ({ headword: row.headword, frequency: row.frequency })), [ @@ -2226,6 +2451,31 @@ test('getSessionWordsByLine joins word occurrences through imm_words.id', () => `INSERT INTO imm_word_line_occurrences (line_id, word_id, occurrence_count) VALUES (?, ?, ?)`, ).run(lineId, wordId, 1); + const excludedWordId = Number( + db + .prepare( + `INSERT INTO imm_words ( + headword, word, reading, pos1, pos2, pos3, part_of_speech, first_seen, last_seen, frequency + ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)`, + ) + .run( + 'じゃない', + 'じゃない', + '', + '形容詞', + '*|自立', + '*', + 'i_adjective', + startedAtMs, + startedAtMs, + 1, + ).lastInsertRowid, + ); + + db.prepare( + `INSERT INTO imm_word_line_occurrences (line_id, word_id, occurrence_count) + VALUES (?, ?, ?)`, + ).run(lineId, excludedWordId, 3); assert.deepEqual(getSessionWordsByLine(db, sessionId), [ { lineIndex: 0, headword: '猫', occurrenceCount: 1 }, @@ -3959,6 +4209,121 @@ test('getTrendsDashboard librarySummary returns null lookupsPerHundred when word } }); +test('getTrendsDashboard word metrics use filtered persisted occurrences', () => { + const 
dbPath = makeDbPath(); + const db = new Database(dbPath); + + try { + ensureSchema(db); + const stmts = createTrackerPreparedStatements(db); + const videoId = getOrCreateVideoRecord(db, 'local:/tmp/filtered-trends.mkv', { + canonicalTitle: 'Filtered Trends Episode', + sourcePath: '/tmp/filtered-trends.mkv', + sourceUrl: null, + sourceType: SOURCE_TYPE_LOCAL, + }); + const animeId = getOrCreateAnimeRecord(db, { + parsedTitle: 'Filtered Trends Anime', + canonicalTitle: 'Filtered Trends Anime', + anilistId: null, + titleRomaji: null, + titleEnglish: null, + titleNative: null, + metadataJson: null, + }); + linkVideoToAnimeRecord(db, videoId, { + animeId, + parsedBasename: 'filtered-trends.mkv', + parsedTitle: 'Filtered Trends Anime', + parsedSeason: 1, + parsedEpisode: 1, + parserSource: 'test', + parserConfidence: 1, + parseMetadataJson: null, + }); + + const dayOneStart = 1_700_000_000_000; + const dayTwoStart = dayOneStart + 86_400_000; + const rows = [ + { start: dayOneStart, rawWords: 10, filteredWords: 2, lookups: 4 }, + { start: dayTwoStart, rawWords: 20, filteredWords: 3, lookups: 6 }, + ]; + + for (const [index, row] of rows.entries()) { + const session = startSessionRecord(db, videoId, row.start); + stmts.telemetryInsertStmt.run( + session.sessionId, + `${row.start + 60_000}`, + 10 * 60_000, + 10 * 60_000, + 1, + row.rawWords, + 0, + 0, + 0, + row.lookups, + 0, + 0, + 0, + 0, + `${row.start + 60_000}`, + `${row.start + 60_000}`, + ); + db.prepare( + ` + UPDATE imm_sessions + SET ended_at_ms = ?, total_watched_ms = ?, active_watched_ms = ?, + lines_seen = ?, tokens_seen = ?, cards_mined = ?, yomitan_lookup_count = ? + WHERE session_id = ? 
+ `, + ).run( + `${row.start + 60_000}`, + 10 * 60_000, + 10 * 60_000, + 1, + row.rawWords, + 0, + row.lookups, + session.sessionId, + ); + insertFilteredWordOccurrence(db, { + sessionId: session.sessionId, + videoId, + animeId, + lineIndex: index + 1, + occurrenceCount: row.filteredWords, + startedAtMs: row.start, + headword: `単語${index}`, + }); + db.prepare( + ` + INSERT INTO imm_daily_rollups ( + rollup_day, video_id, total_sessions, total_active_min, total_lines_seen, + total_tokens_seen, total_cards + ) VALUES (?, ?, ?, ?, ?, ?, ?) + `, + ).run(Math.floor(row.start / 86_400_000), videoId, 1, 10, 1, row.rawWords, 0); + } + + const dashboard = getTrendsDashboard(db, 'all', 'day'); + assert.deepEqual( + dashboard.activity.words.map((point) => point.value), + [2, 3], + ); + assert.deepEqual( + dashboard.progress.words.map((point) => point.value), + [2, 5], + ); + assert.equal(dashboard.ratios.lookupsPerHundred[0]?.value, 200); + assert.equal(dashboard.librarySummary[0]?.words, 5); + assert.equal(dashboard.librarySummary[0]?.lookupsPerHundred, 200); + assert.equal(dashboard.animeCumulative.words.at(-1)?.value, 5); + } finally { + db.close(); + cleanupDbPath(dbPath); + } +}); + test('getTrendsDashboard librarySummary is empty when no rollups exist', () => { const dbPath = makeDbPath(); const db = new Database(dbPath); diff --git a/src/core/services/immersion-tracker/query-lexical.ts b/src/core/services/immersion-tracker/query-lexical.ts index 736595b8..6774e294 100644 --- a/src/core/services/immersion-tracker/query-lexical.ts +++ b/src/core/services/immersion-tracker/query-lexical.ts @@ -1,4 +1,6 @@ import type { DatabaseSync } from './sqlite'; +import { PartOfSpeech, type MergedToken } from '../../../types'; +import { shouldExcludeTokenFromVocabularyPersistence } from '../tokenizer/annotation-stage'; import type { KanjiAnimeAppearanceRow, KanjiDetailRow, @@ -7,18 +9,55 @@ import type { KanjiWordRow, SessionEventRow, SimilarWordRow, + StatsExcludedWordRow, 
VocabularyStatsRow, WordAnimeAppearanceRow, WordDetailRow, WordOccurrenceRow, } from './types'; -import { fromDbTimestamp } from './query-shared'; +import { fromDbTimestamp, toDbTimestamp } from './query-shared'; +import { nowMs } from './time'; + +const VOCABULARY_STATS_FILTER_OVERSAMPLE_FACTOR = 4; +const VOCABULARY_STATS_FILTER_OVERSAMPLE_MIN = 100; + +function toVocabularyToken(row: VocabularyStatsRow): MergedToken { + const partOfSpeech = + row.partOfSpeech && Object.values(PartOfSpeech).includes(row.partOfSpeech as PartOfSpeech) + ? (row.partOfSpeech as PartOfSpeech) + : PartOfSpeech.other; + + return { + surface: row.word, + reading: row.reading ?? '', + headword: row.headword, + startPos: 0, + endPos: row.word.length, + partOfSpeech, + pos1: row.pos1 ?? '', + pos2: row.pos2 ?? '', + pos3: row.pos3 ?? '', + frequencyRank: row.frequencyRank ?? undefined, + isMerged: false, + isKnown: false, + isNPlusOneTarget: false, + }; +} + +function isVocabularyStatsRowVisible(row: VocabularyStatsRow): boolean { + return !shouldExcludeTokenFromVocabularyPersistence(toVocabularyToken(row)); +} export function getVocabularyStats( db: DatabaseSync, limit = 100, excludePos?: string[], ): VocabularyStatsRow[] { + const queryLimit = Math.max( + limit, + limit * VOCABULARY_STATS_FILTER_OVERSAMPLE_FACTOR, + limit + VOCABULARY_STATS_FILTER_OVERSAMPLE_MIN, + ); const hasExclude = excludePos && excludePos.length > 0; const placeholders = hasExclude ? excludePos.map(() => '?').join(', ') : ''; const whereClause = hasExclude @@ -37,8 +76,48 @@ export function getVocabularyStats( GROUP BY w.id ORDER BY w.frequency DESC LIMIT ? `); - const params = hasExclude ? [...excludePos, limit] : [limit]; - return stmt.all(...params) as VocabularyStatsRow[]; + const params = hasExclude ? 
[...excludePos, queryLimit] : [queryLimit]; + return (stmt.all(...params) as VocabularyStatsRow[]) + .filter(isVocabularyStatsRowVisible) + .slice(0, limit); +} + +export function getStatsExcludedWords(db: DatabaseSync): StatsExcludedWordRow[] { + return db + .prepare( + ` + SELECT headword, word, reading + FROM imm_stats_excluded_words + ORDER BY headword COLLATE NOCASE, word COLLATE NOCASE, reading COLLATE NOCASE + `, + ) + .all() as StatsExcludedWordRow[]; +} + +export function replaceStatsExcludedWords(db: DatabaseSync, words: StatsExcludedWordRow[]): void { + const now = toDbTimestamp(nowMs()); + const insertStmt = db.prepare(` + INSERT OR IGNORE INTO imm_stats_excluded_words( + headword, + word, + reading, + CREATED_DATE, + LAST_UPDATE_DATE + ) + VALUES (?, ?, ?, ?, ?) + `); + + db.exec('BEGIN IMMEDIATE'); + try { + db.prepare('DELETE FROM imm_stats_excluded_words').run(); + for (const word of words) { + insertStmt.run(word.headword, word.word, word.reading, now, now); + } + db.exec('COMMIT'); + } catch (error) { + db.exec('ROLLBACK'); + throw error; + } } export function getKanjiStats(db: DatabaseSync, limit = 100): KanjiStatsRow[] { diff --git a/src/core/services/immersion-tracker/query-library.ts b/src/core/services/immersion-tracker/query-library.ts index e9edaa5e..a650abc4 100644 --- a/src/core/services/immersion-tracker/query-library.ts +++ b/src/core/services/immersion-tracker/query-library.ts @@ -16,12 +16,31 @@ import type { StreakCalendarRow, WatchTimePerAnimeRow, } from './types'; -import { ACTIVE_SESSION_METRICS_CTE, fromDbTimestamp, resolvedCoverBlobExpr } from './query-shared'; +import { + ACTIVE_SESSION_METRICS_CTE, + SESSION_WORD_COUNTS_CTE, + SESSION_WORD_COUNTS_SELECT, + fromDbTimestamp, + resolvedCoverBlobExpr, + sessionDisplayWordsExpr, + visibleWordSql, +} from './query-shared'; export function getAnimeLibrary(db: DatabaseSync): AnimeLibraryRow[] { + const wordsExpr = sessionDisplayWordsExpr('s', 'swc'); const rows = db .prepare( ` + 
${SESSION_WORD_COUNTS_CTE}, + anime_word_counts AS ( + SELECT v.anime_id AS animeId, SUM(${wordsExpr}) AS totalTokensSeen + FROM imm_sessions s + JOIN imm_videos v ON v.video_id = s.video_id + LEFT JOIN session_word_counts swc ON swc.sessionId = s.session_id + WHERE s.ended_at_ms IS NOT NULL + AND v.anime_id IS NOT NULL + GROUP BY v.anime_id + ) SELECT a.anime_id AS animeId, a.canonical_title AS canonicalTitle, @@ -29,13 +48,14 @@ export function getAnimeLibrary(db: DatabaseSync): AnimeLibraryRow[] { COALESCE(lm.total_sessions, 0) AS totalSessions, COALESCE(lm.total_active_ms, 0) AS totalActiveMs, COALESCE(lm.total_cards, 0) AS totalCards, - COALESCE(lm.total_tokens_seen, 0) AS totalTokensSeen, + COALESCE(awc.totalTokensSeen, lm.total_tokens_seen, 0) AS totalTokensSeen, COUNT(DISTINCT v.video_id) AS episodeCount, a.episodes_total AS episodesTotal, COALESCE(lm.last_watched_ms, 0) AS lastWatchedMs FROM imm_anime a JOIN imm_lifetime_anime lm ON lm.anime_id = a.anime_id JOIN imm_videos v ON v.anime_id = a.anime_id + LEFT JOIN anime_word_counts awc ON awc.animeId = a.anime_id GROUP BY a.anime_id ORDER BY totalActiveMs DESC, lm.last_watched_ms DESC, canonicalTitle ASC `, @@ -48,6 +68,7 @@ export function getAnimeLibrary(db: DatabaseSync): AnimeLibraryRow[] { } export function getAnimeDetail(db: DatabaseSync, animeId: number): AnimeDetailRow | null { + const wordsExpr = sessionDisplayWordsExpr('s', 'swc', 'COALESCE(asm.tokensSeen, s.tokens_seen)'); const row = db .prepare( ` @@ -63,7 +84,10 @@ export function getAnimeDetail(db: DatabaseSync, animeId: number): AnimeDetailRo COALESCE(lm.total_sessions, 0) AS totalSessions, COALESCE(lm.total_active_ms, 0) AS totalActiveMs, COALESCE(lm.total_cards, 0) AS totalCards, - COALESCE(lm.total_tokens_seen, 0) AS totalTokensSeen, + CASE + WHEN COUNT(s.session_id) > 0 THEN COALESCE(SUM(${wordsExpr}), 0) + ELSE COALESCE(lm.total_tokens_seen, 0) + END AS totalTokensSeen, COALESCE(lm.total_lines_seen, 0) AS totalLinesSeen, 
COALESCE(SUM(COALESCE(asm.lookupCount, s.lookup_count, 0)), 0) AS totalLookupCount, COALESCE(SUM(COALESCE(asm.lookupHits, s.lookup_hits, 0)), 0) AS totalLookupHits, @@ -75,6 +99,7 @@ export function getAnimeDetail(db: DatabaseSync, animeId: number): AnimeDetailRo JOIN imm_videos v ON v.anime_id = a.anime_id LEFT JOIN imm_sessions s ON s.video_id = v.video_id LEFT JOIN active_session_metrics asm ON asm.sessionId = s.session_id + LEFT JOIN session_word_counts swc ON swc.sessionId = s.session_id WHERE a.anime_id = ? GROUP BY a.anime_id `, @@ -108,6 +133,7 @@ export function getAnimeAnilistEntries(db: DatabaseSync, animeId: number): Anime } export function getAnimeEpisodes(db: DatabaseSync, animeId: number): AnimeEpisodeRow[] { + const wordsExpr = sessionDisplayWordsExpr('s', 'swc', 'COALESCE(asm.tokensSeen, s.tokens_seen)'); const rows = db .prepare( ` @@ -162,12 +188,13 @@ export function getAnimeEpisodes(db: DatabaseSync, animeId: number): AnimeEpisod COUNT(DISTINCT s.session_id) AS totalSessions, COALESCE(SUM(COALESCE(asm.activeWatchedMs, s.active_watched_ms, 0)), 0) AS totalActiveMs, COALESCE(SUM(COALESCE(asm.cardsMined, s.cards_mined, 0)), 0) AS totalCards, - COALESCE(SUM(COALESCE(asm.tokensSeen, s.tokens_seen, 0)), 0) AS totalTokensSeen, + COALESCE(SUM(${wordsExpr}), 0) AS totalTokensSeen, COALESCE(SUM(COALESCE(asm.yomitanLookupCount, s.yomitan_lookup_count, 0)), 0) AS totalYomitanLookupCount, MAX(s.started_at_ms) AS lastWatchedMs FROM imm_videos v LEFT JOIN imm_sessions s ON s.video_id = v.video_id LEFT JOIN active_session_metrics asm ON asm.sessionId = s.session_id + LEFT JOIN session_word_counts swc ON swc.sessionId = s.session_id WHERE v.anime_id = ? 
GROUP BY v.video_id ORDER BY @@ -192,16 +219,25 @@ export function getAnimeEpisodes(db: DatabaseSync, animeId: number): AnimeEpisod } export function getMediaLibrary(db: DatabaseSync): MediaLibraryRow[] { + const wordsExpr = sessionDisplayWordsExpr('s', 'swc'); const rows = db .prepare( ` + ${SESSION_WORD_COUNTS_CTE}, + media_word_counts AS ( + SELECT s.video_id AS videoId, SUM(${wordsExpr}) AS totalTokensSeen + FROM imm_sessions s + LEFT JOIN session_word_counts swc ON swc.sessionId = s.session_id + WHERE s.ended_at_ms IS NOT NULL + GROUP BY s.video_id + ) SELECT v.video_id AS videoId, v.canonical_title AS canonicalTitle, COALESCE(lm.total_sessions, 0) AS totalSessions, COALESCE(lm.total_active_ms, 0) AS totalActiveMs, COALESCE(lm.total_cards, 0) AS totalCards, - COALESCE(lm.total_tokens_seen, 0) AS totalTokensSeen, + COALESCE(mwc.totalTokensSeen, lm.total_tokens_seen, 0) AS totalTokensSeen, COALESCE(lm.last_watched_ms, 0) AS lastWatchedMs, yv.youtube_video_id AS youtubeVideoId, yv.video_url AS videoUrl, @@ -220,6 +256,7 @@ export function getMediaLibrary(db: DatabaseSync): MediaLibraryRow[] { END AS hasCoverArt FROM imm_videos v JOIN imm_lifetime_media lm ON lm.video_id = v.video_id + LEFT JOIN media_word_counts mwc ON mwc.videoId = v.video_id LEFT JOIN imm_media_art ma ON ma.video_id = v.video_id LEFT JOIN imm_youtube_videos yv ON yv.video_id = v.video_id ORDER BY lm.last_watched_ms DESC @@ -233,6 +270,7 @@ export function getMediaLibrary(db: DatabaseSync): MediaLibraryRow[] { } export function getMediaDetail(db: DatabaseSync, videoId: number): MediaDetailRow | null { + const wordsExpr = sessionDisplayWordsExpr('s', 'swc', 'COALESCE(asm.tokensSeen, s.tokens_seen)'); return db .prepare( ` @@ -244,7 +282,10 @@ export function getMediaDetail(db: DatabaseSync, videoId: number): MediaDetailRo COALESCE(lm.total_sessions, 0) AS totalSessions, COALESCE(lm.total_active_ms, 0) AS totalActiveMs, COALESCE(lm.total_cards, 0) AS totalCards, - COALESCE(lm.total_tokens_seen, 0) 
AS totalTokensSeen, + CASE + WHEN COUNT(s.session_id) > 0 THEN COALESCE(SUM(${wordsExpr}), 0) + ELSE COALESCE(lm.total_tokens_seen, 0) + END AS totalTokensSeen, COALESCE(lm.total_lines_seen, 0) AS totalLinesSeen, COALESCE(SUM(COALESCE(asm.lookupCount, s.lookup_count, 0)), 0) AS totalLookupCount, COALESCE(SUM(COALESCE(asm.lookupHits, s.lookup_hits, 0)), 0) AS totalLookupHits, @@ -265,6 +306,7 @@ export function getMediaDetail(db: DatabaseSync, videoId: number): MediaDetailRo LEFT JOIN imm_youtube_videos yv ON yv.video_id = v.video_id LEFT JOIN imm_sessions s ON s.video_id = v.video_id LEFT JOIN active_session_metrics asm ON asm.sessionId = s.session_id + LEFT JOIN session_word_counts swc ON swc.sessionId = s.session_id WHERE v.video_id = ? GROUP BY v.video_id `, @@ -277,6 +319,7 @@ export function getMediaSessions( videoId: number, limit = 100, ): SessionSummaryQueryRow[] { + const wordsExpr = sessionDisplayWordsExpr('s', 'swc', 'COALESCE(asm.tokensSeen, s.tokens_seen)'); const rows = db .prepare( ` @@ -290,13 +333,14 @@ export function getMediaSessions( COALESCE(asm.totalWatchedMs, s.total_watched_ms, 0) AS totalWatchedMs, COALESCE(asm.activeWatchedMs, s.active_watched_ms, 0) AS activeWatchedMs, COALESCE(asm.linesSeen, s.lines_seen, 0) AS linesSeen, - COALESCE(asm.tokensSeen, s.tokens_seen, 0) AS tokensSeen, + ${wordsExpr} AS tokensSeen, COALESCE(asm.cardsMined, s.cards_mined, 0) AS cardsMined, COALESCE(asm.lookupCount, s.lookup_count, 0) AS lookupCount, COALESCE(asm.lookupHits, s.lookup_hits, 0) AS lookupHits, COALESCE(asm.yomitanLookupCount, s.yomitan_lookup_count, 0) AS yomitanLookupCount FROM imm_sessions s LEFT JOIN active_session_metrics asm ON asm.sessionId = s.session_id + LEFT JOIN session_word_counts swc ON swc.sessionId = s.session_id LEFT JOIN imm_videos v ON v.video_id = s.video_id WHERE s.video_id = ? 
ORDER BY s.started_at_ms DESC @@ -321,10 +365,27 @@ export function getMediaDailyRollups( videoId: number, limit = 90, ): ImmersionSessionRollupRow[] { + const wordsExpr = sessionDisplayWordsExpr('s', 'swc'); return db .prepare( ` - WITH recent_days AS ( + WITH session_word_counts AS ( + ${SESSION_WORD_COUNTS_SELECT} + ), + daily_word_counts AS ( + SELECT + CAST( + julianday(CAST(s.started_at_ms AS REAL) / 1000, 'unixepoch', 'localtime') - 2440587.5 + AS INTEGER + ) AS rollupDay, + s.video_id AS videoId, + SUM(${wordsExpr}) AS totalTokensSeen + FROM imm_sessions s + LEFT JOIN session_word_counts swc ON swc.sessionId = s.session_id + WHERE s.ended_at_ms IS NOT NULL + GROUP BY rollupDay, s.video_id + ), + recent_days AS ( SELECT DISTINCT rollup_day FROM imm_daily_rollups WHERE video_id = ? @@ -337,12 +398,18 @@ export function getMediaDailyRollups( total_sessions AS totalSessions, total_active_min AS totalActiveMin, total_lines_seen AS totalLinesSeen, - total_tokens_seen AS totalTokensSeen, + COALESCE(dwc.totalTokensSeen, total_tokens_seen) AS totalTokensSeen, total_cards AS totalCards, cards_per_hour AS cardsPerHour, - tokens_per_min AS tokensPerMin, + CASE + WHEN total_active_min > 0 THEN COALESCE(dwc.totalTokensSeen, total_tokens_seen) * 1.0 / total_active_min + ELSE NULL + END AS tokensPerMin, lookup_hit_rate AS lookupHitRate FROM imm_daily_rollups + LEFT JOIN daily_word_counts dwc + ON dwc.rollupDay = rollup_day + AND dwc.videoId = video_id WHERE video_id = ? 
AND rollup_day IN (SELECT rollup_day FROM recent_days) ORDER BY rollup_day DESC, video_id DESC @@ -356,10 +423,27 @@ export function getAnimeDailyRollups( animeId: number, limit = 90, ): ImmersionSessionRollupRow[] { + const wordsExpr = sessionDisplayWordsExpr('s', 'swc'); return db .prepare( ` - WITH recent_days AS ( + WITH session_word_counts AS ( + ${SESSION_WORD_COUNTS_SELECT} + ), + daily_word_counts AS ( + SELECT + CAST( + julianday(CAST(s.started_at_ms AS REAL) / 1000, 'unixepoch', 'localtime') - 2440587.5 + AS INTEGER + ) AS rollupDay, + s.video_id AS videoId, + SUM(${wordsExpr}) AS totalTokensSeen + FROM imm_sessions s + LEFT JOIN session_word_counts swc ON swc.sessionId = s.session_id + WHERE s.ended_at_ms IS NOT NULL + GROUP BY rollupDay, s.video_id + ), + recent_days AS ( SELECT DISTINCT r.rollup_day FROM imm_daily_rollups r JOIN imm_videos v ON v.video_id = r.video_id @@ -370,11 +454,19 @@ export function getAnimeDailyRollups( SELECT r.rollup_day AS rollupDayOrMonth, r.video_id AS videoId, r.total_sessions AS totalSessions, r.total_active_min AS totalActiveMin, r.total_lines_seen AS totalLinesSeen, - r.total_tokens_seen AS totalTokensSeen, r.total_cards AS totalCards, - r.cards_per_hour AS cardsPerHour, r.tokens_per_min AS tokensPerMin, + COALESCE(dwc.totalTokensSeen, r.total_tokens_seen) AS totalTokensSeen, + r.total_cards AS totalCards, + r.cards_per_hour AS cardsPerHour, + CASE + WHEN r.total_active_min > 0 THEN COALESCE(dwc.totalTokensSeen, r.total_tokens_seen) * 1.0 / r.total_active_min + ELSE NULL + END AS tokensPerMin, r.lookup_hit_rate AS lookupHitRate FROM imm_daily_rollups r JOIN imm_videos v ON v.video_id = r.video_id + LEFT JOIN daily_word_counts dwc + ON dwc.rollupDay = r.rollup_day + AND dwc.videoId = r.video_id WHERE v.anime_id = ? 
AND r.rollup_day IN (SELECT rollup_day FROM recent_days) ORDER BY r.rollup_day DESC, r.video_id DESC @@ -470,7 +562,7 @@ export function getAnimeWords(db: DatabaseSync, animeId: number, limit = 50): An FROM imm_word_line_occurrences o JOIN imm_subtitle_lines sl ON sl.line_id = o.line_id JOIN imm_words w ON w.id = o.word_id - WHERE sl.anime_id = ? + WHERE sl.anime_id = ? AND ${visibleWordSql('w')} GROUP BY w.id ORDER BY frequency DESC LIMIT ? @@ -556,6 +648,7 @@ export function getEpisodeWords(db: DatabaseSync, videoId: number, limit = 50): } export function getEpisodeSessions(db: DatabaseSync, videoId: number): SessionSummaryQueryRow[] { + const wordsExpr = sessionDisplayWordsExpr('s', 'swc', 'COALESCE(asm.tokensSeen, s.tokens_seen)'); const rows = db .prepare( ` @@ -567,7 +660,7 @@ export function getEpisodeSessions(db: DatabaseSync, videoId: number): SessionSu COALESCE(asm.totalWatchedMs, s.total_watched_ms, 0) AS totalWatchedMs, COALESCE(asm.activeWatchedMs, s.active_watched_ms, 0) AS activeWatchedMs, COALESCE(asm.linesSeen, s.lines_seen, 0) AS linesSeen, - COALESCE(asm.tokensSeen, s.tokens_seen, 0) AS tokensSeen, + ${wordsExpr} AS tokensSeen, COALESCE(asm.cardsMined, s.cards_mined, 0) AS cardsMined, COALESCE(asm.lookupCount, s.lookup_count, 0) AS lookupCount, COALESCE(asm.lookupHits, s.lookup_hits, 0) AS lookupHits, @@ -575,6 +668,7 @@ export function getEpisodeSessions(db: DatabaseSync, videoId: number): SessionSu FROM imm_sessions s JOIN imm_videos v ON v.video_id = s.video_id LEFT JOIN active_session_metrics asm ON asm.sessionId = s.session_id + LEFT JOIN session_word_counts swc ON swc.sessionId = s.session_id WHERE s.video_id = ? 
ORDER BY s.started_at_ms DESC `, diff --git a/src/core/services/immersion-tracker/query-sessions.ts b/src/core/services/immersion-tracker/query-sessions.ts index d81fb0bc..a135f226 100644 --- a/src/core/services/immersion-tracker/query-sessions.ts +++ b/src/core/services/immersion-tracker/query-sessions.ts @@ -6,14 +6,18 @@ import type { } from './types'; import { ACTIVE_SESSION_METRICS_CTE, + SESSION_WORD_COUNTS_CTE, + SESSION_WORD_COUNTS_SELECT, currentDbTimestamp, fromDbTimestamp, getLocalEpochDay, getShiftedLocalDaySec, - toDbTimestamp, + sessionDisplayWordsExpr, + visibleWordSql, } from './query-shared'; export function getSessionSummaries(db: DatabaseSync, limit = 50): SessionSummaryQueryRow[] { + const wordsExpr = sessionDisplayWordsExpr('s', 'swc', 'COALESCE(asm.tokensSeen, s.tokens_seen)'); const prepared = db.prepare(` ${ACTIVE_SESSION_METRICS_CTE} SELECT @@ -27,13 +31,14 @@ export function getSessionSummaries(db: DatabaseSync, limit = 50): SessionSummar COALESCE(asm.totalWatchedMs, s.total_watched_ms, 0) AS totalWatchedMs, COALESCE(asm.activeWatchedMs, s.active_watched_ms, 0) AS activeWatchedMs, COALESCE(asm.linesSeen, s.lines_seen, 0) AS linesSeen, - COALESCE(asm.tokensSeen, s.tokens_seen, 0) AS tokensSeen, + ${wordsExpr} AS tokensSeen, COALESCE(asm.cardsMined, s.cards_mined, 0) AS cardsMined, COALESCE(asm.lookupCount, s.lookup_count, 0) AS lookupCount, COALESCE(asm.lookupHits, s.lookup_hits, 0) AS lookupHits, COALESCE(asm.yomitanLookupCount, s.yomitan_lookup_count, 0) AS yomitanLookupCount FROM imm_sessions s LEFT JOIN active_session_metrics asm ON asm.sessionId = s.session_id + LEFT JOIN session_word_counts swc ON swc.sessionId = s.session_id LEFT JOIN imm_videos v ON v.video_id = s.video_id LEFT JOIN imm_anime a ON a.anime_id = v.anime_id ORDER BY s.started_at_ms DESC @@ -94,7 +99,9 @@ export function getSessionTimeline( /** Returns all distinct headwords in the vocabulary table (global). 
*/ export function getAllDistinctHeadwords(db: DatabaseSync): string[] { - const rows = db.prepare('SELECT DISTINCT headword FROM imm_words').all() as Array<{ + const rows = db + .prepare(`SELECT DISTINCT headword FROM imm_words w WHERE ${visibleWordSql('w')}`) + .all() as Array<{ headword: string; }>; return rows.map((r) => r.headword); @@ -109,7 +116,7 @@ export function getAnimeDistinctHeadwords(db: DatabaseSync, animeId: number): st FROM imm_word_line_occurrences o JOIN imm_subtitle_lines sl ON sl.line_id = o.line_id JOIN imm_words w ON w.id = o.word_id - WHERE sl.anime_id = ? + WHERE sl.anime_id = ? AND ${visibleWordSql('w')} `, ) .all(animeId) as Array<{ headword: string }>; @@ -125,7 +132,7 @@ export function getMediaDistinctHeadwords(db: DatabaseSync, videoId: number): st FROM imm_word_line_occurrences o JOIN imm_subtitle_lines sl ON sl.line_id = o.line_id JOIN imm_words w ON w.id = o.word_id - WHERE sl.video_id = ? + WHERE sl.video_id = ? AND ${visibleWordSql('w')} `, ) .all(videoId) as Array<{ headword: string }>; @@ -148,7 +155,7 @@ export function getSessionWordsByLine( FROM imm_subtitle_lines sl JOIN imm_word_line_occurrences wlo ON wlo.line_id = sl.line_id JOIN imm_words w ON w.id = wlo.word_id - WHERE sl.session_id = ? + WHERE sl.session_id = ? AND ${visibleWordSql('w')} ORDER BY sl.line_index ASC `); return stmt.all(sessionId) as Array<{ @@ -290,11 +297,17 @@ export function getQueryHints(db: DatabaseSync): { const totalCards = Number(lifetime?.totalCards ?? 0); const activeDays = Number(lifetime?.activeDays ?? 
0); + const lookupWordsExpr = sessionDisplayWordsExpr( + 's', + 'swc', + 'COALESCE(t.tokens_seen, s.tokens_seen)', + ); const lookupTotals = db .prepare( ` + ${SESSION_WORD_COUNTS_CTE} SELECT - COALESCE(SUM(COALESCE(t.tokens_seen, s.tokens_seen, 0)), 0) AS totalTokensSeen, + COALESCE(SUM(${lookupWordsExpr}), 0) AS totalTokensSeen, COALESCE(SUM(COALESCE(t.lookup_count, s.lookup_count, 0)), 0) AS totalLookupCount, COALESCE(SUM(COALESCE(t.lookup_hits, s.lookup_hits, 0)), 0) AS totalLookupHits, COALESCE(SUM(COALESCE(t.yomitan_lookup_count, s.yomitan_lookup_count, 0)), 0) AS totalYomitanLookupCount @@ -309,6 +322,7 @@ export function getQueryHints(db: DatabaseSync): { FROM imm_session_telemetry GROUP BY session_id ) t ON t.session_id = s.session_id + LEFT JOIN session_word_counts swc ON swc.sessionId = s.session_id WHERE s.ended_at_ms IS NOT NULL `, ) @@ -338,8 +352,25 @@ export function getQueryHints(db: DatabaseSync): { } export function getDailyRollups(db: DatabaseSync, limit = 60): ImmersionSessionRollupRow[] { + const wordsExpr = sessionDisplayWordsExpr('s', 'swc'); const prepared = db.prepare(` - WITH recent_days AS ( + WITH session_word_counts AS ( + ${SESSION_WORD_COUNTS_SELECT} + ), + daily_word_counts AS ( + SELECT + CAST( + julianday(CAST(s.started_at_ms AS REAL) / 1000, 'unixepoch', 'localtime') - 2440587.5 + AS INTEGER + ) AS rollupDay, + s.video_id AS videoId, + SUM(${wordsExpr}) AS totalTokensSeen + FROM imm_sessions s + LEFT JOIN session_word_counts swc ON swc.sessionId = s.session_id + WHERE s.ended_at_ms IS NOT NULL + GROUP BY rollupDay, s.video_id + ), + recent_days AS ( SELECT DISTINCT rollup_day FROM imm_daily_rollups ORDER BY rollup_day DESC @@ -351,12 +382,21 @@ export function getDailyRollups(db: DatabaseSync, limit = 60): ImmersionSessionR r.total_sessions AS totalSessions, r.total_active_min AS totalActiveMin, r.total_lines_seen AS totalLinesSeen, - r.total_tokens_seen AS totalTokensSeen, + COALESCE(dwc.totalTokensSeen, r.total_tokens_seen) AS 
totalTokensSeen, r.total_cards AS totalCards, r.cards_per_hour AS cardsPerHour, - r.tokens_per_min AS tokensPerMin, + CASE + WHEN r.total_active_min > 0 THEN COALESCE(dwc.totalTokensSeen, r.total_tokens_seen) * 1.0 / r.total_active_min + ELSE NULL + END AS tokensPerMin, r.lookup_hit_rate AS lookupHitRate FROM imm_daily_rollups r + LEFT JOIN daily_word_counts dwc + ON dwc.rollupDay = r.rollup_day + AND ( + (dwc.videoId IS NULL AND r.video_id IS NULL) + OR dwc.videoId = r.video_id + ) WHERE r.rollup_day IN (SELECT rollup_day FROM recent_days) ORDER BY r.rollup_day DESC, r.video_id DESC `); @@ -365,33 +405,53 @@ export function getDailyRollups(db: DatabaseSync, limit = 60): ImmersionSessionR } export function getMonthlyRollups(db: DatabaseSync, limit = 24): ImmersionSessionRollupRow[] { + const wordsExpr = sessionDisplayWordsExpr('s', 'swc'); const prepared = db.prepare(` - WITH recent_months AS ( + WITH session_word_counts AS ( + ${SESSION_WORD_COUNTS_SELECT} + ), + monthly_word_counts AS ( + SELECT + CAST(strftime('%Y%m', CAST(s.started_at_ms AS REAL) / 1000, 'unixepoch', 'localtime') AS INTEGER) AS rollupMonth, + s.video_id AS videoId, + SUM(${wordsExpr}) AS totalTokensSeen + FROM imm_sessions s + LEFT JOIN session_word_counts swc ON swc.sessionId = s.session_id + WHERE s.ended_at_ms IS NOT NULL + GROUP BY rollupMonth, s.video_id + ), + recent_months AS ( SELECT DISTINCT rollup_month FROM imm_monthly_rollups ORDER BY rollup_month DESC LIMIT ? 
) SELECT - rollup_month AS rollupDayOrMonth, - video_id AS videoId, - total_sessions AS totalSessions, - total_active_min AS totalActiveMin, - total_lines_seen AS totalLinesSeen, - total_tokens_seen AS totalTokensSeen, - total_cards AS totalCards, + r.rollup_month AS rollupDayOrMonth, + r.video_id AS videoId, + r.total_sessions AS totalSessions, + r.total_active_min AS totalActiveMin, + r.total_lines_seen AS totalLinesSeen, + COALESCE(mwc.totalTokensSeen, r.total_tokens_seen) AS totalTokensSeen, + r.total_cards AS totalCards, CASE - WHEN total_active_min > 0 THEN (total_cards * 60.0) / total_active_min + WHEN r.total_active_min > 0 THEN (r.total_cards * 60.0) / r.total_active_min ELSE NULL END AS cardsPerHour, CASE - WHEN total_active_min > 0 THEN total_tokens_seen * 1.0 / total_active_min + WHEN r.total_active_min > 0 THEN COALESCE(mwc.totalTokensSeen, r.total_tokens_seen) * 1.0 / r.total_active_min ELSE NULL END AS tokensPerMin, NULL AS lookupHitRate - FROM imm_monthly_rollups - WHERE rollup_month IN (SELECT rollup_month FROM recent_months) - ORDER BY rollup_month DESC, video_id DESC + FROM imm_monthly_rollups r + LEFT JOIN monthly_word_counts mwc + ON mwc.rollupMonth = r.rollup_month + AND ( + (mwc.videoId IS NULL AND r.video_id IS NULL) + OR mwc.videoId = r.video_id + ) + WHERE r.rollup_month IN (SELECT rollup_month FROM recent_months) + ORDER BY r.rollup_month DESC, r.video_id DESC `); return prepared.all(limit) as unknown as ImmersionSessionRollupRow[]; } diff --git a/src/core/services/immersion-tracker/query-shared.ts b/src/core/services/immersion-tracker/query-shared.ts index c492fd2c..a727b1c9 100644 --- a/src/core/services/immersion-tracker/query-shared.ts +++ b/src/core/services/immersion-tracker/query-shared.ts @@ -1,6 +1,42 @@ import type { DatabaseSync } from './sqlite'; +import { SUBTITLE_ANNOTATION_EXCLUDED_TERMS } from '../tokenizer/subtitle-annotation-filter'; import { nowMs } from './time'; +function quoteSqlString(value: string): string { + 
return `'${value.replaceAll("'", "''")}'`; +} + +const SQL_EXCLUDED_VOCABULARY_TERMS = [...SUBTITLE_ANNOTATION_EXCLUDED_TERMS].map(quoteSqlString); +const SQL_EXCLUDED_VOCABULARY_TERMS_LIST = + SQL_EXCLUDED_VOCABULARY_TERMS.length > 0 ? SQL_EXCLUDED_VOCABULARY_TERMS.join(', ') : "''"; + +export function visibleWordSql(wordAlias: string): string { + return `( + TRIM(COALESCE(${wordAlias}.word, '')) NOT IN (${SQL_EXCLUDED_VOCABULARY_TERMS_LIST}) + AND TRIM(COALESCE(${wordAlias}.headword, '')) NOT IN (${SQL_EXCLUDED_VOCABULARY_TERMS_LIST}) + AND TRIM(COALESCE(${wordAlias}.reading, '')) NOT IN (${SQL_EXCLUDED_VOCABULARY_TERMS_LIST}) + )`; +} + +export function filteredWordOccurrenceCountSql(occurrenceAlias: string, wordAlias: string): string { + return `CASE + WHEN ${occurrenceAlias}.word_id IS NOT NULL AND ${visibleWordSql(wordAlias)} + THEN ${occurrenceAlias}.occurrence_count + ELSE 0 + END`; +} + +export const SESSION_WORD_COUNTS_SELECT = ` + SELECT + sl.session_id AS sessionId, + COUNT(DISTINCT sl.line_id) AS persistedLineCount, + COALESCE(SUM(${filteredWordOccurrenceCountSql('wlo', 'w')}), 0) AS filteredWordsSeen + FROM imm_subtitle_lines sl + LEFT JOIN imm_word_line_occurrences wlo ON wlo.line_id = sl.line_id + LEFT JOIN imm_words w ON w.id = wlo.word_id + GROUP BY sl.session_id +`; + export const ACTIVE_SESSION_METRICS_CTE = ` WITH active_session_metrics AS ( SELECT @@ -17,9 +53,29 @@ export const ACTIVE_SESSION_METRICS_CTE = ` JOIN imm_sessions s ON s.session_id = t.session_id WHERE s.ended_at_ms IS NULL GROUP BY t.session_id + ), + session_word_counts AS ( + ${SESSION_WORD_COUNTS_SELECT} ) `; +export const SESSION_WORD_COUNTS_CTE = ` + WITH session_word_counts AS ( + ${SESSION_WORD_COUNTS_SELECT} + ) +`; + +export function sessionDisplayWordsExpr( + sessionAlias: string, + wordCountAlias: string, + rawTokensExpr = `${sessionAlias}.tokens_seen`, +): string { + return `CASE + WHEN COALESCE(${wordCountAlias}.persistedLineCount, 0) > 0 THEN 
COALESCE(${wordCountAlias}.filteredWordsSeen, 0) + ELSE COALESCE(${rawTokensExpr}, 0) + END`; +} + export function makePlaceholders(values: number[]): string { return values.map(() => '?').join(','); } diff --git a/src/core/services/immersion-tracker/query-trends.ts b/src/core/services/immersion-tracker/query-trends.ts index 8dd4aecd..f4e41cbc 100644 --- a/src/core/services/immersion-tracker/query-trends.ts +++ b/src/core/services/immersion-tracker/query-trends.ts @@ -9,6 +9,7 @@ import { getLocalMonthKey, getShiftedLocalDayTimestamp, makePlaceholders, + sessionDisplayWordsExpr, toDbTimestamp, } from './query-shared'; import { getDailyRollups, getMonthlyRollups } from './query-sessions'; @@ -560,6 +561,7 @@ function getTrendSessionMetrics( db: DatabaseSync, cutoffMs: string | null, ): TrendSessionMetricRow[] { + const wordsExpr = sessionDisplayWordsExpr('s', 'swc', 'COALESCE(asm.tokensSeen, s.tokens_seen)'); const whereClause = cutoffMs === null ? '' : 'WHERE s.started_at_ms >= ?'; const cutoffValue = cutoffMs === null ? 
null : toDbTimestamp(cutoffMs); const prepared = db.prepare(` @@ -570,11 +572,12 @@ function getTrendSessionMetrics( v.canonical_title AS canonicalTitle, a.canonical_title AS animeTitle, COALESCE(asm.activeWatchedMs, s.active_watched_ms, 0) AS activeWatchedMs, - COALESCE(asm.tokensSeen, s.tokens_seen, 0) AS tokensSeen, + ${wordsExpr} AS tokensSeen, COALESCE(asm.cardsMined, s.cards_mined, 0) AS cardsMined, COALESCE(asm.yomitanLookupCount, s.yomitan_lookup_count, 0) AS yomitanLookupCount FROM imm_sessions s LEFT JOIN active_session_metrics asm ON asm.sessionId = s.session_id + LEFT JOIN session_word_counts swc ON swc.sessionId = s.session_id LEFT JOIN imm_videos v ON v.video_id = s.video_id LEFT JOIN imm_anime a ON a.anime_id = v.anime_id ${whereClause} diff --git a/src/core/services/immersion-tracker/storage-session.test.ts b/src/core/services/immersion-tracker/storage-session.test.ts index d07dc7a6..650542b9 100644 --- a/src/core/services/immersion-tracker/storage-session.test.ts +++ b/src/core/services/immersion-tracker/storage-session.test.ts @@ -4,6 +4,7 @@ import os from 'node:os'; import path from 'node:path'; import test from 'node:test'; import { Database } from './sqlite'; +import { getStatsExcludedWords, replaceStatsExcludedWords } from './query-lexical'; import { finalizeSessionRecord, startSessionRecord } from './session'; import { applyPragmas, @@ -113,6 +114,7 @@ test('ensureSchema creates immersion core tables', () => { assert.ok(tableNames.has('imm_rollup_state')); assert.ok(tableNames.has('imm_cover_art_blobs')); assert.ok(tableNames.has('imm_youtube_videos')); + assert.ok(tableNames.has('imm_stats_excluded_words')); const videoColumns = new Set( ( @@ -153,6 +155,32 @@ test('ensureSchema creates immersion core tables', () => { } }); +test('stats excluded words are replaced and read from sqlite storage', () => { + const dbPath = makeDbPath(); + const db = new Database(dbPath); + + try { + ensureSchema(db); + + replaceStatsExcludedWords(db, [ + { 
headword: '猫', word: '猫', reading: 'ねこ' }, + { headword: 'する', word: 'する', reading: 'する' }, + ]); + assert.deepEqual(getStatsExcludedWords(db), [ + { headword: 'する', word: 'する', reading: 'する' }, + { headword: '猫', word: '猫', reading: 'ねこ' }, + ]); + + replaceStatsExcludedWords(db, [{ headword: '犬', word: '犬', reading: 'いぬ' }]); + assert.deepEqual(getStatsExcludedWords(db), [ + { headword: '犬', word: '犬', reading: 'いぬ' }, + ]); + } finally { + db.close(); + cleanupDbPath(dbPath); + } +}); + test('ensureSchema adds youtube metadata table to existing schema version 15 databases', () => { const dbPath = makeDbPath(); const db = new Database(dbPath); diff --git a/src/core/services/immersion-tracker/storage.ts b/src/core/services/immersion-tracker/storage.ts index bb4e3e34..f0daeaee 100644 --- a/src/core/services/immersion-tracker/storage.ts +++ b/src/core/services/immersion-tracker/storage.ts @@ -464,6 +464,19 @@ function ensureLifetimeSummaryTables(db: DatabaseSync): void { `); } +function ensureStatsExcludedWordsTable(db: DatabaseSync): void { + db.exec(` + CREATE TABLE IF NOT EXISTS imm_stats_excluded_words( + headword TEXT NOT NULL, + word TEXT NOT NULL, + reading TEXT NOT NULL, + CREATED_DATE TEXT, + LAST_UPDATE_DATE TEXT, + PRIMARY KEY(headword, word, reading) + ) + `); +} + export function getOrCreateAnimeRecord(db: DatabaseSync, input: AnimeRecordInput): number { const normalizedTitleKey = normalizeAnimeIdentityKey(input.parsedTitle); if (!normalizedTitleKey) { @@ -678,6 +691,7 @@ export function ensureSchema(db: DatabaseSync): void { .get() as { schema_version: number } | null; if (currentVersion?.schema_version === SCHEMA_VERSION) { ensureLifetimeSummaryTables(db); + ensureStatsExcludedWordsTable(db); return; } @@ -1221,6 +1235,7 @@ export function ensureSchema(db: DatabaseSync): void { migrateSessionEventTimestampsToText(db); ensureLifetimeSummaryTables(db); + ensureStatsExcludedWordsTable(db); db.exec(` CREATE INDEX IF NOT EXISTS idx_anime_normalized_title 
diff --git a/src/core/services/immersion-tracker/types.ts b/src/core/services/immersion-tracker/types.ts index f0171244..b0dcf6ce 100644 --- a/src/core/services/immersion-tracker/types.ts +++ b/src/core/services/immersion-tracker/types.ts @@ -1,4 +1,4 @@ -export const SCHEMA_VERSION = 17; +export const SCHEMA_VERSION = 18; export const DEFAULT_QUEUE_CAP = 1_000; export const DEFAULT_BATCH_SIZE = 25; export const DEFAULT_FLUSH_INTERVAL_MS = 500; @@ -301,6 +301,12 @@ export interface VocabularyStatsRow { lastSeen: number; } +export interface StatsExcludedWordRow { + headword: string; + word: string; + reading: string; +} + export interface VocabularyCleanupSummary { scanned: number; kept: number; diff --git a/src/core/services/stats-server.ts b/src/core/services/stats-server.ts index cdaeef01..ad97c242 100644 --- a/src/core/services/stats-server.ts +++ b/src/core/services/stats-server.ts @@ -20,6 +20,12 @@ type StatsServerNoteInfo = { fields: Record; }; +type StatsExcludedWordPayload = { + headword: string; + word: string; + reading: string; +}; + function parseIntQuery(raw: string | undefined, fallback: number, maxLimit?: number): number { if (raw === undefined) return fallback; const n = Number(raw); @@ -49,6 +55,23 @@ function parseEventTypesQuery(raw: string | undefined): number[] | undefined { return parsed.length > 0 ? 
parsed : undefined; } +function parseExcludedWordsBody(body: unknown): StatsExcludedWordPayload[] | null { + if (!body || typeof body !== 'object' || !Array.isArray((body as { words?: unknown }).words)) { + return null; + } + + const words: StatsExcludedWordPayload[] = []; + for (const row of (body as { words: unknown[] }).words) { + if (!row || typeof row !== 'object') return null; + const { headword, word, reading } = row as Record; + if (typeof headword !== 'string' || typeof word !== 'string' || typeof reading !== 'string') { + return null; + } + words.push({ headword, word, reading }); + } + return words; +} + function resolveStatsNoteFieldName( noteInfo: StatsServerNoteInfo, ...preferredNames: (string | undefined)[] @@ -161,6 +184,21 @@ function toKnownWordRate(knownWordsSeen: number, tokensSeen: number): number { return Number(((knownWordsSeen / tokensSeen) * 100).toFixed(1)); } +function summarizeFilteredWordOccurrences( + wordsByLine: Array<{ lineIndex: number; headword: string; occurrenceCount: number }>, + knownWordsSet: Set, +): { knownWordsSeen: number; totalWordsSeen: number } { + let knownWordsSeen = 0; + let totalWordsSeen = 0; + for (const row of wordsByLine) { + totalWordsSeen += row.occurrenceCount; + if (knownWordsSet.has(row.headword)) { + knownWordsSeen += row.occurrenceCount; + } + } + return { knownWordsSeen, totalWordsSeen }; +} + async function enrichSessionsWithKnownWordMetrics( tracker: ImmersionTrackerService, sessions: Array<{ @@ -188,21 +226,21 @@ async function enrichSessionsWithKnownWordMetrics( const enriched = await Promise.all( sessions.map(async (session) => { let knownWordsSeen = 0; + let totalWordsSeen = 0; try { const wordsByLine = await tracker.getSessionWordsByLine(session.sessionId); - for (const row of wordsByLine) { - if (knownWordsSet.has(row.headword)) { - knownWordsSeen += row.occurrenceCount; - } - } + const summary = summarizeFilteredWordOccurrences(wordsByLine, knownWordsSet); + knownWordsSeen = 
summary.knownWordsSeen; + totalWordsSeen = summary.totalWordsSeen; } catch { knownWordsSeen = 0; + totalWordsSeen = 0; } return { ...session, knownWordsSeen, - knownWordRate: toKnownWordRate(knownWordsSeen, session.tokensSeen), + knownWordRate: toKnownWordRate(knownWordsSeen, totalWordsSeen), }; }), ); @@ -391,32 +429,45 @@ export function createStatsApp( const id = parseIntQuery(c.req.param('id'), 0); if (id <= 0) return c.json([], 400); - const knownWordsSet = loadKnownWordsSet(options?.knownWordCachePath); - if (!knownWordsSet) return c.json([]); + const knownWordsSet = loadKnownWordsSet(options?.knownWordCachePath) ?? new Set(); // Get per-line word occurrences for the session. const wordsByLine = await tracker.getSessionWordsByLine(id); - // Build cumulative known-word occurrence count per recorded line index. + // Build cumulative filtered occurrence counts per recorded line index. // The stats UI uses line-count progress to align this series with the session // timeline, so preserve the stored line position rather than compressing gaps. - const lineGroups = new Map(); + const totalLineGroups = new Map(); + const knownLineGroups = new Map(); for (const row of wordsByLine) { - if (!knownWordsSet.has(row.headword)) { - continue; + totalLineGroups.set( + row.lineIndex, + (totalLineGroups.get(row.lineIndex) ?? 0) + row.occurrenceCount, + ); + if (knownWordsSet.has(row.headword)) { + knownLineGroups.set( + row.lineIndex, + (knownLineGroups.get(row.lineIndex) ?? 0) + row.occurrenceCount, + ); } - lineGroups.set(row.lineIndex, (lineGroups.get(row.lineIndex) ?? 
0) + row.occurrenceCount); } - const sortedLineIndices = [...lineGroups.keys()].sort((a, b) => a - b); + const sortedLineIndices = [...totalLineGroups.keys()].sort((a, b) => a - b); let knownWordsSeen = 0; - const knownByLinesSeen: Array<{ linesSeen: number; knownWordsSeen: number }> = []; + let totalWordsSeen = 0; + const knownByLinesSeen: Array<{ + linesSeen: number; + knownWordsSeen: number; + totalWordsSeen: number; + }> = []; for (const lineIdx of sortedLineIndices) { - knownWordsSeen += lineGroups.get(lineIdx)!; + knownWordsSeen += knownLineGroups.get(lineIdx) ?? 0; + totalWordsSeen += totalLineGroups.get(lineIdx)!; knownByLinesSeen.push({ linesSeen: lineIdx, knownWordsSeen, + totalWordsSeen, }); } @@ -430,6 +481,18 @@ export function createStatsApp( return c.json(vocab); }); + app.get('/api/stats/excluded-words', async (c) => { + return c.json(await tracker.getStatsExcludedWords()); + }); + + app.put('/api/stats/excluded-words', async (c) => { + const body = await c.req.json().catch(() => null); + const words = parseExcludedWordsBody(body); + if (!words) return c.body(null, 400); + await tracker.replaceStatsExcludedWords(words); + return c.json({ ok: true }); + }); + app.get('/api/stats/vocabulary/occurrences', async (c) => { const headword = (c.req.query('headword') ?? '').trim(); const word = (c.req.query('word') ?? 
'').trim(); diff --git a/src/core/services/tokenizer/annotation-stage.test.ts b/src/core/services/tokenizer/annotation-stage.test.ts index cb78c244..f5fca15c 100644 --- a/src/core/services/tokenizer/annotation-stage.test.ts +++ b/src/core/services/tokenizer/annotation-stage.test.ts @@ -5,6 +5,7 @@ import { annotateTokens, AnnotationStageDeps, shouldExcludeTokenFromSubtitleAnnotations, + shouldExcludeTokenFromVocabularyPersistence, stripSubtitleAnnotationMetadata, } from './annotation-stage'; @@ -366,6 +367,87 @@ test('shouldExcludeTokenFromSubtitleAnnotations excludes kana-only non-independe assert.equal(shouldExcludeTokenFromSubtitleAnnotations(token), true); }); +test('shouldExcludeTokenFromVocabularyPersistence mirrors subtitle annotation grammar filters', () => { + const tokens = [ + makeToken({ + surface: 'どうしてもって', + headword: 'どうしても', + reading: 'ドウシテモッテ', + partOfSpeech: PartOfSpeech.other, + pos1: '副詞|助詞', + pos2: '一般|格助詞', + }), + makeToken({ + surface: 'そうだ', + headword: 'そう', + reading: 'ソウダ', + partOfSpeech: PartOfSpeech.noun, + pos1: '名詞|助動詞', + pos2: '一般|', + pos3: '助動詞語幹|', + }), + ]; + + for (const token of tokens) { + assert.equal(shouldExcludeTokenFromSubtitleAnnotations(token), true, token.surface); + assert.equal(shouldExcludeTokenFromVocabularyPersistence(token), true, token.surface); + } +}); + +test('shouldExcludeTokenFromVocabularyPersistence excludes common frequency stop terms', () => { + const tokens = [ + makeToken({ + surface: 'じゃない', + headword: 'じゃない', + reading: '', + partOfSpeech: PartOfSpeech.i_adjective, + pos1: '形容詞', + pos2: '*|自立', + pos3: '*', + }), + makeToken({ + surface: 'である', + headword: 'である', + reading: '', + partOfSpeech: PartOfSpeech.verb, + pos1: '動詞', + pos2: '*', + pos3: '*', + }), + makeToken({ + surface: '何か', + headword: '何か', + reading: 'なにか', + partOfSpeech: PartOfSpeech.other, + pos1: '名詞|助詞', + pos2: '代名詞|副助詞/並立助詞/終助詞', + pos3: '一般|*', + }), + makeToken({ + surface: '確かに', + headword: '確かに', + reading: 
'たしかに', + partOfSpeech: PartOfSpeech.other, + pos1: '名詞|助詞', + pos2: '形容動詞語幹|副詞化', + pos3: '*', + }), + makeToken({ + surface: 'あなた', + headword: '貴方', + reading: 'あなた', + partOfSpeech: PartOfSpeech.noun, + pos1: '名詞', + pos2: '代名詞', + pos3: '一般', + }), + ]; + + for (const token of tokens) { + assert.equal(shouldExcludeTokenFromVocabularyPersistence(token), true, token.surface); + } +}); + test('stripSubtitleAnnotationMetadata keeps token hover data while clearing annotation fields', () => { const token = makeToken({ surface: 'は', diff --git a/src/core/services/tokenizer/annotation-stage.ts b/src/core/services/tokenizer/annotation-stage.ts index c57d935b..7e08e923 100644 --- a/src/core/services/tokenizer/annotation-stage.ts +++ b/src/core/services/tokenizer/annotation-stage.ts @@ -328,10 +328,12 @@ export function shouldExcludeTokenFromVocabularyPersistence( token: MergedToken, options: Pick = {}, ): boolean { - return isFrequencyExcludedByPos( - token, - resolvePos1Exclusions(options), - resolvePos2Exclusions(options), + const pos1Exclusions = resolvePos1Exclusions(options); + const pos2Exclusions = resolvePos2Exclusions(options); + + return ( + sharedShouldExcludeTokenFromSubtitleAnnotations(token, { pos1Exclusions, pos2Exclusions }) || + isFrequencyExcludedByPos(token, pos1Exclusions, pos2Exclusions) ); } diff --git a/src/core/services/tokenizer/subtitle-annotation-filter.ts b/src/core/services/tokenizer/subtitle-annotation-filter.ts index 4537a962..c37627e3 100644 --- a/src/core/services/tokenizer/subtitle-annotation-filter.ts +++ b/src/core/services/tokenizer/subtitle-annotation-filter.ts @@ -13,17 +13,40 @@ const KATAKANA_TO_HIRAGANA_OFFSET = 0x60; const KATAKANA_CODEPOINT_START = 0x30a1; const KATAKANA_CODEPOINT_END = 0x30f6; -const SUBTITLE_ANNOTATION_EXCLUDED_TERMS = new Set([ +export const SUBTITLE_ANNOTATION_EXCLUDED_TERMS = new Set([ 'あ', 'ああ', + 'あなた', + 'あんた', 'ええ', 'うう', 'おお', + 'おい', + 'お前', + 'こいつ', + 'こっち', + 'じゃない', + 'そうだ', + 'たち', + 'である', + 
'どこか', + 'なんか', + 'べき', 'はあ', 'はは', 'へえ', 'ふう', 'ほう', + 'やはり', + 'って', + '何か', + '何だ', + '何も', + '如何した', + '様', + '確かに', + '誰も', + '貴方', ]); const SUBTITLE_ANNOTATION_EXCLUDED_EXPLANATORY_ENDING_PREFIXES = ['ん', 'の', 'なん', 'なの']; const SUBTITLE_ANNOTATION_EXCLUDED_EXPLANATORY_ENDING_CORES = [ diff --git a/src/renderer/style.css b/src/renderer/style.css index c6e3a116..80dfca07 100644 --- a/src/renderer/style.css +++ b/src/renderer/style.css @@ -793,105 +793,60 @@ body.settings-modal-open [data-subminer-yomitan-popup-host='true'] { color: var(--subtitle-name-match-color, #f5bde6); } -#subtitleRoot - .word.word-jlpt-n1:not( - :is( - .word-known, - .word-n-plus-one, - .word-name-match, - .word-frequency-single, - .word-frequency-band-1, - .word-frequency-band-2, - .word-frequency-band-3, - .word-frequency-band-4, - .word-frequency-band-5 - ) - ) { - color: var(--subtitle-jlpt-n1-color, #ed8796); +#subtitleRoot .word.word-jlpt-n1 { + text-decoration-line: underline; + text-decoration-color: var(--subtitle-jlpt-n1-color, #ed8796); + text-decoration-thickness: 0.08em; + text-underline-offset: 0.12em; + text-decoration-skip-ink: none; } #subtitleRoot .word.word-jlpt-n1[data-jlpt-level]::after { color: var(--subtitle-jlpt-n1-color, #ed8796); } -#subtitleRoot - .word.word-jlpt-n2:not( - :is( - .word-known, - .word-n-plus-one, - .word-name-match, - .word-frequency-single, - .word-frequency-band-1, - .word-frequency-band-2, - .word-frequency-band-3, - .word-frequency-band-4, - .word-frequency-band-5 - ) - ) { - color: var(--subtitle-jlpt-n2-color, #f5a97f); +#subtitleRoot .word.word-jlpt-n2 { + text-decoration-line: underline; + text-decoration-color: var(--subtitle-jlpt-n2-color, #f5a97f); + text-decoration-thickness: 0.08em; + text-underline-offset: 0.12em; + text-decoration-skip-ink: none; } #subtitleRoot .word.word-jlpt-n2[data-jlpt-level]::after { color: var(--subtitle-jlpt-n2-color, #f5a97f); } -#subtitleRoot - .word.word-jlpt-n3:not( - :is( - .word-known, - 
.word-n-plus-one, - .word-name-match, - .word-frequency-single, - .word-frequency-band-1, - .word-frequency-band-2, - .word-frequency-band-3, - .word-frequency-band-4, - .word-frequency-band-5 - ) - ) { - color: var(--subtitle-jlpt-n3-color, #f9e2af); +#subtitleRoot .word.word-jlpt-n3 { + text-decoration-line: underline; + text-decoration-color: var(--subtitle-jlpt-n3-color, #f9e2af); + text-decoration-thickness: 0.08em; + text-underline-offset: 0.12em; + text-decoration-skip-ink: none; } #subtitleRoot .word.word-jlpt-n3[data-jlpt-level]::after { color: var(--subtitle-jlpt-n3-color, #f9e2af); } -#subtitleRoot - .word.word-jlpt-n4:not( - :is( - .word-known, - .word-n-plus-one, - .word-name-match, - .word-frequency-single, - .word-frequency-band-1, - .word-frequency-band-2, - .word-frequency-band-3, - .word-frequency-band-4, - .word-frequency-band-5 - ) - ) { - color: var(--subtitle-jlpt-n4-color, #a6e3a1); +#subtitleRoot .word.word-jlpt-n4 { + text-decoration-line: underline; + text-decoration-color: var(--subtitle-jlpt-n4-color, #a6e3a1); + text-decoration-thickness: 0.08em; + text-underline-offset: 0.12em; + text-decoration-skip-ink: none; } #subtitleRoot .word.word-jlpt-n4[data-jlpt-level]::after { color: var(--subtitle-jlpt-n4-color, #a6e3a1); } -#subtitleRoot - .word.word-jlpt-n5:not( - :is( - .word-known, - .word-n-plus-one, - .word-name-match, - .word-frequency-single, - .word-frequency-band-1, - .word-frequency-band-2, - .word-frequency-band-3, - .word-frequency-band-4, - .word-frequency-band-5 - ) - ) { - color: var(--subtitle-jlpt-n5-color, #8aadf4); +#subtitleRoot .word.word-jlpt-n5 { + text-decoration-line: underline; + text-decoration-color: var(--subtitle-jlpt-n5-color, #8aadf4); + text-decoration-thickness: 0.08em; + text-underline-offset: 0.12em; + text-decoration-skip-ink: none; } #subtitleRoot .word.word-jlpt-n5[data-jlpt-level]::after { diff --git a/src/renderer/subtitle-render.test.ts b/src/renderer/subtitle-render.test.ts index 
a8df27d7..323b19ba 100644 --- a/src/renderer/subtitle-render.test.ts +++ b/src/renderer/subtitle-render.test.ts @@ -220,20 +220,8 @@ function normalizeCssSelector(selector: string): string { .trim(); } -function buildJlptColorSelector(level: number): string { - const higherPriorityClasses = [ - '.word-known', - '.word-n-plus-one', - '.word-name-match', - '.word-frequency-single', - '.word-frequency-band-1', - '.word-frequency-band-2', - '.word-frequency-band-3', - '.word-frequency-band-4', - '.word-frequency-band-5', - ].join(', '); - - return `#subtitleRoot .word.word-jlpt-n${level}:not(:is(${higherPriorityClasses}))`; +function buildJlptUnderlineSelector(level: number): string { + return `#subtitleRoot .word.word-jlpt-n${level}`; } test('computeWordClass preserves known and n+1 classes while adding JLPT classes', () => { @@ -885,7 +873,7 @@ test('shouldRenderTokenizedSubtitle enables token rendering when tokens exist', assert.equal(shouldRenderTokenizedSubtitle(0), false); }); -test('subtitle annotation CSS changes token color without overriding typography', () => { +test('subtitle annotation CSS underlines JLPT tokens without changing token color', () => { const distCssPath = path.join(process.cwd(), 'dist', 'renderer', 'style.css'); const srcCssPath = path.join(process.cwd(), 'src', 'renderer', 'style.css'); @@ -899,12 +887,15 @@ test('subtitle annotation CSS changes token color without overriding typography' const cssText = fs.readFileSync(cssPath, 'utf-8'); for (let level = 1; level <= 5; level += 1) { - const plainJlptBlock = extractClassBlock(cssText, `#subtitleRoot .word.word-jlpt-n${level}`); - assert.doesNotMatch(plainJlptBlock, /(?:^|\n)\s*color\s*:/m); - - const block = extractClassBlock(cssText, buildJlptColorSelector(level)); + const block = extractClassBlock(cssText, buildJlptUnderlineSelector(level)); assert.ok(block.length > 0, `word-jlpt-n${level} class should exist`); - assert.match(block, new 
RegExp(`color:\\s*var\\(--subtitle-jlpt-n${level}-color,`)); + assert.doesNotMatch(block, /(?:^|\n)\s*color\s*:/m); + assert.doesNotMatch(block, /-webkit-text-fill-color\s*:/); + assert.match(block, /text-decoration-line:\s*underline;/); + assert.match( + block, + new RegExp(`text-decoration-color:\\s*var\\(--subtitle-jlpt-n${level}-color,`), + ); assert.doesNotMatch(block, /border-bottom\s*:/); assert.doesNotMatch(block, /padding-bottom\s*:/); assert.doesNotMatch(block, /box-decoration-break\s*:/); diff --git a/stats/src/components/sessions/SessionDetail.tsx b/stats/src/components/sessions/SessionDetail.tsx index 2c8d776e..52ee8006 100644 --- a/stats/src/components/sessions/SessionDetail.tsx +++ b/stats/src/components/sessions/SessionDetail.tsx @@ -53,18 +53,31 @@ function formatTime(ms: number): string { }); } -/** Build a lookup: linesSeen → knownWordsSeen */ -function buildKnownWordsLookup(knownWordsTimeline: KnownWordsTimelinePoint[]): Map { - const map = new Map(); +type KnownWordsLineCounts = { + knownWordsSeen: number; + totalWordsSeen: number; +}; + +/** Build a lookup: linesSeen -> filtered known/total word counts */ +function buildKnownWordsLookup( + knownWordsTimeline: KnownWordsTimelinePoint[], +): Map { + const map = new Map(); for (const pt of knownWordsTimeline) { - map.set(pt.linesSeen, pt.knownWordsSeen); + map.set(pt.linesSeen, { + knownWordsSeen: pt.knownWordsSeen, + totalWordsSeen: pt.totalWordsSeen, + }); } return map; } -/** For a given linesSeen value, find the closest known words count (floor lookup). */ -function lookupKnownWords(map: Map, linesSeen: number): number { - if (map.size === 0) return 0; +/** For a given linesSeen value, find the closest filtered word counts (floor lookup). 
*/ +function lookupKnownWordCounts( + map: Map, + linesSeen: number, +): KnownWordsLineCounts { + if (map.size === 0) return { knownWordsSeen: 0, totalWordsSeen: 0 }; if (map.has(linesSeen)) return map.get(linesSeen)!; let best = 0; for (const k of map.keys()) { @@ -72,7 +85,7 @@ function lookupKnownWords(map: Map, linesSeen: number): number { best = k; } } - return best > 0 ? map.get(best)! : 0; + return best > 0 ? map.get(best)! : { knownWordsSeen: 0, totalWordsSeen: 0 }; } interface RatioChartPoint { @@ -93,6 +106,32 @@ type TimelineEntry = { tokensSeen: number; }; +export function buildKnownWordsRatioChartData( + sorted: TimelineEntry[], + knownWordsMap: Map, +): RatioChartPoint[] { + const chartData: RatioChartPoint[] = []; + for (const t of sorted) { + const counts = lookupKnownWordCounts(knownWordsMap, t.linesSeen); + const totalWords = counts.totalWordsSeen; + if (totalWords === 0) continue; + const knownWords = Math.min(counts.knownWordsSeen, totalWords); + const unknownWords = totalWords - knownWords; + chartData.push({ + tsMs: t.sampleMs, + knownWords, + unknownWords, + totalWords, + }); + } + return chartData; +} + +export function getKnownPctAxisMax(values: number[]): number { + const max = Math.max(0, ...values.filter((value) => Number.isFinite(value))); + return Math.min(100, Math.ceil((max + 5) / 10) * 10); +} + function SessionChartOffsetProbe({ offset, onPlotAreaChange, @@ -291,7 +330,7 @@ function RatioView({ session, }: { sorted: TimelineEntry[]; - knownWordsMap: Map; + knownWordsMap: Map; cardEvents: SessionEvent[]; yomitanLookupEvents: SessionEvent[]; pauseRegions: Array<{ startMs: number; endMs: number }>; @@ -309,19 +348,7 @@ function RatioView({ session: SessionSummary; }) { const [plotArea, setPlotArea] = useState(null); - const chartData: RatioChartPoint[] = []; - for (const t of sorted) { - const totalWords = getSessionDisplayWordCount(t); - if (totalWords === 0) continue; - const knownWords = Math.min(lookupKnownWords(knownWordsMap, 
t.linesSeen), totalWords); - const unknownWords = totalWords - knownWords; - chartData.push({ - tsMs: t.sampleMs, - knownWords, - unknownWords, - totalWords, - }); - } + const chartData = buildKnownWordsRatioChartData(sorted, knownWordsMap); if (chartData.length === 0) { return
No word data for this session.
; diff --git a/stats/src/hooks/useExcludedWords.test.ts b/stats/src/hooks/useExcludedWords.test.ts new file mode 100644 index 00000000..83d81a12 --- /dev/null +++ b/stats/src/hooks/useExcludedWords.test.ts @@ -0,0 +1,87 @@ +import assert from 'node:assert/strict'; +import test from 'node:test'; +import { + getExcludedWordsSnapshot, + initializeExcludedWordsStore, + resetExcludedWordsStoreForTests, + setExcludedWords, +} from './useExcludedWords'; +import { BASE_URL } from '../lib/api-client'; + +const STORAGE_KEY = 'subminer-excluded-words'; + +function installLocalStorage(initial: Record = {}) { + const values = new Map(Object.entries(initial)); + Object.defineProperty(globalThis, 'localStorage', { + configurable: true, + value: { + getItem: (key: string) => values.get(key) ?? null, + setItem: (key: string, value: string) => values.set(key, value), + removeItem: (key: string) => values.delete(key), + }, + }); + return values; +} + +test('initializeExcludedWordsStore seeds empty database exclusions from localStorage', async () => { + resetExcludedWordsStoreForTests(); + const localRows = [{ headword: '猫', word: '猫', reading: 'ねこ' }]; + const storage = installLocalStorage({ [STORAGE_KEY]: JSON.stringify(localRows) }); + const originalFetch = globalThis.fetch; + const requests: Array<{ url: string; method: string; body: string }> = []; + globalThis.fetch = (async (input: RequestInfo | URL, init?: RequestInit) => { + requests.push({ + url: String(input), + method: init?.method ?? 'GET', + body: String(init?.body ?? 
''), + }); + if (!init?.method) { + return new Response(JSON.stringify([]), { + status: 200, + headers: { 'Content-Type': 'application/json' }, + }); + } + return new Response(JSON.stringify({ ok: true }), { status: 200 }); + }) as typeof globalThis.fetch; + + try { + await initializeExcludedWordsStore(); + + assert.deepEqual(getExcludedWordsSnapshot(), localRows); + assert.deepEqual(requests, [ + { url: `${BASE_URL}/api/stats/excluded-words`, method: 'GET', body: '' }, + { + url: `${BASE_URL}/api/stats/excluded-words`, + method: 'PUT', + body: JSON.stringify({ words: localRows }), + }, + ]); + assert.equal(storage.get(STORAGE_KEY), JSON.stringify(localRows)); + } finally { + globalThis.fetch = originalFetch; + resetExcludedWordsStoreForTests(); + } +}); + +test('setExcludedWords updates the database-backed exclusion list', async () => { + resetExcludedWordsStoreForTests(); + const storage = installLocalStorage(); + const originalFetch = globalThis.fetch; + let seenBody = ''; + globalThis.fetch = (async (_input: RequestInfo | URL, init?: RequestInit) => { + seenBody = String(init?.body ?? 
''); + return new Response(JSON.stringify({ ok: true }), { status: 200 }); + }) as typeof globalThis.fetch; + + try { + const rows = [{ headword: 'する', word: 'する', reading: 'する' }]; + await setExcludedWords(rows); + + assert.deepEqual(getExcludedWordsSnapshot(), rows); + assert.equal(seenBody, JSON.stringify({ words: rows })); + assert.equal(storage.get(STORAGE_KEY), JSON.stringify(rows)); + } finally { + globalThis.fetch = originalFetch; + resetExcludedWordsStoreForTests(); + } +}); diff --git a/stats/src/hooks/useExcludedWords.ts b/stats/src/hooks/useExcludedWords.ts index cee1a8c5..97ffb7b6 100644 --- a/stats/src/hooks/useExcludedWords.ts +++ b/stats/src/hooks/useExcludedWords.ts @@ -1,10 +1,8 @@ -import { useCallback, useSyncExternalStore } from 'react'; +import { useCallback, useEffect, useSyncExternalStore } from 'react'; +import { apiClient } from '../lib/api-client'; +import type { StatsExcludedWord } from '../types/stats'; -export interface ExcludedWord { - headword: string; - word: string; - reading: string; -} +export type ExcludedWord = StatsExcludedWord; const STORAGE_KEY = 'subminer-excluded-words'; @@ -14,16 +12,37 @@ function toKey(w: ExcludedWord): string { let cached: ExcludedWord[] | null = null; let cachedKeys: Set | null = null; +let initialized: Promise | null = null; +let revision = 0; const listeners = new Set<() => void>(); +function readLocalStorage(): ExcludedWord[] { + if (typeof localStorage === 'undefined') return []; + try { + const raw = localStorage.getItem(STORAGE_KEY); + const parsed: unknown = raw ? 
JSON.parse(raw) : []; + if (!Array.isArray(parsed)) return []; + return parsed.filter( + (row): row is ExcludedWord => + row !== null && + typeof row === 'object' && + typeof (row as ExcludedWord).headword === 'string' && + typeof (row as ExcludedWord).word === 'string' && + typeof (row as ExcludedWord).reading === 'string', + ); + } catch { + return []; + } +} + +function writeLocalStorage(words: ExcludedWord[]): void { + if (typeof localStorage === 'undefined') return; + localStorage.setItem(STORAGE_KEY, JSON.stringify(words)); +} + function load(): ExcludedWord[] { if (cached) return cached; - try { - const raw = localStorage.getItem(STORAGE_KEY); - cached = raw ? JSON.parse(raw) : []; - } catch { - cached = []; - } + cached = readLocalStorage(); return cached!; } @@ -33,24 +52,73 @@ function getKeySet(): Set { return cachedKeys; } -function persist(words: ExcludedWord[]) { +function applyWords(words: ExcludedWord[]): void { cached = words; cachedKeys = new Set(words.map(toKey)); - localStorage.setItem(STORAGE_KEY, JSON.stringify(words)); + writeLocalStorage(words); for (const fn of listeners) fn(); } -function getSnapshot(): ExcludedWord[] { +export function getExcludedWordsSnapshot(): ExcludedWord[] { return load(); } +export async function setExcludedWords(words: ExcludedWord[]): Promise { + revision += 1; + applyWords(words); + try { + await apiClient.setExcludedWords(words); + } catch (error) { + console.error('Failed to persist excluded words to stats database', error); + } +} + +export function initializeExcludedWordsStore(): Promise { + if (initialized) return initialized; + const startRevision = revision; + initialized = (async () => { + const localWords = load(); + let dbWords: ExcludedWord[]; + try { + dbWords = await apiClient.getExcludedWords(); + } catch (error) { + console.error('Failed to load excluded words from stats database', error); + return; + } + + if (revision !== startRevision) return; + if (dbWords.length > 0) { + applyWords(dbWords); + 
return; + } + if (localWords.length > 0) { + await setExcludedWords(localWords); + return; + } + applyWords([]); + })(); + return initialized; +} + +export function resetExcludedWordsStoreForTests(): void { + cached = null; + cachedKeys = null; + initialized = null; + revision = 0; + listeners.clear(); +} + function subscribe(fn: () => void): () => void { listeners.add(fn); return () => listeners.delete(fn); } export function useExcludedWords() { - const excluded = useSyncExternalStore(subscribe, getSnapshot); + const excluded = useSyncExternalStore(subscribe, getExcludedWordsSnapshot); + + useEffect(() => { + void initializeExcludedWordsStore(); + }, []); const isExcluded = useCallback( (w: { headword: string; word: string; reading: string }) => getKeySet().has(toKey(w)), @@ -61,17 +129,19 @@ export function useExcludedWords() { const key = toKey(w); const current = load(); if (getKeySet().has(key)) { - persist(current.filter((e) => toKey(e) !== key)); + void setExcludedWords(current.filter((e) => toKey(e) !== key)); } else { - persist([...current, w]); + void setExcludedWords([...current, w]); } }, []); const removeExclusion = useCallback((w: ExcludedWord) => { - persist(load().filter((e) => toKey(e) !== toKey(w))); + void setExcludedWords(load().filter((e) => toKey(e) !== toKey(w))); }, []); - const clearAll = useCallback(() => persist([]), []); + const clearAll = useCallback(() => { + void setExcludedWords([]); + }, []); return { excluded, isExcluded, toggleExclusion, removeExclusion, clearAll }; } diff --git a/stats/src/hooks/useSessions.ts b/stats/src/hooks/useSessions.ts index 4e72be0d..1c838cdb 100644 --- a/stats/src/hooks/useSessions.ts +++ b/stats/src/hooks/useSessions.ts @@ -42,6 +42,7 @@ export function useSessions(limit = 50) { export interface KnownWordsTimelinePoint { linesSeen: number; knownWordsSeen: number; + totalWordsSeen: number; } export function useSessionDetail(sessionId: number | null) { diff --git a/stats/src/lib/api-client.test.ts 
b/stats/src/lib/api-client.test.ts index 88f9989d..b1973b0c 100644 --- a/stats/src/lib/api-client.test.ts +++ b/stats/src/lib/api-client.test.ts @@ -172,6 +172,50 @@ test('getSessionEvents can request only specific event types', async () => { } }); +test('getExcludedWords requests database-backed exclusions', async () => { + const originalFetch = globalThis.fetch; + let seenUrl = ''; + globalThis.fetch = (async (input: RequestInfo | URL) => { + seenUrl = String(input); + return new Response(JSON.stringify([{ headword: '猫', word: '猫', reading: 'ねこ' }]), { + status: 200, + headers: { 'Content-Type': 'application/json' }, + }); + }) as typeof globalThis.fetch; + + try { + const words = await apiClient.getExcludedWords(); + assert.equal(seenUrl, `${BASE_URL}/api/stats/excluded-words`); + assert.deepEqual(words, [{ headword: '猫', word: '猫', reading: 'ねこ' }]); + } finally { + globalThis.fetch = originalFetch; + } +}); + +test('setExcludedWords replaces database-backed exclusions', async () => { + const originalFetch = globalThis.fetch; + let seenUrl = ''; + let seenMethod = ''; + let seenBody = ''; + globalThis.fetch = (async (input: RequestInfo | URL, init?: RequestInit) => { + seenUrl = String(input); + seenMethod = init?.method ?? 'GET'; + seenBody = String(init?.body ?? 
'');
+    return new Response(null, { status: 200 });
+  }) as typeof globalThis.fetch;
+
+  try {
+    await apiClient.setExcludedWords([{ headword: '猫', word: '猫', reading: 'ねこ' }]);
+    assert.equal(seenUrl, `${BASE_URL}/api/stats/excluded-words`);
+    assert.equal(seenMethod, 'PUT');
+    assert.deepEqual(JSON.parse(seenBody), {
+      words: [{ headword: '猫', word: '猫', reading: 'ねこ' }],
+    });
+  } finally {
+    globalThis.fetch = originalFetch;
+  }
+});
+
 test('getSessionTimeline requests full session history when limit is omitted', async () => {
   const originalFetch = globalThis.fetch;
   let seenUrl = '';
diff --git a/stats/src/lib/api-client.ts b/stats/src/lib/api-client.ts
index 083d05f4..320fd66d 100644
--- a/stats/src/lib/api-client.ts
+++ b/stats/src/lib/api-client.ts
@@ -22,6 +22,7 @@ import type {
   KanjiDetailData,
   EpisodeDetailData,
   StatsAnkiNoteInfo,
+  StatsExcludedWord,
 } from '../types/stats';
 
 type StatsLocationLike = Pick;
@@ -85,11 +86,19 @@ export const apiClient = {
     return fetchJson(`/api/stats/sessions/${id}/events?${params.toString()}`);
   },
   getSessionKnownWordsTimeline: (id: number) =>
-    fetchJson>(
+    fetchJson>(
       `/api/stats/sessions/${id}/known-words-timeline`,
     ),
   getVocabulary: (limit = 100) => fetchJson(`/api/stats/vocabulary?limit=${limit}`),
+  getExcludedWords: () => fetchJson<StatsExcludedWord[]>('/api/stats/excluded-words'),
+  setExcludedWords: async (words: StatsExcludedWord[]): Promise<void> => {
+    await fetchResponse('/api/stats/excluded-words', {
+      method: 'PUT',
+      headers: { 'Content-Type': 'application/json' },
+      body: JSON.stringify({ words }),
+    });
+  },
   getWordOccurrences: (headword: string, word: string, reading: string, limit = 50, offset = 0) =>
     fetchJson(
       `/api/stats/vocabulary/occurrences?headword=${encodeURIComponent(headword)}&word=${encodeURIComponent(word)}&reading=${encodeURIComponent(reading)}&limit=${limit}&offset=${offset}`,
diff --git a/stats/src/lib/session-detail.test.tsx b/stats/src/lib/session-detail.test.tsx
index c8e3be80..7d4591d4 100644
--- 
a/stats/src/lib/session-detail.test.tsx +++ b/stats/src/lib/session-detail.test.tsx @@ -1,7 +1,11 @@ import assert from 'node:assert/strict'; import test from 'node:test'; import { renderToStaticMarkup } from 'react-dom/server'; -import { SessionDetail, getKnownPctAxisMax } from '../components/sessions/SessionDetail'; +import { + SessionDetail, + buildKnownWordsRatioChartData, + getKnownPctAxisMax, +} from '../components/sessions/SessionDetail'; import { buildSessionChartEvents } from './session-events'; import { EventType } from '../types/stats'; @@ -69,3 +73,21 @@ test('getKnownPctAxisMax adds headroom above the highest known percentage', () = test('getKnownPctAxisMax caps the chart top at 100%', () => { assert.equal(getKnownPctAxisMax([97.1, 98.6]), 100); }); + +test('buildKnownWordsRatioChartData uses filtered known-word timeline totals', () => { + const chartData = buildKnownWordsRatioChartData( + [ + { sampleMs: 1_000, linesSeen: 1, tokensSeen: 10 }, + { sampleMs: 2_000, linesSeen: 2, tokensSeen: 20 }, + ], + new Map([ + [1, { knownWordsSeen: 2, totalWordsSeen: 3 }], + [2, { knownWordsSeen: 3, totalWordsSeen: 4 }], + ]), + ); + + assert.deepEqual(chartData, [ + { tsMs: 1_000, knownWords: 2, unknownWords: 1, totalWords: 3 }, + { tsMs: 2_000, knownWords: 3, unknownWords: 1, totalWords: 4 }, + ]); +}); diff --git a/stats/src/types/stats.ts b/stats/src/types/stats.ts index 81861fee..2686a052 100644 --- a/stats/src/types/stats.ts +++ b/stats/src/types/stats.ts @@ -76,6 +76,12 @@ export interface VocabularyEntry { lastSeen: number; } +export interface StatsExcludedWord { + headword: string; + word: string; + reading: string; +} + export interface KanjiEntry { kanjiId: number; kanji: string;