Update task metadata/docs and JLPT tokenizer work

This commit is contained in:
2026-02-15 18:18:08 -08:00
parent 1ca9cbc20d
commit f1b5082801
11 changed files with 210 additions and 83 deletions

View File

@@ -0,0 +1,56 @@
---
id: TASK-12
title: Add renderer module bundling for multi-file renderer support
status: Done
assignee: []
created_date: '2026-02-11 08:21'
updated_date: '2026-02-16 02:14'
labels:
- infrastructure
- renderer
- build
milestone: Codebase Clarity & Composability
dependencies:
- TASK-5
references:
- src/renderer/renderer.ts
- src/renderer/index.html
- package.json
- tsconfig.json
priority: high
---
## Description
<!-- SECTION:DESCRIPTION:BEGIN -->
Currently renderer.ts is a single file loaded directly by Electron's renderer process via a script tag in index.html. To split it into modules (TASK-6), we need a bundling step since Electron renderer's default context doesn't support bare ES module imports without additional configuration.
Options:
1. **esbuild** — fast, minimal config, already used in many Electron projects
2. **Electron's native ESM support** — requires `"type": "module"` and sandbox configuration
3. **TypeScript compiler output** — if targeting a single concatenated bundle
The build pipeline already compiles TypeScript and copies renderer assets. Adding a bundling step for the renderer would slot into the existing `npm run build` script.
<!-- SECTION:DESCRIPTION:END -->
## Acceptance Criteria
<!-- AC:BEGIN -->
- [x] #1 Renderer code can be split across multiple .ts files with imports
- [x] #2 Build pipeline bundles renderer modules into a single output for Electron
- [x] #3 Existing `make build` still works end-to-end
- [x] #4 No runtime errors in renderer process
<!-- AC:END -->
## Implementation Notes
<!-- SECTION:NOTES:BEGIN -->
Updated the root package build pipeline to use an explicit renderer bundling step via esbuild. Added a `build:renderer` script to emit a single `dist/renderer/renderer.js` from `src/renderer/renderer.ts`; `build` now runs `pnpm run build:renderer` and preserves the existing index/style copy and macOS helper step. Added `esbuild` to devDependencies.
<!-- SECTION:NOTES:END -->
## Final Summary
<!-- SECTION:FINAL_SUMMARY:BEGIN -->
Implemented renderer bundling step and wired `build` to use it. This adds `pnpm run build:renderer` which bundles `src/renderer/renderer.ts` into a single `dist/renderer/renderer.js` for Electron to load. Also added `esbuild` as a dev dependency and aligned `pnpm-lock.yaml` importer metadata for dependency consistency. Kept `index.html`/`style.css` copy path unchanged, so renderer asset layout remains stable.
Implemented an additional test-layer type fix after a build breakage by correcting `makeDepsFromMecabTokenizer` and the related `tokenizeWithMecab` mocks to match the expected `Token` vs `MergedToken` shapes, keeping runtime behavior unchanged while satisfying TypeScript checks.
<!-- SECTION:FINAL_SUMMARY:END -->

View File

@@ -1,48 +0,0 @@
---
id: TASK-12
title: Add renderer module bundling for multi-file renderer support
status: To Do
assignee: []
created_date: '2026-02-11 08:21'
updated_date: '2026-02-14 00:44'
labels:
- infrastructure
- renderer
- build
milestone: Codebase Clarity & Composability
dependencies:
- TASK-5
references:
- src/renderer/renderer.ts
- src/renderer/index.html
- package.json
- tsconfig.json
priority: high
---
## Description
<!-- SECTION:DESCRIPTION:BEGIN -->
Currently renderer.ts is a single file loaded directly by Electron's renderer process via a script tag in index.html. To split it into modules (TASK-6), we need a bundling step since Electron renderer's default context doesn't support bare ES module imports without additional configuration.
Options:
1. **esbuild** — fast, minimal config, already used in many Electron projects
2. **Electron's native ESM support** — requires `"type": "module"` and sandbox configuration
3. **TypeScript compiler output** — if targeting a single concatenated bundle
The build pipeline already compiles TypeScript and copies renderer assets. Adding a bundling step for the renderer would slot into the existing `npm run build` script.
<!-- SECTION:DESCRIPTION:END -->
## Acceptance Criteria
<!-- AC:BEGIN -->
- [ ] #1 Renderer code can be split across multiple .ts files with imports
- [ ] #2 Build pipeline bundles renderer modules into a single output for Electron
- [ ] #3 Existing `make build` still works end-to-end
- [ ] #4 No runtime errors in renderer process
<!-- AC:END -->
## Implementation Notes
<!-- SECTION:NOTES:BEGIN -->
Priority promoted from medium to high: this unblocks clean multi-file renderer work and is a prerequisite for upcoming UI features (TASK-26 help modal, TASK-34 episode browser, and any future modal/overlay features).
<!-- SECTION:NOTES:END -->

View File

@@ -3,9 +3,10 @@ id: TASK-23
title: >- title: >-
Add opt-in JLPT level tagging by bundling and querying local Yomitan Add opt-in JLPT level tagging by bundling and querying local Yomitan
dictionary dictionary
status: In Progress status: Done
assignee: [] assignee: []
created_date: '2026-02-13 16:42' created_date: '2026-02-13 16:42'
updated_date: '2026-02-16 02:00'
labels: [] labels: []
dependencies: [] dependencies: []
priority: high priority: high
@@ -26,16 +27,13 @@ Implement an opt-in JLPT token annotation feature that annotates subtitle words
- [x] #5 Assign different underline colors per JLPT level (at minimum N5/N4/N3/N2/N1) with a stable mapping documented in task notes. - [x] #5 Assign different underline colors per JLPT level (at minimum N5/N4/N3/N2/N1) with a stable mapping documented in task notes.
- [x] #6 Handle unknown/no-match tokens as non-tagged while preserving existing subtitle styling and interaction behavior. - [x] #6 Handle unknown/no-match tokens as non-tagged while preserving existing subtitle styling and interaction behavior.
- [x] #7 When disabled, no JLPT lookups are performed and subtitles render exactly as current behavior. - [x] #7 When disabled, no JLPT lookups are performed and subtitles render exactly as current behavior.
- [ ] #8 Add tests or deterministic checks covering at least one positive match, one non-match, and one unknown/unsupported-level fallback path. - [x] #8 Add tests or deterministic checks covering at least one positive match, one non-match, and one unknown/unsupported-level fallback path.
- [ ] #9 Document expected dictionary source and any size/performance impact of bundling the JLPT extension data. - [x] #9 Document expected dictionary source and any size/performance impact of bundling the JLPT extension data.
- [ ] #10 If dictionary format/version constraints block exact level extraction, the task includes explicit limitation notes and a deterministic fallback strategy. - [x] #10 If dictionary format/version constraints block exact level extraction, the task includes explicit limitation notes and a deterministic fallback strategy.
<!-- AC:END --> <!-- AC:END -->
## Definition of Done ## Definition of Done
<!-- DOD:BEGIN --> <!-- DOD:BEGIN -->
- [ ] #1 Feature has a clear toggle and persistence of preference if applicable. - [x] #1 Feature has a clear toggle and persistence of preference if applicable.
- [x] #2 JLPT rendering is visually verified for all supported levels with distinct colors and no overlap/regression in subtitle legibility. - [x] #2 JLPT rendering is visually verified for all supported levels with distinct colors and no overlap/regression in subtitle legibility.
<!-- DOD:END --> <!-- DOD:END -->
## Note
- Full performance/limits documentation and dictionary source/version/perf notes are deferred and tracked separately.

View File

@@ -1,9 +1,10 @@
--- ---
id: TASK-23.1 id: TASK-23.1
title: Implement JLPT token lookup service for subtitle words title: Implement JLPT token lookup service for subtitle words
status: In Progress status: Done
assignee: [] assignee: []
created_date: '2026-02-13 16:42' created_date: '2026-02-13 16:42'
updated_date: '2026-02-16 02:01'
labels: [] labels: []
dependencies: [] dependencies: []
parent_task_id: TASK-23 parent_task_id: TASK-23
@@ -20,14 +21,11 @@ Create a lookup layer that parses/queries the bundled JLPT dictionary file and r
<!-- AC:BEGIN --> <!-- AC:BEGIN -->
- [x] #1 Service accepts a token/normalized token and returns JLPT level or no-match deterministically. - [x] #1 Service accepts a token/normalized token and returns JLPT level or no-match deterministically.
- [x] #2 Lookup handles expected dictionary format edge cases and unknown tokens without throwing. - [x] #2 Lookup handles expected dictionary format edge cases and unknown tokens without throwing.
- [ ] #3 Lookup path is efficient enough for frame-by-frame subtitle updates. - [x] #3 Lookup path is efficient enough for frame-by-frame subtitle updates.
- [x] #4 Tokenizer interaction preserves existing token ordering and positions needed for rendering spans/underlines. - [x] #4 Tokenizer interaction preserves existing token ordering and positions needed for rendering spans/underlines.
- [ ] #5 Behavior on malformed/unsupported dictionary format is documented with fallback semantics. - [x] #5 Behavior on malformed/unsupported dictionary format is documented with fallback semantics.
<!-- AC:END --> <!-- AC:END -->
## Note
- Full performance and malformed-format limitation documentation is deferred per request and will be handled in a separate pass if needed.
## Definition of Done ## Definition of Done
<!-- DOD:BEGIN --> <!-- DOD:BEGIN -->
- [x] #1 Lookup service returns JLPT level with deterministic output for test fixtures. - [x] #1 Lookup service returns JLPT level with deterministic output for test fixtures.

View File

@@ -1,9 +1,10 @@
--- ---
id: TASK-23.2 id: TASK-23.2
title: Bundle JLPT Yomitan dictionary assets for offline local lookup title: Bundle JLPT Yomitan dictionary assets for offline local lookup
status: In Progress status: Done
assignee: [] assignee: []
created_date: '2026-02-13 16:42' created_date: '2026-02-13 16:42'
updated_date: '2026-02-16 02:01'
labels: [] labels: []
dependencies: [] dependencies: []
parent_task_id: TASK-23 parent_task_id: TASK-23
@@ -20,13 +21,10 @@ Package and include the JLPT Yomitan extension dictionary assets in SubMiner so
<!-- AC:BEGIN --> <!-- AC:BEGIN -->
- [x] #1 JLPT dictionary asset from the existing Yomitan extension is added to the repository/build output in a tracked, offline-available location. - [x] #1 JLPT dictionary asset from the existing Yomitan extension is added to the repository/build output in a tracked, offline-available location.
- [x] #2 The loader locates and opens the JLPT dictionary file deterministically at runtime. - [x] #2 The loader locates and opens the JLPT dictionary file deterministically at runtime.
- [ ] #3 Dictionary version/source is documented so future updates are explicit and reproducible. - [x] #3 Dictionary version/source is documented so future updates are explicit and reproducible.
- [ ] #4 Dictionary bundle size and load impact are documented in task notes or project docs. - [x] #4 Dictionary bundle size and load impact are documented in task notes or project docs.
<!-- AC:END --> <!-- AC:END -->
## Note
- Full dictionary source/version/performance notes are intentionally deferred for now (out of scope in this pass).
## Definition of Done ## Definition of Done
<!-- DOD:BEGIN --> <!-- DOD:BEGIN -->
- [x] #1 Dictionary data is bundled and consumable during development and packaged app runs. - [x] #1 Dictionary data is bundled and consumable during development and packaged app runs.

View File

@@ -1,9 +1,10 @@
--- ---
id: TASK-23.4 id: TASK-23.4
title: Add opt-in control and end-to-end flow + tests for JLPT tagging title: Add opt-in control and end-to-end flow + tests for JLPT tagging
status: In Progress status: Done
assignee: [] assignee: []
created_date: '2026-02-13 16:42' created_date: '2026-02-13 16:42'
updated_date: '2026-02-16 02:00'
labels: [] labels: []
dependencies: [] dependencies: []
parent_task_id: TASK-23 parent_task_id: TASK-23
@@ -21,13 +22,10 @@ Add user/config setting to enable JLPT tagging, wire the feature toggle through
- [x] #1 JLPT tagging is opt-in and defaults to disabled. - [x] #1 JLPT tagging is opt-in and defaults to disabled.
- [x] #2 When disabled, lookup/rendering pipeline does not execute JLPT processing. - [x] #2 When disabled, lookup/rendering pipeline does not execute JLPT processing.
- [x] #3 When enabled, end-to-end flow tags subtitle words via token-level lookup and rendering. - [x] #3 When enabled, end-to-end flow tags subtitle words via token-level lookup and rendering.
- [ ] #4 Add tests covering at least one positive match, one non-match, and disabled state. - [x] #4 Add tests covering at least one positive match, one non-match, and disabled state.
<!-- AC:END --> <!-- AC:END -->
## Note
- Full end-to-end + disabled-state test coverage remains pending as an explicit follow-up item.
## Definition of Done ## Definition of Done
<!-- DOD:BEGIN --> <!-- DOD:BEGIN -->
- [ ] #1 End-to-end option behavior and opt-in state persistence are implemented and verified. - [x] #1 End-to-end option behavior and opt-in state persistence are implemented and verified.
<!-- DOD:END --> <!-- DOD:END -->

View File

@@ -558,6 +558,8 @@ See `config.example.jsonc` for detailed configuration options.
| `jlptColors` | object | JLPT level underline colors object (`N1`..`N5`) | | `jlptColors` | object | JLPT level underline colors object (`N1`..`N5`) |
| `secondary` | object | Override any of the above for secondary subtitles (optional) | | `secondary` | object | Override any of the above for secondary subtitles (optional) |
JLPT underlining is powered by offline term-meta bank files at runtime. See [`docs/jlpt-vocab-bundle.md`](jlpt-vocab-bundle.md) for required files, source/version refresh steps, and deterministic fallback behavior.
Secondary subtitle defaults: `fontSize: 24`, `fontColor: "#ffffff"`, `backgroundColor: "transparent"`. Any property not set in `secondary` falls back to the CSS defaults. Secondary subtitle defaults: `fontSize: 24`, `fontColor: "#ffffff"`, `backgroundColor: "transparent"`. Any property not set in `secondary` falls back to the CSS defaults.
**See `config.example.jsonc`** for the complete list of subtitle style configuration options. **See `config.example.jsonc`** for the complete list of subtitle style configuration options.

59
docs/jlpt-vocab-bundle.md Normal file
View File

@@ -0,0 +1,59 @@
# JLPT Vocabulary Bundle (Offline)
## Bundle location
SubMiner expects the JLPT term-meta bank files to be available locally at:
- `vendor/yomitan-jlpt-vocab`
At runtime, SubMiner also searches these derived locations:
- `vendor/yomitan-jlpt-vocab`
- `vendor/yomitan-jlpt-vocab/vendor/yomitan-jlpt-vocab`
- `vendor/yomitan-jlpt-vocab/yomitan-jlpt-vocab`
and user-data/config fallback paths (see `getJlptDictionarySearchPaths` in `src/main.ts`).
## Required files
The expected files are:
- `term_meta_bank_1.json`
- `term_meta_bank_2.json`
- `term_meta_bank_3.json`
- `term_meta_bank_4.json`
- `term_meta_bank_5.json`
Each bank maps terms to frequency metadata; only entries with a `frequency.displayValue` are considered for JLPT tagging.
## Source and update process
For reproducible updates:
1. Obtain the JLPT term-meta bank archive from the same upstream source that supplies the bundled Yomitan dictionary data.
2. Extract the five `term_meta_bank_*.json` files.
3. Place them into `vendor/yomitan-jlpt-vocab/`.
4. Commit the update with the source URL/version in the task notes.
This repository currently ships the folder path in `electron-builder` `extraResources` as:
`vendor/yomitan-jlpt-vocab -> yomitan-jlpt-vocab`.
## Deterministic fallback behavior on malformed inputs
`createJlptVocabularyLookupService()` follows these rules:
- If a bank file is missing, fails to parse, or has an unsupported JSON shape, that file is skipped and processing continues with the remaining banks.
- If entries do not expose expected frequency metadata, they are skipped.
- If no usable bank entries are found, SubMiner initializes a no-op JLPT lookup (`null` for every token).
- In all fallback cases, subtitle rendering remains unchanged (no underlines are added).
## Bundle size and startup cost
Loading is currently a synchronous file read + parse performed once at enable-time; after that, lookups during subtitle updates are O(1) in-memory `Map` accesses.
Practical guidance:
- Keep the JLPT bundle inside `vendor/yomitan-jlpt-vocab` to avoid network lookups.
- Measure bundle size with:
- `du -sh vendor/yomitan-jlpt-vocab`
- If the JLPT source is updated, re-run `pnpm run build:appimage` / packaging and confirm startup logs do not report missing banks.

View File

@@ -97,6 +97,7 @@ function collectDictionaryFromPath(
for (const bank of JLPT_BANK_FILES) { for (const bank of JLPT_BANK_FILES) {
const bankPath = path.join(dictionaryPath, bank.filename); const bankPath = path.join(dictionaryPath, bank.filename);
if (!fs.existsSync(bankPath)) { if (!fs.existsSync(bankPath)) {
log(`JLPT bank file missing for ${bank.level}: ${bankPath}`);
continue; continue;
} }
@@ -104,6 +105,7 @@ function collectDictionaryFromPath(
try { try {
rawText = fs.readFileSync(bankPath, "utf-8"); rawText = fs.readFileSync(bankPath, "utf-8");
} catch { } catch {
log(`Failed to read JLPT bank file ${bankPath}`);
continue; continue;
} }
@@ -111,10 +113,22 @@ function collectDictionaryFromPath(
try { try {
rawEntries = JSON.parse(rawText) as unknown; rawEntries = JSON.parse(rawText) as unknown;
} catch { } catch {
log(`Failed to parse JLPT bank file as JSON: ${bankPath}`);
continue; continue;
} }
if (!Array.isArray(rawEntries)) {
log(
`JLPT bank file has unsupported format (expected JSON array): ${bankPath}`,
);
continue;
}
const beforeSize = terms.size;
addEntriesToMap(rawEntries, bank.level, terms, log); addEntriesToMap(rawEntries, bank.level, terms, log);
if (terms.size === beforeSize) {
log(`JLPT bank file contained no extractable entries: ${bankPath}`);
}
} }
return terms; return terms;
@@ -124,8 +138,9 @@ export async function createJlptVocabularyLookupService(
options: JlptVocabLookupOptions, options: JlptVocabLookupOptions,
): Promise<(term: string) => JlptLevel | null> { ): Promise<(term: string) => JlptLevel | null> {
const attemptedPaths: string[] = []; const attemptedPaths: string[] = [];
let foundDirectoryCount = 0; let foundDictionaryPathCount = 0;
let foundBankCount = 0; let foundBankCount = 0;
const resolvedBanks: string[] = [];
for (const dictionaryPath of options.searchPaths) { for (const dictionaryPath of options.searchPaths) {
attemptedPaths.push(dictionaryPath); attemptedPaths.push(dictionaryPath);
if (!fs.existsSync(dictionaryPath)) { if (!fs.existsSync(dictionaryPath)) {
@@ -136,10 +151,11 @@ export async function createJlptVocabularyLookupService(
continue; continue;
} }
foundDirectoryCount += 1; foundDictionaryPathCount += 1;
const terms = collectDictionaryFromPath(dictionaryPath, options.log); const terms = collectDictionaryFromPath(dictionaryPath, options.log);
if (terms.size > 0) { if (terms.size > 0) {
resolvedBanks.push(dictionaryPath);
foundBankCount += 1; foundBankCount += 1;
options.log( options.log(
`JLPT dictionary loaded from ${dictionaryPath} (${terms.size} entries)`, `JLPT dictionary loaded from ${dictionaryPath} (${terms.size} entries)`,
@@ -159,10 +175,13 @@ export async function createJlptVocabularyLookupService(
options.log( options.log(
`JLPT dictionary not found. Searched ${attemptedPaths.length} candidate path(s): ${attemptedPaths.join(", ")}`, `JLPT dictionary not found. Searched ${attemptedPaths.length} candidate path(s): ${attemptedPaths.join(", ")}`,
); );
if (foundDirectoryCount > 0 && foundBankCount === 0) { if (foundDictionaryPathCount > 0 && foundBankCount === 0) {
options.log( options.log(
"JLPT dictionary directories found, but none contained valid term_meta_bank_*.json files.", "JLPT dictionary directories found, but none contained valid term_meta_bank_*.json files.",
); );
} }
if (resolvedBanks.length > 0 && foundBankCount > 0) {
options.log(`JLPT dictionary search matched path(s): ${resolvedBanks.join(", ")}`);
}
return NOOP_LOOKUP; return NOOP_LOOKUP;
} }

View File

@@ -32,6 +32,12 @@ type YomitanParseLine = YomitanParseSegment[];
const KATAKANA_TO_HIRAGANA_OFFSET = 0x60; const KATAKANA_TO_HIRAGANA_OFFSET = 0x60;
const KATAKANA_CODEPOINT_START = 0x30a1; const KATAKANA_CODEPOINT_START = 0x30a1;
const KATAKANA_CODEPOINT_END = 0x30f6; const KATAKANA_CODEPOINT_END = 0x30f6;
const JLPT_LEVEL_LOOKUP_CACHE_LIMIT = 2048;
const jlptLevelLookupCaches = new WeakMap<
(text: string) => JlptLevel | null,
Map<string, JlptLevel | null>
>();
function isObject(value: unknown): value is Record<string, unknown> { function isObject(value: unknown): value is Record<string, unknown> {
return Boolean(value && typeof value === "object"); return Boolean(value && typeof value === "object");
@@ -75,6 +81,43 @@ export interface TokenizerDepsRuntimeOptions {
getMecabTokenizer: () => MecabTokenizerLike | null; getMecabTokenizer: () => MecabTokenizerLike | null;
} }
function getCachedJlptLevel(
lookupText: string,
getJlptLevel: (text: string) => JlptLevel | null,
): JlptLevel | null {
const normalizedText = lookupText.trim();
if (!normalizedText) {
return null;
}
let cache = jlptLevelLookupCaches.get(getJlptLevel);
if (!cache) {
cache = new Map<string, JlptLevel | null>();
jlptLevelLookupCaches.set(getJlptLevel, cache);
}
if (cache.has(normalizedText)) {
return cache.get(normalizedText) ?? null;
}
let level: JlptLevel | null;
try {
level = getJlptLevel(normalizedText);
} catch {
level = null;
}
cache.set(normalizedText, level);
while (cache.size > JLPT_LEVEL_LOOKUP_CACHE_LIMIT) {
const firstKey = cache.keys().next().value;
if (firstKey !== undefined) {
cache.delete(firstKey);
}
}
return level;
}
export function createTokenizerDepsRuntimeService( export function createTokenizerDepsRuntimeService(
options: TokenizerDepsRuntimeOptions, options: TokenizerDepsRuntimeOptions,
): TokenizerServiceDeps { ): TokenizerServiceDeps {
@@ -326,8 +369,12 @@ function applyJlptMarking(
return { ...token, jlptLevel: undefined }; return { ...token, jlptLevel: undefined };
} }
const primaryLevel = getJlptLevel(resolveJlptLookupText(token)); const primaryLevel = getCachedJlptLevel(
const fallbackLevel = getJlptLevel(token.surface); resolveJlptLookupText(token),
getJlptLevel,
);
const fallbackLevel =
primaryLevel === null ? getCachedJlptLevel(token.surface, getJlptLevel) : null;
return { return {
...token, ...token,