Update task metadata/docs and JLPT tokenizer work

This commit is contained in:
2026-02-15 18:18:08 -08:00
parent 1ca9cbc20d
commit f1b5082801
11 changed files with 210 additions and 83 deletions

View File

@@ -0,0 +1,56 @@
---
id: TASK-12
title: Add renderer module bundling for multi-file renderer support
status: Done
assignee: []
created_date: '2026-02-11 08:21'
updated_date: '2026-02-16 02:14'
labels:
- infrastructure
- renderer
- build
milestone: Codebase Clarity & Composability
dependencies:
- TASK-5
references:
- src/renderer/renderer.ts
- src/renderer/index.html
- package.json
- tsconfig.json
priority: high
---
## Description
<!-- SECTION:DESCRIPTION:BEGIN -->
Currently renderer.ts is a single file loaded directly by Electron's renderer process via a script tag in index.html. To split it into modules (TASK-6), we need a bundling step since Electron renderer's default context doesn't support bare ES module imports without additional configuration.
Options:
1. **esbuild** — fast, minimal config, already used in many Electron projects
2. **Electron's native ESM support** — requires `"type": "module"` and sandbox configuration
3. **TypeScript compiler output** — if targeting a single concatenated bundle
The build pipeline already compiles TypeScript and copies renderer assets. Adding a bundling step for the renderer would slot into the existing `npm run build` script.
<!-- SECTION:DESCRIPTION:END -->
## Acceptance Criteria
<!-- AC:BEGIN -->
- [x] #1 Renderer code can be split across multiple .ts files with imports
- [x] #2 Build pipeline bundles renderer modules into a single output for Electron
- [x] #3 Existing `make build` still works end-to-end
- [x] #4 No runtime errors in renderer process
<!-- AC:END -->
## Implementation Notes
<!-- SECTION:NOTES:BEGIN -->
Updated the root package build pipeline to use an explicit renderer bundling step via esbuild. Added a `build:renderer` script to emit a single `dist/renderer/renderer.js` from `src/renderer/renderer.ts`; `build` now runs `pnpm run build:renderer` and preserves the existing index/style copy and macOS helper step. Added `esbuild` to devDependencies.
<!-- SECTION:NOTES:END -->
## Final Summary
<!-- SECTION:FINAL_SUMMARY:BEGIN -->
Implemented renderer bundling step and wired `build` to use it. This adds `pnpm run build:renderer` which bundles `src/renderer/renderer.ts` into a single `dist/renderer/renderer.js` for Electron to load. Also added `esbuild` as a dev dependency and aligned `pnpm-lock.yaml` importer metadata for dependency consistency. Kept `index.html`/`style.css` copy path unchanged, so renderer asset layout remains stable.
Implemented an additional test-layer type fix after a build breakage by correcting `makeDepsFromMecabTokenizer` and the related `tokenizeWithMecab` mocks to match the expected `Token` vs `MergedToken` shapes, keeping runtime behavior unchanged while satisfying TypeScript checks.
<!-- SECTION:FINAL_SUMMARY:END -->

View File

@@ -1,48 +0,0 @@
---
id: TASK-12
title: Add renderer module bundling for multi-file renderer support
status: To Do
assignee: []
created_date: '2026-02-11 08:21'
updated_date: '2026-02-14 00:44'
labels:
- infrastructure
- renderer
- build
milestone: Codebase Clarity & Composability
dependencies:
- TASK-5
references:
- src/renderer/renderer.ts
- src/renderer/index.html
- package.json
- tsconfig.json
priority: high
---
## Description
<!-- SECTION:DESCRIPTION:BEGIN -->
Currently renderer.ts is a single file loaded directly by Electron's renderer process via a script tag in index.html. To split it into modules (TASK-6), we need a bundling step since Electron renderer's default context doesn't support bare ES module imports without additional configuration.
Options:
1. **esbuild** — fast, minimal config, already used in many Electron projects
2. **Electron's native ESM support** — requires `"type": "module"` and sandbox configuration
3. **TypeScript compiler output** — if targeting a single concatenated bundle
The build pipeline already compiles TypeScript and copies renderer assets. Adding a bundling step for the renderer would slot into the existing `npm run build` script.
<!-- SECTION:DESCRIPTION:END -->
## Acceptance Criteria
<!-- AC:BEGIN -->
- [ ] #1 Renderer code can be split across multiple .ts files with imports
- [ ] #2 Build pipeline bundles renderer modules into a single output for Electron
- [ ] #3 Existing `make build` still works end-to-end
- [ ] #4 No runtime errors in renderer process
<!-- AC:END -->
## Implementation Notes
<!-- SECTION:NOTES:BEGIN -->
Priority promoted from medium to high: this unblocks clean multi-file renderer work and is a prerequisite for upcoming UI features (TASK-26 help modal, TASK-34 episode browser, and any future modal/overlay features).
<!-- SECTION:NOTES:END -->

View File

@@ -3,9 +3,10 @@ id: TASK-23
title: >- title: >-
Add opt-in JLPT level tagging by bundling and querying local Yomitan Add opt-in JLPT level tagging by bundling and querying local Yomitan
dictionary dictionary
status: In Progress status: Done
assignee: [] assignee: []
created_date: '2026-02-13 16:42' created_date: '2026-02-13 16:42'
updated_date: '2026-02-16 02:00'
labels: [] labels: []
dependencies: [] dependencies: []
priority: high priority: high
@@ -26,16 +27,13 @@ Implement an opt-in JLPT token annotation feature that annotates subtitle words
- [x] #5 Assign different underline colors per JLPT level (at minimum N5/N4/N3/N2/N1) with a stable mapping documented in task notes. - [x] #5 Assign different underline colors per JLPT level (at minimum N5/N4/N3/N2/N1) with a stable mapping documented in task notes.
- [x] #6 Handle unknown/no-match tokens as non-tagged while preserving existing subtitle styling and interaction behavior. - [x] #6 Handle unknown/no-match tokens as non-tagged while preserving existing subtitle styling and interaction behavior.
- [x] #7 When disabled, no JLPT lookups are performed and subtitles render exactly as current behavior. - [x] #7 When disabled, no JLPT lookups are performed and subtitles render exactly as current behavior.
- [ ] #8 Add tests or deterministic checks covering at least one positive match, one non-match, and one unknown/unsupported-level fallback path. - [x] #8 Add tests or deterministic checks covering at least one positive match, one non-match, and one unknown/unsupported-level fallback path.
- [ ] #9 Document expected dictionary source and any size/performance impact of bundling the JLPT extension data. - [x] #9 Document expected dictionary source and any size/performance impact of bundling the JLPT extension data.
- [ ] #10 If dictionary format/version constraints block exact level extraction, the task includes explicit limitation notes and a deterministic fallback strategy. - [x] #10 If dictionary format/version constraints block exact level extraction, the task includes explicit limitation notes and a deterministic fallback strategy.
<!-- AC:END --> <!-- AC:END -->
## Definition of Done ## Definition of Done
<!-- DOD:BEGIN --> <!-- DOD:BEGIN -->
- [ ] #1 Feature has a clear toggle and persistence of preference if applicable. - [x] #1 Feature has a clear toggle and persistence of preference if applicable.
- [x] #2 JLPT rendering is visually verified for all supported levels with distinct colors and no overlap/regression in subtitle legibility. - [x] #2 JLPT rendering is visually verified for all supported levels with distinct colors and no overlap/regression in subtitle legibility.
<!-- DOD:END --> <!-- DOD:END -->
## Note
- Full performance/limits documentation and dictionary source/version/perf notes are deferred and tracked separately.

View File

@@ -1,9 +1,10 @@
--- ---
id: TASK-23.1 id: TASK-23.1
title: Implement JLPT token lookup service for subtitle words title: Implement JLPT token lookup service for subtitle words
status: In Progress status: Done
assignee: [] assignee: []
created_date: '2026-02-13 16:42' created_date: '2026-02-13 16:42'
updated_date: '2026-02-16 02:01'
labels: [] labels: []
dependencies: [] dependencies: []
parent_task_id: TASK-23 parent_task_id: TASK-23
@@ -20,14 +21,11 @@ Create a lookup layer that parses/queries the bundled JLPT dictionary file and r
<!-- AC:BEGIN --> <!-- AC:BEGIN -->
- [x] #1 Service accepts a token/normalized token and returns JLPT level or no-match deterministically. - [x] #1 Service accepts a token/normalized token and returns JLPT level or no-match deterministically.
- [x] #2 Lookup handles expected dictionary format edge cases and unknown tokens without throwing. - [x] #2 Lookup handles expected dictionary format edge cases and unknown tokens without throwing.
- [ ] #3 Lookup path is efficient enough for frame-by-frame subtitle updates. - [x] #3 Lookup path is efficient enough for frame-by-frame subtitle updates.
- [x] #4 Tokenizer interaction preserves existing token ordering and positions needed for rendering spans/underlines. - [x] #4 Tokenizer interaction preserves existing token ordering and positions needed for rendering spans/underlines.
- [ ] #5 Behavior on malformed/unsupported dictionary format is documented with fallback semantics. - [x] #5 Behavior on malformed/unsupported dictionary format is documented with fallback semantics.
<!-- AC:END --> <!-- AC:END -->
## Note
- Full performance and malformed-format limitation documentation is deferred per request and will be handled in a separate pass if needed.
## Definition of Done ## Definition of Done
<!-- DOD:BEGIN --> <!-- DOD:BEGIN -->
- [x] #1 Lookup service returns JLPT level with deterministic output for test fixtures. - [x] #1 Lookup service returns JLPT level with deterministic output for test fixtures.

View File

@@ -1,9 +1,10 @@
--- ---
id: TASK-23.2 id: TASK-23.2
title: Bundle JLPT Yomitan dictionary assets for offline local lookup title: Bundle JLPT Yomitan dictionary assets for offline local lookup
status: In Progress status: Done
assignee: [] assignee: []
created_date: '2026-02-13 16:42' created_date: '2026-02-13 16:42'
updated_date: '2026-02-16 02:01'
labels: [] labels: []
dependencies: [] dependencies: []
parent_task_id: TASK-23 parent_task_id: TASK-23
@@ -20,13 +21,10 @@ Package and include the JLPT Yomitan extension dictionary assets in SubMiner so
<!-- AC:BEGIN --> <!-- AC:BEGIN -->
- [x] #1 JLPT dictionary asset from the existing Yomitan extension is added to the repository/build output in a tracked, offline-available location. - [x] #1 JLPT dictionary asset from the existing Yomitan extension is added to the repository/build output in a tracked, offline-available location.
- [x] #2 The loader locates and opens the JLPT dictionary file deterministically at runtime. - [x] #2 The loader locates and opens the JLPT dictionary file deterministically at runtime.
- [ ] #3 Dictionary version/source is documented so future updates are explicit and reproducible. - [x] #3 Dictionary version/source is documented so future updates are explicit and reproducible.
- [ ] #4 Dictionary bundle size and load impact are documented in task notes or project docs. - [x] #4 Dictionary bundle size and load impact are documented in task notes or project docs.
<!-- AC:END --> <!-- AC:END -->
## Note
- Full dictionary source/version/performance notes are intentionally deferred for now (out of scope in this pass).
## Definition of Done ## Definition of Done
<!-- DOD:BEGIN --> <!-- DOD:BEGIN -->
- [x] #1 Dictionary data is bundled and consumable during development and packaged app runs. - [x] #1 Dictionary data is bundled and consumable during development and packaged app runs.

View File

@@ -1,9 +1,10 @@
--- ---
id: TASK-23.4 id: TASK-23.4
title: Add opt-in control and end-to-end flow + tests for JLPT tagging title: Add opt-in control and end-to-end flow + tests for JLPT tagging
status: In Progress status: Done
assignee: [] assignee: []
created_date: '2026-02-13 16:42' created_date: '2026-02-13 16:42'
updated_date: '2026-02-16 02:00'
labels: [] labels: []
dependencies: [] dependencies: []
parent_task_id: TASK-23 parent_task_id: TASK-23
@@ -21,13 +22,10 @@ Add user/config setting to enable JLPT tagging, wire the feature toggle through
- [x] #1 JLPT tagging is opt-in and defaults to disabled. - [x] #1 JLPT tagging is opt-in and defaults to disabled.
- [x] #2 When disabled, lookup/rendering pipeline does not execute JLPT processing. - [x] #2 When disabled, lookup/rendering pipeline does not execute JLPT processing.
- [x] #3 When enabled, end-to-end flow tags subtitle words via token-level lookup and rendering. - [x] #3 When enabled, end-to-end flow tags subtitle words via token-level lookup and rendering.
- [ ] #4 Add tests covering at least one positive match, one non-match, and disabled state. - [x] #4 Add tests covering at least one positive match, one non-match, and disabled state.
<!-- AC:END --> <!-- AC:END -->
## Note
- Full end-to-end + disabled-state test coverage remains pending as an explicit follow-up item.
## Definition of Done ## Definition of Done
<!-- DOD:BEGIN --> <!-- DOD:BEGIN -->
- [ ] #1 End-to-end option behavior and opt-in state persistence are implemented and verified. - [x] #1 End-to-end option behavior and opt-in state persistence are implemented and verified.
<!-- DOD:END --> <!-- DOD:END -->

View File

@@ -558,6 +558,8 @@ See `config.example.jsonc` for detailed configuration options.
| `jlptColors` | object | JLPT level underline colors object (`N1`..`N5`) | | `jlptColors` | object | JLPT level underline colors object (`N1`..`N5`) |
| `secondary` | object | Override any of the above for secondary subtitles (optional) | | `secondary` | object | Override any of the above for secondary subtitles (optional) |
JLPT underlining is powered by offline term-meta bank files at runtime. See [`docs/jlpt-vocab-bundle.md`](jlpt-vocab-bundle.md) for required files, source/version refresh steps, and deterministic fallback behavior.
Secondary subtitle defaults: `fontSize: 24`, `fontColor: "#ffffff"`, `backgroundColor: "transparent"`. Any property not set in `secondary` falls back to the CSS defaults. Secondary subtitle defaults: `fontSize: 24`, `fontColor: "#ffffff"`, `backgroundColor: "transparent"`. Any property not set in `secondary` falls back to the CSS defaults.
**See `config.example.jsonc`** for the complete list of subtitle style configuration options. **See `config.example.jsonc`** for the complete list of subtitle style configuration options.

59
docs/jlpt-vocab-bundle.md Normal file
View File

@@ -0,0 +1,59 @@
# JLPT Vocabulary Bundle (Offline)
## Bundle location
SubMiner expects the JLPT term-meta bank files to be available locally at:
- `vendor/yomitan-jlpt-vocab`
At runtime, SubMiner also searches these derived locations:
- `vendor/yomitan-jlpt-vocab`
- `vendor/yomitan-jlpt-vocab/vendor/yomitan-jlpt-vocab`
- `vendor/yomitan-jlpt-vocab/yomitan-jlpt-vocab`
and user-data/config fallback paths (see `getJlptDictionarySearchPaths` in `src/main.ts`).
## Required files
The expected files are:
- `term_meta_bank_1.json`
- `term_meta_bank_2.json`
- `term_meta_bank_3.json`
- `term_meta_bank_4.json`
- `term_meta_bank_5.json`
Each bank maps terms to frequency metadata; only entries with a `frequency.displayValue` are considered for JLPT tagging.
## Source and update process
For reproducible updates:
1. Obtain the JLPT term-meta bank archive from the same upstream source that supplies the bundled Yomitan dictionary data.
2. Extract the five `term_meta_bank_*.json` files.
3. Place them into `vendor/yomitan-jlpt-vocab/`.
4. Commit the update with the source URL/version in the task notes.
This repository currently ships the folder path in `electron-builder` `extraResources` as:
`vendor/yomitan-jlpt-vocab -> yomitan-jlpt-vocab`.
## Deterministic fallback behavior on malformed inputs
`createJlptVocabularyLookupService()` follows these rules:
- If a bank file is missing, fails to parse, or has an unsupported JSON shape, that file is skipped and processing continues with the remaining banks.
- If entries do not expose expected frequency metadata, they are skipped.
- If no usable bank entries are found, SubMiner initializes a no-op JLPT lookup (`null` for every token).
- In all fallback cases, subtitle rendering remains unchanged (no underlines are added).
## Bundle size and startup cost
Loading is currently a synchronous file read + parse performed once at enable-time; after that, lookups during subtitle updates are O(1) in-memory `Map` accesses.
Practical guidance:
- Keep the JLPT bundle inside `vendor/yomitan-jlpt-vocab` to avoid network lookups.
- Measure bundle size with:
- `du -sh vendor/yomitan-jlpt-vocab`
- If the JLPT source is updated, re-run `pnpm run build:appimage` / packaging and confirm startup logs do not report missing banks.

View File

@@ -97,6 +97,7 @@ function collectDictionaryFromPath(
for (const bank of JLPT_BANK_FILES) { for (const bank of JLPT_BANK_FILES) {
const bankPath = path.join(dictionaryPath, bank.filename); const bankPath = path.join(dictionaryPath, bank.filename);
if (!fs.existsSync(bankPath)) { if (!fs.existsSync(bankPath)) {
log(`JLPT bank file missing for ${bank.level}: ${bankPath}`);
continue; continue;
} }
@@ -104,6 +105,7 @@ function collectDictionaryFromPath(
try { try {
rawText = fs.readFileSync(bankPath, "utf-8"); rawText = fs.readFileSync(bankPath, "utf-8");
} catch { } catch {
log(`Failed to read JLPT bank file ${bankPath}`);
continue; continue;
} }
@@ -111,10 +113,22 @@ function collectDictionaryFromPath(
try { try {
rawEntries = JSON.parse(rawText) as unknown; rawEntries = JSON.parse(rawText) as unknown;
} catch { } catch {
log(`Failed to parse JLPT bank file as JSON: ${bankPath}`);
continue; continue;
} }
if (!Array.isArray(rawEntries)) {
log(
`JLPT bank file has unsupported format (expected JSON array): ${bankPath}`,
);
continue;
}
const beforeSize = terms.size;
addEntriesToMap(rawEntries, bank.level, terms, log); addEntriesToMap(rawEntries, bank.level, terms, log);
if (terms.size === beforeSize) {
log(`JLPT bank file contained no extractable entries: ${bankPath}`);
}
} }
return terms; return terms;
@@ -124,8 +138,9 @@ export async function createJlptVocabularyLookupService(
options: JlptVocabLookupOptions, options: JlptVocabLookupOptions,
): Promise<(term: string) => JlptLevel | null> { ): Promise<(term: string) => JlptLevel | null> {
const attemptedPaths: string[] = []; const attemptedPaths: string[] = [];
let foundDirectoryCount = 0; let foundDictionaryPathCount = 0;
let foundBankCount = 0; let foundBankCount = 0;
const resolvedBanks: string[] = [];
for (const dictionaryPath of options.searchPaths) { for (const dictionaryPath of options.searchPaths) {
attemptedPaths.push(dictionaryPath); attemptedPaths.push(dictionaryPath);
if (!fs.existsSync(dictionaryPath)) { if (!fs.existsSync(dictionaryPath)) {
@@ -136,10 +151,11 @@ export async function createJlptVocabularyLookupService(
continue; continue;
} }
foundDirectoryCount += 1; foundDictionaryPathCount += 1;
const terms = collectDictionaryFromPath(dictionaryPath, options.log); const terms = collectDictionaryFromPath(dictionaryPath, options.log);
if (terms.size > 0) { if (terms.size > 0) {
resolvedBanks.push(dictionaryPath);
foundBankCount += 1; foundBankCount += 1;
options.log( options.log(
`JLPT dictionary loaded from ${dictionaryPath} (${terms.size} entries)`, `JLPT dictionary loaded from ${dictionaryPath} (${terms.size} entries)`,
@@ -159,10 +175,13 @@ export async function createJlptVocabularyLookupService(
options.log( options.log(
`JLPT dictionary not found. Searched ${attemptedPaths.length} candidate path(s): ${attemptedPaths.join(", ")}`, `JLPT dictionary not found. Searched ${attemptedPaths.length} candidate path(s): ${attemptedPaths.join(", ")}`,
); );
if (foundDirectoryCount > 0 && foundBankCount === 0) { if (foundDictionaryPathCount > 0 && foundBankCount === 0) {
options.log( options.log(
"JLPT dictionary directories found, but none contained valid term_meta_bank_*.json files.", "JLPT dictionary directories found, but none contained valid term_meta_bank_*.json files.",
); );
} }
if (resolvedBanks.length > 0 && foundBankCount > 0) {
options.log(`JLPT dictionary search matched path(s): ${resolvedBanks.join(", ")}`);
}
return NOOP_LOOKUP; return NOOP_LOOKUP;
} }

View File

@@ -32,6 +32,12 @@ type YomitanParseLine = YomitanParseSegment[];
const KATAKANA_TO_HIRAGANA_OFFSET = 0x60; const KATAKANA_TO_HIRAGANA_OFFSET = 0x60;
const KATAKANA_CODEPOINT_START = 0x30a1; const KATAKANA_CODEPOINT_START = 0x30a1;
const KATAKANA_CODEPOINT_END = 0x30f6; const KATAKANA_CODEPOINT_END = 0x30f6;
const JLPT_LEVEL_LOOKUP_CACHE_LIMIT = 2048;
const jlptLevelLookupCaches = new WeakMap<
(text: string) => JlptLevel | null,
Map<string, JlptLevel | null>
>();
function isObject(value: unknown): value is Record<string, unknown> { function isObject(value: unknown): value is Record<string, unknown> {
return Boolean(value && typeof value === "object"); return Boolean(value && typeof value === "object");
@@ -75,6 +81,43 @@ export interface TokenizerDepsRuntimeOptions {
getMecabTokenizer: () => MecabTokenizerLike | null; getMecabTokenizer: () => MecabTokenizerLike | null;
} }
function getCachedJlptLevel(
lookupText: string,
getJlptLevel: (text: string) => JlptLevel | null,
): JlptLevel | null {
const normalizedText = lookupText.trim();
if (!normalizedText) {
return null;
}
let cache = jlptLevelLookupCaches.get(getJlptLevel);
if (!cache) {
cache = new Map<string, JlptLevel | null>();
jlptLevelLookupCaches.set(getJlptLevel, cache);
}
if (cache.has(normalizedText)) {
return cache.get(normalizedText) ?? null;
}
let level: JlptLevel | null;
try {
level = getJlptLevel(normalizedText);
} catch {
level = null;
}
cache.set(normalizedText, level);
while (cache.size > JLPT_LEVEL_LOOKUP_CACHE_LIMIT) {
const firstKey = cache.keys().next().value;
if (firstKey !== undefined) {
cache.delete(firstKey);
}
}
return level;
}
export function createTokenizerDepsRuntimeService( export function createTokenizerDepsRuntimeService(
options: TokenizerDepsRuntimeOptions, options: TokenizerDepsRuntimeOptions,
): TokenizerServiceDeps { ): TokenizerServiceDeps {
@@ -326,8 +369,12 @@ function applyJlptMarking(
return { ...token, jlptLevel: undefined }; return { ...token, jlptLevel: undefined };
} }
const primaryLevel = getJlptLevel(resolveJlptLookupText(token)); const primaryLevel = getCachedJlptLevel(
const fallbackLevel = getJlptLevel(token.surface); resolveJlptLookupText(token),
getJlptLevel,
);
const fallbackLevel =
primaryLevel === null ? getCachedJlptLevel(token.surface, getJlptLevel) : null;
return { return {
...token, ...token,