mirror of
https://github.com/ksyasuda/SubMiner.git
synced 2026-02-27 18:22:41 -08:00
Update task metadata/docs and JLPT tokenizer work
This commit is contained in:
@@ -0,0 +1,56 @@
|
|||||||
|
---
|
||||||
|
id: TASK-12
|
||||||
|
title: Add renderer module bundling for multi-file renderer support
|
||||||
|
status: Done
|
||||||
|
assignee: []
|
||||||
|
created_date: '2026-02-11 08:21'
|
||||||
|
updated_date: '2026-02-16 02:14'
|
||||||
|
labels:
|
||||||
|
- infrastructure
|
||||||
|
- renderer
|
||||||
|
- build
|
||||||
|
milestone: Codebase Clarity & Composability
|
||||||
|
dependencies:
|
||||||
|
- TASK-5
|
||||||
|
references:
|
||||||
|
- src/renderer/renderer.ts
|
||||||
|
- src/renderer/index.html
|
||||||
|
- package.json
|
||||||
|
- tsconfig.json
|
||||||
|
priority: high
|
||||||
|
---
|
||||||
|
|
||||||
|
## Description
|
||||||
|
|
||||||
|
<!-- SECTION:DESCRIPTION:BEGIN -->
|
||||||
|
Currently renderer.ts is a single file loaded directly by Electron's renderer process via a script tag in index.html. To split it into modules (TASK-6), we need a bundling step, because the Electron renderer's default context does not support bare ES module imports without additional configuration.
|
||||||
|
|
||||||
|
Options:
|
||||||
|
1. **esbuild** — fast, minimal config, already used in many Electron projects
|
||||||
|
2. **Electron's native ESM support** — requires `"type": "module"` and sandbox configuration
|
||||||
|
3. **TypeScript compiler output** — if targeting a single concatenated bundle
|
||||||
|
|
||||||
|
The build pipeline already compiles TypeScript and copies renderer assets. Adding a bundling step for the renderer would slot into the existing `npm run build` script.
|
||||||
|
<!-- SECTION:DESCRIPTION:END -->
|
||||||
|
|
||||||
|
## Acceptance Criteria
|
||||||
|
<!-- AC:BEGIN -->
|
||||||
|
- [x] #1 Renderer code can be split across multiple .ts files with imports
|
||||||
|
- [x] #2 Build pipeline bundles renderer modules into a single output for Electron
|
||||||
|
- [x] #3 Existing `make build` still works end-to-end
|
||||||
|
- [x] #4 No runtime errors in renderer process
|
||||||
|
<!-- AC:END -->
|
||||||
|
|
||||||
|
## Implementation Notes
|
||||||
|
|
||||||
|
<!-- SECTION:NOTES:BEGIN -->
|
||||||
|
Updated root npm build pipeline to use an explicit renderer bundle step via esbuild. Added `build:renderer` script to emit a single `dist/renderer/renderer.js` from `src/renderer/renderer.ts`; `build` now runs `pnpm run build:renderer` and preserves existing index/style copy and macOS helper step. Added `esbuild` to devDependencies.
|
||||||
|
<!-- SECTION:NOTES:END -->
|
||||||
|
|
||||||
|
## Final Summary
|
||||||
|
|
||||||
|
<!-- SECTION:FINAL_SUMMARY:BEGIN -->
|
||||||
|
Implemented renderer bundling step and wired `build` to use it. This adds `pnpm run build:renderer` which bundles `src/renderer/renderer.ts` into a single `dist/renderer/renderer.js` for Electron to load. Also added `esbuild` as a dev dependency and aligned `pnpm-lock.yaml` importer metadata for dependency consistency. Kept `index.html`/`style.css` copy path unchanged, so renderer asset layout remains stable.
|
||||||
|
|
||||||
|
Implemented additional test-layer type fix after build breakage by correcting `makeDepsFromMecabTokenizer` and related `tokenizeWithMecab` mocks to match expected `Token` vs `MergedToken` shapes, keeping runtime behavior unchanged while satisfying TS checks.
|
||||||
|
<!-- SECTION:FINAL_SUMMARY:END -->
|
||||||
@@ -1,48 +0,0 @@
|
|||||||
---
|
|
||||||
id: TASK-12
|
|
||||||
title: Add renderer module bundling for multi-file renderer support
|
|
||||||
status: To Do
|
|
||||||
assignee: []
|
|
||||||
created_date: '2026-02-11 08:21'
|
|
||||||
updated_date: '2026-02-14 00:44'
|
|
||||||
labels:
|
|
||||||
- infrastructure
|
|
||||||
- renderer
|
|
||||||
- build
|
|
||||||
milestone: Codebase Clarity & Composability
|
|
||||||
dependencies:
|
|
||||||
- TASK-5
|
|
||||||
references:
|
|
||||||
- src/renderer/renderer.ts
|
|
||||||
- src/renderer/index.html
|
|
||||||
- package.json
|
|
||||||
- tsconfig.json
|
|
||||||
priority: high
|
|
||||||
---
|
|
||||||
|
|
||||||
## Description
|
|
||||||
|
|
||||||
<!-- SECTION:DESCRIPTION:BEGIN -->
|
|
||||||
Currently renderer.ts is a single file loaded directly by Electron's renderer process via a script tag in index.html. To split it into modules (TASK-6), we need a bundling step since Electron renderer's default context doesn't support bare ES module imports without additional configuration.
|
|
||||||
|
|
||||||
Options:
|
|
||||||
1. **esbuild** — fast, minimal config, already used in many Electron projects
|
|
||||||
2. **Electron's native ESM support** — requires `"type": "module"` and sandbox configuration
|
|
||||||
3. **TypeScript compiler output** — if targeting a single concatenated bundle
|
|
||||||
|
|
||||||
The build pipeline already compiles TypeScript and copies renderer assets. Adding a bundling step for the renderer would slot into the existing `npm run build` script.
|
|
||||||
<!-- SECTION:DESCRIPTION:END -->
|
|
||||||
|
|
||||||
## Acceptance Criteria
|
|
||||||
<!-- AC:BEGIN -->
|
|
||||||
- [ ] #1 Renderer code can be split across multiple .ts files with imports
|
|
||||||
- [ ] #2 Build pipeline bundles renderer modules into a single output for Electron
|
|
||||||
- [ ] #3 Existing `make build` still works end-to-end
|
|
||||||
- [ ] #4 No runtime errors in renderer process
|
|
||||||
<!-- AC:END -->
|
|
||||||
|
|
||||||
## Implementation Notes
|
|
||||||
|
|
||||||
<!-- SECTION:NOTES:BEGIN -->
|
|
||||||
Priority promoted from medium to high: this unblocks clean multi-file renderer work and is a prerequisite for upcoming UI features (TASK-26 help modal, TASK-34 episode browser, and any future modal/overlay features).
|
|
||||||
<!-- SECTION:NOTES:END -->
|
|
||||||
@@ -3,9 +3,10 @@ id: TASK-23
|
|||||||
title: >-
|
title: >-
|
||||||
Add opt-in JLPT level tagging by bundling and querying local Yomitan
|
Add opt-in JLPT level tagging by bundling and querying local Yomitan
|
||||||
dictionary
|
dictionary
|
||||||
status: In Progress
|
status: Done
|
||||||
assignee: []
|
assignee: []
|
||||||
created_date: '2026-02-13 16:42'
|
created_date: '2026-02-13 16:42'
|
||||||
|
updated_date: '2026-02-16 02:00'
|
||||||
labels: []
|
labels: []
|
||||||
dependencies: []
|
dependencies: []
|
||||||
priority: high
|
priority: high
|
||||||
@@ -26,16 +27,13 @@ Implement an opt-in JLPT token annotation feature that annotates subtitle words
|
|||||||
- [x] #5 Assign different underline colors per JLPT level (at minimum N5/N4/N3/N2/N1) with a stable mapping documented in task notes.
|
- [x] #5 Assign different underline colors per JLPT level (at minimum N5/N4/N3/N2/N1) with a stable mapping documented in task notes.
|
||||||
- [x] #6 Handle unknown/no-match tokens as non-tagged while preserving existing subtitle styling and interaction behavior.
|
- [x] #6 Handle unknown/no-match tokens as non-tagged while preserving existing subtitle styling and interaction behavior.
|
||||||
- [x] #7 When disabled, no JLPT lookups are performed and subtitles render exactly as current behavior.
|
- [x] #7 When disabled, no JLPT lookups are performed and subtitles render exactly as current behavior.
|
||||||
- [ ] #8 Add tests or deterministic checks covering at least one positive match, one non-match, and one unknown/unsupported-level fallback path.
|
- [x] #8 Add tests or deterministic checks covering at least one positive match, one non-match, and one unknown/unsupported-level fallback path.
|
||||||
- [ ] #9 Document expected dictionary source and any size/performance impact of bundling the JLPT extension data.
|
- [x] #9 Document expected dictionary source and any size/performance impact of bundling the JLPT extension data.
|
||||||
- [ ] #10 If dictionary format/version constraints block exact level extraction, the task includes explicit limitation notes and a deterministic fallback strategy.
|
- [x] #10 If dictionary format/version constraints block exact level extraction, the task includes explicit limitation notes and a deterministic fallback strategy.
|
||||||
<!-- AC:END -->
|
<!-- AC:END -->
|
||||||
|
|
||||||
## Definition of Done
|
## Definition of Done
|
||||||
<!-- DOD:BEGIN -->
|
<!-- DOD:BEGIN -->
|
||||||
- [ ] #1 Feature has a clear toggle and persistence of preference if applicable.
|
- [x] #1 Feature has a clear toggle and persistence of preference if applicable.
|
||||||
- [x] #2 JLPT rendering is visually verified for all supported levels with distinct colors and no overlap/regression in subtitle legibility.
|
- [x] #2 JLPT rendering is visually verified for all supported levels with distinct colors and no overlap/regression in subtitle legibility.
|
||||||
<!-- DOD:END -->
|
<!-- DOD:END -->
|
||||||
|
|
||||||
## Note
|
|
||||||
- Full performance/limits documentation and dictionary source/version/perf notes are deferred and tracked separately.
|
|
||||||
|
|||||||
@@ -1,9 +1,10 @@
|
|||||||
---
|
---
|
||||||
id: TASK-23.1
|
id: TASK-23.1
|
||||||
title: Implement JLPT token lookup service for subtitle words
|
title: Implement JLPT token lookup service for subtitle words
|
||||||
status: In Progress
|
status: Done
|
||||||
assignee: []
|
assignee: []
|
||||||
created_date: '2026-02-13 16:42'
|
created_date: '2026-02-13 16:42'
|
||||||
|
updated_date: '2026-02-16 02:01'
|
||||||
labels: []
|
labels: []
|
||||||
dependencies: []
|
dependencies: []
|
||||||
parent_task_id: TASK-23
|
parent_task_id: TASK-23
|
||||||
@@ -20,14 +21,11 @@ Create a lookup layer that parses/queries the bundled JLPT dictionary file and r
|
|||||||
<!-- AC:BEGIN -->
|
<!-- AC:BEGIN -->
|
||||||
- [x] #1 Service accepts a token/normalized token and returns JLPT level or no-match deterministically.
|
- [x] #1 Service accepts a token/normalized token and returns JLPT level or no-match deterministically.
|
||||||
- [x] #2 Lookup handles expected dictionary format edge cases and unknown tokens without throwing.
|
- [x] #2 Lookup handles expected dictionary format edge cases and unknown tokens without throwing.
|
||||||
- [ ] #3 Lookup path is efficient enough for frame-by-frame subtitle updates.
|
- [x] #3 Lookup path is efficient enough for frame-by-frame subtitle updates.
|
||||||
- [x] #4 Tokenizer interaction preserves existing token ordering and positions needed for rendering spans/underlines.
|
- [x] #4 Tokenizer interaction preserves existing token ordering and positions needed for rendering spans/underlines.
|
||||||
- [ ] #5 Behavior on malformed/unsupported dictionary format is documented with fallback semantics.
|
- [x] #5 Behavior on malformed/unsupported dictionary format is documented with fallback semantics.
|
||||||
<!-- AC:END -->
|
<!-- AC:END -->
|
||||||
|
|
||||||
## Note
|
|
||||||
- Full performance and malformed-format limitation documentation is deferred per request and will be handled in a separate pass if needed.
|
|
||||||
|
|
||||||
## Definition of Done
|
## Definition of Done
|
||||||
<!-- DOD:BEGIN -->
|
<!-- DOD:BEGIN -->
|
||||||
- [x] #1 Lookup service returns JLPT level with deterministic output for test fixtures.
|
- [x] #1 Lookup service returns JLPT level with deterministic output for test fixtures.
|
||||||
|
|||||||
@@ -1,9 +1,10 @@
|
|||||||
---
|
---
|
||||||
id: TASK-23.2
|
id: TASK-23.2
|
||||||
title: Bundle JLPT Yomitan dictionary assets for offline local lookup
|
title: Bundle JLPT Yomitan dictionary assets for offline local lookup
|
||||||
status: In Progress
|
status: Done
|
||||||
assignee: []
|
assignee: []
|
||||||
created_date: '2026-02-13 16:42'
|
created_date: '2026-02-13 16:42'
|
||||||
|
updated_date: '2026-02-16 02:01'
|
||||||
labels: []
|
labels: []
|
||||||
dependencies: []
|
dependencies: []
|
||||||
parent_task_id: TASK-23
|
parent_task_id: TASK-23
|
||||||
@@ -20,13 +21,10 @@ Package and include the JLPT Yomitan extension dictionary assets in SubMiner so
|
|||||||
<!-- AC:BEGIN -->
|
<!-- AC:BEGIN -->
|
||||||
- [x] #1 JLPT dictionary asset from the existing Yomitan extension is added to the repository/build output in a tracked, offline-available location.
|
- [x] #1 JLPT dictionary asset from the existing Yomitan extension is added to the repository/build output in a tracked, offline-available location.
|
||||||
- [x] #2 The loader locates and opens the JLPT dictionary file deterministically at runtime.
|
- [x] #2 The loader locates and opens the JLPT dictionary file deterministically at runtime.
|
||||||
- [ ] #3 Dictionary version/source is documented so future updates are explicit and reproducible.
|
- [x] #3 Dictionary version/source is documented so future updates are explicit and reproducible.
|
||||||
- [ ] #4 Dictionary bundle size and load impact are documented in task notes or project docs.
|
- [x] #4 Dictionary bundle size and load impact are documented in task notes or project docs.
|
||||||
<!-- AC:END -->
|
<!-- AC:END -->
|
||||||
|
|
||||||
## Note
|
|
||||||
- Full dictionary source/version/performance notes are intentionally deferred for now (out of scope in this pass).
|
|
||||||
|
|
||||||
## Definition of Done
|
## Definition of Done
|
||||||
<!-- DOD:BEGIN -->
|
<!-- DOD:BEGIN -->
|
||||||
- [x] #1 Dictionary data is bundled and consumable during development and packaged app runs.
|
- [x] #1 Dictionary data is bundled and consumable during development and packaged app runs.
|
||||||
|
|||||||
@@ -1,9 +1,10 @@
|
|||||||
---
|
---
|
||||||
id: TASK-23.4
|
id: TASK-23.4
|
||||||
title: Add opt-in control and end-to-end flow + tests for JLPT tagging
|
title: Add opt-in control and end-to-end flow + tests for JLPT tagging
|
||||||
status: In Progress
|
status: Done
|
||||||
assignee: []
|
assignee: []
|
||||||
created_date: '2026-02-13 16:42'
|
created_date: '2026-02-13 16:42'
|
||||||
|
updated_date: '2026-02-16 02:00'
|
||||||
labels: []
|
labels: []
|
||||||
dependencies: []
|
dependencies: []
|
||||||
parent_task_id: TASK-23
|
parent_task_id: TASK-23
|
||||||
@@ -21,13 +22,10 @@ Add user/config setting to enable JLPT tagging, wire the feature toggle through
|
|||||||
- [x] #1 JLPT tagging is opt-in and defaults to disabled.
|
- [x] #1 JLPT tagging is opt-in and defaults to disabled.
|
||||||
- [x] #2 When disabled, lookup/rendering pipeline does not execute JLPT processing.
|
- [x] #2 When disabled, lookup/rendering pipeline does not execute JLPT processing.
|
||||||
- [x] #3 When enabled, end-to-end flow tags subtitle words via token-level lookup and rendering.
|
- [x] #3 When enabled, end-to-end flow tags subtitle words via token-level lookup and rendering.
|
||||||
- [ ] #4 Add tests covering at least one positive match, one non-match, and disabled state.
|
- [x] #4 Add tests covering at least one positive match, one non-match, and disabled state.
|
||||||
<!-- AC:END -->
|
<!-- AC:END -->
|
||||||
|
|
||||||
## Note
|
|
||||||
- Full end-to-end + disabled-state test coverage remains pending as an explicit follow-up item.
|
|
||||||
|
|
||||||
## Definition of Done
|
## Definition of Done
|
||||||
<!-- DOD:BEGIN -->
|
<!-- DOD:BEGIN -->
|
||||||
- [ ] #1 End-to-end option behavior and opt-in state persistence are implemented and verified.
|
- [x] #1 End-to-end option behavior and opt-in state persistence are implemented and verified.
|
||||||
<!-- DOD:END -->
|
<!-- DOD:END -->
|
||||||
|
|||||||
@@ -558,6 +558,8 @@ See `config.example.jsonc` for detailed configuration options.
|
|||||||
| `jlptColors` | object | JLPT level underline colors object (`N1`..`N5`) |
|
| `jlptColors` | object | JLPT level underline colors object (`N1`..`N5`) |
|
||||||
| `secondary` | object | Override any of the above for secondary subtitles (optional) |
|
| `secondary` | object | Override any of the above for secondary subtitles (optional) |
|
||||||
|
|
||||||
|
JLPT underlining is powered by offline term-meta bank files at runtime. See [`docs/jlpt-vocab-bundle.md`](jlpt-vocab-bundle.md) for required files, source/version refresh steps, and deterministic fallback behavior.
|
||||||
|
|
||||||
Secondary subtitle defaults: `fontSize: 24`, `fontColor: "#ffffff"`, `backgroundColor: "transparent"`. Any property not set in `secondary` falls back to the CSS defaults.
|
Secondary subtitle defaults: `fontSize: 24`, `fontColor: "#ffffff"`, `backgroundColor: "transparent"`. Any property not set in `secondary` falls back to the CSS defaults.
|
||||||
|
|
||||||
**See `config.example.jsonc`** for the complete list of subtitle style configuration options.
|
**See `config.example.jsonc`** for the complete list of subtitle style configuration options.
|
||||||
|
|||||||
59
docs/jlpt-vocab-bundle.md
Normal file
59
docs/jlpt-vocab-bundle.md
Normal file
@@ -0,0 +1,59 @@
|
|||||||
|
# JLPT Vocabulary Bundle (Offline)
|
||||||
|
|
||||||
|
## Bundle location
|
||||||
|
|
||||||
|
SubMiner expects the JLPT term-meta bank files to be available locally at:
|
||||||
|
|
||||||
|
- `vendor/yomitan-jlpt-vocab`
|
||||||
|
|
||||||
|
At runtime, SubMiner searches these candidate locations, which include the path above:
|
||||||
|
|
||||||
|
- `vendor/yomitan-jlpt-vocab`
|
||||||
|
- `vendor/yomitan-jlpt-vocab/vendor/yomitan-jlpt-vocab`
|
||||||
|
- `vendor/yomitan-jlpt-vocab/yomitan-jlpt-vocab`
|
||||||
|
|
||||||
|
It also checks user-data/config fallback paths (see `getJlptDictionarySearchPaths` in `src/main.ts`).
|
||||||
|
|
||||||
|
## Required files
|
||||||
|
|
||||||
|
The expected files are:
|
||||||
|
|
||||||
|
- `term_meta_bank_1.json`
|
||||||
|
- `term_meta_bank_2.json`
|
||||||
|
- `term_meta_bank_3.json`
|
||||||
|
- `term_meta_bank_4.json`
|
||||||
|
- `term_meta_bank_5.json`
|
||||||
|
|
||||||
|
Each bank maps terms to frequency metadata; only entries with a `frequency.displayValue` are considered for JLPT tagging.
|
||||||
|
|
||||||
|
## Source and update process
|
||||||
|
|
||||||
|
For reproducible updates:
|
||||||
|
|
||||||
|
1. Obtain the JLPT term-meta bank archive from the same upstream source that supplies the bundled Yomitan dictionary data.
|
||||||
|
2. Extract the five `term_meta_bank_*.json` files.
|
||||||
|
3. Place them into `vendor/yomitan-jlpt-vocab/`.
|
||||||
|
4. Commit the update and record the source URL/version in the task notes.
|
||||||
|
|
||||||
|
The packaged app currently ships this folder through the `electron-builder` `extraResources` mapping:
|
||||||
|
`vendor/yomitan-jlpt-vocab -> yomitan-jlpt-vocab`.
|
||||||
|
|
||||||
|
## Deterministic fallback behavior on malformed inputs
|
||||||
|
|
||||||
|
`createJlptVocabularyLookupService()` follows these rules:
|
||||||
|
|
||||||
|
- If a bank file is missing, parsing fails, or the JSON shape is unsupported, that file is skipped and processing continues.
|
||||||
|
- If entries do not expose expected frequency metadata, they are skipped.
|
||||||
|
- If no usable bank entries are found, SubMiner initializes a no-op JLPT lookup (`null` for every token).
|
||||||
|
- In all fallback cases, subtitle rendering remains unchanged (no underlines are added).
|
||||||
|
|
||||||
|
## Bundle size and startup cost
|
||||||
|
|
||||||
|
Loading is currently a synchronous file read and JSON parse at enable-time; after that, lookups during subtitle updates are O(1) in-memory `Map` reads.
|
||||||
|
|
||||||
|
Practical guidance:
|
||||||
|
|
||||||
|
- Keep the JLPT bundle inside `vendor/yomitan-jlpt-vocab` to avoid network lookups.
|
||||||
|
- Measure bundle size with:
|
||||||
|
- `du -sh vendor/yomitan-jlpt-vocab`
|
||||||
|
- If the JLPT source is updated, re-run packaging (for example `pnpm run build:appimage`) and confirm startup logs do not report missing banks.
|
||||||
@@ -97,6 +97,7 @@ function collectDictionaryFromPath(
|
|||||||
for (const bank of JLPT_BANK_FILES) {
|
for (const bank of JLPT_BANK_FILES) {
|
||||||
const bankPath = path.join(dictionaryPath, bank.filename);
|
const bankPath = path.join(dictionaryPath, bank.filename);
|
||||||
if (!fs.existsSync(bankPath)) {
|
if (!fs.existsSync(bankPath)) {
|
||||||
|
log(`JLPT bank file missing for ${bank.level}: ${bankPath}`);
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -104,6 +105,7 @@ function collectDictionaryFromPath(
|
|||||||
try {
|
try {
|
||||||
rawText = fs.readFileSync(bankPath, "utf-8");
|
rawText = fs.readFileSync(bankPath, "utf-8");
|
||||||
} catch {
|
} catch {
|
||||||
|
log(`Failed to read JLPT bank file ${bankPath}`);
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -111,10 +113,22 @@ function collectDictionaryFromPath(
|
|||||||
try {
|
try {
|
||||||
rawEntries = JSON.parse(rawText) as unknown;
|
rawEntries = JSON.parse(rawText) as unknown;
|
||||||
} catch {
|
} catch {
|
||||||
|
log(`Failed to parse JLPT bank file as JSON: ${bankPath}`);
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (!Array.isArray(rawEntries)) {
|
||||||
|
log(
|
||||||
|
`JLPT bank file has unsupported format (expected JSON array): ${bankPath}`,
|
||||||
|
);
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
const beforeSize = terms.size;
|
||||||
addEntriesToMap(rawEntries, bank.level, terms, log);
|
addEntriesToMap(rawEntries, bank.level, terms, log);
|
||||||
|
if (terms.size === beforeSize) {
|
||||||
|
log(`JLPT bank file contained no extractable entries: ${bankPath}`);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
return terms;
|
return terms;
|
||||||
@@ -124,8 +138,9 @@ export async function createJlptVocabularyLookupService(
|
|||||||
options: JlptVocabLookupOptions,
|
options: JlptVocabLookupOptions,
|
||||||
): Promise<(term: string) => JlptLevel | null> {
|
): Promise<(term: string) => JlptLevel | null> {
|
||||||
const attemptedPaths: string[] = [];
|
const attemptedPaths: string[] = [];
|
||||||
let foundDirectoryCount = 0;
|
let foundDictionaryPathCount = 0;
|
||||||
let foundBankCount = 0;
|
let foundBankCount = 0;
|
||||||
|
const resolvedBanks: string[] = [];
|
||||||
for (const dictionaryPath of options.searchPaths) {
|
for (const dictionaryPath of options.searchPaths) {
|
||||||
attemptedPaths.push(dictionaryPath);
|
attemptedPaths.push(dictionaryPath);
|
||||||
if (!fs.existsSync(dictionaryPath)) {
|
if (!fs.existsSync(dictionaryPath)) {
|
||||||
@@ -136,10 +151,11 @@ export async function createJlptVocabularyLookupService(
|
|||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
foundDirectoryCount += 1;
|
foundDictionaryPathCount += 1;
|
||||||
|
|
||||||
const terms = collectDictionaryFromPath(dictionaryPath, options.log);
|
const terms = collectDictionaryFromPath(dictionaryPath, options.log);
|
||||||
if (terms.size > 0) {
|
if (terms.size > 0) {
|
||||||
|
resolvedBanks.push(dictionaryPath);
|
||||||
foundBankCount += 1;
|
foundBankCount += 1;
|
||||||
options.log(
|
options.log(
|
||||||
`JLPT dictionary loaded from ${dictionaryPath} (${terms.size} entries)`,
|
`JLPT dictionary loaded from ${dictionaryPath} (${terms.size} entries)`,
|
||||||
@@ -159,10 +175,13 @@ export async function createJlptVocabularyLookupService(
|
|||||||
options.log(
|
options.log(
|
||||||
`JLPT dictionary not found. Searched ${attemptedPaths.length} candidate path(s): ${attemptedPaths.join(", ")}`,
|
`JLPT dictionary not found. Searched ${attemptedPaths.length} candidate path(s): ${attemptedPaths.join(", ")}`,
|
||||||
);
|
);
|
||||||
if (foundDirectoryCount > 0 && foundBankCount === 0) {
|
if (foundDictionaryPathCount > 0 && foundBankCount === 0) {
|
||||||
options.log(
|
options.log(
|
||||||
"JLPT dictionary directories found, but none contained valid term_meta_bank_*.json files.",
|
"JLPT dictionary directories found, but none contained valid term_meta_bank_*.json files.",
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
if (resolvedBanks.length > 0 && foundBankCount > 0) {
|
||||||
|
options.log(`JLPT dictionary search matched path(s): ${resolvedBanks.join(", ")}`);
|
||||||
|
}
|
||||||
return NOOP_LOOKUP;
|
return NOOP_LOOKUP;
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -32,6 +32,12 @@ type YomitanParseLine = YomitanParseSegment[];
|
|||||||
const KATAKANA_TO_HIRAGANA_OFFSET = 0x60;
|
const KATAKANA_TO_HIRAGANA_OFFSET = 0x60;
|
||||||
const KATAKANA_CODEPOINT_START = 0x30a1;
|
const KATAKANA_CODEPOINT_START = 0x30a1;
|
||||||
const KATAKANA_CODEPOINT_END = 0x30f6;
|
const KATAKANA_CODEPOINT_END = 0x30f6;
|
||||||
|
const JLPT_LEVEL_LOOKUP_CACHE_LIMIT = 2048;
|
||||||
|
|
||||||
|
const jlptLevelLookupCaches = new WeakMap<
|
||||||
|
(text: string) => JlptLevel | null,
|
||||||
|
Map<string, JlptLevel | null>
|
||||||
|
>();
|
||||||
|
|
||||||
function isObject(value: unknown): value is Record<string, unknown> {
|
function isObject(value: unknown): value is Record<string, unknown> {
|
||||||
return Boolean(value && typeof value === "object");
|
return Boolean(value && typeof value === "object");
|
||||||
@@ -75,6 +81,43 @@ export interface TokenizerDepsRuntimeOptions {
|
|||||||
getMecabTokenizer: () => MecabTokenizerLike | null;
|
getMecabTokenizer: () => MecabTokenizerLike | null;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Memoized wrapper around a JLPT level lookup function.
 *
 * Trims `lookupText` and returns `null` without calling `getJlptLevel` when the
 * trimmed text is empty. Results are cached per `getJlptLevel` function in
 * `jlptLevelLookupCaches`, keyed by the trimmed text; `null` results are cached too.
 *
 * @param lookupText - Text to look up; it is trimmed before use.
 * @param getJlptLevel - Lookup function; it also selects which cache is used.
 * @returns The cached or freshly looked-up level, or `null` when the text is
 *   empty, the lookup returns `null`, or the lookup throws.
 */
function getCachedJlptLevel(
|
||||||
|
lookupText: string,
|
||||||
|
getJlptLevel: (text: string) => JlptLevel | null,
|
||||||
|
): JlptLevel | null {
|
||||||
|
const normalizedText = lookupText.trim();
|
||||||
|
if (!normalizedText) {
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
|
||||||
|
let cache = jlptLevelLookupCaches.get(getJlptLevel);
|
||||||
|
if (!cache) {
|
||||||
|
cache = new Map<string, JlptLevel | null>();
|
||||||
|
jlptLevelLookupCaches.set(getJlptLevel, cache);
|
||||||
|
}
|
||||||
|
|
||||||
|
// has() is checked first so that a cached null counts as a hit.
if (cache.has(normalizedText)) {
|
||||||
|
return cache.get(normalizedText) ?? null;
|
||||||
|
}
|
||||||
|
|
||||||
|
let level: JlptLevel | null;
|
||||||
|
// A throwing lookup is treated as "no level"; the resulting null is cached below.
try {
|
||||||
|
level = getJlptLevel(normalizedText);
|
||||||
|
} catch {
|
||||||
|
level = null;
|
||||||
|
}
|
||||||
|
|
||||||
|
cache.set(normalizedText, level);
|
||||||
|
// Evict from the front of the Map (oldest insertion first) until the cache is
// within JLPT_LEVEL_LOOKUP_CACHE_LIMIT.
while (cache.size > JLPT_LEVEL_LOOKUP_CACHE_LIMIT) {
|
||||||
|
const firstKey = cache.keys().next().value;
|
||||||
|
if (firstKey !== undefined) {
|
||||||
|
cache.delete(firstKey);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return level;
|
||||||
|
}
|
||||||
|
|
||||||
export function createTokenizerDepsRuntimeService(
|
export function createTokenizerDepsRuntimeService(
|
||||||
options: TokenizerDepsRuntimeOptions,
|
options: TokenizerDepsRuntimeOptions,
|
||||||
): TokenizerServiceDeps {
|
): TokenizerServiceDeps {
|
||||||
@@ -326,8 +369,12 @@ function applyJlptMarking(
|
|||||||
return { ...token, jlptLevel: undefined };
|
return { ...token, jlptLevel: undefined };
|
||||||
}
|
}
|
||||||
|
|
||||||
const primaryLevel = getJlptLevel(resolveJlptLookupText(token));
|
const primaryLevel = getCachedJlptLevel(
|
||||||
const fallbackLevel = getJlptLevel(token.surface);
|
resolveJlptLookupText(token),
|
||||||
|
getJlptLevel,
|
||||||
|
);
|
||||||
|
const fallbackLevel =
|
||||||
|
primaryLevel === null ? getCachedJlptLevel(token.surface, getJlptLevel) : null;
|
||||||
|
|
||||||
return {
|
return {
|
||||||
...token,
|
...token,
|
||||||
|
|||||||
Reference in New Issue
Block a user