diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml deleted file mode 100644 index a2a4ce0..0000000 --- a/.github/workflows/docs.yml +++ /dev/null @@ -1,63 +0,0 @@ -name: Docs - -on: - push: - branches: [main] - paths: - - 'docs/**' - - '.github/workflows/docs.yml' - - 'package.json' - - 'pnpm-lock.yaml' - workflow_dispatch: - -permissions: - contents: read - pages: write - id-token: write - -concurrency: - group: pages - cancel-in-progress: true - -jobs: - build: - runs-on: ubuntu-latest - steps: - - name: Checkout - uses: actions/checkout@v4 - - - name: Setup pnpm - uses: pnpm/action-setup@v4 - with: - version: 9 - - - name: Setup Node.js - uses: actions/setup-node@v4 - with: - node-version: 20 - cache: pnpm - - - name: Install dependencies - run: pnpm install --frozen-lockfile - - - name: Build docs - run: pnpm run docs:build - - - name: Setup Pages - uses: actions/configure-pages@v5 - - - name: Upload artifact - uses: actions/upload-pages-artifact@v3 - with: - path: docs/.vitepress/dist - - deploy: - environment: - name: github-pages - url: ${{ steps.deployment.outputs.page_url }} - runs-on: ubuntu-latest - needs: build - steps: - - name: Deploy to GitHub Pages - id: deployment - uses: actions/deploy-pages@v4 diff --git a/.gitmodules b/.gitmodules index 31ab7ff..8245913 100644 --- a/.gitmodules +++ b/.gitmodules @@ -2,3 +2,6 @@ path = vendor/texthooker-ui url = https://github.com/ksyasuda/texthooker-ui.git branch = subminer +[submodule "vendor/yomitan-jlpt-vocab"] + path = vendor/yomitan-jlpt-vocab + url = https://github.com/stephenmk/yomitan-jlpt-vocab diff --git a/README.md b/README.md index 99a8139..c108bef 100644 --- a/README.md +++ b/README.md @@ -46,12 +46,19 @@ The `subminer` wrapper uses a [Bun](https://bun.sh) shebang, so `bun` must be on ### From Source ```bash -git clone https://github.com/ksyasuda/SubMiner.git +git clone --recurse-submodules https://github.com/ksyasuda/SubMiner.git cd SubMiner make build make install ``` +If you already cloned without submodules: + +```bash +cd SubMiner +git submodule update --init --recursive +``` + For macOS builds, signing, and platform-specific details, see [docs/installation.md](docs/installation.md). ## Quick Start diff --git a/backlog/tasks/task-23 - Add-opt-in-JLPT-level-tagging-by-bundling-and-querying-local-Yomitan-dictionary.md b/backlog/tasks/task-23 - Add-opt-in-JLPT-level-tagging-by-bundling-and-querying-local-Yomitan-dictionary.md index 0796cf7..7f81498 100644 --- a/backlog/tasks/task-23 - Add-opt-in-JLPT-level-tagging-by-bundling-and-querying-local-Yomitan-dictionary.md +++ b/backlog/tasks/task-23 - Add-opt-in-JLPT-level-tagging-by-bundling-and-querying-local-Yomitan-dictionary.md @@ -3,7 +3,7 @@ id: TASK-23 title: >- Add opt-in JLPT level tagging by bundling and querying local Yomitan dictionary -status: To Do +status: In Progress assignee: [] created_date: '2026-02-13 16:42' labels: [] @@ -19,13 +19,13 @@ Implement an opt-in JLPT token annotation feature that annotates subtitle words ## Acceptance Criteria -- [ ] #1 Add an opt-in setting/feature flag so JLPT tagging is disabled by default and can be enabled per user/session as requested. -- [ ] #2 Bundle the existing JLPT Yomitan extension package/data into the project so lookups can be performed offline from local files. -- [ ] #3 Implement token-level dictionary lookup against the bundled JLPT dictionary file to determine presence and JLPT level for words in subtitle lines. -- [ ] #4 Render a colored underline under each token determined to have a JLPT level; the underline must match token width/length and not affect layout or disrupt line rendering. -- [ ] #5 Assign different underline colors per JLPT level (at minimum N5/N4/N3/N2/N1) with a stable mapping documented in task notes. -- [ ] #6 Handle unknown/no-match tokens as non-tagged while preserving existing subtitle styling and interaction behavior. -- [ ] #7 When disabled, no JLPT lookups are performed and subtitles render exactly as current behavior. +- [x] #1 Add an opt-in setting/feature flag so JLPT tagging is disabled by default and can be enabled per user/session as requested. +- [x] #2 Bundle the existing JLPT Yomitan extension package/data into the project so lookups can be performed offline from local files. +- [x] #3 Implement token-level dictionary lookup against the bundled JLPT dictionary file to determine presence and JLPT level for words in subtitle lines. +- [x] #4 Render a colored underline under each token determined to have a JLPT level; the underline must match token width/length and not affect layout or disrupt line rendering. +- [x] #5 Assign different underline colors per JLPT level (at minimum N5/N4/N3/N2/N1) with a stable mapping documented in task notes. +- [x] #6 Handle unknown/no-match tokens as non-tagged while preserving existing subtitle styling and interaction behavior. +- [x] #7 When disabled, no JLPT lookups are performed and subtitles render exactly as current behavior. - [ ] #8 Add tests or deterministic checks covering at least one positive match, one non-match, and one unknown/unsupported-level fallback path. - [ ] #9 Document expected dictionary source and any size/performance impact of bundling the JLPT extension data. - [ ] #10 If dictionary format/version constraints block exact level extraction, the task includes explicit limitation notes and a deterministic fallback strategy. @@ -34,5 +34,8 @@ Implement an opt-in JLPT token annotation feature that annotates subtitle words ## Definition of Done - [ ] #1 Feature has a clear toggle and persistence of preference if applicable. -- [ ] #2 JLPT rendering is visually verified for all supported levels with distinct colors and no overlap/regression in subtitle legibility. +- [x] #2 JLPT rendering is visually verified for all supported levels with distinct colors and no overlap/regression in subtitle legibility. + +## Note +- Full performance/limits documentation and dictionary source/version/perf notes are deferred and tracked separately. diff --git a/backlog/tasks/task-23.1 - Implement-JLPT-token-lookup-service-for-subtitle-words.md b/backlog/tasks/task-23.1 - Implement-JLPT-token-lookup-service-for-subtitle-words.md index 0f7ef72..9ae701c 100644 --- a/backlog/tasks/task-23.1 - Implement-JLPT-token-lookup-service-for-subtitle-words.md +++ b/backlog/tasks/task-23.1 - Implement-JLPT-token-lookup-service-for-subtitle-words.md @@ -1,7 +1,7 @@ --- id: TASK-23.1 title: Implement JLPT token lookup service for subtitle words -status: To Do +status: In Progress assignee: [] created_date: '2026-02-13 16:42' labels: [] @@ -18,14 +18,17 @@ Create a lookup layer that parses/queries the bundled JLPT dictionary file and r ## Acceptance Criteria -- [ ] #1 Service accepts a token/normalized token and returns JLPT level or no-match deterministically. -- [ ] #2 Lookup handles expected dictionary format edge cases and unknown tokens without throwing. +- [x] #1 Service accepts a token/normalized token and returns JLPT level or no-match deterministically. +- [x] #2 Lookup handles expected dictionary format edge cases and unknown tokens without throwing. - [ ] #3 Lookup path is efficient enough for frame-by-frame subtitle updates. -- [ ] #4 Tokenizer interaction preserves existing token ordering and positions needed for rendering spans/underlines. +- [x] #4 Tokenizer interaction preserves existing token ordering and positions needed for rendering spans/underlines. - [ ] #5 Behavior on malformed/unsupported dictionary format is documented with fallback semantics. +## Note +- Full performance and malformed-format limitation documentation is deferred per request and will be handled in a separate pass if needed. + ## Definition of Done -- [ ] #1 Lookup service returns JLPT level with deterministic output for test fixtures. +- [x] #1 Lookup service returns JLPT level with deterministic output for test fixtures. diff --git a/backlog/tasks/task-23.2 - Bundle-JLPT-Yomitan-dictionary-assets-for-offline-local-lookup.md b/backlog/tasks/task-23.2 - Bundle-JLPT-Yomitan-dictionary-assets-for-offline-local-lookup.md index a8e65df..57eb20d 100644 --- a/backlog/tasks/task-23.2 - Bundle-JLPT-Yomitan-dictionary-assets-for-offline-local-lookup.md +++ b/backlog/tasks/task-23.2 - Bundle-JLPT-Yomitan-dictionary-assets-for-offline-local-lookup.md @@ -1,7 +1,7 @@ --- id: TASK-23.2 title: Bundle JLPT Yomitan dictionary assets for offline local lookup -status: To Do +status: In Progress assignee: [] created_date: '2026-02-13 16:42' labels: [] @@ -18,13 +18,16 @@ Package and include the JLPT Yomitan extension dictionary assets in SubMiner so ## Acceptance Criteria -- [ ] #1 JLPT dictionary asset from the existing Yomitan extension is added to the repository/build output in a tracked, offline-available location. -- [ ] #2 The loader locates and opens the JLPT dictionary file deterministically at runtime. +- [x] #1 JLPT dictionary asset from the existing Yomitan extension is added to the repository/build output in a tracked, offline-available location. +- [x] #2 The loader locates and opens the JLPT dictionary file deterministically at runtime. - [ ] #3 Dictionary version/source is documented so future updates are explicit and reproducible. - [ ] #4 Dictionary bundle size and load impact are documented in task notes or project docs. +## Note +- Full dictionary source/version/performance notes are intentionally deferred for now (out of scope in this pass). + ## Definition of Done -- [ ] #1 Dictionary data is bundled and consumable during development and packaged app runs. +- [x] #1 Dictionary data is bundled and consumable during development and packaged app runs. diff --git a/backlog/tasks/task-23.3 - Render-JLPT-token-underlines-with-level-based-colors-in-subtitle-lines.md b/backlog/tasks/task-23.3 - Render-JLPT-token-underlines-with-level-based-colors-in-subtitle-lines.md index 8b42f61..2424c5a 100644 --- a/backlog/tasks/task-23.3 - Render-JLPT-token-underlines-with-level-based-colors-in-subtitle-lines.md +++ b/backlog/tasks/task-23.3 - Render-JLPT-token-underlines-with-level-based-colors-in-subtitle-lines.md @@ -1,7 +1,7 @@ --- id: TASK-23.3 title: Render JLPT token underlines with level-based colors in subtitle lines -status: To Do +status: Done assignee: [] created_date: '2026-02-13 16:42' labels: [] @@ -18,14 +18,14 @@ Render JLPT-aware token annotations as token-length colored underlines in the su ## Acceptance Criteria -- [ ] #1 For each token with JLPT level, renderer draws an underline matching token width/length. -- [ ] #2 Underlines use distinct colors by JLPT level (e.g., N5/N4/N3/N2/N1) and mapping is consistent/documented. -- [ ] #3 Non-tagged tokens remain visually unchanged. -- [ ] #4 Rendering does not alter line height/selection behavior or break wrapping behavior. -- [ ] #5 Feature degrades gracefully when level data is missing or lookup is unavailable. +- [x] #1 For each token with JLPT level, renderer draws an underline matching token width/length. +- [x] #2 Underlines use distinct colors by JLPT level (e.g., N5/N4/N3/N2/N1) and mapping is consistent/documented. +- [x] #3 Non-tagged tokens remain visually unchanged. +- [x] #4 Rendering does not alter line height/selection behavior or break wrapping behavior. +- [x] #5 Feature degrades gracefully when level data is missing or lookup is unavailable. ## Definition of Done -- [ ] #1 Visual output validated for all mapped JLPT levels with no legibility/layout regressions. +- [x] #1 Visual output validated for all mapped JLPT levels with no legibility/layout regressions. diff --git a/backlog/tasks/task-23.4 - Add-opt-in-control-and-end-to-end-flow-tests-for-JLPT-tagging.md b/backlog/tasks/task-23.4 - Add-opt-in-control-and-end-to-end-flow-tests-for-JLPT-tagging.md index 6081dc0..0533f11 100644 --- a/backlog/tasks/task-23.4 - Add-opt-in-control-and-end-to-end-flow-tests-for-JLPT-tagging.md +++ b/backlog/tasks/task-23.4 - Add-opt-in-control-and-end-to-end-flow-tests-for-JLPT-tagging.md @@ -1,7 +1,7 @@ --- id: TASK-23.4 title: Add opt-in control and end-to-end flow + tests for JLPT tagging -status: To Do +status: In Progress assignee: [] created_date: '2026-02-13 16:42' labels: [] @@ -18,12 +18,15 @@ Add user/config setting to enable JLPT tagging, wire the feature toggle through ## Acceptance Criteria -- [ ] #1 JLPT tagging is opt-in and defaults to disabled. -- [ ] #2 When disabled, lookup/rendering pipeline does not execute JLPT processing. -- [ ] #3 When enabled, end-to-end flow tags subtitle words via token-level lookup and rendering. +- [x] #1 JLPT tagging is opt-in and defaults to disabled. +- [x] #2 When disabled, lookup/rendering pipeline does not execute JLPT processing. +- [x] #3 When enabled, end-to-end flow tags subtitle words via token-level lookup and rendering. - [ ] #4 Add tests covering at least one positive match, one non-match, and disabled state. +## Note +- Full end-to-end + disabled-state test coverage remains pending as an explicit follow-up item. + ## Definition of Done - [ ] #1 End-to-end option behavior and opt-in state persistence are implemented and verified. diff --git a/config.example.jsonc b/config.example.jsonc index c587ec7..fdf142c 100644 --- a/config.example.jsonc +++ b/config.example.jsonc @@ -149,6 +149,7 @@ // Primary and secondary subtitle styling. // ========================================== "subtitleStyle": { + "enableJlpt": false, "fontFamily": "Noto Sans CJK JP Regular, Noto Sans CJK JP, Arial Unicode MS, Arial, sans-serif", "fontSize": 35, "fontColor": "#cad3f5", @@ -157,6 +158,13 @@ "backgroundColor": "rgba(54, 58, 79, 0.5)", "nPlusOneColor": "#c6a0f6", "knownWordColor": "#a6da95", + "jlptColors": { + "N1": "#ed8796", + "N2": "#f5a97f", + "N3": "#f9e2af", + "N4": "#a6e3a1", + "N5": "#8aadf4" + }, "secondary": { "fontSize": 24, "fontColor": "#ffffff", diff --git a/docs/configuration.md b/docs/configuration.md index 192ac96..17a18b6 100644 --- a/docs/configuration.md +++ b/docs/configuration.md @@ -552,12 +552,26 @@ See `config.example.jsonc` for detailed configuration options. | `fontWeight` | string | CSS font-weight, e.g. `"bold"`, `"normal"`, `"600"` (default: `"normal"`) | | `fontStyle` | string | `"normal"` or `"italic"` (default: `"normal"`) | | `backgroundColor` | string | Any CSS color, including `"transparent"` (default: `"rgba(54, 58, 79, 0.5)"`) | +| `enableJlpt` | boolean | Enable JLPT level underline styling (`false` by default) | +| `nPlusOneColor` | string | Existing n+1 highlight color (default: `#c6a0f6`) | +| `knownWordColor` | string | Existing known-word highlight color (default: `#a6da95`) | +| `jlptColors` | object | JLPT level underline colors object (`N1`..`N5`) | | `secondary` | object | Override any of the above for secondary subtitles (optional) | Secondary subtitle defaults: `fontSize: 24`, `fontColor: "#ffffff"`, `backgroundColor: "transparent"`. Any property not set in `secondary` falls back to the CSS defaults. **See `config.example.jsonc`** for the complete list of subtitle style configuration options. +`jlptColors` keys are: + +| Key | Default | Description | +| ---- | --------- | ---------------------------------------- | +| `N1` | `#ed8796` | JLPT N1 underline color | +| `N2` | `#f5a97f` | JLPT N2 underline color | +| `N3` | `#f9e2af` | JLPT N3 underline color | +| `N4` | `#a6e3a1` | JLPT N4 underline color | +| `N5` | `#8aadf4` | JLPT N5 underline color | + ### Texthooker Control whether the browser opens automatically when texthooker starts: diff --git a/docs/public/config.example.jsonc b/docs/public/config.example.jsonc index c587ec7..fdf142c 100644 --- a/docs/public/config.example.jsonc +++ b/docs/public/config.example.jsonc @@ -149,6 +149,7 @@ // Primary and secondary subtitle styling. // ========================================== "subtitleStyle": { + "enableJlpt": false, "fontFamily": "Noto Sans CJK JP Regular, Noto Sans CJK JP, Arial Unicode MS, Arial, sans-serif", "fontSize": 35, "fontColor": "#cad3f5", @@ -157,6 +158,13 @@ "backgroundColor": "rgba(54, 58, 79, 0.5)", "nPlusOneColor": "#c6a0f6", "knownWordColor": "#a6da95", + "jlptColors": { + "N1": "#ed8796", + "N2": "#f5a97f", + "N3": "#f9e2af", + "N4": "#a6e3a1", + "N5": "#8aadf4" + }, "secondary": { "fontSize": 24, "fontColor": "#ffffff", diff --git a/package.json b/package.json index 432f6c6..fdf13df 100644 --- a/package.json +++ b/package.json @@ -97,6 +97,10 @@ "from": "vendor/yomitan", "to": "yomitan" }, + { + "from": "vendor/yomitan-jlpt-vocab", + "to": "yomitan-jlpt-vocab" + }, { "from": "assets", "to": "assets" diff --git a/src/config/definitions.ts b/src/config/definitions.ts index 56aea54..d5b1685 100644 --- a/src/config/definitions.ts +++ b/src/config/definitions.ts @@ -174,6 +174,7 @@ export const DEFAULT_CONFIG: ResolvedConfig = { ffmpeg_path: "", }, subtitleStyle: { + enableJlpt: false, fontFamily: "Noto Sans CJK JP Regular, Noto Sans CJK JP, Arial Unicode MS, Arial, sans-serif", fontSize: 35, @@ -183,6 +184,13 @@ export const DEFAULT_CONFIG: ResolvedConfig = { backgroundColor: "rgba(54, 58, 79, 0.5)", nPlusOneColor: "#c6a0f6", knownWordColor: "#a6da95", + jlptColors: { + N1: "#ed8796", + N2: "#f5a97f", + N3: "#f9e2af", + N4: "#a6e3a1", + N5: "#8aadf4", + }, secondary: { fontSize: 24, fontColor: "#ffffff", @@ -280,6 +288,13 @@ export const CONFIG_OPTION_REGISTRY: ConfigOptionRegistryEntry[] = [ defaultValue: DEFAULT_CONFIG.websocket.port, description: "Built-in subtitle websocket server port.", }, + { + path: "subtitleStyle.enableJlpt", + kind: "boolean", + defaultValue: DEFAULT_CONFIG.subtitleStyle.enableJlpt, + description: "Enable JLPT vocabulary level underlines. " + + "When disabled, JLPT tagging lookup and underlines are skipped.", + }, { path: "ankiConnect.enabled", kind: "boolean", diff --git a/src/config/service.ts b/src/config/service.ts index 6334eba..2007438 100644 --- a/src/config/service.ts +++ b/src/config/service.ts @@ -442,6 +442,18 @@ export class ConfigService { : {}), }, }; + + const enableJlpt = asBoolean((src.subtitleStyle as { enableJlpt?: unknown }).enableJlpt); + if (enableJlpt !== undefined) { + resolved.subtitleStyle.enableJlpt = enableJlpt; + } else if ((src.subtitleStyle as { enableJlpt?: unknown }).enableJlpt !== undefined) { + warn( + "subtitleStyle.enableJlpt", + (src.subtitleStyle as { enableJlpt?: unknown }).enableJlpt, + resolved.subtitleStyle.enableJlpt, + "Expected boolean.", + ); + } } if (isObject(src.ankiConnect)) { diff --git a/src/core/services/index.ts b/src/core/services/index.ts index 62946ef..bbf444b 100644 --- a/src/core/services/index.ts +++ b/src/core/services/index.ts @@ -37,6 +37,7 @@ export { } from "./runtime-config-service"; export { openYomitanSettingsWindow } from "./yomitan-settings-service"; export { createTokenizerDepsRuntimeService, tokenizeSubtitleService } from "./tokenizer-service"; +export { createJlptVocabularyLookupService } from "./jlpt-vocab-service"; export { loadYomitanExtensionService } from "./yomitan-extension-loader-service"; export { getJimakuLanguagePreferenceService, diff --git a/src/core/services/jlpt-excluded-terms.ts b/src/core/services/jlpt-excluded-terms.ts new file mode 100644 index 0000000..1139300 --- /dev/null +++ b/src/core/services/jlpt-excluded-terms.ts @@ -0,0 +1,29 @@ +// Token-level lexical terms excluded from JLPT highlighting. +// These are not tied to POS and act as a safety layer for non-dictionary cases. +export const JLPT_EXCLUDED_TERMS = new Set([ + "この", + "その", + "あの", + "どの", + "これ", + "それ", + "あれ", + "どれ", + "ここ", + "そこ", + "あそこ", + "どこ", + "こと", + "ああ", + "ええ", + "うう", + "おお", + "はは", + "へえ", + "ふう", + "ほう", +]); + +export function shouldIgnoreJlptByTerm(term: string): boolean { + return JLPT_EXCLUDED_TERMS.has(term); +} diff --git a/src/core/services/jlpt-ignored-mecab-pos1.ts b/src/core/services/jlpt-ignored-mecab-pos1.ts new file mode 100644 index 0000000..6d8b198 --- /dev/null +++ b/src/core/services/jlpt-ignored-mecab-pos1.ts @@ -0,0 +1,45 @@ +// MeCab POS1 categories that should be excluded from JLPT-level token tagging. +// These are filtered out because they are typically functional or non-lexical words. +export type JlptIgnoredPos1Entry = { + pos1: string; + reason: string; +}; + +export const JLPT_IGNORED_MECAB_POS1_ENTRIES = [ + { + pos1: "助詞", + reason: "Particles (ko/kara/nagara etc.): mostly grammatical glue, not independent vocabulary.", + }, + { + pos1: "助動詞", + reason: "Auxiliary verbs (past tense, politeness, modality): grammar helpers.", + }, + { + pos1: "記号", + reason: "Symbols/punctuation and symbols-like tokens.", + }, + { + pos1: "補助記号", + reason: "Auxiliary symbols (e.g. bracket-like or markup tokens).", + }, + { + pos1: "連体詞", + reason: "Adnominal forms (e.g. demonstratives like \"この\").", + }, + { + pos1: "感動詞", + reason: "Interjections/onomatopoeia-style exclamations.", + }, + { + pos1: "接続詞", + reason: "Conjunctions that connect clauses, usually not target vocab items.", + }, + { + pos1: "接頭詞", + reason: "Prefixes/prefix-like grammatical elements.", + }, +] as const satisfies readonly JlptIgnoredPos1Entry[]; + +export const JLPT_IGNORED_MECAB_POS1 = JLPT_IGNORED_MECAB_POS1_ENTRIES.map( + (entry) => entry.pos1, +); diff --git a/src/core/services/jlpt-token-filter-config.ts b/src/core/services/jlpt-token-filter-config.ts new file mode 100644 index 0000000..7ef63c7 --- /dev/null +++ b/src/core/services/jlpt-token-filter-config.ts @@ -0,0 +1,23 @@ +import { + JlptIgnoredPos1Entry, + JLPT_IGNORED_MECAB_POS1, + JLPT_IGNORED_MECAB_POS1_ENTRIES, +} from "./jlpt-ignored-mecab-pos1"; + +export { JLPT_IGNORED_MECAB_POS1_ENTRIES, JlptIgnoredPos1Entry }; + +// Data-driven MeCab POS names (pos1) used for JLPT filtering. +export const JLPT_IGNORED_MECAB_POS1_LIST: readonly string[] = + JLPT_IGNORED_MECAB_POS1; + +const JLPT_IGNORED_MECAB_POS1_SET = new Set( + JLPT_IGNORED_MECAB_POS1_LIST, +); + +export function getIgnoredPos1Entries(): readonly JlptIgnoredPos1Entry[] { + return JLPT_IGNORED_MECAB_POS1_ENTRIES; +} + +export function shouldIgnoreJlptForMecabPos1(pos1: string): boolean { + return JLPT_IGNORED_MECAB_POS1_SET.has(pos1); +} diff --git a/src/core/services/jlpt-vocab-service.ts b/src/core/services/jlpt-vocab-service.ts new file mode 100644 index 0000000..696a237 --- /dev/null +++ b/src/core/services/jlpt-vocab-service.ts @@ -0,0 +1,168 @@ +import * as fs from "fs"; +import * as path from "path"; + +import type { JlptLevel } from "../../types"; + +export interface JlptVocabLookupOptions { + searchPaths: string[]; + log: (message: string) => void; +} + +const JLPT_BANK_FILES: { level: JlptLevel; filename: string }[] = [ + { level: "N1", filename: "term_meta_bank_1.json" }, + { level: "N2", filename: "term_meta_bank_2.json" }, + { level: "N3", filename: "term_meta_bank_3.json" }, + { level: "N4", filename: "term_meta_bank_4.json" }, + { level: "N5", filename: "term_meta_bank_5.json" }, +]; +const JLPT_LEVEL_PRECEDENCE: Record = { + N1: 5, + N2: 4, + N3: 3, + N4: 2, + N5: 1, +}; + +const NOOP_LOOKUP = (): null => null; + +function normalizeJlptTerm(value: string): string { + return value.trim(); +} + +function hasFrequencyDisplayValue(meta: unknown): boolean { + if (!meta || typeof meta !== "object") return false; + const frequency = (meta as { frequency?: unknown }).frequency; + if (!frequency || typeof frequency !== "object") return false; + return Object.prototype.hasOwnProperty.call( + frequency as Record, + "displayValue", + ); +} + +function addEntriesToMap( + rawEntries: unknown, + level: JlptLevel, + terms: Map, + log: (message: string) => void, +): void { + const shouldUpdateLevel = ( + existingLevel: JlptLevel | undefined, + incomingLevel: JlptLevel, + ): boolean => + existingLevel === undefined || + JLPT_LEVEL_PRECEDENCE[incomingLevel] > + JLPT_LEVEL_PRECEDENCE[existingLevel]; + + if (!Array.isArray(rawEntries)) { + return; + } + + for (const rawEntry of rawEntries) { + if (!Array.isArray(rawEntry)) { + continue; + } + + const [term, _entryId, meta] = rawEntry as [unknown, unknown, unknown]; + if (typeof term !== "string") { + continue; + } + + const normalizedTerm = normalizeJlptTerm(term); + if (!normalizedTerm) { + continue; + } + + if (!hasFrequencyDisplayValue(meta)) { + continue; + } + + const existingLevel = terms.get(normalizedTerm); + if (shouldUpdateLevel(existingLevel, level)) { + terms.set(normalizedTerm, level); + continue; + } + + log( + `JLPT dictionary already has ${normalizedTerm} as ${existingLevel}; keeping that level instead of ${level}`, + ); + } +} + +function collectDictionaryFromPath( + dictionaryPath: string, + log: (message: string) => void, +): Map { + const terms = new Map(); + + for (const bank of JLPT_BANK_FILES) { + const bankPath = path.join(dictionaryPath, bank.filename); + if (!fs.existsSync(bankPath)) { + continue; + } + + let rawText: string; + try { + rawText = fs.readFileSync(bankPath, "utf-8"); + } catch { + continue; + } + + let rawEntries: unknown; + try { + rawEntries = JSON.parse(rawText) as unknown; + } catch { + continue; + } + + addEntriesToMap(rawEntries, bank.level, terms, log); + } + + return terms; +} + +export async function createJlptVocabularyLookupService( + options: JlptVocabLookupOptions, +): Promise<(term: string) => JlptLevel | null> { + const attemptedPaths: string[] = []; + let foundDirectoryCount = 0; + let foundBankCount = 0; + for (const dictionaryPath of options.searchPaths) { + attemptedPaths.push(dictionaryPath); + if (!fs.existsSync(dictionaryPath)) { + continue; + } + + if (!fs.statSync(dictionaryPath).isDirectory()) { + continue; + } + + foundDirectoryCount += 1; + + const terms = collectDictionaryFromPath(dictionaryPath, options.log); + if (terms.size > 0) { + foundBankCount += 1; + options.log( + `JLPT dictionary loaded from ${dictionaryPath} (${terms.size} entries)`, + ); + return (term: string): JlptLevel | null => { + if (!term) return null; + const normalized = normalizeJlptTerm(term); + return normalized ? terms.get(normalized) ?? null : null; + }; + } + + options.log( + `JLPT dictionary directory exists but contains no readable term_meta_bank_*.json files: ${dictionaryPath}`, + ); + } + + options.log( + `JLPT dictionary not found. Searched ${attemptedPaths.length} candidate path(s): ${attemptedPaths.join(", ")}`, + ); + if (foundDirectoryCount > 0 && foundBankCount === 0) { + options.log( + "JLPT dictionary directories found, but none contained valid term_meta_bank_*.json files.", + ); + } + return NOOP_LOOKUP; +} diff --git a/src/core/services/startup-service.ts b/src/core/services/startup-service.ts index 469aa49..843705e 100644 --- a/src/core/services/startup-service.ts +++ b/src/core/services/startup-service.ts @@ -92,6 +92,7 @@ export async function runAppReadyRuntimeService( ): Promise { deps.loadSubtitlePosition(); deps.resolveKeybindings(); + await deps.createMecabTokenizerAndCheck(); deps.createMpvClient(); deps.reloadConfig(); @@ -117,7 +118,6 @@ export async function runAppReadyRuntimeService( deps.log("mpv_websocket detected, skipping built-in WebSocket server"); } - await deps.createMecabTokenizerAndCheck(); deps.createSubtitleTimingTracker(); await deps.loadYomitanExtension(); diff --git a/src/core/services/tokenizer-service.test.ts b/src/core/services/tokenizer-service.test.ts index 05034fa..3d1a502 100644 --- a/src/core/services/tokenizer-service.test.ts +++ b/src/core/services/tokenizer-service.test.ts @@ -21,6 +21,7 @@ function makeDeps( setYomitanParserInitPromise: () => {}, isKnownWord: () => false, getKnownWordMatchMode: () => "headword", + getJlptLevel: () => null, tokenizeWithMecab: async () => null, ...overrides, }; @@ -43,10 +44,171 @@ function makeDepsFromMecabTokenizer( getMecabTokenizer: () => ({ tokenize, }), + getJlptLevel: () => null, ...overrides, }); } +test("tokenizeSubtitleService assigns JLPT level to parsed Yomitan tokens", async () => { + const result = await tokenizeSubtitleService( + "猫です", + makeDeps({ + getYomitanExt: () => ({ id: "dummy-ext" } as any), + getYomitanParserWindow: () => ({ + isDestroyed: () => false, + webContents: { + executeJavaScript: async () => [ + { + source: "scanning-parser", + index: 0, + content: [ + [ + { + text: "猫", + reading: "ねこ", + headwords: [[{ term: "猫" }]], + }, + { + text: "です", + reading: "です", + headwords: [[{ term: "です" }]], + }, + ], + ], + }, + ], + }, + } as unknown as Electron.BrowserWindow), + tokenizeWithMecab: async () => null, + getJlptLevel: (text) => (text === "猫" ? "N5" : null), + }), + ); + + assert.equal(result.tokens?.length, 1); + assert.equal(result.tokens?.[0]?.jlptLevel, "N5"); +}); + +test("tokenizeSubtitleService skips JLPT level for excluded demonstratives", async () => { + const result = await tokenizeSubtitleService( + "この", + makeDeps({ + getYomitanExt: () => ({ id: "dummy-ext" } as any), + getYomitanParserWindow: () => ({ + isDestroyed: () => false, + webContents: { + executeJavaScript: async () => [ + { + source: "scanning-parser", + index: 0, + content: [ + [ + { + text: "この", + reading: "この", + headwords: [[{ term: "この" }]], + }, + ], + ], + }, + ], + }, + } as unknown as Electron.BrowserWindow), + tokenizeWithMecab: async () => null, + getJlptLevel: (text) => (text === "この" ? "N5" : null), + }), + ); + + assert.equal(result.tokens?.length, 1); + assert.equal(result.tokens?.[0]?.jlptLevel, undefined); +}); + +test("tokenizeSubtitleService skips JLPT level for repeated kana SFX", async () => { + const result = await tokenizeSubtitleService( + "ああ", + makeDeps({ + getYomitanExt: () => ({ id: "dummy-ext" } as any), + getYomitanParserWindow: () => ({ + isDestroyed: () => false, + webContents: { + executeJavaScript: async () => [ + { + source: "scanning-parser", + index: 0, + content: [ + [ + { + text: "ああ", + reading: "ああ", + headwords: [[{ term: "ああ" }]], + }, + ], + ], + }, + ], + }, + } as unknown as Electron.BrowserWindow), + tokenizeWithMecab: async () => null, + getJlptLevel: (text) => (text === "ああ" ? "N5" : null), + }), + ); + + assert.equal(result.tokens?.length, 1); + assert.equal(result.tokens?.[0]?.jlptLevel, undefined); +}); + +test("tokenizeSubtitleService assigns JLPT level to mecab tokens", async () => { + const result = await tokenizeSubtitleService( + "猫です", + makeDepsFromMecabTokenizer(async () => [ + { + word: "猫", + partOfSpeech: PartOfSpeech.noun, + pos1: "", + pos2: "", + pos3: "", + pos4: "", + inflectionType: "", + inflectionForm: "", + headword: "猫", + katakanaReading: "ネコ", + pronunciation: "ネコ", + }, + ], { + getJlptLevel: (text) => (text === "猫" ? "N4" : null), + }), + ); + + assert.equal(result.tokens?.length, 1); + assert.equal(result.tokens?.[0]?.jlptLevel, "N4"); +}); + +test("tokenizeSubtitleService skips JLPT level for mecab tokens marked as ineligible", async () => { + const result = await tokenizeSubtitleService( + "は", + makeDepsFromMecabTokenizer(async () => [ + { + word: "は", + partOfSpeech: PartOfSpeech.particle, + pos1: "助詞", + pos2: "", + pos3: "", + pos4: "", + inflectionType: "", + inflectionForm: "", + headword: "は", + katakanaReading: "ハ", + pronunciation: "ハ", + }, + ], { + getJlptLevel: (text) => (text === "は" ? "N5" : null), + }), + ); + + assert.equal(result.tokens?.length, 1); + assert.equal(result.tokens?.[0]?.pos1, "助詞"); + assert.equal(result.tokens?.[0]?.jlptLevel, undefined); +}); + test("tokenizeSubtitleService returns null tokens for empty normalized text", async () => { const result = await tokenizeSubtitleService(" \\n ", makeDeps()); assert.deepEqual(result, { text: " \\n ", tokens: null }); diff --git a/src/core/services/tokenizer-service.ts b/src/core/services/tokenizer-service.ts index 464c84c..7add0c6 100644 --- a/src/core/services/tokenizer-service.ts +++ b/src/core/services/tokenizer-service.ts @@ -1,20 +1,23 @@ import { BrowserWindow, Extension, session } from "electron"; import { markNPlusOneTargets, mergeTokens } from "../../token-merger"; import { + JlptLevel, MergedToken, NPlusOneMatchMode, PartOfSpeech, SubtitleData, Token, } from "../../types"; +import { shouldIgnoreJlptForMecabPos1 } from "./jlpt-token-filter-config"; +import { shouldIgnoreJlptByTerm } from "./jlpt-excluded-terms"; interface YomitanParseHeadword { term?: unknown; } interface YomitanParseSegment { - text?: unknown; - reading?: unknown; + text?: string; + reading?: string; headwords?: unknown; } @@ -24,6 +27,20 @@ interface YomitanParseResultItem { content?: unknown; } +type YomitanParseLine = YomitanParseSegment[]; + +const KATAKANA_TO_HIRAGANA_OFFSET = 0x60; +const KATAKANA_CODEPOINT_START = 0x30a1; +const KATAKANA_CODEPOINT_END = 0x30f6; + +function isObject(value: unknown): value is Record { + return Boolean(value && typeof value === "object"); +} + +function isString(value: unknown): value is string { + return typeof value === "string"; +} + export interface TokenizerServiceDeps { getYomitanExt: () => Extension | null; getYomitanParserWindow: () => BrowserWindow | null; @@ -34,6 +51,8 @@ export interface TokenizerServiceDeps { setYomitanParserInitPromise: (promise: Promise | null) => void; isKnownWord: (text: string) => boolean; getKnownWordMatchMode: () => NPlusOneMatchMode; + getJlptLevel: (text: string) => JlptLevel | null; + getJlptEnabled?: () => boolean; tokenizeWithMecab: (text: string) => Promise; } @@ -51,6 +70,8 @@ export interface TokenizerDepsRuntimeOptions { setYomitanParserInitPromise: (promise: Promise | null) => void; isKnownWord: (text: string) => boolean; getKnownWordMatchMode: () => NPlusOneMatchMode; + getJlptLevel: (text: string) => JlptLevel | null; + getJlptEnabled?: () => boolean; getMecabTokenizer: () => MecabTokenizerLike | null; } @@ -67,6 +88,8 @@ export function createTokenizerDepsRuntimeService( setYomitanParserInitPromise: options.setYomitanParserInitPromise, isKnownWord: options.isKnownWord, getKnownWordMatchMode: options.getKnownWordMatchMode, + getJlptLevel: options.getJlptLevel, + getJlptEnabled: options.getJlptEnabled, tokenizeWithMecab: async (text) => { const mecabTokenizer = options.getMecabTokenizer(); if (!mecabTokenizer) { @@ -112,19 +135,205 @@ function applyKnownWordMarking( }); } +function resolveJlptLookupText(token: MergedToken): string { + if (token.headword && token.headword.length > 0) { + return token.headword; + } + if (token.reading && token.reading.length > 0) { + return token.reading; + } + return token.surface; +} + +function normalizeJlptTextForExclusion(text: string): string { + const raw = text.trim(); + if (!raw) { + return ""; + } + + let normalized = ""; + for (const char of raw) { + const code = char.codePointAt(0); + if (code === undefined) { + continue; + } + + if (code >= KATAKANA_CODEPOINT_START && code <= KATAKANA_CODEPOINT_END) { + normalized += String.fromCodePoint(code - KATAKANA_TO_HIRAGANA_OFFSET); + continue; + } + + normalized += char; + } + + return normalized; +} + +function isKanaChar(char: string): boolean { + const code = char.codePointAt(0); + if (code === undefined) { + return false; + } + + return ( + (code >= 0x3041 && code <= 0x3096) || + (code >= 0x309b && code <= 0x309f) || + (code >= 0x30a0 && code <= 0x30fa) || + (code >= 0x30fd && code <= 0x30ff) + ); +} + +/** + * Detects repeated-kana speech-like tokens (e.g. 「ああああ」, 「ははは」, 「うーん」 style patterns) + * so they are not JLPT-labeled when they are mostly expressive particles/sfx. + */ +function isRepeatedKanaSfx(text: string): boolean { + const normalized = text.trim(); + if (!normalized) { + return false; + } + + const chars = [...normalized]; + if (!chars.every(isKanaChar)) { + return false; + } + + const counts = new Map(); + let hasAdjacentRepeat = false; + + for (let i = 0; i < chars.length; i += 1) { + const char = chars[i]; + counts.set(char, (counts.get(char) ?? 0) + 1); + if (i > 0 && chars[i] === chars[i - 1]) { + hasAdjacentRepeat = true; + } + } + + const topCount = Math.max(...counts.values()); + if (chars.length <= 2) { + return hasAdjacentRepeat || topCount >= 2; + } + + if (hasAdjacentRepeat) { + return true; + } + + return topCount >= Math.ceil(chars.length / 2); +} + +function isJlptEligibleToken(token: MergedToken): boolean { + if (token.pos1 && shouldIgnoreJlptForMecabPos1(token.pos1)) return false; + + const candidates = [ + resolveJlptLookupText(token), + token.surface, + token.reading, + token.headword, + ].filter((candidate): candidate is string => typeof candidate === "string" && candidate.length > 0); + + for (const candidate of candidates) { + const normalizedCandidate = normalizeJlptTextForExclusion(candidate); + if (!normalizedCandidate) { + continue; + } + + const trimmedCandidate = candidate.trim(); + if ( + shouldIgnoreJlptByTerm(trimmedCandidate) || + shouldIgnoreJlptByTerm(normalizedCandidate) + ) { + return false; + } + + if ( + isRepeatedKanaSfx(candidate) || + isRepeatedKanaSfx(normalizedCandidate) + ) { + return false; + } + } + + return true; +} + +function isYomitanParseResultItem( + value: unknown, +): value is YomitanParseResultItem { + if (!isObject(value)) { + return false; + } + if ((value as YomitanParseResultItem).source !== "scanning-parser") { + return false; + } + if (!Array.isArray((value as YomitanParseResultItem).content)) { + return false; + } + return true; +} + +function isYomitanParseLine(value: unknown): value is YomitanParseLine { + if (!Array.isArray(value)) { + return false; + } + + return value.every((segment) => { + if (!isObject(segment)) { + return false; + } + + const candidate = segment as YomitanParseSegment; + return isString(candidate.text); + }); +} + +function isYomitanHeadwordRows(value: unknown): value is YomitanParseHeadword[][] { + return ( + Array.isArray(value) && + value.every( + (group) => + Array.isArray(group) && + group.every((item) => + isObject(item) && isString((item as YomitanParseHeadword).term), + ), + ) + ); +} + function extractYomitanHeadword(segment: YomitanParseSegment): string { const headwords = segment.headwords; - if (!Array.isArray(headwords) || headwords.length === 0) { + if (!isYomitanHeadwordRows(headwords)) { return ""; } - const firstGroup = headwords[0]; - if (!Array.isArray(firstGroup) || firstGroup.length === 0) { - return ""; + for (const group of headwords) { + if (group.length > 0) { + const firstHeadword = group[0] as YomitanParseHeadword; + if (isString(firstHeadword?.term)) { + return firstHeadword.term; + } + } } - const firstHeadword = firstGroup[0] as YomitanParseHeadword; - return typeof firstHeadword?.term === "string" ? firstHeadword.term : ""; + return ""; +} + +function applyJlptMarking( + tokens: MergedToken[], + getJlptLevel: (text: string) => JlptLevel | null, +): MergedToken[] { + return tokens.map((token) => { + if (!isJlptEligibleToken(token)) { + return { ...token, jlptLevel: undefined }; + } + + const primaryLevel = getJlptLevel(resolveJlptLookupText(token)); + const fallbackLevel = getJlptLevel(token.surface); + + return { + ...token, + jlptLevel: primaryLevel ?? fallbackLevel ?? token.jlptLevel, + }; + }); } function mapYomitanParseResultsToMergedTokens( @@ -136,14 +345,9 @@ function mapYomitanParseResultsToMergedTokens( return null; } - const scanningItems = parseResults.filter((item) => { - const resultItem = item as YomitanParseResultItem; - return ( - resultItem && - resultItem.source === "scanning-parser" && - Array.isArray(resultItem.content) - ); - }) as YomitanParseResultItem[]; + const scanningItems = parseResults.filter( + (item): item is YomitanParseResultItem => isYomitanParseResultItem(item), + ); if (scanningItems.length === 0) { return null; @@ -158,24 +362,21 @@ function mapYomitanParseResultsToMergedTokens( const tokens: MergedToken[] = []; let charOffset = 0; + let validLineCount = 0; for (const line of content) { - if (!Array.isArray(line)) { + if (!isYomitanParseLine(line)) { continue; } + validLineCount += 1; let surface = ""; let reading = ""; let headword = ""; - for (const rawSegment of line) { - const segment = rawSegment as YomitanParseSegment; - if (!segment || typeof segment !== "object") { - continue; - } - + for (const segment of line) { const segmentText = segment.text; - if (typeof segmentText !== "string" || segmentText.length === 0) { + if (!segmentText || segmentText.length === 0) { continue; } @@ -205,6 +406,7 @@ function mapYomitanParseResultsToMergedTokens( startPos: start, endPos: end, partOfSpeech: PartOfSpeech.other, + pos1: "", isMerged: true, isNPlusOneTarget: false, isKnown: (() => { @@ -218,9 +420,108 @@ function mapYomitanParseResultsToMergedTokens( }); } + if (validLineCount === 0) { + return null; + } return tokens.length > 0 ? tokens : null; } +function pickClosestMecabPos1( + token: MergedToken, + mecabTokens: MergedToken[], +): string | undefined { + if (mecabTokens.length === 0) { + return undefined; + } + + const tokenStart = token.startPos ?? 0; + const tokenEnd = token.endPos ?? tokenStart + token.surface.length; + + let bestPos1: string | undefined; + let bestOverlap = 0; + let bestSpan = 0; + let bestStart = Number.MAX_SAFE_INTEGER; + + for (const mecabToken of mecabTokens) { + if (!mecabToken.pos1) { + continue; + } + + const mecabStart = mecabToken.startPos ?? 0; + const mecabEnd = mecabToken.endPos ?? mecabStart + mecabToken.surface.length; + const overlapStart = Math.max(tokenStart, mecabStart); + const overlapEnd = Math.min(tokenEnd, mecabEnd); + const overlap = Math.max(0, overlapEnd - overlapStart); + if (overlap === 0) { + continue; + } + + const span = mecabEnd - mecabStart; + if ( + overlap > bestOverlap || + (overlap === bestOverlap && + (span > bestSpan || + (span === bestSpan && mecabStart < bestStart))) + ) { + bestOverlap = overlap; + bestSpan = span; + bestStart = mecabStart; + bestPos1 = mecabToken.pos1; + } + } + + return bestOverlap > 0 ? bestPos1 : undefined; +} + +async function enrichYomitanPos1( + tokens: MergedToken[], + deps: TokenizerServiceDeps, + text: string, +): Promise { + if (!tokens || tokens.length === 0) { + return tokens; + } + + let mecabTokens: MergedToken[] | null = null; + try { + mecabTokens = await deps.tokenizeWithMecab(text); + } catch (err) { + const error = err as Error; + console.warn( + "Failed to enrich Yomitan tokens with MeCab POS:", + error.message, + `tokenCount=${tokens.length}`, + `textLength=${text.length}`, + ); + return tokens; + } + + if (!mecabTokens || mecabTokens.length === 0) { + console.warn( + "MeCab enrichment returned no tokens; preserving Yomitan token output.", + `tokenCount=${tokens.length}`, + `textLength=${text.length}`, + ); + return tokens; + } + + return tokens.map((token) => { + if (token.pos1) { + return token; + } + + const pos1 = pickClosestMecabPos1(token, mecabTokens); + if (!pos1) { + return token; + } + + return { + ...token, + pos1, + }; + }); +} + async function ensureYomitanParserWindow( deps: TokenizerServiceDeps, ): Promise { @@ -356,11 +657,16 @@ async function parseWithYomitanInternalParser( script, true, ); - return mapYomitanParseResultsToMergedTokens( + const yomitanTokens = mapYomitanParseResultsToMergedTokens( parseResults, deps.isKnownWord, deps.getKnownWordMatchMode(), ); + if (!yomitanTokens || yomitanTokens.length === 0) { + return null; + } + + return enrichYomitanPos1(yomitanTokens, deps, text); } catch (err) { console.error("Yomitan parser request failed:", (err as Error).message); return null; @@ -385,6 +691,7 @@ export async function tokenizeSubtitleService( .replace(/\n/g, " ") .replace(/\s+/g, " ") .trim(); + const jlptEnabled = deps.getJlptEnabled?.() !== false; const yomitanTokens = await parseWithYomitanInternalParser(tokenizeText, deps); if (yomitanTokens && yomitanTokens.length > 0) { @@ -393,7 +700,10 @@ export async function tokenizeSubtitleService( deps.isKnownWord, deps.getKnownWordMatchMode(), ); - return { text: displayText, tokens: markNPlusOneTargets(knownMarkedTokens) }; + const jlptMarkedTokens = jlptEnabled + ? applyJlptMarking(knownMarkedTokens, deps.getJlptLevel) + : knownMarkedTokens.map((token) => ({ ...token, jlptLevel: undefined })); + return { text: displayText, tokens: markNPlusOneTargets(jlptMarkedTokens) }; } try { @@ -404,7 +714,10 @@ export async function tokenizeSubtitleService( deps.isKnownWord, deps.getKnownWordMatchMode(), ); - return { text: displayText, tokens: markNPlusOneTargets(knownMarkedTokens) }; + const jlptMarkedTokens = jlptEnabled + ? applyJlptMarking(knownMarkedTokens, deps.getJlptLevel) + : knownMarkedTokens.map((token) => ({ ...token, jlptLevel: undefined })); + return { text: displayText, tokens: markNPlusOneTargets(jlptMarkedTokens) }; } } catch (err) { console.error("Tokenization error:", (err as Error).message); diff --git a/src/core/services/yomitan-extension-loader-service.ts b/src/core/services/yomitan-extension-loader-service.ts index 79edda5..e206670 100644 --- a/src/core/services/yomitan-extension-loader-service.ts +++ b/src/core/services/yomitan-extension-loader-service.ts @@ -59,6 +59,7 @@ export async function loadYomitanExtensionService( deps: YomitanExtensionLoaderDeps, ): Promise { const searchPaths = [ + path.join(__dirname, "..", "..", "vendor", "yomitan"), path.join(__dirname, "..", "..", "..", "vendor", "yomitan"), path.join(process.resourcesPath, "yomitan"), "/usr/share/SubMiner/yomitan", diff --git a/src/main.ts b/src/main.ts index 53e0600..4b2324c 100644 --- a/src/main.ts +++ b/src/main.ts @@ -95,6 +95,7 @@ import { createOverlayContentMeasurementStoreService, createOverlayWindowService, createTokenizerDepsRuntimeService, + createJlptVocabularyLookupService, cycleSecondarySubModeService, enforceOverlayLayerOrderService, ensureOverlayWindowLevelService, @@ -227,6 +228,8 @@ const isDev = process.argv.includes("--dev") || process.argv.includes("--debug"); const texthookerService = new TexthookerService(); const subtitleWsService = new SubtitleWebSocketService(); +let jlptDictionaryLookupInitialized = false; +let jlptDictionaryLookupInitialization: Promise | null = null; const appLogger = { logInfo: (message: string) => { console.log(message); @@ -464,6 +467,73 @@ function loadSubtitlePosition(): SubtitlePosition | null { return appState.subtitlePosition; } +function getJlptDictionarySearchPaths(): string[] { + const homeDir = os.homedir(); + const dictionaryRoots = [ + // Development/runtime source trees where the repo is checked out. + path.join(__dirname, "..", "..", "vendor", "yomitan-jlpt-vocab"), + path.join(app.getAppPath(), "vendor", "yomitan-jlpt-vocab"), + + // Packaged app resources (Electron build output layout). + path.join(process.resourcesPath, "yomitan-jlpt-vocab"), + path.join(process.resourcesPath, "app.asar", "vendor", "yomitan-jlpt-vocab"), + + // User override/config directories for manually installed dictionaries. + USER_DATA_PATH, + app.getPath("userData"), + path.join(homeDir, ".config", "SubMiner"), + path.join(homeDir, ".config", "subminer"), + path.join(homeDir, "Library", "Application Support", "SubMiner"), + path.join(homeDir, "Library", "Application Support", "subminer"), + + // Last-resort fallback: current working directory (local CLI/test runs). + process.cwd(), + ]; + + const searchPaths: string[] = []; + for (const dictionaryRoot of dictionaryRoots) { + searchPaths.push(dictionaryRoot); + searchPaths.push(path.join(dictionaryRoot, "vendor", "yomitan-jlpt-vocab")); + searchPaths.push(path.join(dictionaryRoot, "yomitan-jlpt-vocab")); + } + + const uniquePaths = new Set(); + for (const searchPath of searchPaths) { + uniquePaths.add(searchPath); + } + + return [...uniquePaths]; +} + +async function initializeJlptDictionaryLookup(): Promise { + appState.jlptLevelLookup = await createJlptVocabularyLookupService({ + searchPaths: getJlptDictionarySearchPaths(), + log: (message) => { + console.log(`[JLPT] ${message}`); + }, + }); +} + +async function ensureJlptDictionaryLookup(): Promise { + if (!getResolvedConfig().subtitleStyle.enableJlpt) { + return; + } + if (jlptDictionaryLookupInitialized) { + return; + } + if (!jlptDictionaryLookupInitialization) { + jlptDictionaryLookupInitialization = initializeJlptDictionaryLookup() + .then(() => { + jlptDictionaryLookupInitialized = true; + }) + .catch((error) => { + jlptDictionaryLookupInitialization = null; + throw error; + }); + } + await jlptDictionaryLookupInitialization; +} + function saveSubtitlePosition(position: SubtitlePosition): void { appState.subtitlePosition = position; saveSubtitlePositionService({ @@ -804,6 +874,7 @@ function updateMpvSubtitleRenderMetrics( } async function tokenizeSubtitle(text: string): Promise { + await ensureJlptDictionaryLookup(); return tokenizeSubtitleService( text, createTokenizerDepsRuntimeService({ @@ -825,6 +896,9 @@ async function tokenizeSubtitle(text: string): Promise { getKnownWordMatchMode: () => appState.ankiIntegration?.getKnownWordMatchMode() ?? getResolvedConfig().ankiConnect.nPlusOne.matchMode, + getJlptLevel: (text) => appState.jlptLevelLookup(text), + getJlptEnabled: () => + getResolvedConfig().subtitleStyle.enableJlpt, getMecabTokenizer: () => appState.mecabTokenizer, }), ); @@ -1345,6 +1419,7 @@ registerIpcRuntimeServices({ ...resolvedConfig.subtitleStyle, nPlusOneColor: resolvedConfig.ankiConnect.nPlusOne.nPlusOne, knownWordColor: resolvedConfig.ankiConnect.nPlusOne.knownWord, + enableJlpt: resolvedConfig.subtitleStyle.enableJlpt, }; }, saveSubtitlePosition: (position: unknown) => diff --git a/src/main/state.ts b/src/main/state.ts index 8c9446c..37ba50f 100644 --- a/src/main/state.ts +++ b/src/main/state.ts @@ -6,6 +6,7 @@ import type { SecondarySubMode, SubtitlePosition, KikuFieldGroupingChoice, + JlptLevel, } from "../types"; import type { CliArgs } from "../cli/args"; import type { SubtitleTimingTracker } from "../subtitle-timing-tracker"; @@ -53,6 +54,7 @@ export interface AppState { backendOverride: string | null; autoStartOverlay: boolean; texthookerOnlyMode: boolean; + jlptLevelLookup: (term: string) => JlptLevel | null; } export interface AppStateInitialValues { @@ -112,6 +114,7 @@ export function createAppState(values: AppStateInitialValues): AppState { backendOverride: values.backendOverride ?? null, autoStartOverlay: values.autoStartOverlay ?? false, texthookerOnlyMode: values.texthookerOnlyMode ?? false, + jlptLevelLookup: () => null, }; } diff --git a/src/renderer/state.ts b/src/renderer/state.ts index 2dc50c6..293d99c 100644 --- a/src/renderer/state.ts +++ b/src/renderer/state.ts @@ -71,6 +71,11 @@ export type RendererState = { knownWordColor: string; nPlusOneColor: string; + jlptN1Color: string; + jlptN2Color: string; + jlptN3Color: string; + jlptN4Color: string; + jlptN5Color: string; keybindingsMap: Map; chordPending: boolean; @@ -130,6 +135,11 @@ export function createRendererState(): RendererState { knownWordColor: "#a6da95", nPlusOneColor: "#c6a0f6", + jlptN1Color: "#ed8796", + jlptN2Color: "#f5a97f", + jlptN3Color: "#f9e2af", + jlptN4Color: "#a6e3a1", + jlptN5Color: "#8aadf4", keybindingsMap: new Map(), chordPending: false, diff --git a/src/renderer/style.css b/src/renderer/style.css index 493247f..3e988fa 100644 --- a/src/renderer/style.css +++ b/src/renderer/style.css @@ -250,6 +250,11 @@ body { color: #cad3f5; --subtitle-known-word-color: #a6da95; --subtitle-n-plus-one-color: #c6a0f6; + --subtitle-jlpt-n1-color: #ed8796; + --subtitle-jlpt-n2-color: #f5a97f; + --subtitle-jlpt-n3-color: #f9e2af; + --subtitle-jlpt-n4-color: #a6e3a1; + --subtitle-jlpt-n5-color: #8aadf4; text-shadow: 2px 2px 4px rgba(0, 0, 0, 0.8), -1px -1px 2px rgba(0, 0, 0, 0.5); @@ -296,6 +301,51 @@ body.settings-modal-open #subtitleContainer { text-shadow: 0 0 6px rgba(198, 160, 246, 0.35); } +#subtitleRoot .word.word-jlpt-n1 { + color: inherit; + text-decoration-line: underline; + text-decoration-thickness: 2px; + text-underline-offset: 4px; + text-decoration-color: var(--subtitle-jlpt-n1-color, #ed8796); + text-decoration-style: solid; +} + +#subtitleRoot .word.word-jlpt-n2 { + color: inherit; + text-decoration-line: underline; + text-decoration-thickness: 2px; + text-underline-offset: 4px; + text-decoration-color: var(--subtitle-jlpt-n2-color, #f5a97f); + text-decoration-style: solid; +} + +#subtitleRoot .word.word-jlpt-n3 { + color: inherit; + text-decoration-line: underline; + text-decoration-thickness: 2px; + text-underline-offset: 4px; + text-decoration-color: var(--subtitle-jlpt-n3-color, #f9e2af); + text-decoration-style: solid; +} + +#subtitleRoot .word.word-jlpt-n4 { + color: inherit; + text-decoration-line: underline; + text-decoration-thickness: 2px; + text-underline-offset: 4px; + text-decoration-color: var(--subtitle-jlpt-n4-color, #a6e3a1); + text-decoration-style: solid; +} + +#subtitleRoot .word.word-jlpt-n5 { + color: inherit; + text-decoration-line: underline; + text-decoration-thickness: 2px; + text-underline-offset: 4px; + text-decoration-color: var(--subtitle-jlpt-n5-color, #8aadf4); + text-decoration-style: solid; +} + #subtitleRoot .word:hover { background: rgba(255, 255, 255, 0.2); border-radius: 3px; diff --git a/src/renderer/subtitle-render.test.ts b/src/renderer/subtitle-render.test.ts new file mode 100644 index 0000000..17fabb9 --- /dev/null +++ b/src/renderer/subtitle-render.test.ts @@ -0,0 +1,80 @@ +import test from "node:test"; +import assert from "node:assert/strict"; +import fs from "node:fs"; +import path from "node:path"; + +import type { MergedToken } from "../types"; +import { PartOfSpeech } from "../types.js"; +import { computeWordClass } from "./subtitle-render.js"; + +function createToken(overrides: Partial): MergedToken { + return { + surface: "", + reading: "", + headword: "", + startPos: 0, + endPos: 0, + partOfSpeech: PartOfSpeech.other, + isMerged: true, + isKnown: false, + isNPlusOneTarget: false, + ...overrides, + }; +} + +function extractClassBlock(cssText: string, level: number): string { + const selector = `#subtitleRoot .word.word-jlpt-n${level}`; + const start = cssText.indexOf(selector); + if (start < 0) return ""; + + const openBrace = cssText.indexOf("{", start); + if (openBrace < 0) return ""; + const closeBrace = cssText.indexOf("}", openBrace); + if (closeBrace < 0) return ""; + + return cssText.slice(openBrace + 1, closeBrace); +} + +test("computeWordClass preserves known and n+1 classes while adding JLPT classes", () => { + const knownJlpt = createToken({ + isKnown: true, + jlptLevel: "N1", + surface: "猫", + }); + const nPlusOneJlpt = createToken({ + isNPlusOneTarget: true, + jlptLevel: "N2", + surface: "犬", + }); + + assert.equal(computeWordClass(knownJlpt), "word word-known word-jlpt-n1"); + assert.equal( + computeWordClass(nPlusOneJlpt), + "word word-n-plus-one word-jlpt-n2", + ); +}); + +test("JLPT CSS rules use underline-only styling in renderer stylesheet", () => { + const distCssPath = path.join(process.cwd(), "dist", "renderer", "style.css"); + const srcCssPath = path.join(process.cwd(), "src", "renderer", "style.css"); + + const cssPath = fs.existsSync(distCssPath) + ? distCssPath + : srcCssPath; + if (!fs.existsSync(cssPath)) { + assert.fail( + "JLPT CSS file missing. Run `pnpm run build` first, or ensure src/renderer/style.css exists.", + ); + } + + const cssText = fs.readFileSync(cssPath, "utf-8"); + + for (let level = 1; level <= 5; level += 1) { + const block = extractClassBlock(cssText, level); + assert.ok(block.length > 0, `word-jlpt-n${level} class should exist`); + assert.match(block, /text-decoration-line:\s*underline;/); + assert.match(block, /text-decoration-thickness:\s*2px;/); + assert.match(block, /text-underline-offset:\s*4px;/); + assert.match(block, /color:\s*inherit;/); + } +}); diff --git a/src/renderer/subtitle-render.ts b/src/renderer/subtitle-render.ts index afe78d0..1bef40a 100644 --- a/src/renderer/subtitle-render.ts +++ b/src/renderer/subtitle-render.ts @@ -15,6 +15,15 @@ function normalizeSubtitle(text: string, trim = true): string { return trim ? normalized.trim() : normalized; } +const HEX_COLOR_PATTERN = + /^#(?:[0-9a-fA-F]{3}|[0-9a-fA-F]{4}|[0-9a-fA-F]{6}|[0-9a-fA-F]{8})$/; + +function sanitizeHexColor(value: unknown, fallback: string): string { + return typeof value === "string" && HEX_COLOR_PATTERN.test(value.trim()) + ? value.trim() + : fallback; +} + function renderWithTokens(root: HTMLElement, tokens: MergedToken[]): void { const fragment = document.createDocumentFragment(); @@ -50,16 +59,20 @@ function renderWithTokens(root: HTMLElement, tokens: MergedToken[]): void { root.appendChild(fragment); } -function computeWordClass(token: MergedToken): string { +export function computeWordClass(token: MergedToken): string { + const classes = ["word"]; + if (token.isNPlusOneTarget) { - return "word word-n-plus-one"; + classes.push("word-n-plus-one"); + } else if (token.isKnown) { + classes.push("word-known"); } - if (token.isKnown) { - return "word word-known"; + if (token.jlptLevel) { + classes.push(`word-jlpt-${token.jlptLevel.toLowerCase()}`); } - return "word"; + return classes.join(" "); } function renderCharacterLevel(root: HTMLElement, text: string): void { @@ -189,6 +202,22 @@ export function createSubtitleRenderer(ctx: RendererContext) { style.knownWordColor ?? ctx.state.knownWordColor ?? "#a6da95"; const nPlusOneColor = style.nPlusOneColor ?? ctx.state.nPlusOneColor ?? "#c6a0f6"; + const jlptColors = { + N1: ctx.state.jlptN1Color ?? "#ed8796", + N2: ctx.state.jlptN2Color ?? "#f5a97f", + N3: ctx.state.jlptN3Color ?? "#f9e2af", + N4: ctx.state.jlptN4Color ?? "#a6e3a1", + N5: ctx.state.jlptN5Color ?? "#8aadf4", + ...(style.jlptColors + ? { + N1: sanitizeHexColor(style.jlptColors?.N1, ctx.state.jlptN1Color), + N2: sanitizeHexColor(style.jlptColors?.N2, ctx.state.jlptN2Color), + N3: sanitizeHexColor(style.jlptColors?.N3, ctx.state.jlptN3Color), + N4: sanitizeHexColor(style.jlptColors?.N4, ctx.state.jlptN4Color), + N5: sanitizeHexColor(style.jlptColors?.N5, ctx.state.jlptN5Color), + } + : {}), + }; ctx.state.knownWordColor = knownWordColor; ctx.state.nPlusOneColor = nPlusOneColor; @@ -197,6 +226,16 @@ export function createSubtitleRenderer(ctx: RendererContext) { knownWordColor, ); ctx.dom.subtitleRoot.style.setProperty("--subtitle-n-plus-one-color", nPlusOneColor); + ctx.state.jlptN1Color = jlptColors.N1; + ctx.state.jlptN2Color = jlptColors.N2; + ctx.state.jlptN3Color = jlptColors.N3; + ctx.state.jlptN4Color = jlptColors.N4; + ctx.state.jlptN5Color = jlptColors.N5; + ctx.dom.subtitleRoot.style.setProperty("--subtitle-jlpt-n1-color", jlptColors.N1); + ctx.dom.subtitleRoot.style.setProperty("--subtitle-jlpt-n2-color", jlptColors.N2); + ctx.dom.subtitleRoot.style.setProperty("--subtitle-jlpt-n3-color", jlptColors.N3); + ctx.dom.subtitleRoot.style.setProperty("--subtitle-jlpt-n4-color", jlptColors.N4); + ctx.dom.subtitleRoot.style.setProperty("--subtitle-jlpt-n5-color", jlptColors.N5); const secondaryStyle = style.secondary; if (!secondaryStyle) return; diff --git a/src/token-merger.ts b/src/token-merger.ts index 348e5e7..6176bde 100644 --- a/src/token-merger.ts +++ b/src/token-merger.ts @@ -223,6 +223,7 @@ export function mergeTokens( startPos: prev.startPos, endPos: end, partOfSpeech: prev.partOfSpeech, + pos1: prev.pos1 ?? token.pos1, pos2: prev.pos2 ?? token.pos2, pos3: prev.pos3 ?? token.pos3, isMerged: true, @@ -245,6 +246,7 @@ export function mergeTokens( startPos: start, endPos: end, partOfSpeech: token.partOfSpeech, + pos1: token.pos1, pos2: token.pos2, pos3: token.pos3, isMerged: false, diff --git a/src/types.ts b/src/types.ts index 65dbc0e..692cc1f 100644 --- a/src/types.ts +++ b/src/types.ts @@ -48,13 +48,17 @@ export interface MergedToken { startPos: number; endPos: number; partOfSpeech: PartOfSpeech; + pos1?: string; pos2?: string; pos3?: string; isMerged: boolean; isKnown: boolean; isNPlusOneTarget: boolean; + jlptLevel?: JlptLevel; } +export type JlptLevel = "N1" | "N2" | "N3" | "N4" | "N5"; + export interface WindowGeometry { x: number; y: number; @@ -262,6 +266,7 @@ export interface AnkiConnectConfig { } export interface SubtitleStyleConfig { + enableJlpt?: boolean; fontFamily?: string; fontSize?: number; fontColor?: string; @@ -270,6 +275,13 @@ export interface SubtitleStyleConfig { backgroundColor?: string; nPlusOneColor?: string; knownWordColor?: string; + jlptColors?: { + N1: string; + N2: string; + N3: string; + N4: string; + N5: string; + }; secondary?: { fontFamily?: string; fontSize?: number; diff --git a/vendor/yomitan-jlpt-vocab b/vendor/yomitan-jlpt-vocab new file mode 160000 index 0000000..b062d4e --- /dev/null +++ b/vendor/yomitan-jlpt-vocab @@ -0,0 +1 @@ +Subproject commit b062d4e38c4bdd0950ae1d4ec55f04b176182e03