Merge pull request #2 from ksyasuda/add-jlpt-tagging

Add opt-in JLPT tagging flow
This commit is contained in:
2026-02-15 17:30:22 -08:00
committed by GitHub
32 changed files with 1160 additions and 126 deletions

View File

@@ -1,63 +0,0 @@
name: Docs
on:
push:
branches: [main]
paths:
- 'docs/**'
- '.github/workflows/docs.yml'
- 'package.json'
- 'pnpm-lock.yaml'
workflow_dispatch:
permissions:
contents: read
pages: write
id-token: write
concurrency:
group: pages
cancel-in-progress: true
jobs:
build:
runs-on: ubuntu-latest
steps:
- name: Checkout
uses: actions/checkout@v4
- name: Setup pnpm
uses: pnpm/action-setup@v4
with:
version: 9
- name: Setup Node.js
uses: actions/setup-node@v4
with:
node-version: 20
cache: pnpm
- name: Install dependencies
run: pnpm install --frozen-lockfile
- name: Build docs
run: pnpm run docs:build
- name: Setup Pages
uses: actions/configure-pages@v5
- name: Upload artifact
uses: actions/upload-pages-artifact@v3
with:
path: docs/.vitepress/dist
deploy:
environment:
name: github-pages
url: ${{ steps.deployment.outputs.page_url }}
runs-on: ubuntu-latest
needs: build
steps:
- name: Deploy to GitHub Pages
id: deployment
uses: actions/deploy-pages@v4

3
.gitmodules vendored
View File

@@ -2,3 +2,6 @@
path = vendor/texthooker-ui path = vendor/texthooker-ui
url = https://github.com/ksyasuda/texthooker-ui.git url = https://github.com/ksyasuda/texthooker-ui.git
branch = subminer branch = subminer
[submodule "vendor/yomitan-jlpt-vocab"]
path = vendor/yomitan-jlpt-vocab
url = https://github.com/stephenmk/yomitan-jlpt-vocab

View File

@@ -46,12 +46,19 @@ The `subminer` wrapper uses a [Bun](https://bun.sh) shebang, so `bun` must be on
### From Source ### From Source
```bash ```bash
git clone https://github.com/ksyasuda/SubMiner.git git clone --recurse-submodules https://github.com/ksyasuda/SubMiner.git
cd SubMiner cd SubMiner
make build make build
make install make install
``` ```
If you already cloned without submodules:
```bash
cd SubMiner
git submodule update --init --recursive
```
For macOS builds, signing, and platform-specific details, see [docs/installation.md](docs/installation.md). For macOS builds, signing, and platform-specific details, see [docs/installation.md](docs/installation.md).
## Quick Start ## Quick Start

View File

@@ -3,7 +3,7 @@ id: TASK-23
title: >- title: >-
Add opt-in JLPT level tagging by bundling and querying local Yomitan Add opt-in JLPT level tagging by bundling and querying local Yomitan
dictionary dictionary
status: To Do status: In Progress
assignee: [] assignee: []
created_date: '2026-02-13 16:42' created_date: '2026-02-13 16:42'
labels: [] labels: []
@@ -19,13 +19,13 @@ Implement an opt-in JLPT token annotation feature that annotates subtitle words
## Acceptance Criteria ## Acceptance Criteria
<!-- AC:BEGIN --> <!-- AC:BEGIN -->
- [ ] #1 Add an opt-in setting/feature flag so JLPT tagging is disabled by default and can be enabled per user/session as requested. - [x] #1 Add an opt-in setting/feature flag so JLPT tagging is disabled by default and can be enabled per user/session as requested.
- [ ] #2 Bundle the existing JLPT Yomitan extension package/data into the project so lookups can be performed offline from local files. - [x] #2 Bundle the existing JLPT Yomitan extension package/data into the project so lookups can be performed offline from local files.
- [ ] #3 Implement token-level dictionary lookup against the bundled JLPT dictionary file to determine presence and JLPT level for words in subtitle lines. - [x] #3 Implement token-level dictionary lookup against the bundled JLPT dictionary file to determine presence and JLPT level for words in subtitle lines.
- [ ] #4 Render a colored underline under each token determined to have a JLPT level; the underline must match token width/length and not affect layout or disrupt line rendering. - [x] #4 Render a colored underline under each token determined to have a JLPT level; the underline must match token width/length and not affect layout or disrupt line rendering.
- [ ] #5 Assign different underline colors per JLPT level (at minimum N5/N4/N3/N2/N1) with a stable mapping documented in task notes. - [x] #5 Assign different underline colors per JLPT level (at minimum N5/N4/N3/N2/N1) with a stable mapping documented in task notes.
- [ ] #6 Handle unknown/no-match tokens as non-tagged while preserving existing subtitle styling and interaction behavior. - [x] #6 Handle unknown/no-match tokens as non-tagged while preserving existing subtitle styling and interaction behavior.
- [ ] #7 When disabled, no JLPT lookups are performed and subtitles render exactly as current behavior. - [x] #7 When disabled, no JLPT lookups are performed and subtitles render exactly as current behavior.
- [ ] #8 Add tests or deterministic checks covering at least one positive match, one non-match, and one unknown/unsupported-level fallback path. - [ ] #8 Add tests or deterministic checks covering at least one positive match, one non-match, and one unknown/unsupported-level fallback path.
- [ ] #9 Document expected dictionary source and any size/performance impact of bundling the JLPT extension data. - [ ] #9 Document expected dictionary source and any size/performance impact of bundling the JLPT extension data.
- [ ] #10 If dictionary format/version constraints block exact level extraction, the task includes explicit limitation notes and a deterministic fallback strategy. - [ ] #10 If dictionary format/version constraints block exact level extraction, the task includes explicit limitation notes and a deterministic fallback strategy.
@@ -34,5 +34,8 @@ Implement an opt-in JLPT token annotation feature that annotates subtitle words
## Definition of Done ## Definition of Done
<!-- DOD:BEGIN --> <!-- DOD:BEGIN -->
- [ ] #1 Feature has a clear toggle and persistence of preference if applicable. - [ ] #1 Feature has a clear toggle and persistence of preference if applicable.
- [ ] #2 JLPT rendering is visually verified for all supported levels with distinct colors and no overlap/regression in subtitle legibility. - [x] #2 JLPT rendering is visually verified for all supported levels with distinct colors and no overlap/regression in subtitle legibility.
<!-- DOD:END --> <!-- DOD:END -->
## Note
- Full performance/limits documentation and dictionary source/version/perf notes are deferred and tracked separately.

View File

@@ -1,7 +1,7 @@
--- ---
id: TASK-23.1 id: TASK-23.1
title: Implement JLPT token lookup service for subtitle words title: Implement JLPT token lookup service for subtitle words
status: To Do status: In Progress
assignee: [] assignee: []
created_date: '2026-02-13 16:42' created_date: '2026-02-13 16:42'
labels: [] labels: []
@@ -18,14 +18,17 @@ Create a lookup layer that parses/queries the bundled JLPT dictionary file and r
## Acceptance Criteria ## Acceptance Criteria
<!-- AC:BEGIN --> <!-- AC:BEGIN -->
- [ ] #1 Service accepts a token/normalized token and returns JLPT level or no-match deterministically. - [x] #1 Service accepts a token/normalized token and returns JLPT level or no-match deterministically.
- [ ] #2 Lookup handles expected dictionary format edge cases and unknown tokens without throwing. - [x] #2 Lookup handles expected dictionary format edge cases and unknown tokens without throwing.
- [ ] #3 Lookup path is efficient enough for frame-by-frame subtitle updates. - [ ] #3 Lookup path is efficient enough for frame-by-frame subtitle updates.
- [ ] #4 Tokenizer interaction preserves existing token ordering and positions needed for rendering spans/underlines. - [x] #4 Tokenizer interaction preserves existing token ordering and positions needed for rendering spans/underlines.
- [ ] #5 Behavior on malformed/unsupported dictionary format is documented with fallback semantics. - [ ] #5 Behavior on malformed/unsupported dictionary format is documented with fallback semantics.
<!-- AC:END --> <!-- AC:END -->
## Note
- Full performance and malformed-format limitation documentation is deferred per request and will be handled in a separate pass if needed.
## Definition of Done ## Definition of Done
<!-- DOD:BEGIN --> <!-- DOD:BEGIN -->
- [ ] #1 Lookup service returns JLPT level with deterministic output for test fixtures. - [x] #1 Lookup service returns JLPT level with deterministic output for test fixtures.
<!-- DOD:END --> <!-- DOD:END -->

View File

@@ -1,7 +1,7 @@
--- ---
id: TASK-23.2 id: TASK-23.2
title: Bundle JLPT Yomitan dictionary assets for offline local lookup title: Bundle JLPT Yomitan dictionary assets for offline local lookup
status: To Do status: In Progress
assignee: [] assignee: []
created_date: '2026-02-13 16:42' created_date: '2026-02-13 16:42'
labels: [] labels: []
@@ -18,13 +18,16 @@ Package and include the JLPT Yomitan extension dictionary assets in SubMiner so
## Acceptance Criteria ## Acceptance Criteria
<!-- AC:BEGIN --> <!-- AC:BEGIN -->
- [ ] #1 JLPT dictionary asset from the existing Yomitan extension is added to the repository/build output in a tracked, offline-available location. - [x] #1 JLPT dictionary asset from the existing Yomitan extension is added to the repository/build output in a tracked, offline-available location.
- [ ] #2 The loader locates and opens the JLPT dictionary file deterministically at runtime. - [x] #2 The loader locates and opens the JLPT dictionary file deterministically at runtime.
- [ ] #3 Dictionary version/source is documented so future updates are explicit and reproducible. - [ ] #3 Dictionary version/source is documented so future updates are explicit and reproducible.
- [ ] #4 Dictionary bundle size and load impact are documented in task notes or project docs. - [ ] #4 Dictionary bundle size and load impact are documented in task notes or project docs.
<!-- AC:END --> <!-- AC:END -->
## Note
- Full dictionary source/version/performance notes are intentionally deferred for now (out of scope in this pass).
## Definition of Done ## Definition of Done
<!-- DOD:BEGIN --> <!-- DOD:BEGIN -->
- [ ] #1 Dictionary data is bundled and consumable during development and packaged app runs. - [x] #1 Dictionary data is bundled and consumable during development and packaged app runs.
<!-- DOD:END --> <!-- DOD:END -->

View File

@@ -1,7 +1,7 @@
--- ---
id: TASK-23.3 id: TASK-23.3
title: Render JLPT token underlines with level-based colors in subtitle lines title: Render JLPT token underlines with level-based colors in subtitle lines
status: To Do status: Done
assignee: [] assignee: []
created_date: '2026-02-13 16:42' created_date: '2026-02-13 16:42'
labels: [] labels: []
@@ -18,14 +18,14 @@ Render JLPT-aware token annotations as token-length colored underlines in the su
## Acceptance Criteria ## Acceptance Criteria
<!-- AC:BEGIN --> <!-- AC:BEGIN -->
- [ ] #1 For each token with JLPT level, renderer draws an underline matching token width/length. - [x] #1 For each token with JLPT level, renderer draws an underline matching token width/length.
- [ ] #2 Underlines use distinct colors by JLPT level (e.g., N5/N4/N3/N2/N1) and mapping is consistent/documented. - [x] #2 Underlines use distinct colors by JLPT level (e.g., N5/N4/N3/N2/N1) and mapping is consistent/documented.
- [ ] #3 Non-tagged tokens remain visually unchanged. - [x] #3 Non-tagged tokens remain visually unchanged.
- [ ] #4 Rendering does not alter line height/selection behavior or break wrapping behavior. - [x] #4 Rendering does not alter line height/selection behavior or break wrapping behavior.
- [ ] #5 Feature degrades gracefully when level data is missing or lookup is unavailable. - [x] #5 Feature degrades gracefully when level data is missing or lookup is unavailable.
<!-- AC:END --> <!-- AC:END -->
## Definition of Done ## Definition of Done
<!-- DOD:BEGIN --> <!-- DOD:BEGIN -->
- [ ] #1 Visual output validated for all mapped JLPT levels with no legibility/layout regressions. - [x] #1 Visual output validated for all mapped JLPT levels with no legibility/layout regressions.
<!-- DOD:END --> <!-- DOD:END -->

View File

@@ -1,7 +1,7 @@
--- ---
id: TASK-23.4 id: TASK-23.4
title: Add opt-in control and end-to-end flow + tests for JLPT tagging title: Add opt-in control and end-to-end flow + tests for JLPT tagging
status: To Do status: In Progress
assignee: [] assignee: []
created_date: '2026-02-13 16:42' created_date: '2026-02-13 16:42'
labels: [] labels: []
@@ -18,12 +18,15 @@ Add user/config setting to enable JLPT tagging, wire the feature toggle through
## Acceptance Criteria ## Acceptance Criteria
<!-- AC:BEGIN --> <!-- AC:BEGIN -->
- [ ] #1 JLPT tagging is opt-in and defaults to disabled. - [x] #1 JLPT tagging is opt-in and defaults to disabled.
- [ ] #2 When disabled, lookup/rendering pipeline does not execute JLPT processing. - [x] #2 When disabled, lookup/rendering pipeline does not execute JLPT processing.
- [ ] #3 When enabled, end-to-end flow tags subtitle words via token-level lookup and rendering. - [x] #3 When enabled, end-to-end flow tags subtitle words via token-level lookup and rendering.
- [ ] #4 Add tests covering at least one positive match, one non-match, and disabled state. - [ ] #4 Add tests covering at least one positive match, one non-match, and disabled state.
<!-- AC:END --> <!-- AC:END -->
## Note
- Full end-to-end + disabled-state test coverage remains pending as an explicit follow-up item.
## Definition of Done ## Definition of Done
<!-- DOD:BEGIN --> <!-- DOD:BEGIN -->
- [ ] #1 End-to-end option behavior and opt-in state persistence are implemented and verified. - [ ] #1 End-to-end option behavior and opt-in state persistence are implemented and verified.

View File

@@ -149,6 +149,7 @@
// Primary and secondary subtitle styling. // Primary and secondary subtitle styling.
// ========================================== // ==========================================
"subtitleStyle": { "subtitleStyle": {
"enableJlpt": false,
"fontFamily": "Noto Sans CJK JP Regular, Noto Sans CJK JP, Arial Unicode MS, Arial, sans-serif", "fontFamily": "Noto Sans CJK JP Regular, Noto Sans CJK JP, Arial Unicode MS, Arial, sans-serif",
"fontSize": 35, "fontSize": 35,
"fontColor": "#cad3f5", "fontColor": "#cad3f5",
@@ -157,6 +158,13 @@
"backgroundColor": "rgba(54, 58, 79, 0.5)", "backgroundColor": "rgba(54, 58, 79, 0.5)",
"nPlusOneColor": "#c6a0f6", "nPlusOneColor": "#c6a0f6",
"knownWordColor": "#a6da95", "knownWordColor": "#a6da95",
"jlptColors": {
"N1": "#ed8796",
"N2": "#f5a97f",
"N3": "#f9e2af",
"N4": "#a6e3a1",
"N5": "#8aadf4"
},
"secondary": { "secondary": {
"fontSize": 24, "fontSize": 24,
"fontColor": "#ffffff", "fontColor": "#ffffff",

View File

@@ -552,12 +552,26 @@ See `config.example.jsonc` for detailed configuration options.
| `fontWeight` | string | CSS font-weight, e.g. `"bold"`, `"normal"`, `"600"` (default: `"normal"`) | | `fontWeight` | string | CSS font-weight, e.g. `"bold"`, `"normal"`, `"600"` (default: `"normal"`) |
| `fontStyle` | string | `"normal"` or `"italic"` (default: `"normal"`) | | `fontStyle` | string | `"normal"` or `"italic"` (default: `"normal"`) |
| `backgroundColor` | string | Any CSS color, including `"transparent"` (default: `"rgba(54, 58, 79, 0.5)"`) | | `backgroundColor` | string | Any CSS color, including `"transparent"` (default: `"rgba(54, 58, 79, 0.5)"`) |
| `enableJlpt` | boolean | Enable JLPT level underline styling (`false` by default) |
| `nPlusOneColor` | string | Existing n+1 highlight color (default: `#c6a0f6`) |
| `knownWordColor` | string | Existing known-word highlight color (default: `#a6da95`) |
| `jlptColors` | object | JLPT level underline colors object (`N1`..`N5`) |
| `secondary` | object | Override any of the above for secondary subtitles (optional) | | `secondary` | object | Override any of the above for secondary subtitles (optional) |
Secondary subtitle defaults: `fontSize: 24`, `fontColor: "#ffffff"`, `backgroundColor: "transparent"`. Any property not set in `secondary` falls back to the CSS defaults. Secondary subtitle defaults: `fontSize: 24`, `fontColor: "#ffffff"`, `backgroundColor: "transparent"`. Any property not set in `secondary` falls back to the CSS defaults.
**See `config.example.jsonc`** for the complete list of subtitle style configuration options. **See `config.example.jsonc`** for the complete list of subtitle style configuration options.
`jlptColors` keys are:
| Key | Default | Description |
| ---- | --------- | ---------------------------------------- |
| `N1` | `#ed8796` | JLPT N1 underline color |
| `N2` | `#f5a97f` | JLPT N2 underline color |
| `N3` | `#f9e2af` | JLPT N3 underline color |
| `N4` | `#a6e3a1` | JLPT N4 underline color |
| `N5` | `#8aadf4` | JLPT N5 underline color |
### Texthooker ### Texthooker
Control whether the browser opens automatically when texthooker starts: Control whether the browser opens automatically when texthooker starts:

View File

@@ -149,6 +149,7 @@
// Primary and secondary subtitle styling. // Primary and secondary subtitle styling.
// ========================================== // ==========================================
"subtitleStyle": { "subtitleStyle": {
"enableJlpt": false,
"fontFamily": "Noto Sans CJK JP Regular, Noto Sans CJK JP, Arial Unicode MS, Arial, sans-serif", "fontFamily": "Noto Sans CJK JP Regular, Noto Sans CJK JP, Arial Unicode MS, Arial, sans-serif",
"fontSize": 35, "fontSize": 35,
"fontColor": "#cad3f5", "fontColor": "#cad3f5",
@@ -157,6 +158,13 @@
"backgroundColor": "rgba(54, 58, 79, 0.5)", "backgroundColor": "rgba(54, 58, 79, 0.5)",
"nPlusOneColor": "#c6a0f6", "nPlusOneColor": "#c6a0f6",
"knownWordColor": "#a6da95", "knownWordColor": "#a6da95",
"jlptColors": {
"N1": "#ed8796",
"N2": "#f5a97f",
"N3": "#f9e2af",
"N4": "#a6e3a1",
"N5": "#8aadf4"
},
"secondary": { "secondary": {
"fontSize": 24, "fontSize": 24,
"fontColor": "#ffffff", "fontColor": "#ffffff",

View File

@@ -97,6 +97,10 @@
"from": "vendor/yomitan", "from": "vendor/yomitan",
"to": "yomitan" "to": "yomitan"
}, },
{
"from": "vendor/yomitan-jlpt-vocab",
"to": "yomitan-jlpt-vocab"
},
{ {
"from": "assets", "from": "assets",
"to": "assets" "to": "assets"

View File

@@ -174,6 +174,7 @@ export const DEFAULT_CONFIG: ResolvedConfig = {
ffmpeg_path: "", ffmpeg_path: "",
}, },
subtitleStyle: { subtitleStyle: {
enableJlpt: false,
fontFamily: fontFamily:
"Noto Sans CJK JP Regular, Noto Sans CJK JP, Arial Unicode MS, Arial, sans-serif", "Noto Sans CJK JP Regular, Noto Sans CJK JP, Arial Unicode MS, Arial, sans-serif",
fontSize: 35, fontSize: 35,
@@ -183,6 +184,13 @@ export const DEFAULT_CONFIG: ResolvedConfig = {
backgroundColor: "rgba(54, 58, 79, 0.5)", backgroundColor: "rgba(54, 58, 79, 0.5)",
nPlusOneColor: "#c6a0f6", nPlusOneColor: "#c6a0f6",
knownWordColor: "#a6da95", knownWordColor: "#a6da95",
jlptColors: {
N1: "#ed8796",
N2: "#f5a97f",
N3: "#f9e2af",
N4: "#a6e3a1",
N5: "#8aadf4",
},
secondary: { secondary: {
fontSize: 24, fontSize: 24,
fontColor: "#ffffff", fontColor: "#ffffff",
@@ -280,6 +288,13 @@ export const CONFIG_OPTION_REGISTRY: ConfigOptionRegistryEntry[] = [
defaultValue: DEFAULT_CONFIG.websocket.port, defaultValue: DEFAULT_CONFIG.websocket.port,
description: "Built-in subtitle websocket server port.", description: "Built-in subtitle websocket server port.",
}, },
{
path: "subtitleStyle.enableJlpt",
kind: "boolean",
defaultValue: DEFAULT_CONFIG.subtitleStyle.enableJlpt,
description: "Enable JLPT vocabulary level underlines. "
+ "When disabled, JLPT tagging lookup and underlines are skipped.",
},
{ {
path: "ankiConnect.enabled", path: "ankiConnect.enabled",
kind: "boolean", kind: "boolean",

View File

@@ -442,6 +442,18 @@ export class ConfigService {
: {}), : {}),
}, },
}; };
const enableJlpt = asBoolean((src.subtitleStyle as { enableJlpt?: unknown }).enableJlpt);
if (enableJlpt !== undefined) {
resolved.subtitleStyle.enableJlpt = enableJlpt;
} else if ((src.subtitleStyle as { enableJlpt?: unknown }).enableJlpt !== undefined) {
warn(
"subtitleStyle.enableJlpt",
(src.subtitleStyle as { enableJlpt?: unknown }).enableJlpt,
resolved.subtitleStyle.enableJlpt,
"Expected boolean.",
);
}
} }
if (isObject(src.ankiConnect)) { if (isObject(src.ankiConnect)) {

View File

@@ -37,6 +37,7 @@ export {
} from "./runtime-config-service"; } from "./runtime-config-service";
export { openYomitanSettingsWindow } from "./yomitan-settings-service"; export { openYomitanSettingsWindow } from "./yomitan-settings-service";
export { createTokenizerDepsRuntimeService, tokenizeSubtitleService } from "./tokenizer-service"; export { createTokenizerDepsRuntimeService, tokenizeSubtitleService } from "./tokenizer-service";
export { createJlptVocabularyLookupService } from "./jlpt-vocab-service";
export { loadYomitanExtensionService } from "./yomitan-extension-loader-service"; export { loadYomitanExtensionService } from "./yomitan-extension-loader-service";
export { export {
getJimakuLanguagePreferenceService, getJimakuLanguagePreferenceService,

View File

@@ -0,0 +1,29 @@
// Token-level lexical terms that must never receive a JLPT underline,
// regardless of part of speech. This acts as a safety layer for cases the
// POS-based filter cannot catch (demonstratives, repeated-kana interjections).
const EXCLUDED_TERM_LIST: readonly string[] = [
  // ko-so-a-do demonstrative determiners.
  "この",
  "その",
  "あの",
  "どの",
  // ko-so-a-do demonstrative pronouns.
  "これ",
  "それ",
  "あれ",
  "どれ",
  // ko-so-a-do place words.
  "ここ",
  "そこ",
  "あそこ",
  "どこ",
  // Generic nominalizer.
  "こと",
  // Repeated-kana interjections / sound effects.
  "ああ",
  "ええ",
  "うう",
  "おお",
  "はは",
  "へえ",
  "ふう",
  "ほう",
];

// Exposed as a Set for O(1) membership checks.
export const JLPT_EXCLUDED_TERMS = new Set(EXCLUDED_TERM_LIST);

// Returns true when `term` is on the exclusion list above.
export function shouldIgnoreJlptByTerm(term: string): boolean {
  return JLPT_EXCLUDED_TERMS.has(term);
}

View File

@@ -0,0 +1,45 @@
// MeCab POS1 categories excluded from JLPT-level token tagging, each with a
// human-readable justification. These are typically functional or non-lexical
// words rather than target vocabulary.
export type JlptIgnoredPos1Entry = {
  pos1: string;
  reason: string;
};

export const JLPT_IGNORED_MECAB_POS1_ENTRIES = [
  {
    pos1: "助詞",
    reason: "Particles (ko/kara/nagara etc.): mostly grammatical glue, not independent vocabulary.",
  },
  {
    pos1: "助動詞",
    reason: "Auxiliary verbs (past tense, politeness, modality): grammar helpers.",
  },
  {
    pos1: "記号",
    reason: "Symbols/punctuation and symbols-like tokens.",
  },
  {
    pos1: "補助記号",
    reason: "Auxiliary symbols (e.g. bracket-like or markup tokens).",
  },
  {
    pos1: "連体詞",
    reason: "Adnominal forms (e.g. demonstratives like \"この\").",
  },
  {
    pos1: "感動詞",
    reason: "Interjections/onomatopoeia-style exclamations.",
  },
  {
    pos1: "接続詞",
    reason: "Conjunctions that connect clauses, usually not target vocab items.",
  },
  {
    pos1: "接頭詞",
    reason: "Prefixes/prefix-like grammatical elements.",
  },
] as const satisfies readonly JlptIgnoredPos1Entry[];

// Flat list of just the POS1 names, derived from the entries above so the
// two can never drift apart.
export const JLPT_IGNORED_MECAB_POS1 = JLPT_IGNORED_MECAB_POS1_ENTRIES.map(
  ({ pos1 }) => pos1,
);

View File

@@ -0,0 +1,23 @@
// Type-only names must be imported/re-exported with `import type`/`export type`
// so the file transpiles correctly under isolatedModules/verbatimModuleSyntax
// (a plain value re-export of a type fails single-file transpilation).
import type { JlptIgnoredPos1Entry } from "./jlpt-ignored-mecab-pos1";
import {
  JLPT_IGNORED_MECAB_POS1,
  JLPT_IGNORED_MECAB_POS1_ENTRIES,
} from "./jlpt-ignored-mecab-pos1";

export type { JlptIgnoredPos1Entry };
export { JLPT_IGNORED_MECAB_POS1_ENTRIES };

// Data-driven MeCab POS names (pos1) used for JLPT filtering.
export const JLPT_IGNORED_MECAB_POS1_LIST: readonly string[] =
  JLPT_IGNORED_MECAB_POS1;

// Set view of the list for O(1) membership checks in the hot tagging path.
const JLPT_IGNORED_MECAB_POS1_SET = new Set<string>(
  JLPT_IGNORED_MECAB_POS1_LIST,
);

// Returns the full entry list (pos1 + reason) for diagnostics/documentation.
export function getIgnoredPos1Entries(): readonly JlptIgnoredPos1Entry[] {
  return JLPT_IGNORED_MECAB_POS1_ENTRIES;
}

// True when a MeCab pos1 category is excluded from JLPT tagging.
export function shouldIgnoreJlptForMecabPos1(pos1: string): boolean {
  return JLPT_IGNORED_MECAB_POS1_SET.has(pos1);
}

View File

@@ -0,0 +1,168 @@
import * as fs from "fs";
import * as path from "path";
import type { JlptLevel } from "../../types";
// Options for building the JLPT vocabulary lookup service.
export interface JlptVocabLookupOptions {
  // Candidate dictionary directories, probed in order; the first directory
  // that yields at least one entry wins (see createJlptVocabularyLookupService).
  searchPaths: string[];
  // Sink for diagnostic messages about dictionary discovery and loading.
  log: (message: string) => void;
}

// One Yomitan term-meta bank file per JLPT level, as shipped by the bundled
// yomitan-jlpt-vocab dictionary (see the vendor/yomitan-jlpt-vocab submodule).
const JLPT_BANK_FILES: { level: JlptLevel; filename: string }[] = [
  { level: "N1", filename: "term_meta_bank_1.json" },
  { level: "N2", filename: "term_meta_bank_2.json" },
  { level: "N3", filename: "term_meta_bank_3.json" },
  { level: "N4", filename: "term_meta_bank_4.json" },
  { level: "N5", filename: "term_meta_bank_5.json" },
];

// Duplicate-term tie-break: higher number wins, so N1 (hardest) takes
// precedence over easier levels when a term appears in multiple banks
// (applied in addEntriesToMap).
const JLPT_LEVEL_PRECEDENCE: Record<JlptLevel, number> = {
  N1: 5,
  N2: 4,
  N3: 3,
  N4: 2,
  N5: 1,
};
// Fallback lookup returned when no usable dictionary was found; it matches
// nothing so callers never need to special-case the "no dictionary" state.
const NOOP_LOOKUP = function (): null {
  return null;
};

// Canonical form of a term before map insertion or lookup (whitespace-trimmed).
const normalizeJlptTerm = (value: string): string => value.trim();
// True when a raw term-meta entry carries a `frequency` object with its own
// `displayValue` property — the marker the JLPT banks use for real entries.
// Prototype-inherited properties deliberately do not count.
function hasFrequencyDisplayValue(meta: unknown): boolean {
  if (typeof meta !== "object" || meta === null) {
    return false;
  }
  const { frequency } = meta as { frequency?: unknown };
  if (typeof frequency !== "object" || frequency === null) {
    return false;
  }
  return Object.prototype.hasOwnProperty.call(
    frequency as Record<string, unknown>,
    "displayValue",
  );
}
// Folds one bank's raw JSON entries into the shared term → level map.
// Entries are Yomitan term-meta tuples [term, id, meta]; anything malformed
// is skipped silently. On duplicate terms the harder level (higher
// precedence) wins, and a skipped update is logged so overlaps stay visible.
function addEntriesToMap(
  rawEntries: unknown,
  level: JlptLevel,
  terms: Map<string, JlptLevel>,
  log: (message: string) => void,
): void {
  if (!Array.isArray(rawEntries)) {
    return;
  }
  for (const rawEntry of rawEntries) {
    if (!Array.isArray(rawEntry)) {
      continue;
    }
    const term = rawEntry[0] as unknown;
    const meta = rawEntry[2] as unknown;
    if (typeof term !== "string") {
      continue;
    }
    const normalizedTerm = normalizeJlptTerm(term);
    // Reject empty terms and entries without a frequency displayValue marker.
    if (!normalizedTerm || !hasFrequencyDisplayValue(meta)) {
      continue;
    }
    const existingLevel = terms.get(normalizedTerm);
    const incomingWins =
      existingLevel === undefined ||
      JLPT_LEVEL_PRECEDENCE[level] > JLPT_LEVEL_PRECEDENCE[existingLevel];
    if (incomingWins) {
      terms.set(normalizedTerm, level);
    } else {
      log(
        `JLPT dictionary already has ${normalizedTerm} as ${existingLevel}; keeping that level instead of ${level}`,
      );
    }
  }
}
// Reads every known JLPT bank file under `dictionaryPath` and merges the
// entries into a single term → level map. Missing, unreadable, or malformed
// bank files are skipped; an empty map means the directory held no usable data.
function collectDictionaryFromPath(
  dictionaryPath: string,
  log: (message: string) => void,
): Map<string, JlptLevel> {
  const terms = new Map<string, JlptLevel>();
  for (const { level, filename } of JLPT_BANK_FILES) {
    const bankPath = path.join(dictionaryPath, filename);
    if (!fs.existsSync(bankPath)) {
      continue;
    }
    let rawEntries: unknown;
    try {
      // Read + parse in one guarded step; either failure skips this bank.
      rawEntries = JSON.parse(fs.readFileSync(bankPath, "utf-8")) as unknown;
    } catch {
      continue;
    }
    addEntriesToMap(rawEntries, level, terms, log);
  }
  return terms;
}
/**
 * Builds the JLPT term → level lookup function.
 *
 * Probes `options.searchPaths` in order; the first directory that yields at
 * least one dictionary entry backs the returned lookup closure. When no
 * usable dictionary is found, a no-op lookup (always null) is returned so
 * callers never need to special-case the missing-dictionary state.
 *
 * Cleanups vs. the original: `foundBankCount` was dead (it was only
 * incremented immediately before an early return, so the post-loop
 * `foundBankCount === 0` check was always true) and `attemptedPaths` merely
 * duplicated `options.searchPaths`; both are removed with identical log
 * output preserved.
 */
export async function createJlptVocabularyLookupService(
  options: JlptVocabLookupOptions,
): Promise<(term: string) => JlptLevel | null> {
  let foundDirectoryCount = 0;
  for (const dictionaryPath of options.searchPaths) {
    if (
      !fs.existsSync(dictionaryPath) ||
      !fs.statSync(dictionaryPath).isDirectory()
    ) {
      continue;
    }
    foundDirectoryCount += 1;
    const terms = collectDictionaryFromPath(dictionaryPath, options.log);
    if (terms.size > 0) {
      options.log(
        `JLPT dictionary loaded from ${dictionaryPath} (${terms.size} entries)`,
      );
      // Capture the loaded map in the lookup closure.
      return (term: string): JlptLevel | null => {
        if (!term) return null;
        const normalized = normalizeJlptTerm(term);
        return normalized ? terms.get(normalized) ?? null : null;
      };
    }
    options.log(
      `JLPT dictionary directory exists but contains no readable term_meta_bank_*.json files: ${dictionaryPath}`,
    );
  }
  options.log(
    `JLPT dictionary not found. Searched ${options.searchPaths.length} candidate path(s): ${options.searchPaths.join(", ")}`,
  );
  if (foundDirectoryCount > 0) {
    options.log(
      "JLPT dictionary directories found, but none contained valid term_meta_bank_*.json files.",
    );
  }
  // No-op fallback: never matches.
  return (): null => null;
}

View File

@@ -92,6 +92,7 @@ export async function runAppReadyRuntimeService(
): Promise<void> { ): Promise<void> {
deps.loadSubtitlePosition(); deps.loadSubtitlePosition();
deps.resolveKeybindings(); deps.resolveKeybindings();
await deps.createMecabTokenizerAndCheck();
deps.createMpvClient(); deps.createMpvClient();
deps.reloadConfig(); deps.reloadConfig();
@@ -117,7 +118,6 @@ export async function runAppReadyRuntimeService(
deps.log("mpv_websocket detected, skipping built-in WebSocket server"); deps.log("mpv_websocket detected, skipping built-in WebSocket server");
} }
await deps.createMecabTokenizerAndCheck();
deps.createSubtitleTimingTracker(); deps.createSubtitleTimingTracker();
await deps.loadYomitanExtension(); await deps.loadYomitanExtension();

View File

@@ -21,6 +21,7 @@ function makeDeps(
setYomitanParserInitPromise: () => {}, setYomitanParserInitPromise: () => {},
isKnownWord: () => false, isKnownWord: () => false,
getKnownWordMatchMode: () => "headword", getKnownWordMatchMode: () => "headword",
getJlptLevel: () => null,
tokenizeWithMecab: async () => null, tokenizeWithMecab: async () => null,
...overrides, ...overrides,
}; };
@@ -43,10 +44,171 @@ function makeDepsFromMecabTokenizer(
getMecabTokenizer: () => ({ getMecabTokenizer: () => ({
tokenize, tokenize,
}), }),
getJlptLevel: () => null,
...overrides, ...overrides,
}); });
} }
// Yomitan-parser happy path: a token matched by getJlptLevel ("猫" → N5)
// carries that level through to the resulting token.
test("tokenizeSubtitleService assigns JLPT level to parsed Yomitan tokens", async () => {
  const result = await tokenizeSubtitleService(
    "猫です",
    makeDeps({
      getYomitanExt: () => ({ id: "dummy-ext" } as any),
      getYomitanParserWindow: () => ({
        isDestroyed: () => false,
        webContents: {
          // Stubbed Yomitan parse result: one line with two segments.
          executeJavaScript: async () => [
            {
              source: "scanning-parser",
              index: 0,
              content: [
                [
                  {
                    text: "猫",
                    reading: "ねこ",
                    headwords: [[{ term: "猫" }]],
                  },
                  {
                    text: "です",
                    reading: "です",
                    headwords: [[{ term: "です" }]],
                  },
                ],
              ],
            },
          ],
        },
      } as unknown as Electron.BrowserWindow),
      tokenizeWithMecab: async () => null,
      getJlptLevel: (text) => (text === "猫" ? "N5" : null),
    }),
  );
  assert.equal(result.tokens?.length, 1);
  assert.equal(result.tokens?.[0]?.jlptLevel, "N5");
});

// Term-based exclusion: demonstratives such as "この" get no JLPT level even
// though the lookup would return one for them.
test("tokenizeSubtitleService skips JLPT level for excluded demonstratives", async () => {
  const result = await tokenizeSubtitleService(
    "この",
    makeDeps({
      getYomitanExt: () => ({ id: "dummy-ext" } as any),
      getYomitanParserWindow: () => ({
        isDestroyed: () => false,
        webContents: {
          executeJavaScript: async () => [
            {
              source: "scanning-parser",
              index: 0,
              content: [
                [
                  {
                    text: "この",
                    reading: "この",
                    headwords: [[{ term: "この" }]],
                  },
                ],
              ],
            },
          ],
        },
      } as unknown as Electron.BrowserWindow),
      tokenizeWithMecab: async () => null,
      getJlptLevel: (text) => (text === "この" ? "N5" : null),
    }),
  );
  assert.equal(result.tokens?.length, 1);
  assert.equal(result.tokens?.[0]?.jlptLevel, undefined);
});

// Term-based exclusion: repeated-kana interjections/SFX ("ああ") get no
// JLPT level even when the lookup would match.
test("tokenizeSubtitleService skips JLPT level for repeated kana SFX", async () => {
  const result = await tokenizeSubtitleService(
    "ああ",
    makeDeps({
      getYomitanExt: () => ({ id: "dummy-ext" } as any),
      getYomitanParserWindow: () => ({
        isDestroyed: () => false,
        webContents: {
          executeJavaScript: async () => [
            {
              source: "scanning-parser",
              index: 0,
              content: [
                [
                  {
                    text: "ああ",
                    reading: "ああ",
                    headwords: [[{ term: "ああ" }]],
                  },
                ],
              ],
            },
          ],
        },
      } as unknown as Electron.BrowserWindow),
      tokenizeWithMecab: async () => null,
      getJlptLevel: (text) => (text === "ああ" ? "N5" : null),
    }),
  );
  assert.equal(result.tokens?.length, 1);
  assert.equal(result.tokens?.[0]?.jlptLevel, undefined);
});

// MeCab fallback path: a lookup hit ("猫" → N4) is attached to the mecab token.
test("tokenizeSubtitleService assigns JLPT level to mecab tokens", async () => {
  const result = await tokenizeSubtitleService(
    "猫です",
    makeDepsFromMecabTokenizer(async () => [
      {
        word: "猫",
        partOfSpeech: PartOfSpeech.noun,
        pos1: "",
        pos2: "",
        pos3: "",
        pos4: "",
        inflectionType: "",
        inflectionForm: "",
        headword: "猫",
        katakanaReading: "ネコ",
        pronunciation: "ネコ",
      },
    ], {
      getJlptLevel: (text) => (text === "猫" ? "N4" : null),
    }),
  );
  assert.equal(result.tokens?.length, 1);
  assert.equal(result.tokens?.[0]?.jlptLevel, "N4");
});

// MeCab POS filtering: particles (pos1 "助詞") are ineligible for tagging,
// so the token keeps its POS data but gets no JLPT level.
test("tokenizeSubtitleService skips JLPT level for mecab tokens marked as ineligible", async () => {
  const result = await tokenizeSubtitleService(
    "は",
    makeDepsFromMecabTokenizer(async () => [
      {
        word: "は",
        partOfSpeech: PartOfSpeech.particle,
        pos1: "助詞",
        pos2: "",
        pos3: "",
        pos4: "",
        inflectionType: "",
        inflectionForm: "",
        headword: "は",
        katakanaReading: "ハ",
        pronunciation: "ハ",
      },
    ], {
      getJlptLevel: (text) => (text === "は" ? "N5" : null),
    }),
  );
  assert.equal(result.tokens?.length, 1);
  assert.equal(result.tokens?.[0]?.pos1, "助詞");
  assert.equal(result.tokens?.[0]?.jlptLevel, undefined);
});
test("tokenizeSubtitleService returns null tokens for empty normalized text", async () => { test("tokenizeSubtitleService returns null tokens for empty normalized text", async () => {
const result = await tokenizeSubtitleService(" \\n ", makeDeps()); const result = await tokenizeSubtitleService(" \\n ", makeDeps());
assert.deepEqual(result, { text: " \\n ", tokens: null }); assert.deepEqual(result, { text: " \\n ", tokens: null });

View File

@@ -1,20 +1,23 @@
import { BrowserWindow, Extension, session } from "electron"; import { BrowserWindow, Extension, session } from "electron";
import { markNPlusOneTargets, mergeTokens } from "../../token-merger"; import { markNPlusOneTargets, mergeTokens } from "../../token-merger";
import { import {
JlptLevel,
MergedToken, MergedToken,
NPlusOneMatchMode, NPlusOneMatchMode,
PartOfSpeech, PartOfSpeech,
SubtitleData, SubtitleData,
Token, Token,
} from "../../types"; } from "../../types";
import { shouldIgnoreJlptForMecabPos1 } from "./jlpt-token-filter-config";
import { shouldIgnoreJlptByTerm } from "./jlpt-excluded-terms";
interface YomitanParseHeadword { interface YomitanParseHeadword {
term?: unknown; term?: unknown;
} }
interface YomitanParseSegment { interface YomitanParseSegment {
text?: unknown; text?: string;
reading?: unknown; reading?: string;
headwords?: unknown; headwords?: unknown;
} }
@@ -24,6 +27,20 @@ interface YomitanParseResultItem {
content?: unknown; content?: unknown;
} }
type YomitanParseLine = YomitanParseSegment[];

// Katakana in U+30A1–U+30F6 maps onto hiragana by subtracting this offset.
const KATAKANA_TO_HIRAGANA_OFFSET = 0x60;
const KATAKANA_CODEPOINT_START = 0x30a1;
const KATAKANA_CODEPOINT_END = 0x30f6;

/** Narrows an unknown value to a non-null, non-function object. */
function isObject(value: unknown): value is Record<string, unknown> {
  return value !== null && typeof value === "object";
}

/** Narrows an unknown value to a string. */
function isString(value: unknown): value is string {
  return typeof value === "string";
}
export interface TokenizerServiceDeps { export interface TokenizerServiceDeps {
getYomitanExt: () => Extension | null; getYomitanExt: () => Extension | null;
getYomitanParserWindow: () => BrowserWindow | null; getYomitanParserWindow: () => BrowserWindow | null;
@@ -34,6 +51,8 @@ export interface TokenizerServiceDeps {
setYomitanParserInitPromise: (promise: Promise<boolean> | null) => void; setYomitanParserInitPromise: (promise: Promise<boolean> | null) => void;
isKnownWord: (text: string) => boolean; isKnownWord: (text: string) => boolean;
getKnownWordMatchMode: () => NPlusOneMatchMode; getKnownWordMatchMode: () => NPlusOneMatchMode;
getJlptLevel: (text: string) => JlptLevel | null;
getJlptEnabled?: () => boolean;
tokenizeWithMecab: (text: string) => Promise<MergedToken[] | null>; tokenizeWithMecab: (text: string) => Promise<MergedToken[] | null>;
} }
@@ -51,6 +70,8 @@ export interface TokenizerDepsRuntimeOptions {
setYomitanParserInitPromise: (promise: Promise<boolean> | null) => void; setYomitanParserInitPromise: (promise: Promise<boolean> | null) => void;
isKnownWord: (text: string) => boolean; isKnownWord: (text: string) => boolean;
getKnownWordMatchMode: () => NPlusOneMatchMode; getKnownWordMatchMode: () => NPlusOneMatchMode;
getJlptLevel: (text: string) => JlptLevel | null;
getJlptEnabled?: () => boolean;
getMecabTokenizer: () => MecabTokenizerLike | null; getMecabTokenizer: () => MecabTokenizerLike | null;
} }
@@ -67,6 +88,8 @@ export function createTokenizerDepsRuntimeService(
setYomitanParserInitPromise: options.setYomitanParserInitPromise, setYomitanParserInitPromise: options.setYomitanParserInitPromise,
isKnownWord: options.isKnownWord, isKnownWord: options.isKnownWord,
getKnownWordMatchMode: options.getKnownWordMatchMode, getKnownWordMatchMode: options.getKnownWordMatchMode,
getJlptLevel: options.getJlptLevel,
getJlptEnabled: options.getJlptEnabled,
tokenizeWithMecab: async (text) => { tokenizeWithMecab: async (text) => {
const mecabTokenizer = options.getMecabTokenizer(); const mecabTokenizer = options.getMecabTokenizer();
if (!mecabTokenizer) { if (!mecabTokenizer) {
@@ -112,19 +135,205 @@ function applyKnownWordMarking(
}); });
} }
/**
 * Picks the text used for JLPT dictionary lookups: the dictionary headword
 * when present, then the reading, falling back to the raw surface form.
 */
function resolveJlptLookupText(token: MergedToken): string {
  return token.headword || token.reading || token.surface;
}
/**
 * Normalizes text before consulting the JLPT exclusion list: trims
 * surrounding whitespace and folds katakana (U+30A1–U+30F6) to hiragana so
 * exclusion entries only need to be listed once, in hiragana.
 */
function normalizeJlptTextForExclusion(text: string): string {
  const trimmed = text.trim();
  if (!trimmed) {
    return "";
  }
  return [...trimmed]
    .map((char) => {
      const code = char.codePointAt(0);
      // Katakana block folds to hiragana by subtracting 0x60.
      if (code !== undefined && code >= 0x30a1 && code <= 0x30f6) {
        return String.fromCodePoint(code - 0x60);
      }
      return char;
    })
    .join("");
}
// Kana ranges recognised by the SFX filter: hiragana (U+3041–U+3096),
// kana marks (U+309B–U+309F), katakana (U+30A0–U+30FA) and katakana
// iteration marks (U+30FD–U+30FF). Note the prolonged sound mark U+30FC
// falls outside these ranges and is therefore NOT treated as kana here.
const KANA_CHAR_PATTERN =
  /^[\u3041-\u3096\u309b-\u309f\u30a0-\u30fa\u30fd-\u30ff]/u;

/** True when the first code point of `char` is in the kana ranges above. */
function isKanaChar(char: string): boolean {
  return KANA_CHAR_PATTERN.test(char);
}

/**
 * Detects repeated-kana speech-like tokens (e.g. 「ああああ」, 「ははは」, 「うーん」 style patterns)
 * so they are not JLPT-labeled when they are mostly expressive particles/sfx.
 */
function isRepeatedKanaSfx(text: string): boolean {
  const chars = [...text.trim()];
  if (chars.length === 0 || !chars.every(isKanaChar)) {
    return false;
  }

  const counts = new Map<string, number>();
  for (const char of chars) {
    counts.set(char, (counts.get(char) ?? 0) + 1);
  }
  const hasAdjacentRepeat = chars.some(
    (char, i) => i > 0 && char === chars[i - 1],
  );
  const topCount = Math.max(...counts.values());

  if (chars.length <= 2) {
    // One or two characters: any repetition at all counts as SFX.
    return hasAdjacentRepeat || topCount >= 2;
  }
  if (hasAdjacentRepeat) {
    return true;
  }
  // No adjacent repeat: still SFX when one character dominates, i.e. fills
  // at least half of the positions (rounded up).
  return topCount >= Math.ceil(chars.length / 2);
}
/**
 * Decides whether a token may receive a JLPT level tag.
 *
 * A token is ineligible when:
 * - its MeCab pos1 is on the configured ignore list (e.g. particles), or
 * - any text candidate (preferred lookup text, surface, reading, headword)
 *   is on the per-term exclusion list or looks like repeated-kana SFX.
 * Each candidate is checked both raw (trimmed) and katakana-folded.
 */
function isJlptEligibleToken(token: MergedToken): boolean {
  if (token.pos1 && shouldIgnoreJlptForMecabPos1(token.pos1)) {
    return false;
  }
  const candidates = [
    resolveJlptLookupText(token),
    token.surface,
    token.reading,
    token.headword,
  ].filter(
    (candidate): candidate is string =>
      typeof candidate === "string" && candidate.length > 0,
  );
  return !candidates.some((candidate) => {
    const normalizedCandidate = normalizeJlptTextForExclusion(candidate);
    if (!normalizedCandidate) {
      // Whitespace-only candidate: nothing to check.
      return false;
    }
    const trimmedCandidate = candidate.trim();
    return (
      shouldIgnoreJlptByTerm(trimmedCandidate) ||
      shouldIgnoreJlptByTerm(normalizedCandidate) ||
      isRepeatedKanaSfx(candidate) ||
      isRepeatedKanaSfx(normalizedCandidate)
    );
  });
}
/**
 * Type guard for one entry of Yomitan's parse output: an object whose
 * `source` is exactly "scanning-parser" and whose `content` is an array.
 */
function isYomitanParseResultItem(
  value: unknown,
): value is YomitanParseResultItem {
  if (!value || typeof value !== "object") {
    return false;
  }
  const candidate = value as YomitanParseResultItem;
  return (
    candidate.source === "scanning-parser" && Array.isArray(candidate.content)
  );
}
/**
 * Type guard for one parsed line: an array whose every entry is an object
 * carrying a string `text` field (the shape Yomitan's scanning parser emits).
 */
function isYomitanParseLine(value: unknown): value is YomitanParseLine {
  return (
    Array.isArray(value) &&
    value.every(
      (segment) =>
        segment !== null &&
        typeof segment === "object" &&
        typeof (segment as YomitanParseSegment).text === "string",
    )
  );
}
/**
 * Type guard for the headwords payload: an array of groups where every item
 * of every group is an object with a string `term`.
 */
function isYomitanHeadwordRows(
  value: unknown,
): value is YomitanParseHeadword[][] {
  if (!Array.isArray(value)) {
    return false;
  }
  return value.every(
    (group) =>
      Array.isArray(group) &&
      group.every(
        (item) =>
          item !== null &&
          typeof item === "object" &&
          typeof (item as YomitanParseHeadword).term === "string",
      ),
  );
}
function extractYomitanHeadword(segment: YomitanParseSegment): string { function extractYomitanHeadword(segment: YomitanParseSegment): string {
const headwords = segment.headwords; const headwords = segment.headwords;
if (!Array.isArray(headwords) || headwords.length === 0) { if (!isYomitanHeadwordRows(headwords)) {
return ""; return "";
} }
const firstGroup = headwords[0]; for (const group of headwords) {
if (!Array.isArray(firstGroup) || firstGroup.length === 0) { if (group.length > 0) {
return ""; const firstHeadword = group[0] as YomitanParseHeadword;
if (isString(firstHeadword?.term)) {
return firstHeadword.term;
}
}
} }
const firstHeadword = firstGroup[0] as YomitanParseHeadword; return "";
return typeof firstHeadword?.term === "string" ? firstHeadword.term : ""; }
function applyJlptMarking(
tokens: MergedToken[],
getJlptLevel: (text: string) => JlptLevel | null,
): MergedToken[] {
return tokens.map((token) => {
if (!isJlptEligibleToken(token)) {
return { ...token, jlptLevel: undefined };
}
const primaryLevel = getJlptLevel(resolveJlptLookupText(token));
const fallbackLevel = getJlptLevel(token.surface);
return {
...token,
jlptLevel: primaryLevel ?? fallbackLevel ?? token.jlptLevel,
};
});
} }
function mapYomitanParseResultsToMergedTokens( function mapYomitanParseResultsToMergedTokens(
@@ -136,14 +345,9 @@ function mapYomitanParseResultsToMergedTokens(
return null; return null;
} }
const scanningItems = parseResults.filter((item) => { const scanningItems = parseResults.filter(
const resultItem = item as YomitanParseResultItem; (item): item is YomitanParseResultItem => isYomitanParseResultItem(item),
return ( );
resultItem &&
resultItem.source === "scanning-parser" &&
Array.isArray(resultItem.content)
);
}) as YomitanParseResultItem[];
if (scanningItems.length === 0) { if (scanningItems.length === 0) {
return null; return null;
@@ -158,24 +362,21 @@ function mapYomitanParseResultsToMergedTokens(
const tokens: MergedToken[] = []; const tokens: MergedToken[] = [];
let charOffset = 0; let charOffset = 0;
let validLineCount = 0;
for (const line of content) { for (const line of content) {
if (!Array.isArray(line)) { if (!isYomitanParseLine(line)) {
continue; continue;
} }
validLineCount += 1;
let surface = ""; let surface = "";
let reading = ""; let reading = "";
let headword = ""; let headword = "";
for (const rawSegment of line) { for (const segment of line) {
const segment = rawSegment as YomitanParseSegment;
if (!segment || typeof segment !== "object") {
continue;
}
const segmentText = segment.text; const segmentText = segment.text;
if (typeof segmentText !== "string" || segmentText.length === 0) { if (!segmentText || segmentText.length === 0) {
continue; continue;
} }
@@ -205,6 +406,7 @@ function mapYomitanParseResultsToMergedTokens(
startPos: start, startPos: start,
endPos: end, endPos: end,
partOfSpeech: PartOfSpeech.other, partOfSpeech: PartOfSpeech.other,
pos1: "",
isMerged: true, isMerged: true,
isNPlusOneTarget: false, isNPlusOneTarget: false,
isKnown: (() => { isKnown: (() => {
@@ -218,9 +420,108 @@ function mapYomitanParseResultsToMergedTokens(
}); });
} }
if (validLineCount === 0) {
return null;
}
return tokens.length > 0 ? tokens : null; return tokens.length > 0 ? tokens : null;
} }
/**
 * Returns the pos1 tag of the MeCab token whose character span best matches
 * `token`'s span, or undefined when no MeCab token with a pos1 overlaps it.
 *
 * Best match = greatest overlap; ties are broken by the longer MeCab token
 * span, then by the earlier start position.
 */
function pickClosestMecabPos1(
  token: MergedToken,
  mecabTokens: MergedToken[],
): string | undefined {
  const tokenStart = token.startPos ?? 0;
  const tokenEnd = token.endPos ?? tokenStart + token.surface.length;

  let best:
    | { pos1: string; overlap: number; span: number; start: number }
    | null = null;

  for (const mecabToken of mecabTokens) {
    if (!mecabToken.pos1) {
      continue;
    }
    const start = mecabToken.startPos ?? 0;
    const end = mecabToken.endPos ?? start + mecabToken.surface.length;
    const overlap = Math.min(tokenEnd, end) - Math.max(tokenStart, start);
    if (overlap <= 0) {
      continue;
    }
    const span = end - start;
    const beatsCurrent =
      !best ||
      overlap > best.overlap ||
      (overlap === best.overlap &&
        (span > best.span || (span === best.span && start < best.start)));
    if (beatsCurrent) {
      best = { pos1: mecabToken.pos1, overlap, span, start };
    }
  }

  return best?.pos1;
}
/**
 * Backfills missing pos1 tags on Yomitan tokens using a MeCab pass over the
 * same text. Best-effort: if MeCab throws or returns nothing, the Yomitan
 * tokens are returned untouched (with a warning) rather than dropped.
 */
async function enrichYomitanPos1(
  tokens: MergedToken[],
  deps: TokenizerServiceDeps,
  text: string,
): Promise<MergedToken[]> {
  if (!tokens || tokens.length === 0) {
    return tokens;
  }

  let mecabTokens: MergedToken[] | null = null;
  try {
    mecabTokens = await deps.tokenizeWithMecab(text);
  } catch (err) {
    console.warn(
      "Failed to enrich Yomitan tokens with MeCab POS:",
      (err as Error).message,
      `tokenCount=${tokens.length}`,
      `textLength=${text.length}`,
    );
    return tokens;
  }

  if (!mecabTokens || mecabTokens.length === 0) {
    console.warn(
      "MeCab enrichment returned no tokens; preserving Yomitan token output.",
      `tokenCount=${tokens.length}`,
      `textLength=${text.length}`,
    );
    return tokens;
  }

  const resolved = mecabTokens;
  return tokens.map((token) => {
    // Only fill in pos1 where Yomitan left it blank.
    if (token.pos1) {
      return token;
    }
    const pos1 = pickClosestMecabPos1(token, resolved);
    return pos1 ? { ...token, pos1 } : token;
  });
}
async function ensureYomitanParserWindow( async function ensureYomitanParserWindow(
deps: TokenizerServiceDeps, deps: TokenizerServiceDeps,
): Promise<boolean> { ): Promise<boolean> {
@@ -356,11 +657,16 @@ async function parseWithYomitanInternalParser(
script, script,
true, true,
); );
return mapYomitanParseResultsToMergedTokens( const yomitanTokens = mapYomitanParseResultsToMergedTokens(
parseResults, parseResults,
deps.isKnownWord, deps.isKnownWord,
deps.getKnownWordMatchMode(), deps.getKnownWordMatchMode(),
); );
if (!yomitanTokens || yomitanTokens.length === 0) {
return null;
}
return enrichYomitanPos1(yomitanTokens, deps, text);
} catch (err) { } catch (err) {
console.error("Yomitan parser request failed:", (err as Error).message); console.error("Yomitan parser request failed:", (err as Error).message);
return null; return null;
@@ -385,6 +691,7 @@ export async function tokenizeSubtitleService(
.replace(/\n/g, " ") .replace(/\n/g, " ")
.replace(/\s+/g, " ") .replace(/\s+/g, " ")
.trim(); .trim();
const jlptEnabled = deps.getJlptEnabled?.() !== false;
const yomitanTokens = await parseWithYomitanInternalParser(tokenizeText, deps); const yomitanTokens = await parseWithYomitanInternalParser(tokenizeText, deps);
if (yomitanTokens && yomitanTokens.length > 0) { if (yomitanTokens && yomitanTokens.length > 0) {
@@ -393,7 +700,10 @@ export async function tokenizeSubtitleService(
deps.isKnownWord, deps.isKnownWord,
deps.getKnownWordMatchMode(), deps.getKnownWordMatchMode(),
); );
return { text: displayText, tokens: markNPlusOneTargets(knownMarkedTokens) }; const jlptMarkedTokens = jlptEnabled
? applyJlptMarking(knownMarkedTokens, deps.getJlptLevel)
: knownMarkedTokens.map((token) => ({ ...token, jlptLevel: undefined }));
return { text: displayText, tokens: markNPlusOneTargets(jlptMarkedTokens) };
} }
try { try {
@@ -404,7 +714,10 @@ export async function tokenizeSubtitleService(
deps.isKnownWord, deps.isKnownWord,
deps.getKnownWordMatchMode(), deps.getKnownWordMatchMode(),
); );
return { text: displayText, tokens: markNPlusOneTargets(knownMarkedTokens) }; const jlptMarkedTokens = jlptEnabled
? applyJlptMarking(knownMarkedTokens, deps.getJlptLevel)
: knownMarkedTokens.map((token) => ({ ...token, jlptLevel: undefined }));
return { text: displayText, tokens: markNPlusOneTargets(jlptMarkedTokens) };
} }
} catch (err) { } catch (err) {
console.error("Tokenization error:", (err as Error).message); console.error("Tokenization error:", (err as Error).message);

View File

@@ -59,6 +59,7 @@ export async function loadYomitanExtensionService(
deps: YomitanExtensionLoaderDeps, deps: YomitanExtensionLoaderDeps,
): Promise<Extension | null> { ): Promise<Extension | null> {
const searchPaths = [ const searchPaths = [
path.join(__dirname, "..", "..", "vendor", "yomitan"),
path.join(__dirname, "..", "..", "..", "vendor", "yomitan"), path.join(__dirname, "..", "..", "..", "vendor", "yomitan"),
path.join(process.resourcesPath, "yomitan"), path.join(process.resourcesPath, "yomitan"),
"/usr/share/SubMiner/yomitan", "/usr/share/SubMiner/yomitan",

View File

@@ -95,6 +95,7 @@ import {
createOverlayContentMeasurementStoreService, createOverlayContentMeasurementStoreService,
createOverlayWindowService, createOverlayWindowService,
createTokenizerDepsRuntimeService, createTokenizerDepsRuntimeService,
createJlptVocabularyLookupService,
cycleSecondarySubModeService, cycleSecondarySubModeService,
enforceOverlayLayerOrderService, enforceOverlayLayerOrderService,
ensureOverlayWindowLevelService, ensureOverlayWindowLevelService,
@@ -227,6 +228,8 @@ const isDev =
process.argv.includes("--dev") || process.argv.includes("--debug"); process.argv.includes("--dev") || process.argv.includes("--debug");
const texthookerService = new TexthookerService(); const texthookerService = new TexthookerService();
const subtitleWsService = new SubtitleWebSocketService(); const subtitleWsService = new SubtitleWebSocketService();
let jlptDictionaryLookupInitialized = false;
let jlptDictionaryLookupInitialization: Promise<void> | null = null;
const appLogger = { const appLogger = {
logInfo: (message: string) => { logInfo: (message: string) => {
console.log(message); console.log(message);
@@ -464,6 +467,73 @@ function loadSubtitlePosition(): SubtitlePosition | null {
return appState.subtitlePosition; return appState.subtitlePosition;
} }
/**
 * Builds the ordered, de-duplicated list of directories to probe for the
 * bundled JLPT vocabulary dictionaries: dev checkouts, packaged Electron
 * resource layouts, user config directories, and finally the CWD.
 */
function getJlptDictionarySearchPaths(): string[] {
  const homeDir = os.homedir();
  const dictionaryRoots = [
    // Development/runtime source trees where the repo is checked out.
    path.join(__dirname, "..", "..", "vendor", "yomitan-jlpt-vocab"),
    path.join(app.getAppPath(), "vendor", "yomitan-jlpt-vocab"),
    // Packaged app resources (Electron build output layout).
    path.join(process.resourcesPath, "yomitan-jlpt-vocab"),
    path.join(process.resourcesPath, "app.asar", "vendor", "yomitan-jlpt-vocab"),
    // User override/config directories for manually installed dictionaries.
    USER_DATA_PATH,
    app.getPath("userData"),
    path.join(homeDir, ".config", "SubMiner"),
    path.join(homeDir, ".config", "subminer"),
    path.join(homeDir, "Library", "Application Support", "SubMiner"),
    path.join(homeDir, "Library", "Application Support", "subminer"),
    // Last-resort fallback: current working directory (local CLI/test runs).
    process.cwd(),
  ];

  // Probe each root as-is plus two nested layouts; the Set keeps the first
  // occurrence of each path so the probe order stays stable.
  const searchPaths = dictionaryRoots.flatMap((dictionaryRoot) => [
    dictionaryRoot,
    path.join(dictionaryRoot, "vendor", "yomitan-jlpt-vocab"),
    path.join(dictionaryRoot, "yomitan-jlpt-vocab"),
  ]);
  return [...new Set(searchPaths)];
}
/**
 * Loads the JLPT vocabulary dictionaries and installs the resulting lookup
 * function on app state so the tokenizer can tag tokens with JLPT levels.
 */
async function initializeJlptDictionaryLookup(): Promise<void> {
  appState.jlptLevelLookup = await createJlptVocabularyLookupService({
    searchPaths: getJlptDictionarySearchPaths(),
    log: (message) => {
      console.log(`[JLPT] ${message}`);
    },
  });
}

/**
 * Lazily initializes the JLPT lookup at most once per process, and only
 * when `subtitleStyle.enableJlpt` is set. Concurrent callers await the same
 * in-flight promise; on failure the cached promise is cleared so a later
 * call can retry, and the error propagates to the current caller.
 */
async function ensureJlptDictionaryLookup(): Promise<void> {
  // Feature is opt-in; skip all work when disabled in config.
  if (!getResolvedConfig().subtitleStyle.enableJlpt) {
    return;
  }
  if (jlptDictionaryLookupInitialized) {
    return;
  }
  if (!jlptDictionaryLookupInitialization) {
    jlptDictionaryLookupInitialization = initializeJlptDictionaryLookup()
      .then(() => {
        jlptDictionaryLookupInitialized = true;
      })
      .catch((error) => {
        // Reset so the next subtitle retries initialization after a failure.
        jlptDictionaryLookupInitialization = null;
        throw error;
      });
  }
  await jlptDictionaryLookupInitialization;
}
function saveSubtitlePosition(position: SubtitlePosition): void { function saveSubtitlePosition(position: SubtitlePosition): void {
appState.subtitlePosition = position; appState.subtitlePosition = position;
saveSubtitlePositionService({ saveSubtitlePositionService({
@@ -804,6 +874,7 @@ function updateMpvSubtitleRenderMetrics(
} }
async function tokenizeSubtitle(text: string): Promise<SubtitleData> { async function tokenizeSubtitle(text: string): Promise<SubtitleData> {
await ensureJlptDictionaryLookup();
return tokenizeSubtitleService( return tokenizeSubtitleService(
text, text,
createTokenizerDepsRuntimeService({ createTokenizerDepsRuntimeService({
@@ -825,6 +896,9 @@ async function tokenizeSubtitle(text: string): Promise<SubtitleData> {
getKnownWordMatchMode: () => getKnownWordMatchMode: () =>
appState.ankiIntegration?.getKnownWordMatchMode() ?? appState.ankiIntegration?.getKnownWordMatchMode() ??
getResolvedConfig().ankiConnect.nPlusOne.matchMode, getResolvedConfig().ankiConnect.nPlusOne.matchMode,
getJlptLevel: (text) => appState.jlptLevelLookup(text),
getJlptEnabled: () =>
getResolvedConfig().subtitleStyle.enableJlpt,
getMecabTokenizer: () => appState.mecabTokenizer, getMecabTokenizer: () => appState.mecabTokenizer,
}), }),
); );
@@ -1345,6 +1419,7 @@ registerIpcRuntimeServices({
...resolvedConfig.subtitleStyle, ...resolvedConfig.subtitleStyle,
nPlusOneColor: resolvedConfig.ankiConnect.nPlusOne.nPlusOne, nPlusOneColor: resolvedConfig.ankiConnect.nPlusOne.nPlusOne,
knownWordColor: resolvedConfig.ankiConnect.nPlusOne.knownWord, knownWordColor: resolvedConfig.ankiConnect.nPlusOne.knownWord,
enableJlpt: resolvedConfig.subtitleStyle.enableJlpt,
}; };
}, },
saveSubtitlePosition: (position: unknown) => saveSubtitlePosition: (position: unknown) =>

View File

@@ -6,6 +6,7 @@ import type {
SecondarySubMode, SecondarySubMode,
SubtitlePosition, SubtitlePosition,
KikuFieldGroupingChoice, KikuFieldGroupingChoice,
JlptLevel,
} from "../types"; } from "../types";
import type { CliArgs } from "../cli/args"; import type { CliArgs } from "../cli/args";
import type { SubtitleTimingTracker } from "../subtitle-timing-tracker"; import type { SubtitleTimingTracker } from "../subtitle-timing-tracker";
@@ -53,6 +54,7 @@ export interface AppState {
backendOverride: string | null; backendOverride: string | null;
autoStartOverlay: boolean; autoStartOverlay: boolean;
texthookerOnlyMode: boolean; texthookerOnlyMode: boolean;
jlptLevelLookup: (term: string) => JlptLevel | null;
} }
export interface AppStateInitialValues { export interface AppStateInitialValues {
@@ -112,6 +114,7 @@ export function createAppState(values: AppStateInitialValues): AppState {
backendOverride: values.backendOverride ?? null, backendOverride: values.backendOverride ?? null,
autoStartOverlay: values.autoStartOverlay ?? false, autoStartOverlay: values.autoStartOverlay ?? false,
texthookerOnlyMode: values.texthookerOnlyMode ?? false, texthookerOnlyMode: values.texthookerOnlyMode ?? false,
jlptLevelLookup: () => null,
}; };
} }

View File

@@ -71,6 +71,11 @@ export type RendererState = {
knownWordColor: string; knownWordColor: string;
nPlusOneColor: string; nPlusOneColor: string;
jlptN1Color: string;
jlptN2Color: string;
jlptN3Color: string;
jlptN4Color: string;
jlptN5Color: string;
keybindingsMap: Map<string, (string | number)[]>; keybindingsMap: Map<string, (string | number)[]>;
chordPending: boolean; chordPending: boolean;
@@ -130,6 +135,11 @@ export function createRendererState(): RendererState {
knownWordColor: "#a6da95", knownWordColor: "#a6da95",
nPlusOneColor: "#c6a0f6", nPlusOneColor: "#c6a0f6",
jlptN1Color: "#ed8796",
jlptN2Color: "#f5a97f",
jlptN3Color: "#f9e2af",
jlptN4Color: "#a6e3a1",
jlptN5Color: "#8aadf4",
keybindingsMap: new Map(), keybindingsMap: new Map(),
chordPending: false, chordPending: false,

View File

@@ -250,6 +250,11 @@ body {
color: #cad3f5; color: #cad3f5;
--subtitle-known-word-color: #a6da95; --subtitle-known-word-color: #a6da95;
--subtitle-n-plus-one-color: #c6a0f6; --subtitle-n-plus-one-color: #c6a0f6;
--subtitle-jlpt-n1-color: #ed8796;
--subtitle-jlpt-n2-color: #f5a97f;
--subtitle-jlpt-n3-color: #f9e2af;
--subtitle-jlpt-n4-color: #a6e3a1;
--subtitle-jlpt-n5-color: #8aadf4;
text-shadow: text-shadow:
2px 2px 4px rgba(0, 0, 0, 0.8), 2px 2px 4px rgba(0, 0, 0, 0.8),
-1px -1px 2px rgba(0, 0, 0, 0.5); -1px -1px 2px rgba(0, 0, 0, 0.5);
@@ -296,6 +301,51 @@ body.settings-modal-open #subtitleContainer {
text-shadow: 0 0 6px rgba(198, 160, 246, 0.35); text-shadow: 0 0 6px rgba(198, 160, 246, 0.35);
} }
#subtitleRoot .word.word-jlpt-n1 {
color: inherit;
text-decoration-line: underline;
text-decoration-thickness: 2px;
text-underline-offset: 4px;
text-decoration-color: var(--subtitle-jlpt-n1-color, #ed8796);
text-decoration-style: solid;
}
#subtitleRoot .word.word-jlpt-n2 {
color: inherit;
text-decoration-line: underline;
text-decoration-thickness: 2px;
text-underline-offset: 4px;
text-decoration-color: var(--subtitle-jlpt-n2-color, #f5a97f);
text-decoration-style: solid;
}
#subtitleRoot .word.word-jlpt-n3 {
color: inherit;
text-decoration-line: underline;
text-decoration-thickness: 2px;
text-underline-offset: 4px;
text-decoration-color: var(--subtitle-jlpt-n3-color, #f9e2af);
text-decoration-style: solid;
}
#subtitleRoot .word.word-jlpt-n4 {
color: inherit;
text-decoration-line: underline;
text-decoration-thickness: 2px;
text-underline-offset: 4px;
text-decoration-color: var(--subtitle-jlpt-n4-color, #a6e3a1);
text-decoration-style: solid;
}
#subtitleRoot .word.word-jlpt-n5 {
color: inherit;
text-decoration-line: underline;
text-decoration-thickness: 2px;
text-underline-offset: 4px;
text-decoration-color: var(--subtitle-jlpt-n5-color, #8aadf4);
text-decoration-style: solid;
}
#subtitleRoot .word:hover { #subtitleRoot .word:hover {
background: rgba(255, 255, 255, 0.2); background: rgba(255, 255, 255, 0.2);
border-radius: 3px; border-radius: 3px;

View File

@@ -0,0 +1,80 @@
import test from "node:test";
import assert from "node:assert/strict";
import fs from "node:fs";
import path from "node:path";
import type { MergedToken } from "../types";
import { PartOfSpeech } from "../types.js";
import { computeWordClass } from "./subtitle-render.js";
/**
 * Builds a minimal MergedToken for tests: every required field is defaulted
 * and selected fields can be overridden.
 */
function createToken(overrides: Partial<MergedToken>): MergedToken {
  const base: MergedToken = {
    surface: "",
    reading: "",
    headword: "",
    startPos: 0,
    endPos: 0,
    partOfSpeech: PartOfSpeech.other,
    isMerged: true,
    isKnown: false,
    isNPlusOneTarget: false,
  };
  return { ...base, ...overrides };
}
/**
 * Extracts the declaration body (text between the braces) of the
 * `.word-jlpt-n{level}` rule from a stylesheet string. Returns "" when the
 * selector, opening brace, or closing brace cannot be found.
 */
function extractClassBlock(cssText: string, level: number): string {
  const selector = `#subtitleRoot .word.word-jlpt-n${level}`;
  const selectorIndex = cssText.indexOf(selector);
  if (selectorIndex < 0) {
    return "";
  }
  const bodyStart = cssText.indexOf("{", selectorIndex);
  if (bodyStart < 0) {
    return "";
  }
  const bodyEnd = cssText.indexOf("}", bodyStart);
  if (bodyEnd < 0) {
    return "";
  }
  return cssText.slice(bodyStart + 1, bodyEnd);
}
test("computeWordClass preserves known and n+1 classes while adding JLPT classes", () => {
const knownJlpt = createToken({
isKnown: true,
jlptLevel: "N1",
surface: "猫",
});
const nPlusOneJlpt = createToken({
isNPlusOneTarget: true,
jlptLevel: "N2",
surface: "犬",
});
assert.equal(computeWordClass(knownJlpt), "word word-known word-jlpt-n1");
assert.equal(
computeWordClass(nPlusOneJlpt),
"word word-n-plus-one word-jlpt-n2",
);
});
test("JLPT CSS rules use underline-only styling in renderer stylesheet", () => {
const distCssPath = path.join(process.cwd(), "dist", "renderer", "style.css");
const srcCssPath = path.join(process.cwd(), "src", "renderer", "style.css");
const cssPath = fs.existsSync(distCssPath)
? distCssPath
: srcCssPath;
if (!fs.existsSync(cssPath)) {
assert.fail(
"JLPT CSS file missing. Run `pnpm run build` first, or ensure src/renderer/style.css exists.",
);
}
const cssText = fs.readFileSync(cssPath, "utf-8");
for (let level = 1; level <= 5; level += 1) {
const block = extractClassBlock(cssText, level);
assert.ok(block.length > 0, `word-jlpt-n${level} class should exist`);
assert.match(block, /text-decoration-line:\s*underline;/);
assert.match(block, /text-decoration-thickness:\s*2px;/);
assert.match(block, /text-underline-offset:\s*4px;/);
assert.match(block, /color:\s*inherit;/);
}
});

View File

@@ -15,6 +15,15 @@ function normalizeSubtitle(text: string, trim = true): string {
return trim ? normalized.trim() : normalized; return trim ? normalized.trim() : normalized;
} }
// Accepts #RGB, #RGBA, #RRGGBB and #RRGGBBAA (case-insensitive hex digits).
const HEX_COLOR_PATTERN =
  /^#(?:[0-9a-fA-F]{3}|[0-9a-fA-F]{4}|[0-9a-fA-F]{6}|[0-9a-fA-F]{8})$/;

/**
 * Returns `value` trimmed when it is a valid CSS hex color string;
 * otherwise returns `fallback`. Guards against malformed colors arriving
 * in style payloads.
 */
function sanitizeHexColor(value: unknown, fallback: string): string {
  if (typeof value !== "string") {
    return fallback;
  }
  const trimmed = value.trim();
  return HEX_COLOR_PATTERN.test(trimmed) ? trimmed : fallback;
}
function renderWithTokens(root: HTMLElement, tokens: MergedToken[]): void { function renderWithTokens(root: HTMLElement, tokens: MergedToken[]): void {
const fragment = document.createDocumentFragment(); const fragment = document.createDocumentFragment();
@@ -50,16 +59,20 @@ function renderWithTokens(root: HTMLElement, tokens: MergedToken[]): void {
root.appendChild(fragment); root.appendChild(fragment);
} }
function computeWordClass(token: MergedToken): string { export function computeWordClass(token: MergedToken): string {
const classes = ["word"];
if (token.isNPlusOneTarget) { if (token.isNPlusOneTarget) {
return "word word-n-plus-one"; classes.push("word-n-plus-one");
} else if (token.isKnown) {
classes.push("word-known");
} }
if (token.isKnown) { if (token.jlptLevel) {
return "word word-known"; classes.push(`word-jlpt-${token.jlptLevel.toLowerCase()}`);
} }
return "word"; return classes.join(" ");
} }
function renderCharacterLevel(root: HTMLElement, text: string): void { function renderCharacterLevel(root: HTMLElement, text: string): void {
@@ -189,6 +202,22 @@ export function createSubtitleRenderer(ctx: RendererContext) {
style.knownWordColor ?? ctx.state.knownWordColor ?? "#a6da95"; style.knownWordColor ?? ctx.state.knownWordColor ?? "#a6da95";
const nPlusOneColor = const nPlusOneColor =
style.nPlusOneColor ?? ctx.state.nPlusOneColor ?? "#c6a0f6"; style.nPlusOneColor ?? ctx.state.nPlusOneColor ?? "#c6a0f6";
const jlptColors = {
N1: ctx.state.jlptN1Color ?? "#ed8796",
N2: ctx.state.jlptN2Color ?? "#f5a97f",
N3: ctx.state.jlptN3Color ?? "#f9e2af",
N4: ctx.state.jlptN4Color ?? "#a6e3a1",
N5: ctx.state.jlptN5Color ?? "#8aadf4",
...(style.jlptColors
? {
N1: sanitizeHexColor(style.jlptColors?.N1, ctx.state.jlptN1Color),
N2: sanitizeHexColor(style.jlptColors?.N2, ctx.state.jlptN2Color),
N3: sanitizeHexColor(style.jlptColors?.N3, ctx.state.jlptN3Color),
N4: sanitizeHexColor(style.jlptColors?.N4, ctx.state.jlptN4Color),
N5: sanitizeHexColor(style.jlptColors?.N5, ctx.state.jlptN5Color),
}
: {}),
};
ctx.state.knownWordColor = knownWordColor; ctx.state.knownWordColor = knownWordColor;
ctx.state.nPlusOneColor = nPlusOneColor; ctx.state.nPlusOneColor = nPlusOneColor;
@@ -197,6 +226,16 @@ export function createSubtitleRenderer(ctx: RendererContext) {
knownWordColor, knownWordColor,
); );
ctx.dom.subtitleRoot.style.setProperty("--subtitle-n-plus-one-color", nPlusOneColor); ctx.dom.subtitleRoot.style.setProperty("--subtitle-n-plus-one-color", nPlusOneColor);
ctx.state.jlptN1Color = jlptColors.N1;
ctx.state.jlptN2Color = jlptColors.N2;
ctx.state.jlptN3Color = jlptColors.N3;
ctx.state.jlptN4Color = jlptColors.N4;
ctx.state.jlptN5Color = jlptColors.N5;
ctx.dom.subtitleRoot.style.setProperty("--subtitle-jlpt-n1-color", jlptColors.N1);
ctx.dom.subtitleRoot.style.setProperty("--subtitle-jlpt-n2-color", jlptColors.N2);
ctx.dom.subtitleRoot.style.setProperty("--subtitle-jlpt-n3-color", jlptColors.N3);
ctx.dom.subtitleRoot.style.setProperty("--subtitle-jlpt-n4-color", jlptColors.N4);
ctx.dom.subtitleRoot.style.setProperty("--subtitle-jlpt-n5-color", jlptColors.N5);
const secondaryStyle = style.secondary; const secondaryStyle = style.secondary;
if (!secondaryStyle) return; if (!secondaryStyle) return;

View File

@@ -223,6 +223,7 @@ export function mergeTokens(
startPos: prev.startPos, startPos: prev.startPos,
endPos: end, endPos: end,
partOfSpeech: prev.partOfSpeech, partOfSpeech: prev.partOfSpeech,
pos1: prev.pos1 ?? token.pos1,
pos2: prev.pos2 ?? token.pos2, pos2: prev.pos2 ?? token.pos2,
pos3: prev.pos3 ?? token.pos3, pos3: prev.pos3 ?? token.pos3,
isMerged: true, isMerged: true,
@@ -245,6 +246,7 @@ export function mergeTokens(
startPos: start, startPos: start,
endPos: end, endPos: end,
partOfSpeech: token.partOfSpeech, partOfSpeech: token.partOfSpeech,
pos1: token.pos1,
pos2: token.pos2, pos2: token.pos2,
pos3: token.pos3, pos3: token.pos3,
isMerged: false, isMerged: false,

View File

@@ -48,13 +48,17 @@ export interface MergedToken {
startPos: number; startPos: number;
endPos: number; endPos: number;
partOfSpeech: PartOfSpeech; partOfSpeech: PartOfSpeech;
pos1?: string;
pos2?: string; pos2?: string;
pos3?: string; pos3?: string;
isMerged: boolean; isMerged: boolean;
isKnown: boolean; isKnown: boolean;
isNPlusOneTarget: boolean; isNPlusOneTarget: boolean;
jlptLevel?: JlptLevel;
} }
export type JlptLevel = "N1" | "N2" | "N3" | "N4" | "N5";
export interface WindowGeometry { export interface WindowGeometry {
x: number; x: number;
y: number; y: number;
@@ -262,6 +266,7 @@ export interface AnkiConnectConfig {
} }
export interface SubtitleStyleConfig { export interface SubtitleStyleConfig {
enableJlpt?: boolean;
fontFamily?: string; fontFamily?: string;
fontSize?: number; fontSize?: number;
fontColor?: string; fontColor?: string;
@@ -270,6 +275,13 @@ export interface SubtitleStyleConfig {
backgroundColor?: string; backgroundColor?: string;
nPlusOneColor?: string; nPlusOneColor?: string;
knownWordColor?: string; knownWordColor?: string;
jlptColors?: {
N1: string;
N2: string;
N3: string;
N4: string;
N5: string;
};
secondary?: { secondary?: {
fontFamily?: string; fontFamily?: string;
fontSize?: number; fontSize?: number;

1
vendor/yomitan-jlpt-vocab vendored Submodule