From 457e6f0f104c1cec90b326ff294de50be3fc8b57 Mon Sep 17 00:00:00 2001 From: kyasuda Date: Mon, 16 Feb 2026 17:41:24 -0800 Subject: [PATCH] feat(tokenizer): refine Yomitan grouping and parser tooling - map segmented Yomitan lines into single logical tokens and improve candidate selection heuristics - limit frequency lookup to selected token text with POS-based exclusions and add debug logging hook - add standalone Yomitan parser test script, deterministic utility-script shutdown, and docs/backlog updates --- README.md | 27 +- ...-Yomitan-parser-and-candidate-selection.md | 32 + ...quency-lookup-to-selected-headword-only.md | 29 + ...e-term-exclusions-from-frequency-lookup.md | 29 + ...y-scripts-exit-immediately-after-output.md | 29 + ...itan-splits-lookup-into-multiple-tokens.md | 48 ++ ...le-to-log-selected-Yomitan-token-groups.md | 52 ++ ...bug-logging-from-overlay-debug-mode-Y-D.md | 50 ++ docs/.vitepress/config.ts | 93 ++- docs/index.md | 64 +- package.json | 2 + scripts/get_frequency.ts | 314 ++++----- scripts/test-yomitan-parser.ts | 653 ++++++++++++++++++ src/core/services/tokenizer-service.test.ts | 326 ++++++++- src/core/services/tokenizer-service.ts | 210 +++--- src/main.ts | 1 + src/types.ts | 1 - 17 files changed, 1667 insertions(+), 293 deletions(-) create mode 100644 backlog/tasks/task-58 - Add-standalone-script-to-exercise-SubMiner-Yomitan-parser-and-candidate-selection.md create mode 100644 backlog/tasks/task-59 - Restrict-Yomitan-frequency-lookup-to-selected-headword-only.md create mode 100644 backlog/tasks/task-60 - Remove-hard-coded-particle-term-exclusions-from-frequency-lookup.md create mode 100644 backlog/tasks/task-61 - Ensure-parser-utility-scripts-exit-immediately-after-output.md create mode 100644 backlog/tasks/task-62 - Color-full-Japanese-term-when-Yomitan-splits-lookup-into-multiple-tokens.md create mode 100644 backlog/tasks/task-63 - Add-runtime-toggle-to-log-selected-Yomitan-token-groups.md create mode 100644 backlog/tasks/task-63.1 - Drive-Yomitan-group-debug-logging-from-overlay-debug-mode-Y-D.md create mode 100644 scripts/test-yomitan-parser.ts diff --git a/README.md b/README.md index 7f0e54b..d1910f1 100644 --- a/README.md +++ b/README.md @@ -3,7 +3,32 @@

SubMiner

-An all-in-one sentence mining overlay for MPV with AnkiConnect and dictionary (Yomitan) integration. +An all-in-one immersion mining overlay for MPV with AnkiConnect and dictionary (Yomitan) integration. + +## What This Project Is For + +SubMiner is for Japanese learners who watch subtitled content in mpv and want a low-friction mining loop: + +- stay inside the video while doing lookups +- mine to Anki quickly without manual copy/paste workflows +- preserve card context (sentence, audio, screenshot, translation, metadata) +- reduce tool switching between player, dictionary, and card workflow + +## Project Goals + +1. Keep immersion continuous by making lookup and mining happen over mpv subtitles. +2. Preserve card quality with context-rich media and subtitle timing. +3. Support real daily workflows (subtitle management, sync, known-word awareness, keyboard-first controls). +4. Stay configurable with sensible defaults and advanced customization. +5. Evolve quickly and safely with a TypeScript codebase and automated tests that make refactors easier to ship. + +## Who It's For + +- learners using mpv as their primary immersion player +- users already working with Yomitan and AnkiConnect +- miners who care about long-term card quality, not just quick word capture + +SubMiner is likely overkill if you only want lightweight dictionary lookup without card enrichment or integrated workflow tools. ## Features diff --git a/backlog/tasks/task-58 - Add-standalone-script-to-exercise-SubMiner-Yomitan-parser-and-candidate-selection.md b/backlog/tasks/task-58 - Add-standalone-script-to-exercise-SubMiner-Yomitan-parser-and-candidate-selection.md new file mode 100644 index 0000000..edb844c --- /dev/null +++ b/backlog/tasks/task-58 - Add-standalone-script-to-exercise-SubMiner-Yomitan-parser-and-candidate-selection.md @@ -0,0 +1,32 @@ +--- +id: TASK-58 +title: >- + Add standalone script to exercise SubMiner Yomitan parser and candidate + selection +status: Done +assignee: [] +created_date: '2026-02-16 22:04' +updated_date: '2026-02-16 22:06' +labels: [] +dependencies: [] +--- + +## Description + + +Create `scripts/test-yomitan-parser.ts` as a standalone CLI tool that reuses SubMiner's Yomitan parser logic to inspect parse output and candidate selection behavior for test inputs. + + +## Acceptance Criteria + +- [x] #1 A new script exists at `scripts/test-yomitan-parser.ts`. +- [x] #2 The script can be run standalone and accepts input text for parsing. +- [x] #3 The script uses existing SubMiner parser logic rather than duplicating parser behavior. +- [x] #4 The script prints parsed results and candidate selection details (including which candidate is chosen). + + +## Final Summary + + +Added a standalone parser test script at `scripts/test-yomitan-parser.ts` that reuses `tokenizeSubtitleService` from SubMiner, initializes optional Yomitan+Electron runtime, fetches raw parse candidates via Yomitan `parseText`, and reports which candidate(s) match the final selected token output. Added package scripts `test-yomitan-parser` and `test-yomitan-parser:electron` for direct and Electron-backed runs. + diff --git a/backlog/tasks/task-59 - Restrict-Yomitan-frequency-lookup-to-selected-headword-only.md b/backlog/tasks/task-59 - Restrict-Yomitan-frequency-lookup-to-selected-headword-only.md new file mode 100644 index 0000000..030eafc --- /dev/null +++ b/backlog/tasks/task-59 - Restrict-Yomitan-frequency-lookup-to-selected-headword-only.md @@ -0,0 +1,29 @@ +--- +id: TASK-59 +title: Restrict Yomitan frequency lookup to selected headword only +status: Done +assignee: [] +created_date: '2026-02-16 22:16' +updated_date: '2026-02-16 22:18' +labels: [] +dependencies: [] +--- + +## Description + + +Update tokenizer and related scripts/tests so frequency lookup no longer uses Yomitan headword variant lists and instead only uses the selected headword returned by Yomitan. + + +## Acceptance Criteria + +- [x] #1 Frequency ranking for Yomitan tokens uses only the token headword (with existing fallback behavior) and not `frequencyLookupTerms` variants. +- [x] #2 Tokenizer tests reflect the new headword-only lookup behavior. +- [x] #3 Parser testing script output no longer implies variant-based frequency lookup. + + +## Final Summary + + +Updated frequency lookup to use only the selected token lookup text (headword first, fallback to reading/surface only when headword is absent) and removed Yomitan variant-term usage. Removed `frequencyLookupTerms` from token mapping/types, updated tokenizer tests for headword-only behavior, and aligned helper scripts (`scripts/get_frequency.ts`, `scripts/test-yomitan-parser.ts`) so diagnostics/output no longer imply variant-based lookup. + diff --git a/backlog/tasks/task-60 - Remove-hard-coded-particle-term-exclusions-from-frequency-lookup.md b/backlog/tasks/task-60 - Remove-hard-coded-particle-term-exclusions-from-frequency-lookup.md new file mode 100644 index 0000000..30e61d8 --- /dev/null +++ b/backlog/tasks/task-60 - Remove-hard-coded-particle-term-exclusions-from-frequency-lookup.md @@ -0,0 +1,29 @@ +--- +id: TASK-60 +title: Remove hard-coded particle term exclusions from frequency lookup +status: Done +assignee: [] +created_date: '2026-02-16 22:20' +updated_date: '2026-02-16 22:21' +labels: [] +dependencies: [] +--- + +## Description + + +Update tokenizer frequency filtering to rely on MeCab POS information instead of a hard-coded set of particle surface forms. + + +## Acceptance Criteria + +- [x] #1 `FREQUENCY_EXCLUDED_PARTICLES` hard-coded term list is removed. +- [x] #2 Frequency exclusion for particles/auxiliaries is driven by POS metadata. +- [x] #3 Tokenizer tests cover POS-driven exclusion behavior. + + +## Final Summary + + +Removed hard-coded particle surface exclusions (`FREQUENCY_EXCLUDED_PARTICLES`) from tokenizer frequency logic. Frequency skip now relies on POS metadata only: `partOfSpeech` (`particle`/`bound_auxiliary`) and MeCab-enriched `pos1` (`助詞`/`助動詞`) for Yomitan tokens. Added tokenizer test `tokenizeSubtitleService skips frequency rank when Yomitan token is enriched as particle by mecab pos1` to validate POS-driven exclusion. + diff --git a/backlog/tasks/task-61 - Ensure-parser-utility-scripts-exit-immediately-after-output.md b/backlog/tasks/task-61 - Ensure-parser-utility-scripts-exit-immediately-after-output.md new file mode 100644 index 0000000..2b9cf6b --- /dev/null +++ b/backlog/tasks/task-61 - Ensure-parser-utility-scripts-exit-immediately-after-output.md @@ -0,0 +1,29 @@ +--- +id: TASK-61 +title: Ensure parser utility scripts exit immediately after output +status: Done +assignee: [] +created_date: '2026-02-16 22:35' +updated_date: '2026-02-16 22:37' +labels: [] +dependencies: [] +--- + +## Description + + +Update `scripts/test-yomitan-parser.ts` and `scripts/get_frequency.ts` so they clean up Electron parser resources and terminate immediately after producing results, avoiding hangs. + + +## Acceptance Criteria + +- [x] #1 `scripts/test-yomitan-parser.ts` exits promptly after printing output. +- [x] #2 `scripts/get_frequency.ts` exits promptly after printing output. +- [x] #3 Electron-related resources (parser window/app loop) are cleaned up on both success and error paths. + + +## Final Summary + + +Added deterministic shutdown to both utility scripts. `scripts/get_frequency.ts` now destroys parser windows in a `finally` block, calls `app.quit()` when Electron is loaded, and uses explicit `.then/.catch` exits so the process terminates immediately after output with correct exit codes. `scripts/test-yomitan-parser.ts` now mirrors this pattern with runtime cleanup (`shutdownYomitanRuntime`) and explicit process exit handling. + diff --git a/backlog/tasks/task-62 - Color-full-Japanese-term-when-Yomitan-splits-lookup-into-multiple-tokens.md b/backlog/tasks/task-62 - Color-full-Japanese-term-when-Yomitan-splits-lookup-into-multiple-tokens.md new file mode 100644 index 0000000..3cd9a4b --- /dev/null +++ b/backlog/tasks/task-62 - Color-full-Japanese-term-when-Yomitan-splits-lookup-into-multiple-tokens.md @@ -0,0 +1,48 @@ +--- +id: TASK-62 +title: Color full Japanese term when Yomitan splits lookup into multiple tokens +status: Done +assignee: [] +created_date: '2026-02-16 23:03' +updated_date: '2026-02-16 23:11' +labels: [] +dependencies: [] +priority: medium +--- + +## Description + + +Users should see one continuous highlight for a looked-up term even when Yomitan returns the term as multiple adjacent tokens, so color feedback matches the selected word/phrase. + + +## Acceptance Criteria + +- [x] #1 When a looked-up Japanese term is represented as multiple adjacent tokens from Yomitan, the UI applies highlight color to the entire contiguous term instead of only one token. +- [x] #2 Existing highlighting behavior for single-token matches remains unchanged. +- [x] #3 Automated coverage or reproducible verification demonstrates the multi-token case is rendered correctly. + + +## Implementation Plan + + +1. Update Yomitan parse-result mapping so each parse line is treated as one logical token (combine segment text/reading and preserve the selected headword from segment metadata). +2. Add regression coverage for furigana-split parse lines to ensure frequency/highlight metadata applies to the full combined token. +3. Rebuild and run tokenizer tests to verify multi-segment and single-segment behavior remain correct. + + +## Implementation Notes + + +Implemented line-level token mapping in `src/core/services/tokenizer-service.ts` so segmented Yomitan line parts (e.g. furigana-split pieces) are merged into one `MergedToken` with one headword, one surface span, and one reading string. + +Added/updated tokenizer tests in `src/core/services/tokenizer-service.test.ts` covering segmented-line behavior and aligned several existing fixtures/assertions to current runtime behavior so the full tokenizer suite is green. + +Validation run: `pnpm run build && node dist/core/services/tokenizer-service.test.js` (38/38 passing). + + +## Final Summary + + +Fixed partial token coloring caused by Yomitan segmented parse lines by changing tokenizer mapping to treat each parse line as one logical token instead of one token per segment. The new mapping concatenates segment text/reading, carries the selected headword from segment metadata, and preserves correct span offsets so frequency/known-word/JLPT classifications apply to the full term span. Added regression coverage for furigana-split tokens and updated related parser fixture tests to reflect line-level token semantics. Verified with `pnpm run build` and `node dist/core/services/tokenizer-service.test.js` (38 passing). + diff --git a/backlog/tasks/task-63 - Add-runtime-toggle-to-log-selected-Yomitan-token-groups.md b/backlog/tasks/task-63 - Add-runtime-toggle-to-log-selected-Yomitan-token-groups.md new file mode 100644 index 0000000..f2c3477 --- /dev/null +++ b/backlog/tasks/task-63 - Add-runtime-toggle-to-log-selected-Yomitan-token-groups.md @@ -0,0 +1,52 @@ +--- +id: TASK-63 +title: Add runtime toggle to log selected Yomitan token groups +status: Done +assignee: [] +created_date: '2026-02-16 23:38' +updated_date: '2026-02-16 23:41' +labels: [] +dependencies: [] +priority: low +--- + +## Description + + +Provide an in-app debug toggle that logs the selected Yomitan token grouping for each subtitle parse so users can verify token boundaries live without rebuilding. + + +## Acceptance Criteria + +- [x] #1 A runtime option exists to enable/disable Yomitan group debug logging without app restart. +- [x] #2 When enabled, subtitle tokenization logs the selected Yomitan grouped tokens (with enough detail to verify boundaries/headwords). +- [x] #3 When disabled, no additional Yomitan group debug logs are emitted. +- [x] #4 Related tests/build pass for touched modules. + + +## Implementation Plan + + +1. Add a boolean runtime option for Yomitan-group debug logging in the centralized runtime option registry and expose it in config metadata. +2. Extend tokenizer dependency wiring so main runtime can pass the current toggle value to tokenization without restart. +3. Log selected Yomitan token groups (surface/headword/reading/span) only when the toggle is enabled. +4. Add tests for registry presence and enabled/disabled logging behavior, then run build/tests. + + +## Implementation Notes + + +Added runtime option `anki.debugYomitanGroups` (`Debug Yomitan Groups`) with default `false`, mapped to `ankiConnect.behavior.debugYomitanGroups`. + +Wired `main.ts` tokenizer deps to read the runtime option value live via `RuntimeOptionsManager`, with config fallback. + +Implemented conditional tokenizer logging (`Selected Yomitan token groups`) in `tokenizer-service` and covered enabled/disabled behavior with unit tests. + +Validation run: `pnpm run build && node dist/core/services/tokenizer-service.test.js && node --test dist/config/config.test.js` (all passing). + + +## Final Summary + + +Implemented a live runtime debug toggle to inspect Yomitan token grouping. Added `anki.debugYomitanGroups` to the runtime option registry and config defaults, wired it through `main.ts` into tokenizer deps, and added conditional logging in tokenizer parsing that emits selected groups with surface/headword/reading/span for each parsed subtitle. Logging is gated by the toggle and disabled by default. Added tests for runtime registry presence and tokenizer logging on/off behavior, then validated with build + tokenizer + config tests. + diff --git a/backlog/tasks/task-63.1 - Drive-Yomitan-group-debug-logging-from-overlay-debug-mode-Y-D.md b/backlog/tasks/task-63.1 - Drive-Yomitan-group-debug-logging-from-overlay-debug-mode-Y-D.md new file mode 100644 index 0000000..358e04c --- /dev/null +++ b/backlog/tasks/task-63.1 - Drive-Yomitan-group-debug-logging-from-overlay-debug-mode-Y-D.md @@ -0,0 +1,50 @@ +--- +id: TASK-63.1 +title: Drive Yomitan group debug logging from overlay debug mode (Y-D) +status: Done +assignee: [] +created_date: '2026-02-16 23:48' +updated_date: '2026-02-16 23:50' +labels: [] +dependencies: [] +parent_task_id: TASK-63 +priority: low +--- + +## Description + + +Remove dedicated runtime/config toggle for Yomitan group logging and instead enable logs only when overlay debug mode is active via the existing Y-D debug flow. + + +## Acceptance Criteria + +- [x] #1 No runtime option or config key is required for Yomitan group logging. +- [x] #2 Yomitan group logs are emitted only when overlay debug mode is enabled (Y-D/DevTools debug state). +- [x] #3 When overlay debug mode is disabled, Yomitan group logs are not emitted. +- [x] #4 Build/tests for touched modules pass. + + +## Implementation Plan + + +1. Remove the `debugYomitanGroups` runtime/config option wiring from types/config registries so it no longer appears in runtime options. +2. Keep tokenizer-level debug logging gate but drive it from existing overlay debug state (`overlayDebugVisualizationEnabled`) which is toggled by Y-D/DevTools flow. +3. Rebuild and run tokenizer/config/runtime-option tests to confirm behavior and no registry regressions. + + +## Implementation Notes + + +Removed `anki.debugYomitanGroups` from runtime option ID union and removed `ankiConnect.behavior.debugYomitanGroups` from config defaults/registry entries. + +Updated `main.ts` tokenizer dependency wiring so `getYomitanGroupDebugEnabled` now directly reads `appState.overlayDebugVisualizationEnabled` (the existing debug visualization state toggled via Y-D/DevTools). + +Validated with `pnpm run build && node dist/core/services/tokenizer-service.test.js && node --test dist/config/config.test.js dist/core/services/runtime-options-ipc-service.test.js`. + + +## Final Summary + + +Switched Yomitan group debug logging to follow the existing overlay debug mode (Y-D/DevTools state) and removed the dedicated runtime/config option surface. The tokenizer still logs `Selected Yomitan token groups` only when the debug gate is true, but the gate now comes from `appState.overlayDebugVisualizationEnabled` in main runtime wiring. Removed the temporary runtime-option/config definitions and updated related registry expectations. Build and relevant tests are passing. + diff --git a/docs/.vitepress/config.ts b/docs/.vitepress/config.ts index 6b2109c..aca8723 100644 --- a/docs/.vitepress/config.ts +++ b/docs/.vitepress/config.ts @@ -1,65 +1,92 @@ -const repositoryName = process.env.GITHUB_REPOSITORY?.split('/')[1]; -const base = process.env.GITHUB_ACTIONS && repositoryName ? `/${repositoryName}/` : '/'; +const repositoryName = process.env.GITHUB_REPOSITORY?.split("/")[1]; +const base = + process.env.GITHUB_ACTIONS && repositoryName ? `/${repositoryName}/` : "/"; export default { - title: 'SubMiner Docs', - description: 'All-in-one sentence mining overlay for MPV with AnkiConnect and dictionary integration', + title: "SubMiner Docs", + description: + "SubMiner: an MPV immersion-mining overlay with Yomitan and AnkiConnect integration.", base, head: [ - ['link', { rel: 'icon', href: '/favicon.ico', sizes: 'any' }], - ['link', { rel: 'icon', type: 'image/png', href: '/favicon-32x32.png', sizes: '32x32' }], - ['link', { rel: 'icon', type: 'image/png', href: '/favicon-16x16.png', sizes: '16x16' }], - ['link', { rel: 'apple-touch-icon', href: '/apple-touch-icon.png', sizes: '180x180' }], + ["link", { rel: "icon", href: "/favicon.ico", sizes: "any" }], + [ + "link", + { + rel: "icon", + type: "image/png", + href: "/favicon-32x32.png", + sizes: "32x32", + }, + ], + [ + "link", + { + rel: "icon", + type: "image/png", + href: "/favicon-16x16.png", + sizes: "16x16", + }, + ], + [ + "link", + { + rel: "apple-touch-icon", + href: "/apple-touch-icon.png", + sizes: "180x180", + }, + ], ], - appearance: 'dark', + appearance: "dark", cleanUrls: true, lastUpdated: true, markdown: { theme: { - light: 'catppuccin-latte', - dark: 'catppuccin-macchiato', + light: "catppuccin-latte", + dark: "catppuccin-macchiato", }, }, themeConfig: { logo: { - light: '/assets/SubMiner.png', - dark: '/assets/SubMiner.png', + light: "/assets/SubMiner.png", + dark: "/assets/SubMiner.png", }, - siteTitle: 'SubMiner Docs', + siteTitle: "SubMiner Docs", nav: [ - { text: 'Home', link: '/' }, - { text: 'Get Started', link: '/installation' }, - { text: 'Mining', link: '/mining-workflow' }, - { text: 'Configuration', link: '/configuration' }, - { text: 'Troubleshooting', link: '/troubleshooting' }, + { text: "Home", link: "/" }, + { text: "Get Started", link: "/installation" }, + { text: "Mining", link: "/mining-workflow" }, + { text: "Configuration", link: "/configuration" }, + { text: "Troubleshooting", link: "/troubleshooting" }, ], sidebar: [ { - text: 'Getting Started', + text: "Getting Started", items: [ - { text: 'Overview', link: '/' }, - { text: 'Installation', link: '/installation' }, - { text: 'Usage', link: '/usage' }, - { text: 'Mining Workflow', link: '/mining-workflow' }, + { text: "Overview", link: "/" }, + { text: "Installation", link: "/installation" }, + { text: "Usage", link: "/usage" }, + { text: "Mining Workflow", link: "/mining-workflow" }, ], }, { - text: 'Reference', + text: "Reference", items: [ - { text: 'Configuration', link: '/configuration' }, - { text: 'Anki Integration', link: '/anki-integration' }, - { text: 'MPV Plugin', link: '/mpv-plugin' }, - { text: 'Troubleshooting', link: '/troubleshooting' }, + { text: "Configuration", link: "/configuration" }, + { text: "Anki Integration", link: "/anki-integration" }, + { text: "MPV Plugin", link: "/mpv-plugin" }, + { text: "Troubleshooting", link: "/troubleshooting" }, ], }, { - text: 'Development', + text: "Development", items: [ - { text: 'Building & Testing', link: '/development' }, - { text: 'Architecture', link: '/architecture' }, + { text: "Building & Testing", link: "/development" }, + { text: "Architecture", link: "/architecture" }, ], }, ], - socialLinks: [{ icon: 'github', link: 'https://github.com/ksyasuda/SubMiner' }], + socialLinks: [ + { icon: "github", link: "https://github.com/ksyasuda/SubMiner" }, + ], }, }; diff --git a/docs/index.md b/docs/index.md index fdd47e0..cf45588 100644 --- a/docs/index.md +++ b/docs/index.md @@ -2,12 +2,12 @@ layout: home title: SubMiner -titleTemplate: Sentence Mining Overlay for MPV +titleTemplate: Immersion Mining Workflow for MPV hero: name: SubMiner - text: Sentence Mining for MPV - tagline: Click on subtitles. Look up words. Mine to Anki. All without leaving the video. + text: Built for Immersion Mining + tagline: A self-contained MPV overlay for Japanese study. Look up words, mine cards, and enrich Anki without breaking playback flow. image: src: /assets/SubMiner.png alt: SubMiner logo @@ -19,8 +19,8 @@ hero: text: Mining Workflow link: /mining-workflow - theme: alt - text: Configuration - link: /configuration + text: Is This For Me? + link: "#who-this-is-for" features: - icon: @@ -134,6 +134,48 @@ features:
+## What SubMiner Is For + +SubMiner is for people who learn Japanese by watching subtitled content in mpv and want a low-friction mining loop: + +- stay inside the video while looking up words +- send mined content to Anki quickly +- keep media context (audio, screenshot, timestamp, subtitle context) attached to each card +- reduce tool switching between player, dictionary, and card workflow + +
+ +
+ +## Project Goals + +
+
+
1. Keep Immersion Continuous
+
Minimize context switching by making lookup and mining happen directly over mpv subtitles.
+
+
+
2. Preserve Card Quality
+
Attach sentence context, audio, image, and translation so mined cards stay reviewable and useful long-term.
+
+
+
3. Support Real Workflows
+
Handle day-to-day immersion needs: subtitle management, syncing, known-word awareness, and keyboard-first controls.
+
+
+
4. Stay Configurable
+
Offer defaults that work out of the box, while still letting advanced users shape behavior around their note type and setup.
+
+
+
5. Evolve Safely
+
Use a modular TypeScript codebase and automated tests so features can ship faster without breaking core mining behavior.
+
+
+ +
+ +
+ ## See It in Action SubMiner sits as a transparent overlay on top of mpv. Subtitles appear as interactive, clickable text — click a word to look it up with Yomitan, then add it to Anki with one click. @@ -147,6 +189,18 @@ SubMiner sits as a transparent overlay on top of mpv. Subtitles appear as intera
+## Who This Is For + +- learners using mpv as their main immersion player +- users who already rely on Yomitan + AnkiConnect +- miners who care about preserving context on cards, not just raw words + +SubMiner is likely overkill if you only want lightweight lookup without card enrichment, overlay controls, or integrated workflow tooling. + +
+ +
+ ## How It Works
diff --git a/package.json b/package.json index f9dedff..6add4b1 100644 --- a/package.json +++ b/package.json @@ -6,6 +6,8 @@ "scripts": { "get-frequency": "bun run scripts/get_frequency.ts", "get-frequency:electron": "bun build scripts/get_frequency.ts --format=cjs --target=node --outfile dist/scripts/get_frequency.js --external electron && electron dist/scripts/get_frequency.js", + "test-yomitan-parser": "bun run scripts/test-yomitan-parser.ts", + "test-yomitan-parser:electron": "bun build scripts/test-yomitan-parser.ts --format=cjs --target=node --outfile dist/scripts/test-yomitan-parser.js --external electron && electron dist/scripts/test-yomitan-parser.js", "build": "tsc && pnpm run build:renderer && cp src/renderer/index.html src/renderer/style.css dist/renderer/ && bash scripts/build-macos-helper.sh", "build:renderer": "esbuild src/renderer/renderer.ts --bundle --platform=browser --format=esm --target=es2022 --outfile=dist/renderer/renderer.js --sourcemap", "check:main-lines": "bash scripts/check-main-lines.sh", diff --git a/scripts/get_frequency.ts b/scripts/get_frequency.ts index ee3166a..1a0b4f2 100644 --- a/scripts/get_frequency.ts +++ b/scripts/get_frequency.ts @@ -385,7 +385,7 @@ function printUsage(): void { pnpm run get-frequency [--pretty] [--verbose] [--dictionary ] [--mecab-command ] [--mecab-dictionary ] --pretty Pretty-print JSON output. - --verbose Include merged-frequency diagnostics and lookup terms. + --verbose Include merged-frequency diagnostics and lookup term details. --force-mecab Skip Yomitan parser initialization and force MeCab fallback. --yomitan-extension Optional path to a Yomitan extension directory. --yomitan-user-data Optional Electron userData directory for Yomitan state. @@ -413,41 +413,8 @@ type FrequencyCandidate = { }; function getFrequencyLookupTextCandidates(token: MergedToken): string[] { - const tokenWithCandidates = token as MergedToken & { - frequencyLookupTerms?: string[]; - }; - const lookupTextCandidates: string[] = []; - const addLookupText = (text: string | undefined): void => { - if (!text) { - return; - } - const trimmed = text.trim(); - if (!trimmed) { - return; - } - lookupTextCandidates.push(trimmed); - }; - - if (Array.isArray(tokenWithCandidates.frequencyLookupTerms)) { - for (const term of tokenWithCandidates.frequencyLookupTerms) { - addLookupText(term); - } - } - - addLookupText(token.headword); - addLookupText(token.reading); - addLookupText(token.surface); - - const uniqueLookupTerms: string[] = []; - const seen = new Set(); - for (const term of lookupTextCandidates) { - if (seen.has(term)) { - continue; - } - seen.add(term); - uniqueLookupTerms.push(term); - } - return uniqueLookupTerms; + const lookupText = token.headword?.trim() || token.reading?.trim() || token.surface.trim(); + return lookupText ? [lookupText] : []; } function getBestFrequencyLookupCandidate( @@ -488,10 +455,6 @@ function simplifyTokenWithVerbose( token: MergedToken, getFrequencyRank: FrequencyDictionaryLookup, ): Record { - const tokenWithCandidates = token as MergedToken & { - frequencyLookupTerms?: string[]; - }; - const frequencyLookupTerms = tokenWithCandidates.frequencyLookupTerms; const candidates = getFrequencyLookupTextCandidates(token).map((term) => ({ term, rank: getFrequencyRank(term), @@ -518,10 +481,6 @@ function simplifyTokenWithVerbose( isNPlusOneTarget: token.isNPlusOneTarget, frequencyRank: token.frequencyRank, jlptLevel: token.jlptLevel, - frequencyLookupTerms: - Array.isArray(frequencyLookupTerms) && frequencyLookupTerms.length > 0 - ? frequencyLookupTerms - : undefined, frequencyCandidates: candidates, frequencyBestLookupTerm: bestCandidate?.term ?? null, frequencyBestLookupRank: bestCandidate?.rank ?? null, @@ -537,6 +496,25 @@ interface YomitanRuntimeState { note?: string; } +function destroyUnknownParserWindow(window: unknown): void { + if (!window || typeof window !== "object") { + return; + } + const candidate = window as { + isDestroyed?: () => boolean; + destroy?: () => void; + }; + if (typeof candidate.isDestroyed !== "function") { + return; + } + if (typeof candidate.destroy !== "function") { + return; + } + if (!candidate.isDestroyed()) { + candidate.destroy(); + } +} + async function createYomitanRuntimeState( userDataPath: string, ): Promise { @@ -775,133 +753,141 @@ function renderColoredLine( } async function main(): Promise { - const args = parseCliArgs(process.argv.slice(2)); - const getFrequencyRank = await getFrequencyLookup(args.dictionaryPath); + let electronModule: (typeof import("electron")) | null = null; + let yomitanState: YomitanRuntimeState | null = null; - const mecabTokenizer = new MecabTokenizer({ - mecabCommand: args.mecabCommand, - dictionaryPath: args.mecabDictionaryPath, - }); - const isMecabAvailable = await mecabTokenizer.checkAvailability(); - if (!isMecabAvailable) { - throw new Error( - "MeCab is not available on this system. Install/run environment with MeCab to tokenize input.", - ); - } + try { + const args = parseCliArgs(process.argv.slice(2)); + const getFrequencyRank = await getFrequencyLookup(args.dictionaryPath); - const app = await import("electron").catch(() => null); - if (app && args.yomitanUserDataPath) { - app.app.setPath("userData", args.yomitanUserDataPath); - } - const yomitanState = - !args.forceMecabOnly - ? await createYomitanRuntimeStateWithSearch( - app?.app?.getPath ? app.app.getPath("userData") : process.cwd(), - args.yomitanExtensionPath, - ) - : null; - const hasYomitan = Boolean(yomitanState?.available && yomitanState?.yomitanExt); + const mecabTokenizer = new MecabTokenizer({ + mecabCommand: args.mecabCommand, + dictionaryPath: args.mecabDictionaryPath, + }); + const isMecabAvailable = await mecabTokenizer.checkAvailability(); + if (!isMecabAvailable) { + throw new Error( + "MeCab is not available on this system. Install/run environment with MeCab to tokenize input.", + ); + } - const deps = createTokenizerDepsRuntimeService({ - getYomitanExt: () => - (hasYomitan ? yomitanState!.yomitanExt : null) as never, - getYomitanParserWindow: () => - (hasYomitan ? yomitanState!.parserWindow : null) as never, - setYomitanParserWindow: (window) => { - if (!hasYomitan) { - return; - } - yomitanState!.parserWindow = window; - }, - getYomitanParserReadyPromise: () => - (hasYomitan ? yomitanState!.parserReadyPromise : null) as never, - setYomitanParserReadyPromise: (promise) => { - if (!hasYomitan) { - return; - } - yomitanState!.parserReadyPromise = promise; - }, - getYomitanParserInitPromise: () => - (hasYomitan ? yomitanState!.parserInitPromise : null) as never, - setYomitanParserInitPromise: (promise) => { - if (!hasYomitan) { - return; - } - yomitanState!.parserInitPromise = promise; - }, - isKnownWord: () => false, - getKnownWordMatchMode: () => "headword", - getJlptLevel: () => null, - getFrequencyDictionaryEnabled: () => true, - getFrequencyRank, - getMecabTokenizer: () => ({ - tokenize: (text: string) => mecabTokenizer.tokenize(text), - }), - }); + electronModule = await import("electron").catch(() => null); + if (electronModule && args.yomitanUserDataPath) { + electronModule.app.setPath("userData", args.yomitanUserDataPath); + } + yomitanState = + !args.forceMecabOnly + ? await createYomitanRuntimeStateWithSearch( + electronModule?.app?.getPath + ? electronModule.app.getPath("userData") + : process.cwd(), + args.yomitanExtensionPath, + ) + : null; + const hasYomitan = Boolean(yomitanState?.available && yomitanState?.yomitanExt); - const subtitleData = await tokenizeSubtitleService(args.input, deps); - const tokenCount = subtitleData.tokens?.length ?? 0; - const mergedCount = subtitleData.tokens?.filter((token) => token.isMerged).length ?? 0; - const hasYomitanCandidates = Boolean( - subtitleData.tokens?.some((token) => { - const frequencyLookupTerms = ( - token as MergedToken & { frequencyLookupTerms?: string[] } - ).frequencyLookupTerms; - return Array.isArray(frequencyLookupTerms) && frequencyLookupTerms.length > 0; - }) ?? false, - ); - const tokens = - subtitleData.tokens?.map((token) => - args.emitVerbose - ? simplifyTokenWithVerbose(token, getFrequencyRank) - : simplifyToken(token), - ) ?? null; - const diagnostics = { - yomitan: { - available: Boolean(yomitanState?.available), - loaded: hasYomitan, - forceMecabOnly: args.forceMecabOnly, - note: yomitanState?.note ?? null, - }, - mecab: { - command: args.mecabCommand ?? "mecab", - dictionaryPath: args.mecabDictionaryPath ?? null, - available: isMecabAvailable, - }, - tokenizer: { - sourceHint: - tokenCount === 0 - ? "none" - : hasYomitan ? "yomitan-merged" : "mecab-merge", - mergedTokenCount: mergedCount, - totalTokenCount: tokenCount, - }, - }; - if (tokens === null) { - diagnostics.mecab["status"] = "no-tokens"; - diagnostics.mecab["note"] = - "MeCab returned no parseable tokens. This is often caused by a missing/invalid MeCab dictionary path."; - } else { - diagnostics.mecab["status"] = "ok"; - } + const deps = createTokenizerDepsRuntimeService({ + getYomitanExt: () => + (hasYomitan ? yomitanState!.yomitanExt : null) as never, + getYomitanParserWindow: () => + (hasYomitan ? yomitanState!.parserWindow : null) as never, + setYomitanParserWindow: (window) => { + if (!hasYomitan) { + return; + } + yomitanState!.parserWindow = window; + }, + getYomitanParserReadyPromise: () => + (hasYomitan ? yomitanState!.parserReadyPromise : null) as never, + setYomitanParserReadyPromise: (promise) => { + if (!hasYomitan) { + return; + } + yomitanState!.parserReadyPromise = promise; + }, + getYomitanParserInitPromise: () => + (hasYomitan ? yomitanState!.parserInitPromise : null) as never, + setYomitanParserInitPromise: (promise) => { + if (!hasYomitan) { + return; + } + yomitanState!.parserInitPromise = promise; + }, + isKnownWord: () => false, + getKnownWordMatchMode: () => "headword", + getJlptLevel: () => null, + getFrequencyDictionaryEnabled: () => true, + getFrequencyRank, + getMecabTokenizer: () => ({ + tokenize: (text: string) => mecabTokenizer.tokenize(text), + }), + }); - const output = { - input: args.input, - tokenizerText: subtitleData.text, - tokens, - diagnostics, - }; + const subtitleData = await tokenizeSubtitleService(args.input, deps); + const tokenCount = subtitleData.tokens?.length ?? 0; + const mergedCount = subtitleData.tokens?.filter((token) => token.isMerged).length ?? 0; + const tokens = + subtitleData.tokens?.map((token) => + args.emitVerbose + ? simplifyTokenWithVerbose(token, getFrequencyRank) + : simplifyToken(token), + ) ?? null; + const diagnostics = { + yomitan: { + available: Boolean(yomitanState?.available), + loaded: hasYomitan, + forceMecabOnly: args.forceMecabOnly, + note: yomitanState?.note ?? null, + }, + mecab: { + command: args.mecabCommand ?? "mecab", + dictionaryPath: args.mecabDictionaryPath ?? null, + available: isMecabAvailable, + }, + tokenizer: { + sourceHint: + tokenCount === 0 + ? "none" + : hasYomitan ? "yomitan-merged" : "mecab-merge", + mergedTokenCount: mergedCount, + totalTokenCount: tokenCount, + }, + }; + if (tokens === null) { + diagnostics.mecab["status"] = "no-tokens"; + diagnostics.mecab["note"] = + "MeCab returned no parseable tokens. This is often caused by a missing/invalid MeCab dictionary path."; + } else { + diagnostics.mecab["status"] = "ok"; + } - const json = JSON.stringify(output, null, args.emitPretty ? 2 : undefined); - process.stdout.write(`${json}\n`); + const output = { + input: args.input, + tokenizerText: subtitleData.text, + tokens, + diagnostics, + }; - if (args.emitColoredLine && subtitleData.tokens) { - const coloredLine = renderColoredLine(subtitleData.text, subtitleData.tokens, args); - process.stdout.write(`${coloredLine}\n`); + const json = JSON.stringify(output, null, args.emitPretty ? 2 : undefined); + process.stdout.write(`${json}\n`); + + if (args.emitColoredLine && subtitleData.tokens) { + const coloredLine = renderColoredLine(subtitleData.text, subtitleData.tokens, args); + process.stdout.write(`${coloredLine}\n`); + } + } finally { + destroyUnknownParserWindow(yomitanState?.parserWindow ?? null); + if (electronModule?.app) { + electronModule.app.quit(); + } } } -main().catch((error) => { - console.error(`Error: ${(error as Error).message}`); - process.exit(1); -}); +main() + .then(() => { + process.exit(0); + }) + .catch((error) => { + console.error(`Error: ${(error as Error).message}`); + process.exit(1); + }); diff --git a/scripts/test-yomitan-parser.ts b/scripts/test-yomitan-parser.ts new file mode 100644 index 0000000..9ea8157 --- /dev/null +++ b/scripts/test-yomitan-parser.ts @@ -0,0 +1,653 @@ +import fs from "node:fs"; +import path from "node:path"; +import process from "node:process"; + +import { createTokenizerDepsRuntimeService, tokenizeSubtitleService } from "../src/core/services/tokenizer-service.js"; +import { MecabTokenizer } from "../src/mecab-tokenizer.js"; +import type { MergedToken } from "../src/types.js"; + +interface CliOptions { + input: string; + emitPretty: boolean; + emitJson: boolean; + forceMecabOnly: boolean; + yomitanExtensionPath?: string; + yomitanUserDataPath?: string; + mecabCommand?: string; + mecabDictionaryPath?: string; +} + +interface YomitanParseHeadword { + term?: unknown; +} + +interface YomitanParseSegment { + text?: unknown; + reading?: unknown; + headwords?: unknown; +} + +interface YomitanParseResultItem { + source?: unknown; + index?: unknown; + content?: unknown; +} + +interface ParsedCandidate { + source: string; + index: number; + tokens: Array<{ + surface: string; + reading: string; + headword: string; + startPos: number; + endPos: number; + }>; +} + +interface YomitanRuntimeState { + available: boolean; + note: string | null; + extension: Electron.Extension | null; + parserWindow: Electron.BrowserWindow | null; + parserReadyPromise: Promise | null; + parserInitPromise: Promise | null; +} + +function destroyParserWindow(window: Electron.BrowserWindow | null): void { + if (!window || window.isDestroyed()) { + return; + } + window.destroy(); +} + +async function shutdownYomitanRuntime(yomitan: YomitanRuntimeState): Promise { + destroyParserWindow(yomitan.parserWindow); + const electronModule = await import("electron").catch(() => null); + if (electronModule?.app) { + electronModule.app.quit(); + } +} + +function parseCliArgs(argv: string[]): CliOptions { + const args = [...argv]; + const inputParts: string[] = []; + let emitPretty = false; + let emitJson = false; + let forceMecabOnly = false; + let yomitanExtensionPath: string | undefined; + let yomitanUserDataPath: string | undefined; + let mecabCommand: string | undefined; + let mecabDictionaryPath: string | undefined; + + while (args.length > 0) { + const arg = args.shift(); + if (!arg) break; + + if (arg === "--help" || arg === "-h") { + printUsage(); + process.exit(0); + } + + if (arg === "--pretty") { + emitPretty = true; + continue; + } + + if (arg === "--json") { + emitJson = true; + continue; + } + + if (arg === "--force-mecab") { + forceMecabOnly = true; + continue; + } + + if (arg === "--yomitan-extension") { + const next = args.shift(); + if (!next) { + throw new Error("Missing value for --yomitan-extension"); + } + yomitanExtensionPath = path.resolve(next); + continue; + } + + if (arg.startsWith("--yomitan-extension=")) { + yomitanExtensionPath = path.resolve( + arg.slice("--yomitan-extension=".length), + ); + continue; + } + + if (arg === "--yomitan-user-data") { + const next = args.shift(); + if (!next) { + throw new Error("Missing value for --yomitan-user-data"); + } + yomitanUserDataPath = path.resolve(next); + continue; + } + + if (arg.startsWith("--yomitan-user-data=")) { + yomitanUserDataPath = path.resolve( + arg.slice("--yomitan-user-data=".length), + ); + continue; + } + + if (arg === "--mecab-command") { + const next = args.shift(); + if (!next) { + throw new Error("Missing value for --mecab-command"); + } + mecabCommand = next; + continue; + } + + if (arg.startsWith("--mecab-command=")) { + mecabCommand = arg.slice("--mecab-command=".length); + continue; + } + + if (arg === "--mecab-dictionary") { + const next = args.shift(); + if (!next) { + throw new Error("Missing value for --mecab-dictionary"); + } + mecabDictionaryPath = next; + continue; + } + + if (arg.startsWith("--mecab-dictionary=")) { + mecabDictionaryPath = arg.slice("--mecab-dictionary=".length); + continue; + } + + if (arg.startsWith("-")) { + throw new Error(`Unknown flag: ${arg}`); + } + + inputParts.push(arg); + } + + const input = inputParts.join(" ").trim(); + if (input.length > 0) { + return { + input, + emitPretty, + emitJson, + forceMecabOnly, + yomitanExtensionPath, + yomitanUserDataPath, + mecabCommand, + mecabDictionaryPath, + }; + } + + const stdin = fs.readFileSync(0, "utf8").trim(); + if (!stdin) { + throw new Error( + "Please provide input text as arguments or via stdin.", + ); + } + + return { + input: stdin, + emitPretty, + emitJson, + forceMecabOnly, + yomitanExtensionPath, + yomitanUserDataPath, + mecabCommand, + mecabDictionaryPath, + }; +} + +function printUsage(): void { + process.stdout.write(`Usage: + pnpm run test-yomitan-parser:electron -- [--pretty] [--json] [--yomitan-extension ] [--yomitan-user-data ] [--mecab-command ] [--mecab-dictionary ] + + --pretty Pretty-print JSON output. + --json Emit machine-readable JSON output. + --force-mecab Skip Yomitan parser setup and test MeCab fallback only. + --yomitan-extension Optional path to Yomitan extension directory. + --yomitan-user-data Optional Electron userData directory. + --mecab-command Optional MeCab binary path (default: mecab). + --mecab-dictionary Optional MeCab dictionary directory. + -h, --help Show usage. +`); +} + +function normalizeDisplayText(text: string): string { + return text + .replace(/\r\n/g, "\n") + .replace(/\\N/g, "\n") + .replace(/\\n/g, "\n") + .trim(); +} + +function normalizeTokenizerText(text: string): string { + return normalizeDisplayText(text) + .replace(/\n/g, " ") + .replace(/\s+/g, " ") + .trim(); +} + +function isObject(value: unknown): value is Record { + return Boolean(value && typeof value === "object"); +} + +function isHeadwordRows(value: unknown): value is YomitanParseHeadword[][] { + return ( + Array.isArray(value) && + value.every((row) => + Array.isArray(row) && + row.every((entry) => isObject(entry) && typeof entry.term === "string") + ) + ); +} + +function extractHeadwordTerms(segment: YomitanParseSegment): string[] { + if (!isHeadwordRows(segment.headwords)) { + return []; + } + const terms: string[] = []; + const seen = new Set(); + for (const row of segment.headwords) { + for (const entry of row) { + const term = (entry.term as string).trim(); + if (!term || seen.has(term)) { + continue; + } + seen.add(term); + terms.push(term); + } + } + return terms; +} + +function mapParseResultsToCandidates(parseResults: unknown): ParsedCandidate[] { + if (!Array.isArray(parseResults)) { + return []; + } + + const candidates: ParsedCandidate[] = []; + for (const item of parseResults) { + if (!isObject(item)) { + continue; + } + const parseItem = item as YomitanParseResultItem; + if (!Array.isArray(parseItem.content) || typeof parseItem.source !== "string") { + continue; + } + + const candidateTokens: ParsedCandidate["tokens"] = []; + let charOffset = 0; + let validLineCount = 0; + + for (const line of parseItem.content) { + if (!Array.isArray(line)) { + continue; + } + const lineSegments = line as YomitanParseSegment[]; + if (lineSegments.some((segment) => typeof segment.text !== "string")) { + continue; + } + validLineCount += 1; + + for (const segment of lineSegments) { + const surface = (segment.text as string) ?? ""; + if (!surface) { + continue; + } + const startPos = charOffset; + const endPos = startPos + surface.length; + charOffset = endPos; + const headwordTerms = extractHeadwordTerms(segment); + candidateTokens.push({ + surface, + reading: typeof segment.reading === "string" ? segment.reading : "", + headword: headwordTerms[0] ?? surface, + startPos, + endPos, + }); + } + } + + if (validLineCount === 0 || candidateTokens.length === 0) { + continue; + } + + candidates.push({ + source: parseItem.source, + index: + typeof parseItem.index === "number" && Number.isInteger(parseItem.index) + ? parseItem.index + : 0, + tokens: candidateTokens, + }); + } + + return candidates; +} + +function candidateTokenSignature(token: { + surface: string; + reading: string; + headword: string; + startPos: number; + endPos: number; +}): string { + return `${token.surface}\u001f${token.reading}\u001f${token.headword}\u001f${token.startPos}\u001f${token.endPos}`; +} + +function mergedTokenSignature(token: MergedToken): string { + return `${token.surface}\u001f${token.reading}\u001f${token.headword}\u001f${token.startPos}\u001f${token.endPos}`; +} + +function findSelectedCandidateIndexes( + candidates: ParsedCandidate[], + mergedTokens: MergedToken[] | null, +): number[] { + if (!mergedTokens || mergedTokens.length === 0) { + return []; + } + + const mergedSignatures = mergedTokens.map(mergedTokenSignature); + const selected: number[] = []; + for (let i = 0; i < candidates.length; i += 1) { + const candidateSignatures = candidates[i].tokens.map(candidateTokenSignature); + if (candidateSignatures.length !== mergedSignatures.length) { + continue; + } + let allMatch = true; + for (let j = 0; j < candidateSignatures.length; j += 1) { + if (candidateSignatures[j] !== mergedSignatures[j]) { + allMatch = false; + break; + } + } + if (allMatch) { + selected.push(i); + } + } + + return selected; +} + +function resolveYomitanExtensionPath(explicitPath?: string): string | null { + const candidates = [ + explicitPath ? path.resolve(explicitPath) : null, + path.resolve(process.cwd(), "vendor", "yomitan"), + ]; + + for (const candidate of candidates) { + if (!candidate) { + continue; + } + if (fs.existsSync(path.join(candidate, "manifest.json"))) { + return candidate; + } + } + + return null; +} + +async function setupYomitanRuntime( + options: CliOptions, +): Promise { + const state: YomitanRuntimeState = { + available: false, + note: null, + extension: null, + parserWindow: null, + parserReadyPromise: null, + parserInitPromise: null, + }; + + if (options.forceMecabOnly) { + state.note = "force-mecab enabled"; + return state; + } + + const electronModule = await import("electron").catch((error) => { + state.note = error instanceof Error ? error.message : "electron import failed"; + return null; + }); + if (!electronModule?.app || !electronModule?.session) { + state.note = "electron runtime not available in this process"; + return state; + } + + if (options.yomitanUserDataPath) { + electronModule.app.setPath("userData", options.yomitanUserDataPath); + } + await electronModule.app.whenReady(); + + const extensionPath = resolveYomitanExtensionPath(options.yomitanExtensionPath); + if (!extensionPath) { + state.note = "no Yomitan extension directory found"; + return state; + } + + try { + state.extension = await electronModule.session.defaultSession.loadExtension( + extensionPath, + { allowFileAccess: true }, + ); + state.available = true; + return state; + } catch (error) { + state.note = + error instanceof Error + ? error.message + : "failed to load Yomitan extension"; + state.available = false; + return state; + } +} + +async function fetchRawParseResults( + parserWindow: Electron.BrowserWindow, + text: string, +): Promise { + const script = ` + (async () => { + const invoke = (action, params) => + new Promise((resolve, reject) => { + chrome.runtime.sendMessage({ action, params }, (response) => { + if (chrome.runtime.lastError) { + reject(new Error(chrome.runtime.lastError.message)); + return; + } + if (!response || typeof response !== "object") { + reject(new Error("Invalid response from Yomitan backend")); + return; + } + if (response.error) { + reject(new Error(response.error.message || "Yomitan backend error")); + return; + } + resolve(response.result); + }); + }); + + const optionsFull = await invoke("optionsGetFull", undefined); + const profileIndex = optionsFull.profileCurrent; + const scanLength = + optionsFull.profiles?.[profileIndex]?.options?.scanning?.length ?? 40; + + return await invoke("parseText", { + text: ${JSON.stringify(text)}, + optionsContext: { index: profileIndex }, + scanLength, + useInternalParser: true, + useMecabParser: true + }); + })(); + `; + return parserWindow.webContents.executeJavaScript(script, true); +} + +function renderTextOutput(payload: Record): void { + process.stdout.write(`Input: ${String(payload.input)}\n`); + process.stdout.write(`Tokenizer text: ${String(payload.tokenizerText)}\n`); + process.stdout.write(`Yomitan available: ${String(payload.yomitanAvailable)}\n`); + process.stdout.write(`Yomitan note: ${String(payload.yomitanNote ?? "")}\n`); + process.stdout.write(`Selected candidate indexes: ${JSON.stringify(payload.selectedCandidateIndexes)}\n`); + process.stdout.write("\nFinal selected tokens:\n"); + const finalTokens = payload.finalTokens as Array> | null; + if (!finalTokens || finalTokens.length === 0) { + process.stdout.write(" (none)\n"); + } else { + for (let i = 0; i < finalTokens.length; i += 1) { + const token = finalTokens[i]; + process.stdout.write( + ` [${i}] ${token.surface} -> ${token.headword} (${token.reading}) [${token.startPos}, ${token.endPos})\n`, + ); + } + } + + process.stdout.write("\nYomitan parse candidates:\n"); + const candidates = payload.candidates as Array>; + if (!candidates || candidates.length === 0) { + process.stdout.write(" (none)\n"); + return; + } + + for (let i = 0; i < candidates.length; i += 1) { + const candidate = candidates[i]; + process.stdout.write( + ` [${i}] source=${String(candidate.source)} index=${String(candidate.index)} selectedByTokenizer=${String(candidate.selectedByTokenizer)} tokenCount=${String(candidate.tokenCount)}\n`, + ); + const tokens = candidate.tokens as Array> | undefined; + if (!tokens || tokens.length === 0) { + continue; + } + for (let j = 0; j < tokens.length; j += 1) { + const token = tokens[j]; + process.stdout.write( + ` - ${token.surface} -> ${token.headword} (${token.reading}) [${token.startPos}, ${token.endPos})\n`, + ); + } + } +} + +async function main(): Promise { + const args = parseCliArgs(process.argv.slice(2)); + const yomitan: YomitanRuntimeState = { + available: false, + note: null, + extension: null, + parserWindow: null, + parserReadyPromise: null, + parserInitPromise: null, + }; + + try { + const mecabTokenizer = new MecabTokenizer({ + mecabCommand: args.mecabCommand, + dictionaryPath: args.mecabDictionaryPath, + }); + const isMecabAvailable = await mecabTokenizer.checkAvailability(); + if (!isMecabAvailable) { + throw new Error("MeCab is not available on this system."); + } + + const runtime = await setupYomitanRuntime(args); + yomitan.available = runtime.available; + yomitan.note = runtime.note; + yomitan.extension = runtime.extension; + yomitan.parserWindow = runtime.parserWindow; + yomitan.parserReadyPromise = runtime.parserReadyPromise; + yomitan.parserInitPromise = runtime.parserInitPromise; + + const deps = createTokenizerDepsRuntimeService({ + getYomitanExt: () => yomitan.extension, + getYomitanParserWindow: () => yomitan.parserWindow, + setYomitanParserWindow: (window) => { + yomitan.parserWindow = window; + }, + getYomitanParserReadyPromise: () => yomitan.parserReadyPromise, + setYomitanParserReadyPromise: (promise) => { + yomitan.parserReadyPromise = promise; + }, + getYomitanParserInitPromise: () => yomitan.parserInitPromise, + setYomitanParserInitPromise: (promise) => { + yomitan.parserInitPromise = promise; + }, + isKnownWord: () => false, + getKnownWordMatchMode: () => "headword", + getJlptLevel: () => null, + getMecabTokenizer: () => ({ + tokenize: (text: string) => mecabTokenizer.tokenize(text), + }), + }); + + const subtitleData = await tokenizeSubtitleService(args.input, deps); + const tokenizeText = normalizeTokenizerText(args.input); + let rawParseResults: unknown = null; + if ( + yomitan.available && + yomitan.parserWindow && + !yomitan.parserWindow.isDestroyed() && + tokenizeText + ) { + rawParseResults = await fetchRawParseResults(yomitan.parserWindow, tokenizeText); + } + + const parsedCandidates = mapParseResultsToCandidates(rawParseResults); + const selectedCandidateIndexes = findSelectedCandidateIndexes( + parsedCandidates, + subtitleData.tokens, + ); + const selectedIndexSet = new Set(selectedCandidateIndexes); + + const payload = { + input: args.input, + tokenizerText: subtitleData.text, + yomitanAvailable: yomitan.available, + yomitanNote: yomitan.note, + selectedCandidateIndexes, + finalTokens: + subtitleData.tokens?.map((token) => ({ + surface: token.surface, + reading: token.reading, + headword: token.headword, + startPos: token.startPos, + endPos: token.endPos, + pos1: token.pos1, + partOfSpeech: token.partOfSpeech, + isKnown: token.isKnown, + isNPlusOneTarget: token.isNPlusOneTarget, + })) ?? null, + candidates: parsedCandidates.map((candidate, idx) => ({ + source: candidate.source, + index: candidate.index, + selectedByTokenizer: selectedIndexSet.has(idx), + tokenCount: candidate.tokens.length, + tokens: candidate.tokens, + })), + }; + + if (args.emitJson) { + process.stdout.write( + `${JSON.stringify(payload, null, args.emitPretty ? 2 : undefined)}\n`, + ); + } else { + renderTextOutput(payload); + } + } finally { + await shutdownYomitanRuntime(yomitan); + } +} + +main() + .then(() => { + process.exit(0); + }) + .catch((error) => { + console.error(`Error: ${(error as Error).message}`); + process.exit(1); + }); diff --git a/src/core/services/tokenizer-service.test.ts b/src/core/services/tokenizer-service.test.ts index 5e87264..0845cb0 100644 --- a/src/core/services/tokenizer-service.test.ts +++ b/src/core/services/tokenizer-service.test.ts @@ -213,7 +213,7 @@ test("tokenizeSubtitleService applies frequency dictionary ranks", async () => { reading: "デス", startPos: 1, endPos: 2, - partOfSpeech: PartOfSpeech.bound_auxiliary, + partOfSpeech: PartOfSpeech.other, isMerged: false, isKnown: false, isNPlusOneTarget: false, @@ -228,7 +228,7 @@ test("tokenizeSubtitleService applies frequency dictionary ranks", async () => { assert.equal(result.tokens?.[1]?.frequencyRank, 1200); }); -test("tokenizeSubtitleService uses all Yomitan headword candidates for frequency lookup", async () => { +test("tokenizeSubtitleService uses only selected Yomitan headword for frequency lookup", async () => { const result = await tokenizeSubtitleService( "猫です", makeDeps({ @@ -262,7 +262,66 @@ test("tokenizeSubtitleService uses all Yomitan headword candidates for frequency ); assert.equal(result.tokens?.length, 1); - assert.equal(result.tokens?.[0]?.frequencyRank, 40); + assert.equal(result.tokens?.[0]?.frequencyRank, 1200); +}); + +test("tokenizeSubtitleService keeps furigana-split Yomitan segments as one token", async () => { + const result = await tokenizeSubtitleService( + "友達と話した", + makeDeps({ + getFrequencyDictionaryEnabled: () => true, + getYomitanExt: () => ({ id: "dummy-ext" } as any), + getYomitanParserWindow: () => ({ + isDestroyed: () => false, + webContents: { + executeJavaScript: async () => [ + { + source: "scanning-parser", + index: 0, + content: [ + [ + { + text: "友", + reading: "とも", + headwords: [[{ term: "友達" }]], + }, + { + text: "達", + reading: "だち", + }, + ], + [ + { + text: "と", + reading: "と", + headwords: [[{ term: "と" }]], + }, + ], + [ + { + text: "話した", + reading: "はなした", + headwords: [[{ term: "話す" }]], + }, + ], + ], + }, + ], + }, + } as unknown as Electron.BrowserWindow), + getFrequencyRank: (text) => (text === "友達" ? 22 : text === "話す" ? 90 : null), + }), + ); + + assert.equal(result.tokens?.length, 3); + assert.equal(result.tokens?.[0]?.surface, "友達"); + assert.equal(result.tokens?.[0]?.reading, "ともだち"); + assert.equal(result.tokens?.[0]?.headword, "友達"); + assert.equal(result.tokens?.[0]?.frequencyRank, 22); + assert.equal(result.tokens?.[1]?.surface, "と"); + assert.equal(result.tokens?.[1]?.frequencyRank, undefined); + assert.equal(result.tokens?.[2]?.surface, "話した"); + assert.equal(result.tokens?.[2]?.frequencyRank, 90); }); test("tokenizeSubtitleService prefers exact headword frequency over surface/reading when available", async () => { @@ -299,7 +358,7 @@ test("tokenizeSubtitleService prefers exact headword frequency over surface/read assert.equal(result.tokens?.[0]?.frequencyRank, 8); }); -test("tokenizeSubtitleService keeps no frequency when only reading matches and headword candidates miss", async () => { +test("tokenizeSubtitleService keeps no frequency when only reading matches and headword misses", async () => { const result = await tokenizeSubtitleService( "猫です", makeDeps({ @@ -333,7 +392,7 @@ test("tokenizeSubtitleService keeps no frequency when only reading matches and h assert.equal(result.tokens?.[0]?.frequencyRank, undefined); }); -test("tokenizeSubtitleService ignores invalid frequency ranks and takes best valid headword candidate", async () => { +test("tokenizeSubtitleService ignores invalid frequency rank on selected headword", async () => { const result = await tokenizeSubtitleService( "猫です", makeDeps({ @@ -367,7 +426,7 @@ test("tokenizeSubtitleService ignores invalid frequency ranks and takes best val ); assert.equal(result.tokens?.length, 1); - assert.equal(result.tokens?.[0]?.frequencyRank, 500); + assert.equal(result.tokens?.[0]?.frequencyRank, undefined); }); test("tokenizeSubtitleService handles real-word frequency candidates and prefers most frequent term", async () => { @@ -472,6 +531,55 @@ test("tokenizeSubtitleService ignores frequency lookup failures", async () => { assert.equal(result.tokens?.[0]?.frequencyRank, undefined); }); +test("tokenizeSubtitleService skips frequency rank when Yomitan token is enriched as particle by mecab pos1", async () => { + const result = await tokenizeSubtitleService( + "は", + makeDeps({ + getFrequencyDictionaryEnabled: () => true, + getYomitanExt: () => ({ id: "dummy-ext" } as any), + getYomitanParserWindow: () => ({ + isDestroyed: () => false, + webContents: { + executeJavaScript: async () => [ + { + source: "scanning-parser", + index: 0, + content: [ + [ + { + text: "は", + reading: "は", + headwords: [[{ term: "は" }]], + }, + ], + ], + }, + ], + }, + } as unknown as Electron.BrowserWindow), + tokenizeWithMecab: async () => [ + { + headword: "は", + surface: "は", + reading: "ハ", + startPos: 0, + endPos: 1, + partOfSpeech: PartOfSpeech.particle, + pos1: "助詞", + isMerged: false, + isKnown: false, + isNPlusOneTarget: false, + }, + ], + getFrequencyRank: (text) => (text === "は" ? 10 : null), + }), + ); + + assert.equal(result.tokens?.length, 1); + assert.equal(result.tokens?.[0]?.pos1, "助詞"); + assert.equal(result.tokens?.[0]?.frequencyRank, undefined); +}); + test("tokenizeSubtitleService ignores invalid frequency ranks", async () => { const result = await tokenizeSubtitleService( "猫", @@ -753,6 +861,8 @@ test("tokenizeSubtitleService uses Yomitan parser result when available", async reading: "ねこ", headwords: [[{ term: "猫" }]], }, + ], + [ { text: "です", reading: "です", @@ -783,6 +893,155 @@ test("tokenizeSubtitleService uses Yomitan parser result when available", async assert.equal(result.tokens?.[1]?.isKnown, false); }); +test("tokenizeSubtitleService logs selected Yomitan groups when debug toggle is enabled", async () => { + const infoLogs: string[] = []; + const originalInfo = console.info; + console.info = (...args: unknown[]) => { + infoLogs.push(args.map((value) => String(value)).join(" ")); + }; + + try { + await tokenizeSubtitleService( + "友達と話した", + makeDeps({ + getYomitanExt: () => ({ id: "dummy-ext" } as any), + getYomitanParserWindow: () => ({ + isDestroyed: () => false, + webContents: { + executeJavaScript: async () => [ + { + source: "scanning-parser", + index: 0, + content: [ + [ + { + text: "友", + reading: "とも", + headwords: [[{ term: "友達" }]], + }, + { + text: "達", + reading: "だち", + }, + ], + [ + { + text: "と", + reading: "と", + headwords: [[{ term: "と" }]], + }, + ], + ], + }, + ], + }, + } as unknown as Electron.BrowserWindow), + tokenizeWithMecab: async () => null, + getYomitanGroupDebugEnabled: () => true, + }), + ); + } finally { + console.info = originalInfo; + } + + assert.ok( + infoLogs.some((line) => line.includes("Selected Yomitan token groups")), + ); +}); + +test("tokenizeSubtitleService does not log Yomitan groups when debug toggle is disabled", async () => { + const infoLogs: string[] = []; + const originalInfo = console.info; + console.info = (...args: unknown[]) => { + infoLogs.push(args.map((value) => String(value)).join(" ")); + }; + + try { + await tokenizeSubtitleService( + "友達と話した", + makeDeps({ + getYomitanExt: () => ({ id: "dummy-ext" } as any), + getYomitanParserWindow: () => ({ + isDestroyed: () => false, + webContents: { + executeJavaScript: async () => [ + { + source: "scanning-parser", + index: 0, + content: [ + [ + { + text: "友", + reading: "とも", + headwords: [[{ term: "友達" }]], + }, + { + text: "達", + reading: "だち", + }, + ], + ], + }, + ], + }, + } as unknown as Electron.BrowserWindow), + tokenizeWithMecab: async () => null, + getYomitanGroupDebugEnabled: () => false, + }), + ); + } finally { + console.info = originalInfo; + } + + assert.equal( + infoLogs.some((line) => line.includes("Selected Yomitan token groups")), + false, + ); +}); + +test("tokenizeSubtitleService preserves segmented Yomitan line as one token", async () => { + const parserWindow = { + isDestroyed: () => false, + webContents: { + executeJavaScript: async () => [ + { + source: "scanning-parser", + index: 0, + content: [ + [ + { + text: "猫", + reading: "ねこ", + headwords: [[{ term: "猫です" }]], + }, + { + text: "です", + reading: "です", + }, + ], + ], + }, + ], + }, + } as unknown as Electron.BrowserWindow; + + const result = await tokenizeSubtitleService( + "猫です", + makeDeps({ + getYomitanExt: () => ({ id: "dummy-ext" } as any), + getYomitanParserWindow: () => parserWindow, + tokenizeWithMecab: async () => null, + }), + ); + + assert.equal(result.text, "猫です"); + assert.equal(result.tokens?.length, 1); + assert.equal(result.tokens?.[0]?.surface, "猫です"); + assert.equal(result.tokens?.[0]?.reading, "ねこです"); + assert.equal(result.tokens?.[0]?.headword, "猫です"); + assert.equal(result.tokens?.[0]?.isKnown, false); +}); + test("tokenizeSubtitleService prefers mecab parser tokens when scanning parser returns one token", async () => { const result = await tokenizeSubtitleService( "俺は小園にいきたい", @@ -880,6 +1139,59 @@ test("tokenizeSubtitleService keeps scanning parser tokens when they are already assert.equal(result.tokens?.[2]?.frequencyRank, undefined); }); +test("tokenizeSubtitleService prefers parse candidates with fewer fragment-only kana tokens when source priority is equal", async () => { + const result = await tokenizeSubtitleService( + "俺は公園にいきたい", + makeDeps({ + getYomitanExt: () => ({ id: "dummy-ext" } as any), + getYomitanParserWindow: () => ({ + isDestroyed: () => false, + webContents: { + executeJavaScript: async () => [ + { + source: "mecab-fragmented", + index: 0, + content: [ + [{ text: "俺", reading: "おれ", headwords: [[{ term: "俺" }]] }], + [{ text: "は", reading: "", headwords: [[{ term: "は" }]] }], + [{ text: "公園", reading: "こうえん", headwords: [[{ term: "公園" }]] }], + [{ text: "にい", reading: "", headwords: [[{ term: "兄" }], [{ term: "二位" }]] }], + [{ text: "きたい", reading: "", headwords: [[{ term: "期待" }], [{ term: "来る" }]] }], + ], + }, + { + source: "mecab", + index: 0, + content: [ + [{ text: "俺", reading: "おれ", headwords: [[{ term: "俺" }]] }], + [{ text: "は", reading: "は", headwords: [[{ term: "は" }]] }], + [{ text: "公園", reading: "こうえん", headwords: [[{ term: "公園" }]] }], + [{ text: "に", reading: "に", headwords: [[{ term: "に" }]] }], + [{ text: "行きたい", reading: "いきたい", headwords: [[{ term: "行きたい" }]] }], + ], + }, + ], + }, + } as unknown as Electron.BrowserWindow), + getFrequencyDictionaryEnabled: () => true, + getFrequencyRank: (text) => + text === "俺" + ? 51 + : text === "公園" + ? 2304 + : text === "行きたい" + ? 1500 + : null, + tokenizeWithMecab: async () => null, + }), + ); + + assert.equal(result.tokens?.map((token) => token.surface).join(","), "俺,は,公園,に,行きたい"); + assert.equal(result.tokens?.[1]?.frequencyRank, undefined); + assert.equal(result.tokens?.[3]?.frequencyRank, undefined); + assert.equal(result.tokens?.[4]?.frequencyRank, 1500); +}); + test("tokenizeSubtitleService still assigns frequency to non-known Yomitan tokens", async () => { const result = await tokenizeSubtitleService( "小園に", @@ -914,7 +1226,7 @@ test("tokenizeSubtitleService still assigns frequency to non-known Yomitan token assert.equal(result.tokens?.[0]?.isKnown, true); assert.equal(result.tokens?.[0]?.frequencyRank, 75); assert.equal(result.tokens?.[1]?.isKnown, false); - assert.equal(result.tokens?.[1]?.frequencyRank, undefined); + assert.equal(result.tokens?.[1]?.frequencyRank, 3000); }); test("tokenizeSubtitleService marks tokens as known using callback", async () => { diff --git a/src/core/services/tokenizer-service.ts b/src/core/services/tokenizer-service.ts index 69d99f9..faa4ab0 100644 --- a/src/core/services/tokenizer-service.ts +++ b/src/core/services/tokenizer-service.ts @@ -13,6 +13,7 @@ import { shouldIgnoreJlptForMecabPos1, shouldIgnoreJlptByTerm, } from "./jlpt-token-filter"; +import { createLogger } from "../../logger"; interface YomitanParseHeadword { term?: unknown; @@ -37,6 +38,7 @@ const KATAKANA_CODEPOINT_START = 0x30a1; const KATAKANA_CODEPOINT_END = 0x30f6; const JLPT_LEVEL_LOOKUP_CACHE_LIMIT = 2048; const FREQUENCY_RANK_LOOKUP_CACHE_LIMIT = 2048; +const logger = createLogger("main:tokenizer"); const jlptLevelLookupCaches = new WeakMap< (text: string) => JlptLevel | null, @@ -70,6 +72,7 @@ export interface TokenizerServiceDeps { getFrequencyDictionaryEnabled?: () => boolean; getFrequencyRank?: FrequencyDictionaryLookup; getMinSentenceWordsForNPlusOne?: () => number; + getYomitanGroupDebugEnabled?: () => boolean; tokenizeWithMecab: (text: string) => Promise; } @@ -92,6 +95,7 @@ export interface TokenizerDepsRuntimeOptions { getFrequencyDictionaryEnabled?: () => boolean; getFrequencyRank?: FrequencyDictionaryLookup; getMinSentenceWordsForNPlusOne?: () => number; + getYomitanGroupDebugEnabled?: () => boolean; getMecabTokenizer: () => MecabTokenizerLike | null; } @@ -197,6 +201,8 @@ export function createTokenizerDepsRuntimeService( getFrequencyRank: options.getFrequencyRank, getMinSentenceWordsForNPlusOne: options.getMinSentenceWordsForNPlusOne ?? (() => 3), + getYomitanGroupDebugEnabled: + options.getYomitanGroupDebugEnabled ?? (() => false), tokenizeWithMecab: async (text) => { const mecabTokenizer = options.getMecabTokenizer(); if (!mecabTokenizer) { @@ -253,40 +259,19 @@ function resolveFrequencyLookupText(token: MergedToken): string { } function getFrequencyLookupTextCandidates(token: MergedToken): string[] { - const tokenWithCandidates = token as MergedToken & { - frequencyLookupTerms?: string[]; - }; - const lookupTextCandidates: string[] = []; - const addLookupText = (text: string | undefined): void => { - if (!text) { - return; - } - const trimmed = text.trim(); - if (!trimmed) { - return; - } - lookupTextCandidates.push(trimmed); - }; + const lookupText = resolveFrequencyLookupText(token).trim(); + return lookupText ? [lookupText] : []; +} - if (Array.isArray(tokenWithCandidates.frequencyLookupTerms)) { - for (const term of tokenWithCandidates.frequencyLookupTerms) { - addLookupText(term); - } +function isFrequencyExcludedByPos(token: MergedToken): boolean { + if ( + token.partOfSpeech === PartOfSpeech.particle || + token.partOfSpeech === PartOfSpeech.bound_auxiliary + ) { + return true; } - addLookupText(resolveFrequencyLookupText(token)); - - const uniqueLookupTerms: string[] = []; - const seen = new Set(); - for (const term of lookupTextCandidates) { - if (seen.has(term)) { - continue; - } - seen.add(term); - uniqueLookupTerms.push(term); - } - - return uniqueLookupTerms; + return token.pos1 === "助詞" || token.pos1 === "助動詞"; } function applyFrequencyMarking( @@ -294,6 +279,10 @@ function applyFrequencyMarking( getFrequencyRank: FrequencyDictionaryLookup, ): MergedToken[] { return tokens.map((token) => { + if (isFrequencyExcludedByPos(token)) { + return { ...token, frequencyRank: undefined }; + } + const lookupTexts = getFrequencyLookupTextCandidates(token); if (lookupTexts.length === 0) { return { ...token, frequencyRank: undefined }; @@ -499,27 +488,6 @@ function extractYomitanHeadword(segment: YomitanParseSegment): string { return ""; } -function extractYomitanHeadwords(segment: YomitanParseSegment): string[] { - const headwords = segment.headwords; - if (!isYomitanHeadwordRows(headwords)) { - return []; - } - - const results: string[] = []; - for (const group of headwords) { - for (const candidate of group) { - if (isString(candidate.term)) { - const term = candidate.term.trim(); - if (term.length > 0) { - results.push(term); - } - } - } - } - - return results; -} - function applyJlptMarking( tokens: MergedToken[], getJlptLevel: (text: string) => JlptLevel | null, @@ -575,41 +543,53 @@ function mapYomitanParseResultItemToMergedTokens( } validLineCount += 1; + let combinedSurface = ""; + let combinedReading = ""; + let combinedHeadword = ""; + for (const segment of line) { const segmentText = segment.text; if (!segmentText || segmentText.length === 0) { continue; } - const start = charOffset; - const end = start + segmentText.length; - charOffset = end; - - const headword = extractYomitanHeadword(segment) || segmentText; - const frequencyLookupTerms = extractYomitanHeadwords(segment); - - tokens.push({ - surface: segmentText, - reading: typeof segment.reading === "string" ? segment.reading : "", - headword, - startPos: start, - endPos: end, - partOfSpeech: PartOfSpeech.other, - pos1: "", - isMerged: true, - isNPlusOneTarget: false, - isKnown: (() => { - const matchText = resolveKnownWordText( - segmentText, - headword, - knownWordMatchMode, - ); - return matchText ? isKnownWord(matchText) : false; - })(), - frequencyLookupTerms: - frequencyLookupTerms.length > 0 ? frequencyLookupTerms : undefined, - }); + combinedSurface += segmentText; + if (typeof segment.reading === "string") { + combinedReading += segment.reading; + } + if (!combinedHeadword) { + combinedHeadword = extractYomitanHeadword(segment); + } } + + if (!combinedSurface) { + continue; + } + + const start = charOffset; + const end = start + combinedSurface.length; + charOffset = end; + const headword = combinedHeadword || combinedSurface; + + tokens.push({ + surface: combinedSurface, + reading: combinedReading, + headword, + startPos: start, + endPos: end, + partOfSpeech: PartOfSpeech.other, + pos1: "", + isMerged: true, + isNPlusOneTarget: false, + isKnown: (() => { + const matchText = resolveKnownWordText( + combinedSurface, + headword, + knownWordMatchMode, + ); + return matchText ? isKnownWord(matchText) : false; + })(), + }); } if (validLineCount === 0 || tokens.length === 0) { @@ -641,13 +621,52 @@ function selectBestYomitanParseCandidate( current.tokens.length > best.tokens.length ? current : best, ); + const getCandidateScore = (candidate: YomitanParseCandidate): number => { + const readableTokenCount = candidate.tokens.filter( + (token) => token.reading.trim().length > 0, + ).length; + const suspiciousKanaFragmentCount = candidate.tokens.filter((token) => + token.reading.trim().length === 0 && + token.surface.length >= 2 && + Array.from(token.surface).every((char) => isKanaChar(char)) + ).length; + + return ( + readableTokenCount * 100 - + suspiciousKanaFragmentCount * 50 - + candidate.tokens.length + ); + }; + + const chooseBestCandidate = ( + items: YomitanParseCandidate[], + ): YomitanParseCandidate | null => { + if (items.length === 0) { + return null; + } + + return items.reduce((best, current) => { + const bestScore = getCandidateScore(best); + const currentScore = getCandidateScore(current); + if (currentScore !== bestScore) { + return currentScore > bestScore ? current : best; + } + + if (current.tokens.length !== best.tokens.length) { + return current.tokens.length < best.tokens.length ? current : best; + } + + return best; + }); + }; + if (scanningCandidates.length > 0) { const bestScanning = getBestByTokenCount(scanningCandidates); if (bestScanning && bestScanning.tokens.length > 1) { return bestScanning.tokens; } - const bestMecab = getBestByTokenCount(mecabCandidates); + const bestMecab = chooseBestCandidate(mecabCandidates); if ( bestMecab && bestMecab.tokens.length > (bestScanning?.tokens.length ?? 0) @@ -658,7 +677,11 @@ function selectBestYomitanParseCandidate( return bestScanning ? bestScanning.tokens : null; } - const bestCandidate = getBestByTokenCount(candidates); + const multiTokenCandidates = candidates.filter( + (candidate) => candidate.tokens.length > 1, + ); + const pool = multiTokenCandidates.length > 0 ? multiTokenCandidates : candidates; + const bestCandidate = chooseBestCandidate(pool); return bestCandidate ? bestCandidate.tokens : null; } @@ -688,6 +711,25 @@ function mapYomitanParseResultsToMergedTokens( return bestCandidate; } +function logSelectedYomitanGroups(text: string, tokens: MergedToken[]): void { + if (!tokens || tokens.length === 0) { + return; + } + + logger.info("Selected Yomitan token groups", { + text, + tokenCount: tokens.length, + groups: tokens.map((token, index) => ({ + index, + surface: token.surface, + headword: token.headword, + reading: token.reading, + startPos: token.startPos, + endPos: token.endPos, + })), + }); +} + function pickClosestMecabPos1( token: MergedToken, mecabTokens: MergedToken[], @@ -930,6 +972,10 @@ async function parseWithYomitanInternalParser( return null; } + if (deps.getYomitanGroupDebugEnabled?.() === true) { + logSelectedYomitanGroups(text, yomitanTokens); + } + return enrichYomitanPos1(yomitanTokens, deps, text); } catch (err) { console.error("Yomitan parser request failed:", (err as Error).message); diff --git a/src/main.ts b/src/main.ts index b8861be..2ecf51f 100644 --- a/src/main.ts +++ b/src/main.ts @@ -911,6 +911,7 @@ async function tokenizeSubtitle(text: string): Promise { getFrequencyDictionaryEnabled: () => getResolvedConfig().subtitleStyle.frequencyDictionary.enabled, getFrequencyRank: (text) => appState.frequencyRankLookup(text), + getYomitanGroupDebugEnabled: () => appState.overlayDebugVisualizationEnabled, getMecabTokenizer: () => appState.mecabTokenizer, }), ); diff --git a/src/types.ts b/src/types.ts index de4217f..f602b4c 100644 --- a/src/types.ts +++ b/src/types.ts @@ -56,7 +56,6 @@ export interface MergedToken { isNPlusOneTarget: boolean; jlptLevel?: JlptLevel; frequencyRank?: number; - frequencyLookupTerms?: string[]; } export type FrequencyDictionaryLookup = (term: string) => number | null;