diff --git a/backlog/tasks/task-90 - Normalize-narrow-Unicode-whitespace-in-tokenizer-input.md b/backlog/tasks/task-90 - Normalize-narrow-Unicode-whitespace-in-tokenizer-input.md new file mode 100644 index 0000000..7f36e84 --- /dev/null +++ b/backlog/tasks/task-90 - Normalize-narrow-Unicode-whitespace-in-tokenizer-input.md @@ -0,0 +1,36 @@ +--- +id: TASK-90 +title: Normalize narrow Unicode whitespace in tokenizer input +status: Done +assignee: [] +created_date: '2026-02-20 06:17' +updated_date: '2026-02-20 06:20' +labels: [] +dependencies: [] +priority: medium +--- + +## Description + + +Fix tokenizer behavior where subtitle lines containing narrow/invisible Unicode spacing between Japanese segments can be split/grouped incorrectly compared with normal space handling. + + +## Acceptance Criteria + +- [x] #1 A regression test reproduces the subtitle sample containing narrow/invisible Unicode spacing and fails before fix. +- [x] #2 Tokenizer normalization treats narrow/invisible spacing variants consistently with regular spacing for grouping/highlight behavior. +- [x] #3 Existing tokenizer tests still pass. + + +## Implementation Notes + + +Linked from subagent session `codex-narrow-space-tokenizer-20260220T061716Z-p97s`. + +Added `src/subtitle/stages/normalize.test.ts` regression for `\u200B` separator in subtitle sample and updated `normalizeTokenizerInput` to map `U+200B/U+2060/U+FEFF` to regular spaces before whitespace collapsing. + +Validation: +- `bun run build && node --test dist/subtitle/stages/normalize.test.js` +- `node --test dist/core/services/tokenizer.test.js` + diff --git a/backlog/tasks/task-91 - Add-config-toggle-to-preserve-visible-overlay-subtitle-line-breaks.md b/backlog/tasks/task-91 - Add-config-toggle-to-preserve-visible-overlay-subtitle-line-breaks.md new file mode 100644 index 0000000..9571455 --- /dev/null +++ b/backlog/tasks/task-91 - Add-config-toggle-to-preserve-visible-overlay-subtitle-line-breaks.md @@ -0,0 +1,36 @@ +--- +id: TASK-91 +title: Add config toggle to preserve visible overlay subtitle line breaks +status: Done +assignee: [] +created_date: '2026-02-20 06:35' +updated_date: '2026-02-20 06:42' +labels: [] +dependencies: [] +priority: medium +--- + +## Description + + +Add a `subtitleStyle` config option that keeps visible-overlay subtitle line breaks (newline/carriage-return normalized to line breaks) instead of flattening them to spaces. Default should preserve current behavior for consistency with texthooker. + + +## Acceptance Criteria + +- [x] #1 New config option exists with default disabled and validation/docs coverage. +- [x] #2 When enabled, visible overlay preserves subtitle line breaks while rendering tokenized subtitles. +- [x] #3 When disabled, current rendering behavior remains unchanged. +- [x] #4 Relevant config + renderer tests pass. + + +## Implementation Notes + + +Added `subtitleStyle.preserveLineBreaks` (default `false`) to types/default config/registry/config validation and docs/example configs. + +Renderer now supports line-break-preserving token output via `alignTokensToSourceText` in `src/renderer/subtitle-render.ts`, which inserts source-text separators (including `\n`) between token spans when enabled. + +Validation: +- `bun run build && node --test dist/config/config.test.js dist/renderer/subtitle-render.test.js` + diff --git a/config.example.jsonc b/config.example.jsonc index ffdcaa2..89cfb3d 100644 --- a/config.example.jsonc +++ b/config.example.jsonc @@ -166,6 +166,7 @@ // ========================================== "subtitleStyle": { "enableJlpt": false, // Enable JLPT vocabulary level underlines. When disabled, JLPT tagging lookup and underlines are skipped. Values: true | false + "preserveLineBreaks": false, // Preserve line breaks in visible overlay subtitle rendering. When false, line breaks are flattened to spaces for a single-line flow. Values: true | false "fontFamily": "Noto Sans CJK JP Regular, Noto Sans CJK JP, Arial Unicode MS, Arial, sans-serif", // Font family setting. "fontSize": 35, // Font size setting. "fontColor": "#cad3f5", // Font color setting. diff --git a/docs/configuration.md b/docs/configuration.md index 790ca3f..8c12c2a 100644 --- a/docs/configuration.md +++ b/docs/configuration.md @@ -724,6 +724,7 @@ See `config.example.jsonc` for detailed configuration options. | `fontStyle` | string | `"normal"` or `"italic"` (default: `"normal"`) | | `backgroundColor` | string | Any CSS color, including `"transparent"` (default: `"rgb(30, 32, 48, 0.88)"`) | | `enableJlpt` | boolean | Enable JLPT level underline styling (`false` by default) | +| `preserveLineBreaks` | boolean | Preserve line breaks in visible overlay subtitle rendering (`false` by default). Enable to mirror mpv line layout. | | `frequencyDictionary.enabled` | boolean | Enable frequency highlighting from dictionary lookups (`false` by default) | | `frequencyDictionary.sourcePath` | string | Path to a local frequency dictionary root. Leave empty or omit to use the built-in bundled dictionary search paths. | | `frequencyDictionary.topX` | number | Only color tokens whose frequency rank is `<= topX` (`1000` by default) | diff --git a/docs/public/config.example.jsonc b/docs/public/config.example.jsonc index ffdcaa2..89cfb3d 100644 --- a/docs/public/config.example.jsonc +++ b/docs/public/config.example.jsonc @@ -166,6 +166,7 @@ // ========================================== "subtitleStyle": { "enableJlpt": false, // Enable JLPT vocabulary level underlines. When disabled, JLPT tagging lookup and underlines are skipped. Values: true | false + "preserveLineBreaks": false, // Preserve line breaks in visible overlay subtitle rendering. When false, line breaks are flattened to spaces for a single-line flow. Values: true | false "fontFamily": "Noto Sans CJK JP Regular, Noto Sans CJK JP, Arial Unicode MS, Arial, sans-serif", // Font family setting. "fontSize": 35, // Font size setting. "fontColor": "#cad3f5", // Font color setting. diff --git a/docs/subagents/INDEX.md b/docs/subagents/INDEX.md index 27ee144..e49fe45 100644 --- a/docs/subagents/INDEX.md +++ b/docs/subagents/INDEX.md @@ -6,10 +6,12 @@ Read first. Keep concise. | ------------ | -------------- | ---------------------------------------------------- | --------- | ------------------------------------- | ---------------------- | | `codex-main` | `planner-exec` | `Fix frequency/N+1 regression in plugin --start flow` | `in_progress` | `docs/subagents/agents/codex-main.md` | `2026-02-19T19:36:46Z` | | `codex-config-validation-20260219T172015Z-iiyf` | `codex-config-validation` | `Find root cause of config validation error for ~/.config/SubMiner/config.jsonc` | `completed` | `docs/subagents/agents/codex-config-validation-20260219T172015Z-iiyf.md` | `2026-02-19T17:26:17Z` | -| `codex-task85-20260219T233711Z-46hc` | `codex-task85` | `Resume TASK-85 maintainability refactor from latest handoff point` | `in_progress` | `docs/subagents/agents/codex-task85-20260219T233711Z-46hc.md` | `2026-02-20T05:31:05Z` | +| `codex-task85-20260219T233711Z-46hc` | `codex-task85` | `Resume TASK-85 maintainability refactor from latest handoff point` | `in_progress` | `docs/subagents/agents/codex-task85-20260219T233711Z-46hc.md` | `2026-02-20T05:50:43Z` | | `codex-anilist-deeplink-20260219T233926Z` | `anilist-deeplink` | `Fix external subminer:// AniList callback handling from browser` | `done` | `docs/subagents/agents/codex-anilist-deeplink-20260219T233926Z.md` | `2026-02-19T23:59:21Z` | | `codex-texthooker-highlights-20260220T002354Z-927c` | `codex-texthooker-highlights` | `Add optional texthooker highlight toggles for known/n+1/frequency/JLPT` | `completed` | `docs/subagents/agents/codex-texthooker-highlights-20260220T002354Z-927c.md` | `2026-02-20T00:30:49Z` | | `codex-texthooker-ui-playwright-20260220T003827Z-k3p9` | `codex-texthooker-ui-playwright` | `Run Playwright MCP smoke/regression checks for texthooker-ui changes` | `completed` | `docs/subagents/agents/codex-texthooker-ui-playwright-20260220T003827Z-k3p9.md` | `2026-02-20T00:42:09Z` | | `codex-texthooker-color-ws-20260220T005844Z-r7m2` | `codex-texthooker-color-ws` | `Fix texthooker websocket payload so token highlight colors render` | `completed` | `docs/subagents/agents/codex-texthooker-color-ws-20260220T005844Z-r7m2.md` | `2026-02-20T01:01:00Z` | | `codex-nplusone-pos1-20260220T012300Z-c5he` | `codex-nplusone-pos1` | `Fix N+1 false-negative when Yomitan functional tokens inflate unknown candidate count` | `completed` | `docs/subagents/agents/codex-nplusone-pos1-20260220T012300Z-c5he.md` | `2026-02-20T01:28:20Z` | | `codex-subtitle-bg-20260220T054247Z-h9cu` | `codex-subtitle-bg` | `Update default subtitle background color to requested RGBA value` | `completed` | `docs/subagents/agents/codex-subtitle-bg-20260220T054247Z-h9cu.md` | `2026-02-20T05:44:45Z` | +| `codex-narrow-space-tokenizer-20260220T061716Z-p97s` | `codex-narrow-space-tokenizer` | `Fix tokenization when subtitle line contains narrow/invisible Unicode spacing between segments` | `completed` | `docs/subagents/agents/codex-narrow-space-tokenizer-20260220T061716Z-p97s.md` | `2026-02-20T06:20:07Z` | +| `codex-preserve-linebreaks-20260220T063538Z-s4nd` | `codex-preserve-linebreaks` | `Add config option to preserve subtitle line breaks in visible overlay rendering` | `completed` | `docs/subagents/agents/codex-preserve-linebreaks-20260220T063538Z-s4nd.md` | `2026-02-20T06:42:51Z` | diff --git a/docs/subagents/agents/codex-narrow-space-tokenizer-20260220T061716Z-p97s.md b/docs/subagents/agents/codex-narrow-space-tokenizer-20260220T061716Z-p97s.md new file mode 100644 index 0000000..84fe77a --- /dev/null +++ b/docs/subagents/agents/codex-narrow-space-tokenizer-20260220T061716Z-p97s.md @@ -0,0 +1,34 @@ +# Agent: `codex-narrow-space-tokenizer-20260220T061716Z-p97s` + +- alias: `codex-narrow-space-tokenizer` +- mission: `Fix narrow/invisible subtitle spacing causing incorrect tokenizer boundaries.` +- status: `done` +- branch: `main` +- started_at: `2026-02-20T06:17:31Z` +- heartbeat_minutes: `5` + +## Current Work (newest first) +- [2026-02-20T06:20:07Z] handoff: normalized invisible separators in tokenizer input; added regression test; targeted tests green. +- [2026-02-20T06:19:20Z] test: `bun run build && node --test dist/subtitle/stages/normalize.test.js` (pass, 1/1); `node --test dist/core/services/tokenizer.test.js` (pass, 43/43). +- [2026-02-20T06:18:38Z] edit: updated `normalizeTokenizerInput` to map `U+200B/U+2060/U+FEFF` to regular spaces before whitespace collapsing. +- [2026-02-20T06:18:02Z] test: added failing regression for subtitle sample with `\u200B` separator. +- [2026-02-20T06:17:31Z] intent: create TASK-90; TDD-first regression for narrow Unicode spacing in subtitle line `キリキリと かかってこい`. +- [2026-02-20T06:17:31Z] progress: coordination started; index row added; scanning tokenizer normalization points next. + +## Files Touched +- `docs/subagents/INDEX.md` +- `docs/subagents/agents/codex-narrow-space-tokenizer-20260220T061716Z-p97s.md` +- `docs/subagents/collaboration.md` +- `backlog/tasks/task-90 - Normalize-narrow-Unicode-whitespace-in-tokenizer-input.md` +- `src/subtitle/stages/normalize.ts` +- `src/subtitle/stages/normalize.test.ts` + +## Assumptions +- issue likely Unicode spacing code point treated as token boundary. +- target behavior: collapse/normalize narrow spacing to standard spacing before lookup token grouping. + +## Open Questions / Blockers +- possible overlap with TASK-85 refactor touching tokenizer paths. + +## Next Step +- done. diff --git a/docs/subagents/agents/codex-preserve-linebreaks-20260220T063538Z-s4nd.md b/docs/subagents/agents/codex-preserve-linebreaks-20260220T063538Z-s4nd.md new file mode 100644 index 0000000..b6a2436 --- /dev/null +++ b/docs/subagents/agents/codex-preserve-linebreaks-20260220T063538Z-s4nd.md @@ -0,0 +1,42 @@ +# Agent: `codex-preserve-linebreaks-20260220T063538Z-s4nd` + +- alias: `codex-preserve-linebreaks` +- mission: `Add config option to preserve subtitle line breaks in visible overlay rendering.` +- status: `done` +- branch: `main` +- started_at: `2026-02-20T06:35:38Z` +- heartbeat_minutes: `5` + +## Current Work (newest first) +- [2026-02-20T06:42:51Z] handoff: TASK-91 complete; added config flag `subtitleStyle.preserveLineBreaks` (default false), renderer token-linebreak alignment path, tests/docs/examples updated. +- [2026-02-20T06:42:20Z] test: `bun run build && node --test dist/config/config.test.js dist/renderer/subtitle-render.test.js` pass (43/43); macOS helper compile falls back due sandboxed Swift cache write. +- [2026-02-20T06:41:07Z] edit: added `alignTokensToSourceText` helper + preserve-line-break render path in `src/renderer/subtitle-render.ts`; state/config plumbing added. +- [2026-02-20T06:39:34Z] test: added config parse/warn coverage + renderer helper newline-segment test. +- [2026-02-20T06:35:38Z] intent: create backlog ticket; implement opt-in config flag default-off; keep current normalization default behavior. +- [2026-02-20T06:35:38Z] progress: located normalization/render paths in `src/core/services/tokenizer.ts` and `src/renderer/subtitle-render.ts`. + +## Files Touched +- `docs/subagents/INDEX.md` +- `docs/subagents/agents/codex-preserve-linebreaks-20260220T063538Z-s4nd.md` +- `docs/subagents/collaboration.md` +- `backlog/tasks/task-91 - Add-config-toggle-to-preserve-visible-overlay-subtitle-line-breaks.md` +- `src/types.ts` +- `src/config/definitions.ts` +- `src/config/service.ts` +- `src/config/config.test.ts` +- `src/renderer/state.ts` +- `src/renderer/subtitle-render.ts` +- `src/renderer/subtitle-render.test.ts` +- `docs/configuration.md` +- `config.example.jsonc` +- `docs/public/config.example.jsonc` + +## Assumptions +- request targets visible overlay rendering parity with MPV line breaks. +- default behavior must remain whitespace-collapsed for tokenizer/texthooker consistency. + +## Open Questions / Blockers +- none. + +## Next Step +- done. diff --git a/docs/subagents/collaboration.md b/docs/subagents/collaboration.md index 4a4bbb2..ff4d0fa 100644 --- a/docs/subagents/collaboration.md +++ b/docs/subagents/collaboration.md @@ -7,3 +7,7 @@ Shared notes. Append-only. - [2026-02-20T00:01:40Z] [codex-anilist-deeplink|anilist-deeplink] preparing commit; scoping staged set to repo changes, excluding external reference dirs (vendor/yomitan-jlpt-vocab, mpv-anilist-updater). - [2026-02-20T05:42:54Z] [codex-subtitle-bg-20260220T054247Z-h9cu|codex-subtitle-bg] short config tweak requested: update default subtitle background color; scoping to config defaults/tests only. - [2026-02-20T05:44:45Z] [codex-subtitle-bg-20260220T054247Z-h9cu|codex-subtitle-bg] completed TASK-89; updated default subtitle background in config defaults/docs/examples/renderer CSS; config tests green. +- [2026-02-20T06:17:31Z] [codex-narrow-space-tokenizer-20260220T061716Z-p97s|codex-narrow-space-tokenizer] potential overlap notice: investigating tokenizer whitespace normalization and tests (likely `src/core/services/tokenizer-service.ts` + tests); coordinating to avoid clobber with ongoing TASK-85 refactor touches. +- [2026-02-20T06:20:07Z] [codex-narrow-space-tokenizer-20260220T061716Z-p97s|codex-narrow-space-tokenizer] completed TASK-90 fix in `src/subtitle/stages/normalize.ts`; normalize `U+200B/U+2060/U+FEFF` to spaces for tokenizer input; added regression test `src/subtitle/stages/normalize.test.ts`; targeted tokenizer suite green. +- [2026-02-20T06:35:38Z] [codex-preserve-linebreaks-20260220T063538Z-s4nd|codex-preserve-linebreaks] overlap note: touching subtitle config + renderer render path (`src/types.ts`, `src/config/*`, `src/renderer/subtitle-render.ts`, docs/config examples) to add optional preserve-line-breaks behavior while keeping default normalization unchanged. +- [2026-02-20T06:42:51Z] [codex-preserve-linebreaks-20260220T063538Z-s4nd|codex-preserve-linebreaks] completed TASK-91; added `subtitleStyle.preserveLineBreaks` config (default false), renderer token/source alignment helper to preserve visible overlay line breaks when enabled, config+renderer tests green. diff --git a/src/config/config.test.ts b/src/config/config.test.ts index 877658c..ed18e69 100644 --- a/src/config/config.test.ts +++ b/src/config/config.test.ts @@ -24,6 +24,7 @@ test('loads defaults when config is missing', () => { assert.equal(config.jellyfin.autoAnnounce, false); assert.equal(config.jellyfin.remoteControlDeviceName, 'SubMiner'); assert.equal(config.subtitleStyle.backgroundColor, 'rgb(30, 32, 48, 0.88)'); + assert.equal(config.subtitleStyle.preserveLineBreaks, false); assert.equal(config.immersionTracking.enabled, true); assert.equal(config.immersionTracking.dbPath, ''); assert.equal(config.immersionTracking.batchSize, 25); @@ -38,6 +39,44 @@ test('loads defaults when config is missing', () => { assert.equal(config.immersionTracking.retention.vacuumIntervalDays, 7); }); +test('parses subtitleStyle.preserveLineBreaks and warns on invalid values', () => { + const validDir = makeTempDir(); + fs.writeFileSync( + path.join(validDir, 'config.jsonc'), + `{ + "subtitleStyle": { + "preserveLineBreaks": true + } + }`, + 'utf-8', + ); + + const validService = new ConfigService(validDir); + assert.equal(validService.getConfig().subtitleStyle.preserveLineBreaks, true); + + const invalidDir = makeTempDir(); + fs.writeFileSync( + path.join(invalidDir, 'config.jsonc'), + `{ + "subtitleStyle": { + "preserveLineBreaks": "yes" + } + }`, + 'utf-8', + ); + + const invalidService = new ConfigService(invalidDir); + assert.equal( + invalidService.getConfig().subtitleStyle.preserveLineBreaks, + DEFAULT_CONFIG.subtitleStyle.preserveLineBreaks, + ); + assert.ok( + invalidService + .getWarnings() + .some((warning) => warning.path === 'subtitleStyle.preserveLineBreaks'), + ); +}); + test('parses anilist.enabled and warns for invalid value', () => { const dir = makeTempDir(); fs.writeFileSync( @@ -885,6 +924,7 @@ test('template generator includes known keys', () => { assert.match(output, /"logging":/); assert.match(output, /"websocket":/); assert.match(output, /"youtubeSubgen":/); + assert.match(output, /"preserveLineBreaks": false/); assert.match(output, /"nPlusOne"\s*:\s*\{/); assert.match(output, /"nPlusOne": "#c6a0f6"/); assert.match(output, /"knownWord": "#a6da95"/); diff --git a/src/config/definitions.ts b/src/config/definitions.ts index 7997ae1..cf4b08f 100644 --- a/src/config/definitions.ts +++ b/src/config/definitions.ts @@ -172,6 +172,7 @@ export const DEFAULT_CONFIG: ResolvedConfig = { }, subtitleStyle: { enableJlpt: false, + preserveLineBreaks: false, fontFamily: 'Noto Sans CJK JP Regular, Noto Sans CJK JP, Arial Unicode MS, Arial, sans-serif', fontSize: 35, fontColor: '#cad3f5', @@ -343,6 +344,14 @@ export const CONFIG_OPTION_REGISTRY: ConfigOptionRegistryEntry[] = [ 'Enable JLPT vocabulary level underlines. ' + 'When disabled, JLPT tagging lookup and underlines are skipped.', }, + { + path: 'subtitleStyle.preserveLineBreaks', + kind: 'boolean', + defaultValue: DEFAULT_CONFIG.subtitleStyle.preserveLineBreaks, + description: + 'Preserve line breaks in visible overlay subtitle rendering. ' + + 'When false, line breaks are flattened to spaces for a single-line flow.', + }, { path: 'subtitleStyle.frequencyDictionary.enabled', kind: 'boolean', diff --git a/src/config/service.ts b/src/config/service.ts index 9b5a0f1..73cbcec 100644 --- a/src/config/service.ts +++ b/src/config/service.ts @@ -746,6 +746,8 @@ export class ConfigService { } if (isObject(src.subtitleStyle)) { + const fallbackSubtitleStyleEnableJlpt = resolved.subtitleStyle.enableJlpt; + const fallbackSubtitleStylePreserveLineBreaks = resolved.subtitleStyle.preserveLineBreaks; resolved.subtitleStyle = { ...resolved.subtitleStyle, ...(src.subtitleStyle as ResolvedConfig['subtitleStyle']), @@ -761,6 +763,7 @@ export class ConfigService { if (enableJlpt !== undefined) { resolved.subtitleStyle.enableJlpt = enableJlpt; } else if ((src.subtitleStyle as { enableJlpt?: unknown }).enableJlpt !== undefined) { + resolved.subtitleStyle.enableJlpt = fallbackSubtitleStyleEnableJlpt; warn( 'subtitleStyle.enableJlpt', (src.subtitleStyle as { enableJlpt?: unknown }).enableJlpt, @@ -769,6 +772,23 @@ export class ConfigService { ); } + const preserveLineBreaks = asBoolean( + (src.subtitleStyle as { preserveLineBreaks?: unknown }).preserveLineBreaks, + ); + if (preserveLineBreaks !== undefined) { + resolved.subtitleStyle.preserveLineBreaks = preserveLineBreaks; + } else if ( + (src.subtitleStyle as { preserveLineBreaks?: unknown }).preserveLineBreaks !== undefined + ) { + resolved.subtitleStyle.preserveLineBreaks = fallbackSubtitleStylePreserveLineBreaks; + warn( + 'subtitleStyle.preserveLineBreaks', + (src.subtitleStyle as { preserveLineBreaks?: unknown }).preserveLineBreaks, + resolved.subtitleStyle.preserveLineBreaks, + 'Expected boolean.', + ); + } + const frequencyDictionary = isObject( (src.subtitleStyle as { frequencyDictionary?: unknown }).frequencyDictionary, ) diff --git a/src/renderer/state.ts b/src/renderer/state.ts index 18db689..4601407 100644 --- a/src/renderer/state.ts +++ b/src/renderer/state.ts @@ -79,6 +79,7 @@ export type RendererState = { jlptN3Color: string; jlptN4Color: string; jlptN5Color: string; + preserveSubtitleLineBreaks: boolean; frequencyDictionaryEnabled: boolean; frequencyDictionaryTopX: number; frequencyDictionaryMode: 'single' | 'banded'; @@ -155,6 +156,7 @@ export function createRendererState(): RendererState { jlptN3Color: '#f9e2af', jlptN4Color: '#a6e3a1', jlptN5Color: '#8aadf4', + preserveSubtitleLineBreaks: false, frequencyDictionaryEnabled: false, frequencyDictionaryTopX: 1000, frequencyDictionaryMode: 'single', diff --git a/src/renderer/subtitle-render.test.ts b/src/renderer/subtitle-render.test.ts index 692fb44..31f87dc 100644 --- a/src/renderer/subtitle-render.test.ts +++ b/src/renderer/subtitle-render.test.ts @@ -5,7 +5,7 @@ import path from 'node:path'; import type { MergedToken } from '../types'; import { PartOfSpeech } from '../types.js'; -import { computeWordClass } from './subtitle-render.js'; +import { alignTokensToSourceText, computeWordClass } from './subtitle-render.js'; function createToken(overrides: Partial): MergedToken { return { @@ -203,6 +203,19 @@ test('computeWordClass skips frequency class when rank is out of topX', () => { assert.equal(actual, 'word'); }); +test('alignTokensToSourceText preserves newline separators between adjacent token surfaces', () => { + const tokens = [ + createToken({ surface: 'キリキリと', reading: 'きりきりと', headword: 'キリキリと' }), + createToken({ surface: 'かかってこい', reading: 'かかってこい', headword: 'かかってこい' }), + ]; + + const segments = alignTokensToSourceText(tokens, 'キリキリと\nかかってこい'); + assert.deepEqual( + segments.map((segment) => (segment.kind === 'text' ? `text:${segment.text}` : 'token')), + ['token', 'text:\n', 'token'], + ); +}); + test('JLPT CSS rules use underline-only styling in renderer stylesheet', () => { const distCssPath = path.join(process.cwd(), 'dist', 'renderer', 'style.css'); const srcCssPath = path.join(process.cwd(), 'src', 'renderer', 'style.css'); diff --git a/src/renderer/subtitle-render.ts b/src/renderer/subtitle-render.ts index f996b8e..43491f2 100644 --- a/src/renderer/subtitle-render.ts +++ b/src/renderer/subtitle-render.ts @@ -9,11 +9,15 @@ type FrequencyRenderSettings = { bandedColors: [string, string, string, string, string]; }; -function normalizeSubtitle(text: string, trim = true): string { +function normalizeSubtitle(text: string, trim = true, collapseLineBreaks = false): string { if (!text) return ''; let normalized = text.replace(/\\N/g, '\n').replace(/\\n/g, '\n'); normalized = normalized.replace(/\{[^}]*\}/g, ''); + if (collapseLineBreaks) { + normalized = normalized.replace(/\n/g, ' '); + normalized = normalized.replace(/\s+/g, ' '); + } return trim ? normalized.trim() : normalized; } @@ -90,6 +94,8 @@ function renderWithTokens( root: HTMLElement, tokens: MergedToken[], frequencyRenderSettings?: Partial, + sourceText?: string, + preserveLineBreaks = false, ): void { const resolvedFrequencyRenderSettings = { ...DEFAULT_FREQUENCY_RENDER_SETTINGS, @@ -110,6 +116,29 @@ function renderWithTokens( const fragment = document.createDocumentFragment(); + if (preserveLineBreaks && sourceText) { + const normalizedSource = normalizeSubtitle(sourceText, true, false); + const segments = alignTokensToSourceText(tokens, normalizedSource); + + for (const segment of segments) { + if (segment.kind === 'text') { + renderPlainTextPreserveLineBreaks(fragment, segment.text); + continue; + } + + const token = segment.token; + const span = document.createElement('span'); + span.className = computeWordClass(token, resolvedFrequencyRenderSettings); + span.textContent = token.surface; + if (token.reading) span.dataset.reading = token.reading; + if (token.headword) span.dataset.headword = token.headword; + fragment.appendChild(span); + } + + root.appendChild(fragment); + return; + } + for (const token of tokens) { const surface = token.surface; @@ -142,6 +171,50 @@ function renderWithTokens( root.appendChild(fragment); } +type SubtitleRenderSegment = { kind: 'text'; text: string } | { kind: 'token'; token: MergedToken }; + +export function alignTokensToSourceText( + tokens: MergedToken[], + sourceText: string, +): SubtitleRenderSegment[] { + if (tokens.length === 0) { + return sourceText ? [{ kind: 'text', text: sourceText }] : []; + } + + const segments: SubtitleRenderSegment[] = []; + let cursor = 0; + + for (const token of tokens) { + const surface = token.surface; + if (!surface) { + continue; + } + + const foundIndex = sourceText.indexOf(surface, cursor); + if (foundIndex < 0) { + if (cursor < sourceText.length) { + segments.push({ kind: 'text', text: sourceText.slice(cursor) }); + } + segments.push({ kind: 'token', token }); + cursor = sourceText.length; + continue; + } + + if (foundIndex > cursor) { + segments.push({ kind: 'text', text: sourceText.slice(cursor, foundIndex) }); + } + + segments.push({ kind: 'token', token }); + cursor = foundIndex + surface.length; + } + + if (cursor < sourceText.length) { + segments.push({ kind: 'text', text: sourceText.slice(cursor) }); + } + + return segments; +} + export function computeWordClass( token: MergedToken, frequencySettings?: Partial, @@ -199,7 +272,7 @@ function renderCharacterLevel(root: HTMLElement, text: string): void { root.appendChild(fragment); } -function renderPlainTextPreserveLineBreaks(root: HTMLElement, text: string): void { +function renderPlainTextPreserveLineBreaks(root: ParentNode, text: string): void { const lines = text.split('\n'); const fragment = document.createDocumentFragment(); @@ -246,7 +319,13 @@ export function createSubtitleRenderer(ctx: RendererContext) { const normalized = normalizeSubtitle(text); if (tokens && tokens.length > 0) { - renderWithTokens(ctx.dom.subtitleRoot, tokens, getFrequencyRenderSettings()); + renderWithTokens( + ctx.dom.subtitleRoot, + tokens, + getFrequencyRenderSettings(), + text, + ctx.state.preserveSubtitleLineBreaks, + ); return; } renderCharacterLevel(ctx.dom.subtitleRoot, normalized); @@ -346,6 +425,7 @@ export function createSubtitleRenderer(ctx: RendererContext) { ctx.state.jlptN3Color = jlptColors.N3; ctx.state.jlptN4Color = jlptColors.N4; ctx.state.jlptN5Color = jlptColors.N5; + ctx.state.preserveSubtitleLineBreaks = style.preserveLineBreaks ?? false; ctx.dom.subtitleRoot.style.setProperty('--subtitle-jlpt-n1-color', jlptColors.N1); ctx.dom.subtitleRoot.style.setProperty('--subtitle-jlpt-n2-color', jlptColors.N2); ctx.dom.subtitleRoot.style.setProperty('--subtitle-jlpt-n3-color', jlptColors.N3); diff --git a/src/subtitle/stages/normalize.test.ts b/src/subtitle/stages/normalize.test.ts new file mode 100644 index 0000000..9584b06 --- /dev/null +++ b/src/subtitle/stages/normalize.test.ts @@ -0,0 +1,10 @@ +import test from 'node:test'; +import assert from 'node:assert/strict'; +import { normalizeTokenizerInput } from './normalize'; + +test('normalizeTokenizerInput collapses zero-width separators between Japanese segments', () => { + const input = 'キリキリと\u200bかかってこい\nこのヘナチョコ冒険者どもめが!'; + const normalized = normalizeTokenizerInput(input); + + assert.equal(normalized, 'キリキリと かかってこい このヘナチョコ冒険者どもめが!'); +}); diff --git a/src/subtitle/stages/normalize.ts b/src/subtitle/stages/normalize.ts index 234d7ca..6cbe0f8 100644 --- a/src/subtitle/stages/normalize.ts +++ b/src/subtitle/stages/normalize.ts @@ -2,6 +2,12 @@ export function normalizeDisplayText(text: string): string { return text.replace(/\r\n/g, '\n').replace(/\\N/g, '\n').replace(/\\n/g, '\n').trim(); } +const INVISIBLE_SEPARATOR_PATTERN = /[\u200b\u2060\ufeff]/g; + export function normalizeTokenizerInput(displayText: string): string { - return displayText.replace(/\n/g, ' ').replace(/\s+/g, ' ').trim(); + return displayText + .replace(/\n/g, ' ') + .replace(INVISIBLE_SEPARATOR_PATTERN, ' ') + .replace(/\s+/g, ' ') + .trim(); } diff --git a/src/types.ts b/src/types.ts index b038112..bd462e5 100644 --- a/src/types.ts +++ b/src/types.ts @@ -270,6 +270,7 @@ export interface AnkiConnectConfig { export interface SubtitleStyleConfig { enableJlpt?: boolean; + preserveLineBreaks?: boolean; fontFamily?: string; fontSize?: number; fontColor?: string;