diff --git a/docs/subagents/INDEX.md b/docs/subagents/INDEX.md index ee79974..318030d 100644 --- a/docs/subagents/INDEX.md +++ b/docs/subagents/INDEX.md @@ -86,3 +86,4 @@ Read first. Keep concise. | `codex-task109-discord-presence-20260222T220537Z-lkfv` | `codex-task109-discord-presence` | `Execute TASK-109 Discord Rich Presence integration end-to-end with plan-first workflow (no commit)` | `handoff` | `docs/subagents/agents/codex-task109-discord-presence-20260222T220537Z-lkfv.md` | `2026-02-22T22:36:40Z` | | `opencode-task103-jellyfin-main-composer-20260222T221152Z-n3p7` | `opencode-task103-jellyfin-main-composer` | `Implement TASK-103 Jellyfin runtime wiring extraction from main.ts into composer module(s), tests, docs, and required validations (no commit).` | `in_progress` | `docs/subagents/agents/opencode-task103-jellyfin-main-composer-20260222T221152Z-n3p7.md` | `2026-02-22T22:11:52Z` | | `opencode-task109-discord-presence-20260223T011027Z-j9r4` | `opencode-task109-discord-presence` | `Finalize TASK-109 Discord Rich Presence with plan-first workflow and backlog closure.` | `in_progress` | `docs/subagents/agents/opencode-task109-discord-presence-20260223T011027Z-j9r4.md` | `2026-02-23T01:15:39Z` | +| `codex-task88-yomitan-flow-20260223T012755Z-x4m2` | `codex-task88-yomitan-flow` | `Execute TASK-88 remove MeCab fallback tokenizer and simplify Yomitan token flow via plan-first workflow (no commit).` | `handoff` | `docs/subagents/agents/codex-task88-yomitan-flow-20260223T012755Z-x4m2.md` | `2026-02-23T01:44:16Z` | diff --git a/docs/subagents/agents/codex-task88-yomitan-flow-20260223T012755Z-x4m2.md b/docs/subagents/agents/codex-task88-yomitan-flow-20260223T012755Z-x4m2.md new file mode 100644 index 0000000..92d0903 --- /dev/null +++ b/docs/subagents/agents/codex-task88-yomitan-flow-20260223T012755Z-x4m2.md @@ -0,0 +1,40 @@ +# Agent: `codex-task88-yomitan-flow-20260223T012755Z-x4m2` + +- alias: `codex-task88-yomitan-flow` +- mission: `Execute TASK-88 remove MeCab fallback tokenizer and simplify Yomitan token flow via plan-first workflow (no commit).` +- status: `handoff` +- branch: `main` +- started_at: `2026-02-23T01:27:55Z` +- heartbeat_minutes: `5` + +## Current Work (newest first) + +- [2026-02-23T01:44:16Z] handoff: implementation + docs updates complete for TASK-88 scope; tokenizer fallback removed, parser-selection simplified to scanning-parser-only, focused tokenizer/subtitle tests + build + docs build green. +- [2026-02-23T01:44:16Z] test: `bun test src/core/services/tokenizer/parser-selection-stage.test.ts src/core/services/tokenizer.test.ts` pass (47); `bun test src/core/services/subtitle-processing-controller.test.ts` pass (6); `bun run build` pass; `bun run docs:build` pass. +- [2026-02-23T01:30:00Z] progress: wrote plan at `docs/plans/2026-02-23-task-88-yomitan-only-token-flow.md` via writing-plans skill and executed via executing-plans skill. +- [2026-02-23T01:27:55Z] intent: load backlog context for TASK-88, write plan with writing-plans skill, execute with executing-plans skill, validate via focused/full tests, no commit. + +## Files Touched + +- `docs/subagents/agents/codex-task88-yomitan-flow-20260223T012755Z-x4m2.md` +- `docs/subagents/INDEX.md` +- `docs/subagents/collaboration.md` +- `docs/plans/2026-02-23-task-88-yomitan-only-token-flow.md` +- `src/core/services/tokenizer.ts` +- `src/core/services/tokenizer/parser-selection-stage.ts` +- `src/core/services/tokenizer/parser-selection-stage.test.ts` +- `src/core/services/tokenizer.test.ts` +- `docs/usage.md` +- `docs/troubleshooting.md` + +## Assumptions + +- Backlog is initialized and TASK-88 title/context from MCP search is authoritative despite stale `task_view` collision on legacy TASK-88. + +## Open Questions / Blockers + +- Backlog MCP `task_view TASK-88` resolves to a legacy completed TASK-88 entry; current TASK-88 content had to be read from `backlog/tasks/task-88 - Remove-MeCab-fallback-tokenizer-and-simplify-Yomitan-token-flow.md`. + +## Next Step + +- If needed, repair duplicate TASK-88 ID collision in Backlog MCP so `task_view`/`task_edit` target the active To Do ticket. diff --git a/docs/subagents/collaboration.md b/docs/subagents/collaboration.md index 0b40ecf..6ae369b 100644 --- a/docs/subagents/collaboration.md +++ b/docs/subagents/collaboration.md @@ -148,3 +148,5 @@ Shared notes. Append-only. - [2026-02-23T01:10:27Z] [opencode-task109-discord-presence-20260223T011027Z-j9r4|opencode-task109-discord-presence] starting TASK-109 closure pass via Backlog MCP + writing-plans/executing-plans; scope validate existing Discord config/runtime/docs changes, close remaining DoD evidence, and finalize task status if gates pass. - [2026-02-23T01:15:39Z] [opencode-task109-discord-presence-20260223T011027Z-j9r4|opencode-task109-discord-presence] user feedback from real Discord session: status resumed to Playing with noticeable delay; tuned default `discordPresence.updateIntervalMs` from 15000 to 3000 in defaults/docs/examples and updated focused config expectations; reran focused config + discord presence tests green. +- [2026-02-23T01:27:55Z] [codex-task88-yomitan-flow-20260223T012755Z-x4m2|codex-task88-yomitan-flow] starting TASK-88 via Backlog MCP + writing-plans/executing-plans; expected overlap in tokenizer modules (`src/core/services/tokenizer*`, Yomitan flow wiring/tests); will keep scope to MeCab fallback removal and token flow simplification. +- [2026-02-23T01:44:16Z] [codex-task88-yomitan-flow-20260223T012755Z-x4m2|codex-task88-yomitan-flow] completed TASK-88 implementation pass: removed MeCab fallback branch from `tokenizeSubtitle`, restricted parser-selection to `scanning-parser` candidates, refreshed tokenizer regressions for Yomitan-only flow, updated usage/troubleshooting docs, and verified tokenizer+subtitle suites/build/docs-build green. diff --git a/docs/troubleshooting.md b/docs/troubleshooting.md index 626c54c..39ce9ca 100644 --- a/docs/troubleshooting.md +++ b/docs/troubleshooting.md @@ -103,7 +103,7 @@ If you installed from the AppImage and see this error, the package may be incomp **"MeCab not found on system"** -This is informational, not an error. SubMiner uses Yomitan's internal parser as the primary tokenizer and falls back to MeCab when needed. If MeCab is not installed, Yomitan handles all tokenization. +This is informational, not an error. SubMiner tokenization is driven by Yomitan's internal parser. MeCab availability checks may still run for auxiliary token metadata, but MeCab is not used as a tokenization fallback path. To install MeCab: @@ -113,10 +113,10 @@ To install MeCab: **Words are not segmented correctly** -Japanese word boundaries depend on the tokenizer. If segmentation seems wrong: +Japanese word boundaries depend on Yomitan parser output. If segmentation seems wrong: -- Install MeCab for improved accuracy as a fallback. -- Note that CJK characters without spaces are segmented using `Intl.Segmenter` or character-level fallback, which is not always perfect. +- Verify Yomitan dictionaries are installed and active. +- Note that CJK characters without spaces are segmented using parser heuristics, which is not always perfect. ## Media Generation diff --git a/docs/usage.md b/docs/usage.md index 1d80c11..38de2e1 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -209,7 +209,7 @@ These keybindings only work when the overlay window has focus. See [Configuratio 1. MPV runs with an IPC socket at `/tmp/subminer-socket` 2. The overlay connects and subscribes to subtitle changes -3. Subtitles are tokenized with Yomitan's internal parser, with MeCab fallback when needed +3. Subtitles are tokenized with Yomitan's internal parser 4. Words are displayed as clickable spans 5. Clicking a word triggers Yomitan popup for dictionary lookup 6. Texthooker server runs at `http://127.0.0.1:5174` for external tools diff --git a/src/core/services/tokenizer.test.ts b/src/core/services/tokenizer.test.ts index 016db68..eccdc3f 100644 --- a/src/core/services/tokenizer.test.ts +++ b/src/core/services/tokenizer.test.ts @@ -1,12 +1,7 @@ import test from 'node:test'; import assert from 'node:assert/strict'; import { PartOfSpeech } from '../../types'; -import { - createTokenizerDepsRuntime, - TokenizerServiceDeps, - TokenizerDepsRuntimeOptions, - tokenizeSubtitle, -} from './tokenizer'; +import { createTokenizerDepsRuntime, TokenizerServiceDeps, tokenizeSubtitle } from './tokenizer'; function makeDeps(overrides: Partial = {}): TokenizerServiceDeps { return { @@ -25,24 +20,37 @@ function makeDeps(overrides: Partial = {}): TokenizerServi }; } -function makeDepsFromMecabTokenizer( - tokenize: (text: string) => Promise, - overrides: Partial = {}, +interface YomitanTokenInput { + surface: string; + reading?: string; + headword?: string; +} + +function makeDepsFromYomitanTokens( + tokens: YomitanTokenInput[], + overrides: Partial = {}, ): TokenizerServiceDeps { - return createTokenizerDepsRuntime({ - getYomitanExt: () => null, - getYomitanParserWindow: () => null, - setYomitanParserWindow: () => {}, - getYomitanParserReadyPromise: () => null, - setYomitanParserReadyPromise: () => {}, - getYomitanParserInitPromise: () => null, - setYomitanParserInitPromise: () => {}, - isKnownWord: () => false, - getKnownWordMatchMode: () => 'headword', - getMecabTokenizer: () => ({ - tokenize, - }), - getJlptLevel: () => null, + return makeDeps({ + getYomitanExt: () => ({ id: 'dummy-ext' }) as any, + getYomitanParserWindow: () => + ({ + isDestroyed: () => false, + webContents: { + executeJavaScript: async () => [ + { + source: 'scanning-parser', + index: 0, + content: tokens.map((token) => [ + { + text: token.surface, + reading: token.reading ?? token.surface, + headwords: [[{ term: token.headword ?? token.surface }]], + }, + ]), + }, + ], + }, + }) as unknown as Electron.BrowserWindow, ...overrides, }); } @@ -91,34 +99,10 @@ test('tokenizeSubtitle caches JLPT lookups across repeated tokens', async () => let lookupCalls = 0; const result = await tokenizeSubtitle( '猫猫', - makeDepsFromMecabTokenizer( - async () => [ - { - word: '猫', - partOfSpeech: PartOfSpeech.noun, - pos1: '', - pos2: '', - pos3: '', - pos4: '', - inflectionType: '', - inflectionForm: '', - headword: '猫', - katakanaReading: 'ネコ', - pronunciation: 'ネコ', - }, - { - word: '猫', - partOfSpeech: PartOfSpeech.noun, - pos1: '', - pos2: '', - pos3: '', - pos4: '', - inflectionType: '', - inflectionForm: '', - headword: '猫', - katakanaReading: 'ネコ', - pronunciation: 'ネコ', - }, + makeDepsFromYomitanTokens( + [ + { surface: '猫', reading: 'ねこ', headword: '猫' }, + { surface: '猫', reading: 'ねこ', headword: '猫' }, ], { getJlptLevel: (text) => { @@ -138,26 +122,9 @@ test('tokenizeSubtitle caches JLPT lookups across repeated tokens', async () => test('tokenizeSubtitle leaves JLPT unset for non-matching tokens', async () => { const result = await tokenizeSubtitle( '猫', - makeDepsFromMecabTokenizer( - async () => [ - { - word: '猫', - partOfSpeech: PartOfSpeech.noun, - pos1: '', - pos2: '', - pos3: '', - pos4: '', - inflectionType: '', - inflectionForm: '', - headword: '猫', - katakanaReading: 'ネコ', - pronunciation: 'ネコ', - }, - ], - { - getJlptLevel: () => null, - }, - ), + makeDepsFromYomitanTokens([{ surface: '猫', reading: 'ねこ', headword: '猫' }], { + getJlptLevel: () => null, + }), ); assert.equal(result.tokens?.length, 1); @@ -168,20 +135,7 @@ test('tokenizeSubtitle skips JLPT lookups when disabled', async () => { let lookupCalls = 0; const result = await tokenizeSubtitle( '猫です', - makeDeps({ - tokenizeWithMecab: async () => [ - { - headword: '猫', - surface: '猫', - reading: 'ネコ', - startPos: 0, - endPos: 1, - partOfSpeech: PartOfSpeech.noun, - isMerged: false, - isKnown: false, - isNPlusOneTarget: false, - }, - ], + makeDepsFromYomitanTokens([{ surface: '猫', reading: 'ねこ', headword: '猫' }], { getJlptLevel: () => { lookupCalls += 1; return 'N5'; @@ -198,34 +152,16 @@ test('tokenizeSubtitle skips JLPT lookups when disabled', async () => { test('tokenizeSubtitle applies frequency dictionary ranks', async () => { const result = await tokenizeSubtitle( '猫です', - makeDeps({ - getFrequencyDictionaryEnabled: () => true, - tokenizeWithMecab: async () => [ - { - headword: '猫', - surface: '猫', - reading: 'ネコ', - startPos: 0, - endPos: 1, - partOfSpeech: PartOfSpeech.noun, - isMerged: false, - isKnown: false, - isNPlusOneTarget: false, - }, - { - headword: 'です', - surface: 'です', - reading: 'デス', - startPos: 1, - endPos: 2, - partOfSpeech: PartOfSpeech.other, - isMerged: false, - isKnown: false, - isNPlusOneTarget: false, - }, + makeDepsFromYomitanTokens( + [ + { surface: '猫', reading: 'ねこ', headword: '猫' }, + { surface: 'です', reading: 'です', headword: 'です' }, ], - getFrequencyRank: (text) => (text === '猫' ? 23 : 1200), - }), + { + getFrequencyDictionaryEnabled: () => true, + getFrequencyRank: (text) => (text === '猫' ? 23 : 1200), + }, + ), ); assert.equal(result.tokens?.length, 2); @@ -588,38 +524,20 @@ test('tokenizeSubtitle skips frequency rank when Yomitan token is enriched as pa test('tokenizeSubtitle ignores invalid frequency ranks', async () => { const result = await tokenizeSubtitle( '猫', - makeDeps({ - getFrequencyDictionaryEnabled: () => true, - tokenizeWithMecab: async () => [ - { - headword: '猫', - surface: '猫', - reading: 'ネコ', - startPos: 0, - endPos: 1, - partOfSpeech: PartOfSpeech.noun, - isMerged: false, - isKnown: false, - isNPlusOneTarget: false, - }, - { - headword: 'です', - surface: 'です', - reading: 'デス', - startPos: 1, - endPos: 2, - partOfSpeech: PartOfSpeech.bound_auxiliary, - isMerged: false, - isKnown: false, - isNPlusOneTarget: false, - }, + makeDepsFromYomitanTokens( + [ + { surface: '猫', reading: 'ねこ', headword: '猫' }, + { surface: 'です', reading: 'です', headword: 'です' }, ], - getFrequencyRank: (text) => { - if (text === '猫') return Number.NaN; - if (text === 'です') return -1; - return 100; + { + getFrequencyDictionaryEnabled: () => true, + getFrequencyRank: (text) => { + if (text === '猫') return Number.NaN; + if (text === 'です') return -1; + return 100; + }, }, - }), + ), ); assert.equal(result.tokens?.length, 2); @@ -631,21 +549,8 @@ test('tokenizeSubtitle skips frequency lookups when disabled', async () => { let frequencyCalls = 0; const result = await tokenizeSubtitle( '猫', - makeDeps({ + makeDepsFromYomitanTokens([{ surface: '猫', reading: 'ねこ', headword: '猫' }], { getFrequencyDictionaryEnabled: () => false, - tokenizeWithMecab: async () => [ - { - headword: '猫', - surface: '猫', - reading: 'ネコ', - startPos: 0, - endPos: 1, - partOfSpeech: PartOfSpeech.noun, - isMerged: false, - isKnown: false, - isNPlusOneTarget: false, - }, - ], getFrequencyRank: () => { frequencyCalls += 1; return 10; @@ -728,63 +633,28 @@ test('tokenizeSubtitle skips JLPT level for repeated kana SFX', async () => { assert.equal(result.tokens?.[0]?.jlptLevel, undefined); }); -test('tokenizeSubtitle assigns JLPT level to mecab tokens', async () => { +test('tokenizeSubtitle assigns JLPT level to Yomitan tokens', async () => { const result = await tokenizeSubtitle( '猫です', - makeDepsFromMecabTokenizer( - async () => [ - { - word: '猫', - partOfSpeech: PartOfSpeech.noun, - pos1: '', - pos2: '', - pos3: '', - pos4: '', - inflectionType: '', - inflectionForm: '', - headword: '猫', - katakanaReading: 'ネコ', - pronunciation: 'ネコ', - }, - ], - { - getJlptLevel: (text) => (text === '猫' ? 'N4' : null), - }, - ), + makeDepsFromYomitanTokens([{ surface: '猫', reading: 'ねこ', headword: '猫' }], { + getJlptLevel: (text) => (text === '猫' ? 'N4' : null), + }), ); assert.equal(result.tokens?.length, 1); assert.equal(result.tokens?.[0]?.jlptLevel, 'N4'); }); -test('tokenizeSubtitle skips JLPT level for mecab tokens marked as ineligible', async () => { +test('tokenizeSubtitle can assign JLPT level to Yomitan particle token', async () => { const result = await tokenizeSubtitle( 'は', - makeDepsFromMecabTokenizer( - async () => [ - { - word: 'は', - partOfSpeech: PartOfSpeech.particle, - pos1: '助詞', - pos2: '', - pos3: '', - pos4: '', - inflectionType: '', - inflectionForm: '', - headword: 'は', - katakanaReading: 'ハ', - pronunciation: 'ハ', - }, - ], - { - getJlptLevel: (text) => (text === 'は' ? 'N5' : null), - }, - ), + makeDepsFromYomitanTokens([{ surface: 'は', reading: 'は', headword: 'は' }], { + getJlptLevel: (text) => (text === 'は' ? 'N5' : null), + }), ); assert.equal(result.tokens?.length, 1); - assert.equal(result.tokens?.[0]?.pos1, '助詞'); - assert.equal(result.tokens?.[0]?.jlptLevel, undefined); + assert.equal(result.tokens?.[0]?.jlptLevel, 'N5'); }); test('tokenizeSubtitle returns null tokens for empty normalized text', async () => { @@ -792,58 +662,34 @@ test('tokenizeSubtitle returns null tokens for empty normalized text', async () assert.deepEqual(result, { text: ' \\n ', tokens: null }); }); -test('tokenizeSubtitle normalizes newlines before mecab fallback', async () => { - let tokenizeInput = ''; +test('tokenizeSubtitle normalizes newlines before Yomitan parse request', async () => { + let parseInput = ''; const result = await tokenizeSubtitle( '猫\\Nです\nね', makeDeps({ - tokenizeWithMecab: async (text) => { - tokenizeInput = text; - return [ - { - surface: '猫ですね', - reading: 'ネコデスネ', - headword: '猫ですね', - startPos: 0, - endPos: 4, - partOfSpeech: PartOfSpeech.other, - isMerged: true, - isKnown: false, - isNPlusOneTarget: false, + getYomitanExt: () => ({ id: 'dummy-ext' }) as any, + getYomitanParserWindow: () => + ({ + isDestroyed: () => false, + webContents: { + executeJavaScript: async (script: string) => { + parseInput = script; + return null; + }, }, - ]; - }, + }) as unknown as Electron.BrowserWindow, }), ); - assert.equal(tokenizeInput, '猫 です ね'); + assert.match(parseInput, /猫 です ね/); assert.equal(result.text, '猫\nです\nね'); - assert.equal(result.tokens?.[0]?.surface, '猫ですね'); + assert.equal(result.tokens, null); }); -test('tokenizeSubtitle falls back to mecab tokens when available', async () => { - const result = await tokenizeSubtitle( - '猫です', - makeDeps({ - tokenizeWithMecab: async () => [ - { - surface: '猫', - reading: 'ネコ', - headword: '猫', - startPos: 0, - endPos: 1, - partOfSpeech: PartOfSpeech.noun, - isMerged: false, - isKnown: false, - isNPlusOneTarget: false, - }, - ], - }), - ); +test('tokenizeSubtitle returns null tokens when Yomitan parsing is unavailable', async () => { + const result = await tokenizeSubtitle('猫です', makeDeps()); - assert.equal(result.text, '猫です'); - assert.equal(result.tokens?.length, 1); - assert.equal(result.tokens?.[0]?.surface, '猫'); + assert.deepEqual(result, { text: '猫です', tokens: null }); }); test('tokenizeSubtitle returns null tokens when mecab throws', async () => { @@ -1055,7 +901,7 @@ test('tokenizeSubtitle preserves segmented Yomitan line as one token', async () assert.equal(result.tokens?.[0]?.isKnown, false); }); -test('tokenizeSubtitle prefers mecab parser tokens when scanning parser returns one token', async () => { +test('tokenizeSubtitle keeps scanning parser token when scanning parser returns one token', async () => { const result = await tokenizeSubtitle( '俺は小園にいきたい', makeDeps({ @@ -1128,10 +974,9 @@ test('tokenizeSubtitle prefers mecab parser tokens when scanning parser returns }), ); - assert.equal(result.tokens?.length, 5); - assert.equal(result.tokens?.map((token) => token.surface).join(','), '俺,は,小園,に,いきたい'); - assert.equal(result.tokens?.[2]?.surface, '小園'); - assert.equal(result.tokens?.[2]?.frequencyRank, 25); + assert.equal(result.tokens?.length, 1); + assert.equal(result.tokens?.map((token) => token.surface).join(','), '俺は小園にいきたい'); + assert.equal(result.tokens?.[0]?.frequencyRank, undefined); }); test('tokenizeSubtitle keeps scanning parser tokens when they are already split', async () => { @@ -1228,7 +1073,7 @@ test('tokenizeSubtitle keeps scanning parser tokens when they are already split' assert.equal(result.tokens?.[2]?.frequencyRank, undefined); }); -test('tokenizeSubtitle prefers parse candidates with fewer fragment-only kana tokens when source priority is equal', async () => { +test('tokenizeSubtitle keeps parsing explicit by scanning-parser source only', async () => { const result = await tokenizeSubtitle( '俺は公園にいきたい', makeDeps({ @@ -1239,7 +1084,7 @@ test('tokenizeSubtitle prefers parse candidates with fewer fragment-only kana to webContents: { executeJavaScript: async () => [ { - source: 'mecab-fragmented', + source: 'scanning-parser', index: 0, content: [ [ @@ -1274,7 +1119,7 @@ test('tokenizeSubtitle prefers parse candidates with fewer fragment-only kana to ], }, { - source: 'mecab', + source: 'scanning-parser', index: 0, content: [ [ @@ -1379,26 +1224,9 @@ test('tokenizeSubtitle still assigns frequency to non-known Yomitan tokens', asy test('tokenizeSubtitle marks tokens as known using callback', async () => { const result = await tokenizeSubtitle( '猫です', - makeDepsFromMecabTokenizer( - async () => [ - { - word: '猫', - partOfSpeech: PartOfSpeech.noun, - pos1: '', - pos2: '', - pos3: '', - pos4: '', - inflectionType: '', - inflectionForm: '', - headword: '猫', - katakanaReading: 'ネコ', - pronunciation: 'ネコ', - }, - ], - { - isKnownWord: (text) => text === '猫', - }, - ), + makeDepsFromYomitanTokens([{ surface: '猫', reading: 'ねこ', headword: '猫' }], { + isKnownWord: (text) => text === '猫', + }), ); assert.equal(result.text, '猫です'); @@ -1408,51 +1236,17 @@ test('tokenizeSubtitle marks tokens as known using callback', async () => { test('tokenizeSubtitle still assigns frequency rank to non-known tokens', async () => { const result = await tokenizeSubtitle( '既知未知', - makeDeps({ - tokenizeWithMecab: async () => [ - { - surface: '既知', - reading: 'キチ', - partOfSpeech: PartOfSpeech.noun, - pos1: '', - pos2: '', - pos3: '', - pos4: '', - inflectionType: '', - inflectionForm: '', - headword: '既知', - katakanaReading: 'キチ', - pronunciation: 'キチ', - startPos: 0, - endPos: 2, - isMerged: false, - isKnown: false, - isNPlusOneTarget: false, - }, - { - surface: '未知', - reading: 'ミチ', - partOfSpeech: PartOfSpeech.noun, - pos1: '', - pos2: '', - pos3: '', - pos4: '', - inflectionType: '', - inflectionForm: '', - headword: '未知', - katakanaReading: 'ミチ', - pronunciation: 'ミチ', - startPos: 2, - endPos: 4, - isMerged: false, - isKnown: false, - isNPlusOneTarget: false, - }, + makeDepsFromYomitanTokens( + [ + { surface: '既知', reading: 'きち', headword: '既知' }, + { surface: '未知', reading: 'みち', headword: '未知' }, ], - getFrequencyDictionaryEnabled: () => true, - getFrequencyRank: (text) => (text === '既知' ? 20 : text === '未知' ? 30 : null), - isKnownWord: (text) => text === '既知', - }), + { + getFrequencyDictionaryEnabled: () => true, + getFrequencyRank: (text) => (text === '既知' ? 20 : text === '未知' ? 30 : null), + isKnownWord: (text) => text === '既知', + }, + ), ); assert.equal(result.tokens?.length, 2); @@ -1465,33 +1259,16 @@ test('tokenizeSubtitle still assigns frequency rank to non-known tokens', async test('tokenizeSubtitle selects one N+1 target token', async () => { const result = await tokenizeSubtitle( '猫です', - makeDeps({ - tokenizeWithMecab: async () => [ - { - surface: '私', - reading: 'ワタシ', - headword: '私', - startPos: 0, - endPos: 1, - partOfSpeech: PartOfSpeech.noun, - isMerged: false, - isKnown: true, - isNPlusOneTarget: false, - }, - { - surface: '犬', - reading: 'イヌ', - headword: '犬', - startPos: 1, - endPos: 2, - partOfSpeech: PartOfSpeech.noun, - isMerged: false, - isKnown: false, - isNPlusOneTarget: false, - }, + makeDepsFromYomitanTokens( + [ + { surface: '私', reading: 'わたし', headword: '私' }, + { surface: '犬', reading: 'いぬ', headword: '犬' }, ], - getMinSentenceWordsForNPlusOne: () => 2, - }), + { + getMinSentenceWordsForNPlusOne: () => 2, + isKnownWord: (text) => text === '私', + }, + ), ); const targets = result.tokens?.filter((token) => token.isNPlusOneTarget) ?? []; @@ -1502,32 +1279,13 @@ test('tokenizeSubtitle selects one N+1 target token', async () => { test('tokenizeSubtitle does not mark target when sentence has multiple candidates', async () => { const result = await tokenizeSubtitle( '猫犬', - makeDeps({ - tokenizeWithMecab: async () => [ - { - surface: '猫', - reading: 'ネコ', - headword: '猫', - startPos: 0, - endPos: 1, - partOfSpeech: PartOfSpeech.noun, - isMerged: false, - isKnown: false, - isNPlusOneTarget: false, - }, - { - surface: '犬', - reading: 'イヌ', - headword: '犬', - startPos: 1, - endPos: 2, - partOfSpeech: PartOfSpeech.noun, - isMerged: false, - isKnown: false, - isNPlusOneTarget: false, - }, + makeDepsFromYomitanTokens( + [ + { surface: '猫', reading: 'ねこ', headword: '猫' }, + { surface: '犬', reading: 'いぬ', headword: '犬' }, ], - }), + {}, + ), ); assert.equal( @@ -1846,32 +1604,13 @@ test('tokenizeSubtitle keeps correct MeCab pos1 enrichment when Yomitan offsets test('tokenizeSubtitle does not color 1-2 word sentences by default', async () => { const result = await tokenizeSubtitle( '猫です', - makeDeps({ - tokenizeWithMecab: async () => [ - { - surface: '私', - reading: 'ワタシ', - headword: '私', - startPos: 0, - endPos: 1, - partOfSpeech: PartOfSpeech.noun, - isMerged: false, - isKnown: true, - isNPlusOneTarget: false, - }, - { - surface: '犬', - reading: 'イヌ', - headword: '犬', - startPos: 1, - endPos: 2, - partOfSpeech: PartOfSpeech.noun, - isMerged: false, - isKnown: false, - isNPlusOneTarget: false, - }, + makeDepsFromYomitanTokens( + [ + { surface: '私', reading: 'わたし', headword: '私' }, + { surface: '犬', reading: 'いぬ', headword: '犬' }, ], - }), + {}, + ), ); assert.equal( @@ -1883,26 +1622,9 @@ test('tokenizeSubtitle does not color 1-2 word sentences by default', async () = test('tokenizeSubtitle checks known words by headword, not surface', async () => { const result = await tokenizeSubtitle( '猫です', - makeDepsFromMecabTokenizer( - async () => [ - { - word: '猫', - partOfSpeech: PartOfSpeech.noun, - pos1: '', - pos2: '', - pos3: '', - pos4: '', - inflectionType: '', - inflectionForm: '', - headword: '猫です', - katakanaReading: 'ネコ', - pronunciation: 'ネコ', - }, - ], - { - isKnownWord: (text) => text === '猫です', - }, - ), + makeDepsFromYomitanTokens([{ surface: '猫', reading: 'ねこ', headword: '猫です' }], { + isKnownWord: (text) => text === '猫です', + }), ); assert.equal(result.text, '猫です'); @@ -1912,34 +1634,17 @@ test('tokenizeSubtitle checks known words by headword, not surface', async () => test('tokenizeSubtitle checks known words by surface when configured', async () => { const result = await tokenizeSubtitle( '猫です', - makeDepsFromMecabTokenizer( - async () => [ - { - word: '猫', - partOfSpeech: PartOfSpeech.noun, - pos1: '', - pos2: '', - pos3: '', - pos4: '', - inflectionType: '', - inflectionForm: '', - headword: '猫です', - katakanaReading: 'ネコ', - pronunciation: 'ネコ', - }, - ], - { - getKnownWordMatchMode: () => 'surface', - isKnownWord: (text) => text === '猫', - }, - ), + makeDepsFromYomitanTokens([{ surface: '猫', reading: 'ねこ', headword: '猫です' }], { + getKnownWordMatchMode: () => 'surface', + isKnownWord: (text) => text === '猫', + }), ); assert.equal(result.text, '猫です'); assert.equal(result.tokens?.[0]?.isKnown, true); }); -test('createTokenizerDepsRuntime checks MeCab availability before first tokenize call', async () => { +test('createTokenizerDepsRuntime checks MeCab availability before first tokenizeWithMecab call', async () => { let available = false; let checkCalls = 0; @@ -1984,10 +1689,10 @@ test('createTokenizerDepsRuntime checks MeCab availability before first tokenize }), }); - const first = await tokenizeSubtitle('仮面', deps); - const second = await tokenizeSubtitle('仮面', deps); + const first = await deps.tokenizeWithMecab('仮面'); + const second = await deps.tokenizeWithMecab('仮面'); assert.equal(checkCalls, 1); - assert.equal(first.tokens?.[0]?.surface, '仮面'); - assert.equal(second.tokens?.[0]?.surface, '仮面'); + assert.equal(first?.[0]?.surface, '仮面'); + assert.equal(second?.[0]?.surface, '仮面'); }); diff --git a/src/core/services/tokenizer.ts b/src/core/services/tokenizer.ts index 5279cef..0634cf0 100644 --- a/src/core/services/tokenizer.ts +++ b/src/core/services/tokenizer.ts @@ -216,17 +216,5 @@ export async function tokenizeSubtitle( }; } - try { - const mecabTokens = await deps.tokenizeWithMecab(tokenizeText); - if (mecabTokens && mecabTokens.length > 0) { - return { - text: displayText, - tokens: applyAnnotationStage(mecabTokens, deps), - }; - } - } catch (err) { - logger.error('Tokenization error:', (err as Error).message); - } - return { text: displayText, tokens: null }; } diff --git a/src/core/services/tokenizer/parser-selection-stage.test.ts b/src/core/services/tokenizer/parser-selection-stage.test.ts index 143a1f3..9856d10 100644 --- a/src/core/services/tokenizer/parser-selection-stage.test.ts +++ b/src/core/services/tokenizer/parser-selection-stage.test.ts @@ -48,7 +48,7 @@ test('prefers scanning parser when scanning candidate has more than one token', assert.equal(tokens?.map((token) => token.surface).join(','), '小園,に'); }); -test('prefers mecab candidate when scanning candidate is single token and mecab has better split', () => { +test('keeps scanning parser candidate when scanning candidate is single token', () => { const parseResults = [ makeParseItem('scanning-parser', [ [{ text: '俺は公園にいきたい', reading: 'おれはこうえんにいきたい' }], @@ -63,17 +63,17 @@ test('prefers mecab candidate when scanning candidate is single token and mecab ]; const tokens = selectYomitanParseTokens(parseResults, () => false, 'headword'); - assert.equal(tokens?.map((token) => token.surface).join(','), '俺,は,公園,に,いきたい'); + assert.equal(tokens?.map((token) => token.surface).join(','), '俺は公園にいきたい'); }); test('tie-break prefers fewer suspicious kana fragments', () => { const parseResults = [ - makeParseItem('mecab-fragmented', [ + makeParseItem('scanning-parser', [ [{ text: '俺', reading: 'おれ', headword: '俺' }], [{ text: 'にい', reading: '', headword: '兄' }], [{ text: 'きたい', reading: '', headword: '期待' }], ]), - makeParseItem('mecab', [ + makeParseItem('scanning-parser', [ [{ text: '俺', reading: 'おれ', headword: '俺' }], [{ text: 'に', reading: 'に', headword: 'に' }], [{ text: '行きたい', reading: 'いきたい', headword: '行きたい' }], @@ -83,3 +83,16 @@ test('tie-break prefers fewer suspicious kana fragments', () => { const tokens = selectYomitanParseTokens(parseResults, () => false, 'headword'); assert.equal(tokens?.map((token) => token.surface).join(','), '俺,に,行きたい'); }); + +test('returns null when only mecab-source candidates are present', () => { + const parseResults = [ + makeParseItem('mecab', [ + [{ text: '俺', reading: 'おれ', headword: '俺' }], + [{ text: 'は', reading: 'は', headword: 'は' }], + [{ text: '公園', reading: 'こうえん', headword: '公園' }], + ]), + ]; + + const tokens = selectYomitanParseTokens(parseResults, () => false, 'headword'); + assert.equal(tokens, null); +}); diff --git a/src/core/services/tokenizer/parser-selection-stage.ts b/src/core/services/tokenizer/parser-selection-stage.ts index 0e42107..43fe9ac 100644 --- a/src/core/services/tokenizer/parser-selection-stage.ts +++ b/src/core/services/tokenizer/parser-selection-stage.ts @@ -199,14 +199,9 @@ export function selectBestYomitanParseCandidate( const scanningCandidates = candidates.filter( (candidate) => candidate.source === 'scanning-parser', ); - const mecabCandidates = candidates.filter((candidate) => candidate.source === 'mecab'); - - const getBestByTokenCount = (items: YomitanParseCandidate[]): YomitanParseCandidate | null => - items.length === 0 - ? null - : items.reduce((best, current) => - current.tokens.length > best.tokens.length ? current : best, - ); + if (scanningCandidates.length === 0) { + return null; + } const getCandidateScore = (candidate: YomitanParseCandidate): number => { const readableTokenCount = candidate.tokens.filter( @@ -242,22 +237,10 @@ export function selectBestYomitanParseCandidate( }); }; - if (scanningCandidates.length > 0) { - const bestScanning = getBestByTokenCount(scanningCandidates); - if (bestScanning && bestScanning.tokens.length > 1) { - return bestScanning.tokens; - } - - const bestMecab = chooseBestCandidate(mecabCandidates); - if (bestMecab && bestMecab.tokens.length > (bestScanning?.tokens.length ?? 0)) { - return bestMecab.tokens; - } - - return bestScanning ? bestScanning.tokens : null; - } - - const multiTokenCandidates = candidates.filter((candidate) => candidate.tokens.length > 1); - const pool = multiTokenCandidates.length > 0 ? multiTokenCandidates : candidates; + const multiTokenCandidates = scanningCandidates.filter( + (candidate) => candidate.tokens.length > 1, + ); + const pool = multiTokenCandidates.length > 0 ? multiTokenCandidates : scanningCandidates; const bestCandidate = chooseBestCandidate(pool); return bestCandidate ? bestCandidate.tokens : null; }