diff --git a/backlog/tasks/task-77 - Split-tokenizer-pipeline-into-parser-selection-enrichment-and-annotation-stages.md b/backlog/tasks/task-77 - Split-tokenizer-pipeline-into-parser-selection-enrichment-and-annotation-stages.md index 42f04d1..27e536b 100644 --- a/backlog/tasks/task-77 - Split-tokenizer-pipeline-into-parser-selection-enrichment-and-annotation-stages.md +++ b/backlog/tasks/task-77 - Split-tokenizer-pipeline-into-parser-selection-enrichment-and-annotation-stages.md @@ -1,10 +1,13 @@ --- id: TASK-77 -title: Split tokenizer pipeline into parser selection enrichment and annotation stages -status: To Do -assignee: [] +title: >- + Split tokenizer pipeline into parser selection enrichment and annotation + stages +status: Done +assignee: + - '@opencode' created_date: '2026-02-18 11:43' -updated_date: '2026-02-18 11:43' +updated_date: '2026-02-21 23:47' labels: - tokenizer - subtitles @@ -40,15 +43,43 @@ priority: high ## Acceptance Criteria -- [ ] #1 Tokenizer code is split into explicit stages with narrow interfaces -- [ ] #2 Candidate selection logic is pure + directly testable -- [ ] #3 Parser lifecycle concerns are separated from annotation passes -- [ ] #4 Existing tokenization behavior preserved in regression tests +- [x] #1 Tokenizer code is split into explicit stages with narrow interfaces +- [x] #2 Candidate selection logic is pure + directly testable +- [x] #3 Parser lifecycle concerns are separated from annotation passes +- [x] #4 Existing tokenization behavior preserved in regression tests +## Implementation Plan + + +1. Extract pure parser-selection stage from `src/core/services/tokenizer.ts` into `src/core/services/tokenizer/parser-selection-stage.ts` (parse-result mapping + candidate scoring/selection) and add direct stage tests for source preference/tie-break scoring. +2. Extract MeCab POS1 enrichment stage into `src/core/services/tokenizer/parser-enrichment-stage.ts` with direct tests for overlap and surface-sequence fallback behavior. +3. 
Extract annotation stage into `src/core/services/tokenizer/annotation-stage.ts` to handle known-word/frequency/JLPT/N+1 passes behind a narrow API, with new stage-level tests. +4. Separate parser window/runtime lifecycle into `src/core/services/tokenizer/yomitan-parser-runtime.ts`, keep `tokenizer.ts` as thin orchestrator, run tokenizer + core src/dist gates, then finalize TASK-77 AC/DoD evidence in Backlog MCP. + + +## Implementation Notes + + +2026-02-21: started execution pass in current session; loaded Backlog context and tokenizer module/tests before drafting implementation plan via writing-plans skill. + +Implemented tokenizer pipeline split with new stage modules: `src/core/services/tokenizer/parser-selection-stage.ts`, `src/core/services/tokenizer/parser-enrichment-stage.ts`, `src/core/services/tokenizer/annotation-stage.ts`, and parser lifecycle runtime in `src/core/services/tokenizer/yomitan-parser-runtime.ts`; reduced `src/core/services/tokenizer.ts` to orchestration facade over stages. + +Added direct stage-level tests: `src/core/services/tokenizer/parser-selection-stage.test.ts`, `src/core/services/tokenizer/parser-enrichment-stage.test.ts`, and `src/core/services/tokenizer/annotation-stage.test.ts`, and wired them into `test:core:src` + `test:core:dist` scripts in `package.json`. + +Verification: `bun test src/core/services/tokenizer.test.ts src/core/services/tokenizer/annotation-stage.test.ts src/core/services/tokenizer/parser-selection-stage.test.ts src/core/services/tokenizer/parser-enrichment-stage.test.ts` PASS (53/53); `bun run test:core:src` PASS (219 pass, 6 skip); `bun run build` PASS; `bun run test:core:dist` PASS (214 pass, 10 skip). 
+ + +## Final Summary + + +Split tokenizer internals into explicit stages while preserving external behavior: parser candidate mapping/selection moved to `parser-selection-stage`, MeCab POS1 enrichment moved to `parser-enrichment-stage`, post-token annotation (known-word, frequency, JLPT, N+1) moved to `annotation-stage`, and Yomitan parser window lifecycle isolated in `yomitan-parser-runtime`. `src/core/services/tokenizer.ts` now acts as a thin orchestrator that normalizes subtitle text, requests parser output, runs stage pipeline, and handles MeCab fallback. + +Added direct stage-level tests for scoring/selection and annotation semantics (`parser-selection-stage.test.ts`, `parser-enrichment-stage.test.ts`, `annotation-stage.test.ts`) and included them in both source and dist core test lanes via `package.json`. Validation passed across targeted tokenizer tests plus full core gates (`test:core:src`, `build`, `test:core:dist`) with no tokenizer regression. + + ## Definition of Done -- [ ] #1 Tokenizer-related test suites pass -- [ ] #2 New stage-level tests exist for scoring and annotation +- [x] #1 Tokenizer-related test suites pass +- [x] #2 New stage-level tests exist for scoring and annotation - diff --git a/package.json b/package.json index 57e1e6e..9e283cc 100644 --- a/package.json +++ b/package.json @@ -24,8 +24,8 @@ "test:config:dist": "node --test dist/config/config.test.js dist/config/path-resolution.test.js dist/config/resolve/anki-connect.test.js dist/config/resolve/subtitle-style.test.js dist/config/resolve/jellyfin.test.js", "test:config:smoke:dist": "node --test dist/config/path-resolution.test.js", "test:launcher:src": "bun test launcher/config.test.ts launcher/parse-args.test.ts launcher/main.test.ts", - "test:core:src": "bun test src/cli/args.test.ts src/cli/help.test.ts src/core/services/cli-command.test.ts src/core/services/field-grouping-overlay.test.ts src/core/services/numeric-shortcut-session.test.ts 
src/core/services/secondary-subtitle.test.ts src/core/services/mpv-render-metrics.test.ts src/core/services/overlay-content-measurement.test.ts src/core/services/mpv-control.test.ts src/core/services/mpv.test.ts src/core/services/runtime-options-ipc.test.ts src/core/services/runtime-config.test.ts src/core/services/config-hot-reload.test.ts src/core/services/tokenizer.test.ts src/core/services/subsync.test.ts src/core/services/overlay-bridge.test.ts src/core/services/overlay-shortcut-handler.test.ts src/core/services/mining.test.ts src/core/services/anki-jimaku.test.ts src/core/services/jellyfin.test.ts src/core/services/jellyfin-remote.test.ts src/core/services/immersion-tracker-service.test.ts src/core/services/app-ready.test.ts src/core/services/startup-bootstrap.test.ts src/core/services/subtitle-processing-controller.test.ts src/core/services/anilist/anilist-update-queue.test.ts src/renderer/error-recovery.test.ts src/subsync/utils.test.ts src/main/anilist-url-guard.test.ts src/window-trackers/x11-tracker.test.ts launcher/config.test.ts launcher/parse-args.test.ts launcher/main.test.ts", - "test:core:dist": "node --test dist/cli/args.test.js dist/cli/help.test.js dist/core/services/cli-command.test.js dist/core/services/ipc.test.js dist/core/services/field-grouping-overlay.test.js dist/core/services/numeric-shortcut-session.test.js dist/core/services/secondary-subtitle.test.js dist/core/services/mpv-render-metrics.test.js dist/core/services/overlay-content-measurement.test.js dist/core/services/mpv-control.test.js dist/core/services/mpv.test.js dist/core/services/runtime-options-ipc.test.js dist/core/services/runtime-config.test.js dist/core/services/config-hot-reload.test.js dist/core/services/tokenizer.test.js dist/core/services/subsync.test.js dist/core/services/overlay-bridge.test.js dist/core/services/overlay-manager.test.js dist/core/services/overlay-shortcut-handler.test.js dist/core/services/mining.test.js dist/core/services/anki-jimaku.test.js 
dist/core/services/jellyfin.test.js dist/core/services/jellyfin-remote.test.js dist/core/services/immersion-tracker-service.test.js dist/core/services/app-ready.test.js dist/core/services/startup-bootstrap.test.js dist/core/services/subtitle-processing-controller.test.js dist/core/services/anilist/anilist-token-store.test.js dist/core/services/anilist/anilist-update-queue.test.js dist/renderer/error-recovery.test.js dist/subsync/utils.test.js dist/main/anilist-url-guard.test.js dist/window-trackers/x11-tracker.test.js", + "test:core:src": "bun test src/cli/args.test.ts src/cli/help.test.ts src/core/services/cli-command.test.ts src/core/services/field-grouping-overlay.test.ts src/core/services/numeric-shortcut-session.test.ts src/core/services/secondary-subtitle.test.ts src/core/services/mpv-render-metrics.test.ts src/core/services/overlay-content-measurement.test.ts src/core/services/mpv-control.test.ts src/core/services/mpv.test.ts src/core/services/runtime-options-ipc.test.ts src/core/services/runtime-config.test.ts src/core/services/config-hot-reload.test.ts src/core/services/tokenizer.test.ts src/core/services/tokenizer/annotation-stage.test.ts src/core/services/tokenizer/parser-selection-stage.test.ts src/core/services/tokenizer/parser-enrichment-stage.test.ts src/core/services/subsync.test.ts src/core/services/overlay-bridge.test.ts src/core/services/overlay-shortcut-handler.test.ts src/core/services/mining.test.ts src/core/services/anki-jimaku.test.ts src/core/services/jellyfin.test.ts src/core/services/jellyfin-remote.test.ts src/core/services/immersion-tracker-service.test.ts src/core/services/app-ready.test.ts src/core/services/startup-bootstrap.test.ts src/core/services/subtitle-processing-controller.test.ts src/core/services/anilist/anilist-update-queue.test.ts src/renderer/error-recovery.test.ts src/subsync/utils.test.ts src/main/anilist-url-guard.test.ts src/window-trackers/x11-tracker.test.ts launcher/config.test.ts launcher/parse-args.test.ts 
launcher/main.test.ts", + "test:core:dist": "node --test dist/cli/args.test.js dist/cli/help.test.js dist/core/services/cli-command.test.js dist/core/services/ipc.test.js dist/core/services/field-grouping-overlay.test.js dist/core/services/numeric-shortcut-session.test.js dist/core/services/secondary-subtitle.test.js dist/core/services/mpv-render-metrics.test.js dist/core/services/overlay-content-measurement.test.js dist/core/services/mpv-control.test.js dist/core/services/mpv.test.js dist/core/services/runtime-options-ipc.test.js dist/core/services/runtime-config.test.js dist/core/services/config-hot-reload.test.js dist/core/services/tokenizer.test.js dist/core/services/tokenizer/annotation-stage.test.js dist/core/services/tokenizer/parser-selection-stage.test.js dist/core/services/tokenizer/parser-enrichment-stage.test.js dist/core/services/subsync.test.js dist/core/services/overlay-bridge.test.js dist/core/services/overlay-manager.test.js dist/core/services/overlay-shortcut-handler.test.js dist/core/services/mining.test.js dist/core/services/anki-jimaku.test.js dist/core/services/jellyfin.test.js dist/core/services/jellyfin-remote.test.js dist/core/services/immersion-tracker-service.test.js dist/core/services/app-ready.test.js dist/core/services/startup-bootstrap.test.js dist/core/services/subtitle-processing-controller.test.js dist/core/services/anilist/anilist-token-store.test.js dist/core/services/anilist/anilist-update-queue.test.js dist/renderer/error-recovery.test.js dist/subsync/utils.test.js dist/main/anilist-url-guard.test.js dist/window-trackers/x11-tracker.test.js", "test:core:smoke:dist": "node --test dist/cli/help.test.js dist/core/services/runtime-config.test.js dist/core/services/ipc.test.js dist/core/services/overlay-manager.test.js dist/core/services/anilist/anilist-token-store.test.js dist/core/services/startup-bootstrap.test.js dist/renderer/error-recovery.test.js dist/main/anilist-url-guard.test.js dist/window-trackers/x11-tracker.test.js", 
"test:smoke:dist": "bun run test:config:smoke:dist && bun run test:core:smoke:dist", "test:subtitle:dist": "echo \"Subtitle tests are currently not configured\"", diff --git a/src/core/services/tokenizer.ts b/src/core/services/tokenizer.ts index c1d0644..5279cef 100644 --- a/src/core/services/tokenizer.ts +++ b/src/core/services/tokenizer.ts @@ -1,59 +1,21 @@ import type { BrowserWindow, Extension } from 'electron'; -import { markNPlusOneTargets, mergeTokens } from '../../token-merger'; +import { mergeTokens } from '../../token-merger'; +import { createLogger } from '../../logger'; import { - JlptLevel, MergedToken, NPlusOneMatchMode, - PartOfSpeech, SubtitleData, Token, FrequencyDictionaryLookup, + JlptLevel, } from '../../types'; -import { shouldIgnoreJlptForMecabPos1, shouldIgnoreJlptByTerm } from './jlpt-token-filter'; -import { createLogger } from '../../logger'; +import { annotateTokens } from './tokenizer/annotation-stage'; +import { enrichTokensWithMecabPos1 } from './tokenizer/parser-enrichment-stage'; +import { selectYomitanParseTokens } from './tokenizer/parser-selection-stage'; +import { requestYomitanParseResults } from './tokenizer/yomitan-parser-runtime'; -interface YomitanParseHeadword { - term?: unknown; -} - -interface YomitanParseSegment { - text?: string; - reading?: string; - headwords?: unknown; -} - -interface YomitanParseResultItem { - source?: unknown; - index?: unknown; - content?: unknown; -} - -type YomitanParseLine = YomitanParseSegment[]; - -const KATAKANA_TO_HIRAGANA_OFFSET = 0x60; -const KATAKANA_CODEPOINT_START = 0x30a1; -const KATAKANA_CODEPOINT_END = 0x30f6; -const JLPT_LEVEL_LOOKUP_CACHE_LIMIT = 2048; -const FREQUENCY_RANK_LOOKUP_CACHE_LIMIT = 2048; const logger = createLogger('main:tokenizer'); -const jlptLevelLookupCaches = new WeakMap< - (text: string) => JlptLevel | null, - Map ->(); -const frequencyRankLookupCaches = new WeakMap< - FrequencyDictionaryLookup, - Map ->(); - -function isObject(value: unknown): value is Record { 
- return Boolean(value && typeof value === 'object'); -} - -function isString(value: unknown): value is string { - return typeof value === 'string'; -} - export interface TokenizerServiceDeps { getYomitanExt: () => Extension | null; getYomitanParserWindow: () => BrowserWindow | null; @@ -98,89 +60,6 @@ export interface TokenizerDepsRuntimeOptions { getMecabTokenizer: () => MecabTokenizerLike | null; } -function getCachedJlptLevel( - lookupText: string, - getJlptLevel: (text: string) => JlptLevel | null, -): JlptLevel | null { - const normalizedText = lookupText.trim(); - if (!normalizedText) { - return null; - } - - let cache = jlptLevelLookupCaches.get(getJlptLevel); - if (!cache) { - cache = new Map(); - jlptLevelLookupCaches.set(getJlptLevel, cache); - } - - if (cache.has(normalizedText)) { - return cache.get(normalizedText) ?? null; - } - - let level: JlptLevel | null; - try { - level = getJlptLevel(normalizedText); - } catch { - level = null; - } - - cache.set(normalizedText, level); - while (cache.size > JLPT_LEVEL_LOOKUP_CACHE_LIMIT) { - const firstKey = cache.keys().next().value; - if (firstKey !== undefined) { - cache.delete(firstKey); - } - } - - return level; -} - -function normalizeFrequencyLookupText(rawText: string): string { - return rawText.trim().toLowerCase(); -} - -function getCachedFrequencyRank( - lookupText: string, - getFrequencyRank: FrequencyDictionaryLookup, -): number | null { - const normalizedText = normalizeFrequencyLookupText(lookupText); - if (!normalizedText) { - return null; - } - - let cache = frequencyRankLookupCaches.get(getFrequencyRank); - if (!cache) { - cache = new Map(); - frequencyRankLookupCaches.set(getFrequencyRank, cache); - } - - if (cache.has(normalizedText)) { - return cache.get(normalizedText) ?? 
null; - } - - let rank: number | null; - try { - rank = getFrequencyRank(normalizedText); - } catch { - rank = null; - } - if (rank !== null) { - if (!Number.isFinite(rank) || rank <= 0) { - rank = null; - } - } - - cache.set(normalizedText, rank); - while (cache.size > FREQUENCY_RANK_LOOKUP_CACHE_LIMIT) { - const firstKey = cache.keys().next().value; - if (firstKey !== undefined) { - cache.delete(firstKey); - } - } - - return rank; -} - export function createTokenizerDepsRuntime( options: TokenizerDepsRuntimeOptions, ): TokenizerServiceDeps { @@ -224,463 +103,14 @@ export function createTokenizerDepsRuntime( if (!rawTokens || rawTokens.length === 0) { return null; } + return mergeTokens(rawTokens, options.isKnownWord, options.getKnownWordMatchMode()); }, }; } -function resolveKnownWordText( - surface: string, - headword: string, - matchMode: NPlusOneMatchMode, -): string { - return matchMode === 'surface' ? surface : headword; -} - -function applyKnownWordMarking( - tokens: MergedToken[], - isKnownWord: (text: string) => boolean, - knownWordMatchMode: NPlusOneMatchMode, -): MergedToken[] { - return tokens.map((token) => { - const matchText = resolveKnownWordText(token.surface, token.headword, knownWordMatchMode); - - return { - ...token, - isKnown: token.isKnown || (matchText ? isKnownWord(matchText) : false), - }; - }); -} - -function resolveFrequencyLookupText(token: MergedToken): string { - if (token.headword && token.headword.length > 0) { - return token.headword; - } - if (token.reading && token.reading.length > 0) { - return token.reading; - } - return token.surface; -} - -function getFrequencyLookupTextCandidates(token: MergedToken): string[] { - const lookupText = resolveFrequencyLookupText(token).trim(); - return lookupText ? 
[lookupText] : []; -} - -function isFrequencyExcludedByPos(token: MergedToken): boolean { - if ( - token.partOfSpeech === PartOfSpeech.particle || - token.partOfSpeech === PartOfSpeech.bound_auxiliary - ) { - return true; - } - - return token.pos1 === '助詞' || token.pos1 === '助動詞'; -} - -function applyFrequencyMarking( - tokens: MergedToken[], - getFrequencyRank: FrequencyDictionaryLookup, -): MergedToken[] { - return tokens.map((token) => { - if (isFrequencyExcludedByPos(token)) { - return { ...token, frequencyRank: undefined }; - } - - const lookupTexts = getFrequencyLookupTextCandidates(token); - if (lookupTexts.length === 0) { - return { ...token, frequencyRank: undefined }; - } - - let bestRank: number | null = null; - for (const lookupText of lookupTexts) { - const rank = getCachedFrequencyRank(lookupText, getFrequencyRank); - if (rank === null) { - continue; - } - if (bestRank === null || rank < bestRank) { - bestRank = rank; - } - } - - return { - ...token, - frequencyRank: bestRank ?? 
undefined, - }; - }); -} - -function resolveJlptLookupText(token: MergedToken): string { - if (token.headword && token.headword.length > 0) { - return token.headword; - } - if (token.reading && token.reading.length > 0) { - return token.reading; - } - return token.surface; -} - -function normalizeJlptTextForExclusion(text: string): string { - const raw = text.trim(); - if (!raw) { - return ''; - } - - let normalized = ''; - for (const char of raw) { - const code = char.codePointAt(0); - if (code === undefined) { - continue; - } - - if (code >= KATAKANA_CODEPOINT_START && code <= KATAKANA_CODEPOINT_END) { - normalized += String.fromCodePoint(code - KATAKANA_TO_HIRAGANA_OFFSET); - continue; - } - - normalized += char; - } - - return normalized; -} - -function isKanaChar(char: string): boolean { - const code = char.codePointAt(0); - if (code === undefined) { - return false; - } - - return ( - (code >= 0x3041 && code <= 0x3096) || - (code >= 0x309b && code <= 0x309f) || - (code >= 0x30a0 && code <= 0x30fa) || - (code >= 0x30fd && code <= 0x30ff) - ); -} - -/** - * Detects repeated-kana speech-like tokens (e.g. 「ああああ」, 「ははは」, 「うーん」 style patterns) - * so they are not JLPT-labeled when they are mostly expressive particles/sfx. - */ -function isRepeatedKanaSfx(text: string): boolean { - const normalized = text.trim(); - if (!normalized) { - return false; - } - - const chars = [...normalized]; - if (!chars.every(isKanaChar)) { - return false; - } - - const counts = new Map(); - let hasAdjacentRepeat = false; - - for (let i = 0; i < chars.length; i += 1) { - const char = chars[i]!; - counts.set(char, (counts.get(char) ?? 
0) + 1); - if (i > 0 && chars[i] === chars[i - 1]) { - hasAdjacentRepeat = true; - } - } - - const topCount = Math.max(...counts.values()); - if (chars.length <= 2) { - return hasAdjacentRepeat || topCount >= 2; - } - - if (hasAdjacentRepeat) { - return true; - } - - return topCount >= Math.ceil(chars.length / 2); -} - -function isJlptEligibleToken(token: MergedToken): boolean { - if (token.pos1 && shouldIgnoreJlptForMecabPos1(token.pos1)) return false; - - const candidates = [ - resolveJlptLookupText(token), - token.surface, - token.reading, - token.headword, - ].filter( - (candidate): candidate is string => typeof candidate === 'string' && candidate.length > 0, - ); - - for (const candidate of candidates) { - const normalizedCandidate = normalizeJlptTextForExclusion(candidate); - if (!normalizedCandidate) { - continue; - } - - const trimmedCandidate = candidate.trim(); - if (shouldIgnoreJlptByTerm(trimmedCandidate) || shouldIgnoreJlptByTerm(normalizedCandidate)) { - return false; - } - - if (isRepeatedKanaSfx(candidate) || isRepeatedKanaSfx(normalizedCandidate)) { - return false; - } - } - - return true; -} - -function isYomitanParseResultItem(value: unknown): value is YomitanParseResultItem { - if (!isObject(value)) { - return false; - } - if (!isString((value as YomitanParseResultItem).source)) { - return false; - } - if (!Array.isArray((value as YomitanParseResultItem).content)) { - return false; - } - return true; -} - -function isYomitanParseLine(value: unknown): value is YomitanParseLine { - if (!Array.isArray(value)) { - return false; - } - - return value.every((segment) => { - if (!isObject(segment)) { - return false; - } - - const candidate = segment as YomitanParseSegment; - return isString(candidate.text); - }); -} - -function isYomitanHeadwordRows(value: unknown): value is YomitanParseHeadword[][] { - return ( - Array.isArray(value) && - value.every( - (group) => - Array.isArray(group) && - group.every((item) => isObject(item) && isString((item as 
YomitanParseHeadword).term)), - ) - ); -} - -function extractYomitanHeadword(segment: YomitanParseSegment): string { - const headwords = segment.headwords; - if (!isYomitanHeadwordRows(headwords)) { - return ''; - } - - for (const group of headwords) { - if (group.length > 0) { - const firstHeadword = group[0] as YomitanParseHeadword; - if (isString(firstHeadword?.term)) { - return firstHeadword.term; - } - } - } - - return ''; -} - -function applyJlptMarking( - tokens: MergedToken[], - getJlptLevel: (text: string) => JlptLevel | null, -): MergedToken[] { - return tokens.map((token) => { - if (!isJlptEligibleToken(token)) { - return { ...token, jlptLevel: undefined }; - } - - const primaryLevel = getCachedJlptLevel(resolveJlptLookupText(token), getJlptLevel); - const fallbackLevel = - primaryLevel === null ? getCachedJlptLevel(token.surface, getJlptLevel) : null; - - return { - ...token, - jlptLevel: primaryLevel ?? fallbackLevel ?? token.jlptLevel, - }; - }); -} - -interface YomitanParseCandidate { - source: string; - index: number; - tokens: MergedToken[]; -} - -function mapYomitanParseResultItemToMergedTokens( - parseResult: YomitanParseResultItem, - isKnownWord: (text: string) => boolean, - knownWordMatchMode: NPlusOneMatchMode, -): YomitanParseCandidate | null { - const content = parseResult.content; - if (!Array.isArray(content) || content.length === 0) { - return null; - } - - const source = String(parseResult.source ?? ''); - const index = - typeof parseResult.index === 'number' && Number.isInteger(parseResult.index) - ? 
parseResult.index - : 0; - - const tokens: MergedToken[] = []; - let charOffset = 0; - let validLineCount = 0; - - for (const line of content) { - if (!isYomitanParseLine(line)) { - continue; - } - validLineCount += 1; - - let combinedSurface = ''; - let combinedReading = ''; - let combinedHeadword = ''; - - for (const segment of line) { - const segmentText = segment.text; - if (!segmentText || segmentText.length === 0) { - continue; - } - - combinedSurface += segmentText; - if (typeof segment.reading === 'string') { - combinedReading += segment.reading; - } - if (!combinedHeadword) { - combinedHeadword = extractYomitanHeadword(segment); - } - } - - if (!combinedSurface) { - continue; - } - - const start = charOffset; - const end = start + combinedSurface.length; - charOffset = end; - const headword = combinedHeadword || combinedSurface; - - tokens.push({ - surface: combinedSurface, - reading: combinedReading, - headword, - startPos: start, - endPos: end, - partOfSpeech: PartOfSpeech.other, - pos1: '', - isMerged: true, - isNPlusOneTarget: false, - isKnown: (() => { - const matchText = resolveKnownWordText(combinedSurface, headword, knownWordMatchMode); - return matchText ? isKnownWord(matchText) : false; - })(), - }); - } - - if (validLineCount === 0 || tokens.length === 0) { - return null; - } - - return { source, index, tokens }; -} - -function selectBestYomitanParseCandidate( - candidates: YomitanParseCandidate[], -): MergedToken[] | null { - if (candidates.length === 0) { - return null; - } - - const scanningCandidates = candidates.filter( - (candidate) => candidate.source === 'scanning-parser', - ); - const mecabCandidates = candidates.filter((candidate) => candidate.source === 'mecab'); - - const getBestByTokenCount = (items: YomitanParseCandidate[]): YomitanParseCandidate | null => - items.length === 0 - ? null - : items.reduce((best, current) => - current.tokens.length > best.tokens.length ? 
current : best, - ); - - const getCandidateScore = (candidate: YomitanParseCandidate): number => { - const readableTokenCount = candidate.tokens.filter( - (token) => token.reading.trim().length > 0, - ).length; - const suspiciousKanaFragmentCount = candidate.tokens.filter( - (token) => - token.reading.trim().length === 0 && - token.surface.length >= 2 && - Array.from(token.surface).every((char) => isKanaChar(char)), - ).length; - - return readableTokenCount * 100 - suspiciousKanaFragmentCount * 50 - candidate.tokens.length; - }; - - const chooseBestCandidate = (items: YomitanParseCandidate[]): YomitanParseCandidate | null => { - if (items.length === 0) { - return null; - } - - return items.reduce((best, current) => { - const bestScore = getCandidateScore(best); - const currentScore = getCandidateScore(current); - if (currentScore !== bestScore) { - return currentScore > bestScore ? current : best; - } - - if (current.tokens.length !== best.tokens.length) { - return current.tokens.length < best.tokens.length ? current : best; - } - - return best; - }); - }; - - if (scanningCandidates.length > 0) { - const bestScanning = getBestByTokenCount(scanningCandidates); - if (bestScanning && bestScanning.tokens.length > 1) { - return bestScanning.tokens; - } - - const bestMecab = chooseBestCandidate(mecabCandidates); - if (bestMecab && bestMecab.tokens.length > (bestScanning?.tokens.length ?? 0)) { - return bestMecab.tokens; - } - - return bestScanning ? bestScanning.tokens : null; - } - - const multiTokenCandidates = candidates.filter((candidate) => candidate.tokens.length > 1); - const pool = multiTokenCandidates.length > 0 ? multiTokenCandidates : candidates; - const bestCandidate = chooseBestCandidate(pool); - return bestCandidate ? 
bestCandidate.tokens : null; -} - -function mapYomitanParseResultsToMergedTokens( - parseResults: unknown, - isKnownWord: (text: string) => boolean, - knownWordMatchMode: NPlusOneMatchMode, -): MergedToken[] | null { - if (!Array.isArray(parseResults) || parseResults.length === 0) { - return null; - } - - const candidates = parseResults - .filter((item): item is YomitanParseResultItem => isYomitanParseResultItem(item)) - .map((item) => mapYomitanParseResultItemToMergedTokens(item, isKnownWord, knownWordMatchMode)) - .filter((candidate): candidate is YomitanParseCandidate => candidate !== null); - - const bestCandidate = selectBestYomitanParseCandidate(candidates); - return bestCandidate; -} - function logSelectedYomitanGroups(text: string, tokens: MergedToken[]): void { - if (!tokens || tokens.length === 0) { + if (tokens.length === 0) { return; } @@ -698,335 +128,67 @@ function logSelectedYomitanGroups(text: string, tokens: MergedToken[]): void { }); } -function pickClosestMecabPos1(token: MergedToken, mecabTokens: MergedToken[]): string | undefined { - if (mecabTokens.length === 0) { - return undefined; - } - - const tokenStart = token.startPos ?? 0; - const tokenEnd = token.endPos ?? tokenStart + token.surface.length; - let bestSurfaceMatchPos1: string | undefined; - let bestSurfaceMatchDistance = Number.MAX_SAFE_INTEGER; - let bestSurfaceMatchEndDistance = Number.MAX_SAFE_INTEGER; - - for (const mecabToken of mecabTokens) { - if (!mecabToken.pos1) { - continue; - } - - if (mecabToken.surface !== token.surface) { - continue; - } - - const mecabStart = mecabToken.startPos ?? 0; - const mecabEnd = mecabToken.endPos ?? 
mecabStart + mecabToken.surface.length; - const startDistance = Math.abs(mecabStart - tokenStart); - const endDistance = Math.abs(mecabEnd - tokenEnd); - - if ( - startDistance < bestSurfaceMatchDistance || - (startDistance === bestSurfaceMatchDistance && endDistance < bestSurfaceMatchEndDistance) - ) { - bestSurfaceMatchDistance = startDistance; - bestSurfaceMatchEndDistance = endDistance; - bestSurfaceMatchPos1 = mecabToken.pos1; - } - } - - if (bestSurfaceMatchPos1) { - return bestSurfaceMatchPos1; - } - - let bestPos1: string | undefined; - let bestOverlap = 0; - let bestSpan = 0; - let bestStartDistance = Number.MAX_SAFE_INTEGER; - let bestStart = Number.MAX_SAFE_INTEGER; - - for (const mecabToken of mecabTokens) { - if (!mecabToken.pos1) { - continue; - } - - const mecabStart = mecabToken.startPos ?? 0; - const mecabEnd = mecabToken.endPos ?? mecabStart + mecabToken.surface.length; - const overlapStart = Math.max(tokenStart, mecabStart); - const overlapEnd = Math.min(tokenEnd, mecabEnd); - const overlap = Math.max(0, overlapEnd - overlapStart); - if (overlap === 0) { - continue; - } - - const span = mecabEnd - mecabStart; - if ( - overlap > bestOverlap || - (overlap === bestOverlap && - (Math.abs(mecabStart - tokenStart) < bestStartDistance || - (Math.abs(mecabStart - tokenStart) === bestStartDistance && - (span > bestSpan || (span === bestSpan && mecabStart < bestStart))))) - ) { - bestOverlap = overlap; - bestSpan = span; - bestStartDistance = Math.abs(mecabStart - tokenStart); - bestStart = mecabStart; - bestPos1 = mecabToken.pos1; - } - } - - return bestOverlap > 0 ? 
bestPos1 : undefined; +function getAnnotationOptions(deps: TokenizerServiceDeps): { + jlptEnabled: boolean; + frequencyEnabled: boolean; + minSentenceWordsForNPlusOne: number | undefined; +} { + return { + jlptEnabled: deps.getJlptEnabled?.() !== false, + frequencyEnabled: deps.getFrequencyDictionaryEnabled?.() !== false, + minSentenceWordsForNPlusOne: deps.getMinSentenceWordsForNPlusOne?.(), + }; } -function fillMissingPos1BySurfaceSequence( - tokens: MergedToken[], - mecabTokens: MergedToken[], -): MergedToken[] { - const indexedMecabTokens = mecabTokens - .map((token, index) => ({ token, index })) - .filter(({ token }) => token.pos1 && token.surface.trim().length > 0); +function applyAnnotationStage(tokens: MergedToken[], deps: TokenizerServiceDeps): MergedToken[] { + const options = getAnnotationOptions(deps); - if (indexedMecabTokens.length === 0) { - return tokens; - } - - let cursor = 0; - return tokens.map((token) => { - if (token.pos1 && token.pos1.trim().length > 0) { - return token; - } - - const surface = token.surface.trim(); - if (!surface) { - return token; - } - - let best: { pos1: string; index: number } | null = null; - for (const candidate of indexedMecabTokens) { - if (candidate.token.surface !== surface) { - continue; - } - if (candidate.index < cursor) { - continue; - } - best = { pos1: candidate.token.pos1 as string, index: candidate.index }; - break; - } - - if (!best) { - for (const candidate of indexedMecabTokens) { - if (candidate.token.surface !== surface) { - continue; - } - best = { pos1: candidate.token.pos1 as string, index: candidate.index }; - break; - } - } - - if (!best) { - return token; - } - - cursor = best.index + 1; - return { - ...token, - pos1: best.pos1, - }; - }); -} - -async function enrichYomitanPos1( - tokens: MergedToken[], - deps: TokenizerServiceDeps, - text: string, -): Promise { - if (!tokens || tokens.length === 0) { - return tokens; - } - - let mecabTokens: MergedToken[] | null = null; - try { - mecabTokens = 
await deps.tokenizeWithMecab(text); - } catch (err) { - const error = err as Error; - logger.warn( - 'Failed to enrich Yomitan tokens with MeCab POS:', - error.message, - `tokenCount=${tokens.length}`, - `textLength=${text.length}`, - ); - return tokens; - } - - if (!mecabTokens || mecabTokens.length === 0) { - logger.warn( - 'MeCab enrichment returned no tokens; preserving Yomitan token output.', - `tokenCount=${tokens.length}`, - `textLength=${text.length}`, - ); - return tokens; - } - - const overlapEnriched = tokens.map((token) => { - if (token.pos1) { - return token; - } - - const pos1 = pickClosestMecabPos1(token, mecabTokens); - if (!pos1) { - return token; - } - - return { - ...token, - pos1, - }; - }); - - return fillMissingPos1BySurfaceSequence(overlapEnriched, mecabTokens); -} - -async function ensureYomitanParserWindow(deps: TokenizerServiceDeps): Promise { - const electron = await import('electron'); - const yomitanExt = deps.getYomitanExt(); - if (!yomitanExt) { - return false; - } - - const currentWindow = deps.getYomitanParserWindow(); - if (currentWindow && !currentWindow.isDestroyed()) { - return true; - } - - const existingInitPromise = deps.getYomitanParserInitPromise(); - if (existingInitPromise) { - return existingInitPromise; - } - - const initPromise = (async () => { - const { BrowserWindow, session } = electron; - const parserWindow = new BrowserWindow({ - show: false, - width: 800, - height: 600, - webPreferences: { - contextIsolation: true, - nodeIntegration: false, - session: session.defaultSession, - }, - }); - deps.setYomitanParserWindow(parserWindow); - - deps.setYomitanParserReadyPromise( - new Promise((resolve, reject) => { - parserWindow.webContents.once('did-finish-load', () => resolve()); - parserWindow.webContents.once('did-fail-load', (_event, _errorCode, errorDescription) => { - reject(new Error(errorDescription)); - }); - }), - ); - - parserWindow.on('closed', () => { - if (deps.getYomitanParserWindow() === parserWindow) { - 
deps.setYomitanParserWindow(null); - deps.setYomitanParserReadyPromise(null); - } - }); - - try { - await parserWindow.loadURL(`chrome-extension://${yomitanExt.id}/search.html`); - const readyPromise = deps.getYomitanParserReadyPromise(); - if (readyPromise) { - await readyPromise; - } - return true; - } catch (err) { - logger.error('Failed to initialize Yomitan parser window:', (err as Error).message); - if (!parserWindow.isDestroyed()) { - parserWindow.destroy(); - } - if (deps.getYomitanParserWindow() === parserWindow) { - deps.setYomitanParserWindow(null); - deps.setYomitanParserReadyPromise(null); - } - return false; - } finally { - deps.setYomitanParserInitPromise(null); - } - })(); - - deps.setYomitanParserInitPromise(initPromise); - return initPromise; + return annotateTokens( + tokens, + { + isKnownWord: deps.isKnownWord, + knownWordMatchMode: deps.getKnownWordMatchMode(), + getJlptLevel: deps.getJlptLevel, + getFrequencyRank: deps.getFrequencyRank, + }, + options, + ); } async function parseWithYomitanInternalParser( text: string, deps: TokenizerServiceDeps, ): Promise { - const yomitanExt = deps.getYomitanExt(); - if (!text || !yomitanExt) { + const parseResults = await requestYomitanParseResults(text, deps, logger); + if (!parseResults) { return null; } - const isReady = await ensureYomitanParserWindow(deps); - const parserWindow = deps.getYomitanParserWindow(); - if (!isReady || !parserWindow || parserWindow.isDestroyed()) { + const selectedTokens = selectYomitanParseTokens( + parseResults, + deps.isKnownWord, + deps.getKnownWordMatchMode(), + ); + if (!selectedTokens || selectedTokens.length === 0) { return null; } - const script = ` - (async () => { - const invoke = (action, params) => - new Promise((resolve, reject) => { - chrome.runtime.sendMessage({ action, params }, (response) => { - if (chrome.runtime.lastError) { - reject(new Error(chrome.runtime.lastError.message)); - return; - } - if (!response || typeof response !== "object") { - reject(new 
Error("Invalid response from Yomitan backend")); - return; - } - if (response.error) { - reject(new Error(response.error.message || "Yomitan backend error")); - return; - } - resolve(response.result); - }); - }); - - const optionsFull = await invoke("optionsGetFull", undefined); - const profileIndex = optionsFull.profileCurrent; - const scanLength = - optionsFull.profiles?.[profileIndex]?.options?.scanning?.length ?? 40; - - return await invoke("parseText", { - text: ${JSON.stringify(text)}, - optionsContext: { index: profileIndex }, - scanLength, - useInternalParser: true, - useMecabParser: true - }); - })(); - `; + if (deps.getYomitanGroupDebugEnabled?.() === true) { + logSelectedYomitanGroups(text, selectedTokens); + } try { - const parseResults = await parserWindow.webContents.executeJavaScript(script, true); - const yomitanTokens = mapYomitanParseResultsToMergedTokens( - parseResults, - deps.isKnownWord, - deps.getKnownWordMatchMode(), - ); - if (!yomitanTokens || yomitanTokens.length === 0) { - return null; - } - - if (deps.getYomitanGroupDebugEnabled?.() === true) { - logSelectedYomitanGroups(text, yomitanTokens); - } - - return enrichYomitanPos1(yomitanTokens, deps, text); + const mecabTokens = await deps.tokenizeWithMecab(text); + return enrichTokensWithMecabPos1(selectedTokens, mecabTokens); } catch (err) { - logger.error('Yomitan parser request failed:', (err as Error).message); - return null; + const error = err as Error; + logger.warn( + 'Failed to enrich Yomitan tokens with MeCab POS:', + error.message, + `tokenCount=${selectedTokens.length}`, + `textLength=${text.length}`, + ); + return selectedTokens; } } @@ -1034,14 +196,6 @@ export async function tokenizeSubtitle( text: string, deps: TokenizerServiceDeps, ): Promise { - const minSentenceWordsForNPlusOne = deps.getMinSentenceWordsForNPlusOne?.(); - const sanitizedMinSentenceWordsForNPlusOne = - minSentenceWordsForNPlusOne !== undefined && - Number.isInteger(minSentenceWordsForNPlusOne) && - 
minSentenceWordsForNPlusOne > 0 - ? minSentenceWordsForNPlusOne - : 3; - const displayText = text .replace(/\r\n/g, '\n') .replace(/\\N/g, '\n') @@ -1053,60 +207,21 @@ export async function tokenizeSubtitle( } const tokenizeText = displayText.replace(/\n/g, ' ').replace(/\s+/g, ' ').trim(); - const jlptEnabled = deps.getJlptEnabled?.() !== false; - const frequencyEnabled = deps.getFrequencyDictionaryEnabled?.() !== false; - const frequencyLookup = deps.getFrequencyRank; const yomitanTokens = await parseWithYomitanInternalParser(tokenizeText, deps); if (yomitanTokens && yomitanTokens.length > 0) { - const knownMarkedTokens = applyKnownWordMarking( - yomitanTokens, - deps.isKnownWord, - deps.getKnownWordMatchMode(), - ); - const frequencyMarkedTokens = - frequencyEnabled && frequencyLookup - ? applyFrequencyMarking(knownMarkedTokens, frequencyLookup) - : knownMarkedTokens.map((token) => ({ - ...token, - frequencyRank: undefined, - })); - const jlptMarkedTokens = jlptEnabled - ? applyJlptMarking(frequencyMarkedTokens, deps.getJlptLevel) - : frequencyMarkedTokens.map((token) => ({ - ...token, - jlptLevel: undefined, - })); return { text: displayText, - tokens: markNPlusOneTargets(jlptMarkedTokens, sanitizedMinSentenceWordsForNPlusOne), + tokens: applyAnnotationStage(yomitanTokens, deps), }; } try { const mecabTokens = await deps.tokenizeWithMecab(tokenizeText); if (mecabTokens && mecabTokens.length > 0) { - const knownMarkedTokens = applyKnownWordMarking( - mecabTokens, - deps.isKnownWord, - deps.getKnownWordMatchMode(), - ); - const frequencyMarkedTokens = - frequencyEnabled && frequencyLookup - ? applyFrequencyMarking(knownMarkedTokens, frequencyLookup) - : knownMarkedTokens.map((token) => ({ - ...token, - frequencyRank: undefined, - })); - const jlptMarkedTokens = jlptEnabled - ? 
applyJlptMarking(frequencyMarkedTokens, deps.getJlptLevel) - : frequencyMarkedTokens.map((token) => ({ - ...token, - jlptLevel: undefined, - })); return { text: displayText, - tokens: markNPlusOneTargets(jlptMarkedTokens, sanitizedMinSentenceWordsForNPlusOne), + tokens: applyAnnotationStage(mecabTokens, deps), }; } } catch (err) { diff --git a/src/core/services/tokenizer/annotation-stage.test.ts b/src/core/services/tokenizer/annotation-stage.test.ts new file mode 100644 index 0000000..98a5fd8 --- /dev/null +++ b/src/core/services/tokenizer/annotation-stage.test.ts @@ -0,0 +1,159 @@ +import assert from 'node:assert/strict'; +import test from 'node:test'; +import { MergedToken, PartOfSpeech } from '../../../types'; +import { annotateTokens, AnnotationStageDeps } from './annotation-stage'; + +function makeToken(overrides: Partial = {}): MergedToken { + return { + surface: '猫', + reading: 'ネコ', + headword: '猫', + startPos: 0, + endPos: 1, + partOfSpeech: PartOfSpeech.noun, + isMerged: false, + isKnown: false, + isNPlusOneTarget: false, + ...overrides, + }; +} + +function makeDeps(overrides: Partial = {}): AnnotationStageDeps { + return { + isKnownWord: () => false, + knownWordMatchMode: 'headword', + getJlptLevel: () => null, + ...overrides, + }; +} + +test('annotateTokens known-word match mode uses headword vs surface', () => { + const tokens = [makeToken({ surface: '食べた', headword: '食べる', reading: 'タベタ' })]; + const isKnownWord = (text: string): boolean => text === '食べる'; + + const headwordResult = annotateTokens( + tokens, + makeDeps({ + isKnownWord, + knownWordMatchMode: 'headword', + }), + ); + const surfaceResult = annotateTokens( + tokens, + makeDeps({ + isKnownWord, + knownWordMatchMode: 'surface', + }), + ); + + assert.equal(headwordResult[0]?.isKnown, true); + assert.equal(surfaceResult[0]?.isKnown, false); +}); + +test('annotateTokens excludes frequency for particle/bound_auxiliary and pos1 exclusions', () => { + const lookupCalls: string[] = []; + const 
tokens = [ + makeToken({ surface: 'は', headword: 'は', partOfSpeech: PartOfSpeech.particle }), + makeToken({ + surface: 'です', + headword: 'です', + partOfSpeech: PartOfSpeech.bound_auxiliary, + startPos: 1, + endPos: 3, + }), + makeToken({ + surface: 'の', + headword: 'の', + partOfSpeech: PartOfSpeech.other, + pos1: '助詞', + startPos: 3, + endPos: 4, + }), + makeToken({ + surface: '猫', + headword: '猫', + partOfSpeech: PartOfSpeech.noun, + startPos: 4, + endPos: 5, + }), + ]; + + const result = annotateTokens( + tokens, + makeDeps({ + getFrequencyRank: (text) => { + lookupCalls.push(text); + return text === '猫' ? 11 : 999; + }, + }), + ); + + assert.equal(result[0]?.frequencyRank, undefined); + assert.equal(result[1]?.frequencyRank, undefined); + assert.equal(result[2]?.frequencyRank, undefined); + assert.equal(result[3]?.frequencyRank, 11); + assert.deepEqual(lookupCalls, ['猫']); +}); + +test('annotateTokens handles JLPT disabled and eligibility exclusion paths', () => { + let disabledLookupCalls = 0; + const disabledResult = annotateTokens( + [makeToken({ surface: '猫', headword: '猫' })], + makeDeps({ + getJlptLevel: () => { + disabledLookupCalls += 1; + return 'N5'; + }, + }), + { jlptEnabled: false }, + ); + assert.equal(disabledResult[0]?.jlptLevel, undefined); + assert.equal(disabledLookupCalls, 0); + + let excludedLookupCalls = 0; + const excludedResult = annotateTokens( + [ + makeToken({ + surface: '!', + headword: '!', + reading: '', + pos1: '記号', + partOfSpeech: PartOfSpeech.symbol, + }), + ], + makeDeps({ + getJlptLevel: () => { + excludedLookupCalls += 1; + return 'N5'; + }, + }), + ); + assert.equal(excludedResult[0]?.jlptLevel, undefined); + assert.equal(excludedLookupCalls, 0); +}); + +test('annotateTokens N+1 handoff marks expected target when threshold is satisfied', () => { + const tokens = [ + makeToken({ surface: '私', headword: '私', startPos: 0, endPos: 1 }), + makeToken({ surface: '猫', headword: '猫', startPos: 1, endPos: 2 }), + makeToken({ + surface: 
'見る', + headword: '見る', + partOfSpeech: PartOfSpeech.verb, + startPos: 2, + endPos: 4, + }), + ]; + + const result = annotateTokens( + tokens, + makeDeps({ + isKnownWord: (text) => text === '私' || text === '見る', + }), + { minSentenceWordsForNPlusOne: 3 }, + ); + + assert.equal(result[0]?.isNPlusOneTarget, false); + assert.equal(result[1]?.isNPlusOneTarget, true); + assert.equal(result[2]?.isNPlusOneTarget, false); +}); diff --git a/src/core/services/tokenizer/annotation-stage.ts b/src/core/services/tokenizer/annotation-stage.ts new file mode 100644 index 0000000..d40bf08 --- /dev/null +++ b/src/core/services/tokenizer/annotation-stage.ts @@ -0,0 +1,375 @@ +import { markNPlusOneTargets } from '../../../token-merger'; +import { + FrequencyDictionaryLookup, + JlptLevel, + MergedToken, + NPlusOneMatchMode, + PartOfSpeech, +} from '../../../types'; +import { shouldIgnoreJlptByTerm, shouldIgnoreJlptForMecabPos1 } from '../jlpt-token-filter'; + +const KATAKANA_TO_HIRAGANA_OFFSET = 0x60; +const KATAKANA_CODEPOINT_START = 0x30a1; +const KATAKANA_CODEPOINT_END = 0x30f6; +const JLPT_LEVEL_LOOKUP_CACHE_LIMIT = 2048; +const FREQUENCY_RANK_LOOKUP_CACHE_LIMIT = 2048; + +const jlptLevelLookupCaches = new WeakMap< + (text: string) => JlptLevel | null, + Map +>(); +const frequencyRankLookupCaches = new WeakMap< + FrequencyDictionaryLookup, + Map +>(); + +export interface AnnotationStageDeps { + isKnownWord: (text: string) => boolean; + knownWordMatchMode: NPlusOneMatchMode; + getJlptLevel: (text: string) => JlptLevel | null; + getFrequencyRank?: FrequencyDictionaryLookup; +} + +export interface AnnotationStageOptions { + jlptEnabled?: boolean; + frequencyEnabled?: boolean; + minSentenceWordsForNPlusOne?: number; +} + +function resolveKnownWordText( + surface: string, + headword: string, + matchMode: NPlusOneMatchMode, +): string { + return matchMode === 'surface' ? 
surface : headword; +} + +function applyKnownWordMarking( + tokens: MergedToken[], + isKnownWord: (text: string) => boolean, + knownWordMatchMode: NPlusOneMatchMode, +): MergedToken[] { + return tokens.map((token) => { + const matchText = resolveKnownWordText(token.surface, token.headword, knownWordMatchMode); + + return { + ...token, + isKnown: token.isKnown || (matchText ? isKnownWord(matchText) : false), + }; + }); +} + +function normalizeFrequencyLookupText(rawText: string): string { + return rawText.trim().toLowerCase(); +} + +function getCachedFrequencyRank( + lookupText: string, + getFrequencyRank: FrequencyDictionaryLookup, +): number | null { + const normalizedText = normalizeFrequencyLookupText(lookupText); + if (!normalizedText) { + return null; + } + + let cache = frequencyRankLookupCaches.get(getFrequencyRank); + if (!cache) { + cache = new Map(); + frequencyRankLookupCaches.set(getFrequencyRank, cache); + } + + if (cache.has(normalizedText)) { + return cache.get(normalizedText) ?? null; + } + + let rank: number | null; + try { + rank = getFrequencyRank(normalizedText); + } catch { + rank = null; + } + if (rank !== null) { + if (!Number.isFinite(rank) || rank <= 0) { + rank = null; + } + } + + cache.set(normalizedText, rank); + while (cache.size > FREQUENCY_RANK_LOOKUP_CACHE_LIMIT) { + const firstKey = cache.keys().next().value; + if (firstKey !== undefined) { + cache.delete(firstKey); + } + } + + return rank; +} + +function resolveFrequencyLookupText(token: MergedToken): string { + if (token.headword && token.headword.length > 0) { + return token.headword; + } + if (token.reading && token.reading.length > 0) { + return token.reading; + } + return token.surface; +} + +function getFrequencyLookupTextCandidates(token: MergedToken): string[] { + const lookupText = resolveFrequencyLookupText(token).trim(); + return lookupText ? 
[lookupText] : []; +} + +function isFrequencyExcludedByPos(token: MergedToken): boolean { + if ( + token.partOfSpeech === PartOfSpeech.particle || + token.partOfSpeech === PartOfSpeech.bound_auxiliary + ) { + return true; + } + + return token.pos1 === '助詞' || token.pos1 === '助動詞'; +} + +function applyFrequencyMarking( + tokens: MergedToken[], + getFrequencyRank: FrequencyDictionaryLookup, +): MergedToken[] { + return tokens.map((token) => { + if (isFrequencyExcludedByPos(token)) { + return { ...token, frequencyRank: undefined }; + } + + const lookupTexts = getFrequencyLookupTextCandidates(token); + if (lookupTexts.length === 0) { + return { ...token, frequencyRank: undefined }; + } + + let bestRank: number | null = null; + for (const lookupText of lookupTexts) { + const rank = getCachedFrequencyRank(lookupText, getFrequencyRank); + if (rank === null) { + continue; + } + if (bestRank === null || rank < bestRank) { + bestRank = rank; + } + } + + return { + ...token, + frequencyRank: bestRank ?? undefined, + }; + }); +} + +function getCachedJlptLevel( + lookupText: string, + getJlptLevel: (text: string) => JlptLevel | null, +): JlptLevel | null { + const normalizedText = lookupText.trim(); + if (!normalizedText) { + return null; + } + + let cache = jlptLevelLookupCaches.get(getJlptLevel); + if (!cache) { + cache = new Map(); + jlptLevelLookupCaches.set(getJlptLevel, cache); + } + + if (cache.has(normalizedText)) { + return cache.get(normalizedText) ?? 
null; + } + + let level: JlptLevel | null; + try { + level = getJlptLevel(normalizedText); + } catch { + level = null; + } + + cache.set(normalizedText, level); + while (cache.size > JLPT_LEVEL_LOOKUP_CACHE_LIMIT) { + const firstKey = cache.keys().next().value; + if (firstKey !== undefined) { + cache.delete(firstKey); + } + } + + return level; +} + +function resolveJlptLookupText(token: MergedToken): string { + if (token.headword && token.headword.length > 0) { + return token.headword; + } + if (token.reading && token.reading.length > 0) { + return token.reading; + } + return token.surface; +} + +function normalizeJlptTextForExclusion(text: string): string { + const raw = text.trim(); + if (!raw) { + return ''; + } + + let normalized = ''; + for (const char of raw) { + const code = char.codePointAt(0); + if (code === undefined) { + continue; + } + + if (code >= KATAKANA_CODEPOINT_START && code <= KATAKANA_CODEPOINT_END) { + normalized += String.fromCodePoint(code - KATAKANA_TO_HIRAGANA_OFFSET); + continue; + } + + normalized += char; + } + + return normalized; +} + +function isKanaChar(char: string): boolean { + const code = char.codePointAt(0); + if (code === undefined) { + return false; + } + + return ( + (code >= 0x3041 && code <= 0x3096) || + (code >= 0x309b && code <= 0x309f) || + (code >= 0x30a0 && code <= 0x30fa) || + (code >= 0x30fd && code <= 0x30ff) + ); +} + +function isRepeatedKanaSfx(text: string): boolean { + const normalized = text.trim(); + if (!normalized) { + return false; + } + + const chars = [...normalized]; + if (!chars.every(isKanaChar)) { + return false; + } + + const counts = new Map(); + let hasAdjacentRepeat = false; + + for (let i = 0; i < chars.length; i += 1) { + const char = chars[i]!; + counts.set(char, (counts.get(char) ?? 
0) + 1); + if (i > 0 && chars[i] === chars[i - 1]) { + hasAdjacentRepeat = true; + } + } + + const topCount = Math.max(...counts.values()); + if (chars.length <= 2) { + return hasAdjacentRepeat || topCount >= 2; + } + + if (hasAdjacentRepeat) { + return true; + } + + return topCount >= Math.ceil(chars.length / 2); +} + +function isJlptEligibleToken(token: MergedToken): boolean { + if (token.pos1 && shouldIgnoreJlptForMecabPos1(token.pos1)) { + return false; + } + + const candidates = [ + resolveJlptLookupText(token), + token.surface, + token.reading, + token.headword, + ].filter( + (candidate): candidate is string => typeof candidate === 'string' && candidate.length > 0, + ); + + for (const candidate of candidates) { + const normalizedCandidate = normalizeJlptTextForExclusion(candidate); + if (!normalizedCandidate) { + continue; + } + + const trimmedCandidate = candidate.trim(); + if (shouldIgnoreJlptByTerm(trimmedCandidate) || shouldIgnoreJlptByTerm(normalizedCandidate)) { + return false; + } + + if (isRepeatedKanaSfx(candidate) || isRepeatedKanaSfx(normalizedCandidate)) { + return false; + } + } + + return true; +} + +function applyJlptMarking( + tokens: MergedToken[], + getJlptLevel: (text: string) => JlptLevel | null, +): MergedToken[] { + return tokens.map((token) => { + if (!isJlptEligibleToken(token)) { + return { ...token, jlptLevel: undefined }; + } + + const primaryLevel = getCachedJlptLevel(resolveJlptLookupText(token), getJlptLevel); + const fallbackLevel = + primaryLevel === null ? getCachedJlptLevel(token.surface, getJlptLevel) : null; + + return { + ...token, + jlptLevel: primaryLevel ?? fallbackLevel ?? 
token.jlptLevel, + }; + }); +} + +export function annotateTokens( + tokens: MergedToken[], + deps: AnnotationStageDeps, + options: AnnotationStageOptions = {}, +): MergedToken[] { + const knownMarkedTokens = applyKnownWordMarking( + tokens, + deps.isKnownWord, + deps.knownWordMatchMode, + ); + + const frequencyEnabled = options.frequencyEnabled !== false; + const frequencyMarkedTokens = + frequencyEnabled && deps.getFrequencyRank + ? applyFrequencyMarking(knownMarkedTokens, deps.getFrequencyRank) + : knownMarkedTokens.map((token) => ({ + ...token, + frequencyRank: undefined, + })); + + const jlptEnabled = options.jlptEnabled !== false; + const jlptMarkedTokens = jlptEnabled + ? applyJlptMarking(frequencyMarkedTokens, deps.getJlptLevel) + : frequencyMarkedTokens.map((token) => ({ + ...token, + jlptLevel: undefined, + })); + + const minSentenceWordsForNPlusOne = options.minSentenceWordsForNPlusOne; + const sanitizedMinSentenceWordsForNPlusOne = + minSentenceWordsForNPlusOne !== undefined && + Number.isInteger(minSentenceWordsForNPlusOne) && + minSentenceWordsForNPlusOne > 0 + ? 
minSentenceWordsForNPlusOne + : 3; + + return markNPlusOneTargets(jlptMarkedTokens, sanitizedMinSentenceWordsForNPlusOne); +} diff --git a/src/core/services/tokenizer/parser-enrichment-stage.test.ts b/src/core/services/tokenizer/parser-enrichment-stage.test.ts new file mode 100644 index 0000000..a00f82c --- /dev/null +++ b/src/core/services/tokenizer/parser-enrichment-stage.test.ts @@ -0,0 +1,49 @@ +import test from 'node:test'; +import assert from 'node:assert/strict'; +import { MergedToken, PartOfSpeech } from '../../../types'; +import { enrichTokensWithMecabPos1 } from './parser-enrichment-stage'; + +function makeToken(overrides: Partial): MergedToken { + return { + surface: 'token', + reading: '', + headword: 'token', + startPos: 0, + endPos: 1, + partOfSpeech: PartOfSpeech.other, + isMerged: true, + isKnown: false, + isNPlusOneTarget: false, + pos1: '', + ...overrides, + }; +} + +test('enrichTokensWithMecabPos1 picks pos1 by best overlap when no surface match exists', () => { + const tokens = [makeToken({ surface: 'grouped', startPos: 2, endPos: 7 })]; + const mecabTokens = [ + makeToken({ surface: 'left', startPos: 0, endPos: 4, pos1: 'A' }), + makeToken({ surface: 'right', startPos: 2, endPos: 6, pos1: 'B' }), + ]; + + const enriched = enrichTokensWithMecabPos1(tokens, mecabTokens); + assert.equal(enriched[0]?.pos1, 'B'); +}); + +test('enrichTokensWithMecabPos1 fills missing pos1 using surface-sequence fallback', () => { + const tokens = [makeToken({ surface: ' は ', startPos: 10, endPos: 13 })]; + const mecabTokens = [makeToken({ surface: 'は', startPos: 0, endPos: 1, pos1: '助詞' })]; + + const enriched = enrichTokensWithMecabPos1(tokens, mecabTokens); + assert.equal(enriched[0]?.pos1, '助詞'); +}); + +test('enrichTokensWithMecabPos1 passes through unchanged when mecab tokens are null or empty', () => { + const tokens = [makeToken({ surface: '猫', startPos: 0, endPos: 1 })]; + + const nullResult = enrichTokensWithMecabPos1(tokens, null); + 
assert.strictEqual(nullResult, tokens); + + const emptyResult = enrichTokensWithMecabPos1(tokens, []); + assert.strictEqual(emptyResult, tokens); +}); diff --git a/src/core/services/tokenizer/parser-enrichment-stage.ts b/src/core/services/tokenizer/parser-enrichment-stage.ts new file mode 100644 index 0000000..3c3aeb2 --- /dev/null +++ b/src/core/services/tokenizer/parser-enrichment-stage.ts @@ -0,0 +1,167 @@ +import { MergedToken } from '../../../types'; + +function pickClosestMecabPos1(token: MergedToken, mecabTokens: MergedToken[]): string | undefined { + if (mecabTokens.length === 0) { + return undefined; + } + + const tokenStart = token.startPos ?? 0; + const tokenEnd = token.endPos ?? tokenStart + token.surface.length; + let bestSurfaceMatchPos1: string | undefined; + let bestSurfaceMatchDistance = Number.MAX_SAFE_INTEGER; + let bestSurfaceMatchEndDistance = Number.MAX_SAFE_INTEGER; + + for (const mecabToken of mecabTokens) { + if (!mecabToken.pos1) { + continue; + } + + if (mecabToken.surface !== token.surface) { + continue; + } + + const mecabStart = mecabToken.startPos ?? 0; + const mecabEnd = mecabToken.endPos ?? mecabStart + mecabToken.surface.length; + const startDistance = Math.abs(mecabStart - tokenStart); + const endDistance = Math.abs(mecabEnd - tokenEnd); + + if ( + startDistance < bestSurfaceMatchDistance || + (startDistance === bestSurfaceMatchDistance && endDistance < bestSurfaceMatchEndDistance) + ) { + bestSurfaceMatchDistance = startDistance; + bestSurfaceMatchEndDistance = endDistance; + bestSurfaceMatchPos1 = mecabToken.pos1; + } + } + + if (bestSurfaceMatchPos1) { + return bestSurfaceMatchPos1; + } + + let bestPos1: string | undefined; + let bestOverlap = 0; + let bestSpan = 0; + let bestStartDistance = Number.MAX_SAFE_INTEGER; + let bestStart = Number.MAX_SAFE_INTEGER; + + for (const mecabToken of mecabTokens) { + if (!mecabToken.pos1) { + continue; + } + + const mecabStart = mecabToken.startPos ?? 
0; + const mecabEnd = mecabToken.endPos ?? mecabStart + mecabToken.surface.length; + const overlapStart = Math.max(tokenStart, mecabStart); + const overlapEnd = Math.min(tokenEnd, mecabEnd); + const overlap = Math.max(0, overlapEnd - overlapStart); + if (overlap === 0) { + continue; + } + + const span = mecabEnd - mecabStart; + if ( + overlap > bestOverlap || + (overlap === bestOverlap && + (Math.abs(mecabStart - tokenStart) < bestStartDistance || + (Math.abs(mecabStart - tokenStart) === bestStartDistance && + (span > bestSpan || (span === bestSpan && mecabStart < bestStart))))) + ) { + bestOverlap = overlap; + bestSpan = span; + bestStartDistance = Math.abs(mecabStart - tokenStart); + bestStart = mecabStart; + bestPos1 = mecabToken.pos1; + } + } + + return bestOverlap > 0 ? bestPos1 : undefined; +} + +function fillMissingPos1BySurfaceSequence( + tokens: MergedToken[], + mecabTokens: MergedToken[], +): MergedToken[] { + const indexedMecabTokens = mecabTokens + .map((token, index) => ({ token, index })) + .filter(({ token }) => token.pos1 && token.surface.trim().length > 0); + + if (indexedMecabTokens.length === 0) { + return tokens; + } + + let cursor = 0; + return tokens.map((token) => { + if (token.pos1 && token.pos1.trim().length > 0) { + return token; + } + + const surface = token.surface.trim(); + if (!surface) { + return token; + } + + let best: { pos1: string; index: number } | null = null; + for (const candidate of indexedMecabTokens) { + if (candidate.token.surface !== surface) { + continue; + } + if (candidate.index < cursor) { + continue; + } + best = { pos1: candidate.token.pos1 as string, index: candidate.index }; + break; + } + + if (!best) { + for (const candidate of indexedMecabTokens) { + if (candidate.token.surface !== surface) { + continue; + } + best = { pos1: candidate.token.pos1 as string, index: candidate.index }; + break; + } + } + + if (!best) { + return token; + } + + cursor = best.index + 1; + return { + ...token, + pos1: best.pos1, + }; 
+ }); +} + +export function enrichTokensWithMecabPos1( + tokens: MergedToken[], + mecabTokens: MergedToken[] | null, +): MergedToken[] { + if (!tokens || tokens.length === 0) { + return tokens; + } + + if (!mecabTokens || mecabTokens.length === 0) { + return tokens; + } + + const overlapEnriched = tokens.map((token) => { + if (token.pos1) { + return token; + } + + const pos1 = pickClosestMecabPos1(token, mecabTokens); + if (!pos1) { + return token; + } + + return { + ...token, + pos1, + }; + }); + + return fillMissingPos1BySurfaceSequence(overlapEnriched, mecabTokens); +} diff --git a/src/core/services/tokenizer/parser-selection-stage.test.ts b/src/core/services/tokenizer/parser-selection-stage.test.ts new file mode 100644 index 0000000..143a1f3 --- /dev/null +++ b/src/core/services/tokenizer/parser-selection-stage.test.ts @@ -0,0 +1,85 @@ +import test from 'node:test'; +import assert from 'node:assert/strict'; +import { selectYomitanParseTokens } from './parser-selection-stage'; + +interface ParseSegmentInput { + text: string; + reading?: string; + headword?: string; +} + +function makeParseItem( + source: string, + lines: ParseSegmentInput[][], +): { + source: string; + index: number; + content: Array< + Array<{ text: string; reading?: string; headwords?: Array> }> + >; +} { + return { + source, + index: 0, + content: lines.map((line) => + line.map((segment) => ({ + text: segment.text, + reading: segment.reading, + headwords: segment.headword ? 
[[{ term: segment.headword }]] : undefined, + })), + ), + }; +} + +test('prefers scanning parser when scanning candidate has more than one token', () => { + const parseResults = [ + makeParseItem('scanning-parser', [ + [{ text: '小園', reading: 'おうえん', headword: '小園' }], + [{ text: 'に', reading: 'に', headword: 'に' }], + ]), + makeParseItem('mecab', [ + [{ text: '小', reading: 'お', headword: '小' }], + [{ text: '園', reading: 'えん', headword: '園' }], + [{ text: 'に', reading: 'に', headword: 'に' }], + ]), + ]; + + const tokens = selectYomitanParseTokens(parseResults, () => false, 'headword'); + assert.equal(tokens?.map((token) => token.surface).join(','), '小園,に'); +}); + +test('prefers mecab candidate when scanning candidate is single token and mecab has better split', () => { + const parseResults = [ + makeParseItem('scanning-parser', [ + [{ text: '俺は公園にいきたい', reading: 'おれはこうえんにいきたい' }], + ]), + makeParseItem('mecab', [ + [{ text: '俺', reading: 'おれ', headword: '俺' }], + [{ text: 'は', reading: 'は', headword: 'は' }], + [{ text: '公園', reading: 'こうえん', headword: '公園' }], + [{ text: 'に', reading: 'に', headword: 'に' }], + [{ text: 'いきたい', reading: 'いきたい', headword: '行きたい' }], + ]), + ]; + + const tokens = selectYomitanParseTokens(parseResults, () => false, 'headword'); + assert.equal(tokens?.map((token) => token.surface).join(','), '俺,は,公園,に,いきたい'); +}); + +test('tie-break prefers fewer suspicious kana fragments', () => { + const parseResults = [ + makeParseItem('mecab-fragmented', [ + [{ text: '俺', reading: 'おれ', headword: '俺' }], + [{ text: 'にい', reading: '', headword: '兄' }], + [{ text: 'きたい', reading: '', headword: '期待' }], + ]), + makeParseItem('mecab', [ + [{ text: '俺', reading: 'おれ', headword: '俺' }], + [{ text: 'に', reading: 'に', headword: 'に' }], + [{ text: '行きたい', reading: 'いきたい', headword: '行きたい' }], + ]), + ]; + + const tokens = selectYomitanParseTokens(parseResults, () => false, 'headword'); + assert.equal(tokens?.map((token) => token.surface).join(','), 
'俺,に,行きたい'); +}); diff --git a/src/core/services/tokenizer/parser-selection-stage.ts b/src/core/services/tokenizer/parser-selection-stage.ts new file mode 100644 index 0000000..0e42107 --- /dev/null +++ b/src/core/services/tokenizer/parser-selection-stage.ts @@ -0,0 +1,281 @@ +import { MergedToken, NPlusOneMatchMode, PartOfSpeech } from '../../../types'; + +interface YomitanParseHeadword { + term?: unknown; +} + +interface YomitanParseSegment { + text?: string; + reading?: string; + headwords?: unknown; +} + +interface YomitanParseResultItem { + source?: unknown; + index?: unknown; + content?: unknown; +} + +type YomitanParseLine = YomitanParseSegment[]; + +export interface YomitanParseCandidate { + source: string; + index: number; + tokens: MergedToken[]; +} + +function isObject(value: unknown): value is Record { + return Boolean(value && typeof value === 'object'); +} + +function isString(value: unknown): value is string { + return typeof value === 'string'; +} + +function resolveKnownWordText( + surface: string, + headword: string, + matchMode: NPlusOneMatchMode, +): string { + return matchMode === 'surface' ? 
surface : headword; +} + +function isKanaChar(char: string): boolean { + const code = char.codePointAt(0); + if (code === undefined) { + return false; + } + + return ( + (code >= 0x3041 && code <= 0x3096) || + (code >= 0x309b && code <= 0x309f) || + (code >= 0x30a0 && code <= 0x30fa) || + (code >= 0x30fd && code <= 0x30ff) + ); +} + +function isYomitanParseLine(value: unknown): value is YomitanParseLine { + if (!Array.isArray(value)) { + return false; + } + + return value.every((segment) => { + if (!isObject(segment)) { + return false; + } + + const candidate = segment as YomitanParseSegment; + return isString(candidate.text); + }); +} + +export function isYomitanParseResultItem(value: unknown): value is YomitanParseResultItem { + if (!isObject(value)) { + return false; + } + if (!isString((value as YomitanParseResultItem).source)) { + return false; + } + if (!Array.isArray((value as YomitanParseResultItem).content)) { + return false; + } + return true; +} + +function isYomitanHeadwordRows(value: unknown): value is YomitanParseHeadword[][] { + return ( + Array.isArray(value) && + value.every( + (group) => + Array.isArray(group) && + group.every((item) => isObject(item) && isString((item as YomitanParseHeadword).term)), + ) + ); +} + +function extractYomitanHeadword(segment: YomitanParseSegment): string { + const headwords = segment.headwords; + if (!isYomitanHeadwordRows(headwords)) { + return ''; + } + + for (const group of headwords) { + if (group.length > 0) { + const firstHeadword = group[0] as YomitanParseHeadword; + if (isString(firstHeadword?.term)) { + return firstHeadword.term; + } + } + } + + return ''; +} + +export function mapYomitanParseResultItemToMergedTokens( + parseResult: YomitanParseResultItem, + isKnownWord: (text: string) => boolean, + knownWordMatchMode: NPlusOneMatchMode, +): YomitanParseCandidate | null { + const content = parseResult.content; + if (!Array.isArray(content) || content.length === 0) { + return null; + } + + const source = 
String(parseResult.source ?? ''); + const index = + typeof parseResult.index === 'number' && Number.isInteger(parseResult.index) + ? parseResult.index + : 0; + + const tokens: MergedToken[] = []; + let charOffset = 0; + let validLineCount = 0; + + for (const line of content) { + if (!isYomitanParseLine(line)) { + continue; + } + validLineCount += 1; + + let combinedSurface = ''; + let combinedReading = ''; + let combinedHeadword = ''; + + for (const segment of line) { + const segmentText = segment.text; + if (!segmentText || segmentText.length === 0) { + continue; + } + + combinedSurface += segmentText; + if (typeof segment.reading === 'string') { + combinedReading += segment.reading; + } + if (!combinedHeadword) { + combinedHeadword = extractYomitanHeadword(segment); + } + } + + if (!combinedSurface) { + continue; + } + + const start = charOffset; + const end = start + combinedSurface.length; + charOffset = end; + const headword = combinedHeadword || combinedSurface; + + tokens.push({ + surface: combinedSurface, + reading: combinedReading, + headword, + startPos: start, + endPos: end, + partOfSpeech: PartOfSpeech.other, + pos1: '', + isMerged: true, + isNPlusOneTarget: false, + isKnown: (() => { + const matchText = resolveKnownWordText(combinedSurface, headword, knownWordMatchMode); + return matchText ? isKnownWord(matchText) : false; + })(), + }); + } + + if (validLineCount === 0 || tokens.length === 0) { + return null; + } + + return { source, index, tokens }; +} + +export function selectBestYomitanParseCandidate( + candidates: YomitanParseCandidate[], +): MergedToken[] | null { + if (candidates.length === 0) { + return null; + } + + const scanningCandidates = candidates.filter( + (candidate) => candidate.source === 'scanning-parser', + ); + const mecabCandidates = candidates.filter((candidate) => candidate.source === 'mecab'); + + const getBestByTokenCount = (items: YomitanParseCandidate[]): YomitanParseCandidate | null => + items.length === 0 + ? 
null + : items.reduce((best, current) => + current.tokens.length > best.tokens.length ? current : best, + ); + + const getCandidateScore = (candidate: YomitanParseCandidate): number => { + const readableTokenCount = candidate.tokens.filter( + (token) => token.reading.trim().length > 0, + ).length; + const suspiciousKanaFragmentCount = candidate.tokens.filter( + (token) => + token.reading.trim().length === 0 && + token.surface.length >= 2 && + Array.from(token.surface).every((char) => isKanaChar(char)), + ).length; + + return readableTokenCount * 100 - suspiciousKanaFragmentCount * 50 - candidate.tokens.length; + }; + + const chooseBestCandidate = (items: YomitanParseCandidate[]): YomitanParseCandidate | null => { + if (items.length === 0) { + return null; + } + + return items.reduce((best, current) => { + const bestScore = getCandidateScore(best); + const currentScore = getCandidateScore(current); + if (currentScore !== bestScore) { + return currentScore > bestScore ? current : best; + } + + if (current.tokens.length !== best.tokens.length) { + return current.tokens.length < best.tokens.length ? current : best; + } + + return best; + }); + }; + + if (scanningCandidates.length > 0) { + const bestScanning = getBestByTokenCount(scanningCandidates); + if (bestScanning && bestScanning.tokens.length > 1) { + return bestScanning.tokens; + } + + const bestMecab = chooseBestCandidate(mecabCandidates); + if (bestMecab && bestMecab.tokens.length > (bestScanning?.tokens.length ?? 0)) { + return bestMecab.tokens; + } + + return bestScanning ? bestScanning.tokens : null; + } + + const multiTokenCandidates = candidates.filter((candidate) => candidate.tokens.length > 1); + const pool = multiTokenCandidates.length > 0 ? multiTokenCandidates : candidates; + const bestCandidate = chooseBestCandidate(pool); + return bestCandidate ? 
bestCandidate.tokens : null; +} + +export function selectYomitanParseTokens( + parseResults: unknown, + isKnownWord: (text: string) => boolean, + knownWordMatchMode: NPlusOneMatchMode, +): MergedToken[] | null { + if (!Array.isArray(parseResults) || parseResults.length === 0) { + return null; + } + + const candidates = parseResults + .filter((item): item is YomitanParseResultItem => isYomitanParseResultItem(item)) + .map((item) => mapYomitanParseResultItemToMergedTokens(item, isKnownWord, knownWordMatchMode)) + .filter((candidate): candidate is YomitanParseCandidate => candidate !== null); + + const bestCandidate = selectBestYomitanParseCandidate(candidates); + return bestCandidate; +} diff --git a/src/core/services/tokenizer/yomitan-parser-runtime.ts b/src/core/services/tokenizer/yomitan-parser-runtime.ts new file mode 100644 index 0000000..5955cfa --- /dev/null +++ b/src/core/services/tokenizer/yomitan-parser-runtime.ts @@ -0,0 +1,154 @@ +import type { BrowserWindow, Extension } from 'electron'; + +interface LoggerLike { + error: (message: string, ...args: unknown[]) => void; +} + +interface YomitanParserRuntimeDeps { + getYomitanExt: () => Extension | null; + getYomitanParserWindow: () => BrowserWindow | null; + setYomitanParserWindow: (window: BrowserWindow | null) => void; + getYomitanParserReadyPromise: () => Promise<void> | null; + setYomitanParserReadyPromise: (promise: Promise<void> | null) => void; + getYomitanParserInitPromise: () => Promise<boolean> | null; + setYomitanParserInitPromise: (promise: Promise<boolean> | null) => void; +} + +async function ensureYomitanParserWindow( + deps: YomitanParserRuntimeDeps, + logger: LoggerLike, +): Promise<boolean> { + const electron = await import('electron'); + const yomitanExt = deps.getYomitanExt(); + if (!yomitanExt) { + return false; + } + + const currentWindow = deps.getYomitanParserWindow(); + if (currentWindow && !currentWindow.isDestroyed()) { + return true; + } + + const existingInitPromise = deps.getYomitanParserInitPromise(); + if 
(existingInitPromise) { + return existingInitPromise; + } + + const initPromise = (async () => { + const { BrowserWindow, session } = electron; + const parserWindow = new BrowserWindow({ + show: false, + width: 800, + height: 600, + webPreferences: { + contextIsolation: true, + nodeIntegration: false, + session: session.defaultSession, + }, + }); + deps.setYomitanParserWindow(parserWindow); + + deps.setYomitanParserReadyPromise( + new Promise<void>((resolve, reject) => { + parserWindow.webContents.once('did-finish-load', () => resolve()); + parserWindow.webContents.once('did-fail-load', (_event, _errorCode, errorDescription) => { + reject(new Error(errorDescription)); + }); + }), + ); + + parserWindow.on('closed', () => { + if (deps.getYomitanParserWindow() === parserWindow) { + deps.setYomitanParserWindow(null); + deps.setYomitanParserReadyPromise(null); + } + }); + + try { + await parserWindow.loadURL(`chrome-extension://${yomitanExt.id}/search.html`); + const readyPromise = deps.getYomitanParserReadyPromise(); + if (readyPromise) { + await readyPromise; + } + + return true; + } catch (err) { + logger.error('Failed to initialize Yomitan parser window:', (err as Error).message); + if (!parserWindow.isDestroyed()) { + parserWindow.destroy(); + } + if (deps.getYomitanParserWindow() === parserWindow) { + deps.setYomitanParserWindow(null); + deps.setYomitanParserReadyPromise(null); + } + + return false; + } finally { + deps.setYomitanParserInitPromise(null); + } + })(); + + deps.setYomitanParserInitPromise(initPromise); + return initPromise; +} + +export async function requestYomitanParseResults( + text: string, + deps: YomitanParserRuntimeDeps, + logger: LoggerLike, +): Promise<unknown[] | null> { + const yomitanExt = deps.getYomitanExt(); + if (!text || !yomitanExt) { + return null; + } + + const isReady = await ensureYomitanParserWindow(deps, logger); + const parserWindow = deps.getYomitanParserWindow(); + if (!isReady || !parserWindow || parserWindow.isDestroyed()) { + return null; + } + 
+ const script = ` + (async () => { + const invoke = (action, params) => + new Promise((resolve, reject) => { + chrome.runtime.sendMessage({ action, params }, (response) => { + if (chrome.runtime.lastError) { + reject(new Error(chrome.runtime.lastError.message)); + return; + } + if (!response || typeof response !== "object") { + reject(new Error("Invalid response from Yomitan backend")); + return; + } + if (response.error) { + reject(new Error(response.error.message || "Yomitan backend error")); + return; + } + resolve(response.result); + }); + }); + + const optionsFull = await invoke("optionsGetFull", undefined); + const profileIndex = optionsFull.profileCurrent; + const scanLength = + optionsFull.profiles?.[profileIndex]?.options?.scanning?.length ?? 40; + + return await invoke("parseText", { + text: ${JSON.stringify(text)}, + optionsContext: { index: profileIndex }, + scanLength, + useInternalParser: true, + useMecabParser: true + }); + })(); + `; + + try { + const parseResults = await parserWindow.webContents.executeJavaScript(script, true); + return Array.isArray(parseResults) ? parseResults : null; + } catch (err) { + logger.error('Yomitan parser request failed:', (err as Error).message); + return null; + } +}