diff --git a/src/core/services/jlpt-vocab-service.ts b/src/core/services/jlpt-vocab-service.ts index f896e4f..696a237 100644 --- a/src/core/services/jlpt-vocab-service.ts +++ b/src/core/services/jlpt-vocab-service.ts @@ -15,6 +15,13 @@ const JLPT_BANK_FILES: { level: JlptLevel; filename: string }[] = [ { level: "N4", filename: "term_meta_bank_4.json" }, { level: "N5", filename: "term_meta_bank_5.json" }, ]; +const JLPT_LEVEL_PRECEDENCE: Record = { + N1: 5, + N2: 4, + N3: 3, + N4: 2, + N5: 1, +}; const NOOP_LOOKUP = (): null => null; @@ -38,6 +45,14 @@ function addEntriesToMap( terms: Map, log: (message: string) => void, ): void { + const shouldUpdateLevel = ( + existingLevel: JlptLevel | undefined, + incomingLevel: JlptLevel, + ): boolean => + existingLevel === undefined || + JLPT_LEVEL_PRECEDENCE[incomingLevel] > + JLPT_LEVEL_PRECEDENCE[existingLevel]; + if (!Array.isArray(rawEntries)) { return; } @@ -61,55 +76,14 @@ function addEntriesToMap( continue; } - if (!terms.has(normalizedTerm)) { + const existingLevel = terms.get(normalizedTerm); + if (shouldUpdateLevel(existingLevel, level)) { terms.set(normalizedTerm, level); continue; } - if (terms.get(normalizedTerm) !== "N1" && level === "N1") { - terms.set(normalizedTerm, level); - continue; - } - - if (terms.get(normalizedTerm) !== "N1" && terms.get(normalizedTerm) !== "N2" && level === "N2") { - terms.set(normalizedTerm, level); - continue; - } - - if ( - terms.get(normalizedTerm) !== "N1" && - terms.get(normalizedTerm) !== "N2" && - terms.get(normalizedTerm) !== "N3" && - level === "N3" - ) { - terms.set(normalizedTerm, level); - continue; - } - - if ( - terms.get(normalizedTerm) !== "N1" && - terms.get(normalizedTerm) !== "N2" && - terms.get(normalizedTerm) !== "N3" && - terms.get(normalizedTerm) !== "N4" && - level === "N4" - ) { - terms.set(normalizedTerm, level); - continue; - } - - if ( - terms.get(normalizedTerm) !== "N1" && - terms.get(normalizedTerm) !== "N2" && - terms.get(normalizedTerm) !== "N3" && - terms.get(normalizedTerm) !== "N4" && - terms.get(normalizedTerm) !== "N5" && - level === "N5" - ) { - terms.set(normalizedTerm, level); - } - log( - `JLPT dictionary already has ${normalizedTerm} as ${terms.get(normalizedTerm)}; keeping that level instead of ${level}`, + `JLPT dictionary already has ${normalizedTerm} as ${existingLevel}; keeping that level instead of ${level}`, ); } } diff --git a/src/core/services/tokenizer-service.ts b/src/core/services/tokenizer-service.ts index 126bdf9..c598068 100644 --- a/src/core/services/tokenizer-service.ts +++ b/src/core/services/tokenizer-service.ts @@ -16,8 +16,8 @@ interface YomitanParseHeadword { } interface YomitanParseSegment { - text?: unknown; - reading?: unknown; + text?: string; + reading?: string; headwords?: unknown; } @@ -27,6 +27,20 @@ interface YomitanParseResultItem { content?: unknown; } +type YomitanParseLine = YomitanParseSegment[]; + +const KATAKANA_TO_HIRAGANA_OFFSET = 0x60; +const KATAKANA_CODEPOINT_START = 0x30a1; +const KATAKANA_CODEPOINT_END = 0x30f6; + +function isObject(value: unknown): value is Record { + return Boolean(value && typeof value === "object"); +} + +function isString(value: unknown): value is string { + return typeof value === "string"; +} + export interface TokenizerServiceDeps { getYomitanExt: () => Extension | null; getYomitanParserWindow: () => BrowserWindow | null; @@ -144,8 +158,8 @@ function normalizeJlptTextForExclusion(text: string): string { continue; } - if (code >= 0x30a1 && code <= 0x30f6) { - normalized += String.fromCodePoint(code - 0x60); + if (code >= KATAKANA_CODEPOINT_START && code <= KATAKANA_CODEPOINT_END) { + normalized += String.fromCodePoint(code - KATAKANA_TO_HIRAGANA_OFFSET); continue; } @@ -238,6 +252,67 @@ function isJlptEligibleToken(token: MergedToken): boolean { return true; } +function isYomitanParseResultItem( + value: unknown, +): value is YomitanParseResultItem { + if (!isObject(value)) { + return false; + } + if ((value as YomitanParseResultItem).source !== "scanning-parser") { + return false; + } + if (!Array.isArray((value as YomitanParseResultItem).content)) { + return false; + } + return true; +} + +function isYomitanParseLine(value: unknown): value is YomitanParseLine { + if (!Array.isArray(value)) { + return false; + } + + return value.every((segment) => { + if (!isObject(segment)) { + return false; + } + + const candidate = segment as YomitanParseSegment; + return isString(candidate.text); + }); +} + +function isYomitanHeadwordRows(value: unknown): value is YomitanParseHeadword[][] { + return ( + Array.isArray(value) && + value.every( + (group) => + Array.isArray(group) && + group.every((item) => + isObject(item) && isString((item as YomitanParseHeadword).term), + ), + ) + ); +} + +function extractYomitanHeadword(segment: YomitanParseSegment): string { + const headwords = segment.headwords; + if (!isYomitanHeadwordRows(headwords)) { + return ""; + } + + for (const group of headwords) { + if (group.length > 0) { + const firstHeadword = group[0] as YomitanParseHeadword; + if (isString(firstHeadword?.term)) { + return firstHeadword.term; + } + } + } + + return ""; +} + function applyJlptMarking( tokens: MergedToken[], getJlptLevel: (text: string) => JlptLevel | null, @@ -250,46 +325,25 @@ function applyJlptMarking( const primaryLevel = getJlptLevel(resolveJlptLookupText(token)); const fallbackLevel = getJlptLevel(token.surface); - return { - ...token, - jlptLevel: primaryLevel ?? fallbackLevel ?? token.jlptLevel, - }; + return { + ...token, + jlptLevel: primaryLevel ?? fallbackLevel ?? token.jlptLevel, + }; }); } -function extractYomitanHeadword(segment: YomitanParseSegment): string { - const headwords = segment.headwords; - if (!Array.isArray(headwords) || headwords.length === 0) { - return ""; - } - - const firstGroup = headwords[0]; - if (!Array.isArray(firstGroup) || firstGroup.length === 0) { - return ""; - } - - const firstHeadword = firstGroup[0] as YomitanParseHeadword; - return typeof firstHeadword?.term === "string" ? firstHeadword.term : ""; -} - function mapYomitanParseResultsToMergedTokens( parseResults: unknown, isKnownWord: (text: string) => boolean, knownWordMatchMode: NPlusOneMatchMode, - getJlptLevel: (text: string) => JlptLevel | null, ): MergedToken[] | null { if (!Array.isArray(parseResults) || parseResults.length === 0) { return null; } - const scanningItems = parseResults.filter((item) => { - const resultItem = item as YomitanParseResultItem; - return ( - resultItem && - resultItem.source === "scanning-parser" && - Array.isArray(resultItem.content) - ); - }) as YomitanParseResultItem[]; + const scanningItems = parseResults.filter( + (item): item is YomitanParseResultItem => isYomitanParseResultItem(item), + ); if (scanningItems.length === 0) { return null; @@ -304,24 +358,21 @@ function mapYomitanParseResultsToMergedTokens( const tokens: MergedToken[] = []; let charOffset = 0; + let validLineCount = 0; for (const line of content) { - if (!Array.isArray(line)) { + if (!isYomitanParseLine(line)) { continue; } + validLineCount += 1; let surface = ""; let reading = ""; let headword = ""; - for (const rawSegment of line) { - const segment = rawSegment as YomitanParseSegment; - if (!segment || typeof segment !== "object") { - continue; - } - + for (const segment of line) { const segmentText = segment.text; - if (typeof segmentText !== "string" || segmentText.length === 0) { + if (!segmentText || segmentText.length === 0) { continue; } @@ -365,6 +416,9 @@ function mapYomitanParseResultsToMergedTokens( }); } + if (validLineCount === 0) { + return null; + } return tokens.length > 0 ? tokens : null; } @@ -428,14 +482,22 @@ async function enrichYomitanPos1( try { mecabTokens = await deps.tokenizeWithMecab(text); } catch (err) { + const error = err as Error; console.warn( "Failed to enrich Yomitan tokens with MeCab POS:", - (err as Error).message, + error.message, + `tokenCount=${tokens.length}`, + `textLength=${text.length}`, ); return tokens; } if (!mecabTokens || mecabTokens.length === 0) { + console.warn( + "MeCab enrichment returned no tokens; preserving Yomitan token output.", + `tokenCount=${tokens.length}`, + `textLength=${text.length}`, + ); return tokens; } @@ -591,11 +653,10 @@ async function parseWithYomitanInternalParser( script, true, ); - const yomitanTokens = mapYomitanParseResultsToMergedTokens( + const yomitanTokens = mapYomitanParseResultsToMergedTokens( parseResults, deps.isKnownWord, deps.getKnownWordMatchMode(), - deps.getJlptLevel, ); if (!yomitanTokens || yomitanTokens.length === 0) { return null; diff --git a/src/main.ts b/src/main.ts index 5349592..2704a7f 100644 --- a/src/main.ts +++ b/src/main.ts @@ -469,106 +469,37 @@ function loadSubtitlePosition(): SubtitlePosition | null { function getJlptDictionarySearchPaths(): string[] { const homeDir = os.homedir(); - const userDataPath = app.getPath("userData"); - return [ + const dictionaryRoots = [ + // Source checkout paths (development + source tree) path.join(__dirname, "..", "..", "vendor", "yomitan-jlpt-vocab"), - path.join( - __dirname, - "..", - "..", - "vendor", - "yomitan-jlpt-vocab", - "yomitan-jlpt-vocab", - ), - path.join(__dirname, "..", "..", "..", "vendor", "yomitan-jlpt-vocab"), - path.join( - __dirname, - "..", - "..", - "..", - "vendor", - "yomitan-jlpt-vocab", - "yomitan-jlpt-vocab", - ), - path.join(process.resourcesPath, "yomitan-jlpt-vocab"), - path.join( - process.resourcesPath, - "yomitan-jlpt-vocab", - "yomitan-jlpt-vocab", - ), path.join(app.getAppPath(), "vendor", "yomitan-jlpt-vocab"), - path.join( - app.getAppPath(), - "vendor", - "yomitan-jlpt-vocab", - "yomitan-jlpt-vocab", - ), + // Runtime package bundle paths + path.join(process.resourcesPath, "yomitan-jlpt-vocab"), path.join(process.resourcesPath, "app.asar", "vendor", "yomitan-jlpt-vocab"), - path.join( - process.resourcesPath, - "app.asar", - "vendor", - "yomitan-jlpt-vocab", - "yomitan-jlpt-vocab", - ), - path.join(USER_DATA_PATH, "yomitan-jlpt-vocab"), - path.join(USER_DATA_PATH, "yomitan-jlpt-vocab", "yomitan-jlpt-vocab"), - path.join(userDataPath, "yomitan-jlpt-vocab"), - path.join(userDataPath, "yomitan-jlpt-vocab", "yomitan-jlpt-vocab"), - path.join(homeDir, ".config", "SubMiner", "yomitan-jlpt-vocab"), - path.join( - homeDir, - ".config", - "SubMiner", - "yomitan-jlpt-vocab", - "yomitan-jlpt-vocab", - ), - path.join(homeDir, ".config", "subminer", "yomitan-jlpt-vocab"), - path.join( - homeDir, - ".config", - "subminer", - "yomitan-jlpt-vocab", - "yomitan-jlpt-vocab", - ), - path.join( - homeDir, - "Library", - "Application Support", - "SubMiner", - "yomitan-jlpt-vocab", - ), - path.join( - homeDir, - "Library", - "Application Support", - "SubMiner", - "yomitan-jlpt-vocab", - "yomitan-jlpt-vocab", - ), - path.join( - homeDir, - "Library", - "Application Support", - "subminer", - "yomitan-jlpt-vocab", - ), - path.join( - homeDir, - "Library", - "Application Support", - "subminer", - "yomitan-jlpt-vocab", - "yomitan-jlpt-vocab", - ), - path.join(process.cwd(), "vendor", "yomitan-jlpt-vocab"), - path.join( - process.cwd(), - "vendor", - "yomitan-jlpt-vocab", - "yomitan-jlpt-vocab", - ), + // User-configurable override locations + USER_DATA_PATH, + app.getPath("userData"), + path.join(homeDir, ".config", "SubMiner"), + path.join(homeDir, ".config", "subminer"), + path.join(homeDir, "Library", "Application Support", "SubMiner"), + path.join(homeDir, "Library", "Application Support", "subminer"), + // CLI invocation path (when launched from project root) + process.cwd(), ]; + + const searchPaths: string[] = []; + for (const dictionaryRoot of dictionaryRoots) { + searchPaths.push(dictionaryRoot); + searchPaths.push(path.join(dictionaryRoot, "vendor", "yomitan-jlpt-vocab")); + searchPaths.push(path.join(dictionaryRoot, "yomitan-jlpt-vocab")); + } + + const uniquePaths = new Set(); + for (const searchPath of searchPaths) { + uniquePaths.add(searchPath); + } + + return [...uniquePaths]; } async function initializeJlptDictionaryLookup(): Promise { diff --git a/src/renderer/subtitle-render.test.ts b/src/renderer/subtitle-render.test.ts index 1d7d624..d53fffe 100644 --- a/src/renderer/subtitle-render.test.ts +++ b/src/renderer/subtitle-render.test.ts @@ -65,7 +65,7 @@ test("JLPT CSS rules use underline-only styling in renderer stylesheet", () => { assert.ok(block.length > 0, `word-jlpt-n${level} class should exist`); assert.match(block, /text-decoration-line:\s*underline;/); assert.match(block, /text-decoration-thickness:\s*2px;/); - assert.match(block, /text-underline-offset:\s*2px;/); + assert.match(block, /text-underline-offset:\s*4px;/); assert.match(block, /color:\s*inherit;/); } });