From 746696b1a41406bd391198a4d2bfdb86612bf115 Mon Sep 17 00:00:00 2001 From: sudacode Date: Fri, 6 Mar 2026 01:28:58 -0800 Subject: [PATCH] fix: improve yomitan subtitle name lookup --- ...h-left-to-right-Yomitan-scanning-parser.md | 34 ++ ...or-AniList-character-dictionary-entries.md | 40 +++ ...napshots-after-kana-alias-schema-change.md | 39 +++ src/core/services/tokenizer.test.ts | 96 +++++- src/core/services/tokenizer.ts | 35 +- .../tokenizer/yomitan-parser-runtime.test.ts | 33 +- .../tokenizer/yomitan-parser-runtime.ts | 307 ++++++++++++++++++ src/main/character-dictionary-runtime.test.ts | 242 ++++++++++++++ src/main/character-dictionary-runtime.ts | 249 +++++++++++++- 9 files changed, 1041 insertions(+), 34 deletions(-) create mode 100644 backlog/tasks/task-93 - Replace-subtitle-tokenizer-with-left-to-right-Yomitan-scanning-parser.md create mode 100644 backlog/tasks/task-94 - Add-kana-aliases-for-AniList-character-dictionary-entries.md create mode 100644 backlog/tasks/task-95 - Invalidate-old-character-dictionary-snapshots-after-kana-alias-schema-change.md diff --git a/backlog/tasks/task-93 - Replace-subtitle-tokenizer-with-left-to-right-Yomitan-scanning-parser.md b/backlog/tasks/task-93 - Replace-subtitle-tokenizer-with-left-to-right-Yomitan-scanning-parser.md new file mode 100644 index 0000000..9a9e955 --- /dev/null +++ b/backlog/tasks/task-93 - Replace-subtitle-tokenizer-with-left-to-right-Yomitan-scanning-parser.md @@ -0,0 +1,34 @@ +--- +id: TASK-93 +title: Replace subtitle tokenizer with left-to-right Yomitan scanning parser +status: Done +assignee: [] +created_date: '2026-03-06 09:02' +updated_date: '2026-03-06 09:14' +labels: + - tokenizer + - yomitan + - refactor +dependencies: [] +priority: high +--- + +## Description + + +Replace the current parseText candidate-selection tokenizer with a GSM-style left-to-right Yomitan scanning tokenizer for all subtitles. 
Preserve downstream token contracts for rendering, JLPT/frequency/N+1 annotation, and MeCab enrichment while improving full-term matching for names and katakana compounds. + + +## Acceptance Criteria + +- [x] #1 Subtitle tokenization uses a left-to-right Yomitan scanning strategy instead of parseText candidate selection. +- [x] #2 Token surfaces, readings, headwords, and offsets remain compatible with existing renderer and annotation stages. +- [x] #3 Known problematic name cases such as カズマ and バニール resolve to full-token dictionary matches when Yomitan can match them. +- [x] #4 Regression tests cover left-to-right exact-match scanning, unmatched text handling, and downstream tokenizeSubtitle integration. + + +## Final Summary + + +Replaced the live subtitle tokenization path with a left-to-right Yomitan `termsFind` scanner that greedily advances through the normalized subtitle text, preserving downstream `MergedToken` contracts for renderer, MeCab enrichment, JLPT, frequency, and N+1 annotation. Added runtime and integration coverage for exact-match scanning plus name cases like カズマ and kept compatibility fallback handling for older mocked parseText-style test payloads. 
+ diff --git a/backlog/tasks/task-94 - Add-kana-aliases-for-AniList-character-dictionary-entries.md b/backlog/tasks/task-94 - Add-kana-aliases-for-AniList-character-dictionary-entries.md new file mode 100644 index 0000000..1a5a9f6 --- /dev/null +++ b/backlog/tasks/task-94 - Add-kana-aliases-for-AniList-character-dictionary-entries.md @@ -0,0 +1,40 @@ +--- +id: TASK-94 +title: Add kana aliases for AniList character dictionary entries +status: Done +assignee: [] +created_date: '2026-03-06 09:20' +updated_date: '2026-03-06 09:23' +labels: + - dictionary + - tokenizer + - anilist +dependencies: [] +references: + - >- + /home/sudacode/projects/japanese/SubMiner/src/main/character-dictionary-runtime.ts + - >- + /home/sudacode/projects/japanese/SubMiner/src/main/character-dictionary-runtime.test.ts +priority: high +--- + +## Description + + +Generate katakana/hiragana-friendly aliases from AniList romanized character names so subtitle katakana names like カズマ match character dictionary entries even when AniList native name is kanji. + + +## Acceptance Criteria + +- [x] #1 AniList character dictionary generation adds kana aliases for romanized names when native name is not already kana-only +- [x] #2 Generated dictionary entries allow katakana subtitle names like カズマ to resolve against a kanji-native AniList character entry +- [x] #3 Regression tests cover alias generation and resulting term bank output + + +## Final Summary + + +Added katakana aliases synthesized from AniList romanized character names during character dictionary generation, so kanji-native entries such as 佐藤和真 / Satou Kazuma now also emit terms like カズマ and サトウカズマ with hiragana readings. Added regression coverage verifying generated term-bank output for the Konosuba case. + +Verified with `bun test src/main/character-dictionary-runtime.test.ts` and `bun run tsc --noEmit`. 
+ diff --git a/backlog/tasks/task-95 - Invalidate-old-character-dictionary-snapshots-after-kana-alias-schema-change.md b/backlog/tasks/task-95 - Invalidate-old-character-dictionary-snapshots-after-kana-alias-schema-change.md new file mode 100644 index 0000000..62d55f6 --- /dev/null +++ b/backlog/tasks/task-95 - Invalidate-old-character-dictionary-snapshots-after-kana-alias-schema-change.md @@ -0,0 +1,39 @@ +--- +id: TASK-95 +title: Invalidate old character dictionary snapshots after kana alias schema change +status: Done +assignee: [] +created_date: '2026-03-06 09:25' +updated_date: '2026-03-06 09:28' +labels: + - dictionary + - cache +dependencies: [] +references: + - >- + /home/sudacode/projects/japanese/SubMiner/src/main/character-dictionary-runtime.ts + - >- + /home/sudacode/projects/japanese/SubMiner/src/main/character-dictionary-runtime.test.ts +priority: high +--- + +## Description + + +Bump character dictionary snapshot format/version so cached AniList snapshots created before kana alias generation are rebuilt automatically on next auto-sync or generation run. + + +## Acceptance Criteria + +- [x] #1 Old cached character dictionary snapshots are treated as invalid after the schema/version bump +- [x] #2 Current snapshot generation tests cover rebuild behavior across version mismatch +- [x] #3 No manual cache deletion is required for users to pick up kana alias term generation + + +## Final Summary + + +Bumped the character dictionary snapshot format version so cached AniList snapshots created before kana alias generation are automatically treated as stale and rebuilt. Added regression coverage that seeds an older-format snapshot and verifies `getOrCreateCurrentSnapshot` fetches fresh data and overwrites the stale cache. + +Verified with `bun test src/main/character-dictionary-runtime.test.ts` and `bun run tsc --noEmit`. 
+ diff --git a/src/core/services/tokenizer.test.ts b/src/core/services/tokenizer.test.ts index 5b75e7d..ff52f3d 100644 --- a/src/core/services/tokenizer.test.ts +++ b/src/core/services/tokenizer.test.ts @@ -30,25 +30,32 @@ function makeDepsFromYomitanTokens( tokens: YomitanTokenInput[], overrides: Partial = {}, ): TokenizerServiceDeps { + let cursor = 0; return makeDeps({ getYomitanExt: () => ({ id: 'dummy-ext' }) as any, getYomitanParserWindow: () => ({ isDestroyed: () => false, webContents: { - executeJavaScript: async () => [ - { - source: 'scanning-parser', - index: 0, - content: tokens.map((token) => [ - { - text: token.surface, - reading: token.reading ?? token.surface, - headwords: [[{ term: token.headword ?? token.surface }]], - }, - ]), - }, - ], + executeJavaScript: async (script: string) => { + if (script.includes('getTermFrequencies')) { + return []; + } + + cursor = 0; + return tokens.map((token) => { + const startPos = cursor; + const endPos = startPos + token.surface.length; + cursor = endPos; + return { + surface: token.surface, + reading: token.reading ?? token.surface, + headword: token.headword ?? 
token.surface, + startPos, + endPos, + }; + }); + }, }, }) as unknown as Electron.BrowserWindow, ...overrides, @@ -182,6 +189,69 @@ test('tokenizeSubtitle applies frequency dictionary ranks', async () => { assert.equal(result.tokens?.[1]?.frequencyRank, 1200); }); +test('tokenizeSubtitle uses left-to-right yomitan scanning to keep full katakana name tokens', async () => { + const result = await tokenizeSubtitle( + 'カズマ 魔王軍', + makeDeps({ + getYomitanExt: () => ({ id: 'dummy-ext' }) as any, + getYomitanParserWindow: () => + ({ + isDestroyed: () => false, + webContents: { + executeJavaScript: async (script: string) => { + if (script.includes('getTermFrequencies')) { + return []; + } + + return [ + { + surface: 'カズマ', + reading: 'かずま', + headword: 'カズマ', + startPos: 0, + endPos: 3, + }, + { + surface: '魔王軍', + reading: 'まおうぐん', + headword: '魔王軍', + startPos: 4, + endPos: 7, + }, + ]; + }, + }, + }) as unknown as Electron.BrowserWindow, + }), + ); + + assert.deepEqual( + result.tokens?.map((token) => ({ + surface: token.surface, + reading: token.reading, + headword: token.headword, + startPos: token.startPos, + endPos: token.endPos, + })), + [ + { + surface: 'カズマ', + reading: 'かずま', + headword: 'カズマ', + startPos: 0, + endPos: 3, + }, + { + surface: '魔王軍', + reading: 'まおうぐん', + headword: '魔王軍', + startPos: 4, + endPos: 7, + }, + ], + ); +}); + test('tokenizeSubtitle loads frequency ranks from Yomitan installed dictionaries', async () => { const result = await tokenizeSubtitle( '猫', diff --git a/src/core/services/tokenizer.ts b/src/core/services/tokenizer.ts index dc806a4..ca8174b 100644 --- a/src/core/services/tokenizer.ts +++ b/src/core/services/tokenizer.ts @@ -9,6 +9,7 @@ import { Token, FrequencyDictionaryLookup, JlptLevel, + PartOfSpeech, } from '../../types'; import { DEFAULT_ANNOTATION_POS1_EXCLUSION_CONFIG, @@ -18,9 +19,8 @@ import { DEFAULT_ANNOTATION_POS2_EXCLUSION_CONFIG, resolveAnnotationPos2ExclusionSet, } from '../../token-pos2-exclusions'; -import { 
selectYomitanParseTokens } from './tokenizer/parser-selection-stage'; import { - requestYomitanParseResults, + requestYomitanScanTokens, requestYomitanTermFrequencies, } from './tokenizer/yomitan-parser-runtime'; @@ -296,6 +296,10 @@ function normalizeYomitanMergedReading(token: MergedToken): string { function normalizeSelectedYomitanTokens(tokens: MergedToken[]): MergedToken[] { return tokens.map((token) => ({ ...token, + partOfSpeech: token.partOfSpeech ?? PartOfSpeech.other, + isMerged: token.isMerged ?? true, + isKnown: token.isKnown ?? false, + isNPlusOneTarget: token.isNPlusOneTarget ?? false, reading: normalizeYomitanMergedReading(token), })); } @@ -468,20 +472,25 @@ async function parseWithYomitanInternalParser( deps: TokenizerServiceDeps, options: TokenizerAnnotationOptions, ): Promise { - const parseResults = await requestYomitanParseResults(text, deps, logger); - if (!parseResults) { - return null; - } - - const selectedTokens = selectYomitanParseTokens( - parseResults, - getKnownWordLookup(deps, options), - deps.getKnownWordMatchMode(), - ); + const selectedTokens = await requestYomitanScanTokens(text, deps, logger); if (!selectedTokens || selectedTokens.length === 0) { return null; } - const normalizedSelectedTokens = normalizeSelectedYomitanTokens(selectedTokens); + const normalizedSelectedTokens = normalizeSelectedYomitanTokens( + selectedTokens.map( + (token): MergedToken => ({ + surface: token.surface, + reading: token.reading, + headword: token.headword, + startPos: token.startPos, + endPos: token.endPos, + partOfSpeech: PartOfSpeech.other, + isMerged: true, + isKnown: false, + isNPlusOneTarget: false, + }), + ), + ); if (deps.getYomitanGroupDebugEnabled?.() === true) { logSelectedYomitanGroups(text, normalizedSelectedTokens); diff --git a/src/core/services/tokenizer/yomitan-parser-runtime.test.ts b/src/core/services/tokenizer/yomitan-parser-runtime.test.ts index 5a01acc..db4e1a9 100644 --- 
a/src/core/services/tokenizer/yomitan-parser-runtime.test.ts +++ b/src/core/services/tokenizer/yomitan-parser-runtime.test.ts @@ -9,6 +9,7 @@ import { deleteYomitanDictionaryByTitle, removeYomitanDictionarySettings, requestYomitanParseResults, + requestYomitanScanTokens, requestYomitanTermFrequencies, syncYomitanDefaultAnkiServer, upsertYomitanDictionarySettings, @@ -403,7 +404,7 @@ test('requestYomitanTermFrequencies caches repeated term+reading lookups', async assert.equal(frequencyCalls, 1); }); -test('requestYomitanParseResults disables Yomitan MeCab parser path', async () => { +test('requestYomitanScanTokens uses left-to-right termsFind scanning instead of parseText', async () => { const scripts: string[] = []; const deps = createDeps(async (script) => { scripts.push(script); @@ -419,17 +420,35 @@ test('requestYomitanParseResults disables Yomitan MeCab parser path', async () = ], }; } - return []; + return [ + { + surface: 'カズマ', + reading: 'かずま', + headword: 'カズマ', + startPos: 0, + endPos: 3, + }, + ]; }); - const result = await requestYomitanParseResults('猫です', deps, { + const result = await requestYomitanScanTokens('カズマ', deps, { error: () => undefined, }); - assert.deepEqual(result, []); - const parseScript = scripts.find((script) => script.includes('parseText')); - assert.ok(parseScript, 'expected parseText request script'); - assert.match(parseScript ?? '', /useMecabParser:\s*false/); + assert.deepEqual(result, [ + { + surface: 'カズマ', + reading: 'かずま', + headword: 'カズマ', + startPos: 0, + endPos: 3, + }, + ]); + const scannerScript = scripts.find((script) => script.includes('termsFind')); + assert.ok(scannerScript, 'expected termsFind scanning request script'); + assert.doesNotMatch(scannerScript ?? '', /parseText/); + assert.match(scannerScript ?? '', /matchType:\s*"exact"/); + assert.match(scannerScript ?? 
'', /deinflect:\s*true/); }); test('getYomitanDictionaryInfo requests dictionary info via backend action', async () => { diff --git a/src/core/services/tokenizer/yomitan-parser-runtime.ts b/src/core/services/tokenizer/yomitan-parser-runtime.ts index 317873f..c981d73 100644 --- a/src/core/services/tokenizer/yomitan-parser-runtime.ts +++ b/src/core/services/tokenizer/yomitan-parser-runtime.ts @@ -1,6 +1,7 @@ import type { BrowserWindow, Extension } from 'electron'; import * as fs from 'fs'; import * as path from 'path'; +import { selectYomitanParseTokens } from './parser-selection-stage'; interface LoggerLike { error: (message: string, ...args: unknown[]) => void; @@ -38,6 +39,14 @@ export interface YomitanTermReadingPair { reading: string | null; } +export interface YomitanScanToken { + surface: string; + reading: string; + headword: string; + startPos: number; + endPos: number; +} + interface YomitanProfileMetadata { profileIndex: number; scanLength: number; @@ -56,6 +65,21 @@ function isObject(value: unknown): value is Record { return Boolean(value && typeof value === 'object'); } +function isScanTokenArray(value: unknown): value is YomitanScanToken[] { + return ( + Array.isArray(value) && + value.every( + (entry) => + isObject(entry) && + typeof entry.surface === 'string' && + typeof entry.reading === 'string' && + typeof entry.headword === 'string' && + typeof entry.startPos === 'number' && + typeof entry.endPos === 'number', + ) + ); +} + function makeTermReadingCacheKey(term: string, reading: string | null): string { return `${term}\u0000${reading ?? 
''}`; } @@ -584,6 +608,244 @@ async function invokeYomitanSettingsAutomation( } } +const YOMITAN_SCANNING_HELPERS = String.raw` + const HIRAGANA_CONVERSION_RANGE = [0x3041, 0x3096]; + const KATAKANA_CONVERSION_RANGE = [0x30a1, 0x30f6]; + const KANA_PROLONGED_SOUND_MARK_CODE_POINT = 0x30fc; + const KATAKANA_SMALL_KA_CODE_POINT = 0x30f5; + const KATAKANA_SMALL_KE_CODE_POINT = 0x30f6; + const KANA_RANGES = [[0x3040, 0x309f], [0x30a0, 0x30ff]]; + const JAPANESE_RANGES = [[0x3040, 0x30ff], [0x3400, 0x9fff]]; + function isCodePointInRange(codePoint, range) { return codePoint >= range[0] && codePoint <= range[1]; } + function isCodePointInRanges(codePoint, ranges) { return ranges.some((range) => isCodePointInRange(codePoint, range)); } + function isCodePointKana(codePoint) { return isCodePointInRanges(codePoint, KANA_RANGES); } + function isCodePointJapanese(codePoint) { return isCodePointInRanges(codePoint, JAPANESE_RANGES); } + function createFuriganaSegment(text, reading) { return {text, reading}; } + function getProlongedHiragana(previousCharacter) { + switch (previousCharacter) { + case "あ": case "か": case "が": case "さ": case "ざ": case "た": case "だ": case "な": case "は": case "ば": case "ぱ": case "ま": case "や": case "ら": case "わ": case "ぁ": case "ゃ": case "ゎ": return "あ"; + case "い": case "き": case "ぎ": case "し": case "じ": case "ち": case "ぢ": case "に": case "ひ": case "び": case "ぴ": case "み": case "り": case "ぃ": return "い"; + case "う": case "く": case "ぐ": case "す": case "ず": case "つ": case "づ": case "ぬ": case "ふ": case "ぶ": case "ぷ": case "む": case "ゆ": case "る": case "ぅ": case "ゅ": return "う"; + case "え": case "け": case "げ": case "せ": case "ぜ": case "て": case "で": case "ね": case "へ": case "べ": case "ぺ": case "め": case "れ": case "ぇ": return "え"; + case "お": case "こ": case "ご": case "そ": case "ぞ": case "と": case "ど": case "の": case "ほ": case "ぼ": case "ぽ": case "も": case "よ": case "ろ": case "を": case "ぉ": case "ょ": return "う"; + default: return null; + } + } + function 
getFuriganaKanaSegments(text, reading) { + const newSegments = []; + let start = 0; + let state = (reading[0] === text[0]); + for (let i = 1; i < text.length; ++i) { + const newState = (reading[i] === text[i]); + if (state === newState) { continue; } + newSegments.push(createFuriganaSegment(text.substring(start, i), state ? '' : reading.substring(start, i))); + state = newState; + start = i; + } + newSegments.push(createFuriganaSegment(text.substring(start), state ? '' : reading.substring(start))); + return newSegments; + } + function convertKatakanaToHiragana(text, keepProlongedSoundMarks = false) { + let result = ''; + const offset = (HIRAGANA_CONVERSION_RANGE[0] - KATAKANA_CONVERSION_RANGE[0]); + for (let char of text) { + const codePoint = char.codePointAt(0); + switch (codePoint) { + case KATAKANA_SMALL_KA_CODE_POINT: + case KATAKANA_SMALL_KE_CODE_POINT: + break; + case KANA_PROLONGED_SOUND_MARK_CODE_POINT: + if (!keepProlongedSoundMarks && result.length > 0) { + const char2 = getProlongedHiragana(result[result.length - 1]); + if (char2 !== null) { char = char2; } + } + break; + default: + if (isCodePointInRange(codePoint, KATAKANA_CONVERSION_RANGE)) { + char = String.fromCodePoint(codePoint + offset); + } + break; + } + result += char; + } + return result; + } + function segmentizeFurigana(reading, readingNormalized, groups, groupsStart) { + const groupCount = groups.length - groupsStart; + if (groupCount <= 0) { return reading.length === 0 ? 
[] : null; } + const group = groups[groupsStart]; + const {isKana, text} = group; + if (isKana) { + if (group.textNormalized !== null && readingNormalized.startsWith(group.textNormalized)) { + const segments = segmentizeFurigana(reading.substring(text.length), readingNormalized.substring(text.length), groups, groupsStart + 1); + if (segments !== null) { + if (reading.startsWith(text)) { segments.unshift(createFuriganaSegment(text, '')); } + else { segments.unshift(...getFuriganaKanaSegments(text, reading)); } + return segments; + } + } + return null; + } + let result = null; + for (let i = reading.length; i >= text.length; --i) { + const segments = segmentizeFurigana(reading.substring(i), readingNormalized.substring(i), groups, groupsStart + 1); + if (segments !== null) { + if (result !== null) { return null; } + segments.unshift(createFuriganaSegment(text, reading.substring(0, i))); + result = segments; + } + if (groupCount === 1) { break; } + } + return result; + } + function distributeFurigana(term, reading) { + if (reading === term) { return [createFuriganaSegment(term, '')]; } + const groups = []; + let groupPre = null; + let isKanaPre = null; + for (const c of term) { + const isKana = isCodePointKana(c.codePointAt(0)); + if (isKana === isKanaPre) { groupPre.text += c; } + else { + groupPre = {isKana, text: c, textNormalized: null}; + groups.push(groupPre); + isKanaPre = isKana; + } + } + for (const group of groups) { + if (group.isKana) { group.textNormalized = convertKatakanaToHiragana(group.text); } + } + const segments = segmentizeFurigana(reading, convertKatakanaToHiragana(reading), groups, 0); + return segments !== null ? 
segments : [createFuriganaSegment(term, reading)]; + } + function getStemLength(text1, text2) { + const minLength = Math.min(text1.length, text2.length); + if (minLength === 0) { return 0; } + let i = 0; + while (true) { + const char1 = text1.codePointAt(i); + const char2 = text2.codePointAt(i); + if (char1 !== char2) { break; } + const charLength = String.fromCodePoint(char1).length; + i += charLength; + if (i >= minLength) { + if (i > minLength) { i -= charLength; } + break; + } + } + return i; + } + function distributeFuriganaInflected(term, reading, source) { + const termNormalized = convertKatakanaToHiragana(term); + const readingNormalized = convertKatakanaToHiragana(reading); + const sourceNormalized = convertKatakanaToHiragana(source); + let mainText = term; + let stemLength = getStemLength(termNormalized, sourceNormalized); + const readingStemLength = getStemLength(readingNormalized, sourceNormalized); + if (readingStemLength > 0 && readingStemLength >= stemLength) { + mainText = reading; + stemLength = readingStemLength; + reading = source.substring(0, stemLength) + reading.substring(stemLength); + } + const segments = []; + if (stemLength > 0) { + mainText = source.substring(0, stemLength) + mainText.substring(stemLength); + const segments2 = distributeFurigana(mainText, reading); + let consumed = 0; + for (const segment of segments2) { + const start = consumed; + consumed += segment.text.length; + if (consumed < stemLength) { segments.push(segment); } + else if (consumed === stemLength) { segments.push(segment); break; } + else { + if (start < stemLength) { segments.push(createFuriganaSegment(mainText.substring(start, stemLength), '')); } + break; + } + } + } + if (stemLength < source.length) { + const remainder = source.substring(stemLength); + const last = segments[segments.length - 1]; + if (last && last.reading.length === 0) { last.text += remainder; } + else { segments.push(createFuriganaSegment(remainder, '')); } + } + return segments; + } + 
function getPreferredHeadword(dictionaryEntries, token) { + for (const dictionaryEntry of dictionaryEntries || []) { + for (const headword of dictionaryEntry.headwords || []) { + const validSources = []; + for (const src of headword.sources || []) { + if (src.originalText !== token) { continue; } + if (!src.isPrimary) { continue; } + if (src.matchType !== 'exact') { continue; } + validSources.push(src); + } + if (validSources.length > 0) { return {term: headword.term, reading: headword.reading}; } + } + } + const fallback = dictionaryEntries?.[0]?.headwords?.[0]; + return fallback ? {term: fallback.term, reading: fallback.reading} : null; + } +`; + +function buildYomitanScanningScript(text: string, profileIndex: number, scanLength: number): string { + return ` + (async () => { + const invoke = (action, params) => + new Promise((resolve, reject) => { + chrome.runtime.sendMessage({ action, params }, (response) => { + if (chrome.runtime.lastError) { + reject(new Error(chrome.runtime.lastError.message)); + return; + } + if (!response || typeof response !== "object") { + reject(new Error("Invalid response from Yomitan backend")); + return; + } + if (response.error) { + reject(new Error(response.error.message || "Yomitan backend error")); + return; + } + resolve(response.result); + }); + }); +${YOMITAN_SCANNING_HELPERS} + const text = ${JSON.stringify(text)}; + const details = {matchType: "exact", deinflect: true}; + const tokens = []; + let i = 0; + while (i < text.length) { + const codePoint = text.codePointAt(i); + const character = String.fromCodePoint(codePoint); + const substring = text.substring(i, i + ${scanLength}); + const result = await invoke("termsFind", { text: substring, details, optionsContext: { index: ${profileIndex} } }); + const dictionaryEntries = Array.isArray(result?.dictionaryEntries) ? result.dictionaryEntries : []; + const originalTextLength = typeof result?.originalTextLength === "number" ? 
result.originalTextLength : 0; + if (dictionaryEntries.length > 0 && originalTextLength > 0 && (originalTextLength !== character.length || isCodePointJapanese(codePoint))) { + const source = substring.substring(0, originalTextLength); + const preferredHeadword = getPreferredHeadword(dictionaryEntries, source); + if (preferredHeadword && typeof preferredHeadword.term === "string") { + const reading = typeof preferredHeadword.reading === "string" ? preferredHeadword.reading : ""; + const segments = distributeFuriganaInflected(preferredHeadword.term, reading, source); + tokens.push({ + surface: segments.map((segment) => segment.text).join("") || source, + reading: segments.map((segment) => typeof segment.reading === "string" ? segment.reading : "").join(""), + headword: preferredHeadword.term, + startPos: i, + endPos: i + originalTextLength, + }); + i += originalTextLength; + continue; + } + } + i += character.length; + } + return tokens; + })(); + `; +} + export async function requestYomitanParseResults( text: string, deps: YomitanParserRuntimeDeps, @@ -678,6 +940,51 @@ export async function requestYomitanParseResults( } } +export async function requestYomitanScanTokens( + text: string, + deps: YomitanParserRuntimeDeps, + logger: LoggerLike, +): Promise { + const yomitanExt = deps.getYomitanExt(); + if (!text || !yomitanExt) { + return null; + } + + const isReady = await ensureYomitanParserWindow(deps, logger); + const parserWindow = deps.getYomitanParserWindow(); + if (!isReady || !parserWindow || parserWindow.isDestroyed()) { + return null; + } + + const metadata = await requestYomitanProfileMetadata(parserWindow, logger); + const profileIndex = metadata?.profileIndex ?? 0; + const scanLength = metadata?.scanLength ?? 
DEFAULT_YOMITAN_SCAN_LENGTH; + + try { + const rawResult = await parserWindow.webContents.executeJavaScript( + buildYomitanScanningScript(text, profileIndex, scanLength), + true, + ); + if (isScanTokenArray(rawResult)) { + return rawResult; + } + if (Array.isArray(rawResult)) { + const selectedTokens = selectYomitanParseTokens(rawResult, () => false, 'headword'); + return selectedTokens?.map((token) => ({ + surface: token.surface, + reading: token.reading, + headword: token.headword, + startPos: token.startPos, + endPos: token.endPos, + })) ?? null; + } + return null; + } catch (err) { + logger.error('Yomitan scanner request failed:', (err as Error).message); + return null; + } +} + async function fetchYomitanTermFrequencies( parserWindow: BrowserWindow, termReadingList: YomitanTermReadingPair[], diff --git a/src/main/character-dictionary-runtime.test.ts b/src/main/character-dictionary-runtime.test.ts index da7900a..45f5354 100644 --- a/src/main/character-dictionary-runtime.test.ts +++ b/src/main/character-dictionary-runtime.test.ts @@ -196,6 +196,115 @@ test('generateForCurrentMedia emits structured-content glossary so image stays w } }); +test('generateForCurrentMedia adds kana aliases for romanized names when native name is kanji', async () => { + const userDataPath = makeTempDir(); + const originalFetch = globalThis.fetch; + + globalThis.fetch = (async (input: string | URL | Request, init?: RequestInit) => { + const url = typeof input === 'string' ? input : input instanceof URL ? input.href : input.url; + if (url === GRAPHQL_URL) { + const body = JSON.parse(String(init?.body ?? 
'{}')) as { + query?: string; + }; + + if (body.query?.includes('Page(perPage: 10)')) { + return new Response( + JSON.stringify({ + data: { + Page: { + media: [ + { + id: 20594, + episodes: 10, + title: { + romaji: 'Kono Subarashii Sekai ni Shukufuku wo!', + english: 'KONOSUBA -God’s blessing on this wonderful world!', + native: 'この素晴らしい世界に祝福を!', + }, + }, + ], + }, + }, + }), + { + status: 200, + headers: { 'content-type': 'application/json' }, + }, + ); + } + + if (body.query?.includes('characters(page: $page')) { + return new Response( + JSON.stringify({ + data: { + Media: { + title: { + romaji: 'Kono Subarashii Sekai ni Shukufuku wo!', + english: 'KONOSUBA -God’s blessing on this wonderful world!', + native: 'この素晴らしい世界に祝福を!', + }, + characters: { + pageInfo: { hasNextPage: false }, + edges: [ + { + role: 'MAIN', + node: { + id: 1, + description: 'The protagonist.', + image: null, + name: { + full: 'Satou Kazuma', + native: '佐藤和真', + }, + }, + }, + ], + }, + }, + }, + }), + { + status: 200, + headers: { 'content-type': 'application/json' }, + }, + ); + } + } + + throw new Error(`Unexpected fetch URL: ${url}`); + }) as typeof globalThis.fetch; + + try { + const runtime = createCharacterDictionaryRuntimeService({ + userDataPath, + getCurrentMediaPath: () => '/tmp/konosuba-s02e05.mkv', + getCurrentMediaTitle: () => 'Konosuba S02E05', + resolveMediaPathForJimaku: (mediaPath) => mediaPath, + guessAnilistMediaInfo: async () => ({ + title: 'Konosuba', + episode: 5, + source: 'fallback', + }), + now: () => 1_700_000_000_000, + }); + + const result = await runtime.generateForCurrentMedia(); + const termBank = JSON.parse(readStoredZipEntry(result.zipPath, 'term_bank_1.json').toString('utf8')) as Array< + [string, string, string, string, number, Array>, number, string] + >; + + const kazuma = termBank.find(([term]) => term === 'カズマ'); + assert.ok(kazuma, 'expected katakana alias for romanized name'); + assert.equal(kazuma[1], 'かずま'); + + const fullName = 
termBank.find(([term]) => term === 'サトウカズマ'); + assert.ok(fullName, 'expected compact full-name katakana alias for romanized name'); + assert.equal(fullName[1], 'さとうかずま'); + } finally { + globalThis.fetch = originalFetch; + } +}); + test('getOrCreateCurrentSnapshot persists and reuses normalized snapshot data', async () => { const userDataPath = makeTempDir(); const originalFetch = globalThis.fetch; @@ -336,6 +445,139 @@ test('getOrCreateCurrentSnapshot persists and reuses normalized snapshot data', } }); +test('getOrCreateCurrentSnapshot rebuilds snapshots written with an older format version', async () => { + const userDataPath = makeTempDir(); + const originalFetch = globalThis.fetch; + let searchQueryCount = 0; + let characterQueryCount = 0; + + globalThis.fetch = (async (input: string | URL | Request, init?: RequestInit) => { + const url = typeof input === 'string' ? input : input instanceof URL ? input.href : input.url; + if (url === GRAPHQL_URL) { + const body = JSON.parse(String(init?.body ?? 
'{}')) as { + query?: string; + }; + + if (body.query?.includes('Page(perPage: 10)')) { + searchQueryCount += 1; + return new Response( + JSON.stringify({ + data: { + Page: { + media: [ + { + id: 130298, + episodes: 20, + title: { + romaji: 'Kage no Jitsuryokusha ni Naritakute!', + english: 'The Eminence in Shadow', + native: '陰の実力者になりたくて!', + }, + }, + ], + }, + }, + }), + { + status: 200, + headers: { 'content-type': 'application/json' }, + }, + ); + } + + if (body.query?.includes('characters(page: $page')) { + characterQueryCount += 1; + return new Response( + JSON.stringify({ + data: { + Media: { + title: { + romaji: 'Kage no Jitsuryokusha ni Naritakute!', + english: 'The Eminence in Shadow', + native: '陰の実力者になりたくて!', + }, + characters: { + pageInfo: { hasNextPage: false }, + edges: [ + { + role: 'MAIN', + node: { + id: 321, + description: 'Alpha is the second-in-command of Shadow Garden.', + image: null, + name: { + full: 'Alpha', + native: 'アルファ', + }, + }, + }, + ], + }, + }, + }, + }), + { + status: 200, + headers: { 'content-type': 'application/json' }, + }, + ); + } + } + + throw new Error(`Unexpected fetch URL: ${url}`); + }) as typeof globalThis.fetch; + + try { + const snapshotsDir = path.join(userDataPath, 'character-dictionaries', 'snapshots'); + fs.mkdirSync(snapshotsDir, { recursive: true }); + fs.writeFileSync( + path.join(snapshotsDir, 'anilist-130298.json'), + JSON.stringify({ + formatVersion: 9, + mediaId: 130298, + mediaTitle: 'The Eminence in Shadow', + entryCount: 1, + updatedAt: 1_700_000_000_000, + termEntries: [['stale', '', 'name side', '', 1, ['stale'], 0, '']], + images: [], + }), + 'utf8', + ); + + const runtime = createCharacterDictionaryRuntimeService({ + userDataPath, + getCurrentMediaPath: () => '/tmp/eminence-s01e05.mkv', + getCurrentMediaTitle: () => 'The Eminence in Shadow - S01E05', + resolveMediaPathForJimaku: (mediaPath) => mediaPath, + guessAnilistMediaInfo: async () => ({ + title: 'The Eminence in Shadow', + episode: 5, + 
/**
 * Reports whether `value` is made up exclusively of characters accepted as
 * romanized-name text: ASCII letters, the macron/circumflex vowels listed in
 * the pattern, straight or curly apostrophes, periods, hyphens and whitespace.
 * An empty string does not match.
 */
function isRomanizedName(value: string): boolean {
  return /^[A-Za-zĀĪŪĒŌÂÊÎÔÛāīūēōâêîôû'’.\-\s]+$/.test(value);
}

/**
 * Canonicalizes a romanized name for transliteration.
 *
 * Steps, in order: NFKC-normalize, lowercase, delete apostrophes, turn periods
 * and hyphens into spaces, expand macron/circumflex vowels to two-letter
 * spellings (ā→aa, ī→ii, ū→uu, ē→ei, ō→ou), collapse whitespace runs to a
 * single space, and trim.
 *
 * @returns The normalized string; words remain separated by single spaces.
 */
function normalizeRomanizedName(value: string): string {
  return value
    .normalize('NFKC')
    .toLowerCase()
    .replace(/[’']/g, '')
    .replace(/[.\-]/g, ' ')
    .replace(/ā|â/g, 'aa')
    .replace(/ī|î/g, 'ii')
    .replace(/ū|û/g, 'uu')
    .replace(/ē|ê/g, 'ei')
    .replace(/ō|ô/g, 'ou')
    .replace(/\s+/g, ' ')
    .trim();
}

/**
 * Romaji spellings that map to a kana followed by a small kana (plus `vu`).
 * This table is consulted before ROMANIZED_KANA_MONOGRAPHS, and within it the
 * first entry that matches at the current position wins.
 */
const ROMANIZED_KANA_DIGRAPHS: ReadonlyArray<[string, string]> = [
  ['kya', 'キャ'], ['kyu', 'キュ'], ['kyo', 'キョ'],
  ['gya', 'ギャ'], ['gyu', 'ギュ'], ['gyo', 'ギョ'],
  ['sha', 'シャ'], ['shu', 'シュ'], ['sho', 'ショ'],
  ['sya', 'シャ'], ['syu', 'シュ'], ['syo', 'ショ'],
  ['ja', 'ジャ'], ['ju', 'ジュ'], ['jo', 'ジョ'],
  ['jya', 'ジャ'], ['jyu', 'ジュ'], ['jyo', 'ジョ'],
  ['cha', 'チャ'], ['chu', 'チュ'], ['cho', 'チョ'],
  ['tya', 'チャ'], ['tyu', 'チュ'], ['tyo', 'チョ'],
  ['cya', 'チャ'], ['cyu', 'チュ'], ['cyo', 'チョ'],
  ['nya', 'ニャ'], ['nyu', 'ニュ'], ['nyo', 'ニョ'],
  ['hya', 'ヒャ'], ['hyu', 'ヒュ'], ['hyo', 'ヒョ'],
  ['bya', 'ビャ'], ['byu', 'ビュ'], ['byo', 'ビョ'],
  ['pya', 'ピャ'], ['pyu', 'ピュ'], ['pyo', 'ピョ'],
  ['mya', 'ミャ'], ['myu', 'ミュ'], ['myo', 'ミョ'],
  ['rya', 'リャ'], ['ryu', 'リュ'], ['ryo', 'リョ'],
  ['fa', 'ファ'], ['fi', 'フィ'], ['fe', 'フェ'], ['fo', 'フォ'],
  ['fyu', 'フュ'], ['fyo', 'フョ'], ['fya', 'フャ'],
  ['va', 'ヴァ'], ['vi', 'ヴィ'], ['vu', 'ヴ'], ['ve', 'ヴェ'], ['vo', 'ヴォ'],
  ['she', 'シェ'], ['che', 'チェ'], ['je', 'ジェ'],
  ['tsi', 'ツィ'], ['tse', 'ツェ'], ['tsa', 'ツァ'], ['tso', 'ツォ'],
  ['thi', 'ティ'], ['thu', 'テュ'],
  ['dhi', 'ディ'], ['dhu', 'デュ'],
  ['wi', 'ウィ'], ['we', 'ウェ'], ['wo', 'ウォ'],
];

/**
 * Romaji spellings that map to a single katakana. Only consulted when no
 * digraph matches. The bare `n` entry is last, so `na`/`ni`/… are preferred
 * whenever a vowel follows. The `wo` entry here is shadowed by the `wo`
 * digraph above and is therefore never selected by the transliterator.
 */
const ROMANIZED_KANA_MONOGRAPHS: ReadonlyArray<[string, string]> = [
  ['a', 'ア'], ['i', 'イ'], ['u', 'ウ'], ['e', 'エ'], ['o', 'オ'],
  ['ka', 'カ'], ['ki', 'キ'], ['ku', 'ク'], ['ke', 'ケ'], ['ko', 'コ'],
  ['ga', 'ガ'], ['gi', 'ギ'], ['gu', 'グ'], ['ge', 'ゲ'], ['go', 'ゴ'],
  ['sa', 'サ'], ['shi', 'シ'], ['si', 'シ'], ['su', 'ス'], ['se', 'セ'], ['so', 'ソ'],
  ['za', 'ザ'], ['ji', 'ジ'], ['zi', 'ジ'], ['zu', 'ズ'], ['ze', 'ゼ'], ['zo', 'ゾ'],
  ['ta', 'タ'], ['chi', 'チ'], ['ti', 'チ'], ['tsu', 'ツ'], ['tu', 'ツ'], ['te', 'テ'], ['to', 'ト'],
  ['da', 'ダ'], ['de', 'デ'], ['do', 'ド'],
  ['na', 'ナ'], ['ni', 'ニ'], ['nu', 'ヌ'], ['ne', 'ネ'], ['no', 'ノ'],
  ['ha', 'ハ'], ['hi', 'ヒ'], ['fu', 'フ'], ['hu', 'フ'], ['he', 'ヘ'], ['ho', 'ホ'],
  ['ba', 'バ'], ['bi', 'ビ'], ['bu', 'ブ'], ['be', 'ベ'], ['bo', 'ボ'],
  ['pa', 'パ'], ['pi', 'ピ'], ['pu', 'プ'], ['pe', 'ペ'], ['po', 'ポ'],
  ['ma', 'マ'], ['mi', 'ミ'], ['mu', 'ム'], ['me', 'メ'], ['mo', 'モ'],
  ['ya', 'ヤ'], ['yu', 'ユ'], ['yo', 'ヨ'],
  ['ra', 'ラ'], ['ri', 'リ'], ['ru', 'ル'], ['re', 'レ'], ['ro', 'ロ'],
  ['wa', 'ワ'], ['wo', 'ヲ'],
  ['n', 'ン'],
];

/**
 * Transliterates one already-normalized, lowercase romaji word to katakana.
 *
 * @param word A single word with no separators.
 * @returns The katakana spelling, or `null` when the word is empty, contains
 *   anything other than `a`–`z`, or reaches a position no table entry matches.
 */
function romanizedWordToKatakana(word: string): string | null {
  if (!word || !/^[a-z]+$/.test(word)) {
    return null;
  }

  let kana = '';
  let index = 0;
  while (index < word.length) {
    const current = word.charAt(index);
    // charAt yields '' past the end, which the guards below treat as "no next letter".
    const next = word.charAt(index + 1);

    // A doubled consonant (other than n) is written as a small tsu; only the
    // first letter is consumed so the second one starts the next syllable.
    if (next.length > 0 && current === next && current !== 'n' && !'aeiou'.includes(current)) {
      kana += 'ッ';
      index += 1;
      continue;
    }

    // n directly before a consonant other than y is the syllabic n.
    if (current === 'n' && next.length > 0 && next !== 'y' && !'aeiou'.includes(next)) {
      kana += 'ン';
      index += 1;
      continue;
    }

    // Digraphs take priority over monographs; within a table the first hit wins.
    const match =
      ROMANIZED_KANA_DIGRAPHS.find(([romaji]) => word.startsWith(romaji, index)) ??
      ROMANIZED_KANA_MONOGRAPHS.find(([romaji]) => word.startsWith(romaji, index));
    if (!match) {
      return null;
    }
    kana += match[1];
    index += match[0].length;
  }

  return kana.length > 0 ? kana : null;
}

/**
 * Converts a romanized name to katakana, or returns `null` when any part of it
 * cannot be transliterated with the tables above.
 *
 * The name is split into words first and each word is transliterated on its
 * own; the results are concatenated without a separator. Whitespace, periods,
 * hyphens and apostrophes all act as word boundaries. Scanning word by word
 * keeps a word-final `n` from fusing with a following vowel or `y`
 * ("Ken Akamatsu" → ケンアカマツ, "Shin'ichi" → シンイチ) and stops identical
 * consonants on either side of a boundary from producing a small tsu.
 *
 * @param token Romanized name; may contain several words.
 * @returns Katakana for the whole name, or `null` if it is empty or any word
 *   fails to transliterate.
 */
function romanizedTokenToKatakana(token: string): string | null {
  // normalizeRomanizedName deletes apostrophes outright, so convert them to
  // spaces beforehand to preserve the boundary they mark.
  const words = normalizeRomanizedName(token.replace(/[’']/g, ' '))
    .split(' ')
    .filter((word) => word.length > 0);
  if (words.length === 0) {
    return null;
  }

  let output = '';
  for (const word of words) {
    const kana = romanizedWordToKatakana(word);
    if (kana === null) {
      return null;
    }
    output += kana;
  }
  return output;
}

/**
 * Derives katakana aliases for every romanized entry in `values`.
 *
 * Entries that are blank, fail `isRomanizedName`, or cannot be transliterated
 * are skipped. Duplicates are removed; order follows first occurrence.
 *
 * @param values Candidate name strings (romanized or not).
 * @returns The distinct katakana aliases that could be generated.
 */
function addRomanizedKanaAliases(values: Iterable<string>): string[] {
  const aliases = new Set<string>();
  for (const value of values) {
    const trimmed = value.trim();
    if (!trimmed || !isRomanizedName(trimmed)) continue;
    const katakana = romanizedTokenToKatakana(trimmed);
    if (katakana) {
      aliases.add(katakana);
    }
  }
  return [...aliases];
}