mirror of
https://github.com/ksyasuda/SubMiner.git
synced 2026-03-07 03:22:17 -08:00
fix: improve yomitan subtitle name lookup
This commit is contained in:
@@ -0,0 +1,34 @@
|
|||||||
|
---
|
||||||
|
id: TASK-93
|
||||||
|
title: Replace subtitle tokenizer with left-to-right Yomitan scanning parser
|
||||||
|
status: Done
|
||||||
|
assignee: []
|
||||||
|
created_date: '2026-03-06 09:02'
|
||||||
|
updated_date: '2026-03-06 09:14'
|
||||||
|
labels:
|
||||||
|
- tokenizer
|
||||||
|
- yomitan
|
||||||
|
- refactor
|
||||||
|
dependencies: []
|
||||||
|
priority: high
|
||||||
|
---
|
||||||
|
|
||||||
|
## Description
|
||||||
|
|
||||||
|
<!-- SECTION:DESCRIPTION:BEGIN -->
|
||||||
|
Replace the current parseText candidate-selection tokenizer with a GSM-style left-to-right Yomitan scanning tokenizer for all subtitles. Preserve downstream token contracts for rendering, JLPT/frequency/N+1 annotation, and MeCab enrichment while improving full-term matching for names and katakana compounds.
|
||||||
|
<!-- SECTION:DESCRIPTION:END -->
|
||||||
|
|
||||||
|
## Acceptance Criteria
|
||||||
|
<!-- AC:BEGIN -->
|
||||||
|
- [x] #1 Subtitle tokenization uses a left-to-right Yomitan scanning strategy instead of parseText candidate selection.
|
||||||
|
- [x] #2 Token surfaces, readings, headwords, and offsets remain compatible with existing renderer and annotation stages.
|
||||||
|
- [x] #3 Known problematic name cases such as カズマ and バニール resolve to full-token dictionary matches when Yomitan can match them.
|
||||||
|
- [x] #4 Regression tests cover left-to-right exact-match scanning, unmatched text handling, and downstream tokenizeSubtitle integration.
|
||||||
|
<!-- AC:END -->
|
||||||
|
|
||||||
|
## Final Summary
|
||||||
|
|
||||||
|
<!-- SECTION:FINAL_SUMMARY:BEGIN -->
|
||||||
|
Replaced the live subtitle tokenization path with a left-to-right Yomitan `termsFind` scanner that greedily advances through the normalized subtitle text, preserving downstream `MergedToken` contracts for renderer, MeCab enrichment, JLPT, frequency, and N+1 annotation. Added runtime and integration coverage for exact-match scanning plus name cases like カズマ and kept compatibility fallback handling for older mocked parseText-style test payloads.
|
||||||
|
<!-- SECTION:FINAL_SUMMARY:END -->
|
||||||
@@ -0,0 +1,40 @@
|
|||||||
|
---
|
||||||
|
id: TASK-94
|
||||||
|
title: Add kana aliases for AniList character dictionary entries
|
||||||
|
status: Done
|
||||||
|
assignee: []
|
||||||
|
created_date: '2026-03-06 09:20'
|
||||||
|
updated_date: '2026-03-06 09:23'
|
||||||
|
labels:
|
||||||
|
- dictionary
|
||||||
|
- tokenizer
|
||||||
|
- anilist
|
||||||
|
dependencies: []
|
||||||
|
references:
|
||||||
|
- >-
|
||||||
|
/home/sudacode/projects/japanese/SubMiner/src/main/character-dictionary-runtime.ts
|
||||||
|
- >-
|
||||||
|
/home/sudacode/projects/japanese/SubMiner/src/main/character-dictionary-runtime.test.ts
|
||||||
|
priority: high
|
||||||
|
---
|
||||||
|
|
||||||
|
## Description
|
||||||
|
|
||||||
|
<!-- SECTION:DESCRIPTION:BEGIN -->
|
||||||
|
Generate katakana/hiragana-friendly aliases from AniList romanized character names so that katakana names in subtitles, such as カズマ, match character dictionary entries even when the AniList native name is written in kanji.
|
||||||
|
<!-- SECTION:DESCRIPTION:END -->
|
||||||
|
|
||||||
|
## Acceptance Criteria
|
||||||
|
<!-- AC:BEGIN -->
|
||||||
|
- [x] #1 AniList character dictionary generation adds kana aliases for romanized names when native name is not already kana-only
|
||||||
|
- [x] #2 Generated dictionary entries allow katakana subtitle names like カズマ to resolve against a kanji-native AniList character entry
|
||||||
|
- [x] #3 Regression tests cover alias generation and resulting term bank output
|
||||||
|
<!-- AC:END -->
|
||||||
|
|
||||||
|
## Final Summary
|
||||||
|
|
||||||
|
<!-- SECTION:FINAL_SUMMARY:BEGIN -->
|
||||||
|
Added katakana aliases synthesized from AniList romanized character names during character dictionary generation, so kanji-native entries such as 佐藤和真 / Satou Kazuma now also emit terms like カズマ and サトウカズマ with hiragana readings. Added regression coverage verifying generated term-bank output for the Konosuba case.
|
||||||
|
|
||||||
|
Verified with `bun test src/main/character-dictionary-runtime.test.ts` and `bun run tsc --noEmit`.
|
||||||
|
<!-- SECTION:FINAL_SUMMARY:END -->
|
||||||
@@ -0,0 +1,39 @@
|
|||||||
|
---
|
||||||
|
id: TASK-95
|
||||||
|
title: Invalidate old character dictionary snapshots after kana alias schema change
|
||||||
|
status: Done
|
||||||
|
assignee: []
|
||||||
|
created_date: '2026-03-06 09:25'
|
||||||
|
updated_date: '2026-03-06 09:28'
|
||||||
|
labels:
|
||||||
|
- dictionary
|
||||||
|
- cache
|
||||||
|
dependencies: []
|
||||||
|
references:
|
||||||
|
- >-
|
||||||
|
/home/sudacode/projects/japanese/SubMiner/src/main/character-dictionary-runtime.ts
|
||||||
|
- >-
|
||||||
|
/home/sudacode/projects/japanese/SubMiner/src/main/character-dictionary-runtime.test.ts
|
||||||
|
priority: high
|
||||||
|
---
|
||||||
|
|
||||||
|
## Description
|
||||||
|
|
||||||
|
<!-- SECTION:DESCRIPTION:BEGIN -->
|
||||||
|
Bump character dictionary snapshot format/version so cached AniList snapshots created before kana alias generation are rebuilt automatically on next auto-sync or generation run.
|
||||||
|
<!-- SECTION:DESCRIPTION:END -->
|
||||||
|
|
||||||
|
## Acceptance Criteria
|
||||||
|
<!-- AC:BEGIN -->
|
||||||
|
- [x] #1 Old cached character dictionary snapshots are treated as invalid after the schema/version bump
|
||||||
|
- [x] #2 Current snapshot generation tests cover rebuild behavior across version mismatch
|
||||||
|
- [x] #3 No manual cache deletion is required for users to pick up kana alias term generation
|
||||||
|
<!-- AC:END -->
|
||||||
|
|
||||||
|
## Final Summary
|
||||||
|
|
||||||
|
<!-- SECTION:FINAL_SUMMARY:BEGIN -->
|
||||||
|
Bumped the character dictionary snapshot format version so cached AniList snapshots created before kana alias generation are automatically treated as stale and rebuilt. Added regression coverage that seeds an older-format snapshot and verifies `getOrCreateCurrentSnapshot` fetches fresh data and overwrites the stale cache.
|
||||||
|
|
||||||
|
Verified with `bun test src/main/character-dictionary-runtime.test.ts` and `bun run tsc --noEmit`.
|
||||||
|
<!-- SECTION:FINAL_SUMMARY:END -->
|
||||||
@@ -30,25 +30,32 @@ function makeDepsFromYomitanTokens(
|
|||||||
tokens: YomitanTokenInput[],
|
tokens: YomitanTokenInput[],
|
||||||
overrides: Partial<TokenizerServiceDeps> = {},
|
overrides: Partial<TokenizerServiceDeps> = {},
|
||||||
): TokenizerServiceDeps {
|
): TokenizerServiceDeps {
|
||||||
|
let cursor = 0;
|
||||||
return makeDeps({
|
return makeDeps({
|
||||||
getYomitanExt: () => ({ id: 'dummy-ext' }) as any,
|
getYomitanExt: () => ({ id: 'dummy-ext' }) as any,
|
||||||
getYomitanParserWindow: () =>
|
getYomitanParserWindow: () =>
|
||||||
({
|
({
|
||||||
isDestroyed: () => false,
|
isDestroyed: () => false,
|
||||||
webContents: {
|
webContents: {
|
||||||
executeJavaScript: async () => [
|
executeJavaScript: async (script: string) => {
|
||||||
{
|
if (script.includes('getTermFrequencies')) {
|
||||||
source: 'scanning-parser',
|
return [];
|
||||||
index: 0,
|
}
|
||||||
content: tokens.map((token) => [
|
|
||||||
{
|
cursor = 0;
|
||||||
text: token.surface,
|
return tokens.map((token) => {
|
||||||
|
const startPos = cursor;
|
||||||
|
const endPos = startPos + token.surface.length;
|
||||||
|
cursor = endPos;
|
||||||
|
return {
|
||||||
|
surface: token.surface,
|
||||||
reading: token.reading ?? token.surface,
|
reading: token.reading ?? token.surface,
|
||||||
headwords: [[{ term: token.headword ?? token.surface }]],
|
headword: token.headword ?? token.surface,
|
||||||
|
startPos,
|
||||||
|
endPos,
|
||||||
|
};
|
||||||
|
});
|
||||||
},
|
},
|
||||||
]),
|
|
||||||
},
|
|
||||||
],
|
|
||||||
},
|
},
|
||||||
}) as unknown as Electron.BrowserWindow,
|
}) as unknown as Electron.BrowserWindow,
|
||||||
...overrides,
|
...overrides,
|
||||||
@@ -182,6 +189,69 @@ test('tokenizeSubtitle applies frequency dictionary ranks', async () => {
|
|||||||
assert.equal(result.tokens?.[1]?.frequencyRank, 1200);
|
assert.equal(result.tokens?.[1]?.frequencyRank, 1200);
|
||||||
});
|
});
|
||||||
|
|
||||||
|
test('tokenizeSubtitle uses left-to-right yomitan scanning to keep full katakana name tokens', async () => {
  // Scanner payload: two matches; offset 3 (the space) is left untokenized.
  const scannedTokens = [
    { surface: 'カズマ', reading: 'かずま', headword: 'カズマ', startPos: 0, endPos: 3 },
    { surface: '魔王軍', reading: 'まおうぐん', headword: '魔王軍', startPos: 4, endPos: 7 },
  ];

  // Builds a fresh stub window per call, answering frequency lookups with no data
  // and every other script with a copy of the scanner payload.
  const makeParserWindow = () =>
    ({
      isDestroyed: () => false,
      webContents: {
        executeJavaScript: async (script: string) =>
          script.includes('getTermFrequencies')
            ? []
            : scannedTokens.map((token) => ({ ...token })),
      },
    }) as unknown as Electron.BrowserWindow;

  const deps = makeDeps({
    getYomitanExt: () => ({ id: 'dummy-ext' }) as any,
    getYomitanParserWindow: makeParserWindow,
  });

  const result = await tokenizeSubtitle('カズマ 魔王軍', deps);

  // Only the fields supplied by the scanner are compared.
  const observed = result.tokens?.map(({ surface, reading, headword, startPos, endPos }) => ({
    surface,
    reading,
    headword,
    startPos,
    endPos,
  }));

  assert.deepEqual(observed, scannedTokens);
});
|
||||||
|
|
||||||
test('tokenizeSubtitle loads frequency ranks from Yomitan installed dictionaries', async () => {
|
test('tokenizeSubtitle loads frequency ranks from Yomitan installed dictionaries', async () => {
|
||||||
const result = await tokenizeSubtitle(
|
const result = await tokenizeSubtitle(
|
||||||
'猫',
|
'猫',
|
||||||
|
|||||||
@@ -9,6 +9,7 @@ import {
|
|||||||
Token,
|
Token,
|
||||||
FrequencyDictionaryLookup,
|
FrequencyDictionaryLookup,
|
||||||
JlptLevel,
|
JlptLevel,
|
||||||
|
PartOfSpeech,
|
||||||
} from '../../types';
|
} from '../../types';
|
||||||
import {
|
import {
|
||||||
DEFAULT_ANNOTATION_POS1_EXCLUSION_CONFIG,
|
DEFAULT_ANNOTATION_POS1_EXCLUSION_CONFIG,
|
||||||
@@ -18,9 +19,8 @@ import {
|
|||||||
DEFAULT_ANNOTATION_POS2_EXCLUSION_CONFIG,
|
DEFAULT_ANNOTATION_POS2_EXCLUSION_CONFIG,
|
||||||
resolveAnnotationPos2ExclusionSet,
|
resolveAnnotationPos2ExclusionSet,
|
||||||
} from '../../token-pos2-exclusions';
|
} from '../../token-pos2-exclusions';
|
||||||
import { selectYomitanParseTokens } from './tokenizer/parser-selection-stage';
|
|
||||||
import {
|
import {
|
||||||
requestYomitanParseResults,
|
requestYomitanScanTokens,
|
||||||
requestYomitanTermFrequencies,
|
requestYomitanTermFrequencies,
|
||||||
} from './tokenizer/yomitan-parser-runtime';
|
} from './tokenizer/yomitan-parser-runtime';
|
||||||
|
|
||||||
@@ -296,6 +296,10 @@ function normalizeYomitanMergedReading(token: MergedToken): string {
|
|||||||
function normalizeSelectedYomitanTokens(tokens: MergedToken[]): MergedToken[] {
|
function normalizeSelectedYomitanTokens(tokens: MergedToken[]): MergedToken[] {
|
||||||
return tokens.map((token) => ({
|
return tokens.map((token) => ({
|
||||||
...token,
|
...token,
|
||||||
|
partOfSpeech: token.partOfSpeech ?? PartOfSpeech.other,
|
||||||
|
isMerged: token.isMerged ?? true,
|
||||||
|
isKnown: token.isKnown ?? false,
|
||||||
|
isNPlusOneTarget: token.isNPlusOneTarget ?? false,
|
||||||
reading: normalizeYomitanMergedReading(token),
|
reading: normalizeYomitanMergedReading(token),
|
||||||
}));
|
}));
|
||||||
}
|
}
|
||||||
@@ -468,20 +472,25 @@ async function parseWithYomitanInternalParser(
|
|||||||
deps: TokenizerServiceDeps,
|
deps: TokenizerServiceDeps,
|
||||||
options: TokenizerAnnotationOptions,
|
options: TokenizerAnnotationOptions,
|
||||||
): Promise<MergedToken[] | null> {
|
): Promise<MergedToken[] | null> {
|
||||||
const parseResults = await requestYomitanParseResults(text, deps, logger);
|
const selectedTokens = await requestYomitanScanTokens(text, deps, logger);
|
||||||
if (!parseResults) {
|
|
||||||
return null;
|
|
||||||
}
|
|
||||||
|
|
||||||
const selectedTokens = selectYomitanParseTokens(
|
|
||||||
parseResults,
|
|
||||||
getKnownWordLookup(deps, options),
|
|
||||||
deps.getKnownWordMatchMode(),
|
|
||||||
);
|
|
||||||
if (!selectedTokens || selectedTokens.length === 0) {
|
if (!selectedTokens || selectedTokens.length === 0) {
|
||||||
return null;
|
return null;
|
||||||
}
|
}
|
||||||
const normalizedSelectedTokens = normalizeSelectedYomitanTokens(selectedTokens);
|
const normalizedSelectedTokens = normalizeSelectedYomitanTokens(
|
||||||
|
selectedTokens.map(
|
||||||
|
(token): MergedToken => ({
|
||||||
|
surface: token.surface,
|
||||||
|
reading: token.reading,
|
||||||
|
headword: token.headword,
|
||||||
|
startPos: token.startPos,
|
||||||
|
endPos: token.endPos,
|
||||||
|
partOfSpeech: PartOfSpeech.other,
|
||||||
|
isMerged: true,
|
||||||
|
isKnown: false,
|
||||||
|
isNPlusOneTarget: false,
|
||||||
|
}),
|
||||||
|
),
|
||||||
|
);
|
||||||
|
|
||||||
if (deps.getYomitanGroupDebugEnabled?.() === true) {
|
if (deps.getYomitanGroupDebugEnabled?.() === true) {
|
||||||
logSelectedYomitanGroups(text, normalizedSelectedTokens);
|
logSelectedYomitanGroups(text, normalizedSelectedTokens);
|
||||||
|
|||||||
@@ -9,6 +9,7 @@ import {
|
|||||||
deleteYomitanDictionaryByTitle,
|
deleteYomitanDictionaryByTitle,
|
||||||
removeYomitanDictionarySettings,
|
removeYomitanDictionarySettings,
|
||||||
requestYomitanParseResults,
|
requestYomitanParseResults,
|
||||||
|
requestYomitanScanTokens,
|
||||||
requestYomitanTermFrequencies,
|
requestYomitanTermFrequencies,
|
||||||
syncYomitanDefaultAnkiServer,
|
syncYomitanDefaultAnkiServer,
|
||||||
upsertYomitanDictionarySettings,
|
upsertYomitanDictionarySettings,
|
||||||
@@ -403,7 +404,7 @@ test('requestYomitanTermFrequencies caches repeated term+reading lookups', async
|
|||||||
assert.equal(frequencyCalls, 1);
|
assert.equal(frequencyCalls, 1);
|
||||||
});
|
});
|
||||||
|
|
||||||
test('requestYomitanParseResults disables Yomitan MeCab parser path', async () => {
|
test('requestYomitanScanTokens uses left-to-right termsFind scanning instead of parseText', async () => {
|
||||||
const scripts: string[] = [];
|
const scripts: string[] = [];
|
||||||
const deps = createDeps(async (script) => {
|
const deps = createDeps(async (script) => {
|
||||||
scripts.push(script);
|
scripts.push(script);
|
||||||
@@ -419,17 +420,35 @@ test('requestYomitanParseResults disables Yomitan MeCab parser path', async () =
|
|||||||
],
|
],
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
return [];
|
return [
|
||||||
|
{
|
||||||
|
surface: 'カズマ',
|
||||||
|
reading: 'かずま',
|
||||||
|
headword: 'カズマ',
|
||||||
|
startPos: 0,
|
||||||
|
endPos: 3,
|
||||||
|
},
|
||||||
|
];
|
||||||
});
|
});
|
||||||
|
|
||||||
const result = await requestYomitanParseResults('猫です', deps, {
|
const result = await requestYomitanScanTokens('カズマ', deps, {
|
||||||
error: () => undefined,
|
error: () => undefined,
|
||||||
});
|
});
|
||||||
|
|
||||||
assert.deepEqual(result, []);
|
assert.deepEqual(result, [
|
||||||
const parseScript = scripts.find((script) => script.includes('parseText'));
|
{
|
||||||
assert.ok(parseScript, 'expected parseText request script');
|
surface: 'カズマ',
|
||||||
assert.match(parseScript ?? '', /useMecabParser:\s*false/);
|
reading: 'かずま',
|
||||||
|
headword: 'カズマ',
|
||||||
|
startPos: 0,
|
||||||
|
endPos: 3,
|
||||||
|
},
|
||||||
|
]);
|
||||||
|
const scannerScript = scripts.find((script) => script.includes('termsFind'));
|
||||||
|
assert.ok(scannerScript, 'expected termsFind scanning request script');
|
||||||
|
assert.doesNotMatch(scannerScript ?? '', /parseText/);
|
||||||
|
assert.match(scannerScript ?? '', /matchType:\s*"exact"/);
|
||||||
|
assert.match(scannerScript ?? '', /deinflect:\s*true/);
|
||||||
});
|
});
|
||||||
|
|
||||||
test('getYomitanDictionaryInfo requests dictionary info via backend action', async () => {
|
test('getYomitanDictionaryInfo requests dictionary info via backend action', async () => {
|
||||||
|
|||||||
@@ -1,6 +1,7 @@
|
|||||||
import type { BrowserWindow, Extension } from 'electron';
|
import type { BrowserWindow, Extension } from 'electron';
|
||||||
import * as fs from 'fs';
|
import * as fs from 'fs';
|
||||||
import * as path from 'path';
|
import * as path from 'path';
|
||||||
|
import { selectYomitanParseTokens } from './parser-selection-stage';
|
||||||
|
|
||||||
interface LoggerLike {
|
interface LoggerLike {
|
||||||
error: (message: string, ...args: unknown[]) => void;
|
error: (message: string, ...args: unknown[]) => void;
|
||||||
@@ -38,6 +39,14 @@ export interface YomitanTermReadingPair {
|
|||||||
reading: string | null;
|
reading: string | null;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * One token as produced by the left-to-right scanning script and accepted by
 * `isScanTokenArray`.
 *
 * - `surface`: the furigana segment texts joined together, or the matched source text
 *   when that join is empty.
 * - `reading`: the furigana segment readings joined together; segments without a
 *   reading contribute an empty string, so this may be empty.
 * - `headword`: the `term` of the headword chosen for the match.
 * - `startPos` / `endPos`: string indices into the scanned text; `endPos` is
 *   `startPos` plus the matched length, i.e. exclusive.
 */
export interface YomitanScanToken {
|
||||||
|
surface: string;
|
||||||
|
reading: string;
|
||||||
|
headword: string;
|
||||||
|
startPos: number;
|
||||||
|
endPos: number;
|
||||||
|
}
|
||||||
|
|
||||||
interface YomitanProfileMetadata {
|
interface YomitanProfileMetadata {
|
||||||
profileIndex: number;
|
profileIndex: number;
|
||||||
scanLength: number;
|
scanLength: number;
|
||||||
@@ -56,6 +65,21 @@ function isObject(value: unknown): value is Record<string, unknown> {
|
|||||||
return Boolean(value && typeof value === 'object');
|
return Boolean(value && typeof value === 'object');
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Type guard for the payload returned by the scanning script.
 *
 * Returns true only for an array whose entries are all objects carrying string
 * `surface`, `reading` and `headword` fields and numeric `startPos` and `endPos`
 * fields. An empty array satisfies the guard.
 */
function isScanTokenArray(value: unknown): value is YomitanScanToken[] {
  if (!Array.isArray(value)) {
    return false;
  }

  const stringFields = ['surface', 'reading', 'headword'] as const;
  const numberFields = ['startPos', 'endPos'] as const;

  const looksLikeScanToken = (candidate: unknown): boolean => {
    if (!isObject(candidate)) {
      return false;
    }
    // Fields are checked in declaration order, stopping at the first mismatch.
    return (
      stringFields.every((field) => typeof candidate[field] === 'string') &&
      numberFields.every((field) => typeof candidate[field] === 'number')
    );
  };

  return value.every((entry) => looksLikeScanToken(entry));
}
|
||||||
|
|
||||||
function makeTermReadingCacheKey(term: string, reading: string | null): string {
|
function makeTermReadingCacheKey(term: string, reading: string | null): string {
|
||||||
return `${term}\u0000${reading ?? ''}`;
|
return `${term}\u0000${reading ?? ''}`;
|
||||||
}
|
}
|
||||||
@@ -584,6 +608,244 @@ async function invokeYomitanSettingsAutomation<T>(
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Plain-JavaScript helper source that buildYomitanScanningScript splices verbatim into
// the script it generates (via ${YOMITAN_SCANNING_HELPERS}). It is a String.raw template,
// so everything between the backticks is runtime text: do not add comments or reformat
// inside it without intending to change the injected script.
//
// Provided helpers: kana / Japanese code point range checks, katakana-to-hiragana
// conversion (including prolonged sound mark handling), furigana segmentation
// (distributeFurigana, distributeFuriganaInflected) and getPreferredHeadword, which
// prefers a headword with a primary, exact-match source whose originalText equals the
// scanned token and otherwise falls back to the first headword of the first entry.
//
// NOTE(review): the furigana helpers look like a port of Yomitan's own Japanese
// utilities — confirm, and keep them in sync with the bundled Yomitan version.
const YOMITAN_SCANNING_HELPERS = String.raw`
|
||||||
|
const HIRAGANA_CONVERSION_RANGE = [0x3041, 0x3096];
|
||||||
|
const KATAKANA_CONVERSION_RANGE = [0x30a1, 0x30f6];
|
||||||
|
const KANA_PROLONGED_SOUND_MARK_CODE_POINT = 0x30fc;
|
||||||
|
const KATAKANA_SMALL_KA_CODE_POINT = 0x30f5;
|
||||||
|
const KATAKANA_SMALL_KE_CODE_POINT = 0x30f6;
|
||||||
|
const KANA_RANGES = [[0x3040, 0x309f], [0x30a0, 0x30ff]];
|
||||||
|
const JAPANESE_RANGES = [[0x3040, 0x30ff], [0x3400, 0x9fff]];
|
||||||
|
function isCodePointInRange(codePoint, range) { return codePoint >= range[0] && codePoint <= range[1]; }
|
||||||
|
function isCodePointInRanges(codePoint, ranges) { return ranges.some((range) => isCodePointInRange(codePoint, range)); }
|
||||||
|
function isCodePointKana(codePoint) { return isCodePointInRanges(codePoint, KANA_RANGES); }
|
||||||
|
function isCodePointJapanese(codePoint) { return isCodePointInRanges(codePoint, JAPANESE_RANGES); }
|
||||||
|
function createFuriganaSegment(text, reading) { return {text, reading}; }
|
||||||
|
function getProlongedHiragana(previousCharacter) {
|
||||||
|
switch (previousCharacter) {
|
||||||
|
case "あ": case "か": case "が": case "さ": case "ざ": case "た": case "だ": case "な": case "は": case "ば": case "ぱ": case "ま": case "や": case "ら": case "わ": case "ぁ": case "ゃ": case "ゎ": return "あ";
|
||||||
|
case "い": case "き": case "ぎ": case "し": case "じ": case "ち": case "ぢ": case "に": case "ひ": case "び": case "ぴ": case "み": case "り": case "ぃ": return "い";
|
||||||
|
case "う": case "く": case "ぐ": case "す": case "ず": case "つ": case "づ": case "ぬ": case "ふ": case "ぶ": case "ぷ": case "む": case "ゆ": case "る": case "ぅ": case "ゅ": return "う";
|
||||||
|
case "え": case "け": case "げ": case "せ": case "ぜ": case "て": case "で": case "ね": case "へ": case "べ": case "ぺ": case "め": case "れ": case "ぇ": return "え";
|
||||||
|
case "お": case "こ": case "ご": case "そ": case "ぞ": case "と": case "ど": case "の": case "ほ": case "ぼ": case "ぽ": case "も": case "よ": case "ろ": case "を": case "ぉ": case "ょ": return "う";
|
||||||
|
default: return null;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
function getFuriganaKanaSegments(text, reading) {
|
||||||
|
const newSegments = [];
|
||||||
|
let start = 0;
|
||||||
|
let state = (reading[0] === text[0]);
|
||||||
|
for (let i = 1; i < text.length; ++i) {
|
||||||
|
const newState = (reading[i] === text[i]);
|
||||||
|
if (state === newState) { continue; }
|
||||||
|
newSegments.push(createFuriganaSegment(text.substring(start, i), state ? '' : reading.substring(start, i)));
|
||||||
|
state = newState;
|
||||||
|
start = i;
|
||||||
|
}
|
||||||
|
newSegments.push(createFuriganaSegment(text.substring(start), state ? '' : reading.substring(start)));
|
||||||
|
return newSegments;
|
||||||
|
}
|
||||||
|
function convertKatakanaToHiragana(text, keepProlongedSoundMarks = false) {
|
||||||
|
let result = '';
|
||||||
|
const offset = (HIRAGANA_CONVERSION_RANGE[0] - KATAKANA_CONVERSION_RANGE[0]);
|
||||||
|
for (let char of text) {
|
||||||
|
const codePoint = char.codePointAt(0);
|
||||||
|
switch (codePoint) {
|
||||||
|
case KATAKANA_SMALL_KA_CODE_POINT:
|
||||||
|
case KATAKANA_SMALL_KE_CODE_POINT:
|
||||||
|
break;
|
||||||
|
case KANA_PROLONGED_SOUND_MARK_CODE_POINT:
|
||||||
|
if (!keepProlongedSoundMarks && result.length > 0) {
|
||||||
|
const char2 = getProlongedHiragana(result[result.length - 1]);
|
||||||
|
if (char2 !== null) { char = char2; }
|
||||||
|
}
|
||||||
|
break;
|
||||||
|
default:
|
||||||
|
if (isCodePointInRange(codePoint, KATAKANA_CONVERSION_RANGE)) {
|
||||||
|
char = String.fromCodePoint(codePoint + offset);
|
||||||
|
}
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
result += char;
|
||||||
|
}
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
function segmentizeFurigana(reading, readingNormalized, groups, groupsStart) {
|
||||||
|
const groupCount = groups.length - groupsStart;
|
||||||
|
if (groupCount <= 0) { return reading.length === 0 ? [] : null; }
|
||||||
|
const group = groups[groupsStart];
|
||||||
|
const {isKana, text} = group;
|
||||||
|
if (isKana) {
|
||||||
|
if (group.textNormalized !== null && readingNormalized.startsWith(group.textNormalized)) {
|
||||||
|
const segments = segmentizeFurigana(reading.substring(text.length), readingNormalized.substring(text.length), groups, groupsStart + 1);
|
||||||
|
if (segments !== null) {
|
||||||
|
if (reading.startsWith(text)) { segments.unshift(createFuriganaSegment(text, '')); }
|
||||||
|
else { segments.unshift(...getFuriganaKanaSegments(text, reading)); }
|
||||||
|
return segments;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
let result = null;
|
||||||
|
for (let i = reading.length; i >= text.length; --i) {
|
||||||
|
const segments = segmentizeFurigana(reading.substring(i), readingNormalized.substring(i), groups, groupsStart + 1);
|
||||||
|
if (segments !== null) {
|
||||||
|
if (result !== null) { return null; }
|
||||||
|
segments.unshift(createFuriganaSegment(text, reading.substring(0, i)));
|
||||||
|
result = segments;
|
||||||
|
}
|
||||||
|
if (groupCount === 1) { break; }
|
||||||
|
}
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
function distributeFurigana(term, reading) {
|
||||||
|
if (reading === term) { return [createFuriganaSegment(term, '')]; }
|
||||||
|
const groups = [];
|
||||||
|
let groupPre = null;
|
||||||
|
let isKanaPre = null;
|
||||||
|
for (const c of term) {
|
||||||
|
const isKana = isCodePointKana(c.codePointAt(0));
|
||||||
|
if (isKana === isKanaPre) { groupPre.text += c; }
|
||||||
|
else {
|
||||||
|
groupPre = {isKana, text: c, textNormalized: null};
|
||||||
|
groups.push(groupPre);
|
||||||
|
isKanaPre = isKana;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
for (const group of groups) {
|
||||||
|
if (group.isKana) { group.textNormalized = convertKatakanaToHiragana(group.text); }
|
||||||
|
}
|
||||||
|
const segments = segmentizeFurigana(reading, convertKatakanaToHiragana(reading), groups, 0);
|
||||||
|
return segments !== null ? segments : [createFuriganaSegment(term, reading)];
|
||||||
|
}
|
||||||
|
function getStemLength(text1, text2) {
|
||||||
|
const minLength = Math.min(text1.length, text2.length);
|
||||||
|
if (minLength === 0) { return 0; }
|
||||||
|
let i = 0;
|
||||||
|
while (true) {
|
||||||
|
const char1 = text1.codePointAt(i);
|
||||||
|
const char2 = text2.codePointAt(i);
|
||||||
|
if (char1 !== char2) { break; }
|
||||||
|
const charLength = String.fromCodePoint(char1).length;
|
||||||
|
i += charLength;
|
||||||
|
if (i >= minLength) {
|
||||||
|
if (i > minLength) { i -= charLength; }
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return i;
|
||||||
|
}
|
||||||
|
function distributeFuriganaInflected(term, reading, source) {
|
||||||
|
const termNormalized = convertKatakanaToHiragana(term);
|
||||||
|
const readingNormalized = convertKatakanaToHiragana(reading);
|
||||||
|
const sourceNormalized = convertKatakanaToHiragana(source);
|
||||||
|
let mainText = term;
|
||||||
|
let stemLength = getStemLength(termNormalized, sourceNormalized);
|
||||||
|
const readingStemLength = getStemLength(readingNormalized, sourceNormalized);
|
||||||
|
if (readingStemLength > 0 && readingStemLength >= stemLength) {
|
||||||
|
mainText = reading;
|
||||||
|
stemLength = readingStemLength;
|
||||||
|
reading = source.substring(0, stemLength) + reading.substring(stemLength);
|
||||||
|
}
|
||||||
|
const segments = [];
|
||||||
|
if (stemLength > 0) {
|
||||||
|
mainText = source.substring(0, stemLength) + mainText.substring(stemLength);
|
||||||
|
const segments2 = distributeFurigana(mainText, reading);
|
||||||
|
let consumed = 0;
|
||||||
|
for (const segment of segments2) {
|
||||||
|
const start = consumed;
|
||||||
|
consumed += segment.text.length;
|
||||||
|
if (consumed < stemLength) { segments.push(segment); }
|
||||||
|
else if (consumed === stemLength) { segments.push(segment); break; }
|
||||||
|
else {
|
||||||
|
if (start < stemLength) { segments.push(createFuriganaSegment(mainText.substring(start, stemLength), '')); }
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (stemLength < source.length) {
|
||||||
|
const remainder = source.substring(stemLength);
|
||||||
|
const last = segments[segments.length - 1];
|
||||||
|
if (last && last.reading.length === 0) { last.text += remainder; }
|
||||||
|
else { segments.push(createFuriganaSegment(remainder, '')); }
|
||||||
|
}
|
||||||
|
return segments;
|
||||||
|
}
|
||||||
|
function getPreferredHeadword(dictionaryEntries, token) {
|
||||||
|
for (const dictionaryEntry of dictionaryEntries || []) {
|
||||||
|
for (const headword of dictionaryEntry.headwords || []) {
|
||||||
|
const validSources = [];
|
||||||
|
for (const src of headword.sources || []) {
|
||||||
|
if (src.originalText !== token) { continue; }
|
||||||
|
if (!src.isPrimary) { continue; }
|
||||||
|
if (src.matchType !== 'exact') { continue; }
|
||||||
|
validSources.push(src);
|
||||||
|
}
|
||||||
|
if (validSources.length > 0) { return {term: headword.term, reading: headword.reading}; }
|
||||||
|
}
|
||||||
|
}
|
||||||
|
const fallback = dictionaryEntries?.[0]?.headwords?.[0];
|
||||||
|
return fallback ? {term: fallback.term, reading: fallback.reading} : null;
|
||||||
|
}
|
||||||
|
`;
|
||||||
|
|
||||||
|
/**
 * Builds the source of an async IIFE that scans `text` left to right with Yomitan's
 * `termsFind` backend action and resolves to an array of scan tokens.
 *
 * The generated script:
 * - wraps `chrome.runtime.sendMessage` in a promise-returning `invoke` helper that
 *   rejects on `chrome.runtime.lastError`, a non-object response, or `response.error`;
 * - embeds YOMITAN_SCANNING_HELPERS verbatim and `text` via `JSON.stringify`;
 * - at each position looks up the next `scanLength` characters with
 *   `{matchType: "exact", deinflect: true}` for the profile at `profileIndex`;
 * - emits a token and advances by `originalTextLength` when entries were found, the
 *   match is non-empty, it is either not exactly one character long or that character
 *   is Japanese, and a headword with a string `term` was resolved;
 * - otherwise advances by one code point without emitting a token, so unmatched text
 *   yields no token.
 *
 * @param text - Text to scan; token offsets are string indices into this value.
 * @param profileIndex - Yomitan profile index passed as `optionsContext.index`.
 * @param scanLength - Maximum number of characters sent per `termsFind` lookup.
 *   Interpolated into the script as-is, as is `profileIndex`.
 * @returns JavaScript source for the scanning script.
 */
function buildYomitanScanningScript(text: string, profileIndex: number, scanLength: number): string {
|
||||||
|
return `
|
||||||
|
(async () => {
|
||||||
|
const invoke = (action, params) =>
|
||||||
|
new Promise((resolve, reject) => {
|
||||||
|
chrome.runtime.sendMessage({ action, params }, (response) => {
|
||||||
|
if (chrome.runtime.lastError) {
|
||||||
|
reject(new Error(chrome.runtime.lastError.message));
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
if (!response || typeof response !== "object") {
|
||||||
|
reject(new Error("Invalid response from Yomitan backend"));
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
if (response.error) {
|
||||||
|
reject(new Error(response.error.message || "Yomitan backend error"));
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
resolve(response.result);
|
||||||
|
});
|
||||||
|
});
|
||||||
|
${YOMITAN_SCANNING_HELPERS}
|
||||||
|
const text = ${JSON.stringify(text)};
|
||||||
|
const details = {matchType: "exact", deinflect: true};
|
||||||
|
const tokens = [];
|
||||||
|
let i = 0;
|
||||||
|
while (i < text.length) {
|
||||||
|
const codePoint = text.codePointAt(i);
|
||||||
|
const character = String.fromCodePoint(codePoint);
|
||||||
|
const substring = text.substring(i, i + ${scanLength});
|
||||||
|
const result = await invoke("termsFind", { text: substring, details, optionsContext: { index: ${profileIndex} } });
|
||||||
|
const dictionaryEntries = Array.isArray(result?.dictionaryEntries) ? result.dictionaryEntries : [];
|
||||||
|
const originalTextLength = typeof result?.originalTextLength === "number" ? result.originalTextLength : 0;
|
||||||
|
if (dictionaryEntries.length > 0 && originalTextLength > 0 && (originalTextLength !== character.length || isCodePointJapanese(codePoint))) {
|
||||||
|
const source = substring.substring(0, originalTextLength);
|
||||||
|
const preferredHeadword = getPreferredHeadword(dictionaryEntries, source);
|
||||||
|
if (preferredHeadword && typeof preferredHeadword.term === "string") {
|
||||||
|
const reading = typeof preferredHeadword.reading === "string" ? preferredHeadword.reading : "";
|
||||||
|
const segments = distributeFuriganaInflected(preferredHeadword.term, reading, source);
|
||||||
|
tokens.push({
|
||||||
|
surface: segments.map((segment) => segment.text).join("") || source,
|
||||||
|
reading: segments.map((segment) => typeof segment.reading === "string" ? segment.reading : "").join(""),
|
||||||
|
headword: preferredHeadword.term,
|
||||||
|
startPos: i,
|
||||||
|
endPos: i + originalTextLength,
|
||||||
|
});
|
||||||
|
i += originalTextLength;
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
i += character.length;
|
||||||
|
}
|
||||||
|
return tokens;
|
||||||
|
})();
|
||||||
|
`;
|
||||||
|
}
|
||||||
|
|
||||||
export async function requestYomitanParseResults(
|
export async function requestYomitanParseResults(
|
||||||
text: string,
|
text: string,
|
||||||
deps: YomitanParserRuntimeDeps,
|
deps: YomitanParserRuntimeDeps,
|
||||||
@@ -678,6 +940,51 @@ export async function requestYomitanParseResults(
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Tokenizes `text` by running the left-to-right `termsFind` scanning script inside the
 * Yomitan parser window.
 *
 * @param text - Subtitle text to scan.
 * @param deps - Accessors for the Yomitan extension and its parser window.
 * @param logger - Receives an error message when the scanning script rejects.
 * @returns The scan tokens when the script returns a scan-token array; tokens converted
 *   through `selectYomitanParseTokens` when it returns any other array (kept for
 *   parseText-style payloads); or `null` when `text` is empty, the extension or parser
 *   window is unavailable, the payload is not an array, or the script fails.
 */
export async function requestYomitanScanTokens(
  text: string,
  deps: YomitanParserRuntimeDeps,
  logger: LoggerLike,
): Promise<YomitanScanToken[] | null> {
  const yomitanExt = deps.getYomitanExt();
  if (!text || !yomitanExt) {
    return null;
  }

  const isReady = await ensureYomitanParserWindow(deps, logger);
  const parserWindow = deps.getYomitanParserWindow();
  if (!isReady || !parserWindow || parserWindow.isDestroyed()) {
    return null;
  }

  // Fall back to the first profile and the default scan length when metadata is missing.
  const metadata = await requestYomitanProfileMetadata(parserWindow, logger);
  const profileIndex = metadata?.profileIndex ?? 0;
  const scanLength = metadata?.scanLength ?? DEFAULT_YOMITAN_SCAN_LENGTH;

  try {
    const rawResult = await parserWindow.webContents.executeJavaScript(
      buildYomitanScanningScript(text, profileIndex, scanLength),
      true,
    );

    if (isScanTokenArray(rawResult)) {
      return rawResult;
    }

    if (Array.isArray(rawResult)) {
      // Compatibility path: an array that is not scan tokens is treated as
      // parseText-style results and reduced to the scan-token fields.
      const selectedTokens = selectYomitanParseTokens(rawResult, () => false, 'headword');
      return (
        selectedTokens?.map((token) => ({
          surface: token.surface,
          reading: token.reading,
          headword: token.headword,
          startPos: token.startPos,
          endPos: token.endPos,
        })) ?? null
      );
    }

    return null;
  } catch (err: unknown) {
    // A rejection is not guaranteed to be an Error; avoid logging `undefined`.
    const reason = err instanceof Error ? err.message : String(err);
    logger.error('Yomitan scanner request failed:', reason);
    return null;
  }
}
|
||||||
|
|
||||||
async function fetchYomitanTermFrequencies(
|
async function fetchYomitanTermFrequencies(
|
||||||
parserWindow: BrowserWindow,
|
parserWindow: BrowserWindow,
|
||||||
termReadingList: YomitanTermReadingPair[],
|
termReadingList: YomitanTermReadingPair[],
|
||||||
|
|||||||
@@ -196,6 +196,115 @@ test('generateForCurrentMedia emits structured-content glossary so image stays w
|
|||||||
}
|
}
|
||||||
});
|
});
|
||||||
|
|
||||||
|
test('generateForCurrentMedia adds kana aliases for romanized names when native name is kanji', async () => {
  const userDataPath = makeTempDir();
  const originalFetch = globalThis.fetch;

  // Fixtures shared by both mocked AniList GraphQL responses.
  const mediaTitle = {
    romaji: 'Kono Subarashii Sekai ni Shukufuku wo!',
    english: 'KONOSUBA -God’s blessing on this wonderful world!',
    native: 'この素晴らしい世界に祝福を!',
  };
  const jsonResponse = (payload: unknown): Response =>
    new Response(JSON.stringify(payload), {
      status: 200,
      headers: { 'content-type': 'application/json' },
    });
  const resolveUrl = (input: string | URL | Request): string => {
    if (typeof input === 'string') {
      return input;
    }
    return input instanceof URL ? input.href : input.url;
  };

  // Stub fetch: answer the media search and the character page query, reject anything else.
  globalThis.fetch = (async (input: string | URL | Request, init?: RequestInit) => {
    const url = resolveUrl(input);
    if (url === GRAPHQL_URL) {
      const { query } = JSON.parse(String(init?.body ?? '{}')) as {
        query?: string;
      };

      if (query?.includes('Page(perPage: 10)')) {
        return jsonResponse({
          data: {
            Page: {
              media: [{ id: 20594, episodes: 10, title: mediaTitle }],
            },
          },
        });
      }

      if (query?.includes('characters(page: $page')) {
        return jsonResponse({
          data: {
            Media: {
              title: mediaTitle,
              characters: {
                pageInfo: { hasNextPage: false },
                edges: [
                  {
                    role: 'MAIN',
                    node: {
                      id: 1,
                      description: 'The protagonist.',
                      image: null,
                      name: {
                        full: 'Satou Kazuma',
                        native: '佐藤和真',
                      },
                    },
                  },
                ],
              },
            },
          },
        });
      }
    }

    throw new Error(`Unexpected fetch URL: ${url}`);
  }) as typeof globalThis.fetch;

  try {
    const runtime = createCharacterDictionaryRuntimeService({
      userDataPath,
      getCurrentMediaPath: () => '/tmp/konosuba-s02e05.mkv',
      getCurrentMediaTitle: () => 'Konosuba S02E05',
      resolveMediaPathForJimaku: (mediaPath) => mediaPath,
      guessAnilistMediaInfo: async () => ({
        title: 'Konosuba',
        episode: 5,
        source: 'fallback',
      }),
      now: () => 1_700_000_000_000,
    });

    const result = await runtime.generateForCurrentMedia();
    const rawTermBank = readStoredZipEntry(result.zipPath, 'term_bank_1.json').toString('utf8');
    const termBank = JSON.parse(rawTermBank) as Array<
      [string, string, string, string, number, Array<string | Record<string, unknown>>, number, string]
    >;

    // The given name alone must be present as a katakana term with a hiragana reading.
    const kazuma = termBank.find((entry) => entry[0] === 'カズマ');
    assert.ok(kazuma, 'expected katakana alias for romanized name');
    assert.equal(kazuma[1], 'かずま');

    // The full name must be present as a single compact katakana term.
    const fullName = termBank.find((entry) => entry[0] === 'サトウカズマ');
    assert.ok(fullName, 'expected compact full-name katakana alias for romanized name');
    assert.equal(fullName[1], 'さとうかずま');
  } finally {
    globalThis.fetch = originalFetch;
  }
});
|
||||||
|
|
||||||
test('getOrCreateCurrentSnapshot persists and reuses normalized snapshot data', async () => {
|
test('getOrCreateCurrentSnapshot persists and reuses normalized snapshot data', async () => {
|
||||||
const userDataPath = makeTempDir();
|
const userDataPath = makeTempDir();
|
||||||
const originalFetch = globalThis.fetch;
|
const originalFetch = globalThis.fetch;
|
||||||
@@ -336,6 +445,139 @@ test('getOrCreateCurrentSnapshot persists and reuses normalized snapshot data',
|
|||||||
}
|
}
|
||||||
});
|
});
|
||||||
|
|
||||||
|
test('getOrCreateCurrentSnapshot rebuilds snapshots written with an older format version', async () => {
  const userDataPath = makeTempDir();
  const originalFetch = globalThis.fetch;
  // Counts how often each mocked AniList query is hit.
  const queryCounts = { search: 0, characters: 0 };

  const mediaTitle = {
    romaji: 'Kage no Jitsuryokusha ni Naritakute!',
    english: 'The Eminence in Shadow',
    native: '陰の実力者になりたくて!',
  };
  const jsonResponse = (payload: unknown): Response =>
    new Response(JSON.stringify(payload), {
      status: 200,
      headers: { 'content-type': 'application/json' },
    });
  const resolveUrl = (input: string | URL | Request): string => {
    if (typeof input === 'string') {
      return input;
    }
    return input instanceof URL ? input.href : input.url;
  };

  // Stub fetch: answer the media search and the character page query, reject anything else.
  globalThis.fetch = (async (input: string | URL | Request, init?: RequestInit) => {
    const url = resolveUrl(input);
    if (url === GRAPHQL_URL) {
      const { query } = JSON.parse(String(init?.body ?? '{}')) as {
        query?: string;
      };

      if (query?.includes('Page(perPage: 10)')) {
        queryCounts.search += 1;
        return jsonResponse({
          data: {
            Page: {
              media: [{ id: 130298, episodes: 20, title: mediaTitle }],
            },
          },
        });
      }

      if (query?.includes('characters(page: $page')) {
        queryCounts.characters += 1;
        return jsonResponse({
          data: {
            Media: {
              title: mediaTitle,
              characters: {
                pageInfo: { hasNextPage: false },
                edges: [
                  {
                    role: 'MAIN',
                    node: {
                      id: 321,
                      description: 'Alpha is the second-in-command of Shadow Garden.',
                      image: null,
                      name: {
                        full: 'Alpha',
                        native: 'アルファ',
                      },
                    },
                  },
                ],
              },
            },
          },
        });
      }
    }

    throw new Error(`Unexpected fetch URL: ${url}`);
  }) as typeof globalThis.fetch;

  try {
    // Seed the snapshot directory with a stale snapshot at format version 9.
    const snapshotsDir = path.join(userDataPath, 'character-dictionaries', 'snapshots');
    const snapshotPath = path.join(snapshotsDir, 'anilist-130298.json');
    const staleSnapshot = {
      formatVersion: 9,
      mediaId: 130298,
      mediaTitle: 'The Eminence in Shadow',
      entryCount: 1,
      updatedAt: 1_700_000_000_000,
      termEntries: [['stale', '', 'name side', '', 1, ['stale'], 0, '']],
      images: [],
    };
    fs.mkdirSync(snapshotsDir, { recursive: true });
    fs.writeFileSync(snapshotPath, JSON.stringify(staleSnapshot), 'utf8');

    const runtime = createCharacterDictionaryRuntimeService({
      userDataPath,
      getCurrentMediaPath: () => '/tmp/eminence-s01e05.mkv',
      getCurrentMediaTitle: () => 'The Eminence in Shadow - S01E05',
      resolveMediaPathForJimaku: (mediaPath) => mediaPath,
      guessAnilistMediaInfo: async () => ({
        title: 'The Eminence in Shadow',
        episode: 5,
        source: 'fallback',
      }),
      now: () => 1_700_000_000_100,
    });

    const result = await runtime.getOrCreateCurrentSnapshot();

    // The stale snapshot must not be served from cache; both queries run exactly once.
    assert.equal(result.fromCache, false);
    assert.equal(queryCounts.search, 1);
    assert.equal(queryCounts.characters, 1);

    // The file on disk must have been rewritten with a newer version and fresh entries.
    const snapshot = JSON.parse(fs.readFileSync(snapshotPath, 'utf8')) as {
      formatVersion: number;
      termEntries: Array<
        [string, string, string, string, number, Array<string | Record<string, unknown>>, number, string]
      >;
    };
    const storedTerms = snapshot.termEntries.map((entry) => entry[0]);
    assert.equal(snapshot.formatVersion > 9, true);
    assert.equal(storedTerms.includes('アルファ'), true);
    assert.equal(storedTerms.includes('stale'), false);
  } finally {
    globalThis.fetch = originalFetch;
  }
});
|
||||||
|
|
||||||
test('buildMergedDictionary combines stored snapshots into one stable dictionary', async () => {
|
test('buildMergedDictionary combines stored snapshots into one stable dictionary', async () => {
|
||||||
const userDataPath = makeTempDir();
|
const userDataPath = makeTempDir();
|
||||||
const originalFetch = globalThis.fetch;
|
const originalFetch = globalThis.fetch;
|
||||||
|
|||||||
@@ -54,7 +54,7 @@ export type CharacterDictionarySnapshot = {
|
|||||||
images: CharacterDictionarySnapshotImage[];
|
images: CharacterDictionarySnapshotImage[];
|
||||||
};
|
};
|
||||||
|
|
||||||
const CHARACTER_DICTIONARY_FORMAT_VERSION = 9;
|
const CHARACTER_DICTIONARY_FORMAT_VERSION = 10;
|
||||||
const CHARACTER_DICTIONARY_MERGED_TITLE = 'SubMiner Character Dictionary';
|
const CHARACTER_DICTIONARY_MERGED_TITLE = 'SubMiner Character Dictionary';
|
||||||
|
|
||||||
type AniListSearchResponse = {
|
type AniListSearchResponse = {
|
||||||
@@ -238,6 +238,246 @@ function buildReading(term: string): string {
|
|||||||
return katakanaToHiragana(compact);
|
return katakanaToHiragana(compact);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Reports whether `value` consists solely of characters allowed in a romanized
 * name: ASCII letters, macron/circumflex vowels, apostrophes, periods, hyphens
 * and whitespace. The empty string is rejected.
 */
function isRomanizedName(value: string): boolean {
  const romanizedNamePattern = /^[A-Za-zĀĪŪĒŌÂÊÎÔÛāīūēōâêîôû'’.\-\s]+$/;
  return value.match(romanizedNamePattern) !== null;
}
|
||||||
|
|
||||||
|
/**
 * Canonicalizes a romanized name: NFKC-normalizes, lowercases, drops apostrophes,
 * turns periods and hyphens into spaces, expands macron/circumflex vowels into
 * two-letter spellings, and collapses runs of whitespace.
 */
function normalizeRomanizedName(value: string): string {
  return value
    .normalize('NFKC')
    .toLowerCase()
    .replace(/[’']/g, '')
    .replace(/[.\-]/g, ' ')
    .replace(/ā|â/g, 'aa')
    .replace(/ī|î/g, 'ii')
    .replace(/ū|û/g, 'uu')
    .replace(/ē|ê/g, 'ei')
    .replace(/ō|ô/g, 'ou')
    .replace(/\s+/g, ' ')
    .trim();
}

/**
 * Romaji spellings that map to multi-character katakana (yōon and extended kana).
 * These are tried before ROMANIZED_KANA_MONOGRAPHS, first match wins.
 */
const ROMANIZED_KANA_DIGRAPHS: ReadonlyArray<[string, string]> = [
  ['kya', 'キャ'], ['kyu', 'キュ'], ['kyo', 'キョ'],
  ['gya', 'ギャ'], ['gyu', 'ギュ'], ['gyo', 'ギョ'],
  ['sha', 'シャ'], ['shu', 'シュ'], ['sho', 'ショ'],
  ['sya', 'シャ'], ['syu', 'シュ'], ['syo', 'ショ'],
  ['ja', 'ジャ'], ['ju', 'ジュ'], ['jo', 'ジョ'],
  ['jya', 'ジャ'], ['jyu', 'ジュ'], ['jyo', 'ジョ'],
  ['cha', 'チャ'], ['chu', 'チュ'], ['cho', 'チョ'],
  ['tya', 'チャ'], ['tyu', 'チュ'], ['tyo', 'チョ'],
  ['cya', 'チャ'], ['cyu', 'チュ'], ['cyo', 'チョ'],
  ['nya', 'ニャ'], ['nyu', 'ニュ'], ['nyo', 'ニョ'],
  ['hya', 'ヒャ'], ['hyu', 'ヒュ'], ['hyo', 'ヒョ'],
  ['bya', 'ビャ'], ['byu', 'ビュ'], ['byo', 'ビョ'],
  ['pya', 'ピャ'], ['pyu', 'ピュ'], ['pyo', 'ピョ'],
  ['mya', 'ミャ'], ['myu', 'ミュ'], ['myo', 'ミョ'],
  ['rya', 'リャ'], ['ryu', 'リュ'], ['ryo', 'リョ'],
  ['fa', 'ファ'], ['fi', 'フィ'], ['fe', 'フェ'], ['fo', 'フォ'],
  ['fyu', 'フュ'], ['fyo', 'フョ'], ['fya', 'フャ'],
  ['va', 'ヴァ'], ['vi', 'ヴィ'], ['vu', 'ヴ'], ['ve', 'ヴェ'], ['vo', 'ヴォ'],
  ['she', 'シェ'], ['che', 'チェ'], ['je', 'ジェ'],
  ['tsi', 'ツィ'], ['tse', 'ツェ'], ['tsa', 'ツァ'], ['tso', 'ツォ'],
  ['thi', 'ティ'], ['thu', 'テュ'], ['dhi', 'ディ'], ['dhu', 'デュ'],
  ['wi', 'ウィ'], ['we', 'ウェ'], ['wo', 'ウォ'],
];

/**
 * Romaji spellings that map to a single katakana character. Order matters:
 * the bare 'n' entry is last so that 'na', 'ni', ... are matched first.
 */
const ROMANIZED_KANA_MONOGRAPHS: ReadonlyArray<[string, string]> = [
  ['a', 'ア'], ['i', 'イ'], ['u', 'ウ'], ['e', 'エ'], ['o', 'オ'],
  ['ka', 'カ'], ['ki', 'キ'], ['ku', 'ク'], ['ke', 'ケ'], ['ko', 'コ'],
  ['ga', 'ガ'], ['gi', 'ギ'], ['gu', 'グ'], ['ge', 'ゲ'], ['go', 'ゴ'],
  ['sa', 'サ'], ['shi', 'シ'], ['si', 'シ'], ['su', 'ス'], ['se', 'セ'], ['so', 'ソ'],
  ['za', 'ザ'], ['ji', 'ジ'], ['zi', 'ジ'], ['zu', 'ズ'], ['ze', 'ゼ'], ['zo', 'ゾ'],
  ['ta', 'タ'], ['chi', 'チ'], ['ti', 'チ'], ['tsu', 'ツ'], ['tu', 'ツ'], ['te', 'テ'], ['to', 'ト'],
  ['da', 'ダ'], ['de', 'デ'], ['do', 'ド'],
  ['na', 'ナ'], ['ni', 'ニ'], ['nu', 'ヌ'], ['ne', 'ネ'], ['no', 'ノ'],
  ['ha', 'ハ'], ['hi', 'ヒ'], ['fu', 'フ'], ['hu', 'フ'], ['he', 'ヘ'], ['ho', 'ホ'],
  ['ba', 'バ'], ['bi', 'ビ'], ['bu', 'ブ'], ['be', 'ベ'], ['bo', 'ボ'],
  ['pa', 'パ'], ['pi', 'ピ'], ['pu', 'プ'], ['pe', 'ペ'], ['po', 'ポ'],
  ['ma', 'マ'], ['mi', 'ミ'], ['mu', 'ム'], ['me', 'メ'], ['mo', 'モ'],
  ['ya', 'ヤ'], ['yu', 'ユ'], ['yo', 'ヨ'],
  ['ra', 'ラ'], ['ri', 'リ'], ['ru', 'ル'], ['re', 'レ'], ['ro', 'ロ'],
  ['wa', 'ワ'],
  // Shadowed by the digraph 'wo' -> 'ウォ', which is tried first.
  ['wo', 'ヲ'],
  ['n', 'ン'],
];

/**
 * Matches the apostrophe that separates a syllabic "n" from a following vowel
 * or "y" (e.g. "Shin'ichi", "Ken'ya"). The apostrophe is the only thing that
 * distinguishes ン+イ from ニ, so it must be honored before it is stripped.
 */
const SYLLABIC_N_SEPARATOR = /(?<=n)['’](?=[aeiouyāīūēōâêîôû])/i;

const ROMAJI_VOWELS = 'aeiou';

/** Converts one apostrophe-free stretch of romaji to katakana, or `null` if any part is unmappable. */
function romanizedSegmentToKatakana(segment: string): string | null {
  const normalized = normalizeRomanizedName(segment).replace(/\s+/g, '');
  if (!normalized || !/^[a-z]+$/.test(normalized)) {
    return null;
  }

  let output = '';
  let i = 0;
  while (i < normalized.length) {
    const current = normalized.charAt(i);
    const next = normalized.charAt(i + 1); // '' at the end of the string

    // Doubled consonant (other than "nn") -> sokuon, then re-read the second consonant.
    if (current === next && current !== 'n' && !ROMAJI_VOWELS.includes(current)) {
      output += 'ッ';
      i += 1;
      continue;
    }

    // "n" before a consonant other than "y" is the syllabic ン.
    if (current === 'n' && next.length > 0 && next !== 'y' && !ROMAJI_VOWELS.includes(next)) {
      output += 'ン';
      i += 1;
      continue;
    }

    const match =
      ROMANIZED_KANA_DIGRAPHS.find(([romaji]) => normalized.startsWith(romaji, i)) ??
      ROMANIZED_KANA_MONOGRAPHS.find(([romaji]) => normalized.startsWith(romaji, i));
    if (!match) {
      return null;
    }
    output += match[1];
    i += match[0].length;
  }

  return output.length > 0 ? output : null;
}

/**
 * Converts a romanized name (one or more words) to compact katakana.
 *
 * Whitespace, periods and hyphens between words are dropped, so
 * "Satou Kazuma" becomes "サトウカズマ". An apostrophe after "n" and before a
 * vowel or "y" is treated as a syllable boundary, so "Shin'ichi" becomes
 * "シンイチ" rather than "シニチ".
 *
 * @param token - Romanized name to convert.
 * @returns The katakana spelling, or `null` when the input is empty or contains
 *   a sequence with no entry in the kana tables.
 */
function romanizedTokenToKatakana(token: string): string | null {
  let output = '';
  for (const segment of token.split(SYLLABIC_N_SEPARATOR)) {
    const kana = romanizedSegmentToKatakana(segment);
    if (kana === null) {
      return null;
    }
    output += kana;
  }
  return output.length > 0 ? output : null;
}
|
||||||
|
|
||||||
|
/**
 * Collects the katakana spellings of every romanized name in `values`.
 *
 * Each value is trimmed; blank values, values that are not romanized names, and
 * names that cannot be converted are skipped. Duplicates are removed and the
 * result keeps first-seen order.
 */
function addRomanizedKanaAliases(values: Iterable<string>): string[] {
  const uniqueKana = new Set<string>();
  for (const rawValue of values) {
    const candidate = rawValue.trim();
    if (candidate && isRomanizedName(candidate)) {
      const converted = romanizedTokenToKatakana(candidate);
      if (converted) {
        uniqueKana.add(converted);
      }
    }
  }
  return Array.from(uniqueKana);
}
|
||||||
|
|
||||||
function buildNameTerms(character: CharacterRecord): string[] {
|
function buildNameTerms(character: CharacterRecord): string[] {
|
||||||
const base = new Set<string>();
|
const base = new Set<string>();
|
||||||
const rawNames = [character.nativeName, character.fullName];
|
const rawNames = [character.nativeName, character.fullName];
|
||||||
@@ -281,6 +521,13 @@ function buildNameTerms(character: CharacterRecord): string[] {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
for (const alias of addRomanizedKanaAliases(withHonorifics)) {
|
||||||
|
withHonorifics.add(alias);
|
||||||
|
for (const suffix of HONORIFIC_SUFFIXES) {
|
||||||
|
withHonorifics.add(`${alias}${suffix}`);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
return [...withHonorifics].filter((entry) => entry.trim().length > 0);
|
return [...withHonorifics].filter((entry) => entry.trim().length > 0);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user