feat(anki): add proxy transport and tokenizer annotation controls

This commit is contained in:
2026-02-27 21:25:26 -08:00
parent 1c70e486fe
commit 8aa2a45c7c
26 changed files with 1453 additions and 60 deletions

View File

@@ -9,13 +9,16 @@ import {
FrequencyDictionaryLookup,
JlptLevel,
} from '../../types';
import { annotateTokens } from './tokenizer/annotation-stage';
import { enrichTokensWithMecabPos1 } from './tokenizer/parser-enrichment-stage';
import { selectYomitanParseTokens } from './tokenizer/parser-selection-stage';
import { requestYomitanParseResults } from './tokenizer/yomitan-parser-runtime';
const logger = createLogger('main:tokenizer');
/**
 * Signature of a function that combines already-selected tokens with the
 * result of a MeCab tokenization pass.
 *
 * `mecabTokens` may be `null`; `tokenizeWithMecab` is typed as resolving to
 * `MergedToken[] | null`, and that value is forwarded here unchanged.
 * The function resolves to the token list to use from that point on.
 */
type MecabTokenEnrichmentFn = (
tokens: MergedToken[],
mecabTokens: MergedToken[] | null,
) => Promise<MergedToken[]>;
export interface TokenizerServiceDeps {
getYomitanExt: () => Extension | null;
getYomitanParserWindow: () => BrowserWindow | null;
@@ -27,12 +30,14 @@ export interface TokenizerServiceDeps {
isKnownWord: (text: string) => boolean;
getKnownWordMatchMode: () => NPlusOneMatchMode;
getJlptLevel: (text: string) => JlptLevel | null;
getNPlusOneEnabled?: () => boolean;
getJlptEnabled?: () => boolean;
getFrequencyDictionaryEnabled?: () => boolean;
getFrequencyRank?: FrequencyDictionaryLookup;
getMinSentenceWordsForNPlusOne?: () => number;
getYomitanGroupDebugEnabled?: () => boolean;
tokenizeWithMecab: (text: string) => Promise<MergedToken[] | null>;
enrichTokensWithMecab?: MecabTokenEnrichmentFn;
}
interface MecabTokenizerLike {
@@ -52,6 +57,7 @@ export interface TokenizerDepsRuntimeOptions {
isKnownWord: (text: string) => boolean;
getKnownWordMatchMode: () => NPlusOneMatchMode;
getJlptLevel: (text: string) => JlptLevel | null;
getNPlusOneEnabled?: () => boolean;
getJlptEnabled?: () => boolean;
getFrequencyDictionaryEnabled?: () => boolean;
getFrequencyRank?: FrequencyDictionaryLookup;
@@ -60,6 +66,82 @@ export interface TokenizerDepsRuntimeOptions {
getMecabTokenizer: () => MecabTokenizerLike | null;
}
/**
 * Snapshot of which annotation features are active for one tokenize call.
 * Built by `getAnnotationOptions`, where each boolean is computed as
 * `getter?.() !== false` (so a missing getter yields `true`).
 */
interface TokenizerAnnotationOptions {
// When false, `getKnownWordLookup` substitutes a predicate that always returns false.
nPlusOneEnabled: boolean;
// Either of the next two flags being true makes `needsMecabPosEnrichment` return true.
jlptEnabled: boolean;
frequencyEnabled: boolean;
// Taken verbatim from `deps.getMinSentenceWordsForNPlusOne?.()`; `undefined` when that getter is absent.
// NOTE(review): how the annotation stage interprets this value is not visible here — confirm there.
minSentenceWordsForNPlusOne: number | undefined;
}
// Cached promises for dynamically imported modules. Each starts as `null` and
// is assigned the `import()` promise the first time the owning function needs
// it; later calls await the same promise instead of importing again.

// Set by `enrichTokensWithMecabAsync` before it tries the worker-runtime path.
let parserEnrichmentWorkerRuntimeModulePromise:
| Promise<typeof import('./tokenizer/parser-enrichment-worker-runtime')>
| null = null;
// Set by the async `applyAnnotationStage` once at least one annotation is enabled.
let annotationStageModulePromise: Promise<typeof import('./tokenizer/annotation-stage')> | null = null;
// Set by `enrichTokensWithMecabAsync` only inside its catch branch.
let parserEnrichmentFallbackModulePromise:
| Promise<typeof import('./tokenizer/parser-enrichment-stage')>
| null = null;
/**
 * Chooses the known-word predicate to pass to later stages.
 *
 * @param deps - Service dependencies; only `isKnownWord` is read.
 * @param options - Annotation flags; only `nPlusOneEnabled` is read.
 * @returns `deps.isKnownWord` itself when N+1 is enabled, otherwise a fresh
 *   predicate that reports every text as unknown.
 */
function getKnownWordLookup(
  deps: TokenizerServiceDeps,
  options: TokenizerAnnotationOptions,
): (text: string) => boolean {
  const neverKnown = (): boolean => false;
  return options.nPlusOneEnabled ? deps.isKnownWord : neverKnown;
}
/**
 * Reports whether the MeCab enrichment step should run for these options.
 *
 * @param options - Annotation flags; `jlptEnabled` and `frequencyEnabled` are read.
 * @returns `true` when JLPT or frequency annotation is enabled, else `false`.
 */
function needsMecabPosEnrichment(options: TokenizerAnnotationOptions): boolean {
  if (options.jlptEnabled) {
    return true;
  }
  return options.frequencyEnabled;
}
/**
 * Reports whether at least one annotation feature is switched on.
 *
 * @param options - Annotation flags; all three boolean flags are read.
 * @returns `true` when N+1, JLPT, or frequency annotation is enabled, else `false`.
 */
function hasAnyAnnotationEnabled(options: TokenizerAnnotationOptions): boolean {
  if (options.nPlusOneEnabled) {
    return true;
  }
  if (options.jlptEnabled) {
    return true;
  }
  return options.frequencyEnabled;
}
/**
 * Enriches tokens via the worker-runtime module, falling back to the
 * in-process enrichment stage if that path throws.
 *
 * Both modules are loaded with dynamic `import()` on first use and the
 * resulting promise is cached in a module-level variable.
 *
 * @param tokens - Tokens to enrich.
 * @param mecabTokens - MeCab tokenization result, or `null`.
 * @returns The enriched tokens from whichever path succeeded.
 */
async function enrichTokensWithMecabAsync(
  tokens: MergedToken[],
  mecabTokens: MergedToken[] | null,
): Promise<MergedToken[]> {
  if (parserEnrichmentWorkerRuntimeModulePromise === null) {
    parserEnrichmentWorkerRuntimeModulePromise = import(
      './tokenizer/parser-enrichment-worker-runtime'
    );
  }

  try {
    const workerModule = await parserEnrichmentWorkerRuntimeModulePromise;
    // Awaited inside the try so a rejection from the worker path also triggers the fallback.
    const enriched = await workerModule.enrichTokensWithMecabPos1Async(tokens, mecabTokens);
    return enriched;
  } catch {
    // Any failure above (module load or enrichment call) is intentionally
    // discarded; the fallback module is only imported once this branch is hit.
    if (parserEnrichmentFallbackModulePromise === null) {
      parserEnrichmentFallbackModulePromise = import('./tokenizer/parser-enrichment-stage');
    }
    const inProcessModule = await parserEnrichmentFallbackModulePromise;
    return inProcessModule.enrichTokensWithMecabPos1(tokens, mecabTokens);
  }
}
/**
 * Runs the annotation stage over `tokens`, loading its module on demand.
 *
 * When no annotation feature is enabled the input array is returned as-is
 * and the annotation-stage module is never imported.
 *
 * @param tokens - Tokens to annotate.
 * @param deps - Service dependencies supplying the lookup callbacks.
 * @param options - Annotation flags, forwarded to `annotateTokens`.
 * @returns The result of `annotateTokens`, or `tokens` itself when nothing is enabled.
 */
async function applyAnnotationStage(
  tokens: MergedToken[],
  deps: TokenizerServiceDeps,
  options: TokenizerAnnotationOptions,
): Promise<MergedToken[]> {
  const anythingEnabled = hasAnyAnnotationEnabled(options);
  if (!anythingEnabled) {
    return tokens;
  }

  if (annotationStageModulePromise === null) {
    annotationStageModulePromise = import('./tokenizer/annotation-stage');
  }
  const stageModule = await annotationStageModulePromise;

  // Built only after the module has loaded, so the dep getters are not
  // invoked if the import rejects.
  const lookups = {
    isKnownWord: getKnownWordLookup(deps, options),
    knownWordMatchMode: deps.getKnownWordMatchMode(),
    getJlptLevel: deps.getJlptLevel,
    getFrequencyRank: deps.getFrequencyRank,
  };
  return stageModule.annotateTokens(tokens, lookups, options);
}
export function createTokenizerDepsRuntime(
options: TokenizerDepsRuntimeOptions,
): TokenizerServiceDeps {
@@ -76,6 +158,7 @@ export function createTokenizerDepsRuntime(
isKnownWord: options.isKnownWord,
getKnownWordMatchMode: options.getKnownWordMatchMode,
getJlptLevel: options.getJlptLevel,
getNPlusOneEnabled: options.getNPlusOneEnabled,
getJlptEnabled: options.getJlptEnabled,
getFrequencyDictionaryEnabled: options.getFrequencyDictionaryEnabled,
getFrequencyRank: options.getFrequencyRank,
@@ -104,8 +187,11 @@ export function createTokenizerDepsRuntime(
return null;
}
return mergeTokens(rawTokens, options.isKnownWord, options.getKnownWordMatchMode());
const isKnownWordLookup = options.getNPlusOneEnabled?.() === false ? () => false : options.isKnownWord;
return mergeTokens(rawTokens, isKnownWordLookup, options.getKnownWordMatchMode());
},
enrichTokensWithMecab: async (tokens, mecabTokens) =>
enrichTokensWithMecabAsync(tokens, mecabTokens),
};
}
@@ -128,36 +214,19 @@ function logSelectedYomitanGroups(text: string, tokens: MergedToken[]): void {
});
}
function getAnnotationOptions(deps: TokenizerServiceDeps): {
jlptEnabled: boolean;
frequencyEnabled: boolean;
minSentenceWordsForNPlusOne: number | undefined;
} {
function getAnnotationOptions(deps: TokenizerServiceDeps): TokenizerAnnotationOptions {
return {
nPlusOneEnabled: deps.getNPlusOneEnabled?.() !== false,
jlptEnabled: deps.getJlptEnabled?.() !== false,
frequencyEnabled: deps.getFrequencyDictionaryEnabled?.() !== false,
minSentenceWordsForNPlusOne: deps.getMinSentenceWordsForNPlusOne?.(),
};
}
/**
 * Synchronous annotation pass: derives the annotation options from `deps`
 * via `getAnnotationOptions` and hands the tokens to the statically imported
 * `annotateTokens`, always passing `deps.isKnownWord` as the known-word lookup.
 *
 * NOTE(review): an async, three-argument function with the same name also
 * appears earlier in this view; presumably this two-argument form is the
 * pre-change side of the diff — confirm before relying on it.
 */
function applyAnnotationStage(tokens: MergedToken[], deps: TokenizerServiceDeps): MergedToken[] {
const options = getAnnotationOptions(deps);
return annotateTokens(
tokens,
{
isKnownWord: deps.isKnownWord,
knownWordMatchMode: deps.getKnownWordMatchMode(),
getJlptLevel: deps.getJlptLevel,
getFrequencyRank: deps.getFrequencyRank,
},
options,
);
}
async function parseWithYomitanInternalParser(
text: string,
deps: TokenizerServiceDeps,
options: TokenizerAnnotationOptions,
): Promise<MergedToken[] | null> {
const parseResults = await requestYomitanParseResults(text, deps, logger);
if (!parseResults) {
@@ -166,7 +235,7 @@ async function parseWithYomitanInternalParser(
const selectedTokens = selectYomitanParseTokens(
parseResults,
deps.isKnownWord,
getKnownWordLookup(deps, options),
deps.getKnownWordMatchMode(),
);
if (!selectedTokens || selectedTokens.length === 0) {
@@ -177,9 +246,14 @@ async function parseWithYomitanInternalParser(
logSelectedYomitanGroups(text, selectedTokens);
}
if (!needsMecabPosEnrichment(options)) {
return selectedTokens;
}
try {
const mecabTokens = await deps.tokenizeWithMecab(text);
return enrichTokensWithMecabPos1(selectedTokens, mecabTokens);
const enrichTokensWithMecab = deps.enrichTokensWithMecab ?? enrichTokensWithMecabAsync;
return await enrichTokensWithMecab(selectedTokens, mecabTokens);
} catch (err) {
const error = err as Error;
logger.warn(
@@ -207,12 +281,13 @@ export async function tokenizeSubtitle(
}
const tokenizeText = displayText.replace(/\n/g, ' ').replace(/\s+/g, ' ').trim();
const annotationOptions = getAnnotationOptions(deps);
const yomitanTokens = await parseWithYomitanInternalParser(tokenizeText, deps);
const yomitanTokens = await parseWithYomitanInternalParser(tokenizeText, deps, annotationOptions);
if (yomitanTokens && yomitanTokens.length > 0) {
return {
text: displayText,
tokens: applyAnnotationStage(yomitanTokens, deps),
tokens: await applyAnnotationStage(yomitanTokens, deps, annotationOptions),
};
}