mirror of
https://github.com/ksyasuda/SubMiner.git
synced 2026-03-02 06:22:42 -08:00
feat(anki): add proxy transport and tokenizer annotation controls
This commit is contained in:
@@ -9,13 +9,16 @@ import {
|
||||
FrequencyDictionaryLookup,
|
||||
JlptLevel,
|
||||
} from '../../types';
|
||||
import { annotateTokens } from './tokenizer/annotation-stage';
|
||||
import { enrichTokensWithMecabPos1 } from './tokenizer/parser-enrichment-stage';
|
||||
import { selectYomitanParseTokens } from './tokenizer/parser-selection-stage';
|
||||
import { requestYomitanParseResults } from './tokenizer/yomitan-parser-runtime';
|
||||
|
||||
// Scoped logger for the main-process tokenizer service.
const logger = createLogger('main:tokenizer');
|
||||
|
||||
/**
 * Signature of the MeCab enrichment step: merges MeCab-derived data into
 * already-merged tokens. `mecabTokens` may be null (the MeCab tokenizer
 * elsewhere in this file returns `MergedToken[] | null`).
 */
type MecabTokenEnrichmentFn = (
  tokens: MergedToken[],
  mecabTokens: MergedToken[] | null,
) => Promise<MergedToken[]>;
|
||||
|
||||
export interface TokenizerServiceDeps {
|
||||
getYomitanExt: () => Extension | null;
|
||||
getYomitanParserWindow: () => BrowserWindow | null;
|
||||
@@ -27,12 +30,14 @@ export interface TokenizerServiceDeps {
|
||||
isKnownWord: (text: string) => boolean;
|
||||
getKnownWordMatchMode: () => NPlusOneMatchMode;
|
||||
getJlptLevel: (text: string) => JlptLevel | null;
|
||||
getNPlusOneEnabled?: () => boolean;
|
||||
getJlptEnabled?: () => boolean;
|
||||
getFrequencyDictionaryEnabled?: () => boolean;
|
||||
getFrequencyRank?: FrequencyDictionaryLookup;
|
||||
getMinSentenceWordsForNPlusOne?: () => number;
|
||||
getYomitanGroupDebugEnabled?: () => boolean;
|
||||
tokenizeWithMecab: (text: string) => Promise<MergedToken[] | null>;
|
||||
enrichTokensWithMecab?: MecabTokenEnrichmentFn;
|
||||
}
|
||||
|
||||
interface MecabTokenizerLike {
|
||||
@@ -52,6 +57,7 @@ export interface TokenizerDepsRuntimeOptions {
|
||||
isKnownWord: (text: string) => boolean;
|
||||
getKnownWordMatchMode: () => NPlusOneMatchMode;
|
||||
getJlptLevel: (text: string) => JlptLevel | null;
|
||||
getNPlusOneEnabled?: () => boolean;
|
||||
getJlptEnabled?: () => boolean;
|
||||
getFrequencyDictionaryEnabled?: () => boolean;
|
||||
getFrequencyRank?: FrequencyDictionaryLookup;
|
||||
@@ -60,6 +66,82 @@ export interface TokenizerDepsRuntimeOptions {
|
||||
getMecabTokenizer: () => MecabTokenizerLike | null;
|
||||
}
|
||||
|
||||
/**
 * Resolved feature toggles for the token annotation stage, computed once per
 * tokenize call (see getAnnotationOptions) and threaded through parsing.
 */
interface TokenizerAnnotationOptions {
  // Gates the known-word lookup used for N+1 detection.
  nPlusOneEnabled: boolean;
  // Gates JLPT-level annotation; also triggers MeCab POS enrichment.
  jlptEnabled: boolean;
  // Gates frequency-dictionary annotation; also triggers MeCab POS enrichment.
  frequencyEnabled: boolean;
  // Undefined when the optional getMinSentenceWordsForNPlusOne dep is absent.
  minSentenceWordsForNPlusOne: number | undefined;
}
|
||||
|
||||
// Lazily-created dynamic-import caches: each promise is assigned on first use
// and reused, so every tokenizer stage module is imported at most once.
let parserEnrichmentWorkerRuntimeModulePromise:
  | Promise<typeof import('./tokenizer/parser-enrichment-worker-runtime')>
  | null = null;
let annotationStageModulePromise: Promise<typeof import('./tokenizer/annotation-stage')> | null = null;
let parserEnrichmentFallbackModulePromise:
  | Promise<typeof import('./tokenizer/parser-enrichment-stage')>
  | null = null;
|
||||
function getKnownWordLookup(deps: TokenizerServiceDeps, options: TokenizerAnnotationOptions): (text: string) => boolean {
|
||||
if (!options.nPlusOneEnabled) {
|
||||
return () => false;
|
||||
}
|
||||
return deps.isKnownWord;
|
||||
}
|
||||
|
||||
function needsMecabPosEnrichment(options: TokenizerAnnotationOptions): boolean {
|
||||
return options.jlptEnabled || options.frequencyEnabled;
|
||||
}
|
||||
|
||||
function hasAnyAnnotationEnabled(options: TokenizerAnnotationOptions): boolean {
|
||||
return options.nPlusOneEnabled || options.jlptEnabled || options.frequencyEnabled;
|
||||
}
|
||||
|
||||
/**
 * Enriches merged tokens with MeCab pos1 data, preferring the worker-backed
 * runtime and falling back to the synchronous in-process stage when the
 * worker runtime fails to load or its enrichment call throws.
 *
 * Both dynamic imports are cached in module-level promises so each module is
 * imported at most once. NOTE(review): a rejected worker-runtime import stays
 * cached, so after one load failure every later call takes the fallback path
 * without retrying — presumably intentional; confirm if transient failures
 * should reset the cache.
 */
async function enrichTokensWithMecabAsync(
  tokens: MergedToken[],
  mecabTokens: MergedToken[] | null,
): Promise<MergedToken[]> {
  if (!parserEnrichmentWorkerRuntimeModulePromise) {
    parserEnrichmentWorkerRuntimeModulePromise = import('./tokenizer/parser-enrichment-worker-runtime');
  }

  try {
    const runtime = await parserEnrichmentWorkerRuntimeModulePromise;
    // `await` here (not a bare return) so runtime failures hit the catch below.
    return await runtime.enrichTokensWithMecabPos1Async(tokens, mecabTokens);
  } catch {
    // Worker runtime unavailable or failed — use the synchronous stage.
    if (!parserEnrichmentFallbackModulePromise) {
      parserEnrichmentFallbackModulePromise = import('./tokenizer/parser-enrichment-stage');
    }
    const fallback = await parserEnrichmentFallbackModulePromise;
    return fallback.enrichTokensWithMecabPos1(tokens, mecabTokens);
  }
}
|
||||
|
||||
async function applyAnnotationStage(
|
||||
tokens: MergedToken[],
|
||||
deps: TokenizerServiceDeps,
|
||||
options: TokenizerAnnotationOptions,
|
||||
): Promise<MergedToken[]> {
|
||||
if (!hasAnyAnnotationEnabled(options)) {
|
||||
return tokens;
|
||||
}
|
||||
|
||||
if (!annotationStageModulePromise) {
|
||||
annotationStageModulePromise = import('./tokenizer/annotation-stage');
|
||||
}
|
||||
|
||||
const annotationStage = await annotationStageModulePromise;
|
||||
return annotationStage.annotateTokens(
|
||||
tokens,
|
||||
{
|
||||
isKnownWord: getKnownWordLookup(deps, options),
|
||||
knownWordMatchMode: deps.getKnownWordMatchMode(),
|
||||
getJlptLevel: deps.getJlptLevel,
|
||||
getFrequencyRank: deps.getFrequencyRank,
|
||||
},
|
||||
options,
|
||||
);
|
||||
}
|
||||
|
||||
export function createTokenizerDepsRuntime(
|
||||
options: TokenizerDepsRuntimeOptions,
|
||||
): TokenizerServiceDeps {
|
||||
@@ -76,6 +158,7 @@ export function createTokenizerDepsRuntime(
|
||||
isKnownWord: options.isKnownWord,
|
||||
getKnownWordMatchMode: options.getKnownWordMatchMode,
|
||||
getJlptLevel: options.getJlptLevel,
|
||||
getNPlusOneEnabled: options.getNPlusOneEnabled,
|
||||
getJlptEnabled: options.getJlptEnabled,
|
||||
getFrequencyDictionaryEnabled: options.getFrequencyDictionaryEnabled,
|
||||
getFrequencyRank: options.getFrequencyRank,
|
||||
@@ -104,8 +187,11 @@ export function createTokenizerDepsRuntime(
|
||||
return null;
|
||||
}
|
||||
|
||||
return mergeTokens(rawTokens, options.isKnownWord, options.getKnownWordMatchMode());
|
||||
const isKnownWordLookup = options.getNPlusOneEnabled?.() === false ? () => false : options.isKnownWord;
|
||||
return mergeTokens(rawTokens, isKnownWordLookup, options.getKnownWordMatchMode());
|
||||
},
|
||||
enrichTokensWithMecab: async (tokens, mecabTokens) =>
|
||||
enrichTokensWithMecabAsync(tokens, mecabTokens),
|
||||
};
|
||||
}
|
||||
|
||||
@@ -128,36 +214,19 @@ function logSelectedYomitanGroups(text: string, tokens: MergedToken[]): void {
|
||||
});
|
||||
}
|
||||
|
||||
function getAnnotationOptions(deps: TokenizerServiceDeps): {
|
||||
jlptEnabled: boolean;
|
||||
frequencyEnabled: boolean;
|
||||
minSentenceWordsForNPlusOne: number | undefined;
|
||||
} {
|
||||
function getAnnotationOptions(deps: TokenizerServiceDeps): TokenizerAnnotationOptions {
|
||||
return {
|
||||
nPlusOneEnabled: deps.getNPlusOneEnabled?.() !== false,
|
||||
jlptEnabled: deps.getJlptEnabled?.() !== false,
|
||||
frequencyEnabled: deps.getFrequencyDictionaryEnabled?.() !== false,
|
||||
minSentenceWordsForNPlusOne: deps.getMinSentenceWordsForNPlusOne?.(),
|
||||
};
|
||||
}
|
||||
|
||||
function applyAnnotationStage(tokens: MergedToken[], deps: TokenizerServiceDeps): MergedToken[] {
|
||||
const options = getAnnotationOptions(deps);
|
||||
|
||||
return annotateTokens(
|
||||
tokens,
|
||||
{
|
||||
isKnownWord: deps.isKnownWord,
|
||||
knownWordMatchMode: deps.getKnownWordMatchMode(),
|
||||
getJlptLevel: deps.getJlptLevel,
|
||||
getFrequencyRank: deps.getFrequencyRank,
|
||||
},
|
||||
options,
|
||||
);
|
||||
}
|
||||
|
||||
async function parseWithYomitanInternalParser(
|
||||
text: string,
|
||||
deps: TokenizerServiceDeps,
|
||||
options: TokenizerAnnotationOptions,
|
||||
): Promise<MergedToken[] | null> {
|
||||
const parseResults = await requestYomitanParseResults(text, deps, logger);
|
||||
if (!parseResults) {
|
||||
@@ -166,7 +235,7 @@ async function parseWithYomitanInternalParser(
|
||||
|
||||
const selectedTokens = selectYomitanParseTokens(
|
||||
parseResults,
|
||||
deps.isKnownWord,
|
||||
getKnownWordLookup(deps, options),
|
||||
deps.getKnownWordMatchMode(),
|
||||
);
|
||||
if (!selectedTokens || selectedTokens.length === 0) {
|
||||
@@ -177,9 +246,14 @@ async function parseWithYomitanInternalParser(
|
||||
logSelectedYomitanGroups(text, selectedTokens);
|
||||
}
|
||||
|
||||
if (!needsMecabPosEnrichment(options)) {
|
||||
return selectedTokens;
|
||||
}
|
||||
|
||||
try {
|
||||
const mecabTokens = await deps.tokenizeWithMecab(text);
|
||||
return enrichTokensWithMecabPos1(selectedTokens, mecabTokens);
|
||||
const enrichTokensWithMecab = deps.enrichTokensWithMecab ?? enrichTokensWithMecabAsync;
|
||||
return await enrichTokensWithMecab(selectedTokens, mecabTokens);
|
||||
} catch (err) {
|
||||
const error = err as Error;
|
||||
logger.warn(
|
||||
@@ -207,12 +281,13 @@ export async function tokenizeSubtitle(
|
||||
}
|
||||
|
||||
const tokenizeText = displayText.replace(/\n/g, ' ').replace(/\s+/g, ' ').trim();
|
||||
const annotationOptions = getAnnotationOptions(deps);
|
||||
|
||||
const yomitanTokens = await parseWithYomitanInternalParser(tokenizeText, deps);
|
||||
const yomitanTokens = await parseWithYomitanInternalParser(tokenizeText, deps, annotationOptions);
|
||||
if (yomitanTokens && yomitanTokens.length > 0) {
|
||||
return {
|
||||
text: displayText,
|
||||
tokens: applyAnnotationStage(yomitanTokens, deps),
|
||||
tokens: await applyAnnotationStage(yomitanTokens, deps, annotationOptions),
|
||||
};
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user