perf: batch annotation passes into single loop

Collapse applyKnownWordMarking, applyFrequencyMarking, and
applyJlptMarking into a single .map() call. markNPlusOneTargets
remains a separate pass (needs full array with isKnown set).

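Eliminates up to 3 intermediate array allocations and redundant
iterations over the token array: 2 when N+1 is enabled (three .map()
passes become one), and 3 when it is disabled (the trailing
isKnown/isNPlusOneTarget reset pass is folded in as well).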
This commit is contained in:
2026-03-15 12:49:08 -07:00
parent 35946624c2
commit 047b349d05

View File

@@ -43,20 +43,6 @@ function resolveKnownWordText(
return matchMode === 'surface' ? surface : headword; return matchMode === 'surface' ? surface : headword;
} }
/**
 * Returns a copy of every token with its `isKnown` flag refreshed.
 *
 * A token that is already flagged as known keeps its flag. Otherwise the
 * text chosen by `resolveKnownWordText` for the given match mode is passed
 * to `isKnownWord`; when no text is resolved the token is marked unknown.
 *
 * @param tokens - Tokens to annotate; the input objects are not mutated.
 * @param isKnownWord - Predicate consulted with the resolved match text.
 * @param knownWordMatchMode - Match mode forwarded to `resolveKnownWordText`.
 * @returns A new array of shallow token copies carrying the updated flag.
 */
function applyKnownWordMarking(
  tokens: MergedToken[],
  isKnownWord: (text: string) => boolean,
  knownWordMatchMode: NPlusOneMatchMode,
): MergedToken[] {
  return tokens.map((entry) => {
    // Resolved up front for every token, even those already flagged known.
    const lookupText = resolveKnownWordText(entry.surface, entry.headword, knownWordMatchMode);

    let known = entry.isKnown;
    if (!known) {
      // Only consult the predicate when there is text to look up.
      known = lookupText ? isKnownWord(lookupText) : false;
    }

    return { ...entry, isKnown: known };
  });
}
function normalizePos1Tag(pos1: string | undefined): string { function normalizePos1Tag(pos1: string | undefined): string {
return typeof pos1 === 'string' ? pos1.trim() : ''; return typeof pos1 === 'string' ? pos1.trim() : '';
@@ -144,27 +130,6 @@ export function shouldExcludeTokenFromVocabularyPersistence(
); );
} }
/**
 * Returns a copy of every token with its `frequencyRank` normalized.
 *
 * The rank is cleared (`undefined`) when `isFrequencyExcludedByPos` reports
 * the token as excluded, or when the existing rank is not a finite number.
 * Any other rank is floored and clamped to a minimum of 1.
 *
 * @param tokens - Tokens to annotate; the input objects are not mutated.
 * @param pos1Exclusions - Exclusion set forwarded to `isFrequencyExcludedByPos`.
 * @param pos2Exclusions - Exclusion set forwarded to `isFrequencyExcludedByPos`.
 * @returns A new array of shallow token copies carrying the normalized rank.
 */
function applyFrequencyMarking(
  tokens: MergedToken[],
  pos1Exclusions: ReadonlySet<string>,
  pos2Exclusions: ReadonlySet<string>,
): MergedToken[] {
  return tokens.map((entry) => {
    // Stays undefined unless the token is eligible and has a usable rank.
    let normalizedRank: number | undefined;

    if (!isFrequencyExcludedByPos(entry, pos1Exclusions, pos2Exclusions)) {
      const rawRank = entry.frequencyRank;
      if (typeof rawRank === 'number' && Number.isFinite(rawRank)) {
        normalizedRank = Math.max(1, Math.floor(rawRank));
      }
    }

    return { ...entry, frequencyRank: normalizedRank };
  });
}
function getCachedJlptLevel( function getCachedJlptLevel(
lookupText: string, lookupText: string,
@@ -425,24 +390,45 @@ function isJlptEligibleToken(token: MergedToken): boolean {
return true; return true;
} }
function applyJlptMarking( function computeTokenKnownStatus(
tokens: MergedToken[], token: MergedToken,
isKnownWord: (text: string) => boolean,
knownWordMatchMode: NPlusOneMatchMode,
): boolean {
const matchText = resolveKnownWordText(token.surface, token.headword, knownWordMatchMode);
return token.isKnown || (matchText ? isKnownWord(matchText) : false);
}
/**
 * Computes the normalized frequency rank for a single token.
 *
 * @param token - Token whose `frequencyRank` is inspected; it is not mutated.
 * @param pos1Exclusions - Exclusion set forwarded to `isFrequencyExcludedByPos`.
 * @param pos2Exclusions - Exclusion set forwarded to `isFrequencyExcludedByPos`.
 * @returns `undefined` when the token is excluded or its rank is not a finite
 *   number; otherwise the rank floored and clamped to a minimum of 1.
 */
function filterTokenFrequencyRank(
  token: MergedToken,
  pos1Exclusions: ReadonlySet<string>,
  pos2Exclusions: ReadonlySet<string>,
): number | undefined {
  // An excluded token is treated exactly like a token without a rank.
  const rawRank = isFrequencyExcludedByPos(token, pos1Exclusions, pos2Exclusions)
    ? undefined
    : token.frequencyRank;

  return typeof rawRank === 'number' && Number.isFinite(rawRank)
    ? Math.max(1, Math.floor(rawRank))
    : undefined;
}
function computeTokenJlptLevel(
token: MergedToken,
getJlptLevel: (text: string) => JlptLevel | null, getJlptLevel: (text: string) => JlptLevel | null,
): MergedToken[] { ): JlptLevel | undefined {
return tokens.map((token) => {
if (!isJlptEligibleToken(token)) { if (!isJlptEligibleToken(token)) {
return { ...token, jlptLevel: undefined }; return undefined;
} }
const primaryLevel = getCachedJlptLevel(resolveJlptLookupText(token), getJlptLevel); const primaryLevel = getCachedJlptLevel(resolveJlptLookupText(token), getJlptLevel);
const fallbackLevel = const fallbackLevel =
primaryLevel === null ? getCachedJlptLevel(token.surface, getJlptLevel) : null; primaryLevel === null ? getCachedJlptLevel(token.surface, getJlptLevel) : null;
return { const level = primaryLevel ?? fallbackLevel ?? token.jlptLevel;
...token, return level ?? undefined;
jlptLevel: primaryLevel ?? fallbackLevel ?? token.jlptLevel,
};
});
} }
export function annotateTokens( export function annotateTokens(
@@ -453,36 +439,34 @@ export function annotateTokens(
const pos1Exclusions = resolvePos1Exclusions(options); const pos1Exclusions = resolvePos1Exclusions(options);
const pos2Exclusions = resolvePos2Exclusions(options); const pos2Exclusions = resolvePos2Exclusions(options);
const nPlusOneEnabled = options.nPlusOneEnabled !== false; const nPlusOneEnabled = options.nPlusOneEnabled !== false;
const knownMarkedTokens = nPlusOneEnabled
? applyKnownWordMarking(tokens, deps.isKnownWord, deps.knownWordMatchMode)
: tokens.map((token) => ({
...token,
isKnown: false,
isNPlusOneTarget: false,
}));
const frequencyEnabled = options.frequencyEnabled !== false; const frequencyEnabled = options.frequencyEnabled !== false;
const frequencyMarkedTokens = frequencyEnabled
? applyFrequencyMarking(knownMarkedTokens, pos1Exclusions, pos2Exclusions)
: knownMarkedTokens.map((token) => ({
...token,
frequencyRank: undefined,
}));
const jlptEnabled = options.jlptEnabled !== false; const jlptEnabled = options.jlptEnabled !== false;
const jlptMarkedTokens = jlptEnabled
? applyJlptMarking(frequencyMarkedTokens, deps.getJlptLevel) // Single pass: compute known word status, frequency filtering, and JLPT level together
: frequencyMarkedTokens.map((token) => ({ const annotated = tokens.map((token) => {
const isKnown = nPlusOneEnabled
? computeTokenKnownStatus(token, deps.isKnownWord, deps.knownWordMatchMode)
: false;
const frequencyRank = frequencyEnabled
? filterTokenFrequencyRank(token, pos1Exclusions, pos2Exclusions)
: undefined;
const jlptLevel = jlptEnabled
? computeTokenJlptLevel(token, deps.getJlptLevel)
: undefined;
return {
...token, ...token,
jlptLevel: undefined, isKnown,
})); isNPlusOneTarget: nPlusOneEnabled ? token.isNPlusOneTarget : false,
frequencyRank,
jlptLevel,
};
});
if (!nPlusOneEnabled) { if (!nPlusOneEnabled) {
return jlptMarkedTokens.map((token) => ({ return annotated;
...token,
isKnown: false,
isNPlusOneTarget: false,
}));
} }
const minSentenceWordsForNPlusOne = options.minSentenceWordsForNPlusOne; const minSentenceWordsForNPlusOne = options.minSentenceWordsForNPlusOne;
@@ -494,7 +478,7 @@ export function annotateTokens(
: 3; : 3;
return markNPlusOneTargets( return markNPlusOneTargets(
jlptMarkedTokens, annotated,
sanitizedMinSentenceWordsForNPlusOne, sanitizedMinSentenceWordsForNPlusOne,
pos1Exclusions, pos1Exclusions,
pos2Exclusions, pos2Exclusions,