perf(tokenizer): skip known-word lookup in MeCab POS enrichment

This commit is contained in:
2026-03-02 01:38:37 -08:00
parent cde231b1ff
commit 83f13df627
3 changed files with 52 additions and 5 deletions

View File

@@ -2103,6 +2103,48 @@ test('createTokenizerDepsRuntime checks MeCab availability before first tokenize
assert.equal(second?.[0]?.surface, '仮面');
});
// Checks that tokenizeWithMecab never invokes the injected isKnownWord
// callback. The stub always answers `true`, so the resulting token being
// reported as not known also shows that no lookup result was applied to it.
test('createTokenizerDepsRuntime skips known-word lookup for MeCab POS enrichment tokens', async () => {
  const noop = () => {};
  const returnsNull = () => null;

  // Builds the single raw token the stubbed MeCab tokenizer hands back.
  const buildRawMecabToken = () => ({
    word: '仮面',
    partOfSpeech: PartOfSpeech.noun,
    pos1: '名詞',
    pos2: '一般',
    pos3: '',
    pos4: '',
    inflectionType: '',
    inflectionForm: '',
    headword: '仮面',
    katakanaReading: 'カメン',
    pronunciation: 'カメン',
  });

  // Counts every known-word lookup so the test can prove none happened.
  let lookupCount = 0;
  const countingIsKnownWord = () => {
    lookupCount += 1;
    return true;
  };

  const stubMecabTokenizer = {
    tokenize: async () => [buildRawMecabToken()],
  };

  const runtime = createTokenizerDepsRuntime({
    getYomitanExt: returnsNull,
    getYomitanParserWindow: returnsNull,
    setYomitanParserWindow: noop,
    getYomitanParserReadyPromise: returnsNull,
    setYomitanParserReadyPromise: noop,
    getYomitanParserInitPromise: returnsNull,
    setYomitanParserInitPromise: noop,
    isKnownWord: countingIsKnownWord,
    getKnownWordMatchMode: () => 'headword',
    getJlptLevel: returnsNull,
    getMecabTokenizer: () => stubMecabTokenizer,
  });

  const mergedTokens = await runtime.tokenizeWithMecab('仮面');
  const firstToken = mergedTokens?.[0];

  assert.equal(lookupCount, 0);
  assert.equal(firstToken?.isKnown, false);
});
test('tokenizeSubtitle uses async MeCab enrichment override when provided', async () => {
const result = await tokenizeSubtitle(
'猫',

View File

@@ -211,8 +211,7 @@ export function createTokenizerDepsRuntime(
return null;
}
const isKnownWordLookup = options.getNPlusOneEnabled?.() === false ? () => false : options.isKnownWord;
return mergeTokens(rawTokens, isKnownWordLookup, options.getKnownWordMatchMode());
return mergeTokens(rawTokens, options.isKnownWord, options.getKnownWordMatchMode(), false);
},
enrichTokensWithMecab: async (tokens, mecabTokens) =>
enrichTokensWithMecabAsync(tokens, mecabTokens),