diff --git a/src/core/services/tokenizer.test.ts b/src/core/services/tokenizer.test.ts
index c69d277..707e4f6 100644
--- a/src/core/services/tokenizer.test.ts
+++ b/src/core/services/tokenizer.test.ts
@@ -2103,6 +2103,48 @@ test('createTokenizerDepsRuntime checks MeCab availability before first tokenize
   assert.equal(second?.[0]?.surface, '仮面');
 });
 
+test('createTokenizerDepsRuntime skips known-word lookup for MeCab POS enrichment tokens', async () => {
+  let knownWordCalls = 0;
+
+  const deps = createTokenizerDepsRuntime({
+    getYomitanExt: () => null,
+    getYomitanParserWindow: () => null,
+    setYomitanParserWindow: () => {},
+    getYomitanParserReadyPromise: () => null,
+    setYomitanParserReadyPromise: () => {},
+    getYomitanParserInitPromise: () => null,
+    setYomitanParserInitPromise: () => {},
+    isKnownWord: () => {
+      knownWordCalls += 1;
+      return true;
+    },
+    getKnownWordMatchMode: () => 'headword',
+    getJlptLevel: () => null,
+    getMecabTokenizer: () => ({
+      tokenize: async () => [
+        {
+          word: '仮面',
+          partOfSpeech: PartOfSpeech.noun,
+          pos1: '名詞',
+          pos2: '一般',
+          pos3: '',
+          pos4: '',
+          inflectionType: '',
+          inflectionForm: '',
+          headword: '仮面',
+          katakanaReading: 'カメン',
+          pronunciation: 'カメン',
+        },
+      ],
+    }),
+  });
+
+  const tokens = await deps.tokenizeWithMecab('仮面');
+
+  assert.equal(knownWordCalls, 0);
+  assert.equal(tokens?.[0]?.isKnown, false);
+});
+
 test('tokenizeSubtitle uses async MeCab enrichment override when provided', async () => {
   const result = await tokenizeSubtitle(
     '猫',
diff --git a/src/core/services/tokenizer.ts b/src/core/services/tokenizer.ts
index 210e8f7..c808642 100644
--- a/src/core/services/tokenizer.ts
+++ b/src/core/services/tokenizer.ts
@@ -211,8 +211,7 @@ export function createTokenizerDepsRuntime(
         return null;
       }
 
-      const isKnownWordLookup = options.getNPlusOneEnabled?.() === false ? () => false : options.isKnownWord;
-      return mergeTokens(rawTokens, isKnownWordLookup, options.getKnownWordMatchMode());
+      return mergeTokens(rawTokens, options.isKnownWord, options.getKnownWordMatchMode(), false);
     },
     enrichTokensWithMecab: async (tokens, mecabTokens) =>
       enrichTokensWithMecabAsync(tokens, mecabTokens),
diff --git a/src/token-merger.ts b/src/token-merger.ts
index f333b01..f26470d 100644
--- a/src/token-merger.ts
+++ b/src/token-merger.ts
@@ -168,6 +168,7 @@ export function mergeTokens(
   tokens: Token[],
   isKnownWord: (text: string) => boolean = () => false,
   knownWordMatchMode: 'headword' | 'surface' = 'headword',
+  shouldLookupKnownWords = true,
 ): MergedToken[] {
   if (!tokens || tokens.length === 0) {
     return [];
   }
@@ -176,6 +177,12 @@
   const result: MergedToken[] = [];
   let charOffset = 0;
   let lastStandaloneToken: Token | null = null;
+  const resolveKnownMatch = (text: string | undefined): boolean => {
+    if (!shouldLookupKnownWords || !text) {
+      return false;
+    }
+    return isKnownWord(text);
+  };
 
   for (const token of tokens) {
     const start = charOffset;
@@ -189,7 +196,6 @@
     }
 
     const tokenReading = ignoreReading(token) ? '' : token.katakanaReading || token.word;
-
     if (shouldMergeToken && result.length > 0) {
       const prev = result.pop()!;
       const mergedHeadword = prev.headword;
@@ -210,7 +216,7 @@
         pos2: prev.pos2 ?? token.pos2,
         pos3: prev.pos3 ?? token.pos3,
         isMerged: true,
-        isKnown: headwordForKnownMatch ? isKnownWord(headwordForKnownMatch) : false,
+        isKnown: resolveKnownMatch(headwordForKnownMatch),
         isNPlusOneTarget: false,
       });
     } else {
@@ -231,7 +237,7 @@
       pos2: token.pos2,
       pos3: token.pos3,
       isMerged: false,
-      isKnown: headwordForKnownMatch ? isKnownWord(headwordForKnownMatch) : false,
+      isKnown: resolveKnownMatch(headwordForKnownMatch),
       isNPlusOneTarget: false,
     });
   }