mirror of
https://github.com/ksyasuda/SubMiner.git
synced 2026-03-02 06:22:42 -08:00
perf(tokenizer): skip known-word lookup in MeCab POS enrichment
This commit is contained in:
@@ -2103,6 +2103,48 @@ test('createTokenizerDepsRuntime checks MeCab availability before first tokenize
|
|||||||
assert.equal(second?.[0]?.surface, '仮面');
|
assert.equal(second?.[0]?.surface, '仮面');
|
||||||
});
|
});
|
||||||
|
|
||||||
|
test('createTokenizerDepsRuntime skips known-word lookup for MeCab POS enrichment tokens', async () => {
|
||||||
|
let knownWordCalls = 0;
|
||||||
|
|
||||||
|
const deps = createTokenizerDepsRuntime({
|
||||||
|
getYomitanExt: () => null,
|
||||||
|
getYomitanParserWindow: () => null,
|
||||||
|
setYomitanParserWindow: () => {},
|
||||||
|
getYomitanParserReadyPromise: () => null,
|
||||||
|
setYomitanParserReadyPromise: () => {},
|
||||||
|
getYomitanParserInitPromise: () => null,
|
||||||
|
setYomitanParserInitPromise: () => {},
|
||||||
|
isKnownWord: () => {
|
||||||
|
knownWordCalls += 1;
|
||||||
|
return true;
|
||||||
|
},
|
||||||
|
getKnownWordMatchMode: () => 'headword',
|
||||||
|
getJlptLevel: () => null,
|
||||||
|
getMecabTokenizer: () => ({
|
||||||
|
tokenize: async () => [
|
||||||
|
{
|
||||||
|
word: '仮面',
|
||||||
|
partOfSpeech: PartOfSpeech.noun,
|
||||||
|
pos1: '名詞',
|
||||||
|
pos2: '一般',
|
||||||
|
pos3: '',
|
||||||
|
pos4: '',
|
||||||
|
inflectionType: '',
|
||||||
|
inflectionForm: '',
|
||||||
|
headword: '仮面',
|
||||||
|
katakanaReading: 'カメン',
|
||||||
|
pronunciation: 'カメン',
|
||||||
|
},
|
||||||
|
],
|
||||||
|
}),
|
||||||
|
});
|
||||||
|
|
||||||
|
const tokens = await deps.tokenizeWithMecab('仮面');
|
||||||
|
|
||||||
|
assert.equal(knownWordCalls, 0);
|
||||||
|
assert.equal(tokens?.[0]?.isKnown, false);
|
||||||
|
});
|
||||||
|
|
||||||
test('tokenizeSubtitle uses async MeCab enrichment override when provided', async () => {
|
test('tokenizeSubtitle uses async MeCab enrichment override when provided', async () => {
|
||||||
const result = await tokenizeSubtitle(
|
const result = await tokenizeSubtitle(
|
||||||
'猫',
|
'猫',
|
||||||
|
|||||||
@@ -211,8 +211,7 @@ export function createTokenizerDepsRuntime(
|
|||||||
return null;
|
return null;
|
||||||
}
|
}
|
||||||
|
|
||||||
const isKnownWordLookup = options.getNPlusOneEnabled?.() === false ? () => false : options.isKnownWord;
|
return mergeTokens(rawTokens, options.isKnownWord, options.getKnownWordMatchMode(), false);
|
||||||
return mergeTokens(rawTokens, isKnownWordLookup, options.getKnownWordMatchMode());
|
|
||||||
},
|
},
|
||||||
enrichTokensWithMecab: async (tokens, mecabTokens) =>
|
enrichTokensWithMecab: async (tokens, mecabTokens) =>
|
||||||
enrichTokensWithMecabAsync(tokens, mecabTokens),
|
enrichTokensWithMecabAsync(tokens, mecabTokens),
|
||||||
|
|||||||
@@ -168,6 +168,7 @@ export function mergeTokens(
|
|||||||
tokens: Token[],
|
tokens: Token[],
|
||||||
isKnownWord: (text: string) => boolean = () => false,
|
isKnownWord: (text: string) => boolean = () => false,
|
||||||
knownWordMatchMode: 'headword' | 'surface' = 'headword',
|
knownWordMatchMode: 'headword' | 'surface' = 'headword',
|
||||||
|
shouldLookupKnownWords = true,
|
||||||
): MergedToken[] {
|
): MergedToken[] {
|
||||||
if (!tokens || tokens.length === 0) {
|
if (!tokens || tokens.length === 0) {
|
||||||
return [];
|
return [];
|
||||||
@@ -176,6 +177,12 @@ export function mergeTokens(
|
|||||||
const result: MergedToken[] = [];
|
const result: MergedToken[] = [];
|
||||||
let charOffset = 0;
|
let charOffset = 0;
|
||||||
let lastStandaloneToken: Token | null = null;
|
let lastStandaloneToken: Token | null = null;
|
||||||
|
const resolveKnownMatch = (text: string | undefined): boolean => {
|
||||||
|
if (!shouldLookupKnownWords || !text) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
return isKnownWord(text);
|
||||||
|
};
|
||||||
|
|
||||||
for (const token of tokens) {
|
for (const token of tokens) {
|
||||||
const start = charOffset;
|
const start = charOffset;
|
||||||
@@ -189,7 +196,6 @@ export function mergeTokens(
|
|||||||
}
|
}
|
||||||
|
|
||||||
const tokenReading = ignoreReading(token) ? '' : token.katakanaReading || token.word;
|
const tokenReading = ignoreReading(token) ? '' : token.katakanaReading || token.word;
|
||||||
|
|
||||||
if (shouldMergeToken && result.length > 0) {
|
if (shouldMergeToken && result.length > 0) {
|
||||||
const prev = result.pop()!;
|
const prev = result.pop()!;
|
||||||
const mergedHeadword = prev.headword;
|
const mergedHeadword = prev.headword;
|
||||||
@@ -210,7 +216,7 @@ export function mergeTokens(
|
|||||||
pos2: prev.pos2 ?? token.pos2,
|
pos2: prev.pos2 ?? token.pos2,
|
||||||
pos3: prev.pos3 ?? token.pos3,
|
pos3: prev.pos3 ?? token.pos3,
|
||||||
isMerged: true,
|
isMerged: true,
|
||||||
isKnown: headwordForKnownMatch ? isKnownWord(headwordForKnownMatch) : false,
|
isKnown: resolveKnownMatch(headwordForKnownMatch),
|
||||||
isNPlusOneTarget: false,
|
isNPlusOneTarget: false,
|
||||||
});
|
});
|
||||||
} else {
|
} else {
|
||||||
@@ -231,7 +237,7 @@ export function mergeTokens(
|
|||||||
pos2: token.pos2,
|
pos2: token.pos2,
|
||||||
pos3: token.pos3,
|
pos3: token.pos3,
|
||||||
isMerged: false,
|
isMerged: false,
|
||||||
isKnown: headwordForKnownMatch ? isKnownWord(headwordForKnownMatch) : false,
|
isKnown: resolveKnownMatch(headwordForKnownMatch),
|
||||||
isNPlusOneTarget: false,
|
isNPlusOneTarget: false,
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|||||||
Reference in New Issue
Block a user