From 1efc0f8650de2927947ab7b6eec8eeaaa650af88 Mon Sep 17 00:00:00 2001 From: sudacode Date: Thu, 19 Feb 2026 19:03:50 -0800 Subject: [PATCH] fix(tokenizer): restore n+1 highlighting with mecab pos enrichment --- src/core/services/tokenizer.test.ts | 313 ++++++++++++++++++++++++++++ src/core/services/tokenizer.ts | 115 +++++++++- src/token-merger.ts | 5 + 3 files changed, 431 insertions(+), 2 deletions(-) diff --git a/src/core/services/tokenizer.test.ts b/src/core/services/tokenizer.test.ts index a7d717b..016db68 100644 --- a/src/core/services/tokenizer.test.ts +++ b/src/core/services/tokenizer.test.ts @@ -1583,6 +1583,266 @@ test('tokenizeSubtitle applies N+1 target marking to Yomitan results', async () assert.equal(result.tokens?.[1]?.isNPlusOneTarget, false); }); +test('tokenizeSubtitle ignores Yomitan functional tokens when evaluating N+1 candidates', async () => { + const parserWindow = { + isDestroyed: () => false, + webContents: { + executeJavaScript: async () => [ + { + source: 'scanning-parser', + index: 0, + content: [ + [{ text: '私', reading: 'わたし', headwords: [[{ term: '私' }]] }], + [{ text: 'も', reading: 'も', headwords: [[{ term: 'も' }]] }], + [{ text: 'あの', reading: 'あの', headwords: [[{ term: 'あの' }]] }], + [{ text: '仮面', reading: 'かめん', headwords: [[{ term: '仮面' }]] }], + [{ text: 'が', reading: 'が', headwords: [[{ term: 'が' }]] }], + [{ text: '欲しい', reading: 'ほしい', headwords: [[{ term: '欲しい' }]] }], + [{ text: 'です', reading: 'です', headwords: [[{ term: 'です' }]] }], + ], + }, + ], + }, + } as unknown as Electron.BrowserWindow; + + const result = await tokenizeSubtitle( + '私も あの仮面が欲しいです', + makeDeps({ + getYomitanExt: () => ({ id: 'dummy-ext' }) as any, + getYomitanParserWindow: () => parserWindow, + tokenizeWithMecab: async () => [ + { + surface: '私', + reading: 'ワタシ', + headword: '私', + startPos: 0, + endPos: 1, + partOfSpeech: PartOfSpeech.noun, + pos1: '名詞', + isMerged: false, + isKnown: true, + isNPlusOneTarget: false, + }, + { + surface: 'も', + reading: 'モ', + headword: 'も', + startPos: 1, + endPos: 2, + partOfSpeech: PartOfSpeech.particle, + pos1: '助詞', + isMerged: false, + isKnown: false, + isNPlusOneTarget: false, + }, + { + surface: 'あの', + reading: 'アノ', + headword: 'あの', + startPos: 2, + endPos: 4, + partOfSpeech: PartOfSpeech.other, + pos1: '連体詞', + isMerged: false, + isKnown: true, + isNPlusOneTarget: false, + }, + { + surface: '仮面', + reading: 'カメン', + headword: '仮面', + startPos: 4, + endPos: 6, + partOfSpeech: PartOfSpeech.noun, + pos1: '名詞', + isMerged: false, + isKnown: false, + isNPlusOneTarget: false, + }, + { + surface: 'が', + reading: 'ガ', + headword: 'が', + startPos: 6, + endPos: 7, + partOfSpeech: PartOfSpeech.particle, + pos1: '助詞', + isMerged: false, + isKnown: false, + isNPlusOneTarget: false, + }, + { + surface: '欲しい', + reading: 'ホシイ', + headword: '欲しい', + startPos: 7, + endPos: 10, + partOfSpeech: PartOfSpeech.i_adjective, + pos1: '形容詞', + isMerged: false, + isKnown: true, + isNPlusOneTarget: false, + }, + { + surface: 'です', + reading: 'デス', + headword: 'です', + startPos: 10, + endPos: 12, + partOfSpeech: PartOfSpeech.bound_auxiliary, + pos1: '助動詞', + isMerged: false, + isKnown: false, + isNPlusOneTarget: false, + }, + ], + isKnownWord: (text) => text === '私' || text === 'あの' || text === '欲しい', + }), + ); + + const targets = result.tokens?.filter((token) => token.isNPlusOneTarget) ?? []; + assert.equal(targets.length, 1); + assert.equal(targets[0]?.surface, '仮面'); +}); + +test('tokenizeSubtitle keeps correct MeCab pos1 enrichment when Yomitan offsets skip spaces', async () => { + const parserWindow = { + isDestroyed: () => false, + webContents: { + executeJavaScript: async () => [ + { + source: 'scanning-parser', + index: 0, + content: [ + [{ text: '私', reading: 'わたし', headwords: [[{ term: '私' }]] }], + [{ text: 'も', reading: 'も', headwords: [[{ term: 'も' }]] }], + [{ text: 'あの', reading: 'あの', headwords: [[{ term: 'あの' }]] }], + [{ text: '仮面', reading: 'かめん', headwords: [[{ term: '仮面' }]] }], + [{ text: 'が', reading: 'が', headwords: [[{ term: 'が' }]] }], + [{ text: '欲しい', reading: 'ほしい', headwords: [[{ term: '欲しい' }]] }], + [{ text: 'です', reading: 'です', headwords: [[{ term: 'です' }]] }], + ], + }, + ], + }, + } as unknown as Electron.BrowserWindow; + + const result = await tokenizeSubtitle( + '私も あの仮面が欲しいです', + makeDeps({ + getYomitanExt: () => ({ id: 'dummy-ext' }) as any, + getYomitanParserWindow: () => parserWindow, + tokenizeWithMecab: async () => [ + { + surface: '私', + reading: 'ワタシ', + headword: '私', + startPos: 0, + endPos: 1, + partOfSpeech: PartOfSpeech.noun, + pos1: '名詞', + isMerged: false, + isKnown: true, + isNPlusOneTarget: false, + }, + { + surface: 'も', + reading: 'モ', + headword: 'も', + startPos: 1, + endPos: 2, + partOfSpeech: PartOfSpeech.particle, + pos1: '助詞', + isMerged: false, + isKnown: false, + isNPlusOneTarget: false, + }, + { + surface: ' ', + reading: '', + headword: ' ', + startPos: 2, + endPos: 3, + partOfSpeech: PartOfSpeech.symbol, + pos1: '記号', + isMerged: false, + isKnown: false, + isNPlusOneTarget: false, + }, + { + surface: 'あの', + reading: 'アノ', + headword: 'あの', + startPos: 3, + endPos: 5, + partOfSpeech: PartOfSpeech.other, + pos1: '連体詞', + isMerged: false, + isKnown: true, + isNPlusOneTarget: false, + }, + { + surface: '仮面', + reading: 'カメン', + headword: '仮面', + startPos: 5, + endPos: 7, + partOfSpeech: PartOfSpeech.noun, + pos1: '名詞', + isMerged: false, + isKnown: false, + isNPlusOneTarget: false, + }, + { + surface: 'が', + reading: 'ガ', + headword: 'が', + startPos: 7, + endPos: 8, + partOfSpeech: PartOfSpeech.particle, + pos1: '助詞', + isMerged: false, + isKnown: false, + isNPlusOneTarget: false, + }, + { + surface: '欲しい', + reading: 'ホシイ', + headword: '欲しい', + startPos: 8, + endPos: 11, + partOfSpeech: PartOfSpeech.i_adjective, + pos1: '形容詞', + isMerged: false, + isKnown: true, + isNPlusOneTarget: false, + }, + { + surface: 'です', + reading: 'デス', + headword: 'です', + startPos: 11, + endPos: 13, + partOfSpeech: PartOfSpeech.bound_auxiliary, + pos1: '助動詞', + isMerged: false, + isKnown: false, + isNPlusOneTarget: false, + }, + ], + isKnownWord: (text) => text === '私' || text === 'あの' || text === '欲しい', + }), + ); + + const targets = result.tokens?.filter((token) => token.isNPlusOneTarget) ?? []; + const gaToken = result.tokens?.find((token) => token.surface === 'が'); + const desuToken = result.tokens?.find((token) => token.surface === 'です'); + assert.equal(gaToken?.pos1, '助詞'); + assert.equal(desuToken?.pos1, '助動詞'); + assert.equal(targets.length, 1); + assert.equal(targets[0]?.surface, '仮面'); +}); + test('tokenizeSubtitle does not color 1-2 word sentences by default', async () => { const result = await tokenizeSubtitle( '猫です', @@ -1678,3 +1938,56 @@ test('tokenizeSubtitle checks known words by surface when configured', async () assert.equal(result.text, '猫です'); assert.equal(result.tokens?.[0]?.isKnown, true); }); + +test('createTokenizerDepsRuntime checks MeCab availability before first tokenize call', async () => { + let available = false; + let checkCalls = 0; + + const deps = createTokenizerDepsRuntime({ + getYomitanExt: () => null, + getYomitanParserWindow: () => null, + setYomitanParserWindow: () => {}, + getYomitanParserReadyPromise: () => null, + setYomitanParserReadyPromise: () => {}, + getYomitanParserInitPromise: () => null, + setYomitanParserInitPromise: () => {}, + isKnownWord: () => false, + getKnownWordMatchMode: () => 'headword', + getJlptLevel: () => null, + getMecabTokenizer: () => ({ + getStatus: () => ({ available }), + checkAvailability: async () => { + checkCalls += 1; + available = true; + return true; + }, + tokenize: async () => { + if (!available) { + return null; + } + return [ + { + word: '仮面', + partOfSpeech: PartOfSpeech.noun, + pos1: '名詞', + pos2: '一般', + pos3: '', + pos4: '', + inflectionType: '', + inflectionForm: '', + headword: '仮面', + katakanaReading: 'カメン', + pronunciation: 'カメン', + }, + ]; + }, + }), + }); + + const first = await tokenizeSubtitle('仮面', deps); + const second = await tokenizeSubtitle('仮面', deps); + + assert.equal(checkCalls, 1); + assert.equal(first.tokens?.[0]?.surface, '仮面'); + assert.equal(second.tokens?.[0]?.surface, '仮面'); +}); diff --git a/src/core/services/tokenizer.ts b/src/core/services/tokenizer.ts index b0dd26d..b88a0d2 100644 --- a/src/core/services/tokenizer.ts +++ b/src/core/services/tokenizer.ts @@ -75,6 +75,8 @@ export interface TokenizerServiceDeps { interface MecabTokenizerLike { tokenize: (text: string) => Promise; + checkAvailability?: () => Promise; + getStatus?: () => { available: boolean }; } export interface TokenizerDepsRuntimeOptions { @@ -182,6 +184,8 @@ function getCachedFrequencyRank( export function createTokenizerDepsRuntime( options: TokenizerDepsRuntimeOptions, ): TokenizerServiceDeps { + const checkedMecabTokenizers = new WeakSet(); + return { getYomitanExt: options.getYomitanExt, getYomitanParserWindow: options.getYomitanParserWindow, @@ -203,6 +207,19 @@ export function createTokenizerDepsRuntime( if (!mecabTokenizer) { return null; } + + if ( + typeof mecabTokenizer.checkAvailability === 'function' && + typeof mecabTokenizer.getStatus === 'function' && + !checkedMecabTokenizers.has(mecabTokenizer as object) + ) { + const status = mecabTokenizer.getStatus(); + if (!status.available) { + await mecabTokenizer.checkAvailability(); + } + checkedMecabTokenizers.add(mecabTokenizer as object); + } + const rawTokens = await mecabTokenizer.tokenize(text); if (!rawTokens || rawTokens.length === 0) { return null; @@ -688,10 +705,42 @@ function pickClosestMecabPos1(token: MergedToken, mecabTokens: MergedToken[]): s const tokenStart = token.startPos ?? 0; const tokenEnd = token.endPos ?? tokenStart + token.surface.length; + let bestSurfaceMatchPos1: string | undefined; + let bestSurfaceMatchDistance = Number.MAX_SAFE_INTEGER; + let bestSurfaceMatchEndDistance = Number.MAX_SAFE_INTEGER; + + for (const mecabToken of mecabTokens) { + if (!mecabToken.pos1) { + continue; + } + + if (mecabToken.surface !== token.surface) { + continue; + } + + const mecabStart = mecabToken.startPos ?? 0; + const mecabEnd = mecabToken.endPos ?? mecabStart + mecabToken.surface.length; + const startDistance = Math.abs(mecabStart - tokenStart); + const endDistance = Math.abs(mecabEnd - tokenEnd); + + if ( + startDistance < bestSurfaceMatchDistance || + (startDistance === bestSurfaceMatchDistance && endDistance < bestSurfaceMatchEndDistance) + ) { + bestSurfaceMatchDistance = startDistance; + bestSurfaceMatchEndDistance = endDistance; + bestSurfaceMatchPos1 = mecabToken.pos1; + } + } + + if (bestSurfaceMatchPos1) { + return bestSurfaceMatchPos1; + } let bestPos1: string | undefined; let bestOverlap = 0; let bestSpan = 0; + let bestStartDistance = Number.MAX_SAFE_INTEGER; let bestStart = Number.MAX_SAFE_INTEGER; for (const mecabToken of mecabTokens) { @@ -712,10 +761,13 @@ function pickClosestMecabPos1(token: MergedToken, mecabTokens: MergedToken[]): s if ( overlap > bestOverlap || (overlap === bestOverlap && - (span > bestSpan || (span === bestSpan && mecabStart < bestStart))) + (Math.abs(mecabStart - tokenStart) < bestStartDistance || + (Math.abs(mecabStart - tokenStart) === bestStartDistance && + (span > bestSpan || (span === bestSpan && mecabStart < bestStart))))) ) { bestOverlap = overlap; bestSpan = span; + bestStartDistance = Math.abs(mecabStart - tokenStart); bestStart = mecabStart; bestPos1 = mecabToken.pos1; } @@ -724,6 +776,63 @@ function pickClosestMecabPos1(token: MergedToken, mecabTokens: MergedToken[]): s return bestOverlap > 0 ? bestPos1 : undefined; } +function fillMissingPos1BySurfaceSequence( + tokens: MergedToken[], + mecabTokens: MergedToken[], +): MergedToken[] { + const indexedMecabTokens = mecabTokens + .map((token, index) => ({ token, index })) + .filter(({ token }) => token.pos1 && token.surface.trim().length > 0); + + if (indexedMecabTokens.length === 0) { + return tokens; + } + + let cursor = 0; + return tokens.map((token) => { + if (token.pos1 && token.pos1.trim().length > 0) { + return token; + } + + const surface = token.surface.trim(); + if (!surface) { + return token; + } + + let best: { pos1: string; index: number } | null = null; + for (const candidate of indexedMecabTokens) { + if (candidate.token.surface !== surface) { + continue; + } + if (candidate.index < cursor) { + continue; + } + best = { pos1: candidate.token.pos1 as string, index: candidate.index }; + break; + } + + if (!best) { + for (const candidate of indexedMecabTokens) { + if (candidate.token.surface !== surface) { + continue; + } + best = { pos1: candidate.token.pos1 as string, index: candidate.index }; + break; + } + } + + if (!best) { + return token; + } + + cursor = best.index + 1; + return { + ...token, + pos1: best.pos1, + }; + }); +} + async function enrichYomitanPos1( tokens: MergedToken[], deps: TokenizerServiceDeps, @@ -756,7 +865,7 @@ async function enrichYomitanPos1( return tokens; } - return tokens.map((token) => { + const overlapEnriched = tokens.map((token) => { if (token.pos1) { return token; } @@ -771,6 +880,8 @@ async function enrichYomitanPos1( pos1, }; }); + + return fillMissingPos1BySurfaceSequence(overlapEnriched, mecabTokens); } async function ensureYomitanParserWindow(deps: TokenizerServiceDeps): Promise { diff --git a/src/token-merger.ts b/src/token-merger.ts index a54b029..54a89a3 100644 --- a/src/token-merger.ts +++ b/src/token-merger.ts @@ -241,6 +241,7 @@ export function mergeTokens( } const SENTENCE_BOUNDARY_SURFACES = new Set(['。', '?', '!', '?', '!', '…', '\u2026']); +const N_PLUS_ONE_IGNORED_POS1 = new Set(['助詞', '助動詞', '記号', '補助記号']); export function isNPlusOneCandidateToken(token: MergedToken): boolean { if (token.isKnown) { @@ -267,6 +268,10 @@ export function isNPlusOneCandidateToken(token: MergedToken): boolean { return false; } + if (token.pos1 && N_PLUS_ONE_IGNORED_POS1.has(token.pos1)) { + return false; + } + if (token.surface.trim().length === 0) { return false; }