import test from 'node:test'; import assert from 'node:assert/strict'; import { PartOfSpeech } from '../../types'; import { createTokenizerDepsRuntime, TokenizerServiceDeps, tokenizeSubtitle } from './tokenizer'; function makeDeps(overrides: Partial = {}): TokenizerServiceDeps { return { getYomitanExt: () => null, getYomitanParserWindow: () => null, setYomitanParserWindow: () => {}, getYomitanParserReadyPromise: () => null, setYomitanParserReadyPromise: () => {}, getYomitanParserInitPromise: () => null, setYomitanParserInitPromise: () => {}, isKnownWord: () => false, getKnownWordMatchMode: () => 'headword', getJlptLevel: () => null, tokenizeWithMecab: async () => null, ...overrides, }; } interface YomitanTokenInput { surface: string; reading?: string; headword?: string; isNameMatch?: boolean; } function makeDepsFromYomitanTokens( tokens: YomitanTokenInput[], overrides: Partial = {}, ): TokenizerServiceDeps { let cursor = 0; return makeDeps({ getYomitanExt: () => ({ id: 'dummy-ext' }) as any, getYomitanParserWindow: () => ({ isDestroyed: () => false, webContents: { executeJavaScript: async (script: string) => { if (script.includes('getTermFrequencies')) { return []; } cursor = 0; return tokens.map((token) => { const startPos = cursor; const endPos = startPos + token.surface.length; cursor = endPos; return { surface: token.surface, reading: token.reading ?? token.surface, headword: token.headword ?? token.surface, startPos, endPos, isNameMatch: token.isNameMatch ?? false, }; }); }, }, }) as unknown as Electron.BrowserWindow, ...overrides, }); } function createDeferred() { let resolve: ((value: T) => void) | null = null; const promise = new Promise((innerResolve) => { resolve = innerResolve; }); return { promise, resolve: (value: T) => { resolve?.(value); }, }; } test('tokenizeSubtitle assigns JLPT level to parsed Yomitan tokens', async () => { const result = await tokenizeSubtitle( '猫です', makeDeps({ getYomitanExt: () => ({ id: 'dummy-ext' }) as any, getYomitanParserWindow: () => ({ isDestroyed: () => false, webContents: { executeJavaScript: async () => [ { source: 'scanning-parser', index: 0, content: [ [ { text: '猫', reading: 'ねこ', headwords: [[{ term: '猫' }]], }, { text: 'です', reading: 'です', headwords: [[{ term: 'です' }]], }, ], ], }, ], }, }) as unknown as Electron.BrowserWindow, tokenizeWithMecab: async () => null, getJlptLevel: (text) => (text === '猫' ? 'N5' : null), }), ); assert.equal(result.tokens?.length, 1); assert.equal(result.tokens?.[0]?.jlptLevel, 'N5'); }); test('tokenizeSubtitle preserves Yomitan name-match metadata on tokens', async () => { const result = await tokenizeSubtitle( 'アクアです', makeDepsFromYomitanTokens([ { surface: 'アクア', reading: 'あくあ', headword: 'アクア', isNameMatch: true }, { surface: 'です', reading: 'です', headword: 'です' }, ]), ); assert.equal(result.tokens?.length, 2); assert.equal((result.tokens?.[0] as { isNameMatch?: boolean } | undefined)?.isNameMatch, true); assert.equal((result.tokens?.[1] as { isNameMatch?: boolean } | undefined)?.isNameMatch, false); }); test('tokenizeSubtitle caches JLPT lookups across repeated tokens', async () => { let lookupCalls = 0; const result = await tokenizeSubtitle( '猫猫', makeDepsFromYomitanTokens( [ { surface: '猫', reading: 'ねこ', headword: '猫' }, { surface: '猫', reading: 'ねこ', headword: '猫' }, ], { getJlptLevel: (text) => { lookupCalls += 1; return text === '猫' ? 'N5' : null; }, }, ), ); assert.equal(result.tokens?.length, 2); assert.equal(lookupCalls, 1); assert.equal(result.tokens?.[0]?.jlptLevel, 'N5'); assert.equal(result.tokens?.[1]?.jlptLevel, 'N5'); }); test('tokenizeSubtitle leaves JLPT unset for non-matching tokens', async () => { const result = await tokenizeSubtitle( '猫', makeDepsFromYomitanTokens([{ surface: '猫', reading: 'ねこ', headword: '猫' }], { getJlptLevel: () => null, }), ); assert.equal(result.tokens?.length, 1); assert.equal(result.tokens?.[0]?.jlptLevel, undefined); }); test('tokenizeSubtitle skips JLPT lookups when disabled', async () => { let lookupCalls = 0; const result = await tokenizeSubtitle( '猫です', makeDepsFromYomitanTokens([{ surface: '猫', reading: 'ねこ', headword: '猫' }], { getJlptLevel: () => { lookupCalls += 1; return 'N5'; }, getJlptEnabled: () => false, }), ); assert.equal(result.tokens?.length, 1); assert.equal(result.tokens?.[0]?.jlptLevel, undefined); assert.equal(lookupCalls, 0); }); test('tokenizeSubtitle applies frequency dictionary ranks', async () => { const result = await tokenizeSubtitle( '猫です', makeDepsFromYomitanTokens( [ { surface: '猫', reading: 'ねこ', headword: '猫' }, { surface: 'です', reading: 'です', headword: 'です' }, ], { getFrequencyDictionaryEnabled: () => true, getFrequencyRank: (text) => (text === '猫' ? 23 : 1200), }, ), ); assert.equal(result.tokens?.length, 2); assert.equal(result.tokens?.[0]?.frequencyRank, 23); assert.equal(result.tokens?.[1]?.frequencyRank, 1200); }); test('tokenizeSubtitle uses left-to-right yomitan scanning to keep full katakana name tokens', async () => { const result = await tokenizeSubtitle( 'カズマ 魔王軍', makeDeps({ getYomitanExt: () => ({ id: 'dummy-ext' }) as any, getYomitanParserWindow: () => ({ isDestroyed: () => false, webContents: { executeJavaScript: async (script: string) => { if (script.includes('getTermFrequencies')) { return []; } return [ { surface: 'カズマ', reading: 'かずま', headword: 'カズマ', startPos: 0, endPos: 3, }, { surface: '魔王軍', reading: 'まおうぐん', headword: '魔王軍', startPos: 4, endPos: 7, }, ]; }, }, }) as unknown as Electron.BrowserWindow, }), ); assert.deepEqual( result.tokens?.map((token) => ({ surface: token.surface, reading: token.reading, headword: token.headword, startPos: token.startPos, endPos: token.endPos, })), [ { surface: 'カズマ', reading: 'かずま', headword: 'カズマ', startPos: 0, endPos: 3, }, { surface: '魔王軍', reading: 'まおうぐん', headword: '魔王軍', startPos: 4, endPos: 7, }, ], ); }); test('tokenizeSubtitle loads frequency ranks from Yomitan installed dictionaries', async () => { const result = await tokenizeSubtitle( '猫', makeDeps({ getFrequencyDictionaryEnabled: () => true, getYomitanExt: () => ({ id: 'dummy-ext' }) as any, getYomitanParserWindow: () => ({ isDestroyed: () => false, webContents: { executeJavaScript: async (script: string) => { if (script.includes('getTermFrequencies')) { return [ { term: '猫', reading: 'ねこ', dictionary: 'freq-dict', frequency: 77, displayValue: '77', displayValueParsed: true, }, ]; } return [ { source: 'scanning-parser', index: 0, content: [ [ { text: '猫', reading: 'ねこ', headwords: [[{ term: '猫' }]], }, ], ], }, ]; }, }, }) as unknown as Electron.BrowserWindow, }), ); assert.equal(result.tokens?.length, 1); assert.equal(result.tokens?.[0]?.frequencyRank, 77); }); test('tokenizeSubtitle starts Yomitan frequency lookup and MeCab enrichment in parallel', async () => { const frequencyDeferred = createDeferred(); const mecabDeferred = createDeferred(); let frequencyRequested = false; let mecabRequested = false; const pendingResult = tokenizeSubtitle( '猫', makeDeps({ getFrequencyDictionaryEnabled: () => true, getYomitanExt: () => ({ id: 'dummy-ext' }) as any, getYomitanParserWindow: () => ({ isDestroyed: () => false, webContents: { executeJavaScript: async (script: string) => { if (script.includes('getTermFrequencies')) { frequencyRequested = true; return await frequencyDeferred.promise; } return [ { source: 'scanning-parser', index: 0, content: [ [ { text: '猫', reading: 'ねこ', headwords: [[{ term: '猫' }]], }, ], ], }, ]; }, }, }) as unknown as Electron.BrowserWindow, tokenizeWithMecab: async () => { mecabRequested = true; return await mecabDeferred.promise; }, }), ); await new Promise((resolve) => setTimeout(resolve, 0)); assert.equal(frequencyRequested, true); assert.equal(mecabRequested, true); frequencyDeferred.resolve([ { term: '猫', reading: 'ねこ', dictionary: 'freq-dict', frequency: 77, displayValue: '77', displayValueParsed: true, }, ]); mecabDeferred.resolve(null); const result = await pendingResult; assert.equal(result.tokens?.[0]?.frequencyRank, 77); }); test('tokenizeSubtitle can signal tokenization-ready before enrichment completes', async () => { const frequencyDeferred = createDeferred(); const mecabDeferred = createDeferred(); let tokenizationReadyText: string | null = null; const pendingResult = tokenizeSubtitle( '猫', makeDeps({ onTokenizationReady: (text) => { tokenizationReadyText = text; }, getFrequencyDictionaryEnabled: () => true, getYomitanExt: () => ({ id: 'dummy-ext' }) as any, getYomitanParserWindow: () => ({ isDestroyed: () => false, webContents: { executeJavaScript: async (script: string) => { if (script.includes('getTermFrequencies')) { return await frequencyDeferred.promise; } return [ { source: 'scanning-parser', index: 0, content: [ [ { text: '猫', reading: 'ねこ', headwords: [[{ term: '猫' }]], }, ], ], }, ]; }, }, }) as unknown as Electron.BrowserWindow, tokenizeWithMecab: async () => { return await mecabDeferred.promise; }, }), ); await new Promise((resolve) => setTimeout(resolve, 0)); assert.equal(tokenizationReadyText, '猫'); frequencyDeferred.resolve([]); mecabDeferred.resolve(null); await pendingResult; }); test('tokenizeSubtitle appends trailing kana to merged Yomitan readings when headword equals surface', async () => { const result = await tokenizeSubtitle( '断じて見ていない', makeDeps({ getYomitanExt: () => ({ id: 'dummy-ext' }) as any, getYomitanParserWindow: () => ({ isDestroyed: () => false, webContents: { executeJavaScript: async () => [ { source: 'scanning-parser', index: 0, content: [ [ { text: '断', reading: 'だん', headwords: [[{ term: '断じて' }]] }, { text: 'じて', reading: '', headwords: [[{ term: 'じて' }]] }, ], [ { text: '見', reading: 'み', headwords: [[{ term: '見る' }]] }, { text: 'ていない', reading: '', headwords: [[{ term: 'ていない' }]] }, ], ], }, ], }, }) as unknown as Electron.BrowserWindow, }), ); assert.equal(result.tokens?.length, 2); assert.equal(result.tokens?.[0]?.surface, '断じて'); assert.equal(result.tokens?.[0]?.reading, 'だんじて'); assert.equal(result.tokens?.[1]?.surface, '見ていない'); assert.equal(result.tokens?.[1]?.reading, 'み'); }); test('tokenizeSubtitle queries headword frequencies with token reading for disambiguation', async () => { const result = await tokenizeSubtitle( '鍛えた', makeDeps({ getFrequencyDictionaryEnabled: () => true, getYomitanExt: () => ({ id: 'dummy-ext' }) as any, getYomitanParserWindow: () => ({ isDestroyed: () => false, webContents: { executeJavaScript: async (script: string) => { if (script.includes('getTermFrequencies')) { assert.equal( script.includes('"term":"鍛える","reading":null'), false, 'should not eagerly include term-only fallback pair when reading lookup is present', ); if (!script.includes('"term":"鍛える","reading":"きた"')) { return []; } return [ { term: '鍛える', reading: 'きたえる', dictionary: 'freq-dict', frequency: 46961, displayValue: '2847,46961', displayValueParsed: true, }, ]; } return [ { source: 'scanning-parser', index: 0, content: [ [ { text: '鍛えた', reading: 'きた', headwords: [[{ term: '鍛える' }]], }, ], ], }, ]; }, }, }) as unknown as Electron.BrowserWindow, }), ); assert.equal(result.tokens?.length, 1); assert.equal(result.tokens?.[0]?.headword, '鍛える'); assert.equal(result.tokens?.[0]?.reading, 'きた'); assert.equal(result.tokens?.[0]?.frequencyRank, 2847); }); test('tokenizeSubtitle falls back to term-only Yomitan frequency lookup when reading is noisy', async () => { const result = await tokenizeSubtitle( '断じて', makeDeps({ getFrequencyDictionaryEnabled: () => true, getYomitanExt: () => ({ id: 'dummy-ext' }) as any, getYomitanParserWindow: () => ({ isDestroyed: () => false, webContents: { executeJavaScript: async (script: string) => { if (script.includes('getTermFrequencies')) { if (!script.includes('"term":"断じて","reading":null')) { return []; } return [ { term: '断じて', reading: null, dictionary: 'freq-dict', frequency: 7082, displayValue: '7082', displayValueParsed: true, }, ]; } return [ { source: 'scanning-parser', index: 0, content: [ [ { text: '断じて', reading: 'だん', headwords: [[{ term: '断じて' }]], }, ], ], }, ]; }, }, }) as unknown as Electron.BrowserWindow, }), ); assert.equal(result.tokens?.length, 1); assert.equal(result.tokens?.[0]?.frequencyRank, 7082); }); test('tokenizeSubtitle avoids headword term-only fallback rank when reading-specific frequency exists', async () => { const result = await tokenizeSubtitle( '無人', makeDeps({ getFrequencyDictionaryEnabled: () => true, getYomitanExt: () => ({ id: 'dummy-ext' }) as any, getYomitanParserWindow: () => ({ isDestroyed: () => false, webContents: { executeJavaScript: async (script: string) => { if (script.includes('getTermFrequencies')) { if (!script.includes('"term":"無人","reading":"むじん"')) { return []; } return [ { term: '無人', reading: null, dictionary: 'CC100', dictionaryPriority: 0, frequency: 157632, displayValue: null, displayValueParsed: false, }, { term: '無人', reading: 'むじん', dictionary: 'CC100', dictionaryPriority: 0, frequency: 7141, displayValue: null, displayValueParsed: false, }, ]; } return [ { source: 'scanning-parser', index: 0, content: [ [ { text: '無人', reading: 'むじん', headwords: [[{ term: '無人' }]], }, ], ], }, ]; }, }, }) as unknown as Electron.BrowserWindow, }), ); assert.equal(result.tokens?.length, 1); assert.equal(result.tokens?.[0]?.frequencyRank, 7141); }); test('tokenizeSubtitle prefers Yomitan frequency from highest-priority dictionary', async () => { const result = await tokenizeSubtitle( '猫', makeDeps({ getFrequencyDictionaryEnabled: () => true, getYomitanExt: () => ({ id: 'dummy-ext' }) as any, getYomitanParserWindow: () => ({ isDestroyed: () => false, webContents: { executeJavaScript: async (script: string) => { if (script.includes('getTermFrequencies')) { return [ { term: '猫', reading: 'ねこ', dictionary: 'low-priority', dictionaryPriority: 2, frequency: 5, displayValue: '5', displayValueParsed: true, }, { term: '猫', reading: 'ねこ', dictionary: 'high-priority', dictionaryPriority: 0, frequency: 100, displayValue: '100', displayValueParsed: true, }, ]; } return [ { source: 'scanning-parser', index: 0, content: [ [ { text: '猫', reading: 'ねこ', headwords: [[{ term: '猫' }]], }, ], ], }, ]; }, }, }) as unknown as Electron.BrowserWindow, }), ); assert.equal(result.tokens?.length, 1); assert.equal(result.tokens?.[0]?.frequencyRank, 100); }); test('tokenizeSubtitle ignores occurrence-based Yomitan frequencies for inflected terms', async () => { const result = await tokenizeSubtitle( '潜み', makeDeps({ getFrequencyDictionaryEnabled: () => true, getYomitanExt: () => ({ id: 'dummy-ext' }) as any, getYomitanParserWindow: () => ({ isDestroyed: () => false, webContents: { executeJavaScript: async (script: string) => { if (script.includes('getTermFrequencies')) { return [ { term: '潜む', reading: 'ひそ', dictionary: 'CC100', frequency: 118121, displayValue: null, displayValueParsed: false, }, ]; } if (script.includes('optionsGetFull')) { return { profileCurrent: 0, profileIndex: 0, scanLength: 40, dictionaries: ['CC100'], dictionaryPriorityByName: { CC100: 0 }, dictionaryFrequencyModeByName: { CC100: 'occurrence-based' }, profiles: [ { options: { scanning: { length: 40 }, dictionaries: [{ name: 'CC100', enabled: true, id: 0 }], }, }, ], }; } return [ { surface: '潜み', reading: 'ひそ', headword: '潜む', startPos: 0, endPos: 2, }, ]; }, }, }) as unknown as Electron.BrowserWindow, }), ); assert.equal(result.tokens?.length, 1); assert.equal(result.tokens?.[0]?.frequencyRank, undefined); }); test('tokenizeSubtitle falls back to raw term-only Yomitan rank when no scan-derived rank exists', async () => { const result = await tokenizeSubtitle( '潜み', makeDeps({ getFrequencyDictionaryEnabled: () => true, getYomitanExt: () => ({ id: 'dummy-ext' }) as any, getYomitanParserWindow: () => ({ isDestroyed: () => false, webContents: { executeJavaScript: async (script: string) => { if (script.includes('getTermFrequencies')) { return [ { term: '潜む', reading: 'ひそ', hasReading: false, dictionary: 'CC100', frequency: 118121, displayValue: null, displayValueParsed: false, }, ]; } if (script.includes('optionsGetFull')) { return { profileCurrent: 0, profileIndex: 0, scanLength: 40, dictionaries: ['CC100'], dictionaryPriorityByName: { CC100: 0 }, dictionaryFrequencyModeByName: { CC100: 'rank-based' }, profiles: [ { options: { scanning: { length: 40 }, dictionaries: [{ name: 'CC100', enabled: true, id: 0 }], }, }, ], }; } return [ { surface: '潜み', reading: 'ひそ', headword: '潜む', startPos: 0, endPos: 2, }, ]; }, }, }) as unknown as Electron.BrowserWindow, }), ); assert.equal(result.tokens?.length, 1); assert.equal(result.tokens?.[0]?.frequencyRank, 118121); }); test('tokenizeSubtitle keeps parsed display rank for term-only inflected headword fallback', async () => { const result = await tokenizeSubtitle( '潜み', makeDeps({ getFrequencyDictionaryEnabled: () => true, getYomitanExt: () => ({ id: 'dummy-ext' }) as any, getYomitanParserWindow: () => ({ isDestroyed: () => false, webContents: { executeJavaScript: async (script: string) => { if (script.includes('getTermFrequencies')) { return [ { term: '潜む', reading: 'ひそ', hasReading: false, dictionary: 'CC100', frequency: 118121, displayValue: '118,121', displayValueParsed: false, }, ]; } if (script.includes('optionsGetFull')) { return { profileCurrent: 0, profileIndex: 0, scanLength: 40, dictionaries: ['CC100'], dictionaryPriorityByName: { CC100: 0 }, dictionaryFrequencyModeByName: { CC100: 'rank-based' }, profiles: [ { options: { scanning: { length: 40 }, dictionaries: [{ name: 'CC100', enabled: true, id: 0 }], }, }, ], }; } return [ { surface: '潜み', reading: 'ひそ', headword: '潜む', startPos: 0, endPos: 2, }, ]; }, }, }) as unknown as Electron.BrowserWindow, }), ); assert.equal(result.tokens?.length, 1); assert.equal(result.tokens?.[0]?.frequencyRank, 118); }); test('tokenizeSubtitle preserves scan-derived rank over lower-priority Yomitan fallback', async () => { const result = await tokenizeSubtitle( '潜み', makeDeps({ getFrequencyDictionaryEnabled: () => true, getYomitanExt: () => ({ id: 'dummy-ext' }) as any, getYomitanParserWindow: () => ({ isDestroyed: () => false, webContents: { executeJavaScript: async (script: string) => { if (script.includes('getTermFrequencies')) { return [ { term: '潜む', reading: 'ひそ', hasReading: false, dictionary: 'CC100', dictionaryPriority: 2, frequency: 118121, displayValue: null, displayValueParsed: false, }, ]; } return [ { surface: '潜み', reading: 'ひそむ', headword: '潜む', startPos: 0, endPos: 2, frequencyRank: 4073, }, ]; }, }, }) as unknown as Electron.BrowserWindow, }), ); assert.equal(result.tokens?.length, 1); assert.equal(result.tokens?.[0]?.frequencyRank, 4073); }); test('tokenizeSubtitle uses only selected Yomitan headword for frequency lookup', async () => { const result = await tokenizeSubtitle( '猫です', makeDeps({ getFrequencyDictionaryEnabled: () => true, getYomitanExt: () => ({ id: 'dummy-ext' }) as any, getYomitanParserWindow: () => ({ isDestroyed: () => false, webContents: { executeJavaScript: async () => [ { source: 'scanning-parser', index: 0, content: [ [ { text: '猫です', reading: 'ねこです', headwords: [[{ term: '猫です' }], [{ term: '猫' }]], }, ], ], }, ], }, }) as unknown as Electron.BrowserWindow, getFrequencyRank: (text) => (text === '猫' ? 40 : text === '猫です' ? 1200 : null), }), ); assert.equal(result.tokens?.length, 1); assert.equal(result.tokens?.[0]?.frequencyRank, 1200); }); test('tokenizeSubtitle keeps furigana-split Yomitan segments as one token', async () => { const result = await tokenizeSubtitle( '友達と話した', makeDeps({ getFrequencyDictionaryEnabled: () => true, getYomitanExt: () => ({ id: 'dummy-ext' }) as any, getYomitanParserWindow: () => ({ isDestroyed: () => false, webContents: { executeJavaScript: async () => [ { source: 'scanning-parser', index: 0, content: [ [ { text: '友', reading: 'とも', headwords: [[{ term: '友達' }]], }, { text: '達', reading: 'だち', }, ], [ { text: 'と', reading: 'と', headwords: [[{ term: 'と' }]], }, ], [ { text: '話した', reading: 'はなした', headwords: [[{ term: '話す' }]], }, ], ], }, ], }, }) as unknown as Electron.BrowserWindow, getFrequencyRank: (text) => (text === '友達' ? 22 : text === '話す' ? 90 : null), }), ); assert.equal(result.tokens?.length, 3); assert.equal(result.tokens?.[0]?.surface, '友達'); assert.equal(result.tokens?.[0]?.reading, 'ともだち'); assert.equal(result.tokens?.[0]?.headword, '友達'); assert.equal(result.tokens?.[0]?.frequencyRank, 22); assert.equal(result.tokens?.[1]?.surface, 'と'); assert.equal(result.tokens?.[1]?.frequencyRank, undefined); assert.equal(result.tokens?.[2]?.surface, '話した'); assert.equal(result.tokens?.[2]?.frequencyRank, 90); }); test('tokenizeSubtitle prefers exact headword frequency over surface/reading when available', async () => { const result = await tokenizeSubtitle( '猫です', makeDeps({ getFrequencyDictionaryEnabled: () => true, getYomitanExt: () => ({ id: 'dummy-ext' }) as any, getYomitanParserWindow: () => ({ isDestroyed: () => false, webContents: { executeJavaScript: async () => [ { source: 'scanning-parser', index: 0, content: [ [ { text: '猫', reading: 'ねこ', headwords: [[{ term: 'ネコ' }]], }, ], ], }, ], }, }) as unknown as Electron.BrowserWindow, getFrequencyRank: (text) => (text === '猫' ? 1200 : text === 'ネコ' ? 8 : null), }), ); assert.equal(result.tokens?.length, 1); assert.equal(result.tokens?.[0]?.frequencyRank, 8); }); test('tokenizeSubtitle falls back to exact surface frequency when merged headword lookup misses', async () => { const frequencyScripts: string[] = []; const result = await tokenizeSubtitle( '陰に', makeDeps({ getFrequencyDictionaryEnabled: () => true, getYomitanExt: () => ({ id: 'dummy-ext' }) as any, getYomitanParserWindow: () => ({ isDestroyed: () => false, webContents: { executeJavaScript: async (script: string) => { if (script.includes('getTermFrequencies')) { frequencyScripts.push(script); return script.includes('"term":"陰に","reading":"いんに"') ? [ { term: '陰に', reading: 'いんに', dictionary: 'freq-dict', frequency: 5702, displayValue: '5702', displayValueParsed: true, }, ] : []; } return [ { source: 'scanning-parser', index: 0, content: [ [ { text: '陰に', reading: 'いんに', headwords: [[{ term: '陰' }]], }, ], ], }, ]; }, }, }) as unknown as Electron.BrowserWindow, }), ); assert.equal(result.tokens?.length, 1); assert.equal(result.tokens?.[0]?.surface, '陰に'); assert.equal(result.tokens?.[0]?.headword, '陰'); assert.equal(result.tokens?.[0]?.frequencyRank, 5702); assert.equal( frequencyScripts.some((script) => script.includes('"term":"陰","reading":"いんに"')), true, ); assert.equal( frequencyScripts.some((script) => script.includes('"term":"陰に","reading":"いんに"')), true, ); }); test('tokenizeSubtitle keeps no frequency when only reading matches and headword misses', async () => { const result = await tokenizeSubtitle( '猫です', makeDeps({ getFrequencyDictionaryEnabled: () => true, getYomitanExt: () => ({ id: 'dummy-ext' }) as any, getYomitanParserWindow: () => ({ isDestroyed: () => false, webContents: { executeJavaScript: async () => [ { source: 'scanning-parser', index: 0, content: [ [ { text: '猫', reading: 'ねこ', headwords: [[{ term: '猫です' }]], }, ], ], }, ], }, }) as unknown as Electron.BrowserWindow, getFrequencyRank: (text) => (text === 'ねこ' ? 77 : null), }), ); assert.equal(result.tokens?.length, 1); assert.equal(result.tokens?.[0]?.frequencyRank, undefined); }); test('tokenizeSubtitle ignores invalid frequency rank on selected headword', async () => { const result = await tokenizeSubtitle( '猫です', makeDeps({ getFrequencyDictionaryEnabled: () => true, getYomitanExt: () => ({ id: 'dummy-ext' }) as any, getYomitanParserWindow: () => ({ isDestroyed: () => false, webContents: { executeJavaScript: async () => [ { source: 'scanning-parser', index: 0, content: [ [ { text: '猫です', reading: 'ねこです', headwords: [[{ term: '猫' }], [{ term: '猫です' }]], }, ], ], }, ], }, }) as unknown as Electron.BrowserWindow, getFrequencyRank: (text) => (text === '猫' ? Number.NaN : text === '猫です' ? 500 : null), }), ); assert.equal(result.tokens?.length, 1); assert.equal(result.tokens?.[0]?.frequencyRank, undefined); }); test('tokenizeSubtitle handles real-word frequency candidates and prefers most frequent term', async () => { const result = await tokenizeSubtitle( '昨日', makeDeps({ getFrequencyDictionaryEnabled: () => true, getYomitanExt: () => ({ id: 'dummy-ext' }) as any, getYomitanParserWindow: () => ({ isDestroyed: () => false, webContents: { executeJavaScript: async () => [ { source: 'scanning-parser', index: 0, content: [ [ { text: '昨日', reading: 'きのう', headwords: [[{ term: '昨日' }], [{ term: 'きのう' }]], }, ], ], }, ], }, }) as unknown as Electron.BrowserWindow, getFrequencyRank: (text) => (text === 'きのう' ? 120 : text === '昨日' ? 40 : null), }), ); assert.equal(result.tokens?.length, 1); assert.equal(result.tokens?.[0]?.frequencyRank, 40); }); test('tokenizeSubtitle ignores candidates with no dictionary rank when higher-frequency candidate exists', async () => { const result = await tokenizeSubtitle( '猫です', makeDeps({ getFrequencyDictionaryEnabled: () => true, getYomitanExt: () => ({ id: 'dummy-ext' }) as any, getYomitanParserWindow: () => ({ isDestroyed: () => false, webContents: { executeJavaScript: async () => [ { source: 'scanning-parser', index: 0, content: [ [ { text: '猫', reading: 'ねこ', headwords: [ [{ term: '猫' }], [{ term: '猫です' }], [{ term: 'unknown-term' }], ], }, ], ], }, ], }, }) as unknown as Electron.BrowserWindow, getFrequencyRank: (text) => text === 'unknown-term' ? -1 : text === '猫' ? 88 : text === '猫です' ? 9000 : null, }), ); assert.equal(result.tokens?.length, 1); assert.equal(result.tokens?.[0]?.frequencyRank, 88); }); test('tokenizeSubtitle ignores frequency lookup failures', async () => { const result = await tokenizeSubtitle( '猫', makeDeps({ getFrequencyDictionaryEnabled: () => true, tokenizeWithMecab: async () => [ { headword: '猫', surface: '猫', reading: 'ネコ', startPos: 0, endPos: 1, partOfSpeech: PartOfSpeech.noun, isMerged: false, isKnown: false, isNPlusOneTarget: false, }, ], getFrequencyRank: () => { throw new Error('frequency lookup unavailable'); }, }), ); assert.equal(result.tokens?.[0]?.frequencyRank, undefined); }); test('tokenizeSubtitle keeps standalone particle token hoverable while clearing annotation metadata', async () => { const result = await tokenizeSubtitle( 'は', makeDeps({ getFrequencyDictionaryEnabled: () => true, getYomitanExt: () => ({ id: 'dummy-ext' }) as any, getYomitanParserWindow: () => ({ isDestroyed: () => false, webContents: { executeJavaScript: async () => [ { source: 'scanning-parser', index: 0, content: [ [ { text: 'は', reading: 'は', headwords: [[{ term: 'は' }]], }, ], ], }, ], }, }) as unknown as Electron.BrowserWindow, tokenizeWithMecab: async () => [ { headword: 'は', surface: 'は', reading: 'ハ', startPos: 0, endPos: 1, partOfSpeech: PartOfSpeech.particle, pos1: '助詞', isMerged: false, isKnown: false, isNPlusOneTarget: false, }, ], getFrequencyRank: (text) => (text === 'は' ? 10 : null), }), ); assert.equal(result.text, 'は'); assert.deepEqual( result.tokens?.map((token) => ({ surface: token.surface, reading: token.reading, headword: token.headword, pos1: token.pos1, isKnown: token.isKnown, isNPlusOneTarget: token.isNPlusOneTarget, isNameMatch: token.isNameMatch, jlptLevel: token.jlptLevel, frequencyRank: token.frequencyRank, })), [ { surface: 'は', reading: 'は', headword: 'は', pos1: '助詞', isKnown: false, isNPlusOneTarget: false, isNameMatch: false, jlptLevel: undefined, frequencyRank: undefined, }, ], ); }); test('tokenizeSubtitle keeps frequency rank when mecab tags classify token as content-bearing', async () => { const result = await tokenizeSubtitle( 'ふふ', makeDepsFromYomitanTokens([{ surface: 'ふふ', reading: '', headword: 'ふふ' }], { getFrequencyDictionaryEnabled: () => true, getFrequencyRank: (text) => (text === 'ふふ' ? 3014 : null), tokenizeWithMecab: async () => [ { headword: 'ふふ', surface: 'ふふ', reading: 'フフ', startPos: 0, endPos: 2, partOfSpeech: PartOfSpeech.verb, pos1: '動詞', pos2: '自立', isMerged: false, isKnown: false, isNPlusOneTarget: false, }, ], }), ); assert.equal(result.tokens?.length, 1); assert.equal(result.tokens?.[0]?.frequencyRank, 3014); }); test('tokenizeSubtitle ignores invalid frequency ranks', async () => { const result = await tokenizeSubtitle( '猫', makeDepsFromYomitanTokens( [ { surface: '猫', reading: 'ねこ', headword: '猫' }, { surface: 'です', reading: 'です', headword: 'です' }, ], { getFrequencyDictionaryEnabled: () => true, getFrequencyRank: (text) => { if (text === '猫') return Number.NaN; if (text === 'です') return -1; return 100; }, }, ), ); assert.equal(result.tokens?.length, 2); assert.equal(result.tokens?.[0]?.frequencyRank, undefined); assert.equal(result.tokens?.[1]?.frequencyRank, undefined); }); test('tokenizeSubtitle skips frequency lookups when disabled', async () => { let frequencyCalls = 0; const result = await tokenizeSubtitle( '猫', makeDepsFromYomitanTokens([{ surface: '猫', reading: 'ねこ', headword: '猫' }], { getFrequencyDictionaryEnabled: () => false, getFrequencyRank: () => { frequencyCalls += 1; return 10; }, }), ); assert.equal(result.tokens?.length, 1); assert.equal(result.tokens?.[0]?.frequencyRank, undefined); assert.equal(frequencyCalls, 0); }); test('tokenizeSubtitle skips JLPT level for excluded demonstratives', async () => { const result = await tokenizeSubtitle( 'この', makeDeps({ getYomitanExt: () => ({ id: 'dummy-ext' }) as any, getYomitanParserWindow: () => ({ isDestroyed: () => false, webContents: { executeJavaScript: async () => [ { source: 'scanning-parser', index: 0, content: [ [ { text: 'この', reading: 'この', headwords: [[{ term: 'この' }]], }, ], ], }, ], }, }) as unknown as Electron.BrowserWindow, tokenizeWithMecab: async () => null, getJlptLevel: (text) => (text === 'この' ? 'N5' : null), }), ); assert.equal(result.tokens?.length, 1); assert.equal(result.tokens?.[0]?.jlptLevel, undefined); }); test('tokenizeSubtitle keeps repeated kana interjections tokenized while clearing annotation metadata', async () => { const result = await tokenizeSubtitle( 'ああ', makeDeps({ getYomitanExt: () => ({ id: 'dummy-ext' }) as any, getYomitanParserWindow: () => ({ isDestroyed: () => false, webContents: { executeJavaScript: async () => [ { source: 'scanning-parser', index: 0, content: [ [ { text: 'ああ', reading: 'ああ', headwords: [[{ term: 'ああ' }]], }, ], ], }, ], }, }) as unknown as Electron.BrowserWindow, tokenizeWithMecab: async () => null, getJlptLevel: (text) => (text === 'ああ' ? 'N5' : null), }), ); assert.equal(result.text, 'ああ'); assert.deepEqual( result.tokens?.map((token) => ({ surface: token.surface, headword: token.headword, reading: token.reading, jlptLevel: token.jlptLevel, frequencyRank: token.frequencyRank, isKnown: token.isKnown, isNPlusOneTarget: token.isNPlusOneTarget, })), [ { surface: 'ああ', headword: 'ああ', reading: 'ああ', jlptLevel: undefined, frequencyRank: undefined, isKnown: false, isNPlusOneTarget: false, }, ], ); }); test('tokenizeSubtitle assigns JLPT level to Yomitan tokens', async () => { const result = await tokenizeSubtitle( '猫です', makeDepsFromYomitanTokens([{ surface: '猫', reading: 'ねこ', headword: '猫' }], { getJlptLevel: (text) => (text === '猫' ? 'N4' : null), }), ); assert.equal(result.tokens?.length, 1); assert.equal(result.tokens?.[0]?.jlptLevel, 'N4'); }); test('tokenizeSubtitle can assign JLPT level to Yomitan particle token', async () => { const result = await tokenizeSubtitle( 'は', makeDepsFromYomitanTokens([{ surface: 'は', reading: 'は', headword: 'は' }], { getJlptLevel: (text) => (text === 'は' ? 'N5' : null), }), ); assert.equal(result.tokens?.length, 1); assert.equal(result.tokens?.[0]?.jlptLevel, 'N5'); }); test('tokenizeSubtitle returns null tokens for empty normalized text', async () => { const result = await tokenizeSubtitle(' \\n ', makeDeps()); assert.deepEqual(result, { text: ' \\n ', tokens: null }); }); test('tokenizeSubtitle normalizes newlines before Yomitan parse request', async () => { let parseInput = ''; const result = await tokenizeSubtitle( '猫\\Nです\nね', makeDeps({ getYomitanExt: () => ({ id: 'dummy-ext' }) as any, getYomitanParserWindow: () => ({ isDestroyed: () => false, webContents: { executeJavaScript: async (script: string) => { parseInput = script; return null; }, }, }) as unknown as Electron.BrowserWindow, }), ); assert.match(parseInput, /猫 です ね/); assert.equal(result.text, '猫\nです\nね'); assert.equal(result.tokens, null); }); test('tokenizeSubtitle collapses zero-width separators before Yomitan parse request', async () => { let parseInput = ''; const result = await tokenizeSubtitle( 'キリキリと\u200bかかってこい\nこのヘナチョコ冒険者どもめが!', makeDeps({ getYomitanExt: () => ({ id: 'dummy-ext' }) as any, getYomitanParserWindow: () => ({ isDestroyed: () => false, webContents: { executeJavaScript: async (script: string) => { parseInput = script; return null; }, }, }) as unknown as Electron.BrowserWindow, }), ); assert.match(parseInput, /キリキリと かかってこい このヘナチョコ冒険者どもめが!/); assert.equal(result.text, 'キリキリと\u200bかかってこい\nこのヘナチョコ冒険者どもめが!'); assert.equal(result.tokens, null); }); test('tokenizeSubtitle returns null tokens when Yomitan parsing is unavailable', async () => { const result = await tokenizeSubtitle('猫です', makeDeps()); assert.deepEqual(result, { text: '猫です', tokens: null }); }); test('tokenizeSubtitle skips token payload and annotations when Yomitan parse has no dictionary matches', async () => { let frequencyRequested = false; let jlptLookupCalls = 0; let mecabCalls = 0; const result = await tokenizeSubtitle( 'これはテスト', makeDeps({ getFrequencyDictionaryEnabled: () => true, getYomitanExt: () => ({ id: 'dummy-ext' }) as any, getYomitanParserWindow: () => ({ isDestroyed: () => false, webContents: { executeJavaScript: async (script: string) => { if (script.includes('getTermFrequencies')) { frequencyRequested = true; return []; } return [ { source: 'scanning-parser', index: 0, content: [ [{ text: 'これは', reading: 'これは' }], [{ text: 'テスト', reading: 'てすと' }], ], }, ]; }, }, }) as unknown as Electron.BrowserWindow, tokenizeWithMecab: async () => { mecabCalls += 1; return null; }, getJlptLevel: () => { jlptLookupCalls += 1; return 'N5'; }, }), ); assert.deepEqual(result, { text: 'これはテスト', tokens: null }); assert.equal(frequencyRequested, false); assert.equal(jlptLookupCalls, 0); assert.equal(mecabCalls, 0); }); test('tokenizeSubtitle excludes Yomitan token groups without dictionary headwords from annotation paths', async () => { let jlptLookupCalls = 0; let frequencyLookupCalls = 0; const result = await tokenizeSubtitle( '(ダクネスの荒い息) 猫', makeDeps({ getFrequencyDictionaryEnabled: () => true, getYomitanExt: () => ({ id: 'dummy-ext' }) as any, getYomitanParserWindow: () => ({ isDestroyed: () => false, webContents: { executeJavaScript: async (script: string) => { if (script.includes('getTermFrequencies')) { return []; } return [ { source: 'scanning-parser', index: 0, content: [ [{ text: '(ダクネスの荒い息)', reading: 'だくねすのあらいいき' }], [{ text: '猫', reading: 'ねこ', headwords: [[{ term: '猫' }]] }], ], }, ]; }, }, }) as unknown as Electron.BrowserWindow, getJlptLevel: (text) => { jlptLookupCalls += 1; return text === '猫' ? 'N5' : null; }, getFrequencyRank: () => { frequencyLookupCalls += 1; return 12; }, tokenizeWithMecab: async () => null, }), ); assert.equal(result.tokens?.length, 1); assert.equal(result.tokens?.[0]?.surface, '猫'); assert.equal(result.tokens?.[0]?.headword, '猫'); assert.equal(jlptLookupCalls, 1); assert.equal(frequencyLookupCalls, 1); }); test('tokenizeSubtitle returns null tokens when mecab throws', async () => { const result = await tokenizeSubtitle( '猫です', makeDeps({ tokenizeWithMecab: async () => { throw new Error('mecab failed'); }, }), ); assert.deepEqual(result, { text: '猫です', tokens: null }); }); test('tokenizeSubtitle uses Yomitan parser result when available and drops no-headword groups', async () => { const parserWindow = { isDestroyed: () => false, webContents: { executeJavaScript: async () => [ { source: 'scanning-parser', index: 0, content: [ [ { text: '猫', reading: 'ねこ', headwords: [[{ term: '猫' }]], }, ], [ { text: 'です', reading: 'です', }, ], ], }, ], }, } as unknown as Electron.BrowserWindow; const result = await tokenizeSubtitle( '猫です', makeDeps({ getYomitanExt: () => ({ id: 'dummy-ext' }) as any, getYomitanParserWindow: () => parserWindow, tokenizeWithMecab: async () => null, }), ); assert.equal(result.text, '猫です'); assert.equal(result.tokens?.length, 1); assert.equal(result.tokens?.[0]?.surface, '猫'); assert.equal(result.tokens?.[0]?.reading, 'ねこ'); assert.equal(result.tokens?.[0]?.isKnown, false); }); test('tokenizeSubtitle logs selected Yomitan groups when debug toggle is enabled', async () => { const infoLogs: string[] = []; const originalInfo = console.info; console.info = (...args: unknown[]) => { infoLogs.push(args.map((value) => String(value)).join(' ')); }; try { await tokenizeSubtitle( '友達と話した', makeDeps({ getYomitanExt: () => ({ id: 'dummy-ext' }) as any, getYomitanParserWindow: () => ({ isDestroyed: () => false, webContents: { executeJavaScript: async () => [ { source: 'scanning-parser', index: 0, content: [ [ { text: '友', reading: 'とも', headwords: [[{ term: '友達' }]], }, { text: '達', reading: 'だち', }, ], [ { text: 'と', reading: 'と', headwords: [[{ term: 'と' }]], }, ], ], }, ], }, }) as unknown as Electron.BrowserWindow, tokenizeWithMecab: async () => null, getYomitanGroupDebugEnabled: () => true, }), ); } finally { console.info = originalInfo; } assert.ok(infoLogs.some((line) => line.includes('Selected Yomitan token groups'))); }); test('tokenizeSubtitle does not log Yomitan groups when debug toggle is disabled', async () => { const infoLogs: string[] = []; const originalInfo = console.info; console.info = (...args: unknown[]) => { infoLogs.push(args.map((value) => String(value)).join(' ')); }; try { await tokenizeSubtitle( '友達と話した', makeDeps({ getYomitanExt: () => ({ id: 'dummy-ext' }) as any, getYomitanParserWindow: () => ({ isDestroyed: () => false, webContents: { executeJavaScript: async () => [ { source: 'scanning-parser', index: 0, content: [ [ { text: '友', reading: 'とも', headwords: [[{ term: '友達' }]], }, { text: '達', reading: 'だち', }, ], ], }, ], }, }) as unknown as Electron.BrowserWindow, tokenizeWithMecab: async () => null, getYomitanGroupDebugEnabled: () => false, }), ); } finally { console.info = originalInfo; } assert.equal( infoLogs.some((line) => line.includes('Selected Yomitan token groups')), false, ); }); test('tokenizeSubtitle preserves segmented Yomitan line as one token', async () => { const parserWindow = { isDestroyed: () => false, webContents: { executeJavaScript: async () => [ { source: 'scanning-parser', index: 0, content: [ [ { text: '猫', reading: 'ねこ', headwords: [[{ term: '猫です' }]], }, { text: 'です', reading: 'です', }, ], ], }, ], }, } as unknown as Electron.BrowserWindow; const result = await tokenizeSubtitle( '猫です', makeDeps({ getYomitanExt: () => ({ id: 'dummy-ext' }) as any, getYomitanParserWindow: () => parserWindow, tokenizeWithMecab: async () => null, }), ); assert.equal(result.text, '猫です'); assert.equal(result.tokens?.length, 1); assert.equal(result.tokens?.[0]?.surface, '猫です'); assert.equal(result.tokens?.[0]?.reading, 'ねこです'); assert.equal(result.tokens?.[0]?.headword, '猫です'); assert.equal(result.tokens?.[0]?.isKnown, false); }); test('tokenizeSubtitle keeps scanning parser token when scanning parser returns one token', async () => { const result = await tokenizeSubtitle( '俺は小園にいきたい', makeDeps({ getYomitanExt: () => ({ id: 'dummy-ext' }) as any, getYomitanParserWindow: () => ({ isDestroyed: () => false, webContents: { executeJavaScript: async () => [ { source: 'scanning-parser', index: 0, content: [ [ { text: '俺は小園にいきたい', reading: 'おれは小園にいきたい', headwords: [[{ term: '俺は小園にいきたい' }]], }, ], ], }, { source: 'mecab', index: 0, content: [ [ { text: '俺', reading: 'おれ', headwords: [[{ term: '俺' }]], }, ], [ { text: 'は', reading: 'は', headwords: [[{ term: 'は' }]], }, ], [ { text: '小園', reading: 'おうえん', headwords: [[{ term: '小園' }]], }, ], [ { text: 'に', reading: 'に', headwords: [[{ term: 'に' }]], }, ], [ { text: 'いきたい', reading: 'いきたい', headwords: [[{ term: 'いきたい' }]], }, ], ], }, ], }, }) as unknown as Electron.BrowserWindow, getFrequencyDictionaryEnabled: () => true, tokenizeWithMecab: async () => null, getFrequencyRank: (text) => (text === '小園' ? 25 : text === 'いきたい' ? 1500 : null), }), ); assert.equal(result.tokens?.length, 1); assert.equal(result.tokens?.map((token) => token.surface).join(','), '俺は小園にいきたい'); assert.equal(result.tokens?.[0]?.frequencyRank, undefined); }); test('tokenizeSubtitle keeps scanning parser tokens when they are already split', async () => { const result = await tokenizeSubtitle( '小園に行きたい', makeDeps({ getYomitanExt: () => ({ id: 'dummy-ext' }) as any, getYomitanParserWindow: () => ({ isDestroyed: () => false, webContents: { executeJavaScript: async () => [ { source: 'scanning-parser', index: 0, content: [ [ { text: '小園', reading: 'おうえん', headwords: [[{ term: '小園' }]], }, ], [ { text: 'に', reading: 'に', headwords: [[{ term: 'に' }]], }, ], [ { text: '行きたい', reading: 'いきたい', headwords: [[{ term: '行きたい' }]], }, ], ], }, { source: 'mecab', index: 0, content: [ [ { text: '小', reading: 'お', headwords: [[{ term: '小' }]], }, ], [ { text: '園', reading: 'えん', headwords: [[{ term: '園' }]], }, ], [ { text: 'に', reading: 'に', headwords: [[{ term: 'に' }]], }, ], [ { text: '行き', reading: 'いき', headwords: [[{ term: '行き' }]], }, ], [ { text: 'たい', reading: 'たい', headwords: [[{ term: 'たい' }]], }, ], ], }, ], }, }) as unknown as Electron.BrowserWindow, getFrequencyDictionaryEnabled: () => true, getFrequencyRank: (text) => (text === '小園' ? 20 : null), tokenizeWithMecab: async () => null, }), ); assert.equal(result.tokens?.length, 3); assert.equal(result.tokens?.map((token) => token.surface).join(','), '小園,に,行きたい'); assert.equal(result.tokens?.[0]?.frequencyRank, 20); assert.equal(result.tokens?.[1]?.frequencyRank, undefined); assert.equal(result.tokens?.[2]?.frequencyRank, undefined); }); test('tokenizeSubtitle keeps parsing explicit by scanning-parser source only', async () => { const result = await tokenizeSubtitle( '俺は公園にいきたい', makeDeps({ getYomitanExt: () => ({ id: 'dummy-ext' }) as any, getYomitanParserWindow: () => ({ isDestroyed: () => false, webContents: { executeJavaScript: async () => [ { source: 'scanning-parser', index: 0, content: [ [ { text: '俺', reading: 'おれ', headwords: [[{ term: '俺' }]], }, ], [{ text: 'は', reading: '', headwords: [[{ term: 'は' }]] }], [ { text: '公園', reading: 'こうえん', headwords: [[{ term: '公園' }]], }, ], [ { text: 'にい', reading: '', headwords: [[{ term: '兄' }], [{ term: '二位' }]], }, ], [ { text: 'きたい', reading: '', headwords: [[{ term: '期待' }], [{ term: '来る' }]], }, ], ], }, { source: 'scanning-parser', index: 0, content: [ [ { text: '俺', reading: 'おれ', headwords: [[{ term: '俺' }]], }, ], [ { text: 'は', reading: 'は', headwords: [[{ term: 'は' }]], }, ], [ { text: '公園', reading: 'こうえん', headwords: [[{ term: '公園' }]], }, ], [ { text: 'に', reading: 'に', headwords: [[{ term: 'に' }]], }, ], [ { text: '行きたい', reading: 'いきたい', headwords: [[{ term: '行きたい' }]], }, ], ], }, ], }, }) as unknown as Electron.BrowserWindow, getFrequencyDictionaryEnabled: () => true, getFrequencyRank: (text) => text === '俺' ? 51 : text === '公園' ? 2304 : text === '行きたい' ? 1500 : null, tokenizeWithMecab: async () => null, }), ); assert.equal(result.tokens?.map((token) => token.surface).join(','), '俺,は,公園,に,行きたい'); assert.equal(result.tokens?.[1]?.frequencyRank, undefined); assert.equal(result.tokens?.[3]?.frequencyRank, undefined); assert.equal(result.tokens?.[4]?.frequencyRank, 1500); }); test('tokenizeSubtitle still assigns frequency to non-known multi-character Yomitan tokens', async () => { const result = await tokenizeSubtitle( '小園友達', makeDeps({ getYomitanExt: () => ({ id: 'dummy-ext' }) as any, getYomitanParserWindow: () => ({ isDestroyed: () => false, webContents: { executeJavaScript: async () => [ { source: 'scanning-parser', index: 0, content: [ [ { text: '小園', reading: 'おうえん', headwords: [[{ term: '小園' }]], }, ], [ { text: '友達', reading: 'ともだち', headwords: [[{ term: '友達' }]], }, ], ], }, ], }, }) as unknown as Electron.BrowserWindow, getFrequencyDictionaryEnabled: () => true, getFrequencyRank: (text) => (text === '小園' ? 75 : text === '友達' ? 3000 : null), isKnownWord: (text) => text === '小園', }), ); assert.equal(result.tokens?.length, 2); assert.equal(result.tokens?.[0]?.isKnown, true); assert.equal(result.tokens?.[0]?.frequencyRank, 75); assert.equal(result.tokens?.[1]?.isKnown, false); assert.equal(result.tokens?.[1]?.frequencyRank, 3000); }); test('tokenizeSubtitle marks tokens as known using callback', async () => { const result = await tokenizeSubtitle( '猫です', makeDepsFromYomitanTokens([{ surface: '猫', reading: 'ねこ', headword: '猫' }], { isKnownWord: (text) => text === '猫', }), ); assert.equal(result.text, '猫です'); assert.equal(result.tokens?.[0]?.isKnown, true); }); test('tokenizeSubtitle still assigns frequency rank to non-known tokens', async () => { const result = await tokenizeSubtitle( '既知未知', makeDepsFromYomitanTokens( [ { surface: '既知', reading: 'きち', headword: '既知' }, { surface: '未知', reading: 'みち', headword: '未知' }, ], { getFrequencyDictionaryEnabled: () => true, getFrequencyRank: (text) => (text === '既知' ? 20 : text === '未知' ? 30 : null), isKnownWord: (text) => text === '既知', }, ), ); assert.equal(result.tokens?.length, 2); assert.equal(result.tokens?.[0]?.isKnown, true); assert.equal(result.tokens?.[0]?.frequencyRank, 20); assert.equal(result.tokens?.[1]?.isKnown, false); assert.equal(result.tokens?.[1]?.frequencyRank, 30); }); test('tokenizeSubtitle selects one N+1 target token', async () => { const result = await tokenizeSubtitle( '猫です', makeDepsFromYomitanTokens( [ { surface: '私', reading: 'わたし', headword: '私' }, { surface: '犬', reading: 'いぬ', headword: '犬' }, ], { getMinSentenceWordsForNPlusOne: () => 2, isKnownWord: (text) => text === '私', }, ), ); const targets = result.tokens?.filter((token) => token.isNPlusOneTarget) ?? []; assert.equal(targets.length, 1); assert.equal(targets[0]?.surface, '犬'); }); test('tokenizeSubtitle does not mark target when sentence has multiple candidates', async () => { const result = await tokenizeSubtitle( '猫犬', makeDepsFromYomitanTokens( [ { surface: '猫', reading: 'ねこ', headword: '猫' }, { surface: '犬', reading: 'いぬ', headword: '犬' }, ], {}, ), ); assert.equal( result.tokens?.some((token) => token.isNPlusOneTarget), false, ); }); test('tokenizeSubtitle applies N+1 target marking to Yomitan results', async () => { const parserWindow = { isDestroyed: () => false, webContents: { executeJavaScript: async () => [ { source: 'scanning-parser', index: 0, content: [ [ { text: '猫', reading: 'ねこ', headwords: [[{ term: '猫' }]], }, ], [ { text: 'です', reading: 'です', headwords: [[{ term: 'です' }]], }, ], ], }, ], }, } as unknown as Electron.BrowserWindow; const result = await tokenizeSubtitle( '猫です', makeDeps({ getYomitanExt: () => ({ id: 'dummy-ext' }) as any, getYomitanParserWindow: () => parserWindow, tokenizeWithMecab: async () => null, isKnownWord: (text) => text === 'です', getMinSentenceWordsForNPlusOne: () => 2, }), ); assert.equal(result.text, '猫です'); assert.equal(result.tokens?.length, 2); assert.equal(result.tokens?.[0]?.surface, '猫'); assert.equal(result.tokens?.[0]?.isNPlusOneTarget, true); assert.equal(result.tokens?.[1]?.isNPlusOneTarget, false); }); test('tokenizeSubtitle ignores Yomitan functional tokens when evaluating N+1 candidates', async () => { const parserWindow = { isDestroyed: () => false, webContents: { executeJavaScript: async () => [ { source: 'scanning-parser', index: 0, content: [ [{ text: '私', reading: 'わたし', headwords: [[{ term: '私' }]] }], [{ text: 'も', reading: 'も', headwords: [[{ term: 'も' }]] }], [{ text: 'あの', reading: 'あの', headwords: [[{ term: 'あの' }]] }], [{ text: '仮面', reading: 'かめん', headwords: [[{ term: '仮面' }]] }], [{ text: 'が', reading: 'が', headwords: [[{ term: 'が' }]] }], [{ text: '欲しい', reading: 'ほしい', headwords: [[{ term: '欲しい' }]] }], [{ text: 'です', reading: 'です', headwords: [[{ term: 'です' }]] }], ], }, ], }, } as unknown as Electron.BrowserWindow; const result = await tokenizeSubtitle( '私も あの仮面が欲しいです', makeDeps({ getYomitanExt: () => ({ id: 'dummy-ext' }) as any, getYomitanParserWindow: () => parserWindow, tokenizeWithMecab: async () => [ { surface: '私', reading: 'ワタシ', headword: '私', startPos: 0, endPos: 1, partOfSpeech: PartOfSpeech.noun, pos1: '名詞', isMerged: false, isKnown: true, isNPlusOneTarget: false, }, { surface: 'も', reading: 'モ', headword: 'も', startPos: 1, endPos: 2, partOfSpeech: PartOfSpeech.particle, pos1: '助詞', isMerged: false, isKnown: false, isNPlusOneTarget: false, }, { surface: 'あの', reading: 'アノ', headword: 'あの', startPos: 2, endPos: 4, partOfSpeech: PartOfSpeech.other, pos1: '連体詞', isMerged: false, isKnown: true, isNPlusOneTarget: false, }, { surface: '仮面', reading: 'カメン', headword: '仮面', startPos: 4, endPos: 6, partOfSpeech: PartOfSpeech.noun, pos1: '名詞', isMerged: false, isKnown: false, isNPlusOneTarget: false, }, { surface: 'が', reading: 'ガ', headword: 'が', startPos: 6, endPos: 7, partOfSpeech: PartOfSpeech.particle, pos1: '助詞', isMerged: false, isKnown: false, isNPlusOneTarget: false, }, { surface: '欲しい', reading: 'ホシイ', headword: '欲しい', startPos: 7, endPos: 10, partOfSpeech: PartOfSpeech.i_adjective, pos1: '形容詞', isMerged: false, isKnown: true, isNPlusOneTarget: false, }, { surface: 'です', reading: 'デス', headword: 'です', startPos: 10, endPos: 12, partOfSpeech: PartOfSpeech.bound_auxiliary, pos1: '助動詞', isMerged: false, isKnown: false, isNPlusOneTarget: false, }, ], isKnownWord: (text) => text === '私' || text === 'あの' || text === '欲しい', }), ); const targets = result.tokens?.filter((token) => token.isNPlusOneTarget) ?? []; assert.equal(targets.length, 1); assert.equal(targets[0]?.surface, '仮面'); }); test('tokenizeSubtitle keeps correct MeCab pos1 enrichment when Yomitan offsets skip spaces', async () => { const parserWindow = { isDestroyed: () => false, webContents: { executeJavaScript: async () => [ { source: 'scanning-parser', index: 0, content: [ [{ text: '私', reading: 'わたし', headwords: [[{ term: '私' }]] }], [{ text: 'も', reading: 'も', headwords: [[{ term: 'も' }]] }], [{ text: 'あの', reading: 'あの', headwords: [[{ term: 'あの' }]] }], [{ text: '仮面', reading: 'かめん', headwords: [[{ term: '仮面' }]] }], [{ text: 'が', reading: 'が', headwords: [[{ term: 'が' }]] }], [{ text: '欲しい', reading: 'ほしい', headwords: [[{ term: '欲しい' }]] }], [{ text: 'です', reading: 'です', headwords: [[{ term: 'です' }]] }], ], }, ], }, } as unknown as Electron.BrowserWindow; const result = await tokenizeSubtitle( '私も あの仮面が欲しいです', makeDeps({ getYomitanExt: () => ({ id: 'dummy-ext' }) as any, getYomitanParserWindow: () => parserWindow, tokenizeWithMecab: async () => [ { surface: '私', reading: 'ワタシ', headword: '私', startPos: 0, endPos: 1, partOfSpeech: PartOfSpeech.noun, pos1: '名詞', isMerged: false, isKnown: true, isNPlusOneTarget: false, }, { surface: 'も', reading: 'モ', headword: 'も', startPos: 1, endPos: 2, partOfSpeech: PartOfSpeech.particle, pos1: '助詞', isMerged: false, isKnown: false, isNPlusOneTarget: false, }, { surface: ' ', reading: '', headword: ' ', startPos: 2, endPos: 3, partOfSpeech: PartOfSpeech.symbol, pos1: '記号', isMerged: false, isKnown: false, isNPlusOneTarget: false, }, { surface: 'あの', reading: 'アノ', headword: 'あの', startPos: 3, endPos: 5, partOfSpeech: PartOfSpeech.other, pos1: '連体詞', isMerged: false, isKnown: true, isNPlusOneTarget: false, }, { surface: '仮面', reading: 'カメン', headword: '仮面', startPos: 5, endPos: 7, partOfSpeech: PartOfSpeech.noun, pos1: '名詞', isMerged: false, isKnown: false, isNPlusOneTarget: false, }, { surface: 'が', reading: 'ガ', headword: 'が', startPos: 7, endPos: 8, partOfSpeech: PartOfSpeech.particle, pos1: '助詞', isMerged: false, isKnown: false, isNPlusOneTarget: false, }, { surface: '欲しい', reading: 'ホシイ', headword: '欲しい', startPos: 8, endPos: 11, partOfSpeech: PartOfSpeech.i_adjective, pos1: '形容詞', isMerged: false, isKnown: true, isNPlusOneTarget: false, }, { surface: 'です', reading: 'デス', headword: 'です', startPos: 11, endPos: 13, partOfSpeech: PartOfSpeech.bound_auxiliary, pos1: '助動詞', isMerged: false, isKnown: false, isNPlusOneTarget: false, }, ], isKnownWord: (text) => text === '私' || text === 'あの' || text === '欲しい', }), ); const targets = result.tokens?.filter((token) => token.isNPlusOneTarget) ?? []; const gaToken = result.tokens?.find((token) => token.surface === 'が'); const desuToken = result.tokens?.find((token) => token.surface === 'です'); assert.equal(gaToken?.pos1, '助詞'); assert.equal(gaToken?.isKnown, false); assert.equal(gaToken?.isNPlusOneTarget, false); assert.equal(gaToken?.jlptLevel, undefined); assert.equal(gaToken?.frequencyRank, undefined); assert.equal(desuToken?.pos1, '助動詞'); assert.equal(desuToken?.isKnown, false); assert.equal(desuToken?.isNPlusOneTarget, false); assert.equal(desuToken?.jlptLevel, undefined); assert.equal(desuToken?.frequencyRank, undefined); assert.equal(targets.length, 1); assert.equal(targets[0]?.surface, '仮面'); }); test('tokenizeSubtitle preserves merged token frequency when MeCab positions cross a newline gap', async () => { const parserWindow = { isDestroyed: () => false, webContents: { executeJavaScript: async (script: string) => { if (script.includes('getTermFrequencies')) { return script.includes('"term":"陰に","reading":"いんに"') ? [ { term: '陰に', reading: 'いんに', dictionary: 'JPDBv2㋕', frequency: 5702, displayValue: '5702', displayValueParsed: false, }, ] : []; } return [ { surface: 'X', reading: 'えっくす', headword: 'X', startPos: 0, endPos: 1, }, { surface: '陰に', reading: 'いんに', headword: '陰に', startPos: 2, endPos: 4, }, { surface: '潜み', reading: 'ひそ', headword: '潜む', startPos: 4, endPos: 6, }, ]; }, }, } as unknown as Electron.BrowserWindow; const deps = createTokenizerDepsRuntime({ getYomitanExt: () => ({ id: 'dummy-ext' }) as any, getYomitanParserWindow: () => parserWindow, setYomitanParserWindow: () => {}, getYomitanParserReadyPromise: () => null, setYomitanParserReadyPromise: () => {}, getYomitanParserInitPromise: () => null, setYomitanParserInitPromise: () => {}, isKnownWord: () => false, getKnownWordMatchMode: () => 'headword', getJlptLevel: () => null, getFrequencyDictionaryEnabled: () => true, getMecabTokenizer: () => ({ tokenize: async () => [ { word: 'X', partOfSpeech: PartOfSpeech.noun, pos1: '名詞', pos2: '一般', pos3: '', pos4: '', inflectionType: '', inflectionForm: '', headword: 'X', katakanaReading: 'エックス', pronunciation: 'エックス', }, { word: '陰', partOfSpeech: PartOfSpeech.noun, pos1: '名詞', pos2: '一般', pos3: '', pos4: '', inflectionType: '', inflectionForm: '', headword: '陰', katakanaReading: 'カゲ', pronunciation: 'カゲ', }, { word: 'に', partOfSpeech: PartOfSpeech.particle, pos1: '助詞', pos2: '格助詞', pos3: '一般', pos4: '', inflectionType: '', inflectionForm: '', headword: 'に', katakanaReading: 'ニ', pronunciation: 'ニ', }, { word: '潜み', partOfSpeech: PartOfSpeech.verb, pos1: '動詞', pos2: '自立', pos3: '', pos4: '', inflectionType: '五段・マ行', inflectionForm: '連用形', headword: '潜む', katakanaReading: 'ヒソミ', pronunciation: 'ヒソミ', }, ], }), }); const result = await tokenizeSubtitle('X\n陰に潜み', deps); assert.equal(result.tokens?.[1]?.surface, '陰に'); assert.equal(result.tokens?.[1]?.pos1, '名詞|助詞'); assert.equal(result.tokens?.[1]?.pos2, '一般|格助詞'); assert.equal(result.tokens?.[1]?.frequencyRank, 5702); }); test('tokenizeSubtitle does not color 1-2 word sentences by default', async () => { const result = await tokenizeSubtitle( '猫です', makeDepsFromYomitanTokens( [ { surface: '私', reading: 'わたし', headword: '私' }, { surface: '犬', reading: 'いぬ', headword: '犬' }, ], {}, ), ); assert.equal( result.tokens?.some((token) => token.isNPlusOneTarget), false, ); }); test('tokenizeSubtitle checks known words by headword, not surface', async () => { const result = await tokenizeSubtitle( '猫です', makeDepsFromYomitanTokens([{ surface: '猫', reading: 'ねこ', headword: '猫です' }], { isKnownWord: (text) => text === '猫です', }), ); assert.equal(result.text, '猫です'); assert.equal(result.tokens?.[0]?.isKnown, true); }); test('tokenizeSubtitle checks known words by surface when configured', async () => { const result = await tokenizeSubtitle( '猫です', makeDepsFromYomitanTokens([{ surface: '猫', reading: 'ねこ', headword: '猫です' }], { getKnownWordMatchMode: () => 'surface', isKnownWord: (text) => text === '猫', }), ); assert.equal(result.text, '猫です'); assert.equal(result.tokens?.[0]?.isKnown, true); }); test('tokenizeSubtitle uses frequency surface match mode when configured', async () => { const result = await tokenizeSubtitle( '鍛えた', makeDepsFromYomitanTokens([{ surface: '鍛えた', reading: 'きたえた', headword: '鍛える' }], { getFrequencyDictionaryEnabled: () => true, getFrequencyDictionaryMatchMode: () => 'surface', getFrequencyRank: (text) => (text === '鍛えた' ? 2847 : null), }), ); assert.equal(result.text, '鍛えた'); assert.equal(result.tokens?.[0]?.frequencyRank, 2847); }); test('createTokenizerDepsRuntime checks MeCab availability before first tokenizeWithMecab call', async () => { let available = false; let checkCalls = 0; const deps = createTokenizerDepsRuntime({ getYomitanExt: () => null, getYomitanParserWindow: () => null, setYomitanParserWindow: () => {}, getYomitanParserReadyPromise: () => null, setYomitanParserReadyPromise: () => {}, getYomitanParserInitPromise: () => null, setYomitanParserInitPromise: () => {}, isKnownWord: () => false, getKnownWordMatchMode: () => 'headword', getJlptLevel: () => null, getMecabTokenizer: () => ({ getStatus: () => ({ available }), checkAvailability: async () => { checkCalls += 1; available = true; return true; }, tokenize: async () => { if (!available) { return null; } return [ { word: '仮面', partOfSpeech: PartOfSpeech.noun, pos1: '名詞', pos2: '一般', pos3: '', pos4: '', inflectionType: '', inflectionForm: '', headword: '仮面', katakanaReading: 'カメン', pronunciation: 'カメン', }, ]; }, }), }); const first = await deps.tokenizeWithMecab('仮面'); const second = await deps.tokenizeWithMecab('仮面'); assert.equal(checkCalls, 1); assert.equal(first?.[0]?.surface, '仮面'); assert.equal(second?.[0]?.surface, '仮面'); }); test('createTokenizerDepsRuntime skips known-word lookup for MeCab POS enrichment tokens', async () => { let knownWordCalls = 0; const deps = createTokenizerDepsRuntime({ getYomitanExt: () => null, getYomitanParserWindow: () => null, setYomitanParserWindow: () => {}, getYomitanParserReadyPromise: () => null, setYomitanParserReadyPromise: () => {}, getYomitanParserInitPromise: () => null, setYomitanParserInitPromise: () => {}, isKnownWord: () => { knownWordCalls += 1; return true; }, getKnownWordMatchMode: () => 'headword', getJlptLevel: () => null, getMecabTokenizer: () => ({ tokenize: async () => [ { word: '仮面', partOfSpeech: PartOfSpeech.noun, pos1: '名詞', pos2: '一般', pos3: '', pos4: '', inflectionType: '', inflectionForm: '', headword: '仮面', katakanaReading: 'カメン', pronunciation: 'カメン', }, ], }), }); const tokens = await deps.tokenizeWithMecab('仮面'); assert.equal(knownWordCalls, 0); assert.equal(tokens?.[0]?.isKnown, false); }); test('tokenizeSubtitle uses async MeCab enrichment override when provided', async () => { const result = await tokenizeSubtitle( '猫', makeDepsFromYomitanTokens([{ surface: '猫', reading: 'ねこ', headword: '猫' }], { tokenizeWithMecab: async () => [ { headword: '猫', surface: '猫', reading: 'ネコ', startPos: 0, endPos: 1, partOfSpeech: PartOfSpeech.noun, pos1: '名詞', isMerged: true, isKnown: false, isNPlusOneTarget: false, }, ], enrichTokensWithMecab: async (tokens) => tokens.map((token) => ({ ...token, pos1: 'override-pos', })), }), ); assert.equal(result.tokens?.length, 1); assert.equal(result.tokens?.[0]?.pos1, 'override-pos'); }); test('createTokenizerDepsRuntime exposes async MeCab enrichment helper', async () => { const deps = createTokenizerDepsRuntime({ getYomitanExt: () => null, getYomitanParserWindow: () => null, setYomitanParserWindow: () => {}, getYomitanParserReadyPromise: () => null, setYomitanParserReadyPromise: () => {}, getYomitanParserInitPromise: () => null, setYomitanParserInitPromise: () => {}, isKnownWord: () => false, getKnownWordMatchMode: () => 'headword', getJlptLevel: () => null, getMecabTokenizer: () => null, }); const enriched = await deps.enrichTokensWithMecab?.( [ { headword: 'は', surface: 'は', reading: 'は', startPos: 0, endPos: 1, partOfSpeech: PartOfSpeech.other, isMerged: true, isKnown: false, isNPlusOneTarget: false, }, ], [ { headword: 'は', surface: 'は', reading: 'ハ', startPos: 0, endPos: 1, partOfSpeech: PartOfSpeech.particle, pos1: '助詞', isMerged: false, isKnown: false, isNPlusOneTarget: false, }, ], ); assert.equal(enriched?.[0]?.pos1, '助詞'); }); test('tokenizeSubtitle skips all enrichment stages when disabled', async () => { let knownCalls = 0; let mecabCalls = 0; let jlptCalls = 0; let frequencyCalls = 0; const result = await tokenizeSubtitle( '猫', makeDepsFromYomitanTokens([{ surface: '猫', reading: 'ねこ', headword: '猫' }], { isKnownWord: () => { knownCalls += 1; return true; }, getNPlusOneEnabled: () => false, getJlptEnabled: () => false, getFrequencyDictionaryEnabled: () => false, getJlptLevel: () => { jlptCalls += 1; return 'N5'; }, getFrequencyRank: () => { frequencyCalls += 1; return 10; }, tokenizeWithMecab: async () => { mecabCalls += 1; return null; }, }), ); assert.equal(result.tokens?.length, 1); assert.equal(result.tokens?.[0]?.isKnown, false); assert.equal(result.tokens?.[0]?.isNPlusOneTarget, false); assert.equal(result.tokens?.[0]?.jlptLevel, undefined); assert.equal(result.tokens?.[0]?.frequencyRank, undefined); assert.equal(knownCalls, 0); assert.equal(mecabCalls, 0); assert.equal(jlptCalls, 0); assert.equal(frequencyCalls, 0); }); test('tokenizeSubtitle keeps frequency enrichment while n+1 is disabled', async () => { let knownCalls = 0; let mecabCalls = 0; let frequencyCalls = 0; const result = await tokenizeSubtitle( '猫', makeDepsFromYomitanTokens([{ surface: '猫', reading: 'ねこ', headword: '猫' }], { isKnownWord: () => { knownCalls += 1; return true; }, getNPlusOneEnabled: () => false, getJlptEnabled: () => false, getFrequencyDictionaryEnabled: () => true, getFrequencyRank: () => { frequencyCalls += 1; return 7; }, tokenizeWithMecab: async () => { mecabCalls += 1; return [ { headword: '猫', surface: '猫', reading: 'ネコ', startPos: 0, endPos: 1, partOfSpeech: PartOfSpeech.noun, pos1: '名詞', isMerged: false, isKnown: false, isNPlusOneTarget: false, }, ]; }, }), ); assert.equal(result.tokens?.[0]?.frequencyRank, 7); assert.equal(result.tokens?.[0]?.isKnown, false); assert.equal(knownCalls, 0); assert.equal(mecabCalls, 1); assert.equal(frequencyCalls, 1); }); test('tokenizeSubtitle excludes default non-independent pos2 from N+1 and frequency annotations', async () => { const result = await tokenizeSubtitle( 'になれば', makeDepsFromYomitanTokens([{ surface: 'になれば', reading: 'になれば', headword: 'なる' }], { getFrequencyDictionaryEnabled: () => true, getFrequencyRank: (text) => (text === 'なる' ? 11 : null), tokenizeWithMecab: async () => [ { headword: 'なる', surface: 'になれば', reading: 'ニナレバ', startPos: 0, endPos: 4, partOfSpeech: PartOfSpeech.verb, pos1: '動詞', pos2: '非自立', isMerged: true, isKnown: false, isNPlusOneTarget: false, }, ], getMinSentenceWordsForNPlusOne: () => 1, }), ); assert.equal(result.tokens?.length, 1); assert.equal(result.tokens?.[0]?.frequencyRank, undefined); assert.equal(result.tokens?.[0]?.isNPlusOneTarget, false); }); test('tokenizeSubtitle keeps mecab-tagged interjections tokenized while clearing annotation metadata', async () => { const result = await tokenizeSubtitle( 'ぐはっ', makeDepsFromYomitanTokens([{ surface: 'ぐはっ', reading: 'ぐはっ', headword: 'ぐはっ' }], { getFrequencyDictionaryEnabled: () => true, getFrequencyRank: () => 17, getJlptLevel: () => 'N5', tokenizeWithMecab: async () => [ { headword: 'ぐはっ', surface: 'ぐはっ', reading: 'グハッ', startPos: 0, endPos: 3, partOfSpeech: PartOfSpeech.other, pos1: '感動詞', isMerged: true, isKnown: false, isNPlusOneTarget: false, }, ], }), ); assert.equal(result.text, 'ぐはっ'); assert.deepEqual( result.tokens?.map((token) => ({ surface: token.surface, headword: token.headword, reading: token.reading, pos1: token.pos1, jlptLevel: token.jlptLevel, frequencyRank: token.frequencyRank, isKnown: token.isKnown, isNPlusOneTarget: token.isNPlusOneTarget, })), [ { surface: 'ぐはっ', headword: 'ぐはっ', reading: 'ぐはっ', pos1: '感動詞', jlptLevel: undefined, frequencyRank: undefined, isKnown: false, isNPlusOneTarget: false, }, ], ); }); test('tokenizeSubtitle keeps excluded interjections hoverable while clearing only their annotation metadata', async () => { const result = await tokenizeSubtitle( 'ぐはっ 猫', makeDeps({ getFrequencyDictionaryEnabled: () => true, getFrequencyRank: (text) => (text === '猫' ? 11 : 17), getJlptLevel: (text) => (text === '猫' ? 'N5' : null), getYomitanExt: () => ({ id: 'dummy-ext' }) as any, getYomitanParserWindow: () => ({ isDestroyed: () => false, webContents: { executeJavaScript: async (script: string) => { if (script.includes('getTermFrequencies')) { return []; } return [ { source: 'scanning-parser', index: 0, content: [ [{ text: 'ぐはっ', reading: 'ぐはっ', headwords: [[{ term: 'ぐはっ' }]] }], [{ text: '猫', reading: 'ねこ', headwords: [[{ term: '猫' }]] }], ], }, ]; }, }, }) as unknown as Electron.BrowserWindow, tokenizeWithMecab: async () => [ { headword: 'ぐはっ', surface: 'ぐはっ', reading: 'グハッ', startPos: 0, endPos: 3, partOfSpeech: PartOfSpeech.other, pos1: '感動詞', isMerged: true, isKnown: false, isNPlusOneTarget: false, }, { headword: '猫', surface: '猫', reading: 'ネコ', startPos: 4, endPos: 5, partOfSpeech: PartOfSpeech.noun, pos1: '名詞', isMerged: true, isKnown: false, isNPlusOneTarget: false, }, ], }), ); assert.equal(result.text, 'ぐはっ 猫'); assert.deepEqual( result.tokens?.map((token) => ({ surface: token.surface, headword: token.headword, frequencyRank: token.frequencyRank, jlptLevel: token.jlptLevel, })), [ { surface: 'ぐはっ', headword: 'ぐはっ', frequencyRank: undefined, jlptLevel: undefined }, { surface: '猫', headword: '猫', frequencyRank: 11, jlptLevel: 'N5' }, ], ); }); test('tokenizeSubtitle keeps explanatory ending variants hoverable while clearing only their annotation metadata', async () => { const result = await tokenizeSubtitle( '猫んです', makeDepsFromYomitanTokens( [ { surface: '猫', reading: 'ねこ', headword: '猫' }, { surface: 'んです', reading: 'んです', headword: 'ん' }, ], { getFrequencyDictionaryEnabled: () => true, getFrequencyRank: (text) => (text === '猫' ? 11 : 500), getJlptLevel: (text) => (text === '猫' ? 'N5' : null), tokenizeWithMecab: async () => [ { headword: '猫', surface: '猫', reading: 'ネコ', startPos: 0, endPos: 1, partOfSpeech: PartOfSpeech.noun, pos1: '名詞', pos2: '一般', isMerged: false, isKnown: false, isNPlusOneTarget: false, }, { headword: 'ん', surface: 'ん', reading: 'ン', startPos: 1, endPos: 2, partOfSpeech: PartOfSpeech.other, pos1: '名詞', pos2: '非自立', isMerged: false, isKnown: false, isNPlusOneTarget: false, }, { headword: 'です', surface: 'です', reading: 'デス', startPos: 2, endPos: 4, partOfSpeech: PartOfSpeech.bound_auxiliary, pos1: '助動詞', isMerged: false, isKnown: false, isNPlusOneTarget: false, }, ], }, ), ); assert.equal(result.text, '猫んです'); assert.deepEqual( result.tokens?.map((token) => ({ surface: token.surface, headword: token.headword, jlptLevel: token.jlptLevel, frequencyRank: token.frequencyRank, })), [ { surface: '猫', headword: '猫', jlptLevel: 'N5', frequencyRank: 11 }, { surface: 'んです', headword: 'ん', jlptLevel: undefined, frequencyRank: undefined }, ], ); }); test('tokenizeSubtitle keeps standalone grammar-only tokens hoverable while clearing only their annotation metadata', async () => { const result = await tokenizeSubtitle( '私はこの猫です', makeDeps({ getFrequencyDictionaryEnabled: () => true, getFrequencyRank: (text) => (text === '私' ? 50 : text === '猫' ? 11 : 500), getJlptLevel: (text) => (text === '私' ? 'N5' : text === '猫' ? 'N5' : null), getYomitanExt: () => ({ id: 'dummy-ext' }) as any, getYomitanParserWindow: () => ({ isDestroyed: () => false, webContents: { executeJavaScript: async (script: string) => { if (script.includes('getTermFrequencies')) { return []; } return [ { source: 'scanning-parser', index: 0, content: [ [{ text: '私', reading: 'わたし', headwords: [[{ term: '私' }]] }], [{ text: 'は', reading: 'は', headwords: [[{ term: 'は' }]] }], [{ text: 'この', reading: 'この', headwords: [[{ term: 'この' }]] }], [{ text: '猫', reading: 'ねこ', headwords: [[{ term: '猫' }]] }], [{ text: 'です', reading: 'です', headwords: [[{ term: 'です' }]] }], ], }, ]; }, }, }) as unknown as Electron.BrowserWindow, tokenizeWithMecab: async () => [ { headword: '私', surface: '私', reading: 'ワタシ', startPos: 0, endPos: 1, partOfSpeech: PartOfSpeech.noun, pos1: '名詞', pos2: '代名詞', isMerged: true, isKnown: false, isNPlusOneTarget: false, }, { headword: 'は', surface: 'は', reading: 'ハ', startPos: 1, endPos: 2, partOfSpeech: PartOfSpeech.particle, pos1: '助詞', pos2: '係助詞', isMerged: true, isKnown: false, isNPlusOneTarget: false, }, { headword: 'この', surface: 'この', reading: 'コノ', startPos: 2, endPos: 4, partOfSpeech: PartOfSpeech.other, pos1: '連体詞', isMerged: true, isKnown: false, isNPlusOneTarget: false, }, { headword: '猫', surface: '猫', reading: 'ネコ', startPos: 4, endPos: 5, partOfSpeech: PartOfSpeech.noun, pos1: '名詞', pos2: '一般', isMerged: true, isKnown: false, isNPlusOneTarget: false, }, { headword: 'です', surface: 'です', reading: 'デス', startPos: 5, endPos: 7, partOfSpeech: PartOfSpeech.bound_auxiliary, pos1: '助動詞', isMerged: true, isKnown: false, isNPlusOneTarget: false, }, ], }), ); assert.equal(result.text, '私はこの猫です'); assert.deepEqual( result.tokens?.map((token) => ({ surface: token.surface, headword: token.headword, frequencyRank: token.frequencyRank, jlptLevel: token.jlptLevel, })), [ { surface: '私', headword: '私', frequencyRank: 50, jlptLevel: 'N5' }, { surface: 'は', headword: 'は', frequencyRank: undefined, jlptLevel: undefined }, { surface: 'この', headword: 'この', frequencyRank: undefined, jlptLevel: undefined }, { surface: '猫', headword: '猫', frequencyRank: 11, jlptLevel: 'N5' }, { surface: 'です', headword: 'です', frequencyRank: undefined, jlptLevel: undefined }, ], ); }); test('tokenizeSubtitle keeps trailing quote-particle merged tokens hoverable while clearing only their annotation metadata', async () => { const result = await tokenizeSubtitle( 'どうしてもって', makeDepsFromYomitanTokens([{ surface: 'どうしてもって', reading: 'どうしてもって', headword: 'どうしても' }], { getFrequencyDictionaryEnabled: () => true, getFrequencyRank: (text) => (text === 'どうしても' ? 123 : null), getJlptLevel: (text) => (text === 'どうしても' ? 'N3' : null), tokenizeWithMecab: async () => [ { headword: 'どうしても', surface: 'どうしても', reading: 'ドウシテモ', startPos: 0, endPos: 5, partOfSpeech: PartOfSpeech.other, pos1: '副詞', pos2: '一般', isMerged: false, isKnown: false, isNPlusOneTarget: false, }, { headword: 'って', surface: 'って', reading: 'ッテ', startPos: 5, endPos: 7, partOfSpeech: PartOfSpeech.particle, pos1: '助詞', pos2: '格助詞', isMerged: false, isKnown: false, isNPlusOneTarget: false, }, ], getMinSentenceWordsForNPlusOne: () => 1, }), ); assert.equal(result.text, 'どうしてもって'); assert.deepEqual( result.tokens?.map((token) => ({ surface: token.surface, headword: token.headword, jlptLevel: token.jlptLevel, frequencyRank: token.frequencyRank, })), [ { surface: 'どうしてもって', headword: 'どうしても', jlptLevel: undefined, frequencyRank: undefined, }, ], ); }); test('tokenizeSubtitle keeps auxiliary-stem そうだ grammar tails hoverable while clearing annotation metadata', async () => { const result = await tokenizeSubtitle( '与えるそうだ', makeDepsFromYomitanTokens( [ { surface: '与える', reading: 'あたえる', headword: '与える' }, { surface: 'そうだ', reading: 'そうだ', headword: 'そうだ' }, ], { getFrequencyDictionaryEnabled: () => true, getFrequencyRank: (text) => (text === '与える' ? 100 : text === 'そうだ' ? 12 : null), getJlptLevel: (text) => (text === '与える' ? 'N3' : text === 'そうだ' ? 'N5' : null), tokenizeWithMecab: async () => [ { headword: '与える', surface: '与える', reading: 'アタエル', startPos: 0, endPos: 3, partOfSpeech: PartOfSpeech.verb, pos1: '動詞', pos2: '自立', isMerged: false, isKnown: false, isNPlusOneTarget: false, }, { headword: 'そう', surface: 'そう', reading: 'ソウ', startPos: 3, endPos: 5, partOfSpeech: PartOfSpeech.noun, pos1: '名詞', pos2: '特殊', pos3: '助動詞語幹', isMerged: false, isKnown: false, isNPlusOneTarget: false, }, { headword: 'だ', surface: 'だ', reading: 'ダ', startPos: 5, endPos: 6, partOfSpeech: PartOfSpeech.bound_auxiliary, pos1: '助動詞', isMerged: false, isKnown: false, isNPlusOneTarget: false, }, ], getMinSentenceWordsForNPlusOne: () => 1, }, ), ); assert.equal(result.text, '与えるそうだ'); assert.deepEqual( result.tokens?.map((token) => ({ surface: token.surface, headword: token.headword, frequencyRank: token.frequencyRank, jlptLevel: token.jlptLevel, })), [ { surface: '与える', headword: '与える', frequencyRank: 100, jlptLevel: 'N3' }, { surface: 'そうだ', headword: 'そうだ', frequencyRank: undefined, jlptLevel: undefined }, ], ); }); test('tokenizeSubtitle excludes single-kana merged tokens from frequency highlighting', async () => { const result = await tokenizeSubtitle( 'た', makeDepsFromYomitanTokens([{ surface: 'た', reading: 'た', headword: 'た' }], { getFrequencyDictionaryEnabled: () => true, getFrequencyRank: (text) => (text === 'た' ? 17 : null), getMinSentenceWordsForNPlusOne: () => 1, tokenizeWithMecab: async () => null, }), ); assert.equal(result.tokens?.length, 1); assert.equal(result.tokens?.[0]?.frequencyRank, undefined); }); test('tokenizeSubtitle excludes merged function/content token from frequency highlighting but keeps N+1', async () => { const result = await tokenizeSubtitle( 'になれば', makeDepsFromYomitanTokens([{ surface: 'になれば', reading: 'になれば', headword: 'なる' }], { getFrequencyDictionaryEnabled: () => true, getFrequencyRank: (text) => (text === 'なる' ? 13 : null), tokenizeWithMecab: async () => [ { headword: 'に', surface: 'に', reading: 'ニ', startPos: 0, endPos: 1, partOfSpeech: PartOfSpeech.particle, pos1: '助詞', pos2: '格助詞', isMerged: false, isKnown: false, isNPlusOneTarget: false, }, { headword: 'なる', surface: 'なれ', reading: 'ナレ', startPos: 1, endPos: 3, partOfSpeech: PartOfSpeech.verb, pos1: '動詞', pos2: '自立', isMerged: false, isKnown: false, isNPlusOneTarget: false, }, { headword: 'ば', surface: 'ば', reading: 'バ', startPos: 3, endPos: 4, partOfSpeech: PartOfSpeech.particle, pos1: '助詞', pos2: '接続助詞', isMerged: false, isKnown: false, isNPlusOneTarget: false, }, ], getMinSentenceWordsForNPlusOne: () => 1, }), ); assert.equal(result.tokens?.length, 1); assert.equal(result.tokens?.[0]?.pos1, '助詞|動詞'); assert.equal(result.tokens?.[0]?.frequencyRank, undefined); assert.equal(result.tokens?.[0]?.isNPlusOneTarget, true); }); test('tokenizeSubtitle clears all annotations for kana-only demonstrative helper merges', async () => { const result = await tokenizeSubtitle( 'これで実力どおりか', makeDepsFromYomitanTokens( [ { surface: 'これで', reading: 'これで', headword: 'これ' }, { surface: '実力どおり', reading: 'じつりょくどおり', headword: '実力どおり' }, { surface: 'か', reading: 'か', headword: 'か' }, ], { getFrequencyDictionaryEnabled: () => true, getFrequencyRank: (text) => text === 'これ' ? 9 : text === '実力どおり' ? 2500 : text === 'か' ? 800 : null, getJlptLevel: (text) => text === 'これ' ? 'N5' : text === '実力どおり' ? 'N1' : text === 'か' ? 'N5' : null, isKnownWord: (text) => text === 'これ', getMinSentenceWordsForNPlusOne: () => 1, tokenizeWithMecab: async () => [ { headword: 'これ', surface: 'これ', reading: 'コレ', startPos: 0, endPos: 2, partOfSpeech: PartOfSpeech.noun, pos1: '名詞', pos2: '代名詞', isMerged: false, isKnown: false, isNPlusOneTarget: false, }, { headword: 'で', surface: 'で', reading: 'デ', startPos: 2, endPos: 3, partOfSpeech: PartOfSpeech.particle, pos1: '助詞', pos2: '格助詞', isMerged: false, isKnown: false, isNPlusOneTarget: false, }, { headword: '実力どおり', surface: '実力どおり', reading: 'ジツリョクドオリ', startPos: 3, endPos: 8, partOfSpeech: PartOfSpeech.noun, pos1: '名詞', pos2: '一般', isMerged: false, isKnown: false, isNPlusOneTarget: false, }, { headword: 'か', surface: 'か', reading: 'カ', startPos: 8, endPos: 9, partOfSpeech: PartOfSpeech.particle, pos1: '助詞', pos2: '終助詞', isMerged: false, isKnown: false, isNPlusOneTarget: false, }, ], }, ), ); assert.deepEqual( result.tokens?.map((token) => ({ surface: token.surface, headword: token.headword, isKnown: token.isKnown, isNPlusOneTarget: token.isNPlusOneTarget, frequencyRank: token.frequencyRank, jlptLevel: token.jlptLevel, })), [ { surface: 'これで', headword: 'これ', isKnown: false, isNPlusOneTarget: false, frequencyRank: undefined, jlptLevel: undefined, }, { surface: '実力どおり', headword: '実力どおり', isKnown: false, isNPlusOneTarget: true, frequencyRank: 2500, jlptLevel: 'N1', }, { surface: 'か', headword: 'か', isKnown: false, isNPlusOneTarget: false, frequencyRank: undefined, jlptLevel: undefined, }, ], ); }); test('tokenizeSubtitle clears all annotations for explanatory pondering endings', async () => { const result = await tokenizeSubtitle( '俺どうかしちゃったのかな', makeDepsFromYomitanTokens( [ { surface: '俺', reading: 'おれ', headword: '俺' }, { surface: 'どうかしちゃった', reading: 'どうかしちゃった', headword: 'どうかしちゃう' }, { surface: 'のかな', reading: 'のかな', headword: 'の' }, ], { getFrequencyDictionaryEnabled: () => true, getFrequencyRank: (text) => (text === '俺' ? 19 : text === 'どうかしちゃう' ? 3200 : 77), getJlptLevel: (text) => text === '俺' ? 'N5' : text === 'どうかしちゃう' ? 'N3' : text === 'の' ? 'N5' : null, isKnownWord: (text) => text === '俺' || text === 'の', getMinSentenceWordsForNPlusOne: () => 1, tokenizeWithMecab: async () => [ { headword: '俺', surface: '俺', reading: 'オレ', startPos: 0, endPos: 1, partOfSpeech: PartOfSpeech.noun, pos1: '名詞', pos2: '代名詞', isMerged: false, isKnown: false, isNPlusOneTarget: false, }, { headword: 'どうかしちゃう', surface: 'どうかしちゃった', reading: 'ドウカシチャッタ', startPos: 1, endPos: 8, partOfSpeech: PartOfSpeech.verb, pos1: '動詞', pos2: '自立', isMerged: false, isKnown: false, isNPlusOneTarget: false, }, { headword: 'の', surface: 'のかな', reading: 'ノカナ', startPos: 8, endPos: 11, partOfSpeech: PartOfSpeech.other, pos1: '名詞|助動詞', pos2: '非自立', isMerged: false, isKnown: false, isNPlusOneTarget: false, }, ], }, ), ); assert.deepEqual( result.tokens?.map((token) => ({ surface: token.surface, headword: token.headword, isKnown: token.isKnown, isNPlusOneTarget: token.isNPlusOneTarget, frequencyRank: token.frequencyRank, jlptLevel: token.jlptLevel, })), [ { surface: '俺', headword: '俺', isKnown: true, isNPlusOneTarget: false, frequencyRank: 19, jlptLevel: 'N5' }, { surface: 'どうかしちゃった', headword: 'どうかしちゃう', isKnown: false, isNPlusOneTarget: true, frequencyRank: 3200, jlptLevel: 'N3', }, { surface: 'のかな', headword: 'の', isKnown: false, isNPlusOneTarget: false, frequencyRank: undefined, jlptLevel: undefined, }, ], ); }); test('tokenizeSubtitle keeps frequency for content-led merged token with trailing colloquial suffixes', async () => { const result = await tokenizeSubtitle( '張り切ってんじゃ', makeDepsFromYomitanTokens( [{ surface: '張り切ってん', reading: 'はき', headword: '張り切る' }], { getFrequencyDictionaryEnabled: () => true, getFrequencyRank: (text) => (text === '張り切る' ? 5468 : null), tokenizeWithMecab: async () => [ { headword: '張り切る', surface: '張り切っ', reading: 'ハリキッ', startPos: 0, endPos: 4, partOfSpeech: PartOfSpeech.verb, pos1: '動詞', pos2: '自立', isMerged: false, isKnown: false, isNPlusOneTarget: false, }, { headword: 'て', surface: 'て', reading: 'テ', startPos: 4, endPos: 5, partOfSpeech: PartOfSpeech.particle, pos1: '助詞', pos2: '接続助詞', isMerged: false, isKnown: false, isNPlusOneTarget: false, }, { headword: 'んじゃ', surface: 'んじゃ', reading: 'ンジャ', startPos: 5, endPos: 8, partOfSpeech: PartOfSpeech.other, pos1: '接続詞', pos2: '*', isMerged: false, isKnown: false, isNPlusOneTarget: false, }, ], getMinSentenceWordsForNPlusOne: () => 1, }, ), ); assert.equal(result.tokens?.length, 1); assert.equal(result.tokens?.[0]?.surface, '張り切ってん'); assert.equal(result.tokens?.[0]?.pos1, '動詞|助詞|接続詞'); assert.equal(result.tokens?.[0]?.frequencyRank, 5468); }); test('tokenizeSubtitle excludes default non-independent pos2 from N+1 when JLPT/frequency are disabled', async () => { let mecabCalls = 0; const result = await tokenizeSubtitle( 'になれば', makeDepsFromYomitanTokens([{ surface: 'になれば', reading: 'になれば', headword: 'なる' }], { getJlptEnabled: () => false, getFrequencyDictionaryEnabled: () => false, getMinSentenceWordsForNPlusOne: () => 1, tokenizeWithMecab: async () => { mecabCalls += 1; return [ { headword: 'なる', surface: 'になれば', reading: 'ニナレバ', startPos: 0, endPos: 4, partOfSpeech: PartOfSpeech.verb, pos1: '動詞', pos2: '非自立', isMerged: true, isKnown: false, isNPlusOneTarget: false, }, ]; }, }), ); assert.equal(mecabCalls, 1); assert.equal(result.tokens?.length, 1); assert.equal(result.tokens?.[0]?.isNPlusOneTarget, false); });