import test from 'node:test';
import assert from 'node:assert/strict';
import { PartOfSpeech } from '../../types';
import { createTokenizerDepsRuntime, TokenizerServiceDeps, tokenizeSubtitle } from './tokenizer';

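// Builds a TokenizerServiceDeps bundle whose members are inert defaults
// (no Yomitan extension, no parser window, MeCab resolves to null, nothing
// is known); each test overrides only the dependencies it exercises.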
function makeDeps(overrides: Partial<TokenizerServiceDeps> = {}): TokenizerServiceDeps {
  return {
    getYomitanExt: () => null,
    getYomitanParserWindow: () => null,
    setYomitanParserWindow: () => {},
    getYomitanParserReadyPromise: () => null,
    setYomitanParserReadyPromise: () => {},
    getYomitanParserInitPromise: () => null,
    setYomitanParserInitPromise: () => {},
    isKnownWord: () => false,
    getKnownWordMatchMode: () => 'headword',
    getJlptLevel: () => null,
    tokenizeWithMecab: async () => null,
    ...overrides,
  };
}

interface YomitanTokenInput {
  surface: string;
  reading?: string;
  headword?: string;
}

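// Stubs a Yomitan parser window whose executeJavaScript call resolves to a
// scanning-parser result with one single-segment group per input token.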
function makeDepsFromYomitanTokens(
  tokens: YomitanTokenInput[],
  overrides: Partial<TokenizerServiceDeps> = {},
): TokenizerServiceDeps {
  return makeDeps({
    getYomitanExt: () => ({ id: 'dummy-ext' }) as any,
    getYomitanParserWindow: () =>
      ({
        isDestroyed: () => false,
        webContents: {
          executeJavaScript: async () => [
            {
              source: 'scanning-parser',
              index: 0,
              content: tokens.map((token) => [
                {
                  text: token.surface,
                  reading: token.reading ?? token.surface,
                  headwords: [[{ term: token.headword ?? token.surface }]],
                },
              ]),
            },
          ],
        },
      }) as unknown as Electron.BrowserWindow,
    ...overrides,
  });
}

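// The hand-rolled mocks below mirror the shape of Yomitan's parser output:
// an array of { source, index, content } results, where content is a list of
// token groups and each group is a list of furigana segments carrying a
// surface text, a reading, and candidate headwords.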
test('tokenizeSubtitle assigns JLPT level to parsed Yomitan tokens', async () => {
  const result = await tokenizeSubtitle(
    '猫です',
    makeDeps({
      getYomitanExt: () => ({ id: 'dummy-ext' }) as any,
      getYomitanParserWindow: () =>
        ({
          isDestroyed: () => false,
          webContents: {
            executeJavaScript: async () => [
              {
                source: 'scanning-parser',
                index: 0,
                content: [
                  [
                    {
                      text: '猫',
                      reading: 'ねこ',
                      headwords: [[{ term: '猫' }]],
                    },
                    {
                      text: 'です',
                      reading: 'です',
                      headwords: [[{ term: 'です' }]],
                    },
                  ],
                ],
              },
            ],
          },
        }) as unknown as Electron.BrowserWindow,
      tokenizeWithMecab: async () => null,
      getJlptLevel: (text) => (text === '猫' ? 'N5' : null),
    }),
  );

  assert.equal(result.tokens?.length, 1);
  assert.equal(result.tokens?.[0]?.jlptLevel, 'N5');
});

test('tokenizeSubtitle caches JLPT lookups across repeated tokens', async () => {
  let lookupCalls = 0;
  const result = await tokenizeSubtitle(
    '猫猫',
    makeDepsFromYomitanTokens(
      [
        { surface: '猫', reading: 'ねこ', headword: '猫' },
        { surface: '猫', reading: 'ねこ', headword: '猫' },
      ],
      {
        getJlptLevel: (text) => {
          lookupCalls += 1;
          return text === '猫' ? 'N5' : null;
        },
      },
    ),
  );

  assert.equal(result.tokens?.length, 2);
  assert.equal(lookupCalls, 1);
  assert.equal(result.tokens?.[0]?.jlptLevel, 'N5');
  assert.equal(result.tokens?.[1]?.jlptLevel, 'N5');
});

test('tokenizeSubtitle leaves JLPT unset for non-matching tokens', async () => {
  const result = await tokenizeSubtitle(
    '猫',
    makeDepsFromYomitanTokens([{ surface: '猫', reading: 'ねこ', headword: '猫' }], {
      getJlptLevel: () => null,
    }),
  );

  assert.equal(result.tokens?.length, 1);
  assert.equal(result.tokens?.[0]?.jlptLevel, undefined);
});

test('tokenizeSubtitle skips JLPT lookups when disabled', async () => {
  let lookupCalls = 0;
  const result = await tokenizeSubtitle(
    '猫です',
    makeDepsFromYomitanTokens([{ surface: '猫', reading: 'ねこ', headword: '猫' }], {
      getJlptLevel: () => {
        lookupCalls += 1;
        return 'N5';
      },
      getJlptEnabled: () => false,
    }),
  );

  assert.equal(result.tokens?.length, 1);
  assert.equal(result.tokens?.[0]?.jlptLevel, undefined);
  assert.equal(lookupCalls, 0);
});

test('tokenizeSubtitle applies frequency dictionary ranks', async () => {
  const result = await tokenizeSubtitle(
    '猫です',
    makeDepsFromYomitanTokens(
      [
        { surface: '猫', reading: 'ねこ', headword: '猫' },
        { surface: 'です', reading: 'です', headword: 'です' },
      ],
      {
        getFrequencyDictionaryEnabled: () => true,
        getFrequencyRank: (text) => (text === '猫' ? 23 : 1200),
      },
    ),
  );

  assert.equal(result.tokens?.length, 2);
  assert.equal(result.tokens?.[0]?.frequencyRank, 23);
  assert.equal(result.tokens?.[1]?.frequencyRank, 1200);
});

test('tokenizeSubtitle loads frequency ranks from Yomitan installed dictionaries', async () => {
  const result = await tokenizeSubtitle(
    '猫',
    makeDeps({
      getFrequencyDictionaryEnabled: () => true,
      getYomitanExt: () => ({ id: 'dummy-ext' }) as any,
      getYomitanParserWindow: () =>
        ({
          isDestroyed: () => false,
          webContents: {
            executeJavaScript: async (script: string) => {
              if (script.includes('getTermFrequencies')) {
                return [
                  {
                    term: '猫',
                    reading: 'ねこ',
                    dictionary: 'freq-dict',
                    frequency: 77,
                    displayValue: '77',
                    displayValueParsed: true,
                  },
                ];
              }

              return [
                {
                  source: 'scanning-parser',
                  index: 0,
                  content: [
                    [
                      {
                        text: '猫',
                        reading: 'ねこ',
                        headwords: [[{ term: '猫' }]],
                      },
                    ],
                  ],
                },
              ];
            },
          },
        }) as unknown as Electron.BrowserWindow,
    }),
  );

  assert.equal(result.tokens?.length, 1);
  assert.equal(result.tokens?.[0]?.frequencyRank, 77);
});

test('tokenizeSubtitle uses only selected Yomitan headword for frequency lookup', async () => {
  const result = await tokenizeSubtitle(
    '猫です',
    makeDeps({
      getFrequencyDictionaryEnabled: () => true,
      getYomitanExt: () => ({ id: 'dummy-ext' }) as any,
      getYomitanParserWindow: () =>
        ({
          isDestroyed: () => false,
          webContents: {
            executeJavaScript: async () => [
              {
                source: 'scanning-parser',
                index: 0,
                content: [
                  [
                    {
                      text: '猫です',
                      reading: 'ねこです',
                      headwords: [[{ term: '猫です' }], [{ term: '猫' }]],
                    },
                  ],
                ],
              },
            ],
          },
        }) as unknown as Electron.BrowserWindow,
      getFrequencyRank: (text) => (text === '猫' ? 40 : text === '猫です' ? 1200 : null),
    }),
  );

  assert.equal(result.tokens?.length, 1);
  assert.equal(result.tokens?.[0]?.frequencyRank, 1200);
});

test('tokenizeSubtitle keeps furigana-split Yomitan segments as one token', async () => {
  const result = await tokenizeSubtitle(
    '友達と話した',
    makeDeps({
      getFrequencyDictionaryEnabled: () => true,
      getYomitanExt: () => ({ id: 'dummy-ext' }) as any,
      getYomitanParserWindow: () =>
        ({
          isDestroyed: () => false,
          webContents: {
            executeJavaScript: async () => [
              {
                source: 'scanning-parser',
                index: 0,
                content: [
                  [
                    {
                      text: '友',
                      reading: 'とも',
                      headwords: [[{ term: '友達' }]],
                    },
                    {
                      text: '達',
                      reading: 'だち',
                    },
                  ],
                  [
                    {
                      text: 'と',
                      reading: 'と',
                      headwords: [[{ term: 'と' }]],
                    },
                  ],
                  [
                    {
                      text: '話した',
                      reading: 'はなした',
                      headwords: [[{ term: '話す' }]],
                    },
                  ],
                ],
              },
            ],
          },
        }) as unknown as Electron.BrowserWindow,
      getFrequencyRank: (text) => (text === '友達' ? 22 : text === '話す' ? 90 : null),
    }),
  );

  assert.equal(result.tokens?.length, 3);
  assert.equal(result.tokens?.[0]?.surface, '友達');
  assert.equal(result.tokens?.[0]?.reading, 'ともだち');
  assert.equal(result.tokens?.[0]?.headword, '友達');
  assert.equal(result.tokens?.[0]?.frequencyRank, 22);
  assert.equal(result.tokens?.[1]?.surface, 'と');
  assert.equal(result.tokens?.[1]?.frequencyRank, undefined);
  assert.equal(result.tokens?.[2]?.surface, '話した');
  assert.equal(result.tokens?.[2]?.frequencyRank, 90);
});

test('tokenizeSubtitle prefers exact headword frequency over surface/reading when available', async () => {
  const result = await tokenizeSubtitle(
    '猫です',
    makeDeps({
      getFrequencyDictionaryEnabled: () => true,
      getYomitanExt: () => ({ id: 'dummy-ext' }) as any,
      getYomitanParserWindow: () =>
        ({
          isDestroyed: () => false,
          webContents: {
            executeJavaScript: async () => [
              {
                source: 'scanning-parser',
                index: 0,
                content: [
                  [
                    {
                      text: '猫',
                      reading: 'ねこ',
                      headwords: [[{ term: 'ネコ' }]],
                    },
                  ],
                ],
              },
            ],
          },
        }) as unknown as Electron.BrowserWindow,
      getFrequencyRank: (text) => (text === '猫' ? 1200 : text === 'ネコ' ? 8 : null),
    }),
  );

  assert.equal(result.tokens?.length, 1);
  assert.equal(result.tokens?.[0]?.frequencyRank, 8);
});

test('tokenizeSubtitle keeps no frequency when only reading matches and headword misses', async () => {
  const result = await tokenizeSubtitle(
    '猫です',
    makeDeps({
      getFrequencyDictionaryEnabled: () => true,
      getYomitanExt: () => ({ id: 'dummy-ext' }) as any,
      getYomitanParserWindow: () =>
        ({
          isDestroyed: () => false,
          webContents: {
            executeJavaScript: async () => [
              {
                source: 'scanning-parser',
                index: 0,
                content: [
                  [
                    {
                      text: '猫',
                      reading: 'ねこ',
                      headwords: [[{ term: '猫です' }]],
                    },
                  ],
                ],
              },
            ],
          },
        }) as unknown as Electron.BrowserWindow,
      getFrequencyRank: (text) => (text === 'ねこ' ? 77 : null),
    }),
  );

  assert.equal(result.tokens?.length, 1);
  assert.equal(result.tokens?.[0]?.frequencyRank, undefined);
});

test('tokenizeSubtitle ignores invalid frequency rank on selected headword', async () => {
  const result = await tokenizeSubtitle(
    '猫です',
    makeDeps({
      getFrequencyDictionaryEnabled: () => true,
      getYomitanExt: () => ({ id: 'dummy-ext' }) as any,
      getYomitanParserWindow: () =>
        ({
          isDestroyed: () => false,
          webContents: {
            executeJavaScript: async () => [
              {
                source: 'scanning-parser',
                index: 0,
                content: [
                  [
                    {
                      text: '猫です',
                      reading: 'ねこです',
                      headwords: [[{ term: '猫' }], [{ term: '猫です' }]],
                    },
                  ],
                ],
              },
            ],
          },
        }) as unknown as Electron.BrowserWindow,
      getFrequencyRank: (text) => (text === '猫' ? Number.NaN : text === '猫です' ? 500 : null),
    }),
  );

  assert.equal(result.tokens?.length, 1);
  assert.equal(result.tokens?.[0]?.frequencyRank, undefined);
});

test('tokenizeSubtitle handles real-word frequency candidates and prefers most frequent term', async () => {
  const result = await tokenizeSubtitle(
    '昨日',
    makeDeps({
      getFrequencyDictionaryEnabled: () => true,
      getYomitanExt: () => ({ id: 'dummy-ext' }) as any,
      getYomitanParserWindow: () =>
        ({
          isDestroyed: () => false,
          webContents: {
            executeJavaScript: async () => [
              {
                source: 'scanning-parser',
                index: 0,
                content: [
                  [
                    {
                      text: '昨日',
                      reading: 'きのう',
                      headwords: [[{ term: '昨日' }], [{ term: 'きのう' }]],
                    },
                  ],
                ],
              },
            ],
          },
        }) as unknown as Electron.BrowserWindow,
      getFrequencyRank: (text) => (text === 'きのう' ? 120 : text === '昨日' ? 40 : null),
    }),
  );

  assert.equal(result.tokens?.length, 1);
  assert.equal(result.tokens?.[0]?.frequencyRank, 40);
});

test('tokenizeSubtitle ignores candidates with no dictionary rank when higher-frequency candidate exists', async () => {
  const result = await tokenizeSubtitle(
    '猫です',
    makeDeps({
      getFrequencyDictionaryEnabled: () => true,
      getYomitanExt: () => ({ id: 'dummy-ext' }) as any,
      getYomitanParserWindow: () =>
        ({
          isDestroyed: () => false,
          webContents: {
            executeJavaScript: async () => [
              {
                source: 'scanning-parser',
                index: 0,
                content: [
                  [
                    {
                      text: '猫',
                      reading: 'ねこ',
                      headwords: [
                        [{ term: '猫' }],
                        [{ term: '猫です' }],
                        [{ term: 'unknown-term' }],
                      ],
                    },
                  ],
                ],
              },
            ],
          },
        }) as unknown as Electron.BrowserWindow,
      getFrequencyRank: (text) =>
        text === 'unknown-term' ? -1 : text === '猫' ? 88 : text === '猫です' ? 9000 : null,
    }),
  );

  assert.equal(result.tokens?.length, 1);
  assert.equal(result.tokens?.[0]?.frequencyRank, 88);
});

test('tokenizeSubtitle ignores frequency lookup failures', async () => {
  const result = await tokenizeSubtitle(
    '猫',
    makeDeps({
      getFrequencyDictionaryEnabled: () => true,
      tokenizeWithMecab: async () => [
        {
          headword: '猫',
          surface: '猫',
          reading: 'ネコ',
          startPos: 0,
          endPos: 1,
          partOfSpeech: PartOfSpeech.noun,
          isMerged: false,
          isKnown: false,
          isNPlusOneTarget: false,
        },
      ],
      getFrequencyRank: () => {
        throw new Error('frequency lookup unavailable');
      },
    }),
  );

  assert.equal(result.tokens?.[0]?.frequencyRank, undefined);
});

test('tokenizeSubtitle skips frequency rank when Yomitan token is enriched as particle by mecab pos1', async () => {
  const result = await tokenizeSubtitle(
    'は',
    makeDeps({
      getFrequencyDictionaryEnabled: () => true,
      getYomitanExt: () => ({ id: 'dummy-ext' }) as any,
      getYomitanParserWindow: () =>
        ({
          isDestroyed: () => false,
          webContents: {
            executeJavaScript: async () => [
              {
                source: 'scanning-parser',
                index: 0,
                content: [
                  [
                    {
                      text: 'は',
                      reading: 'は',
                      headwords: [[{ term: 'は' }]],
                    },
                  ],
                ],
              },
            ],
          },
        }) as unknown as Electron.BrowserWindow,
      tokenizeWithMecab: async () => [
        {
          headword: 'は',
          surface: 'は',
          reading: 'ハ',
          startPos: 0,
          endPos: 1,
          partOfSpeech: PartOfSpeech.particle,
          pos1: '助詞',
          isMerged: false,
          isKnown: false,
          isNPlusOneTarget: false,
        },
      ],
      getFrequencyRank: (text) => (text === 'は' ? 10 : null),
    }),
  );

  assert.equal(result.tokens?.length, 1);
  assert.equal(result.tokens?.[0]?.pos1, '助詞');
  assert.equal(result.tokens?.[0]?.frequencyRank, undefined);
});

test('tokenizeSubtitle ignores invalid frequency ranks', async () => {
  const result = await tokenizeSubtitle(
    '猫',
    makeDepsFromYomitanTokens(
      [
        { surface: '猫', reading: 'ねこ', headword: '猫' },
        { surface: 'です', reading: 'です', headword: 'です' },
      ],
      {
        getFrequencyDictionaryEnabled: () => true,
        getFrequencyRank: (text) => {
          if (text === '猫') return Number.NaN;
          if (text === 'です') return -1;
          return 100;
        },
      },
    ),
  );

  assert.equal(result.tokens?.length, 2);
  assert.equal(result.tokens?.[0]?.frequencyRank, undefined);
  assert.equal(result.tokens?.[1]?.frequencyRank, undefined);
});

test('tokenizeSubtitle skips frequency lookups when disabled', async () => {
  let frequencyCalls = 0;
  const result = await tokenizeSubtitle(
    '猫',
    makeDepsFromYomitanTokens([{ surface: '猫', reading: 'ねこ', headword: '猫' }], {
      getFrequencyDictionaryEnabled: () => false,
      getFrequencyRank: () => {
        frequencyCalls += 1;
        return 10;
      },
    }),
  );

  assert.equal(result.tokens?.length, 1);
  assert.equal(result.tokens?.[0]?.frequencyRank, undefined);
  assert.equal(frequencyCalls, 0);
});

test('tokenizeSubtitle skips JLPT level for excluded demonstratives', async () => {
  const result = await tokenizeSubtitle(
    'この',
    makeDeps({
      getYomitanExt: () => ({ id: 'dummy-ext' }) as any,
      getYomitanParserWindow: () =>
        ({
          isDestroyed: () => false,
          webContents: {
            executeJavaScript: async () => [
              {
                source: 'scanning-parser',
                index: 0,
                content: [
                  [
                    {
                      text: 'この',
                      reading: 'この',
                      headwords: [[{ term: 'この' }]],
                    },
                  ],
                ],
              },
            ],
          },
        }) as unknown as Electron.BrowserWindow,
      tokenizeWithMecab: async () => null,
      getJlptLevel: (text) => (text === 'この' ? 'N5' : null),
    }),
  );

  assert.equal(result.tokens?.length, 1);
  assert.equal(result.tokens?.[0]?.jlptLevel, undefined);
});

test('tokenizeSubtitle skips JLPT level for repeated kana SFX', async () => {
  const result = await tokenizeSubtitle(
    'ああ',
    makeDeps({
      getYomitanExt: () => ({ id: 'dummy-ext' }) as any,
      getYomitanParserWindow: () =>
        ({
          isDestroyed: () => false,
          webContents: {
            executeJavaScript: async () => [
              {
                source: 'scanning-parser',
                index: 0,
                content: [
                  [
                    {
                      text: 'ああ',
                      reading: 'ああ',
                      headwords: [[{ term: 'ああ' }]],
                    },
                  ],
                ],
              },
            ],
          },
        }) as unknown as Electron.BrowserWindow,
      tokenizeWithMecab: async () => null,
      getJlptLevel: (text) => (text === 'ああ' ? 'N5' : null),
    }),
  );

  assert.equal(result.tokens?.length, 1);
  assert.equal(result.tokens?.[0]?.jlptLevel, undefined);
});

test('tokenizeSubtitle assigns JLPT level to Yomitan tokens', async () => {
  const result = await tokenizeSubtitle(
    '猫です',
    makeDepsFromYomitanTokens([{ surface: '猫', reading: 'ねこ', headword: '猫' }], {
      getJlptLevel: (text) => (text === '猫' ? 'N4' : null),
    }),
  );

  assert.equal(result.tokens?.length, 1);
  assert.equal(result.tokens?.[0]?.jlptLevel, 'N4');
});

test('tokenizeSubtitle can assign JLPT level to Yomitan particle token', async () => {
  const result = await tokenizeSubtitle(
    'は',
    makeDepsFromYomitanTokens([{ surface: 'は', reading: 'は', headword: 'は' }], {
      getJlptLevel: (text) => (text === 'は' ? 'N5' : null),
    }),
  );

  assert.equal(result.tokens?.length, 1);
  assert.equal(result.tokens?.[0]?.jlptLevel, 'N5');
});

test('tokenizeSubtitle returns null tokens for empty normalized text', async () => {
  const result = await tokenizeSubtitle(' \\n ', makeDeps());
  assert.deepEqual(result, { text: ' \\n ', tokens: null });
});

test('tokenizeSubtitle normalizes newlines before Yomitan parse request', async () => {
  let parseInput = '';
  const result = await tokenizeSubtitle(
    '猫\\Nです\nね',
    makeDeps({
      getYomitanExt: () => ({ id: 'dummy-ext' }) as any,
      getYomitanParserWindow: () =>
        ({
          isDestroyed: () => false,
          webContents: {
            executeJavaScript: async (script: string) => {
              parseInput = script;
              return null;
            },
          },
        }) as unknown as Electron.BrowserWindow,
    }),
  );

  assert.match(parseInput, /猫 です ね/);
  assert.equal(result.text, '猫\nです\nね');
  assert.equal(result.tokens, null);
});

test('tokenizeSubtitle returns null tokens when Yomitan parsing is unavailable', async () => {
  const result = await tokenizeSubtitle('猫です', makeDeps());

  assert.deepEqual(result, { text: '猫です', tokens: null });
});

test('tokenizeSubtitle returns null tokens when mecab throws', async () => {
  const result = await tokenizeSubtitle(
    '猫です',
    makeDeps({
      tokenizeWithMecab: async () => {
        throw new Error('mecab failed');
      },
    }),
  );

  assert.deepEqual(result, { text: '猫です', tokens: null });
});

test('tokenizeSubtitle uses Yomitan parser result when available', async () => {
  const parserWindow = {
    isDestroyed: () => false,
    webContents: {
      executeJavaScript: async () => [
        {
          source: 'scanning-parser',
          index: 0,
          content: [
            [
              {
                text: '猫',
                reading: 'ねこ',
                headwords: [[{ term: '猫' }]],
              },
            ],
            [
              {
                text: 'です',
                reading: 'です',
              },
            ],
          ],
        },
      ],
    },
  } as unknown as Electron.BrowserWindow;

  const result = await tokenizeSubtitle(
    '猫です',
    makeDeps({
      getYomitanExt: () => ({ id: 'dummy-ext' }) as any,
      getYomitanParserWindow: () => parserWindow,
      tokenizeWithMecab: async () => null,
    }),
  );

  assert.equal(result.text, '猫です');
  assert.equal(result.tokens?.length, 2);
  assert.equal(result.tokens?.[0]?.surface, '猫');
  assert.equal(result.tokens?.[0]?.reading, 'ねこ');
  assert.equal(result.tokens?.[0]?.isKnown, false);
  assert.equal(result.tokens?.[1]?.surface, 'です');
  assert.equal(result.tokens?.[1]?.reading, 'です');
  assert.equal(result.tokens?.[1]?.isKnown, false);
});

test('tokenizeSubtitle logs selected Yomitan groups when debug toggle is enabled', async () => {
  const infoLogs: string[] = [];
  const originalInfo = console.info;
  console.info = (...args: unknown[]) => {
    infoLogs.push(args.map((value) => String(value)).join(' '));
  };

  try {
    await tokenizeSubtitle(
      '友達と話した',
      makeDeps({
        getYomitanExt: () => ({ id: 'dummy-ext' }) as any,
        getYomitanParserWindow: () =>
          ({
            isDestroyed: () => false,
            webContents: {
              executeJavaScript: async () => [
                {
                  source: 'scanning-parser',
                  index: 0,
                  content: [
                    [
                      {
                        text: '友',
                        reading: 'とも',
                        headwords: [[{ term: '友達' }]],
                      },
                      {
                        text: '達',
                        reading: 'だち',
                      },
                    ],
                    [
                      {
                        text: 'と',
                        reading: 'と',
                        headwords: [[{ term: 'と' }]],
                      },
                    ],
                  ],
                },
              ],
            },
          }) as unknown as Electron.BrowserWindow,
        tokenizeWithMecab: async () => null,
        getYomitanGroupDebugEnabled: () => true,
      }),
    );
  } finally {
    console.info = originalInfo;
  }

  assert.ok(infoLogs.some((line) => line.includes('Selected Yomitan token groups')));
});

test('tokenizeSubtitle does not log Yomitan groups when debug toggle is disabled', async () => {
  const infoLogs: string[] = [];
  const originalInfo = console.info;
  console.info = (...args: unknown[]) => {
    infoLogs.push(args.map((value) => String(value)).join(' '));
  };

  try {
    await tokenizeSubtitle(
      '友達と話した',
      makeDeps({
        getYomitanExt: () => ({ id: 'dummy-ext' }) as any,
        getYomitanParserWindow: () =>
          ({
            isDestroyed: () => false,
            webContents: {
              executeJavaScript: async () => [
                {
                  source: 'scanning-parser',
                  index: 0,
                  content: [
                    [
                      {
                        text: '友',
                        reading: 'とも',
                        headwords: [[{ term: '友達' }]],
                      },
                      {
                        text: '達',
                        reading: 'だち',
                      },
                    ],
                  ],
                },
              ],
            },
          }) as unknown as Electron.BrowserWindow,
        tokenizeWithMecab: async () => null,
        getYomitanGroupDebugEnabled: () => false,
      }),
    );
  } finally {
    console.info = originalInfo;
  }

  assert.equal(
    infoLogs.some((line) => line.includes('Selected Yomitan token groups')),
    false,
  );
});

test('tokenizeSubtitle preserves segmented Yomitan line as one token', async () => {
  const parserWindow = {
    isDestroyed: () => false,
    webContents: {
      executeJavaScript: async () => [
        {
          source: 'scanning-parser',
          index: 0,
          content: [
            [
              {
                text: '猫',
                reading: 'ねこ',
                headwords: [[{ term: '猫です' }]],
              },
              {
                text: 'です',
                reading: 'です',
              },
            ],
          ],
        },
      ],
    },
  } as unknown as Electron.BrowserWindow;

  const result = await tokenizeSubtitle(
    '猫です',
    makeDeps({
      getYomitanExt: () => ({ id: 'dummy-ext' }) as any,
      getYomitanParserWindow: () => parserWindow,
      tokenizeWithMecab: async () => null,
    }),
  );

  assert.equal(result.text, '猫です');
  assert.equal(result.tokens?.length, 1);
  assert.equal(result.tokens?.[0]?.surface, '猫です');
  assert.equal(result.tokens?.[0]?.reading, 'ねこです');
  assert.equal(result.tokens?.[0]?.headword, '猫です');
  assert.equal(result.tokens?.[0]?.isKnown, false);
});

test('tokenizeSubtitle keeps scanning parser token when scanning parser returns one token', async () => {
  const result = await tokenizeSubtitle(
    '俺は小園にいきたい',
    makeDeps({
      getYomitanExt: () => ({ id: 'dummy-ext' }) as any,
      getYomitanParserWindow: () =>
        ({
          isDestroyed: () => false,
          webContents: {
            executeJavaScript: async () => [
              {
                source: 'scanning-parser',
                index: 0,
                content: [
                  [
                    {
                      text: '俺は小園にいきたい',
                      reading: 'おれは小園にいきたい',
                      headwords: [[{ term: '俺は小園にいきたい' }]],
                    },
                  ],
                ],
              },
              {
                source: 'mecab',
                index: 0,
                content: [
                  [
                    {
                      text: '俺',
                      reading: 'おれ',
                      headwords: [[{ term: '俺' }]],
                    },
                  ],
                  [
                    {
                      text: 'は',
                      reading: 'は',
                      headwords: [[{ term: 'は' }]],
                    },
                  ],
                  [
                    {
                      text: '小園',
                      reading: 'おうえん',
                      headwords: [[{ term: '小園' }]],
                    },
                  ],
                  [
                    {
                      text: 'に',
                      reading: 'に',
                      headwords: [[{ term: 'に' }]],
                    },
                  ],
                  [
                    {
                      text: 'いきたい',
                      reading: 'いきたい',
                      headwords: [[{ term: 'いきたい' }]],
                    },
                  ],
                ],
              },
            ],
          },
        }) as unknown as Electron.BrowserWindow,
      getFrequencyDictionaryEnabled: () => true,
      tokenizeWithMecab: async () => null,
      getFrequencyRank: (text) => (text === '小園' ? 25 : text === 'いきたい' ? 1500 : null),
    }),
  );

  assert.equal(result.tokens?.length, 1);
  assert.equal(result.tokens?.map((token) => token.surface).join(','), '俺は小園にいきたい');
  assert.equal(result.tokens?.[0]?.frequencyRank, undefined);
});

test('tokenizeSubtitle keeps scanning parser tokens when they are already split', async () => {
  const result = await tokenizeSubtitle(
    '小園に行きたい',
    makeDeps({
      getYomitanExt: () => ({ id: 'dummy-ext' }) as any,
      getYomitanParserWindow: () =>
        ({
          isDestroyed: () => false,
          webContents: {
            executeJavaScript: async () => [
              {
                source: 'scanning-parser',
                index: 0,
                content: [
                  [
                    {
                      text: '小園',
                      reading: 'おうえん',
                      headwords: [[{ term: '小園' }]],
                    },
                  ],
                  [
                    {
                      text: 'に',
                      reading: 'に',
                      headwords: [[{ term: 'に' }]],
                    },
                  ],
                  [
                    {
                      text: '行きたい',
                      reading: 'いきたい',
                      headwords: [[{ term: '行きたい' }]],
                    },
                  ],
                ],
              },
              {
                source: 'mecab',
                index: 0,
                content: [
                  [
                    {
                      text: '小',
                      reading: 'お',
                      headwords: [[{ term: '小' }]],
                    },
                  ],
                  [
                    {
                      text: '園',
                      reading: 'えん',
                      headwords: [[{ term: '園' }]],
                    },
                  ],
                  [
                    {
                      text: 'に',
                      reading: 'に',
                      headwords: [[{ term: 'に' }]],
                    },
                  ],
                  [
                    {
                      text: '行き',
                      reading: 'いき',
                      headwords: [[{ term: '行き' }]],
                    },
                  ],
                  [
                    {
                      text: 'たい',
                      reading: 'たい',
                      headwords: [[{ term: 'たい' }]],
                    },
                  ],
                ],
              },
            ],
          },
        }) as unknown as Electron.BrowserWindow,
      getFrequencyDictionaryEnabled: () => true,
      getFrequencyRank: (text) => (text === '小園' ? 20 : null),
      tokenizeWithMecab: async () => null,
    }),
  );

  assert.equal(result.tokens?.length, 3);
  assert.equal(result.tokens?.map((token) => token.surface).join(','), '小園,に,行きたい');
  assert.equal(result.tokens?.[0]?.frequencyRank, 20);
  assert.equal(result.tokens?.[1]?.frequencyRank, undefined);
  assert.equal(result.tokens?.[2]?.frequencyRank, undefined);
});

test('tokenizeSubtitle keeps parsing explicit by scanning-parser source only', async () => {
  const result = await tokenizeSubtitle(
    '俺は公園にいきたい',
    makeDeps({
      getYomitanExt: () => ({ id: 'dummy-ext' }) as any,
      getYomitanParserWindow: () =>
        ({
          isDestroyed: () => false,
          webContents: {
            executeJavaScript: async () => [
              {
                source: 'scanning-parser',
                index: 0,
                content: [
                  [
                    {
                      text: '俺',
                      reading: 'おれ',
                      headwords: [[{ term: '俺' }]],
                    },
                  ],
                  [{ text: 'は', reading: '', headwords: [[{ term: 'は' }]] }],
                  [
                    {
                      text: '公園',
                      reading: 'こうえん',
                      headwords: [[{ term: '公園' }]],
                    },
                  ],
                  [
                    {
                      text: 'にい',
                      reading: '',
                      headwords: [[{ term: '兄' }], [{ term: '二位' }]],
                    },
                  ],
                  [
                    {
                      text: 'きたい',
                      reading: '',
                      headwords: [[{ term: '期待' }], [{ term: '来る' }]],
                    },
                  ],
                ],
              },
              {
                source: 'scanning-parser',
                index: 0,
                content: [
                  [
                    {
                      text: '俺',
                      reading: 'おれ',
                      headwords: [[{ term: '俺' }]],
                    },
                  ],
                  [
                    {
                      text: 'は',
                      reading: 'は',
                      headwords: [[{ term: 'は' }]],
                    },
                  ],
                  [
                    {
                      text: '公園',
                      reading: 'こうえん',
                      headwords: [[{ term: '公園' }]],
                    },
                  ],
                  [
                    {
                      text: 'に',
                      reading: 'に',
                      headwords: [[{ term: 'に' }]],
                    },
                  ],
                  [
                    {
                      text: '行きたい',
                      reading: 'いきたい',
                      headwords: [[{ term: '行きたい' }]],
                    },
                  ],
                ],
              },
            ],
          },
        }) as unknown as Electron.BrowserWindow,
      getFrequencyDictionaryEnabled: () => true,
      getFrequencyRank: (text) =>
        text === '俺' ? 51 : text === '公園' ? 2304 : text === '行きたい' ? 1500 : null,
      tokenizeWithMecab: async () => null,
    }),
  );

  assert.equal(result.tokens?.map((token) => token.surface).join(','), '俺,は,公園,に,行きたい');
  assert.equal(result.tokens?.[1]?.frequencyRank, undefined);
  assert.equal(result.tokens?.[3]?.frequencyRank, undefined);
  assert.equal(result.tokens?.[4]?.frequencyRank, 1500);
});

test('tokenizeSubtitle still assigns frequency to non-known Yomitan tokens', async () => {
  const result = await tokenizeSubtitle(
    '小園に',
    makeDeps({
      getYomitanExt: () => ({ id: 'dummy-ext' }) as any,
      getYomitanParserWindow: () =>
        ({
          isDestroyed: () => false,
          webContents: {
            executeJavaScript: async () => [
              {
                source: 'scanning-parser',
                index: 0,
                content: [
                  [
                    {
                      text: '小園',
                      reading: 'おうえん',
                      headwords: [[{ term: '小園' }]],
                    },
                  ],
                  [
                    {
                      text: 'に',
                      reading: 'に',
                      headwords: [[{ term: 'に' }]],
                    },
                  ],
                ],
              },
            ],
          },
        }) as unknown as Electron.BrowserWindow,
      getFrequencyDictionaryEnabled: () => true,
      getFrequencyRank: (text) => (text === '小園' ? 75 : text === 'に' ? 3000 : null),
      isKnownWord: (text) => text === '小園',
    }),
  );

  assert.equal(result.tokens?.length, 2);
  assert.equal(result.tokens?.[0]?.isKnown, true);
  assert.equal(result.tokens?.[0]?.frequencyRank, 75);
  assert.equal(result.tokens?.[1]?.isKnown, false);
  assert.equal(result.tokens?.[1]?.frequencyRank, 3000);
});

test('tokenizeSubtitle marks tokens as known using callback', async () => {
  const result = await tokenizeSubtitle(
    '猫です',
    makeDepsFromYomitanTokens([{ surface: '猫', reading: 'ねこ', headword: '猫' }], {
      isKnownWord: (text) => text === '猫',
    }),
  );

  assert.equal(result.text, '猫です');
  assert.equal(result.tokens?.[0]?.isKnown, true);
});

test('tokenizeSubtitle still assigns frequency rank to non-known tokens', async () => {
  const result = await tokenizeSubtitle(
    '既知未知',
    makeDepsFromYomitanTokens(
      [
        { surface: '既知', reading: 'きち', headword: '既知' },
        { surface: '未知', reading: 'みち', headword: '未知' },
      ],
      {
        getFrequencyDictionaryEnabled: () => true,
        getFrequencyRank: (text) => (text === '既知' ? 20 : text === '未知' ? 30 : null),
        isKnownWord: (text) => text === '既知',
      },
    ),
  );

  assert.equal(result.tokens?.length, 2);
  assert.equal(result.tokens?.[0]?.isKnown, true);
  assert.equal(result.tokens?.[0]?.frequencyRank, 20);
  assert.equal(result.tokens?.[1]?.isKnown, false);
  assert.equal(result.tokens?.[1]?.frequencyRank, 30);
});

test('tokenizeSubtitle selects one N+1 target token', async () => {
  const result = await tokenizeSubtitle(
    '猫です',
    makeDepsFromYomitanTokens(
      [
        { surface: '私', reading: 'わたし', headword: '私' },
        { surface: '犬', reading: 'いぬ', headword: '犬' },
      ],
      {
        getMinSentenceWordsForNPlusOne: () => 2,
        isKnownWord: (text) => text === '私',
      },
    ),
  );

  const targets = result.tokens?.filter((token) => token.isNPlusOneTarget) ?? [];
  assert.equal(targets.length, 1);
  assert.equal(targets[0]?.surface, '犬');
});

test('tokenizeSubtitle does not mark target when sentence has multiple candidates', async () => {
  const result = await tokenizeSubtitle(
    '猫犬',
    makeDepsFromYomitanTokens(
      [
        { surface: '猫', reading: 'ねこ', headword: '猫' },
        { surface: '犬', reading: 'いぬ', headword: '犬' },
      ],
      {},
    ),
  );

  assert.equal(
    result.tokens?.some((token) => token.isNPlusOneTarget),
    false,
  );
});

test('tokenizeSubtitle applies N+1 target marking to Yomitan results', async () => {
  const parserWindow = {
    isDestroyed: () => false,
    webContents: {
      executeJavaScript: async () => [
        {
          source: 'scanning-parser',
          index: 0,
          content: [
            [
              {
                text: '猫',
                reading: 'ねこ',
                headwords: [[{ term: '猫' }]],
              },
            ],
            [
              {
                text: 'です',
                reading: 'です',
                headwords: [[{ term: 'です' }]],
              },
            ],
          ],
        },
      ],
    },
  } as unknown as Electron.BrowserWindow;

  const result = await tokenizeSubtitle(
    '猫です',
    makeDeps({
      getYomitanExt: () => ({ id: 'dummy-ext' }) as any,
      getYomitanParserWindow: () => parserWindow,
      tokenizeWithMecab: async () => null,
      isKnownWord: (text) => text === 'です',
      getMinSentenceWordsForNPlusOne: () => 2,
    }),
  );

  assert.equal(result.text, '猫です');
  assert.equal(result.tokens?.length, 2);
  assert.equal(result.tokens?.[0]?.surface, '猫');
  assert.equal(result.tokens?.[0]?.isNPlusOneTarget, true);
  assert.equal(result.tokens?.[1]?.isNPlusOneTarget, false);
});

test('tokenizeSubtitle ignores Yomitan functional tokens when evaluating N+1 candidates', async () => {
  const parserWindow = {
    isDestroyed: () => false,
    webContents: {
      executeJavaScript: async () => [
        {
          source: 'scanning-parser',
          index: 0,
          content: [
            [{ text: '私', reading: 'わたし', headwords: [[{ term: '私' }]] }],
            [{ text: 'も', reading: 'も', headwords: [[{ term: 'も' }]] }],
            [{ text: 'あの', reading: 'あの', headwords: [[{ term: 'あの' }]] }],
            [{ text: '仮面', reading: 'かめん', headwords: [[{ term: '仮面' }]] }],
            [{ text: 'が', reading: 'が', headwords: [[{ term: 'が' }]] }],
            [{ text: '欲しい', reading: 'ほしい', headwords: [[{ term: '欲しい' }]] }],
            [{ text: 'です', reading: 'です', headwords: [[{ term: 'です' }]] }],
          ],
        },
      ],
    },
  } as unknown as Electron.BrowserWindow;

  const result = await tokenizeSubtitle(
    '私も あの仮面が欲しいです',
    makeDeps({
      getYomitanExt: () => ({ id: 'dummy-ext' }) as any,
      getYomitanParserWindow: () => parserWindow,
      tokenizeWithMecab: async () => [
        {
          surface: '私',
          reading: 'ワタシ',
          headword: '私',
          startPos: 0,
          endPos: 1,
          partOfSpeech: PartOfSpeech.noun,
          pos1: '名詞',
          isMerged: false,
          isKnown: true,
          isNPlusOneTarget: false,
        },
        {
          surface: 'も',
          reading: 'モ',
          headword: 'も',
          startPos: 1,
          endPos: 2,
          partOfSpeech: PartOfSpeech.particle,
          pos1: '助詞',
          isMerged: false,
          isKnown: false,
          isNPlusOneTarget: false,
        },
        {
          surface: 'あの',
          reading: 'アノ',
          headword: 'あの',
          startPos: 2,
          endPos: 4,
          partOfSpeech: PartOfSpeech.other,
          pos1: '連体詞',
          isMerged: false,
          isKnown: true,
          isNPlusOneTarget: false,
        },
        {
          surface: '仮面',
          reading: 'カメン',
          headword: '仮面',
          startPos: 4,
          endPos: 6,
          partOfSpeech: PartOfSpeech.noun,
          pos1: '名詞',
          isMerged: false,
          isKnown: false,
          isNPlusOneTarget: false,
        },
        {
          surface: 'が',
          reading: 'ガ',
          headword: 'が',
          startPos: 6,
          endPos: 7,
          partOfSpeech: PartOfSpeech.particle,
          pos1: '助詞',
          isMerged: false,
          isKnown: false,
          isNPlusOneTarget: false,
        },
        {
          surface: '欲しい',
          reading: 'ホシイ',
          headword: '欲しい',
          startPos: 7,
          endPos: 10,
          partOfSpeech: PartOfSpeech.i_adjective,
          pos1: '形容詞',
          isMerged: false,
          isKnown: true,
          isNPlusOneTarget: false,
        },
        {
          surface: 'です',
          reading: 'デス',
          headword: 'です',
          startPos: 10,
          endPos: 12,
          partOfSpeech: PartOfSpeech.bound_auxiliary,
          pos1: '助動詞',
          isMerged: false,
          isKnown: false,
          isNPlusOneTarget: false,
        },
      ],
      isKnownWord: (text) => text === '私' || text === 'あの' || text === '欲しい',
    }),
  );

  const targets = result.tokens?.filter((token) => token.isNPlusOneTarget) ?? [];
  assert.equal(targets.length, 1);
  assert.equal(targets[0]?.surface, '仮面');
});

test('tokenizeSubtitle keeps correct MeCab pos1 enrichment when Yomitan offsets skip spaces', async () => {
  const parserWindow = {
    isDestroyed: () => false,
    webContents: {
      executeJavaScript: async () => [
        {
          source: 'scanning-parser',
          index: 0,
          content: [
            [{ text: '私', reading: 'わたし', headwords: [[{ term: '私' }]] }],
            [{ text: 'も', reading: 'も', headwords: [[{ term: 'も' }]] }],
            [{ text: 'あの', reading: 'あの', headwords: [[{ term: 'あの' }]] }],
            [{ text: '仮面', reading: 'かめん', headwords: [[{ term: '仮面' }]] }],
            [{ text: 'が', reading: 'が', headwords: [[{ term: 'が' }]] }],
            [{ text: '欲しい', reading: 'ほしい', headwords: [[{ term: '欲しい' }]] }],
            [{ text: 'です', reading: 'です', headwords: [[{ term: 'です' }]] }],
          ],
        },
      ],
    },
  } as unknown as Electron.BrowserWindow;

  const result = await tokenizeSubtitle(
    '私も あの仮面が欲しいです',
    makeDeps({
      getYomitanExt: () => ({ id: 'dummy-ext' }) as any,
      getYomitanParserWindow: () => parserWindow,
      tokenizeWithMecab: async () => [
        {
          surface: '私',
          reading: 'ワタシ',
          headword: '私',
          startPos: 0,
          endPos: 1,
          partOfSpeech: PartOfSpeech.noun,
          pos1: '名詞',
          isMerged: false,
          isKnown: true,
          isNPlusOneTarget: false,
        },
        {
          surface: 'も',
          reading: 'モ',
          headword: 'も',
          startPos: 1,
          endPos: 2,
          partOfSpeech: PartOfSpeech.particle,
          pos1: '助詞',
          isMerged: false,
          isKnown: false,
          isNPlusOneTarget: false,
        },
        {
          surface: ' ',
          reading: '',
          headword: ' ',
          startPos: 2,
          endPos: 3,
          partOfSpeech: PartOfSpeech.symbol,
          pos1: '記号',
          isMerged: false,
          isKnown: false,
          isNPlusOneTarget: false,
        },
        {
          surface: 'あの',
          reading: 'アノ',
          headword: 'あの',
          startPos: 3,
          endPos: 5,
          partOfSpeech: PartOfSpeech.other,
          pos1: '連体詞',
          isMerged: false,
          isKnown: true,
          isNPlusOneTarget: false,
        },
        {
          surface: '仮面',
          reading: 'カメン',
          headword: '仮面',
          startPos: 5,
          endPos: 7,
          partOfSpeech: PartOfSpeech.noun,
          pos1: '名詞',
          isMerged: false,
          isKnown: false,
          isNPlusOneTarget: false,
        },
        {
          surface: 'が',
          reading: 'ガ',
          headword: 'が',
          startPos: 7,
          endPos: 8,
          partOfSpeech: PartOfSpeech.particle,
          pos1: '助詞',
          isMerged: false,
          isKnown: false,
          isNPlusOneTarget: false,
        },
        {
          surface: '欲しい',
          reading: 'ホシイ',
          headword: '欲しい',
          startPos: 8,
          endPos: 11,
          partOfSpeech: PartOfSpeech.i_adjective,
          pos1: '形容詞',
          isMerged: false,
          isKnown: true,
          isNPlusOneTarget: false,
        },
        {
          surface: 'です',
          reading: 'デス',
          headword: 'です',
          startPos: 11,
          endPos: 13,
          partOfSpeech: PartOfSpeech.bound_auxiliary,
          pos1: '助動詞',
          isMerged: false,
          isKnown: false,
          isNPlusOneTarget: false,
        },
      ],
      isKnownWord: (text) => text === '私' || text === 'あの' || text === '欲しい',
    }),
  );

  const targets = result.tokens?.filter((token) => token.isNPlusOneTarget) ?? [];
  const gaToken = result.tokens?.find((token) => token.surface === 'が');
  const desuToken = result.tokens?.find((token) => token.surface === 'です');
  assert.equal(gaToken?.pos1, '助詞');
  assert.equal(desuToken?.pos1, '助動詞');
  assert.equal(targets.length, 1);
  assert.equal(targets[0]?.surface, '仮面');
});

test('tokenizeSubtitle does not color 1-2 word sentences by default', async () => {
  const result = await tokenizeSubtitle(
    '猫です',
    makeDepsFromYomitanTokens(
      [
        { surface: '私', reading: 'わたし', headword: '私' },
        { surface: '犬', reading: 'いぬ', headword: '犬' },
      ],
      {},
    ),
  );

  assert.equal(
    result.tokens?.some((token) => token.isNPlusOneTarget),
    false,
  );
});

test('tokenizeSubtitle checks known words by headword, not surface', async () => {
  const result = await tokenizeSubtitle(
    '猫です',
    makeDepsFromYomitanTokens([{ surface: '猫', reading: 'ねこ', headword: '猫です' }], {
      isKnownWord: (text) => text === '猫です',
    }),
  );

  assert.equal(result.text, '猫です');
  assert.equal(result.tokens?.[0]?.isKnown, true);
});

test('tokenizeSubtitle checks known words by surface when configured', async () => {
  const result = await tokenizeSubtitle(
    '猫です',
    makeDepsFromYomitanTokens([{ surface: '猫', reading: 'ねこ', headword: '猫です' }], {
      getKnownWordMatchMode: () => 'surface',
      isKnownWord: (text) => text === '猫',
    }),
  );

  assert.equal(result.text, '猫です');
  assert.equal(result.tokens?.[0]?.isKnown, true);
});

test('createTokenizerDepsRuntime checks MeCab availability before first tokenizeWithMecab call', async () => {
  let available = false;
  let checkCalls = 0;

  const deps = createTokenizerDepsRuntime({
    getYomitanExt: () => null,
    getYomitanParserWindow: () => null,
    setYomitanParserWindow: () => {},
    getYomitanParserReadyPromise: () => null,
    setYomitanParserReadyPromise: () => {},
    getYomitanParserInitPromise: () => null,
    setYomitanParserInitPromise: () => {},
    isKnownWord: () => false,
    getKnownWordMatchMode: () => 'headword',
    getJlptLevel: () => null,
    getMecabTokenizer: () => ({
      getStatus: () => ({ available }),
      checkAvailability: async () => {
        checkCalls += 1;
        available = true;
        return true;
      },
      tokenize: async () => {
        if (!available) {
          return null;
        }
        return [
          {
            word: '仮面',
            partOfSpeech: PartOfSpeech.noun,
            pos1: '名詞',
            pos2: '一般',
            pos3: '',
            pos4: '',
            inflectionType: '',
            inflectionForm: '',
            headword: '仮面',
            katakanaReading: 'カメン',
            pronunciation: 'カメン',
          },
        ];
      },
    }),
  });

  const first = await deps.tokenizeWithMecab('仮面');
  const second = await deps.tokenizeWithMecab('仮面');

  assert.equal(checkCalls, 1);
  assert.equal(first?.[0]?.surface, '仮面');
  assert.equal(second?.[0]?.surface, '仮面');
});

test('tokenizeSubtitle uses async MeCab enrichment override when provided', async () => {
  const result = await tokenizeSubtitle(
    '猫',
    makeDepsFromYomitanTokens([{ surface: '猫', reading: 'ねこ', headword: '猫' }], {
      tokenizeWithMecab: async () => [
        {
          headword: '猫',
          surface: '猫',
          reading: 'ネコ',
          startPos: 0,
          endPos: 1,
          partOfSpeech: PartOfSpeech.noun,
          pos1: '名詞',
          isMerged: true,
          isKnown: false,
          isNPlusOneTarget: false,
        },
      ],
      enrichTokensWithMecab: async (tokens) =>
        tokens.map((token) => ({
          ...token,
          pos1: 'override-pos',
        })),
    }),
  );

  assert.equal(result.tokens?.length, 1);
  assert.equal(result.tokens?.[0]?.pos1, 'override-pos');
});

test('createTokenizerDepsRuntime exposes async MeCab enrichment helper', async () => {
  const deps = createTokenizerDepsRuntime({
    getYomitanExt: () => null,
    getYomitanParserWindow: () => null,
    setYomitanParserWindow: () => {},
    getYomitanParserReadyPromise: () => null,
    setYomitanParserReadyPromise: () => {},
    getYomitanParserInitPromise: () => null,
    setYomitanParserInitPromise: () => {},
    isKnownWord: () => false,
    getKnownWordMatchMode: () => 'headword',
    getJlptLevel: () => null,
    getMecabTokenizer: () => null,
  });

  const enriched = await deps.enrichTokensWithMecab?.(
    [
      {
        headword: 'は',
        surface: 'は',
        reading: 'は',
        startPos: 0,
        endPos: 1,
        partOfSpeech: PartOfSpeech.other,
        isMerged: true,
        isKnown: false,
        isNPlusOneTarget: false,
      },
    ],
    [
      {
        headword: 'は',
        surface: 'は',
        reading: 'ハ',
        startPos: 0,
        endPos: 1,
        partOfSpeech: PartOfSpeech.particle,
        pos1: '助詞',
        isMerged: false,
        isKnown: false,
        isNPlusOneTarget: false,
      },
    ],
  );

  assert.equal(enriched?.[0]?.pos1, '助詞');
});

test('tokenizeSubtitle skips all enrichment stages when disabled', async () => {
  let knownCalls = 0;
  let mecabCalls = 0;
  let jlptCalls = 0;
  let frequencyCalls = 0;

  const result = await tokenizeSubtitle(
    '猫',
    makeDepsFromYomitanTokens([{ surface: '猫', reading: 'ねこ', headword: '猫' }], {
      isKnownWord: () => {
        knownCalls += 1;
        return true;
      },
      getNPlusOneEnabled: () => false,
      getJlptEnabled: () => false,
      getFrequencyDictionaryEnabled: () => false,
      getJlptLevel: () => {
        jlptCalls += 1;
        return 'N5';
      },
      getFrequencyRank: () => {
        frequencyCalls += 1;
        return 10;
      },
      tokenizeWithMecab: async () => {
        mecabCalls += 1;
        return null;
      },
    }),
  );

  assert.equal(result.tokens?.length, 1);
  assert.equal(result.tokens?.[0]?.isKnown, false);
  assert.equal(result.tokens?.[0]?.isNPlusOneTarget, false);
  assert.equal(result.tokens?.[0]?.jlptLevel, undefined);
  assert.equal(result.tokens?.[0]?.frequencyRank, undefined);
  assert.equal(knownCalls, 0);
  assert.equal(mecabCalls, 0);
  assert.equal(jlptCalls, 0);
  assert.equal(frequencyCalls, 0);
});

test('tokenizeSubtitle keeps frequency enrichment while n+1 is disabled', async () => {
  let knownCalls = 0;
  let mecabCalls = 0;
  let frequencyCalls = 0;

  const result = await tokenizeSubtitle(
    '猫',
    makeDepsFromYomitanTokens([{ surface: '猫', reading: 'ねこ', headword: '猫' }], {
      isKnownWord: () => {
        knownCalls += 1;
        return true;
      },
      getNPlusOneEnabled: () => false,
      getJlptEnabled: () => false,
      getFrequencyDictionaryEnabled: () => true,
      getFrequencyRank: () => {
        frequencyCalls += 1;
        return 7;
      },
      tokenizeWithMecab: async () => {
        mecabCalls += 1;
        return [
          {
            headword: '猫',
            surface: '猫',
            reading: 'ネコ',
            startPos: 0,
            endPos: 1,
            partOfSpeech: PartOfSpeech.noun,
            pos1: '名詞',
            isMerged: false,
            isKnown: false,
            isNPlusOneTarget: false,
          },
        ];
      },
    }),
  );

  assert.equal(result.tokens?.[0]?.frequencyRank, 7);
  assert.equal(result.tokens?.[0]?.isKnown, false);
  assert.equal(knownCalls, 0);
  assert.equal(mecabCalls, 1);
  assert.equal(frequencyCalls, 1);
});