Mirror of https://github.com/ksyasuda/SubMiner.git (synced 2026-02-27 18:22:41 -08:00)
refactor(tokenizer): remove MeCab fallback tokenization path
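tokenizeSubtitle no longer falls back to MeCab tokens when the Yomitan parse
yields nothing; it now returns { text, tokens: null } directly, and
selectBestYomitanParseCandidate only ever considers scanning-parser candidates.
As orientation for the diff below, the new selection flow reduces to roughly
this sketch (YomitanParseCandidate, chooseBestCandidate, and the
'scanning-parser' source string are names from the diff; the pickTokens
function and the reduce-based stand-in for the scoring helper are
illustrative only):

    type Candidate = { source: string; tokens: { surface: string }[] };

    function pickTokens(candidates: Candidate[]): { surface: string }[] | null {
        // mecab-source candidates are ignored entirely now
        const scanning = candidates.filter((c) => c.source === 'scanning-parser');
        if (scanning.length === 0) return null;
        // prefer candidates that actually split the line into several tokens
        const multi = scanning.filter((c) => c.tokens.length > 1);
        const pool = multi.length > 0 ? multi : scanning;
        // stand-in for the diff's chooseBestCandidate scoring
        return pool.reduce((a, b) => (b.tokens.length > a.tokens.length ? b : a)).tokens;
    }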
@@ -1,12 +1,7 @@
 import test from 'node:test';
 import assert from 'node:assert/strict';
 import { PartOfSpeech } from '../../types';
-import {
-    createTokenizerDepsRuntime,
-    TokenizerServiceDeps,
-    TokenizerDepsRuntimeOptions,
-    tokenizeSubtitle,
-} from './tokenizer';
+import { createTokenizerDepsRuntime, TokenizerServiceDeps, tokenizeSubtitle } from './tokenizer';

 function makeDeps(overrides: Partial<TokenizerServiceDeps> = {}): TokenizerServiceDeps {
     return {
@@ -25,24 +20,37 @@ function makeDeps(overrides: Partial<TokenizerServiceDeps> = {}): TokenizerServi
     };
 }

-function makeDepsFromMecabTokenizer(
-    tokenize: (text: string) => Promise<import('../../types').Token[] | null>,
-    overrides: Partial<TokenizerDepsRuntimeOptions> = {},
+interface YomitanTokenInput {
+    surface: string;
+    reading?: string;
+    headword?: string;
+}
+
+function makeDepsFromYomitanTokens(
+    tokens: YomitanTokenInput[],
+    overrides: Partial<TokenizerServiceDeps> = {},
 ): TokenizerServiceDeps {
-    return createTokenizerDepsRuntime({
-        getYomitanExt: () => null,
-        getYomitanParserWindow: () => null,
-        setYomitanParserWindow: () => {},
-        getYomitanParserReadyPromise: () => null,
-        setYomitanParserReadyPromise: () => {},
-        getYomitanParserInitPromise: () => null,
-        setYomitanParserInitPromise: () => {},
-        isKnownWord: () => false,
-        getKnownWordMatchMode: () => 'headword',
-        getMecabTokenizer: () => ({
-            tokenize,
-        }),
-        getJlptLevel: () => null,
+    return makeDeps({
+        getYomitanExt: () => ({ id: 'dummy-ext' }) as any,
+        getYomitanParserWindow: () =>
+            ({
+                isDestroyed: () => false,
+                webContents: {
+                    executeJavaScript: async () => [
+                        {
+                            source: 'scanning-parser',
+                            index: 0,
+                            content: tokens.map((token) => [
+                                {
+                                    text: token.surface,
+                                    reading: token.reading ?? token.surface,
+                                    headwords: [[{ term: token.headword ?? token.surface }]],
+                                },
+                            ]),
+                        },
+                    ],
+                },
+            }) as unknown as Electron.BrowserWindow,
         ...overrides,
     });
 }
@@ -91,34 +99,10 @@ test('tokenizeSubtitle caches JLPT lookups across repeated tokens', async () =>
     let lookupCalls = 0;
     const result = await tokenizeSubtitle(
         '猫猫',
-        makeDepsFromMecabTokenizer(
-            async () => [
-                {
-                    word: '猫',
-                    partOfSpeech: PartOfSpeech.noun,
-                    pos1: '',
-                    pos2: '',
-                    pos3: '',
-                    pos4: '',
-                    inflectionType: '',
-                    inflectionForm: '',
-                    headword: '猫',
-                    katakanaReading: 'ネコ',
-                    pronunciation: 'ネコ',
-                },
-                {
-                    word: '猫',
-                    partOfSpeech: PartOfSpeech.noun,
-                    pos1: '',
-                    pos2: '',
-                    pos3: '',
-                    pos4: '',
-                    inflectionType: '',
-                    inflectionForm: '',
-                    headword: '猫',
-                    katakanaReading: 'ネコ',
-                    pronunciation: 'ネコ',
-                },
+        makeDepsFromYomitanTokens(
+            [
+                { surface: '猫', reading: 'ねこ', headword: '猫' },
+                { surface: '猫', reading: 'ねこ', headword: '猫' },
             ],
             {
                 getJlptLevel: (text) => {
@@ -138,26 +122,9 @@ test('tokenizeSubtitle caches JLPT lookups across repeated tokens', async () =>
 test('tokenizeSubtitle leaves JLPT unset for non-matching tokens', async () => {
     const result = await tokenizeSubtitle(
         '猫',
-        makeDepsFromMecabTokenizer(
-            async () => [
-                {
-                    word: '猫',
-                    partOfSpeech: PartOfSpeech.noun,
-                    pos1: '',
-                    pos2: '',
-                    pos3: '',
-                    pos4: '',
-                    inflectionType: '',
-                    inflectionForm: '',
-                    headword: '猫',
-                    katakanaReading: 'ネコ',
-                    pronunciation: 'ネコ',
-                },
-            ],
-            {
-                getJlptLevel: () => null,
-            },
-        ),
+        makeDepsFromYomitanTokens([{ surface: '猫', reading: 'ねこ', headword: '猫' }], {
+            getJlptLevel: () => null,
+        }),
     );

     assert.equal(result.tokens?.length, 1);
@@ -168,20 +135,7 @@ test('tokenizeSubtitle skips JLPT lookups when disabled', async () => {
     let lookupCalls = 0;
     const result = await tokenizeSubtitle(
         '猫です',
-        makeDeps({
-            tokenizeWithMecab: async () => [
-                {
-                    headword: '猫',
-                    surface: '猫',
-                    reading: 'ネコ',
-                    startPos: 0,
-                    endPos: 1,
-                    partOfSpeech: PartOfSpeech.noun,
-                    isMerged: false,
-                    isKnown: false,
-                    isNPlusOneTarget: false,
-                },
-            ],
+        makeDepsFromYomitanTokens([{ surface: '猫', reading: 'ねこ', headword: '猫' }], {
             getJlptLevel: () => {
                 lookupCalls += 1;
                 return 'N5';
@@ -198,34 +152,16 @@ test('tokenizeSubtitle skips JLPT lookups when disabled', async () => {
 test('tokenizeSubtitle applies frequency dictionary ranks', async () => {
     const result = await tokenizeSubtitle(
         '猫です',
-        makeDeps({
-            getFrequencyDictionaryEnabled: () => true,
-            tokenizeWithMecab: async () => [
-                {
-                    headword: '猫',
-                    surface: '猫',
-                    reading: 'ネコ',
-                    startPos: 0,
-                    endPos: 1,
-                    partOfSpeech: PartOfSpeech.noun,
-                    isMerged: false,
-                    isKnown: false,
-                    isNPlusOneTarget: false,
-                },
-                {
-                    headword: 'です',
-                    surface: 'です',
-                    reading: 'デス',
-                    startPos: 1,
-                    endPos: 2,
-                    partOfSpeech: PartOfSpeech.other,
-                    isMerged: false,
-                    isKnown: false,
-                    isNPlusOneTarget: false,
-                },
+        makeDepsFromYomitanTokens(
+            [
+                { surface: '猫', reading: 'ねこ', headword: '猫' },
+                { surface: 'です', reading: 'です', headword: 'です' },
             ],
-            getFrequencyRank: (text) => (text === '猫' ? 23 : 1200),
-        }),
+            {
+                getFrequencyDictionaryEnabled: () => true,
+                getFrequencyRank: (text) => (text === '猫' ? 23 : 1200),
+            },
+        ),
     );

     assert.equal(result.tokens?.length, 2);
@@ -588,38 +524,20 @@ test('tokenizeSubtitle skips frequency rank when Yomitan token is enriched as pa
 test('tokenizeSubtitle ignores invalid frequency ranks', async () => {
     const result = await tokenizeSubtitle(
         '猫',
-        makeDeps({
-            getFrequencyDictionaryEnabled: () => true,
-            tokenizeWithMecab: async () => [
-                {
-                    headword: '猫',
-                    surface: '猫',
-                    reading: 'ネコ',
-                    startPos: 0,
-                    endPos: 1,
-                    partOfSpeech: PartOfSpeech.noun,
-                    isMerged: false,
-                    isKnown: false,
-                    isNPlusOneTarget: false,
-                },
-                {
-                    headword: 'です',
-                    surface: 'です',
-                    reading: 'デス',
-                    startPos: 1,
-                    endPos: 2,
-                    partOfSpeech: PartOfSpeech.bound_auxiliary,
-                    isMerged: false,
-                    isKnown: false,
-                    isNPlusOneTarget: false,
-                },
+        makeDepsFromYomitanTokens(
+            [
+                { surface: '猫', reading: 'ねこ', headword: '猫' },
+                { surface: 'です', reading: 'です', headword: 'です' },
             ],
-            getFrequencyRank: (text) => {
-                if (text === '猫') return Number.NaN;
-                if (text === 'です') return -1;
-                return 100;
+            {
+                getFrequencyDictionaryEnabled: () => true,
+                getFrequencyRank: (text) => {
+                    if (text === '猫') return Number.NaN;
+                    if (text === 'です') return -1;
+                    return 100;
+                },
             },
-        }),
+        ),
     );

     assert.equal(result.tokens?.length, 2);
@@ -631,21 +549,8 @@ test('tokenizeSubtitle skips frequency lookups when disabled', async () => {
     let frequencyCalls = 0;
     const result = await tokenizeSubtitle(
         '猫',
-        makeDeps({
+        makeDepsFromYomitanTokens([{ surface: '猫', reading: 'ねこ', headword: '猫' }], {
             getFrequencyDictionaryEnabled: () => false,
-            tokenizeWithMecab: async () => [
-                {
-                    headword: '猫',
-                    surface: '猫',
-                    reading: 'ネコ',
-                    startPos: 0,
-                    endPos: 1,
-                    partOfSpeech: PartOfSpeech.noun,
-                    isMerged: false,
-                    isKnown: false,
-                    isNPlusOneTarget: false,
-                },
-            ],
             getFrequencyRank: () => {
                 frequencyCalls += 1;
                 return 10;
@@ -728,63 +633,28 @@ test('tokenizeSubtitle skips JLPT level for repeated kana SFX', async () => {
     assert.equal(result.tokens?.[0]?.jlptLevel, undefined);
 });

-test('tokenizeSubtitle assigns JLPT level to mecab tokens', async () => {
+test('tokenizeSubtitle assigns JLPT level to Yomitan tokens', async () => {
     const result = await tokenizeSubtitle(
         '猫です',
-        makeDepsFromMecabTokenizer(
-            async () => [
-                {
-                    word: '猫',
-                    partOfSpeech: PartOfSpeech.noun,
-                    pos1: '',
-                    pos2: '',
-                    pos3: '',
-                    pos4: '',
-                    inflectionType: '',
-                    inflectionForm: '',
-                    headword: '猫',
-                    katakanaReading: 'ネコ',
-                    pronunciation: 'ネコ',
-                },
-            ],
-            {
-                getJlptLevel: (text) => (text === '猫' ? 'N4' : null),
-            },
-        ),
+        makeDepsFromYomitanTokens([{ surface: '猫', reading: 'ねこ', headword: '猫' }], {
+            getJlptLevel: (text) => (text === '猫' ? 'N4' : null),
+        }),
     );

     assert.equal(result.tokens?.length, 1);
     assert.equal(result.tokens?.[0]?.jlptLevel, 'N4');
 });

-test('tokenizeSubtitle skips JLPT level for mecab tokens marked as ineligible', async () => {
+test('tokenizeSubtitle can assign JLPT level to Yomitan particle token', async () => {
     const result = await tokenizeSubtitle(
         'は',
-        makeDepsFromMecabTokenizer(
-            async () => [
-                {
-                    word: 'は',
-                    partOfSpeech: PartOfSpeech.particle,
-                    pos1: '助詞',
-                    pos2: '',
-                    pos3: '',
-                    pos4: '',
-                    inflectionType: '',
-                    inflectionForm: '',
-                    headword: 'は',
-                    katakanaReading: 'ハ',
-                    pronunciation: 'ハ',
-                },
-            ],
-            {
-                getJlptLevel: (text) => (text === 'は' ? 'N5' : null),
-            },
-        ),
+        makeDepsFromYomitanTokens([{ surface: 'は', reading: 'は', headword: 'は' }], {
+            getJlptLevel: (text) => (text === 'は' ? 'N5' : null),
+        }),
     );

     assert.equal(result.tokens?.length, 1);
-    assert.equal(result.tokens?.[0]?.pos1, '助詞');
-    assert.equal(result.tokens?.[0]?.jlptLevel, undefined);
+    assert.equal(result.tokens?.[0]?.jlptLevel, 'N5');
 });

 test('tokenizeSubtitle returns null tokens for empty normalized text', async () => {
@@ -792,58 +662,34 @@ test('tokenizeSubtitle returns null tokens for empty normalized text', async ()
     assert.deepEqual(result, { text: ' \\n ', tokens: null });
 });

-test('tokenizeSubtitle normalizes newlines before mecab fallback', async () => {
-    let tokenizeInput = '';
+test('tokenizeSubtitle normalizes newlines before Yomitan parse request', async () => {
+    let parseInput = '';
     const result = await tokenizeSubtitle(
         '猫\\Nです\nね',
         makeDeps({
-            tokenizeWithMecab: async (text) => {
-                tokenizeInput = text;
-                return [
-                    {
-                        surface: '猫ですね',
-                        reading: 'ネコデスネ',
-                        headword: '猫ですね',
-                        startPos: 0,
-                        endPos: 4,
-                        partOfSpeech: PartOfSpeech.other,
-                        isMerged: true,
-                        isKnown: false,
-                        isNPlusOneTarget: false,
-                    },
-                ];
-            },
+            getYomitanExt: () => ({ id: 'dummy-ext' }) as any,
+            getYomitanParserWindow: () =>
+                ({
+                    isDestroyed: () => false,
+                    webContents: {
+                        executeJavaScript: async (script: string) => {
+                            parseInput = script;
+                            return null;
+                        },
+                    },
+                }) as unknown as Electron.BrowserWindow,
         }),
     );

-    assert.equal(tokenizeInput, '猫 です ね');
+    assert.match(parseInput, /猫 です ね/);
     assert.equal(result.text, '猫\nです\nね');
-    assert.equal(result.tokens?.[0]?.surface, '猫ですね');
+    assert.equal(result.tokens, null);
 });

-test('tokenizeSubtitle falls back to mecab tokens when available', async () => {
-    const result = await tokenizeSubtitle(
-        '猫です',
-        makeDeps({
-            tokenizeWithMecab: async () => [
-                {
-                    surface: '猫',
-                    reading: 'ネコ',
-                    headword: '猫',
-                    startPos: 0,
-                    endPos: 1,
-                    partOfSpeech: PartOfSpeech.noun,
-                    isMerged: false,
-                    isKnown: false,
-                    isNPlusOneTarget: false,
-                },
-            ],
-        }),
-    );
+test('tokenizeSubtitle returns null tokens when Yomitan parsing is unavailable', async () => {
+    const result = await tokenizeSubtitle('猫です', makeDeps());

-    assert.equal(result.text, '猫です');
-    assert.equal(result.tokens?.length, 1);
-    assert.equal(result.tokens?.[0]?.surface, '猫');
+    assert.deepEqual(result, { text: '猫です', tokens: null });
 });

 test('tokenizeSubtitle returns null tokens when mecab throws', async () => {
@@ -1055,7 +901,7 @@ test('tokenizeSubtitle preserves segmented Yomitan line as one token', async ()
     assert.equal(result.tokens?.[0]?.isKnown, false);
 });

-test('tokenizeSubtitle prefers mecab parser tokens when scanning parser returns one token', async () => {
+test('tokenizeSubtitle keeps scanning parser token when scanning parser returns one token', async () => {
     const result = await tokenizeSubtitle(
         '俺は小園にいきたい',
         makeDeps({
@@ -1128,10 +974,9 @@ test('tokenizeSubtitle prefers mecab parser tokens when scanning parser returns
         }),
     );

-    assert.equal(result.tokens?.length, 5);
-    assert.equal(result.tokens?.map((token) => token.surface).join(','), '俺,は,小園,に,いきたい');
-    assert.equal(result.tokens?.[2]?.surface, '小園');
-    assert.equal(result.tokens?.[2]?.frequencyRank, 25);
+    assert.equal(result.tokens?.length, 1);
+    assert.equal(result.tokens?.map((token) => token.surface).join(','), '俺は小園にいきたい');
+    assert.equal(result.tokens?.[0]?.frequencyRank, undefined);
 });

 test('tokenizeSubtitle keeps scanning parser tokens when they are already split', async () => {
@@ -1228,7 +1073,7 @@ test('tokenizeSubtitle keeps scanning parser tokens when they are already split'
     assert.equal(result.tokens?.[2]?.frequencyRank, undefined);
 });

-test('tokenizeSubtitle prefers parse candidates with fewer fragment-only kana tokens when source priority is equal', async () => {
+test('tokenizeSubtitle keeps parsing explicit by scanning-parser source only', async () => {
     const result = await tokenizeSubtitle(
         '俺は公園にいきたい',
         makeDeps({
@@ -1239,7 +1084,7 @@ test('tokenizeSubtitle prefers parse candidates with fewer fragment-only kana to
                     webContents: {
                         executeJavaScript: async () => [
                             {
-                                source: 'mecab-fragmented',
+                                source: 'scanning-parser',
                                 index: 0,
                                 content: [
                                     [
@@ -1274,7 +1119,7 @@ test('tokenizeSubtitle prefers parse candidates with fewer fragment-only kana to
                                     ],
                                 },
                                 {
-                                    source: 'mecab',
+                                    source: 'scanning-parser',
                                     index: 0,
                                     content: [
                                         [
@@ -1379,26 +1224,9 @@ test('tokenizeSubtitle still assigns frequency to non-known Yomitan tokens', asy
 test('tokenizeSubtitle marks tokens as known using callback', async () => {
     const result = await tokenizeSubtitle(
         '猫です',
-        makeDepsFromMecabTokenizer(
-            async () => [
-                {
-                    word: '猫',
-                    partOfSpeech: PartOfSpeech.noun,
-                    pos1: '',
-                    pos2: '',
-                    pos3: '',
-                    pos4: '',
-                    inflectionType: '',
-                    inflectionForm: '',
-                    headword: '猫',
-                    katakanaReading: 'ネコ',
-                    pronunciation: 'ネコ',
-                },
-            ],
-            {
-                isKnownWord: (text) => text === '猫',
-            },
-        ),
+        makeDepsFromYomitanTokens([{ surface: '猫', reading: 'ねこ', headword: '猫' }], {
+            isKnownWord: (text) => text === '猫',
+        }),
     );

     assert.equal(result.text, '猫です');
@@ -1408,51 +1236,17 @@ test('tokenizeSubtitle marks tokens as known using callback', async () => {
 test('tokenizeSubtitle still assigns frequency rank to non-known tokens', async () => {
     const result = await tokenizeSubtitle(
         '既知未知',
-        makeDeps({
-            tokenizeWithMecab: async () => [
-                {
-                    surface: '既知',
-                    reading: 'キチ',
-                    partOfSpeech: PartOfSpeech.noun,
-                    pos1: '',
-                    pos2: '',
-                    pos3: '',
-                    pos4: '',
-                    inflectionType: '',
-                    inflectionForm: '',
-                    headword: '既知',
-                    katakanaReading: 'キチ',
-                    pronunciation: 'キチ',
-                    startPos: 0,
-                    endPos: 2,
-                    isMerged: false,
-                    isKnown: false,
-                    isNPlusOneTarget: false,
-                },
-                {
-                    surface: '未知',
-                    reading: 'ミチ',
-                    partOfSpeech: PartOfSpeech.noun,
-                    pos1: '',
-                    pos2: '',
-                    pos3: '',
-                    pos4: '',
-                    inflectionType: '',
-                    inflectionForm: '',
-                    headword: '未知',
-                    katakanaReading: 'ミチ',
-                    pronunciation: 'ミチ',
-                    startPos: 2,
-                    endPos: 4,
-                    isMerged: false,
-                    isKnown: false,
-                    isNPlusOneTarget: false,
-                },
+        makeDepsFromYomitanTokens(
+            [
+                { surface: '既知', reading: 'きち', headword: '既知' },
+                { surface: '未知', reading: 'みち', headword: '未知' },
             ],
-            getFrequencyDictionaryEnabled: () => true,
-            getFrequencyRank: (text) => (text === '既知' ? 20 : text === '未知' ? 30 : null),
-            isKnownWord: (text) => text === '既知',
-        }),
+            {
+                getFrequencyDictionaryEnabled: () => true,
+                getFrequencyRank: (text) => (text === '既知' ? 20 : text === '未知' ? 30 : null),
+                isKnownWord: (text) => text === '既知',
+            },
+        ),
     );

     assert.equal(result.tokens?.length, 2);
@@ -1465,33 +1259,16 @@ test('tokenizeSubtitle still assigns frequency rank to non-known tokens', async
 test('tokenizeSubtitle selects one N+1 target token', async () => {
     const result = await tokenizeSubtitle(
         '猫です',
-        makeDeps({
-            tokenizeWithMecab: async () => [
-                {
-                    surface: '私',
-                    reading: 'ワタシ',
-                    headword: '私',
-                    startPos: 0,
-                    endPos: 1,
-                    partOfSpeech: PartOfSpeech.noun,
-                    isMerged: false,
-                    isKnown: true,
-                    isNPlusOneTarget: false,
-                },
-                {
-                    surface: '犬',
-                    reading: 'イヌ',
-                    headword: '犬',
-                    startPos: 1,
-                    endPos: 2,
-                    partOfSpeech: PartOfSpeech.noun,
-                    isMerged: false,
-                    isKnown: false,
-                    isNPlusOneTarget: false,
-                },
+        makeDepsFromYomitanTokens(
+            [
+                { surface: '私', reading: 'わたし', headword: '私' },
+                { surface: '犬', reading: 'いぬ', headword: '犬' },
             ],
-            getMinSentenceWordsForNPlusOne: () => 2,
-        }),
+            {
+                getMinSentenceWordsForNPlusOne: () => 2,
+                isKnownWord: (text) => text === '私',
+            },
+        ),
     );

     const targets = result.tokens?.filter((token) => token.isNPlusOneTarget) ?? [];
@@ -1502,32 +1279,13 @@ test('tokenizeSubtitle selects one N+1 target token', async () => {
 test('tokenizeSubtitle does not mark target when sentence has multiple candidates', async () => {
     const result = await tokenizeSubtitle(
         '猫犬',
-        makeDeps({
-            tokenizeWithMecab: async () => [
-                {
-                    surface: '猫',
-                    reading: 'ネコ',
-                    headword: '猫',
-                    startPos: 0,
-                    endPos: 1,
-                    partOfSpeech: PartOfSpeech.noun,
-                    isMerged: false,
-                    isKnown: false,
-                    isNPlusOneTarget: false,
-                },
-                {
-                    surface: '犬',
-                    reading: 'イヌ',
-                    headword: '犬',
-                    startPos: 1,
-                    endPos: 2,
-                    partOfSpeech: PartOfSpeech.noun,
-                    isMerged: false,
-                    isKnown: false,
-                    isNPlusOneTarget: false,
-                },
+        makeDepsFromYomitanTokens(
+            [
+                { surface: '猫', reading: 'ねこ', headword: '猫' },
+                { surface: '犬', reading: 'いぬ', headword: '犬' },
             ],
-        }),
+            {},
+        ),
     );

     assert.equal(
@@ -1846,32 +1604,13 @@ test('tokenizeSubtitle keeps correct MeCab pos1 enrichment when Yomitan offsets
 test('tokenizeSubtitle does not color 1-2 word sentences by default', async () => {
     const result = await tokenizeSubtitle(
         '猫です',
-        makeDeps({
-            tokenizeWithMecab: async () => [
-                {
-                    surface: '私',
-                    reading: 'ワタシ',
-                    headword: '私',
-                    startPos: 0,
-                    endPos: 1,
-                    partOfSpeech: PartOfSpeech.noun,
-                    isMerged: false,
-                    isKnown: true,
-                    isNPlusOneTarget: false,
-                },
-                {
-                    surface: '犬',
-                    reading: 'イヌ',
-                    headword: '犬',
-                    startPos: 1,
-                    endPos: 2,
-                    partOfSpeech: PartOfSpeech.noun,
-                    isMerged: false,
-                    isKnown: false,
-                    isNPlusOneTarget: false,
-                },
+        makeDepsFromYomitanTokens(
+            [
+                { surface: '私', reading: 'わたし', headword: '私' },
+                { surface: '犬', reading: 'いぬ', headword: '犬' },
             ],
-        }),
+            {},
+        ),
     );

     assert.equal(
@@ -1883,26 +1622,9 @@ test('tokenizeSubtitle does not color 1-2 word sentences by default', async () =
 test('tokenizeSubtitle checks known words by headword, not surface', async () => {
     const result = await tokenizeSubtitle(
         '猫です',
-        makeDepsFromMecabTokenizer(
-            async () => [
-                {
-                    word: '猫',
-                    partOfSpeech: PartOfSpeech.noun,
-                    pos1: '',
-                    pos2: '',
-                    pos3: '',
-                    pos4: '',
-                    inflectionType: '',
-                    inflectionForm: '',
-                    headword: '猫です',
-                    katakanaReading: 'ネコ',
-                    pronunciation: 'ネコ',
-                },
-            ],
-            {
-                isKnownWord: (text) => text === '猫です',
-            },
-        ),
+        makeDepsFromYomitanTokens([{ surface: '猫', reading: 'ねこ', headword: '猫です' }], {
+            isKnownWord: (text) => text === '猫です',
+        }),
     );

     assert.equal(result.text, '猫です');
@@ -1912,34 +1634,17 @@ test('tokenizeSubtitle checks known words by headword, not surface', async () =>
 test('tokenizeSubtitle checks known words by surface when configured', async () => {
     const result = await tokenizeSubtitle(
         '猫です',
-        makeDepsFromMecabTokenizer(
-            async () => [
-                {
-                    word: '猫',
-                    partOfSpeech: PartOfSpeech.noun,
-                    pos1: '',
-                    pos2: '',
-                    pos3: '',
-                    pos4: '',
-                    inflectionType: '',
-                    inflectionForm: '',
-                    headword: '猫です',
-                    katakanaReading: 'ネコ',
-                    pronunciation: 'ネコ',
-                },
-            ],
-            {
-                getKnownWordMatchMode: () => 'surface',
-                isKnownWord: (text) => text === '猫',
-            },
-        ),
+        makeDepsFromYomitanTokens([{ surface: '猫', reading: 'ねこ', headword: '猫です' }], {
+            getKnownWordMatchMode: () => 'surface',
+            isKnownWord: (text) => text === '猫',
+        }),
     );

     assert.equal(result.text, '猫です');
     assert.equal(result.tokens?.[0]?.isKnown, true);
 });

-test('createTokenizerDepsRuntime checks MeCab availability before first tokenize call', async () => {
+test('createTokenizerDepsRuntime checks MeCab availability before first tokenizeWithMecab call', async () => {
     let available = false;
     let checkCalls = 0;

@@ -1984,10 +1689,10 @@ test('createTokenizerDepsRuntime checks MeCab availability before first tokenize
         }),
     });

-    const first = await tokenizeSubtitle('仮面', deps);
-    const second = await tokenizeSubtitle('仮面', deps);
+    const first = await deps.tokenizeWithMecab('仮面');
+    const second = await deps.tokenizeWithMecab('仮面');

     assert.equal(checkCalls, 1);
-    assert.equal(first.tokens?.[0]?.surface, '仮面');
-    assert.equal(second.tokens?.[0]?.surface, '仮面');
+    assert.equal(first?.[0]?.surface, '仮面');
+    assert.equal(second?.[0]?.surface, '仮面');
 });

@@ -216,17 +216,5 @@ export async function tokenizeSubtitle(
         };
     }

-    try {
-        const mecabTokens = await deps.tokenizeWithMecab(tokenizeText);
-        if (mecabTokens && mecabTokens.length > 0) {
-            return {
-                text: displayText,
-                tokens: applyAnnotationStage(mecabTokens, deps),
-            };
-        }
-    } catch (err) {
-        logger.error('Tokenization error:', (err as Error).message);
-    }
-
     return { text: displayText, tokens: null };
 }

@@ -48,7 +48,7 @@ test('prefers scanning parser when scanning candidate has more than one token',
     assert.equal(tokens?.map((token) => token.surface).join(','), '小園,に');
 });

-test('prefers mecab candidate when scanning candidate is single token and mecab has better split', () => {
+test('keeps scanning parser candidate when scanning candidate is single token', () => {
     const parseResults = [
         makeParseItem('scanning-parser', [
             [{ text: '俺は公園にいきたい', reading: 'おれはこうえんにいきたい' }],
@@ -63,17 +63,17 @@ test('prefers mecab candidate when scanning candidate is single token and mecab
     ];

     const tokens = selectYomitanParseTokens(parseResults, () => false, 'headword');
-    assert.equal(tokens?.map((token) => token.surface).join(','), '俺,は,公園,に,いきたい');
+    assert.equal(tokens?.map((token) => token.surface).join(','), '俺は公園にいきたい');
 });

 test('tie-break prefers fewer suspicious kana fragments', () => {
     const parseResults = [
-        makeParseItem('mecab-fragmented', [
+        makeParseItem('scanning-parser', [
             [{ text: '俺', reading: 'おれ', headword: '俺' }],
             [{ text: 'にい', reading: '', headword: '兄' }],
             [{ text: 'きたい', reading: '', headword: '期待' }],
         ]),
-        makeParseItem('mecab', [
+        makeParseItem('scanning-parser', [
             [{ text: '俺', reading: 'おれ', headword: '俺' }],
             [{ text: 'に', reading: 'に', headword: 'に' }],
             [{ text: '行きたい', reading: 'いきたい', headword: '行きたい' }],
@@ -83,3 +83,16 @@ test('tie-break prefers fewer suspicious kana fragments', () => {
     const tokens = selectYomitanParseTokens(parseResults, () => false, 'headword');
     assert.equal(tokens?.map((token) => token.surface).join(','), '俺,に,行きたい');
 });
+
+test('returns null when only mecab-source candidates are present', () => {
+    const parseResults = [
+        makeParseItem('mecab', [
+            [{ text: '俺', reading: 'おれ', headword: '俺' }],
+            [{ text: 'は', reading: 'は', headword: 'は' }],
+            [{ text: '公園', reading: 'こうえん', headword: '公園' }],
+        ]),
+    ];
+
+    const tokens = selectYomitanParseTokens(parseResults, () => false, 'headword');
+    assert.equal(tokens, null);
+});

@@ -199,14 +199,9 @@ export function selectBestYomitanParseCandidate(
     const scanningCandidates = candidates.filter(
         (candidate) => candidate.source === 'scanning-parser',
     );
-    const mecabCandidates = candidates.filter((candidate) => candidate.source === 'mecab');
-
-    const getBestByTokenCount = (items: YomitanParseCandidate[]): YomitanParseCandidate | null =>
-        items.length === 0
-            ? null
-            : items.reduce((best, current) =>
-                  current.tokens.length > best.tokens.length ? current : best,
-              );
+    if (scanningCandidates.length === 0) {
+        return null;
+    }

     const getCandidateScore = (candidate: YomitanParseCandidate): number => {
         const readableTokenCount = candidate.tokens.filter(
@@ -242,22 +237,10 @@
         });
     };

-    if (scanningCandidates.length > 0) {
-        const bestScanning = getBestByTokenCount(scanningCandidates);
-        if (bestScanning && bestScanning.tokens.length > 1) {
-            return bestScanning.tokens;
-        }
-
-        const bestMecab = chooseBestCandidate(mecabCandidates);
-        if (bestMecab && bestMecab.tokens.length > (bestScanning?.tokens.length ?? 0)) {
-            return bestMecab.tokens;
-        }
-
-        return bestScanning ? bestScanning.tokens : null;
-    }
-
-    const multiTokenCandidates = candidates.filter((candidate) => candidate.tokens.length > 1);
-    const pool = multiTokenCandidates.length > 0 ? multiTokenCandidates : candidates;
+    const multiTokenCandidates = scanningCandidates.filter(
+        (candidate) => candidate.tokens.length > 1,
+    );
+    const pool = multiTokenCandidates.length > 0 ? multiTokenCandidates : scanningCandidates;
     const bestCandidate = chooseBestCandidate(pool);
     return bestCandidate ? bestCandidate.tokens : null;
 }