mirror of
https://github.com/ksyasuda/SubMiner.git
synced 2026-05-04 00:41:33 -07:00
Restore multi-copy digit capture and add AniList selection (#56)
This commit is contained in:
@@ -4069,6 +4069,225 @@ test('tokenizeSubtitle clears all annotations for explanatory contrast endings',
|
||||
);
|
||||
});
|
||||
|
||||
test('tokenizeSubtitle clears annotations for ことに while preserving lexical N+1 target', async () => {
  // Scenario: the Yomitan fixture reports ことに as one token (headword 事),
  // while the MeCab stub splits the same span into こと (名詞/非自立) and
  // に (助詞/格助詞). The assertions below require the ことに token to carry no
  // frequency/JLPT annotation and not be an N+1 target, while 違う (absent
  // from the known-word list) keeps its rank, JLPT level and N+1 flag.
  const subtitleText = 'さっきの俺と違うことに気付かないのかい?';

  const yomitanTokens = [
    { surface: 'さっき', reading: 'さっき', headword: 'さっき' },
    { surface: 'の', reading: 'の', headword: 'の' },
    { surface: '俺', reading: 'おれ', headword: '俺' },
    { surface: 'と', reading: 'と', headword: 'と' },
    { surface: '違う', reading: 'ちがう', headword: '違う' },
    { surface: 'ことに', reading: 'ことに', headword: '事' },
    { surface: '気付かない', reading: 'きづかない', headword: '気付く' },
    { surface: 'の', reading: 'の', headword: 'の' },
    { surface: 'かい', reading: 'かい', headword: 'かい' },
    { surface: '?', reading: '', headword: '?' },
  ];

  // Words the stubbed isKnownWord reports as already known.
  const knownWords = new Set(['さっき', 'の', '俺', 'と', '気付く', 'かい', '?']);

  // MeCab morphemes in sentence order. Offsets are derived below: the
  // morphemes are contiguous, so each one starts where the previous ended
  // and spans `surface.length` UTF-16 units.
  const mecabMorphemes = [
    { headword: 'さっき', surface: 'さっき', reading: 'サッキ', partOfSpeech: PartOfSpeech.noun, pos1: '名詞', pos2: '副詞可能' },
    { headword: 'の', surface: 'の', reading: 'ノ', partOfSpeech: PartOfSpeech.particle, pos1: '助詞', pos2: '連体化' },
    { headword: '俺', surface: '俺', reading: 'オレ', partOfSpeech: PartOfSpeech.noun, pos1: '名詞', pos2: '代名詞' },
    { headword: 'と', surface: 'と', reading: 'ト', partOfSpeech: PartOfSpeech.particle, pos1: '助詞', pos2: '格助詞' },
    { headword: '違う', surface: '違う', reading: 'チガウ', partOfSpeech: PartOfSpeech.verb, pos1: '動詞', pos2: '自立' },
    { headword: '事', surface: 'こと', reading: 'コト', partOfSpeech: PartOfSpeech.noun, pos1: '名詞', pos2: '非自立' },
    { headword: 'に', surface: 'に', reading: 'ニ', partOfSpeech: PartOfSpeech.particle, pos1: '助詞', pos2: '格助詞' },
    { headword: '気付く', surface: '気付か', reading: 'キヅカ', partOfSpeech: PartOfSpeech.verb, pos1: '動詞', pos2: '自立' },
    { headword: 'ない', surface: 'ない', reading: 'ナイ', partOfSpeech: PartOfSpeech.bound_auxiliary, pos1: '助動詞', pos2: '*' },
    { headword: 'の', surface: 'の', reading: 'ノ', partOfSpeech: PartOfSpeech.particle, pos1: '助詞', pos2: '終助詞' },
    { headword: 'かい', surface: 'かい', reading: 'カイ', partOfSpeech: PartOfSpeech.particle, pos1: '助詞', pos2: '終助詞' },
    { headword: '?', surface: '?', reading: '', partOfSpeech: PartOfSpeech.symbol, pos1: '記号', pos2: '一般' },
  ];

  const result = await tokenizeSubtitle(
    subtitleText,
    makeDepsFromYomitanTokens(yomitanTokens, {
      getFrequencyDictionaryEnabled: () => true,
      getFrequencyRank: (text) => {
        switch (text) {
          case '違う':
            return 900;
          case '事':
            return 81;
          case '気付く':
            return 1500;
          default:
            return null;
        }
      },
      getJlptLevel: (text) => {
        switch (text) {
          case '違う':
          case '事':
            return 'N4';
          case '気付く':
            return 'N3';
          default:
            return null;
        }
      },
      isKnownWord: (text) => knownWords.has(text),
      getMinSentenceWordsForNPlusOne: () => 1,
      // Build fresh token objects on every call, as an inline literal would.
      tokenizeWithMecab: async () => {
        let cursor = 0;
        return mecabMorphemes.map(({ headword, surface, reading, partOfSpeech, pos1, pos2 }) => {
          const startPos = cursor;
          cursor += surface.length;
          return {
            headword,
            surface,
            reading,
            startPos,
            endPos: cursor,
            partOfSpeech,
            pos1,
            pos2,
            isMerged: false,
            isKnown: false,
            isNPlusOneTarget: false,
          };
        });
      },
    }),
  );

  // Reduce each token to just the fields this test cares about.
  const tokenSummary = result.tokens?.map(
    ({ surface, headword, isKnown, isNPlusOneTarget, frequencyRank, jlptLevel }) => ({
      surface,
      headword,
      isKnown,
      isNPlusOneTarget,
      frequencyRank,
      jlptLevel,
    }),
  );

  // Checked in order: first the stripped ことに token, then the 違う target.
  const expectedSummaries = [
    {
      surface: 'ことに',
      headword: '事',
      isKnown: false,
      isNPlusOneTarget: false,
      frequencyRank: undefined,
      jlptLevel: undefined,
    },
    {
      surface: '違う',
      headword: '違う',
      isKnown: false,
      isNPlusOneTarget: true,
      frequencyRank: 900,
      jlptLevel: 'N4',
    },
  ];

  for (const expected of expectedSummaries) {
    assert.deepEqual(
      tokenSummary?.find((token) => token.surface === expected.surface),
      expected,
    );
  }
});
|
||||
|
||||
test('tokenizeSubtitle excludes default non-independent pos2 from N+1 when JLPT/frequency are disabled', async () => {
|
||||
let mecabCalls = 0;
|
||||
const result = await tokenizeSubtitle(
|
||||
|
||||
Reference in New Issue
Block a user