Mirror of https://github.com/ksyasuda/SubMiner.git
fix: address CodeRabbit review comments
@@ -3086,6 +3086,27 @@ test('tokenizeSubtitle uses Yomitan word classes to classify standalone particle
     assert.equal(result.tokens?.[0]?.jlptLevel, undefined);
 });
 
+test('tokenizeSubtitle uses Yomitan word classes to classify auxiliary subclasses', async () => {
+    const result = await tokenizeSubtitle(
+        'です',
+        makeDepsFromYomitanTokens(
+            [{ surface: 'です', reading: 'です', headword: 'です', wordClasses: ['aux-v'] }],
+            {
+                getFrequencyDictionaryEnabled: () => true,
+                getFrequencyRank: () => 10,
+                getJlptLevel: () => 'N5',
+                tokenizeWithMecab: async () => null,
+            },
+        ),
+    );
+
+    assert.equal(result.tokens?.length, 1);
+    assert.equal(result.tokens?.[0]?.partOfSpeech, PartOfSpeech.bound_auxiliary);
+    assert.equal(result.tokens?.[0]?.pos1, '助動詞');
+    assert.equal(result.tokens?.[0]?.frequencyRank, undefined);
+    assert.equal(result.tokens?.[0]?.jlptLevel, undefined);
+});
+
 test('tokenizeSubtitle fills detailed MeCab POS when Yomitan word class supplies coarse POS', async () => {
     const result = await tokenizeSubtitle(
         'は',
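Note: the new です test stubs getFrequencyRank to return 10 and getJlptLevel to return 'N5', yet asserts that both frequencyRank and jlptLevel come back undefined, which suggests that tokens classified as function words from Yomitan word classes are excluded from frequency/JLPT annotation. A minimal sketch of that kind of guard, with purely illustrative names (annotateFrequency, SketchToken, SketchDeps are not the project's actual identifiers):

// Hypothetical sketch, not the repository's code: bound auxiliaries and particles
// skip the frequency/JLPT lookups, so です keeps frequencyRank and jlptLevel
// undefined even though the injected deps could supply values for them.
type SketchToken = { headword: string; pos1: string; frequencyRank?: number; jlptLevel?: string };
type SketchDeps = {
    getFrequencyRank: (headword: string) => number | undefined;
    getJlptLevel: (headword: string) => string | undefined;
};

function annotateFrequency(token: SketchToken, deps: SketchDeps): SketchToken {
    if (token.pos1 === '助動詞' || token.pos1 === '助詞') {
        return token; // function word: leave frequencyRank/jlptLevel unset
    }
    return {
        ...token,
        frequencyRank: deps.getFrequencyRank(token.headword),
        jlptLevel: deps.getJlptLevel(token.headword),
    };
}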
@@ -359,7 +359,7 @@ function resolvePartOfSpeechFromYomitanWordClasses(wordClasses: string[]): {
     if (wordClasses.includes('prt')) {
         return { partOfSpeech: PartOfSpeech.particle, pos1: '助詞' };
     }
-    if (wordClasses.includes('aux')) {
+    if (wordClasses.some((wordClass) => wordClass === 'aux' || wordClass.startsWith('aux-'))) {
         return { partOfSpeech: PartOfSpeech.bound_auxiliary, pos1: '助動詞' };
     }
     if (wordClasses.some((wordClass) => wordClass.startsWith('v'))) {
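The one-line change above is the heart of the fix: wordClasses.includes('aux') only matched the literal 'aux' tag, so subclass tags such as 'aux-v' (used by the new です test) never reached the auxiliary branch. The new predicate accepts 'aux' plus anything in the 'aux-' family. A quick standalone illustration of the predicate's behavior (the sample tags are drawn from the JMdict/Yomitan part-of-speech set and are not an exhaustive list):

const isAuxiliaryWordClass = (wordClasses: string[]): boolean =>
    wordClasses.some((wordClass) => wordClass === 'aux' || wordClass.startsWith('aux-'));

console.log(isAuxiliaryWordClass(['aux']));     // true  - matched before and after the change
console.log(isAuxiliaryWordClass(['aux-v']));   // true  - previously fell through with includes('aux')
console.log(isAuxiliaryWordClass(['aux-adj'])); // true  - same 'aux-' family
console.log(isAuxiliaryWordClass(['v1']));      // false - handled by the startsWith('v') branch instead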
@@ -713,6 +713,57 @@ test('annotateTokens N+1 sentence word count respects source punctuation gaps om
     assert.equal(result[3]?.isNPlusOneTarget, false);
 });
 
+test('annotateTokens N+1 sentence word count normalizes line breaks before gap detection', () => {
+    const tokens = [
+        makeToken({
+            surface: '私',
+            headword: '私',
+            pos1: '名詞',
+            startPos: 0,
+            endPos: 1,
+        }),
+        makeToken({
+            surface: '猫',
+            headword: '猫',
+            pos1: '名詞',
+            startPos: 2,
+            endPos: 3,
+        }),
+        makeToken({
+            surface: '犬',
+            headword: '犬',
+            pos1: '名詞',
+            startPos: 3,
+            endPos: 4,
+        }),
+        makeToken({
+            surface: 'ふざけん',
+            headword: 'ふざける',
+            partOfSpeech: PartOfSpeech.verb,
+            pos1: '動詞',
+            pos2: '自立',
+            startPos: 5,
+            endPos: 9,
+        }),
+    ];
+
+    const result = annotateTokens(
+        tokens,
+        makeDeps({
+            isKnownWord: (text) => text === '私' || text === '猫' || text === '犬',
+        }),
+        {
+            minSentenceWordsForNPlusOne: 3,
+            sourceText: '私\r\n猫犬!ふざけんなよ!',
+        },
+    );
+
+    assert.equal(result[0]?.isNPlusOneTarget, false);
+    assert.equal(result[1]?.isNPlusOneTarget, false);
+    assert.equal(result[2]?.isNPlusOneTarget, false);
+    assert.equal(result[3]?.isNPlusOneTarget, false);
+});
+
 test('annotateTokens applies configured pos1 exclusions to both frequency and N+1', () => {
     const tokens = [
         makeToken({
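In the new line-break test above, sourceText contains a Windows-style '\r\n' between 私 and 猫, while the token offsets (私 ends at 1, 猫 starts at 2) leave room for only one character between them. That only lines up if the annotator collapses the line break before measuring the punctuation and whitespace gaps that bound a sentence for the N+1 word count. A minimal sketch of such a normalization step; normalizeLineBreaks is a hypothetical name, not necessarily the project's actual helper:

// Hypothetical sketch: fold CRLF (and bare CR) into LF so character offsets in
// the normalized text match token startPos/endPos, and a line break counts as
// a single gap character during sentence-boundary detection.
function normalizeLineBreaks(sourceText: string): string {
    return sourceText.replace(/\r\n/g, '\n').replace(/\r/g, '\n');
}

normalizeLineBreaks('私\r\n猫犬!ふざけんなよ!'); // => '私\n猫犬!ふざけんなよ!'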