mirror of
https://github.com/ksyasuda/SubMiner.git
synced 2026-05-13 20:12:54 -07:00
fix: retain frequency rank for honorific prefix-noun tokens
- Add `shouldAllowHonorificPrefixNounFrequency` to exempt お/ご/御 + noun merged tokens from frequency exclusion
- Add regression test for `ご機嫌` asserting rank 5484 is preserved after MeCab enrichment and annotation
- Close TASK-341
This commit is contained in:
@@ -4140,6 +4140,96 @@ test('tokenizeSubtitle keeps frequency for ordinal prefix-noun tokens', async ()
|
||||
assert.equal(result.tokens?.[0]?.frequencyRank, 1820);
|
||||
});
|
||||
|
||||
// Regression test: the Yomitan token `ご機嫌` is backed by two MeCab tokens
// (prefix `ご` tagged 接頭詞/名詞接続, then noun `機嫌` tagged 名詞/一般). The
// resulting first token must expose the joined POS tags and still carry the
// frequency rank supplied by the dictionary stub.
test('tokenizeSubtitle keeps frequency for honorific prefix-noun tokens', async () => {
  // Flags shared by every raw MeCab token in this fixture; spread last so each
  // token literal is still a fresh object with the same key order.
  const unannotated = { isMerged: false, isKnown: false, isNPlusOneTarget: false };

  const result = await tokenizeSubtitle(
    'ご機嫌が良くない',
    makeDepsFromYomitanTokens(
      [
        { surface: 'ご機嫌', reading: 'ごきげん', headword: 'ご機嫌' },
        { surface: 'が', reading: 'が', headword: 'が' },
        { surface: '良くない', reading: 'よくない', headword: '良い' },
      ],
      {
        getFrequencyDictionaryEnabled: () => true,
        // Only the full honorific form has a rank in this stub dictionary.
        getFrequencyRank: (text) => (text === 'ご機嫌' ? 5484 : null),
        tokenizeWithMecab: async () => [
          {
            headword: 'ご',
            surface: 'ご',
            reading: 'ゴ',
            startPos: 0,
            endPos: 1,
            partOfSpeech: PartOfSpeech.other,
            pos1: '接頭詞',
            pos2: '名詞接続',
            ...unannotated,
          },
          {
            headword: '機嫌',
            surface: '機嫌',
            reading: 'キゲン',
            startPos: 1,
            endPos: 3,
            partOfSpeech: PartOfSpeech.noun,
            pos1: '名詞',
            pos2: '一般',
            ...unannotated,
          },
          {
            headword: 'が',
            surface: 'が',
            reading: 'ガ',
            startPos: 3,
            endPos: 4,
            partOfSpeech: PartOfSpeech.particle,
            pos1: '助詞',
            pos2: '格助詞',
            ...unannotated,
          },
          {
            headword: '良い',
            surface: '良く',
            reading: 'ヨク',
            startPos: 4,
            endPos: 6,
            partOfSpeech: PartOfSpeech.i_adjective,
            pos1: '形容詞',
            pos2: '自立',
            ...unannotated,
          },
          {
            headword: 'ない',
            surface: 'ない',
            reading: 'ナイ',
            startPos: 6,
            endPos: 8,
            partOfSpeech: PartOfSpeech.bound_auxiliary,
            pos1: '助動詞',
            pos2: '*',
            ...unannotated,
          },
        ],
        getMinSentenceWordsForNPlusOne: () => 1,
      },
    ),
  );

  const firstToken = result.tokens?.[0];
  assert.equal(firstToken?.surface, 'ご機嫌');
  assert.equal(firstToken?.pos1, '接頭詞|名詞');
  assert.equal(firstToken?.pos2, '名詞接続|一般');
  assert.equal(firstToken?.frequencyRank, 5484);
});
|
||||
|
||||
test('tokenizeSubtitle clears all annotations for explanatory contrast endings', async () => {
|
||||
const result = await tokenizeSubtitle(
|
||||
'最近辛いものが続いとるんですけど',
|
||||
|
||||
@@ -167,6 +167,27 @@ function shouldAllowOrdinalPrefixNounFrequency(token: MergedToken): boolean {
|
||||
);
|
||||
}
|
||||
|
||||
/**
 * Decides whether a token looks like an honorific prefix (お / ご / 御) merged
 * with a following noun.
 *
 * The token qualifies only when all of the following hold:
 * - its trimmed surface or trimmed headword starts with お, ご or 御;
 * - its normalized, split pos1 tag has at least two parts, the first being
 *   接頭詞 and some later part being 名詞;
 * - the first part of its normalized, split pos2 tag is 名詞接続.
 *
 * @param token - Token whose surface, headword, pos1 and pos2 are inspected.
 * @returns `true` when every condition above is met, otherwise `false`.
 */
function shouldAllowHonorificPrefixNounFrequency(token: MergedToken): boolean {
  const surface = token.surface.trim();
  const headword = token.headword.trim();

  let startsWithHonorific = false;
  for (const honorific of ['お', 'ご', '御']) {
    if (surface.startsWith(honorific) || headword.startsWith(honorific)) {
      startsWithHonorific = true;
      break;
    }
  }
  if (!startsWithHonorific) {
    return false;
  }

  const pos1Parts = splitNormalizedTagParts(normalizePos1Tag(token.pos1));
  const pos2Parts = splitNormalizedTagParts(normalizePos2Tag(token.pos2));

  // The leading part must be the prefix tag, with a noun part somewhere after it.
  if (pos1Parts.length < 2 || pos1Parts[0] !== '接頭詞') {
    return false;
  }
  if (!pos1Parts.includes('名詞', 1)) {
    return false;
  }
  return pos2Parts[0] === '名詞接続';
}
|
||||
|
||||
function isFrequencyExcludedByPos(
|
||||
token: MergedToken,
|
||||
pos1Exclusions: ReadonlySet<string>,
|
||||
@@ -187,11 +208,13 @@ function isFrequencyExcludedByPos(
|
||||
pos2Exclusions,
|
||||
);
|
||||
const allowOrdinalPrefixNounToken = shouldAllowOrdinalPrefixNounFrequency(token);
|
||||
const allowHonorificPrefixNounToken = shouldAllowHonorificPrefixNounFrequency(token);
|
||||
|
||||
if (
|
||||
isExcludedByTagSet(normalizedPos1, pos1Exclusions) &&
|
||||
!allowContentLedMergedToken &&
|
||||
!allowOrdinalPrefixNounToken
|
||||
!allowOrdinalPrefixNounToken &&
|
||||
!allowHonorificPrefixNounToken
|
||||
) {
|
||||
return true;
|
||||
}
|
||||
@@ -199,7 +222,8 @@ function isFrequencyExcludedByPos(
|
||||
if (
|
||||
isExcludedByTagSet(normalizedPos2, pos2Exclusions) &&
|
||||
!allowContentLedMergedToken &&
|
||||
!allowOrdinalPrefixNounToken
|
||||
!allowOrdinalPrefixNounToken &&
|
||||
!allowHonorificPrefixNounToken
|
||||
) {
|
||||
return true;
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user