mirror of
https://github.com/ksyasuda/SubMiner.git
synced 2026-03-06 19:57:26 -08:00
fix(tokenizer): tighten frequency highlighting exclusions
This commit is contained in:
@@ -972,6 +972,34 @@ test('tokenizeSubtitle skips frequency rank when Yomitan token is enriched as pa
|
|||||||
assert.equal(result.tokens?.[0]?.frequencyRank, undefined);
|
assert.equal(result.tokens?.[0]?.frequencyRank, undefined);
|
||||||
});
|
});
|
||||||
|
|
||||||
|
test('tokenizeSubtitle keeps frequency rank when mecab tags classify token as content-bearing', async () => {
|
||||||
|
const result = await tokenizeSubtitle(
|
||||||
|
'ふふ',
|
||||||
|
makeDepsFromYomitanTokens([{ surface: 'ふふ', reading: '', headword: 'ふふ' }], {
|
||||||
|
getFrequencyDictionaryEnabled: () => true,
|
||||||
|
getFrequencyRank: (text) => (text === 'ふふ' ? 3014 : null),
|
||||||
|
tokenizeWithMecab: async () => [
|
||||||
|
{
|
||||||
|
headword: 'ふふ',
|
||||||
|
surface: 'ふふ',
|
||||||
|
reading: 'フフ',
|
||||||
|
startPos: 0,
|
||||||
|
endPos: 2,
|
||||||
|
partOfSpeech: PartOfSpeech.verb,
|
||||||
|
pos1: '動詞',
|
||||||
|
pos2: '自立',
|
||||||
|
isMerged: false,
|
||||||
|
isKnown: false,
|
||||||
|
isNPlusOneTarget: false,
|
||||||
|
},
|
||||||
|
],
|
||||||
|
}),
|
||||||
|
);
|
||||||
|
|
||||||
|
assert.equal(result.tokens?.length, 1);
|
||||||
|
assert.equal(result.tokens?.[0]?.frequencyRank, 3014);
|
||||||
|
});
|
||||||
|
|
||||||
test('tokenizeSubtitle ignores invalid frequency ranks', async () => {
|
test('tokenizeSubtitle ignores invalid frequency ranks', async () => {
|
||||||
const result = await tokenizeSubtitle(
|
const result = await tokenizeSubtitle(
|
||||||
'猫',
|
'猫',
|
||||||
@@ -2400,7 +2428,7 @@ test('tokenizeSubtitle excludes default non-independent pos2 from N+1 and freque
|
|||||||
assert.equal(result.tokens?.[0]?.isNPlusOneTarget, false);
|
assert.equal(result.tokens?.[0]?.isNPlusOneTarget, false);
|
||||||
});
|
});
|
||||||
|
|
||||||
test('tokenizeSubtitle keeps merged token when overlap contains at least one content pos1 tag', async () => {
|
test('tokenizeSubtitle excludes merged function/content token from frequency highlighting but keeps N+1', async () => {
|
||||||
const result = await tokenizeSubtitle(
|
const result = await tokenizeSubtitle(
|
||||||
'になれば',
|
'になれば',
|
||||||
makeDepsFromYomitanTokens([{ surface: 'になれば', reading: 'になれば', headword: 'なる' }], {
|
makeDepsFromYomitanTokens([{ surface: 'になれば', reading: 'になれば', headword: 'なる' }], {
|
||||||
@@ -2453,7 +2481,7 @@ test('tokenizeSubtitle keeps merged token when overlap contains at least one con
|
|||||||
|
|
||||||
assert.equal(result.tokens?.length, 1);
|
assert.equal(result.tokens?.length, 1);
|
||||||
assert.equal(result.tokens?.[0]?.pos1, '助詞|動詞');
|
assert.equal(result.tokens?.[0]?.pos1, '助詞|動詞');
|
||||||
assert.equal(result.tokens?.[0]?.frequencyRank, 13);
|
assert.equal(result.tokens?.[0]?.frequencyRank, undefined);
|
||||||
assert.equal(result.tokens?.[0]?.isNPlusOneTarget, true);
|
assert.equal(result.tokens?.[0]?.isNPlusOneTarget, true);
|
||||||
});
|
});
|
||||||
|
|
||||||
|
|||||||
@@ -314,6 +314,26 @@ test('annotateTokens excludes likely kana SFX tokens from frequency when POS tag
|
|||||||
assert.equal(result[0]?.frequencyRank, undefined);
|
assert.equal(result[0]?.frequencyRank, undefined);
|
||||||
});
|
});
|
||||||
|
|
||||||
|
test('annotateTokens keeps frequency when mecab tags classify token as content-bearing', () => {
|
||||||
|
const tokens = [
|
||||||
|
makeToken({
|
||||||
|
surface: 'ふふ',
|
||||||
|
headword: 'ふふ',
|
||||||
|
pos1: '動詞',
|
||||||
|
pos2: '自立',
|
||||||
|
frequencyRank: 3014,
|
||||||
|
startPos: 0,
|
||||||
|
endPos: 2,
|
||||||
|
}),
|
||||||
|
];
|
||||||
|
|
||||||
|
const result = annotateTokens(tokens, makeDeps(), {
|
||||||
|
minSentenceWordsForNPlusOne: 1,
|
||||||
|
});
|
||||||
|
|
||||||
|
assert.equal(result[0]?.frequencyRank, 3014);
|
||||||
|
});
|
||||||
|
|
||||||
test('annotateTokens allows previously default-excluded pos2 when removed from effective set', () => {
|
test('annotateTokens allows previously default-excluded pos2 when removed from effective set', () => {
|
||||||
const tokens = [
|
const tokens = [
|
||||||
makeToken({
|
makeToken({
|
||||||
@@ -337,7 +357,7 @@ test('annotateTokens allows previously default-excluded pos2 when removed from e
|
|||||||
assert.equal(result[0]?.isNPlusOneTarget, true);
|
assert.equal(result[0]?.isNPlusOneTarget, true);
|
||||||
});
|
});
|
||||||
|
|
||||||
test('annotateTokens keeps composite tokens when any component pos tag is content-bearing', () => {
|
test('annotateTokens excludes composite function/content tokens from frequency but keeps N+1 eligible', () => {
|
||||||
const tokens = [
|
const tokens = [
|
||||||
makeToken({
|
makeToken({
|
||||||
surface: 'になれば',
|
surface: 'になれば',
|
||||||
@@ -354,7 +374,7 @@ test('annotateTokens keeps composite tokens when any component pos tag is conten
|
|||||||
minSentenceWordsForNPlusOne: 1,
|
minSentenceWordsForNPlusOne: 1,
|
||||||
});
|
});
|
||||||
|
|
||||||
assert.equal(result[0]?.frequencyRank, 5);
|
assert.equal(result[0]?.frequencyRank, undefined);
|
||||||
assert.equal(result[0]?.isNPlusOneTarget, true);
|
assert.equal(result[0]?.isNPlusOneTarget, true);
|
||||||
});
|
});
|
||||||
|
|
||||||
|
|||||||
@@ -73,8 +73,9 @@ function isExcludedByTagSet(normalizedTag: string, exclusions: ReadonlySet<strin
|
|||||||
if (parts.length === 0) {
|
if (parts.length === 0) {
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
// Composite tags like "助詞|名詞" stay eligible unless every component is excluded.
|
// Frequency highlighting should be conservative: if any merged component is excluded,
|
||||||
return parts.every((part) => exclusions.has(part));
|
// skip highlighting the whole token to avoid noisy merged fragments.
|
||||||
|
return parts.some((part) => exclusions.has(part));
|
||||||
}
|
}
|
||||||
|
|
||||||
function resolvePos1Exclusions(options: AnnotationStageOptions): ReadonlySet<string> {
|
function resolvePos1Exclusions(options: AnnotationStageOptions): ReadonlySet<string> {
|
||||||
|
|||||||
@@ -39,6 +39,30 @@ test('enrichTokensWithMecabPos1 fills missing pos1 using surface-sequence fallba
|
|||||||
assert.equal(enriched[0]?.pos1, '助詞');
|
assert.equal(enriched[0]?.pos1, '助詞');
|
||||||
});
|
});
|
||||||
|
|
||||||
|
test('enrichTokensWithMecabPos1 keeps partOfSpeech unchanged and only enriches POS tags', () => {
|
||||||
|
const tokens = [makeToken({ surface: 'これは', startPos: 0, endPos: 3 })];
|
||||||
|
const mecabTokens = [
|
||||||
|
makeToken({
|
||||||
|
surface: 'これ',
|
||||||
|
startPos: 0,
|
||||||
|
endPos: 2,
|
||||||
|
pos1: '名詞',
|
||||||
|
partOfSpeech: PartOfSpeech.noun,
|
||||||
|
}),
|
||||||
|
makeToken({
|
||||||
|
surface: 'は',
|
||||||
|
startPos: 2,
|
||||||
|
endPos: 3,
|
||||||
|
pos1: '助詞',
|
||||||
|
partOfSpeech: PartOfSpeech.particle,
|
||||||
|
}),
|
||||||
|
];
|
||||||
|
|
||||||
|
const enriched = enrichTokensWithMecabPos1(tokens, mecabTokens);
|
||||||
|
assert.equal(enriched[0]?.pos1, '名詞|助詞');
|
||||||
|
assert.equal(enriched[0]?.partOfSpeech, PartOfSpeech.other);
|
||||||
|
});
|
||||||
|
|
||||||
test('enrichTokensWithMecabPos1 passes through unchanged when mecab tokens are null or empty', () => {
|
test('enrichTokensWithMecabPos1 passes through unchanged when mecab tokens are null or empty', () => {
|
||||||
const tokens = [makeToken({ surface: '猫', startPos: 0, endPos: 1 })];
|
const tokens = [makeToken({ surface: '猫', startPos: 0, endPos: 1 })];
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user