feat(tokenizer): use Yomitan word classes for subtitle POS filtering

- Carry matched headword wordClasses from termsFind into YomitanScanToken
- Map recognized Yomitan wordClasses to SubMiner coarse POS before annotation (see the first sketch below)
- MeCab enrichment now fills only missing POS fields, preserving existing coarse pos1
- Exclude standalone grammar particles, して helper fragments, and single-kana surfaces from annotations (see the second sketch below)
- Respect source-text punctuation gaps when counting N+1 sentence words
- Preserve known-word highlight on excluded kanji-containing tokens
- Add backlog tasks 304 (N+1 boundary bug) and 305 (wordClasses POS, done)
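
A minimal sketch of the wordClasses-to-coarse-POS mapping and the fill-only-missing MeCab merge described in the bullets above. The PartOfSpeech values, the 'prt' to particle mapping, and the preserved pos1 '助詞' come from the new tests in the diff; every other mapping entry, the default-pos1 table, and all helper names are illustrative assumptions, not the commit's actual code.

```ts
// Sketch only: names and all mapping entries other than 'prt' are assumptions.
enum PartOfSpeech {
  noun = 'noun',
  verb = 'verb',
  adjective = 'adjective',
  particle = 'particle',
  other = 'other',
}

// Yomitan carries JMdict-style word-class tags on matched headwords.
const WORD_CLASS_TO_COARSE_POS: Record<string, PartOfSpeech> = {
  prt: PartOfSpeech.particle,       // exercised by the new tests in the diff
  n: PartOfSpeech.noun,             // assumed
  'adj-i': PartOfSpeech.adjective,  // assumed
  v1: PartOfSpeech.verb,            // assumed
  v5r: PartOfSpeech.verb,           // assumed
};

// The first new test shows a 'prt' match ends up with pos1 '助詞' even when
// MeCab returns nothing, so a default pos1 per coarse POS is assumed here.
const COARSE_POS_TO_DEFAULT_POS1: Partial<Record<PartOfSpeech, string>> = {
  [PartOfSpeech.particle]: '助詞',
  [PartOfSpeech.noun]: '名詞', // assumed
};

function coarsePosFromWordClasses(wordClasses?: string[]): PartOfSpeech | undefined {
  for (const wordClass of wordClasses ?? []) {
    const mapped: PartOfSpeech | undefined = WORD_CLASS_TO_COARSE_POS[wordClass];
    if (mapped !== undefined) return mapped;
  }
  return undefined;
}

interface PosFields {
  partOfSpeech?: PartOfSpeech;
  pos1?: string;
  pos2?: string;
  pos3?: string;
}

// MeCab enrichment fills only the fields the Yomitan pass left empty, so a
// wordClasses-derived pos1 such as '助詞' is preserved while detail fields
// like pos2 '係助詞' can still be filled in (matches the second new test).
function fillMissingPosFromMecab(token: PosFields, mecab: PosFields): void {
  token.partOfSpeech ??= mecab.partOfSpeech;
  token.pos1 ??= mecab.pos1;
  token.pos2 ??= mecab.pos2;
  token.pos3 ??= mecab.pos3;
}
```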
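
A second sketch for the annotation-exclusion rule in the bullets above. Only the observable behaviour is taken from the commit message and tests (standalone particles, して fragments, and single-kana surfaces drop N+1/frequency/JLPT annotations, while the known-word highlight survives exclusion); the predicate name, the kana regex, and the literal して check are assumptions for illustration.

```ts
// Sketch only: an illustrative exclusion predicate, not the commit's code.
const SINGLE_KANA = /^[\u3041-\u309F\u30A1-\u30FA\u30FC]$/; // one hiragana/katakana char (assumed)

interface AnnotatedToken {
  surface: string;
  pos1?: string;
  isKnown: boolean;
  isNPlusOneTarget: boolean;
  frequencyRank?: number;
  jlptLevel?: string;
}

function isExcludedFromAnnotations(token: AnnotatedToken): boolean {
  if (token.pos1 === '助詞') return true;             // standalone grammar particles
  if (token.surface === 'して') return true;           // して helper fragments (assumed literal check)
  if (SINGLE_KANA.test(token.surface)) return true;   // single-kana surfaces
  return false;
}

// Excluded tokens lose N+1 targeting, frequency rank and JLPT level, but the
// known-word highlight is left untouched; the 'その点' test asserts the same
// preservation for a token excluded by the pre-existing non-independent pos2 rule.
function clearAnnotations(token: AnnotatedToken): AnnotatedToken {
  if (!isExcludedFromAnnotations(token)) return token;
  return {
    ...token,
    isNPlusOneTarget: false,
    frequencyRank: undefined,
    jlptLevel: undefined,
    // isKnown deliberately preserved
  };
}
```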
2026-04-25 23:08:33 -07:00
parent 30712738dc
commit 544a770c09
11 changed files with 925 additions and 40 deletions
@@ -25,6 +25,7 @@ interface YomitanTokenInput {
  reading?: string;
  headword?: string;
  isNameMatch?: boolean;
+  wordClasses?: string[];
}
function makeDepsFromYomitanTokens(
@@ -55,6 +56,7 @@ function makeDepsFromYomitanTokens(
          startPos,
          endPos,
          isNameMatch: token.isNameMatch ?? false,
+          wordClasses: token.wordClasses,
        };
      });
    },
@@ -1552,7 +1554,7 @@ test('tokenizeSubtitle assigns JLPT level to Yomitan tokens', async () => {
  assert.equal(result.tokens?.[0]?.jlptLevel, 'N4');
});

-test('tokenizeSubtitle can assign JLPT level to Yomitan particle token', async () => {
+test('tokenizeSubtitle clears JLPT level from standalone Yomitan particle token', async () => {
  const result = await tokenizeSubtitle(
    'は',
    makeDepsFromYomitanTokens([{ surface: 'は', reading: 'は', headword: 'は' }], {
@@ -1561,7 +1563,7 @@ test('tokenizeSubtitle can assign JLPT level to Yomitan particle token', async (
  );
  assert.equal(result.tokens?.length, 1);
-  assert.equal(result.tokens?.[0]?.jlptLevel, 'N5');
+  assert.equal(result.tokens?.[0]?.jlptLevel, undefined);
});
test('tokenizeSubtitle returns null tokens for empty normalized text', async () => {
@@ -3034,6 +3036,58 @@ test('tokenizeSubtitle skips all enrichment stages when disabled', async () => {
  assert.equal(frequencyCalls, 0);
});

+test('tokenizeSubtitle uses Yomitan word classes to classify standalone particles', async () => {
+  let mecabCalls = 0;
+  const result = await tokenizeSubtitle(
+    'は',
+    makeDepsFromYomitanTokens([{ surface: 'は', reading: 'は', headword: 'は', wordClasses: ['prt'] }], {
+      getFrequencyDictionaryEnabled: () => true,
+      getFrequencyRank: (text) => (text === 'は' ? 10 : null),
+      getJlptLevel: (text) => (text === 'は' ? 'N5' : null),
+      tokenizeWithMecab: async () => {
+        mecabCalls += 1;
+        return null;
+      },
+    }),
+  );
+  assert.equal(mecabCalls, 1);
+  assert.equal(result.tokens?.length, 1);
+  assert.equal(result.tokens?.[0]?.partOfSpeech, PartOfSpeech.particle);
+  assert.equal(result.tokens?.[0]?.pos1, '助詞');
+  assert.equal(result.tokens?.[0]?.isNPlusOneTarget, false);
+  assert.equal(result.tokens?.[0]?.frequencyRank, undefined);
+  assert.equal(result.tokens?.[0]?.jlptLevel, undefined);
+});
+
+test('tokenizeSubtitle fills detailed MeCab POS when Yomitan word class supplies coarse POS', async () => {
+  const result = await tokenizeSubtitle(
+    'は',
+    makeDepsFromYomitanTokens([{ surface: 'は', reading: 'は', headword: 'は', wordClasses: ['prt'] }], {
+      tokenizeWithMecab: async () => [
+        {
+          headword: 'は',
+          surface: 'は',
+          reading: 'ハ',
+          startPos: 0,
+          endPos: 1,
+          partOfSpeech: PartOfSpeech.particle,
+          pos1: '助詞',
+          pos2: '係助詞',
+          pos3: '*',
+          isMerged: false,
+          isKnown: false,
+          isNPlusOneTarget: false,
+        },
+      ],
+    }),
+  );
+  assert.equal(result.tokens?.[0]?.partOfSpeech, PartOfSpeech.particle);
+  assert.equal(result.tokens?.[0]?.pos1, '助詞');
+  assert.equal(result.tokens?.[0]?.pos2, '係助詞');
+});

test('tokenizeSubtitle keeps frequency enrichment while n+1 is disabled', async () => {
  let knownCalls = 0;
  let mecabCalls = 0;
@@ -3110,6 +3164,60 @@ test('tokenizeSubtitle excludes default non-independent pos2 from N+1 and freque
  assert.equal(result.tokens?.[0]?.isNPlusOneTarget, false);
});

+test('tokenizeSubtitle preserves known-word highlight for exact non-independent kanji noun tokens', async () => {
+  const result = await tokenizeSubtitle(
+    'その点',
+    makeDepsFromYomitanTokens(
+      [
+        { surface: 'その', reading: 'その', headword: 'その' },
+        { surface: '点', reading: 'てん', headword: '点' },
+      ],
+      {
+        isKnownWord: (text) => text === '点' || text === 'てん',
+        getFrequencyDictionaryEnabled: () => true,
+        getFrequencyRank: (text) => (text === '点' ? 1384 : null),
+        getJlptLevel: (text) => (text === '点' ? 'N3' : null),
+        tokenizeWithMecab: async () => [
+          {
+            headword: 'その',
+            surface: 'その',
+            reading: 'ソノ',
+            startPos: 0,
+            endPos: 2,
+            partOfSpeech: PartOfSpeech.other,
+            pos1: '連体詞',
+            isMerged: false,
+            isKnown: false,
+            isNPlusOneTarget: false,
+          },
+          {
+            headword: '点',
+            surface: '点',
+            reading: 'テン',
+            startPos: 2,
+            endPos: 3,
+            partOfSpeech: PartOfSpeech.noun,
+            pos1: '名詞',
+            pos2: '非自立',
+            pos3: '一般',
+            isMerged: false,
+            isKnown: false,
+            isNPlusOneTarget: false,
+          },
+        ],
+      },
+    ),
+  );
+  assert.equal(result.tokens?.length, 2);
+  assert.equal(result.tokens?.[0]?.isKnown, false);
+  assert.equal(result.tokens?.[1]?.surface, '点');
+  assert.equal(result.tokens?.[1]?.isKnown, true);
+  assert.equal(result.tokens?.[1]?.isNPlusOneTarget, false);
+  assert.equal(result.tokens?.[1]?.frequencyRank, undefined);
+  assert.equal(result.tokens?.[1]?.jlptLevel, undefined);
+});

test('tokenizeSubtitle keeps mecab-tagged interjections tokenized while clearing annotation metadata', async () => {
  const result = await tokenizeSubtitle(
    'ぐはっ',