feat(tokenizer): use Yomitan word classes for subtitle POS filtering

- Carry matched headword wordClasses from termsFind into YomitanScanToken
- Map recognized Yomitan wordClasses to SubMiner coarse POS before annotation
- MeCab enrichment now fills only missing POS fields, preserving existing coarse pos1
- Exclude standalone grammar particles, して helper fragments, and single-kana surfaces from annotations
- Respect source-text punctuation gaps when counting N+1 sentence words
- Preserve known-word highlight on excluded kanji-containing tokens
- Add backlog tasks 304 (N+1 boundary bug) and 305 (wordClasses POS, done)
This commit is contained in:
2026-04-25 23:08:33 -07:00
parent 53aa58d044
commit 96894ff85c
11 changed files with 926 additions and 39 deletions

View File

@@ -25,6 +25,7 @@ interface YomitanTokenInput {
reading?: string;
headword?: string;
isNameMatch?: boolean;
wordClasses?: string[];
}
function makeDepsFromYomitanTokens(
@@ -55,6 +56,7 @@ function makeDepsFromYomitanTokens(
startPos,
endPos,
isNameMatch: token.isNameMatch ?? false,
wordClasses: token.wordClasses,
};
});
},
@@ -1552,7 +1554,7 @@ test('tokenizeSubtitle assigns JLPT level to Yomitan tokens', async () => {
assert.equal(result.tokens?.[0]?.jlptLevel, 'N4');
});
test('tokenizeSubtitle can assign JLPT level to Yomitan particle token', async () => {
test('tokenizeSubtitle clears JLPT level from standalone Yomitan particle token', async () => {
const result = await tokenizeSubtitle(
'は',
makeDepsFromYomitanTokens([{ surface: 'は', reading: 'は', headword: 'は' }], {
@@ -1561,7 +1563,7 @@ test('tokenizeSubtitle can assign JLPT level to Yomitan particle token', async (
);
assert.equal(result.tokens?.length, 1);
assert.equal(result.tokens?.[0]?.jlptLevel, 'N5');
assert.equal(result.tokens?.[0]?.jlptLevel, undefined);
});
test('tokenizeSubtitle returns null tokens for empty normalized text', async () => {
@@ -3034,6 +3036,58 @@ test('tokenizeSubtitle skips all enrichment stages when disabled', async () => {
assert.equal(frequencyCalls, 0);
});
test('tokenizeSubtitle uses Yomitan word classes to classify standalone particles', async () => {
let mecabCalls = 0;
const result = await tokenizeSubtitle(
'は',
makeDepsFromYomitanTokens([{ surface: 'は', reading: 'は', headword: 'は', wordClasses: ['prt'] }], {
getFrequencyDictionaryEnabled: () => true,
getFrequencyRank: (text) => (text === 'は' ? 10 : null),
getJlptLevel: (text) => (text === 'は' ? 'N5' : null),
tokenizeWithMecab: async () => {
mecabCalls += 1;
return null;
},
}),
);
assert.equal(mecabCalls, 1);
assert.equal(result.tokens?.length, 1);
assert.equal(result.tokens?.[0]?.partOfSpeech, PartOfSpeech.particle);
assert.equal(result.tokens?.[0]?.pos1, '助詞');
assert.equal(result.tokens?.[0]?.isNPlusOneTarget, false);
assert.equal(result.tokens?.[0]?.frequencyRank, undefined);
assert.equal(result.tokens?.[0]?.jlptLevel, undefined);
});
test('tokenizeSubtitle fills detailed MeCab POS when Yomitan word class supplies coarse POS', async () => {
const result = await tokenizeSubtitle(
'は',
makeDepsFromYomitanTokens([{ surface: 'は', reading: 'は', headword: 'は', wordClasses: ['prt'] }], {
tokenizeWithMecab: async () => [
{
headword: 'は',
surface: 'は',
reading: 'ハ',
startPos: 0,
endPos: 1,
partOfSpeech: PartOfSpeech.particle,
pos1: '助詞',
pos2: '係助詞',
pos3: '*',
isMerged: false,
isKnown: false,
isNPlusOneTarget: false,
},
],
}),
);
assert.equal(result.tokens?.[0]?.partOfSpeech, PartOfSpeech.particle);
assert.equal(result.tokens?.[0]?.pos1, '助詞');
assert.equal(result.tokens?.[0]?.pos2, '係助詞');
});
test('tokenizeSubtitle keeps frequency enrichment while n+1 is disabled', async () => {
let knownCalls = 0;
let mecabCalls = 0;
@@ -3110,6 +3164,60 @@ test('tokenizeSubtitle excludes default non-independent pos2 from N+1 and freque
assert.equal(result.tokens?.[0]?.isNPlusOneTarget, false);
});
test('tokenizeSubtitle preserves known-word highlight for exact non-independent kanji noun tokens', async () => {
const result = await tokenizeSubtitle(
'その点',
makeDepsFromYomitanTokens(
[
{ surface: 'その', reading: 'その', headword: 'その' },
{ surface: '点', reading: 'てん', headword: '点' },
],
{
isKnownWord: (text) => text === '点' || text === 'てん',
getFrequencyDictionaryEnabled: () => true,
getFrequencyRank: (text) => (text === '点' ? 1384 : null),
getJlptLevel: (text) => (text === '点' ? 'N3' : null),
tokenizeWithMecab: async () => [
{
headword: 'その',
surface: 'その',
reading: 'ソノ',
startPos: 0,
endPos: 2,
partOfSpeech: PartOfSpeech.other,
pos1: '連体詞',
isMerged: false,
isKnown: false,
isNPlusOneTarget: false,
},
{
headword: '点',
surface: '点',
reading: 'テン',
startPos: 2,
endPos: 3,
partOfSpeech: PartOfSpeech.noun,
pos1: '名詞',
pos2: '非自立',
pos3: '一般',
isMerged: false,
isKnown: false,
isNPlusOneTarget: false,
},
],
},
),
);
assert.equal(result.tokens?.length, 2);
assert.equal(result.tokens?.[0]?.isKnown, false);
assert.equal(result.tokens?.[1]?.surface, '点');
assert.equal(result.tokens?.[1]?.isKnown, true);
assert.equal(result.tokens?.[1]?.isNPlusOneTarget, false);
assert.equal(result.tokens?.[1]?.frequencyRank, undefined);
assert.equal(result.tokens?.[1]?.jlptLevel, undefined);
});
test('tokenizeSubtitle keeps mecab-tagged interjections tokenized while clearing annotation metadata', async () => {
const result = await tokenizeSubtitle(
'ぐはっ',

View File

@@ -96,6 +96,7 @@ interface TokenizerAnnotationOptions {
minSentenceWordsForNPlusOne: number | undefined;
pos1Exclusions: ReadonlySet<string>;
pos2Exclusions: ReadonlySet<string>;
sourceText?: string;
}
let parserEnrichmentWorkerRuntimeModulePromise: Promise<
@@ -333,6 +334,66 @@ function normalizeSelectedYomitanTokens(tokens: MergedToken[]): MergedToken[] {
}));
}
/**
 * Sanitizes a raw `wordClasses` value carried over from a Yomitan scan token.
 * Non-arrays yield an empty list; array entries are kept only when they are
 * non-empty strings after trimming, de-duplicated in first-seen order.
 */
function normalizeYomitanWordClasses(wordClasses: unknown): string[] {
  if (!Array.isArray(wordClasses)) {
    return [];
  }
  // A Set preserves insertion order, so this matches the original
  // "first occurrence wins" de-duplication.
  const seen = new Set<string>();
  for (const entry of wordClasses) {
    if (typeof entry !== 'string') {
      continue;
    }
    const tag = entry.trim();
    if (tag.length > 0) {
      seen.add(tag);
    }
  }
  return [...seen];
}
/**
 * Maps normalized Yomitan word-class tags to SubMiner's coarse PartOfSpeech,
 * optionally with a MeCab-style pos1 label. Order matters: particle and
 * auxiliary tags take priority over verb/adjective/noun tags when several
 * classes are present.
 */
function resolvePartOfSpeechFromYomitanWordClasses(wordClasses: string[]): {
  partOfSpeech: PartOfSpeech;
  pos1?: string;
} {
  const has = (tag: string): boolean => wordClasses.includes(tag);
  if (has('prt')) {
    return { partOfSpeech: PartOfSpeech.particle, pos1: '助詞' };
  }
  // NOTE(review): only the exact 'aux' tag is handled here; 'aux-v'/'aux-adj'
  // fall through to the later branches — confirm that is intended.
  if (has('aux')) {
    return { partOfSpeech: PartOfSpeech.bound_auxiliary, pos1: '助動詞' };
  }
  // Any tag starting with 'v' (v1, v5r, vs, vt, …) is classified as a verb.
  if (wordClasses.some((tag) => tag.startsWith('v'))) {
    return { partOfSpeech: PartOfSpeech.verb, pos1: '動詞' };
  }
  if (has('adj-i') || has('adj-ix')) {
    return { partOfSpeech: PartOfSpeech.i_adjective, pos1: '形容詞' };
  }
  // na-adjectives deliberately carry 名詞 as pos1 — presumably to line up
  // with MeCab's noun/形容動詞語幹 tagging; confirm against the annotator.
  if (has('adj-na')) {
    return { partOfSpeech: PartOfSpeech.na_adjective, pos1: '名詞' };
  }
  const isNounTag = (tag: string): boolean =>
    tag === 'n' || tag === 'num' || tag === 'ctr' || tag === 'pn' || tag.startsWith('n-');
  if (wordClasses.some(isNounTag)) {
    return { partOfSpeech: PartOfSpeech.noun, pos1: '名詞' };
  }
  return { partOfSpeech: PartOfSpeech.other };
}
/**
 * Convenience wrapper: sanitizes a raw wordClasses payload and resolves it to
 * coarse POS metadata in one step.
 */
function getYomitanWordClassPosMetadata(wordClasses: unknown): {
  partOfSpeech: PartOfSpeech;
  pos1?: string;
} {
  const normalizedClasses = normalizeYomitanWordClasses(wordClasses);
  return resolvePartOfSpeechFromYomitanWordClasses(normalizedClasses);
}
function resolveFrequencyLookupText(
token: MergedToken,
matchMode: FrequencyDictionaryMatchMode,
@@ -623,19 +684,23 @@ async function parseWithYomitanInternalParser(
}
const normalizedSelectedTokens = normalizeSelectedYomitanTokens(
selectedTokens.map(
(token): MergedToken => ({
surface: token.surface,
reading: token.reading,
headword: token.headword,
startPos: token.startPos,
endPos: token.endPos,
partOfSpeech: PartOfSpeech.other,
isMerged: true,
isKnown: false,
isNPlusOneTarget: false,
isNameMatch: token.isNameMatch ?? false,
frequencyRank: token.frequencyRank,
}),
(token): MergedToken => {
const posMetadata = getYomitanWordClassPosMetadata(token.wordClasses);
return {
surface: token.surface,
reading: token.reading,
headword: token.headword,
startPos: token.startPos,
endPos: token.endPos,
partOfSpeech: posMetadata.partOfSpeech,
pos1: posMetadata.pos1,
isMerged: true,
isKnown: false,
isNPlusOneTarget: false,
isNameMatch: token.isNameMatch ?? false,
frequencyRank: token.frequencyRank,
};
},
),
);
@@ -716,12 +781,11 @@ export async function tokenizeSubtitle(
.replace(/\s+/g, ' ')
.trim();
const annotationOptions = getAnnotationOptions(deps);
annotationOptions.sourceText = tokenizeText;
const yomitanTokens = await parseWithYomitanInternalParser(tokenizeText, deps, annotationOptions);
if (yomitanTokens && yomitanTokens.length > 0) {
const annotatedTokens = await stripSubtitleAnnotationMetadata(
await applyAnnotationStage(yomitanTokens, deps, annotationOptions),
);
const annotatedTokens = await applyAnnotationStage(yomitanTokens, deps, annotationOptions);
return {
text: displayText,
tokens: annotatedTokens.length > 0 ? annotatedTokens : null,

View File

@@ -366,6 +366,132 @@ test('shouldExcludeTokenFromSubtitleAnnotations excludes kana-only non-independe
assert.equal(shouldExcludeTokenFromSubtitleAnnotations(token), true);
});
// The pipe-joined pos1/pos2 values ('動詞|助詞', '自立|接続助詞') encode a merged
// token's per-part POS tags — presumably split downstream by
// splitNormalizedTagParts; confirm the delimiter convention against makeToken.
// Expectation: a standalone して fragment (dictionary form する) is excluded
// from subtitle annotations.
test('shouldExcludeTokenFromSubtitleAnnotations excludes standalone して grammar helper fragments', () => {
const token = makeToken({
surface: 'して',
headword: 'する',
reading: 'シテ',
partOfSpeech: PartOfSpeech.verb,
pos1: '動詞|助詞',
pos2: '自立|接続助詞',
});
assert.equal(shouldExcludeTokenFromSubtitleAnnotations(token), true);
});
test('shouldExcludeTokenFromSubtitleAnnotations excludes inflected standalone して grammar helper fragments', () => {
const token = makeToken({
surface: 'してる',
headword: 'する',
reading: 'シテル',
partOfSpeech: PartOfSpeech.verb,
pos1: '動詞|助動詞',
pos2: '自立|非自立',
});
assert.equal(shouldExcludeTokenFromSubtitleAnnotations(token), true);
});
test('shouldExcludeTokenFromSubtitleAnnotations excludes standalone particle fragments without POS tags', () => {
const token = makeToken({
surface: 'と',
headword: 'と',
reading: 'ト',
partOfSpeech: PartOfSpeech.other,
pos1: '',
pos2: '',
});
assert.equal(shouldExcludeTokenFromSubtitleAnnotations(token), true);
});
test('shouldExcludeTokenFromSubtitleAnnotations excludes standalone connective particle fragments without POS tags', () => {
const token = makeToken({
surface: 'たって',
headword: 'たって',
reading: 'タッテ',
partOfSpeech: PartOfSpeech.other,
pos1: '',
pos2: '',
});
assert.equal(shouldExcludeTokenFromSubtitleAnnotations(token), true);
});
test('shouldExcludeTokenFromSubtitleAnnotations excludes rhetorical もんか grammar particle phrases', () => {
for (const surface of ['もんか', 'ものか']) {
const token = makeToken({
surface,
headword: surface,
reading: surface === 'もんか' ? 'モンカ' : 'モノカ',
partOfSpeech: PartOfSpeech.noun,
pos1: '名詞|助詞',
pos2: '非自立|副助詞/並立助詞/終助詞',
});
assert.equal(shouldExcludeTokenFromSubtitleAnnotations(token), true, surface);
}
});
test('shouldExcludeTokenFromSubtitleAnnotations excludes bare くれ auxiliary fragments', () => {
const token = makeToken({
surface: 'くれ',
headword: '暮れ',
reading: 'クレ',
partOfSpeech: PartOfSpeech.noun,
pos1: '名詞',
pos2: '一般',
});
assert.equal(shouldExcludeTokenFromSubtitleAnnotations(token), true);
});
test('shouldExcludeTokenFromSubtitleAnnotations excludes standalone quote particle and auxiliary grammar terms', () => {
for (const token of [
makeToken({
surface: 'って',
headword: 'って',
reading: 'ッテ',
partOfSpeech: PartOfSpeech.other,
pos1: '',
pos2: '',
}),
makeToken({
surface: 'べき',
headword: 'べき',
reading: 'ベキ',
partOfSpeech: PartOfSpeech.other,
pos1: '',
pos2: '',
}),
]) {
assert.equal(shouldExcludeTokenFromSubtitleAnnotations(token), true, token.surface);
}
});
test('shouldExcludeTokenFromSubtitleAnnotations excludes single-kana surface fragments', () => {
for (const token of [
makeToken({
surface: 'ふ',
headword: '不',
reading: 'フ',
partOfSpeech: PartOfSpeech.other,
pos1: '接頭詞',
pos2: '',
}),
makeToken({
surface: 'フ',
headword: '負',
reading: 'フ',
partOfSpeech: PartOfSpeech.noun,
pos1: '名詞',
pos2: '一般',
}),
]) {
assert.equal(shouldExcludeTokenFromSubtitleAnnotations(token), true, token.surface);
}
});
test('stripSubtitleAnnotationMetadata keeps token hover data while clearing annotation fields', () => {
const token = makeToken({
surface: 'は',
@@ -536,6 +662,57 @@ test('annotateTokens N+1 minimum sentence words counts only eligible word tokens
assert.equal(result[0]?.isNPlusOneTarget, false);
});
test('annotateTokens N+1 sentence word count respects source punctuation gaps omitted by Yomitan', () => {
const tokens = [
makeToken({
surface: '私',
headword: '私',
pos1: '名詞',
startPos: 0,
endPos: 1,
}),
makeToken({
surface: '猫',
headword: '猫',
pos1: '名詞',
startPos: 1,
endPos: 2,
}),
makeToken({
surface: '犬',
headword: '犬',
pos1: '名詞',
startPos: 2,
endPos: 3,
}),
makeToken({
surface: 'ふざけん',
headword: 'ふざける',
partOfSpeech: PartOfSpeech.verb,
pos1: '動詞',
pos2: '自立',
startPos: 4,
endPos: 8,
}),
];
const result = annotateTokens(
tokens,
makeDeps({
isKnownWord: (text) => text === '私' || text === '猫' || text === '犬',
}),
{
minSentenceWordsForNPlusOne: 3,
sourceText: '私猫犬!ふざけんなよ!',
},
);
assert.equal(result[0]?.isNPlusOneTarget, false);
assert.equal(result[1]?.isNPlusOneTarget, false);
assert.equal(result[2]?.isNPlusOneTarget, false);
assert.equal(result[3]?.isNPlusOneTarget, false);
});
test('annotateTokens applies configured pos1 exclusions to both frequency and N+1', () => {
const tokens = [
makeToken({
@@ -610,14 +787,52 @@ test('annotateTokens excludes default non-independent pos2 from frequency and N+
}),
];
const result = annotateTokens(tokens, makeDeps(), {
minSentenceWordsForNPlusOne: 1,
});
const result = annotateTokens(
tokens,
makeDeps({
isKnownWord: (text) => text === 'た' || text === '負',
getJlptLevel: (text) => (text === 'た' || text === '負' ? 'N3' : null),
}),
{
minSentenceWordsForNPlusOne: 1,
},
);
assert.equal(result[0]?.frequencyRank, undefined);
assert.equal(result[0]?.isNPlusOneTarget, false);
});
test('annotateTokens preserves exact known-word status for non-independent kanji noun tokens', () => {
const tokens = [
makeToken({
surface: '点',
reading: 'てん',
headword: '点',
partOfSpeech: PartOfSpeech.other,
pos1: '名詞',
pos2: '非自立',
pos3: '一般',
startPos: 2,
endPos: 3,
frequencyRank: 1384,
}),
];
const result = annotateTokens(
tokens,
makeDeps({
isKnownWord: (text) => text === '点' || text === 'てん',
getJlptLevel: (text) => (text === '点' ? 'N3' : null),
}),
{ minSentenceWordsForNPlusOne: 1 },
);
assert.equal(result[0]?.isKnown, true);
assert.equal(result[0]?.isNPlusOneTarget, false);
assert.equal(result[0]?.frequencyRank, undefined);
assert.equal(result[0]?.jlptLevel, undefined);
});
test('annotateTokens clears all annotations for non-independent kanji noun tokens under unified gate', () => {
const tokens = [
makeToken({
@@ -665,7 +880,7 @@ test('annotateTokens excludes likely kana SFX tokens from frequency when POS tag
assert.equal(result[0]?.frequencyRank, undefined);
});
test('annotateTokens excludes single hiragana and katakana tokens from frequency when POS tags are missing', () => {
test('annotateTokens clears all annotations from single hiragana and katakana surface fragments', () => {
const tokens = [
makeToken({
surface: 'た',
@@ -679,12 +894,12 @@ test('annotateTokens excludes single hiragana and katakana tokens from frequency
endPos: 1,
}),
makeToken({
surface: '',
reading: '',
headword: '',
pos1: '',
surface: '',
reading: '',
headword: '',
pos1: '名詞',
pos2: '',
partOfSpeech: PartOfSpeech.other,
partOfSpeech: PartOfSpeech.noun,
frequencyRank: 22,
startPos: 1,
endPos: 2,
@@ -706,8 +921,14 @@ test('annotateTokens excludes single hiragana and katakana tokens from frequency
minSentenceWordsForNPlusOne: 1,
});
assert.equal(result[0]?.isKnown, false);
assert.equal(result[0]?.isNPlusOneTarget, false);
assert.equal(result[0]?.frequencyRank, undefined);
assert.equal(result[0]?.jlptLevel, undefined);
assert.equal(result[1]?.isKnown, false);
assert.equal(result[1]?.isNPlusOneTarget, false);
assert.equal(result[1]?.frequencyRank, undefined);
assert.equal(result[1]?.jlptLevel, undefined);
assert.equal(result[2]?.frequencyRank, 23);
});
@@ -856,6 +1077,219 @@ test('annotateTokens clears all annotations for kana-only non-independent noun h
assert.equal(result[0]?.jlptLevel, undefined);
});
test('annotateTokens clears all annotations for standalone して helper fragments', () => {
const tokens = [
makeToken({
surface: 'してる',
headword: 'する',
reading: 'シテル',
partOfSpeech: PartOfSpeech.verb,
pos1: '動詞|助動詞',
pos2: '自立|非自立',
startPos: 0,
endPos: 3,
frequencyRank: 22,
}),
];
const result = annotateTokens(
tokens,
makeDeps({
isKnownWord: (text) => text === 'する',
getJlptLevel: (text) => (text === 'する' ? 'N5' : null),
}),
{ minSentenceWordsForNPlusOne: 1 },
);
assert.equal(result[0]?.isKnown, false);
assert.equal(result[0]?.isNPlusOneTarget, false);
assert.equal(result[0]?.frequencyRank, undefined);
assert.equal(result[0]?.jlptLevel, undefined);
});
test('annotateTokens clears all annotations for standalone particle fragments without POS tags', () => {
const tokens = [
makeToken({
surface: 'と',
headword: 'と',
reading: 'ト',
partOfSpeech: PartOfSpeech.other,
pos1: '',
pos2: '',
startPos: 0,
endPos: 1,
frequencyRank: 4,
}),
];
const result = annotateTokens(
tokens,
makeDeps({
isKnownWord: (text) => text === 'と',
getJlptLevel: (text) => (text === 'と' ? 'N5' : null),
}),
{ minSentenceWordsForNPlusOne: 1 },
);
assert.equal(result[0]?.isKnown, false);
assert.equal(result[0]?.isNPlusOneTarget, false);
assert.equal(result[0]?.frequencyRank, undefined);
assert.equal(result[0]?.jlptLevel, undefined);
});
test('annotateTokens does not mark standalone connective particles as N+1', () => {
const tokens = [
makeToken({
surface: '逃げる',
headword: '逃げる',
reading: 'ニゲル',
partOfSpeech: PartOfSpeech.verb,
pos1: '動詞',
pos2: '自立',
startPos: 0,
endPos: 3,
}),
makeToken({
surface: 'たって',
headword: 'たって',
reading: 'タッテ',
partOfSpeech: PartOfSpeech.other,
pos1: '',
pos2: '',
startPos: 3,
endPos: 6,
frequencyRank: 28,
}),
makeToken({
surface: '無駄',
headword: '無駄',
reading: 'ムダ',
partOfSpeech: PartOfSpeech.noun,
pos1: '名詞',
pos2: '形容動詞語幹',
startPos: 6,
endPos: 8,
}),
];
const result = annotateTokens(
tokens,
makeDeps({
isKnownWord: (text) => text === '逃げる' || text === '無駄',
getJlptLevel: (text) => (text === 'たって' ? 'N3' : null),
}),
{ minSentenceWordsForNPlusOne: 1 },
);
assert.equal(result[1]?.isKnown, false);
assert.equal(result[1]?.isNPlusOneTarget, false);
assert.equal(result[1]?.frequencyRank, undefined);
assert.equal(result[1]?.jlptLevel, undefined);
});
test('annotateTokens clears all annotations for rhetorical もんか grammar particle phrases', () => {
const tokens = [
makeToken({
surface: 'もんか',
headword: 'もんか',
reading: 'モンカ',
partOfSpeech: PartOfSpeech.noun,
pos1: '名詞|助詞',
pos2: '非自立|副助詞/並立助詞/終助詞',
startPos: 0,
endPos: 3,
frequencyRank: 69629,
}),
];
const result = annotateTokens(
tokens,
makeDeps({
isKnownWord: (text) => text === 'もんか',
getJlptLevel: (text) => (text === 'もんか' ? 'N2' : null),
}),
{ minSentenceWordsForNPlusOne: 1 },
);
assert.equal(result[0]?.isKnown, false);
assert.equal(result[0]?.isNPlusOneTarget, false);
assert.equal(result[0]?.frequencyRank, undefined);
assert.equal(result[0]?.jlptLevel, undefined);
});
test('annotateTokens clears all annotations for bare くれ auxiliary fragments', () => {
const tokens = [
makeToken({
surface: 'くれ',
headword: '暮れ',
reading: 'クレ',
partOfSpeech: PartOfSpeech.noun,
pos1: '名詞',
pos2: '一般',
startPos: 0,
endPos: 2,
frequencyRank: 12877,
}),
];
const result = annotateTokens(
tokens,
makeDeps({
isKnownWord: (text) => text === '暮れ',
getJlptLevel: (text) => (text === '暮れ' ? 'N3' : null),
}),
{ minSentenceWordsForNPlusOne: 1 },
);
assert.equal(result[0]?.isKnown, false);
assert.equal(result[0]?.isNPlusOneTarget, false);
assert.equal(result[0]?.frequencyRank, undefined);
assert.equal(result[0]?.jlptLevel, undefined);
});
test('annotateTokens clears all annotations for standalone quote particle and auxiliary grammar terms', () => {
const tokens = [
makeToken({
surface: 'って',
headword: 'って',
reading: 'ッテ',
partOfSpeech: PartOfSpeech.other,
pos1: '',
pos2: '',
startPos: 0,
endPos: 2,
frequencyRank: 28,
}),
makeToken({
surface: 'べき',
headword: 'べき',
reading: 'ベキ',
partOfSpeech: PartOfSpeech.other,
pos1: '',
pos2: '',
startPos: 2,
endPos: 4,
frequencyRank: 268,
}),
];
const result = annotateTokens(
tokens,
makeDeps({
isKnownWord: (text) => text === 'って' || text === 'べき',
getJlptLevel: (text) => (text === 'って' || text === 'べき' ? 'N3' : null),
}),
{ minSentenceWordsForNPlusOne: 1 },
);
for (const token of result) {
assert.equal(token.isKnown, false, token.surface);
assert.equal(token.isNPlusOneTarget, false, token.surface);
assert.equal(token.frequencyRank, undefined, token.surface);
assert.equal(token.jlptLevel, undefined, token.surface);
}
});
test('annotateTokens clears all annotations from standalone あ interjections without POS tags', () => {
const tokens = [
makeToken({

View File

@@ -89,6 +89,7 @@ export interface AnnotationStageOptions {
minSentenceWordsForNPlusOne?: number;
pos1Exclusions?: ReadonlySet<string>;
pos2Exclusions?: ReadonlySet<string>;
sourceText?: string;
}
function resolveKnownWordText(
@@ -670,6 +671,36 @@ function computeTokenKnownStatus(
return normalizedReading !== matchText.trim() && isKnownWord(normalizedReading);
}
/**
 * Decides whether a token excluded from subtitle annotations should still keep
 * its known-word highlight. Only kanji-containing surfaces qualify; kana-only
 * fragments always return false. The surface is checked first, then a reading
 * that differs from the surface, then a headword that differs from the surface.
 */
function computeExcludedTokenKnownStatus(
  token: MergedToken,
  isKnownWord: (text: string) => boolean,
): boolean {
  const normalizedSurface = token.surface.trim();
  // Kana-only fragments never keep the highlight.
  if (!hasKanjiChar(normalizedSurface)) {
    return false;
  }
  if (normalizedSurface && isKnownWord(normalizedSurface)) {
    return true;
  }
  const normalizedReading = token.reading.trim();
  if (
    normalizedReading &&
    normalizedReading !== normalizedSurface &&
    isKnownWord(normalizedReading)
  ) {
    return true;
  }
  // Fix: the original required normalizedHeadword === normalizedSurface here,
  // which made this branch dead code — isKnownWord had already been called
  // with that exact string in the surface check above. Requiring a *different*
  // headword (e.g. a dictionary form of an inflected surface) mirrors the
  // reading fallback. NOTE(review): confirm this matches the intended
  // "exact known-word" semantics described by the surrounding tests.
  const normalizedHeadword = token.headword.trim();
  return (
    normalizedHeadword.length > 0 &&
    normalizedHeadword !== normalizedSurface &&
    isKnownWord(normalizedHeadword)
  );
}
function filterTokenFrequencyRank(
token: MergedToken,
pos1Exclusions: ReadonlySet<string>,
@@ -732,10 +763,16 @@ export function annotateTokens(
pos2Exclusions,
})
) {
return sharedStripSubtitleAnnotationMetadata(token, {
const strippedToken = sharedStripSubtitleAnnotationMetadata(token, {
pos1Exclusions,
pos2Exclusions,
});
return {
...strippedToken,
isKnown:
nPlusOneEnabled &&
computeExcludedTokenKnownStatus(token, deps.isKnownWord),
};
}
const prioritizedNameMatch = nameMatchEnabled && token.isNameMatch === true;
@@ -779,6 +816,7 @@ export function annotateTokens(
sanitizedMinSentenceWordsForNPlusOne,
pos1Exclusions,
pos2Exclusions,
options.sourceText,
);
if (!nameMatchEnabled) {

View File

@@ -303,7 +303,9 @@ function fillMissingPos1BySurfaceSequence(
let cursor = 0;
return tokens.map((token) => {
if (token.pos1 && token.pos1.trim().length > 0) {
const hasCompletePosMetadata =
token.pos1?.trim() && token.pos2?.trim() && token.pos3?.trim();
if (hasCompletePosMetadata) {
return token;
}
@@ -327,9 +329,9 @@ function fillMissingPos1BySurfaceSequence(
cursor = best.index + 1;
return {
...token,
pos1: best.pos1,
pos2: best.pos2,
pos3: best.pos3,
pos1: token.pos1 ?? best.pos1,
pos2: token.pos2 ?? best.pos2,
pos3: token.pos3 ?? best.pos3,
};
});
}
@@ -382,7 +384,7 @@ export function enrichTokensWithMecabPos1(
const metadataByTokenIndex = new Map<number, MecabPosMetadata>();
for (const [index, token] of tokens.entries()) {
if (token.pos1) {
if (token.pos1?.trim() && token.pos2?.trim() && token.pos3?.trim()) {
continue;
}
@@ -410,9 +412,9 @@ export function enrichTokensWithMecabPos1(
return {
...token,
pos1: metadata.pos1,
pos2: metadata.pos2,
pos3: metadata.pos3,
pos1: token.pos1 ?? metadata.pos1,
pos2: token.pos2 ?? metadata.pos2,
pos3: token.pos3 ?? metadata.pos3,
};
});

View File

@@ -19,11 +19,18 @@ const SUBTITLE_ANNOTATION_EXCLUDED_TERMS = new Set([
'ええ',
'うう',
'おお',
'くれ',
'たって',
'って',
'だって',
'はあ',
'はは',
'べき',
'へえ',
'ふう',
'ほう',
'もんか',
'ものか',
]);
const SUBTITLE_ANNOTATION_EXCLUDED_EXPLANATORY_ENDING_PREFIXES = ['ん', 'の', 'なん', 'なの'];
const SUBTITLE_ANNOTATION_EXCLUDED_EXPLANATORY_ENDING_CORES = [
@@ -72,6 +79,26 @@ const SUBTITLE_ANNOTATION_EXCLUDED_TRAILING_PARTICLE_SUFFIXES = new Set([
]);
const AUXILIARY_STEM_GRAMMAR_TAIL_POS1 = new Set(['名詞', '助動詞', '助詞']);
const NON_INDEPENDENT_NOUN_HELPER_TAIL_POS1 = new Set(['助詞', '助動詞']);
const STANDALONE_GRAMMAR_PARTICLE_SURFACES = new Set([
'か',
'が',
'さ',
'し',
'ぞ',
'ぜ',
'と',
'な',
'に',
'ね',
'の',
'は',
'へ',
'も',
'や',
'よ',
'を',
]);
const STANDALONE_GRAMMAR_PARTICLE_PHRASES = new Set(['たって', 'だって']);
export interface SubtitleAnnotationFilterOptions {
pos1Exclusions?: ReadonlySet<string>;
@@ -278,6 +305,38 @@ function isKanaOnlyNonIndependentNounHelperMerge(token: MergedToken): boolean {
return pos1Parts.slice(1).every((part) => NON_INDEPENDENT_NOUN_HELPER_TAIL_POS1.has(part));
}
// True when the kana-normalized text is non-empty and every code point is kana.
function isKanaOnlyText(text: string): boolean {
  const normalized = normalizeKana(text);
  if (normalized.length === 0) {
    return false;
  }
  for (const char of normalized) {
    if (!isKanaChar(char)) {
      return false;
    }
  }
  return true;
}
/**
 * Detects standalone して-family helper fragments: a kana-only surface that
 * starts with して, whose dictionary form is する, and whose pos1 tag parts
 * (when present) include 動詞.
 */
function isStandaloneSuruTeGrammarHelper(token: MergedToken): boolean {
  const surface = normalizeKana(token.surface);
  if (normalizeKana(token.headword) !== 'する' || !surface.startsWith('して')) {
    return false;
  }
  if (!isKanaOnlyText(surface)) {
    return false;
  }
  const pos1Parts = splitNormalizedTagParts(normalizePosTag(token.pos1));
  // Missing POS metadata still counts; otherwise 動詞 must be one of the parts.
  return pos1Parts.length === 0 || pos1Parts.includes('動詞');
}
/**
 * Detects standalone grammar particles: surface and headword normalize to the
 * same kana string, and that string is in either the single-character particle
 * set or the multi-character particle-phrase set.
 */
function isStandaloneGrammarParticle(token: MergedToken): boolean {
  const surface = normalizeKana(token.surface);
  if (surface !== normalizeKana(token.headword)) {
    return false;
  }
  return (
    STANDALONE_GRAMMAR_PARTICLE_SURFACES.has(surface) ||
    STANDALONE_GRAMMAR_PARTICLE_PHRASES.has(surface)
  );
}
// True when the kana-normalized surface is exactly one code point and that
// code point is kana. Array.from splits by code point, so surrogate-paired
// characters are counted correctly.
function isSingleKanaSurfaceFragment(token: MergedToken): boolean {
  const chars = Array.from(normalizeKana(token.surface));
  if (chars.length !== 1) {
    return false;
  }
  const [only] = chars;
  return only !== undefined && isKanaChar(only);
}
function isExcludedByTerm(token: MergedToken): boolean {
const candidates = [token.surface, token.reading, token.headword].filter(
(candidate): candidate is string => typeof candidate === 'string' && candidate.length > 0,
@@ -365,6 +424,18 @@ export function shouldExcludeTokenFromSubtitleAnnotations(
return true;
}
if (isStandaloneSuruTeGrammarHelper(token)) {
return true;
}
if (isStandaloneGrammarParticle(token)) {
return true;
}
if (isSingleKanaSurfaceFragment(token)) {
return true;
}
if (isExcludedTrailingParticleMergedToken(token)) {
return true;
}

View File

@@ -1049,6 +1049,60 @@ test('requestYomitanScanTokens marks grouped entries when SubMiner dictionary al
assert.equal((result as Array<{ isNameMatch?: boolean }>)[0]?.isNameMatch, true);
});
test('requestYomitanScanTokens preserves matched headword word classes', async () => {
let scannerScript = '';
const deps = createDeps(async (script) => {
if (script.includes('termsFind')) {
scannerScript = script;
return [];
}
if (script.includes('optionsGetFull')) {
return {
profileCurrent: 0,
profiles: [
{
options: {
scanning: { length: 40 },
},
},
],
};
}
return null;
});
await requestYomitanScanTokens('は', deps, { error: () => undefined });
const result = await runInjectedYomitanScript(scannerScript, (action, params) => {
if (action !== 'termsFind') {
throw new Error(`unexpected action: ${action}`);
}
const text = (params as { text?: string } | undefined)?.text;
if (text !== 'は') {
return { originalTextLength: 0, dictionaryEntries: [] };
}
return {
originalTextLength: 1,
dictionaryEntries: [
{
headwords: [
{
term: 'は',
reading: 'は',
wordClasses: ['prt'],
sources: [{ originalText: 'は', isPrimary: true, matchType: 'exact' }],
},
],
},
],
};
});
assert.deepEqual((result as Array<{ wordClasses?: string[] }>)[0]?.wordClasses, ['prt']);
});
test('requestYomitanScanTokens skips fallback fragments without exact primary source matches', async () => {
const deps = createDeps(async (script) => {
if (script.includes('optionsGetFull')) {

View File

@@ -53,6 +53,7 @@ export interface YomitanScanToken {
endPos: number;
isNameMatch?: boolean;
frequencyRank?: number;
wordClasses?: string[];
}
interface YomitanProfileMetadata {
@@ -91,7 +92,10 @@ function isScanTokenArray(value: unknown): value is YomitanScanToken[] {
typeof entry.startPos === 'number' &&
typeof entry.endPos === 'number' &&
(entry.isNameMatch === undefined || typeof entry.isNameMatch === 'boolean') &&
(entry.frequencyRank === undefined || typeof entry.frequencyRank === 'number'),
(entry.frequencyRank === undefined || typeof entry.frequencyRank === 'number') &&
(entry.wordClasses === undefined ||
(Array.isArray(entry.wordClasses) &&
entry.wordClasses.every((wordClass) => typeof wordClass === 'string'))),
)
);
}
@@ -975,6 +979,11 @@ const YOMITAN_SCANNING_HELPERS = String.raw`
return best;
}
function getPreferredHeadword(dictionaryEntries, token, dictionaryPriorityByName, dictionaryFrequencyModeByName) {
function normalizeWordClasses(headword) {
if (!Array.isArray(headword?.wordClasses)) { return undefined; }
const classes = headword.wordClasses.filter((wordClass) => typeof wordClass === "string" && wordClass.trim().length > 0);
return classes.length > 0 ? classes : undefined;
}
function appendDictionaryNames(target, value) {
if (!value || typeof value !== 'object') {
return;
@@ -1033,6 +1042,7 @@ const YOMITAN_SCANNING_HELPERS = String.raw`
return {
term: preferredMatch.headword.term,
reading: preferredMatch.headword.reading,
wordClasses: normalizeWordClasses(preferredMatch.headword),
isNameMatch: matchedNameDictionary || isNameDictionaryEntry(preferredMatch.dictionaryEntry),
frequencyRank: getBestFrequencyRankForMatches(
exactFrequencyMatches.length > 0 ? exactFrequencyMatches : exactPrimaryMatches,
@@ -1099,7 +1109,7 @@ ${YOMITAN_SCANNING_HELPERS}
if (preferredHeadword && typeof preferredHeadword.term === "string") {
const reading = typeof preferredHeadword.reading === "string" ? preferredHeadword.reading : "";
const segments = distributeFuriganaInflected(preferredHeadword.term, reading, source);
tokens.push({
const tokenPayload = {
surface: segments.map((segment) => segment.text).join("") || source,
reading: segments.map((segment) => typeof segment.reading === "string" ? segment.reading : "").join(""),
headword: preferredHeadword.term,
@@ -1110,7 +1120,11 @@ ${YOMITAN_SCANNING_HELPERS}
typeof preferredHeadword.frequencyRank === "number" && Number.isFinite(preferredHeadword.frequencyRank)
? Math.max(1, Math.floor(preferredHeadword.frequencyRank))
: undefined,
});
};
if (Array.isArray(preferredHeadword.wordClasses) && preferredHeadword.wordClasses.length > 0) {
tokenPayload.wordClasses = preferredHeadword.wordClasses;
}
tokens.push(tokenPayload);
i += originalTextLength;
continue;
}

View File

@@ -347,11 +347,25 @@ function isSentenceBoundaryToken(token: MergedToken): boolean {
return SENTENCE_BOUNDARY_SURFACES.has(token.surface);
}
/**
 * Reports whether the source text between the previous token's end offset and
 * the next token's start offset contains any sentence-boundary character
 * (punctuation that Yomitan omits from its token stream). Returns false when
 * there is no source text, no previous token, or no positive-width gap.
 */
function hasSentenceBoundaryInSourceGap(
  sourceText: string | undefined,
  previousEnd: number | null,
  nextStart: number,
): boolean {
  if (typeof sourceText !== 'string') {
    return false;
  }
  if (previousEnd === null || nextStart <= previousEnd) {
    return false;
  }
  // Iterate by code point, matching the original spread-based scan.
  for (const char of sourceText.slice(previousEnd, nextStart)) {
    if (SENTENCE_BOUNDARY_SURFACES.has(char)) {
      return true;
    }
  }
  return false;
}
export function markNPlusOneTargets(
tokens: MergedToken[],
minSentenceWords = 3,
pos1Exclusions: ReadonlySet<string> = N_PLUS_ONE_IGNORED_POS1,
pos2Exclusions: ReadonlySet<string> = N_PLUS_ONE_IGNORED_POS2,
sourceText?: string,
): MergedToken[] {
if (tokens.length === 0) {
return [];
@@ -363,6 +377,7 @@ export function markNPlusOneTargets(
}));
let sentenceStart = 0;
let previousTokenEnd: number | null = null;
const minimumSentenceWords = Number.isInteger(minSentenceWords)
? Math.max(1, minSentenceWords)
: 3;
@@ -393,10 +408,15 @@ export function markNPlusOneTargets(
for (let i = 0; i < markedTokens.length; i++) {
const token = markedTokens[i];
if (!token) continue;
if (hasSentenceBoundaryInSourceGap(sourceText, previousTokenEnd, token.startPos)) {
markSentence(sentenceStart, i);
sentenceStart = i;
}
if (isSentenceBoundaryToken(token)) {
markSentence(sentenceStart, i);
sentenceStart = i + 1;
}
previousTokenEnd = token.endPos;
}
if (sentenceStart < markedTokens.length) {