feat(tokenizer): use Yomitan word classes for subtitle POS filtering

- Carry matched headword wordClasses from termsFind into YomitanScanToken
- Map recognized Yomitan wordClasses to SubMiner coarse POS before annotation
- MeCab enrichment now fills only missing POS fields, preserving existing coarse pos1
- Exclude standalone grammar particles, して helper fragments, and single-kana surfaces from annotations
- Respect source-text punctuation gaps when counting N+1 sentence words
- Preserve known-word highlight on excluded kanji-containing tokens
- Add backlog tasks 304 (N+1 boundary bug) and 305 (wordClasses POS, done)
This commit is contained in:
2026-04-25 23:08:33 -07:00
parent 53aa58d044
commit 96894ff85c
11 changed files with 926 additions and 39 deletions

View File

@@ -96,6 +96,7 @@ interface TokenizerAnnotationOptions {
minSentenceWordsForNPlusOne: number | undefined;
pos1Exclusions: ReadonlySet<string>;
pos2Exclusions: ReadonlySet<string>;
sourceText?: string;
}
let parserEnrichmentWorkerRuntimeModulePromise: Promise<
@@ -333,6 +334,66 @@ function normalizeSelectedYomitanTokens(tokens: MergedToken[]): MergedToken[] {
}));
}
function normalizeYomitanWordClasses(wordClasses: unknown): string[] {
if (!Array.isArray(wordClasses)) {
return [];
}
const normalized: string[] = [];
for (const wordClass of wordClasses) {
if (typeof wordClass !== 'string') {
continue;
}
const trimmed = wordClass.trim();
if (trimmed && !normalized.includes(trimmed)) {
normalized.push(trimmed);
}
}
return normalized;
}
function resolvePartOfSpeechFromYomitanWordClasses(wordClasses: string[]): {
partOfSpeech: PartOfSpeech;
pos1?: string;
} {
if (wordClasses.includes('prt')) {
return { partOfSpeech: PartOfSpeech.particle, pos1: '助詞' };
}
if (wordClasses.includes('aux')) {
return { partOfSpeech: PartOfSpeech.bound_auxiliary, pos1: '助動詞' };
}
if (wordClasses.some((wordClass) => wordClass.startsWith('v'))) {
return { partOfSpeech: PartOfSpeech.verb, pos1: '動詞' };
}
if (wordClasses.includes('adj-i') || wordClasses.includes('adj-ix')) {
return { partOfSpeech: PartOfSpeech.i_adjective, pos1: '形容詞' };
}
if (wordClasses.includes('adj-na')) {
return { partOfSpeech: PartOfSpeech.na_adjective, pos1: '名詞' };
}
if (
wordClasses.some(
(wordClass) =>
wordClass === 'n' ||
wordClass === 'num' ||
wordClass === 'ctr' ||
wordClass === 'pn' ||
wordClass.startsWith('n-'),
)
) {
return { partOfSpeech: PartOfSpeech.noun, pos1: '名詞' };
}
return { partOfSpeech: PartOfSpeech.other };
}
function getYomitanWordClassPosMetadata(wordClasses: unknown): {
partOfSpeech: PartOfSpeech;
pos1?: string;
} {
return resolvePartOfSpeechFromYomitanWordClasses(normalizeYomitanWordClasses(wordClasses));
}
function resolveFrequencyLookupText(
token: MergedToken,
matchMode: FrequencyDictionaryMatchMode,
@@ -623,19 +684,23 @@ async function parseWithYomitanInternalParser(
}
const normalizedSelectedTokens = normalizeSelectedYomitanTokens(
selectedTokens.map(
(token): MergedToken => ({
surface: token.surface,
reading: token.reading,
headword: token.headword,
startPos: token.startPos,
endPos: token.endPos,
partOfSpeech: PartOfSpeech.other,
isMerged: true,
isKnown: false,
isNPlusOneTarget: false,
isNameMatch: token.isNameMatch ?? false,
frequencyRank: token.frequencyRank,
}),
(token): MergedToken => {
const posMetadata = getYomitanWordClassPosMetadata(token.wordClasses);
return {
surface: token.surface,
reading: token.reading,
headword: token.headword,
startPos: token.startPos,
endPos: token.endPos,
partOfSpeech: posMetadata.partOfSpeech,
pos1: posMetadata.pos1,
isMerged: true,
isKnown: false,
isNPlusOneTarget: false,
isNameMatch: token.isNameMatch ?? false,
frequencyRank: token.frequencyRank,
};
},
),
);
@@ -716,12 +781,11 @@ export async function tokenizeSubtitle(
.replace(/\s+/g, ' ')
.trim();
const annotationOptions = getAnnotationOptions(deps);
annotationOptions.sourceText = tokenizeText;
const yomitanTokens = await parseWithYomitanInternalParser(tokenizeText, deps, annotationOptions);
if (yomitanTokens && yomitanTokens.length > 0) {
const annotatedTokens = await stripSubtitleAnnotationMetadata(
await applyAnnotationStage(yomitanTokens, deps, annotationOptions),
);
const annotatedTokens = await applyAnnotationStage(yomitanTokens, deps, annotationOptions);
return {
text: displayText,
tokens: annotatedTokens.length > 0 ? annotatedTokens : null,