Mirror of https://github.com/ksyasuda/SubMiner.git (synced 2026-04-26 04:19:27 -07:00)

Compare commits: dev...tokenizer- (2 commits)

| Author | SHA1 | Date |
|---|---|---|
| | 6b7d0553a7 | |
| | d8934647a9 | |
@@ -0,0 +1,27 @@
---
id: TASK-304
title: Fix N+1 sentence boundary counting across Yomitan punctuation gaps
status: In Progress
assignee: []
created_date: '2026-04-26 05:33'
labels:
  - bug
  - tokenizer
  - annotations
dependencies: []
priority: medium
---

## Description

<!-- SECTION:DESCRIPTION:BEGIN -->
N+1 target selection should respect sentence-ending punctuation from the original subtitle text even when Yomitan token output omits punctuation tokens. Current behavior can treat multiple subtitle sentences as one token span and incorrectly satisfy the minimum content-token threshold.
<!-- SECTION:DESCRIPTION:END -->

## Acceptance Criteria

<!-- AC:BEGIN -->
- [ ] #1 A subtitle like `てんめ!ふざけんなよ!` does not mark `ふざけん` (or a similar single-content-token second sentence) as N+1 when the minimum sentence word count is 3.
- [ ] #2 N+1 sentence segmentation uses original subtitle text offsets or equivalent source-boundary data, not only punctuation tokens returned by Yomitan.
- [ ] #3 Existing annotation exclusion behavior for particles/grammar tokens remains unchanged.
- [ ] #4 Regression tests cover Yomitan-style token streams where punctuation is absent from the token list.
<!-- AC:END -->
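The acceptance criteria above hinge on one idea: when Yomitan drops punctuation tokens, the sentence break is still visible in the untokenized gap between adjacent token offsets in the original subtitle text. A minimal TypeScript sketch of that check (the function name and boundary set here are illustrative; the helper actually added in this compare is `hasSentenceBoundaryInSourceGap`, further down):

```typescript
// Illustrative only: detect a sentence boundary that never became a token by
// inspecting the original subtitle text between two adjacent token spans.
const SENTENCE_BOUNDARIES = new Set(['。', '!', '?', '!', '?']);

function boundaryInGap(sourceText: string, prevEnd: number | null, nextStart: number): boolean {
  if (prevEnd === null || nextStart <= prevEnd) {
    return false;
  }
  // Slice the untokenized gap (e.g. the `!` in `てんめ!ふざけんなよ!`)
  // and look for sentence-ending punctuation omitted from the token stream.
  const gap = sourceText.slice(prevEnd, nextStart);
  return [...gap].some((char) => SENTENCE_BOUNDARIES.has(char));
}
```

For `てんめ!ふざけんなよ!`, the gap between the last first-sentence token and `ふざけん` contains `!`, so `ふざけん` is counted against its own sentence and a minimum sentence word count of 3 is no longer satisfied by a single content token.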
@@ -0,0 +1,55 @@
---
id: TASK-305
title: Use Yomitan word classes for subtitle token POS filtering
status: Done
assignee: []
created_date: '2026-04-26 05:56'
updated_date: '2026-04-26 05:59'
labels:
  - tokenizer
  - yomitan
dependencies: []
priority: medium
---

## Description

<!-- SECTION:DESCRIPTION:BEGIN -->
Subtitle annotation filtering currently uses Yomitan token spans, then enriches those spans by running MeCab over the full normalized subtitle line. Add support for carrying Yomitan headword wordClasses from termsFind into SubMiner tokens so dictionary-backed tokens can provide coarse POS/tag metadata without vendored Yomitan changes. MeCab whole-line enrichment should remain a fallback source of detailed POS data when Yomitan classes are absent.
<!-- SECTION:DESCRIPTION:END -->

## Acceptance Criteria

<!-- AC:BEGIN -->
- [x] #1 Yomitan scanner tokens preserve matched headword wordClasses when termsFind returns them.
- [x] #2 Subtitle tokenization maps recognized Yomitan wordClasses to coarse PartOfSpeech/POS metadata before annotation filtering.
- [x] #3 Whole-line MeCab enrichment remains available for missing or more detailed POS metadata and does not break existing subtitle annotation behavior.
- [x] #4 Focused tokenizer tests cover wordClasses extraction and POS mapping.
<!-- AC:END -->

## Implementation Plan

<!-- SECTION:PLAN:BEGIN -->
1. Add focused regression coverage for the Yomitan scanner wordClasses payload and subtitle POS mapping.
2. Extend the app-owned Yomitan scanner payload to carry matched headword wordClasses when present.
3. Map recognized Yomitan wordClasses to SubMiner coarse PartOfSpeech/POS metadata before annotation filtering.
4. Keep MeCab whole-line enrichment as a fallback/detail-fill for missing POS fields.
5. Run focused tokenizer tests and typecheck.
<!-- SECTION:PLAN:END -->

## Implementation Notes

<!-- SECTION:NOTES:BEGIN -->
Implemented app-only wordClasses extraction from termsFind results; no vendored Yomitan changes required. Recognized classes currently map prt, aux, v*, adj-i/adj-ix, adj-na, and noun-like classes to SubMiner POS metadata. MeCab enrichment now skips only tokens with complete pos1/pos2/pos3 and otherwise fills missing fields while preserving existing coarse pos1. Verification: bun test src/core/services/tokenizer/yomitan-parser-runtime.test.ts src/core/services/tokenizer.test.ts; bun run typecheck.
<!-- SECTION:NOTES:END -->

## Final Summary

<!-- SECTION:FINAL_SUMMARY:BEGIN -->
Implemented app-only Yomitan wordClasses support for subtitle token annotation filtering. The scanner now carries matched headword wordClasses from termsFind results, the tokenizer maps recognized classes into SubMiner coarse POS metadata before annotation, and MeCab whole-line enrichment continues to fill missing detailed POS fields without requiring vendored Yomitan changes.

Tests run:
- bun test src/core/services/tokenizer/yomitan-parser-runtime.test.ts src/core/services/tokenizer.test.ts
- bun run typecheck

Note: the working tree already had unrelated tokenizer/annotation edits and TASK-304 before this work; those were left intact.
<!-- SECTION:FINAL_SUMMARY:END -->
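As a reading aid for the diffs that follow: the Implementation Notes above say recognized Yomitan word classes are mapped to coarse SubMiner POS metadata before annotation filtering. A reduced TypeScript sketch of that mapping (the string labels stand in for SubMiner's PartOfSpeech enum; the real logic is resolvePartOfSpeechFromYomitanWordClasses in the tokenizer diff below):

```typescript
// Reduced, illustrative version of the wordClasses -> coarse POS mapping.
type CoarsePos = 'particle' | 'bound_auxiliary' | 'verb' | 'i_adjective' | 'na_adjective' | 'noun' | 'other';

function coarsePosFromWordClasses(wordClasses: string[]): CoarsePos {
  if (wordClasses.includes('prt')) return 'particle';
  if (wordClasses.includes('aux')) return 'bound_auxiliary';
  if (wordClasses.some((c) => c.startsWith('v'))) return 'verb';
  if (wordClasses.includes('adj-i') || wordClasses.includes('adj-ix')) return 'i_adjective';
  if (wordClasses.includes('adj-na')) return 'na_adjective';
  if (wordClasses.some((c) => c === 'n' || c === 'num' || c === 'ctr' || c === 'pn' || c.startsWith('n-'))) return 'noun';
  return 'other';
}

// e.g. a termsFind headword for は carrying wordClasses: ['prt'] classifies the
// token as a particle before MeCab ever runs.
```

With this in place, MeCab whole-line enrichment only needs to fill in the finer pos2/pos3 detail when the Yomitan-derived classification leaves it missing.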
@@ -25,6 +25,7 @@ interface YomitanTokenInput {
  reading?: string;
  headword?: string;
  isNameMatch?: boolean;
  wordClasses?: string[];
}

function makeDepsFromYomitanTokens(
@@ -55,6 +56,7 @@ function makeDepsFromYomitanTokens(
        startPos,
        endPos,
        isNameMatch: token.isNameMatch ?? false,
        wordClasses: token.wordClasses,
      };
    });
  },
@@ -1552,7 +1554,7 @@ test('tokenizeSubtitle assigns JLPT level to Yomitan tokens', async () => {
  assert.equal(result.tokens?.[0]?.jlptLevel, 'N4');
});

test('tokenizeSubtitle can assign JLPT level to Yomitan particle token', async () => {
test('tokenizeSubtitle clears JLPT level from standalone Yomitan particle token', async () => {
  const result = await tokenizeSubtitle(
    'は',
    makeDepsFromYomitanTokens([{ surface: 'は', reading: 'は', headword: 'は' }], {
@@ -1561,7 +1563,7 @@ test('tokenizeSubtitle can assign JLPT level to Yomitan particle token', async (
  );

  assert.equal(result.tokens?.length, 1);
  assert.equal(result.tokens?.[0]?.jlptLevel, 'N5');
  assert.equal(result.tokens?.[0]?.jlptLevel, undefined);
});

test('tokenizeSubtitle returns null tokens for empty normalized text', async () => {
@@ -3034,6 +3036,58 @@ test('tokenizeSubtitle skips all enrichment stages when disabled', async () => {
  assert.equal(frequencyCalls, 0);
});

test('tokenizeSubtitle uses Yomitan word classes to classify standalone particles', async () => {
  let mecabCalls = 0;
  const result = await tokenizeSubtitle(
    'は',
    makeDepsFromYomitanTokens([{ surface: 'は', reading: 'は', headword: 'は', wordClasses: ['prt'] }], {
      getFrequencyDictionaryEnabled: () => true,
      getFrequencyRank: (text) => (text === 'は' ? 10 : null),
      getJlptLevel: (text) => (text === 'は' ? 'N5' : null),
      tokenizeWithMecab: async () => {
        mecabCalls += 1;
        return null;
      },
    }),
  );

  assert.equal(mecabCalls, 1);
  assert.equal(result.tokens?.length, 1);
  assert.equal(result.tokens?.[0]?.partOfSpeech, PartOfSpeech.particle);
  assert.equal(result.tokens?.[0]?.pos1, '助詞');
  assert.equal(result.tokens?.[0]?.isNPlusOneTarget, false);
  assert.equal(result.tokens?.[0]?.frequencyRank, undefined);
  assert.equal(result.tokens?.[0]?.jlptLevel, undefined);
});

test('tokenizeSubtitle fills detailed MeCab POS when Yomitan word class supplies coarse POS', async () => {
  const result = await tokenizeSubtitle(
    'は',
    makeDepsFromYomitanTokens([{ surface: 'は', reading: 'は', headword: 'は', wordClasses: ['prt'] }], {
      tokenizeWithMecab: async () => [
        {
          headword: 'は',
          surface: 'は',
          reading: 'ハ',
          startPos: 0,
          endPos: 1,
          partOfSpeech: PartOfSpeech.particle,
          pos1: '助詞',
          pos2: '係助詞',
          pos3: '*',
          isMerged: false,
          isKnown: false,
          isNPlusOneTarget: false,
        },
      ],
    }),
  );

  assert.equal(result.tokens?.[0]?.partOfSpeech, PartOfSpeech.particle);
  assert.equal(result.tokens?.[0]?.pos1, '助詞');
  assert.equal(result.tokens?.[0]?.pos2, '係助詞');
});

test('tokenizeSubtitle keeps frequency enrichment while n+1 is disabled', async () => {
  let knownCalls = 0;
  let mecabCalls = 0;
@@ -3110,6 +3164,60 @@ test('tokenizeSubtitle excludes default non-independent pos2 from N+1 and freque
  assert.equal(result.tokens?.[0]?.isNPlusOneTarget, false);
});

test('tokenizeSubtitle preserves known-word highlight for exact non-independent kanji noun tokens', async () => {
  const result = await tokenizeSubtitle(
    'その点',
    makeDepsFromYomitanTokens(
      [
        { surface: 'その', reading: 'その', headword: 'その' },
        { surface: '点', reading: 'てん', headword: '点' },
      ],
      {
        isKnownWord: (text) => text === '点' || text === 'てん',
        getFrequencyDictionaryEnabled: () => true,
        getFrequencyRank: (text) => (text === '点' ? 1384 : null),
        getJlptLevel: (text) => (text === '点' ? 'N3' : null),
        tokenizeWithMecab: async () => [
          {
            headword: 'その',
            surface: 'その',
            reading: 'ソノ',
            startPos: 0,
            endPos: 2,
            partOfSpeech: PartOfSpeech.other,
            pos1: '連体詞',
            isMerged: false,
            isKnown: false,
            isNPlusOneTarget: false,
          },
          {
            headword: '点',
            surface: '点',
            reading: 'テン',
            startPos: 2,
            endPos: 3,
            partOfSpeech: PartOfSpeech.noun,
            pos1: '名詞',
            pos2: '非自立',
            pos3: '一般',
            isMerged: false,
            isKnown: false,
            isNPlusOneTarget: false,
          },
        ],
      },
    ),
  );

  assert.equal(result.tokens?.length, 2);
  assert.equal(result.tokens?.[0]?.isKnown, false);
  assert.equal(result.tokens?.[1]?.surface, '点');
  assert.equal(result.tokens?.[1]?.isKnown, true);
  assert.equal(result.tokens?.[1]?.isNPlusOneTarget, false);
  assert.equal(result.tokens?.[1]?.frequencyRank, undefined);
  assert.equal(result.tokens?.[1]?.jlptLevel, undefined);
});

test('tokenizeSubtitle keeps mecab-tagged interjections tokenized while clearing annotation metadata', async () => {
  const result = await tokenizeSubtitle(
    'ぐはっ',

@@ -96,6 +96,7 @@ interface TokenizerAnnotationOptions {
  minSentenceWordsForNPlusOne: number | undefined;
  pos1Exclusions: ReadonlySet<string>;
  pos2Exclusions: ReadonlySet<string>;
  sourceText?: string;
}

let parserEnrichmentWorkerRuntimeModulePromise: Promise<
@@ -333,6 +334,66 @@ function normalizeSelectedYomitanTokens(tokens: MergedToken[]): MergedToken[] {
  }));
}

function normalizeYomitanWordClasses(wordClasses: unknown): string[] {
  if (!Array.isArray(wordClasses)) {
    return [];
  }

  const normalized: string[] = [];
  for (const wordClass of wordClasses) {
    if (typeof wordClass !== 'string') {
      continue;
    }
    const trimmed = wordClass.trim();
    if (trimmed && !normalized.includes(trimmed)) {
      normalized.push(trimmed);
    }
  }
  return normalized;
}

function resolvePartOfSpeechFromYomitanWordClasses(wordClasses: string[]): {
  partOfSpeech: PartOfSpeech;
  pos1?: string;
} {
  if (wordClasses.includes('prt')) {
    return { partOfSpeech: PartOfSpeech.particle, pos1: '助詞' };
  }
  if (wordClasses.includes('aux')) {
    return { partOfSpeech: PartOfSpeech.bound_auxiliary, pos1: '助動詞' };
  }
  if (wordClasses.some((wordClass) => wordClass.startsWith('v'))) {
    return { partOfSpeech: PartOfSpeech.verb, pos1: '動詞' };
  }
  if (wordClasses.includes('adj-i') || wordClasses.includes('adj-ix')) {
    return { partOfSpeech: PartOfSpeech.i_adjective, pos1: '形容詞' };
  }
  if (wordClasses.includes('adj-na')) {
    return { partOfSpeech: PartOfSpeech.na_adjective, pos1: '名詞' };
  }
  if (
    wordClasses.some(
      (wordClass) =>
        wordClass === 'n' ||
        wordClass === 'num' ||
        wordClass === 'ctr' ||
        wordClass === 'pn' ||
        wordClass.startsWith('n-'),
    )
  ) {
    return { partOfSpeech: PartOfSpeech.noun, pos1: '名詞' };
  }

  return { partOfSpeech: PartOfSpeech.other };
}

function getYomitanWordClassPosMetadata(wordClasses: unknown): {
  partOfSpeech: PartOfSpeech;
  pos1?: string;
} {
  return resolvePartOfSpeechFromYomitanWordClasses(normalizeYomitanWordClasses(wordClasses));
}

function resolveFrequencyLookupText(
  token: MergedToken,
  matchMode: FrequencyDictionaryMatchMode,
@@ -623,19 +684,23 @@ async function parseWithYomitanInternalParser(
  }
  const normalizedSelectedTokens = normalizeSelectedYomitanTokens(
    selectedTokens.map(
      (token): MergedToken => ({
        surface: token.surface,
        reading: token.reading,
        headword: token.headword,
        startPos: token.startPos,
        endPos: token.endPos,
        partOfSpeech: PartOfSpeech.other,
        isMerged: true,
        isKnown: false,
        isNPlusOneTarget: false,
        isNameMatch: token.isNameMatch ?? false,
        frequencyRank: token.frequencyRank,
      }),
      (token): MergedToken => {
        const posMetadata = getYomitanWordClassPosMetadata(token.wordClasses);
        return {
          surface: token.surface,
          reading: token.reading,
          headword: token.headword,
          startPos: token.startPos,
          endPos: token.endPos,
          partOfSpeech: posMetadata.partOfSpeech,
          pos1: posMetadata.pos1,
          isMerged: true,
          isKnown: false,
          isNPlusOneTarget: false,
          isNameMatch: token.isNameMatch ?? false,
          frequencyRank: token.frequencyRank,
        };
      },
    ),
  );

@@ -716,12 +781,11 @@ export async function tokenizeSubtitle(
    .replace(/\s+/g, ' ')
    .trim();
  const annotationOptions = getAnnotationOptions(deps);
  annotationOptions.sourceText = tokenizeText;

  const yomitanTokens = await parseWithYomitanInternalParser(tokenizeText, deps, annotationOptions);
  if (yomitanTokens && yomitanTokens.length > 0) {
    const annotatedTokens = await stripSubtitleAnnotationMetadata(
      await applyAnnotationStage(yomitanTokens, deps, annotationOptions),
    );
    const annotatedTokens = await applyAnnotationStage(yomitanTokens, deps, annotationOptions);
    return {
      text: displayText,
      tokens: annotatedTokens.length > 0 ? annotatedTokens : null,

@@ -366,6 +366,132 @@ test('shouldExcludeTokenFromSubtitleAnnotations excludes kana-only non-independe
  assert.equal(shouldExcludeTokenFromSubtitleAnnotations(token), true);
});

test('shouldExcludeTokenFromSubtitleAnnotations excludes standalone して grammar helper fragments', () => {
  const token = makeToken({
    surface: 'して',
    headword: 'する',
    reading: 'シテ',
    partOfSpeech: PartOfSpeech.verb,
    pos1: '動詞|助詞',
    pos2: '自立|接続助詞',
  });

  assert.equal(shouldExcludeTokenFromSubtitleAnnotations(token), true);
});

test('shouldExcludeTokenFromSubtitleAnnotations excludes inflected standalone して grammar helper fragments', () => {
  const token = makeToken({
    surface: 'してる',
    headword: 'する',
    reading: 'シテル',
    partOfSpeech: PartOfSpeech.verb,
    pos1: '動詞|助動詞',
    pos2: '自立|非自立',
  });

  assert.equal(shouldExcludeTokenFromSubtitleAnnotations(token), true);
});

test('shouldExcludeTokenFromSubtitleAnnotations excludes standalone particle fragments without POS tags', () => {
  const token = makeToken({
    surface: 'と',
    headword: 'と',
    reading: 'ト',
    partOfSpeech: PartOfSpeech.other,
    pos1: '',
    pos2: '',
  });

  assert.equal(shouldExcludeTokenFromSubtitleAnnotations(token), true);
});

test('shouldExcludeTokenFromSubtitleAnnotations excludes standalone connective particle fragments without POS tags', () => {
  const token = makeToken({
    surface: 'たって',
    headword: 'たって',
    reading: 'タッテ',
    partOfSpeech: PartOfSpeech.other,
    pos1: '',
    pos2: '',
  });

  assert.equal(shouldExcludeTokenFromSubtitleAnnotations(token), true);
});

test('shouldExcludeTokenFromSubtitleAnnotations excludes rhetorical もんか grammar particle phrases', () => {
  for (const surface of ['もんか', 'ものか']) {
    const token = makeToken({
      surface,
      headword: surface,
      reading: surface === 'もんか' ? 'モンカ' : 'モノカ',
      partOfSpeech: PartOfSpeech.noun,
      pos1: '名詞|助詞',
      pos2: '非自立|副助詞/並立助詞/終助詞',
    });

    assert.equal(shouldExcludeTokenFromSubtitleAnnotations(token), true, surface);
  }
});

test('shouldExcludeTokenFromSubtitleAnnotations excludes bare くれ auxiliary fragments', () => {
  const token = makeToken({
    surface: 'くれ',
    headword: '暮れ',
    reading: 'クレ',
    partOfSpeech: PartOfSpeech.noun,
    pos1: '名詞',
    pos2: '一般',
  });

  assert.equal(shouldExcludeTokenFromSubtitleAnnotations(token), true);
});

test('shouldExcludeTokenFromSubtitleAnnotations excludes standalone quote particle and auxiliary grammar terms', () => {
  for (const token of [
    makeToken({
      surface: 'って',
      headword: 'って',
      reading: 'ッテ',
      partOfSpeech: PartOfSpeech.other,
      pos1: '',
      pos2: '',
    }),
    makeToken({
      surface: 'べき',
      headword: 'べき',
      reading: 'ベキ',
      partOfSpeech: PartOfSpeech.other,
      pos1: '',
      pos2: '',
    }),
  ]) {
    assert.equal(shouldExcludeTokenFromSubtitleAnnotations(token), true, token.surface);
  }
});

test('shouldExcludeTokenFromSubtitleAnnotations excludes single-kana surface fragments', () => {
  for (const token of [
    makeToken({
      surface: 'ふ',
      headword: '不',
      reading: 'フ',
      partOfSpeech: PartOfSpeech.other,
      pos1: '接頭詞',
      pos2: '',
    }),
    makeToken({
      surface: 'フ',
      headword: '負',
      reading: 'フ',
      partOfSpeech: PartOfSpeech.noun,
      pos1: '名詞',
      pos2: '一般',
    }),
  ]) {
    assert.equal(shouldExcludeTokenFromSubtitleAnnotations(token), true, token.surface);
  }
});

test('stripSubtitleAnnotationMetadata keeps token hover data while clearing annotation fields', () => {
  const token = makeToken({
    surface: 'は',
@@ -536,6 +662,57 @@ test('annotateTokens N+1 minimum sentence words counts only eligible word tokens
  assert.equal(result[0]?.isNPlusOneTarget, false);
});

test('annotateTokens N+1 sentence word count respects source punctuation gaps omitted by Yomitan', () => {
  const tokens = [
    makeToken({
      surface: '私',
      headword: '私',
      pos1: '名詞',
      startPos: 0,
      endPos: 1,
    }),
    makeToken({
      surface: '猫',
      headword: '猫',
      pos1: '名詞',
      startPos: 1,
      endPos: 2,
    }),
    makeToken({
      surface: '犬',
      headword: '犬',
      pos1: '名詞',
      startPos: 2,
      endPos: 3,
    }),
    makeToken({
      surface: 'ふざけん',
      headword: 'ふざける',
      partOfSpeech: PartOfSpeech.verb,
      pos1: '動詞',
      pos2: '自立',
      startPos: 4,
      endPos: 8,
    }),
  ];

  const result = annotateTokens(
    tokens,
    makeDeps({
      isKnownWord: (text) => text === '私' || text === '猫' || text === '犬',
    }),
    {
      minSentenceWordsForNPlusOne: 3,
      sourceText: '私猫犬!ふざけんなよ!',
    },
  );

  assert.equal(result[0]?.isNPlusOneTarget, false);
  assert.equal(result[1]?.isNPlusOneTarget, false);
  assert.equal(result[2]?.isNPlusOneTarget, false);
  assert.equal(result[3]?.isNPlusOneTarget, false);
});

test('annotateTokens applies configured pos1 exclusions to both frequency and N+1', () => {
  const tokens = [
    makeToken({
@@ -610,14 +787,52 @@ test('annotateTokens excludes default non-independent pos2 from frequency and N+
    }),
  ];

  const result = annotateTokens(tokens, makeDeps(), {
    minSentenceWordsForNPlusOne: 1,
  });
  const result = annotateTokens(
    tokens,
    makeDeps({
      isKnownWord: (text) => text === 'た' || text === '負',
      getJlptLevel: (text) => (text === 'た' || text === '負' ? 'N3' : null),
    }),
    {
      minSentenceWordsForNPlusOne: 1,
    },
  );

  assert.equal(result[0]?.frequencyRank, undefined);
  assert.equal(result[0]?.isNPlusOneTarget, false);
});

test('annotateTokens preserves exact known-word status for non-independent kanji noun tokens', () => {
  const tokens = [
    makeToken({
      surface: '点',
      reading: 'てん',
      headword: '点',
      partOfSpeech: PartOfSpeech.other,
      pos1: '名詞',
      pos2: '非自立',
      pos3: '一般',
      startPos: 2,
      endPos: 3,
      frequencyRank: 1384,
    }),
  ];

  const result = annotateTokens(
    tokens,
    makeDeps({
      isKnownWord: (text) => text === '点' || text === 'てん',
      getJlptLevel: (text) => (text === '点' ? 'N3' : null),
    }),
    { minSentenceWordsForNPlusOne: 1 },
  );

  assert.equal(result[0]?.isKnown, true);
  assert.equal(result[0]?.isNPlusOneTarget, false);
  assert.equal(result[0]?.frequencyRank, undefined);
  assert.equal(result[0]?.jlptLevel, undefined);
});

test('annotateTokens clears all annotations for non-independent kanji noun tokens under unified gate', () => {
  const tokens = [
    makeToken({
@@ -665,7 +880,7 @@ test('annotateTokens excludes likely kana SFX tokens from frequency when POS tag
  assert.equal(result[0]?.frequencyRank, undefined);
});

test('annotateTokens excludes single hiragana and katakana tokens from frequency when POS tags are missing', () => {
test('annotateTokens clears all annotations from single hiragana and katakana surface fragments', () => {
  const tokens = [
    makeToken({
      surface: 'た',
@@ -679,12 +894,12 @@ test('annotateTokens excludes single hiragana and katakana tokens from frequency
      endPos: 1,
    }),
    makeToken({
      surface: 'ア',
      reading: 'ア',
      headword: 'ア',
      pos1: '',
      surface: 'フ',
      reading: 'フ',
      headword: '負',
      pos1: '名詞',
      pos2: '',
      partOfSpeech: PartOfSpeech.other,
      partOfSpeech: PartOfSpeech.noun,
      frequencyRank: 22,
      startPos: 1,
      endPos: 2,
@@ -706,8 +921,14 @@ test('annotateTokens excludes single hiragana and katakana tokens from frequency
    minSentenceWordsForNPlusOne: 1,
  });

  assert.equal(result[0]?.isKnown, false);
  assert.equal(result[0]?.isNPlusOneTarget, false);
  assert.equal(result[0]?.frequencyRank, undefined);
  assert.equal(result[0]?.jlptLevel, undefined);
  assert.equal(result[1]?.isKnown, false);
  assert.equal(result[1]?.isNPlusOneTarget, false);
  assert.equal(result[1]?.frequencyRank, undefined);
  assert.equal(result[1]?.jlptLevel, undefined);
  assert.equal(result[2]?.frequencyRank, 23);
});

@@ -856,6 +1077,219 @@ test('annotateTokens clears all annotations for kana-only non-independent noun h
  assert.equal(result[0]?.jlptLevel, undefined);
});

test('annotateTokens clears all annotations for standalone して helper fragments', () => {
  const tokens = [
    makeToken({
      surface: 'してる',
      headword: 'する',
      reading: 'シテル',
      partOfSpeech: PartOfSpeech.verb,
      pos1: '動詞|助動詞',
      pos2: '自立|非自立',
      startPos: 0,
      endPos: 3,
      frequencyRank: 22,
    }),
  ];

  const result = annotateTokens(
    tokens,
    makeDeps({
      isKnownWord: (text) => text === 'する',
      getJlptLevel: (text) => (text === 'する' ? 'N5' : null),
    }),
    { minSentenceWordsForNPlusOne: 1 },
  );

  assert.equal(result[0]?.isKnown, false);
  assert.equal(result[0]?.isNPlusOneTarget, false);
  assert.equal(result[0]?.frequencyRank, undefined);
  assert.equal(result[0]?.jlptLevel, undefined);
});

test('annotateTokens clears all annotations for standalone particle fragments without POS tags', () => {
  const tokens = [
    makeToken({
      surface: 'と',
      headword: 'と',
      reading: 'ト',
      partOfSpeech: PartOfSpeech.other,
      pos1: '',
      pos2: '',
      startPos: 0,
      endPos: 1,
      frequencyRank: 4,
    }),
  ];

  const result = annotateTokens(
    tokens,
    makeDeps({
      isKnownWord: (text) => text === 'と',
      getJlptLevel: (text) => (text === 'と' ? 'N5' : null),
    }),
    { minSentenceWordsForNPlusOne: 1 },
  );

  assert.equal(result[0]?.isKnown, false);
  assert.equal(result[0]?.isNPlusOneTarget, false);
  assert.equal(result[0]?.frequencyRank, undefined);
  assert.equal(result[0]?.jlptLevel, undefined);
});

test('annotateTokens does not mark standalone connective particles as N+1', () => {
  const tokens = [
    makeToken({
      surface: '逃げる',
      headword: '逃げる',
      reading: 'ニゲル',
      partOfSpeech: PartOfSpeech.verb,
      pos1: '動詞',
      pos2: '自立',
      startPos: 0,
      endPos: 3,
    }),
    makeToken({
      surface: 'たって',
      headword: 'たって',
      reading: 'タッテ',
      partOfSpeech: PartOfSpeech.other,
      pos1: '',
      pos2: '',
      startPos: 3,
      endPos: 6,
      frequencyRank: 28,
    }),
    makeToken({
      surface: '無駄',
      headword: '無駄',
      reading: 'ムダ',
      partOfSpeech: PartOfSpeech.noun,
      pos1: '名詞',
      pos2: '形容動詞語幹',
      startPos: 6,
      endPos: 8,
    }),
  ];

  const result = annotateTokens(
    tokens,
    makeDeps({
      isKnownWord: (text) => text === '逃げる' || text === '無駄',
      getJlptLevel: (text) => (text === 'たって' ? 'N3' : null),
    }),
    { minSentenceWordsForNPlusOne: 1 },
  );

  assert.equal(result[1]?.isKnown, false);
  assert.equal(result[1]?.isNPlusOneTarget, false);
  assert.equal(result[1]?.frequencyRank, undefined);
  assert.equal(result[1]?.jlptLevel, undefined);
});

test('annotateTokens clears all annotations for rhetorical もんか grammar particle phrases', () => {
  const tokens = [
    makeToken({
      surface: 'もんか',
      headword: 'もんか',
      reading: 'モンカ',
      partOfSpeech: PartOfSpeech.noun,
      pos1: '名詞|助詞',
      pos2: '非自立|副助詞/並立助詞/終助詞',
      startPos: 0,
      endPos: 3,
      frequencyRank: 69629,
    }),
  ];

  const result = annotateTokens(
    tokens,
    makeDeps({
      isKnownWord: (text) => text === 'もんか',
      getJlptLevel: (text) => (text === 'もんか' ? 'N2' : null),
    }),
    { minSentenceWordsForNPlusOne: 1 },
  );

  assert.equal(result[0]?.isKnown, false);
  assert.equal(result[0]?.isNPlusOneTarget, false);
  assert.equal(result[0]?.frequencyRank, undefined);
  assert.equal(result[0]?.jlptLevel, undefined);
});

test('annotateTokens clears all annotations for bare くれ auxiliary fragments', () => {
  const tokens = [
    makeToken({
      surface: 'くれ',
      headword: '暮れ',
      reading: 'クレ',
      partOfSpeech: PartOfSpeech.noun,
      pos1: '名詞',
      pos2: '一般',
      startPos: 0,
      endPos: 2,
      frequencyRank: 12877,
    }),
  ];

  const result = annotateTokens(
    tokens,
    makeDeps({
      isKnownWord: (text) => text === '暮れ',
      getJlptLevel: (text) => (text === '暮れ' ? 'N3' : null),
    }),
    { minSentenceWordsForNPlusOne: 1 },
  );

  assert.equal(result[0]?.isKnown, false);
  assert.equal(result[0]?.isNPlusOneTarget, false);
  assert.equal(result[0]?.frequencyRank, undefined);
  assert.equal(result[0]?.jlptLevel, undefined);
});

test('annotateTokens clears all annotations for standalone quote particle and auxiliary grammar terms', () => {
  const tokens = [
    makeToken({
      surface: 'って',
      headword: 'って',
      reading: 'ッテ',
      partOfSpeech: PartOfSpeech.other,
      pos1: '',
      pos2: '',
      startPos: 0,
      endPos: 2,
      frequencyRank: 28,
    }),
    makeToken({
      surface: 'べき',
      headword: 'べき',
      reading: 'ベキ',
      partOfSpeech: PartOfSpeech.other,
      pos1: '',
      pos2: '',
      startPos: 2,
      endPos: 4,
      frequencyRank: 268,
    }),
  ];

  const result = annotateTokens(
    tokens,
    makeDeps({
      isKnownWord: (text) => text === 'って' || text === 'べき',
      getJlptLevel: (text) => (text === 'って' || text === 'べき' ? 'N3' : null),
    }),
    { minSentenceWordsForNPlusOne: 1 },
  );

  for (const token of result) {
    assert.equal(token.isKnown, false, token.surface);
    assert.equal(token.isNPlusOneTarget, false, token.surface);
    assert.equal(token.frequencyRank, undefined, token.surface);
    assert.equal(token.jlptLevel, undefined, token.surface);
  }
});

test('annotateTokens clears all annotations from standalone あ interjections without POS tags', () => {
  const tokens = [
    makeToken({

@@ -89,6 +89,7 @@ export interface AnnotationStageOptions {
  minSentenceWordsForNPlusOne?: number;
  pos1Exclusions?: ReadonlySet<string>;
  pos2Exclusions?: ReadonlySet<string>;
  sourceText?: string;
}

function resolveKnownWordText(
@@ -670,6 +671,36 @@ function computeTokenKnownStatus(
  return normalizedReading !== matchText.trim() && isKnownWord(normalizedReading);
}

function computeExcludedTokenKnownStatus(
  token: MergedToken,
  isKnownWord: (text: string) => boolean,
): boolean {
  const normalizedSurface = token.surface.trim();
  if (!hasKanjiChar(normalizedSurface)) {
    return false;
  }

  if (normalizedSurface && isKnownWord(normalizedSurface)) {
    return true;
  }

  const normalizedReading = token.reading.trim();
  if (
    normalizedReading &&
    normalizedReading !== normalizedSurface &&
    isKnownWord(normalizedReading)
  ) {
    return true;
  }

  const normalizedHeadword = token.headword.trim();
  return (
    normalizedHeadword.length > 0 &&
    normalizedHeadword === normalizedSurface &&
    isKnownWord(normalizedHeadword)
  );
}

function filterTokenFrequencyRank(
  token: MergedToken,
  pos1Exclusions: ReadonlySet<string>,
@@ -732,10 +763,16 @@ export function annotateTokens(
        pos2Exclusions,
      })
    ) {
      return sharedStripSubtitleAnnotationMetadata(token, {
      const strippedToken = sharedStripSubtitleAnnotationMetadata(token, {
        pos1Exclusions,
        pos2Exclusions,
      });
      return {
        ...strippedToken,
        isKnown:
          nPlusOneEnabled &&
          computeExcludedTokenKnownStatus(token, deps.isKnownWord),
      };
    }

    const prioritizedNameMatch = nameMatchEnabled && token.isNameMatch === true;
@@ -779,6 +816,7 @@ export function annotateTokens(
    sanitizedMinSentenceWordsForNPlusOne,
    pos1Exclusions,
    pos2Exclusions,
    options.sourceText,
  );

  if (!nameMatchEnabled) {

@@ -303,7 +303,9 @@ function fillMissingPos1BySurfaceSequence(

  let cursor = 0;
  return tokens.map((token) => {
    if (token.pos1 && token.pos1.trim().length > 0) {
    const hasCompletePosMetadata =
      token.pos1?.trim() && token.pos2?.trim() && token.pos3?.trim();
    if (hasCompletePosMetadata) {
      return token;
    }

@@ -327,9 +329,9 @@ function fillMissingPos1BySurfaceSequence(
    cursor = best.index + 1;
    return {
      ...token,
      pos1: best.pos1,
      pos2: best.pos2,
      pos3: best.pos3,
      pos1: token.pos1 ?? best.pos1,
      pos2: token.pos2 ?? best.pos2,
      pos3: token.pos3 ?? best.pos3,
    };
  });
}
@@ -382,7 +384,7 @@ export function enrichTokensWithMecabPos1(
  const metadataByTokenIndex = new Map<number, MecabPosMetadata>();

  for (const [index, token] of tokens.entries()) {
    if (token.pos1) {
    if (token.pos1?.trim() && token.pos2?.trim() && token.pos3?.trim()) {
      continue;
    }

@@ -410,9 +412,9 @@

    return {
      ...token,
      pos1: metadata.pos1,
      pos2: metadata.pos2,
      pos3: metadata.pos3,
      pos1: token.pos1 ?? metadata.pos1,
      pos2: token.pos2 ?? metadata.pos2,
      pos3: token.pos3 ?? metadata.pos3,
    };
  });

@@ -19,11 +19,18 @@ const SUBTITLE_ANNOTATION_EXCLUDED_TERMS = new Set([
  'ええ',
  'うう',
  'おお',
  'くれ',
  'たって',
  'って',
  'だって',
  'はあ',
  'はは',
  'べき',
  'へえ',
  'ふう',
  'ほう',
  'もんか',
  'ものか',
]);
const SUBTITLE_ANNOTATION_EXCLUDED_EXPLANATORY_ENDING_PREFIXES = ['ん', 'の', 'なん', 'なの'];
const SUBTITLE_ANNOTATION_EXCLUDED_EXPLANATORY_ENDING_CORES = [
@@ -72,6 +79,26 @@ const SUBTITLE_ANNOTATION_EXCLUDED_TRAILING_PARTICLE_SUFFIXES = new Set([
]);
const AUXILIARY_STEM_GRAMMAR_TAIL_POS1 = new Set(['名詞', '助動詞', '助詞']);
const NON_INDEPENDENT_NOUN_HELPER_TAIL_POS1 = new Set(['助詞', '助動詞']);
const STANDALONE_GRAMMAR_PARTICLE_SURFACES = new Set([
  'か',
  'が',
  'さ',
  'し',
  'ぞ',
  'ぜ',
  'と',
  'な',
  'に',
  'ね',
  'の',
  'は',
  'へ',
  'も',
  'や',
  'よ',
  'を',
]);
const STANDALONE_GRAMMAR_PARTICLE_PHRASES = new Set(['たって', 'だって']);

export interface SubtitleAnnotationFilterOptions {
  pos1Exclusions?: ReadonlySet<string>;
@@ -278,6 +305,38 @@ function isKanaOnlyNonIndependentNounHelperMerge(token: MergedToken): boolean {
  return pos1Parts.slice(1).every((part) => NON_INDEPENDENT_NOUN_HELPER_TAIL_POS1.has(part));
}

function isKanaOnlyText(text: string): boolean {
  const normalized = normalizeKana(text);
  return normalized.length > 0 && [...normalized].every(isKanaChar);
}

function isStandaloneSuruTeGrammarHelper(token: MergedToken): boolean {
  const normalizedSurface = normalizeKana(token.surface);
  const normalizedHeadword = normalizeKana(token.headword);
  if (!normalizedSurface.startsWith('して') || normalizedHeadword !== 'する') {
    return false;
  }

  const pos1Parts = splitNormalizedTagParts(normalizePosTag(token.pos1));
  return isKanaOnlyText(normalizedSurface) && (pos1Parts.length === 0 || pos1Parts.includes('動詞'));
}

function isStandaloneGrammarParticle(token: MergedToken): boolean {
  const normalizedSurface = normalizeKana(token.surface);
  const normalizedHeadword = normalizeKana(token.headword);
  return (
    normalizedSurface === normalizedHeadword &&
    (STANDALONE_GRAMMAR_PARTICLE_SURFACES.has(normalizedSurface) ||
      STANDALONE_GRAMMAR_PARTICLE_PHRASES.has(normalizedSurface))
  );
}

function isSingleKanaSurfaceFragment(token: MergedToken): boolean {
  const normalizedSurface = normalizeKana(token.surface);
  const chars = [...normalizedSurface];
  return chars.length === 1 && chars.every(isKanaChar);
}

function isExcludedByTerm(token: MergedToken): boolean {
  const candidates = [token.surface, token.reading, token.headword].filter(
    (candidate): candidate is string => typeof candidate === 'string' && candidate.length > 0,
@@ -365,6 +424,18 @@ export function shouldExcludeTokenFromSubtitleAnnotations(
    return true;
  }

  if (isStandaloneSuruTeGrammarHelper(token)) {
    return true;
  }

  if (isStandaloneGrammarParticle(token)) {
    return true;
  }

  if (isSingleKanaSurfaceFragment(token)) {
    return true;
  }

  if (isExcludedTrailingParticleMergedToken(token)) {
    return true;
  }

@@ -1049,6 +1049,60 @@ test('requestYomitanScanTokens marks grouped entries when SubMiner dictionary al
  assert.equal((result as Array<{ isNameMatch?: boolean }>)[0]?.isNameMatch, true);
});

test('requestYomitanScanTokens preserves matched headword word classes', async () => {
  let scannerScript = '';
  const deps = createDeps(async (script) => {
    if (script.includes('termsFind')) {
      scannerScript = script;
      return [];
    }
    if (script.includes('optionsGetFull')) {
      return {
        profileCurrent: 0,
        profiles: [
          {
            options: {
              scanning: { length: 40 },
            },
          },
        ],
      };
    }
    return null;
  });

  await requestYomitanScanTokens('は', deps, { error: () => undefined });

  const result = await runInjectedYomitanScript(scannerScript, (action, params) => {
    if (action !== 'termsFind') {
      throw new Error(`unexpected action: ${action}`);
    }

    const text = (params as { text?: string } | undefined)?.text;
    if (text !== 'は') {
      return { originalTextLength: 0, dictionaryEntries: [] };
    }

    return {
      originalTextLength: 1,
      dictionaryEntries: [
        {
          headwords: [
            {
              term: 'は',
              reading: 'は',
              wordClasses: ['prt'],
              sources: [{ originalText: 'は', isPrimary: true, matchType: 'exact' }],
            },
          ],
        },
      ],
    };
  });

  assert.deepEqual((result as Array<{ wordClasses?: string[] }>)[0]?.wordClasses, ['prt']);
});

test('requestYomitanScanTokens skips fallback fragments without exact primary source matches', async () => {
  const deps = createDeps(async (script) => {
    if (script.includes('optionsGetFull')) {

@@ -53,6 +53,7 @@ export interface YomitanScanToken {
  endPos: number;
  isNameMatch?: boolean;
  frequencyRank?: number;
  wordClasses?: string[];
}

interface YomitanProfileMetadata {
@@ -91,7 +92,10 @@ function isScanTokenArray(value: unknown): value is YomitanScanToken[] {
      typeof entry.startPos === 'number' &&
      typeof entry.endPos === 'number' &&
      (entry.isNameMatch === undefined || typeof entry.isNameMatch === 'boolean') &&
      (entry.frequencyRank === undefined || typeof entry.frequencyRank === 'number'),
      (entry.frequencyRank === undefined || typeof entry.frequencyRank === 'number') &&
      (entry.wordClasses === undefined ||
        (Array.isArray(entry.wordClasses) &&
          entry.wordClasses.every((wordClass) => typeof wordClass === 'string'))),
    )
  );
}
@@ -975,6 +979,11 @@ const YOMITAN_SCANNING_HELPERS = String.raw`
    return best;
  }
  function getPreferredHeadword(dictionaryEntries, token, dictionaryPriorityByName, dictionaryFrequencyModeByName) {
    function normalizeWordClasses(headword) {
      if (!Array.isArray(headword?.wordClasses)) { return undefined; }
      const classes = headword.wordClasses.filter((wordClass) => typeof wordClass === "string" && wordClass.trim().length > 0);
      return classes.length > 0 ? classes : undefined;
    }
    function appendDictionaryNames(target, value) {
      if (!value || typeof value !== 'object') {
        return;
      }
@@ -1033,6 +1042,7 @@ const YOMITAN_SCANNING_HELPERS = String.raw`
    return {
      term: preferredMatch.headword.term,
      reading: preferredMatch.headword.reading,
      wordClasses: normalizeWordClasses(preferredMatch.headword),
      isNameMatch: matchedNameDictionary || isNameDictionaryEntry(preferredMatch.dictionaryEntry),
      frequencyRank: getBestFrequencyRankForMatches(
        exactFrequencyMatches.length > 0 ? exactFrequencyMatches : exactPrimaryMatches,
@@ -1099,7 +1109,7 @@ ${YOMITAN_SCANNING_HELPERS}
    if (preferredHeadword && typeof preferredHeadword.term === "string") {
      const reading = typeof preferredHeadword.reading === "string" ? preferredHeadword.reading : "";
      const segments = distributeFuriganaInflected(preferredHeadword.term, reading, source);
      tokens.push({
      const tokenPayload = {
        surface: segments.map((segment) => segment.text).join("") || source,
        reading: segments.map((segment) => typeof segment.reading === "string" ? segment.reading : "").join(""),
        headword: preferredHeadword.term,
@@ -1110,7 +1120,11 @@ ${YOMITAN_SCANNING_HELPERS}
        typeof preferredHeadword.frequencyRank === "number" && Number.isFinite(preferredHeadword.frequencyRank)
          ? Math.max(1, Math.floor(preferredHeadword.frequencyRank))
          : undefined,
      });
      };
      if (Array.isArray(preferredHeadword.wordClasses) && preferredHeadword.wordClasses.length > 0) {
        tokenPayload.wordClasses = preferredHeadword.wordClasses;
      }
      tokens.push(tokenPayload);
      i += originalTextLength;
      continue;
    }

return SENTENCE_BOUNDARY_SURFACES.has(token.surface);
|
||||
}
|
||||
|
||||
function hasSentenceBoundaryInSourceGap(
|
||||
sourceText: string | undefined,
|
||||
previousEnd: number | null,
|
||||
nextStart: number,
|
||||
): boolean {
|
||||
if (typeof sourceText !== 'string' || previousEnd === null || nextStart <= previousEnd) {
|
||||
return false;
|
||||
}
|
||||
|
||||
const gap = sourceText.slice(previousEnd, nextStart);
|
||||
return [...gap].some((char) => SENTENCE_BOUNDARY_SURFACES.has(char));
|
||||
}
|
||||
|
||||
export function markNPlusOneTargets(
|
||||
tokens: MergedToken[],
|
||||
minSentenceWords = 3,
|
||||
pos1Exclusions: ReadonlySet<string> = N_PLUS_ONE_IGNORED_POS1,
|
||||
pos2Exclusions: ReadonlySet<string> = N_PLUS_ONE_IGNORED_POS2,
|
||||
sourceText?: string,
|
||||
): MergedToken[] {
|
||||
if (tokens.length === 0) {
|
||||
return [];
|
||||
@@ -363,6 +377,7 @@ export function markNPlusOneTargets(
|
||||
}));
|
||||
|
||||
let sentenceStart = 0;
|
||||
let previousTokenEnd: number | null = null;
|
||||
const minimumSentenceWords = Number.isInteger(minSentenceWords)
|
||||
? Math.max(1, minSentenceWords)
|
||||
: 3;
|
||||
@@ -393,10 +408,15 @@ export function markNPlusOneTargets(
|
||||
for (let i = 0; i < markedTokens.length; i++) {
|
||||
const token = markedTokens[i];
|
||||
if (!token) continue;
|
||||
if (hasSentenceBoundaryInSourceGap(sourceText, previousTokenEnd, token.startPos)) {
|
||||
markSentence(sentenceStart, i);
|
||||
sentenceStart = i;
|
||||
}
|
||||
if (isSentenceBoundaryToken(token)) {
|
||||
markSentence(sentenceStart, i);
|
||||
sentenceStart = i + 1;
|
||||
}
|
||||
previousTokenEnd = token.endPos;
|
||||
}
|
||||
|
||||
if (sentenceStart < markedTokens.length) {
|
||||
|
||||