feat(tokenizer): use Yomitan word classes for subtitle POS filtering
- Carry matched headword wordClasses from termsFind into YomitanScanToken
- Map recognized Yomitan wordClasses to SubMiner coarse POS before annotation
- MeCab enrichment now fills only missing POS fields, preserving existing coarse pos1
- Exclude standalone grammar particles, して helper fragments, and single-kana surfaces from annotations
- Respect source-text punctuation gaps when counting N+1 sentence words
- Preserve known-word highlight on excluded kanji-containing tokens
- Add backlog tasks 304 (N+1 boundary bug) and 305 (wordClasses POS, done)
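For context, a minimal sketch of the word-class mapping this commit introduces. It mirrors the precedence used by resolvePartOfSpeechFromYomitanWordClasses in the diff below; the standalone PartOfSpeech enum stub and the mapWordClasses name here are illustrative only, not the project's actual definitions.

```typescript
// Illustrative stub; SubMiner's real PartOfSpeech enum has more members.
enum PartOfSpeech {
  particle = 'particle',
  bound_auxiliary = 'bound_auxiliary',
  verb = 'verb',
  i_adjective = 'i_adjective',
  na_adjective = 'na_adjective',
  noun = 'noun',
  other = 'other',
}

// Map Yomitan's JMdict-style word-class tags to a coarse POS plus a MeCab-style pos1,
// following the precedence of resolvePartOfSpeechFromYomitanWordClasses in the diff below.
function mapWordClasses(wordClasses: string[]): { partOfSpeech: PartOfSpeech; pos1?: string } {
  if (wordClasses.includes('prt')) return { partOfSpeech: PartOfSpeech.particle, pos1: '助詞' };
  if (wordClasses.includes('aux')) return { partOfSpeech: PartOfSpeech.bound_auxiliary, pos1: '助動詞' };
  if (wordClasses.some((c) => c.startsWith('v'))) return { partOfSpeech: PartOfSpeech.verb, pos1: '動詞' };
  if (wordClasses.includes('adj-i') || wordClasses.includes('adj-ix'))
    return { partOfSpeech: PartOfSpeech.i_adjective, pos1: '形容詞' };
  if (wordClasses.includes('adj-na')) return { partOfSpeech: PartOfSpeech.na_adjective, pos1: '名詞' };
  if (wordClasses.some((c) => c === 'n' || c === 'num' || c === 'ctr' || c === 'pn' || c.startsWith('n-')))
    return { partOfSpeech: PartOfSpeech.noun, pos1: '名詞' };
  return { partOfSpeech: PartOfSpeech.other };
}

console.log(mapWordClasses(['prt']));   // { partOfSpeech: 'particle', pos1: '助詞' }
console.log(mapWordClasses(['v5u']));   // { partOfSpeech: 'verb', pos1: '動詞' }
console.log(mapWordClasses(['exp']));   // { partOfSpeech: 'other' } — MeCab enrichment fills missing POS fields
```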
@@ -25,6 +25,7 @@ interface YomitanTokenInput {
reading?: string;
headword?: string;
isNameMatch?: boolean;
wordClasses?: string[];
}

function makeDepsFromYomitanTokens(
@@ -55,6 +56,7 @@ function makeDepsFromYomitanTokens(
startPos,
endPos,
isNameMatch: token.isNameMatch ?? false,
wordClasses: token.wordClasses,
};
});
},
@@ -1552,7 +1554,7 @@ test('tokenizeSubtitle assigns JLPT level to Yomitan tokens', async () => {
assert.equal(result.tokens?.[0]?.jlptLevel, 'N4');
});

test('tokenizeSubtitle can assign JLPT level to Yomitan particle token', async () => {
test('tokenizeSubtitle clears JLPT level from standalone Yomitan particle token', async () => {
const result = await tokenizeSubtitle(
'は',
makeDepsFromYomitanTokens([{ surface: 'は', reading: 'は', headword: 'は' }], {
@@ -1561,7 +1563,7 @@ test('tokenizeSubtitle can assign JLPT level to Yomitan particle token', async (
);

assert.equal(result.tokens?.length, 1);
assert.equal(result.tokens?.[0]?.jlptLevel, 'N5');
assert.equal(result.tokens?.[0]?.jlptLevel, undefined);
});

test('tokenizeSubtitle returns null tokens for empty normalized text', async () => {
@@ -3034,6 +3036,58 @@ test('tokenizeSubtitle skips all enrichment stages when disabled', async () => {
assert.equal(frequencyCalls, 0);
});

test('tokenizeSubtitle uses Yomitan word classes to classify standalone particles', async () => {
let mecabCalls = 0;
const result = await tokenizeSubtitle(
'は',
makeDepsFromYomitanTokens([{ surface: 'は', reading: 'は', headword: 'は', wordClasses: ['prt'] }], {
getFrequencyDictionaryEnabled: () => true,
getFrequencyRank: (text) => (text === 'は' ? 10 : null),
getJlptLevel: (text) => (text === 'は' ? 'N5' : null),
tokenizeWithMecab: async () => {
mecabCalls += 1;
return null;
},
}),
);

assert.equal(mecabCalls, 1);
assert.equal(result.tokens?.length, 1);
assert.equal(result.tokens?.[0]?.partOfSpeech, PartOfSpeech.particle);
assert.equal(result.tokens?.[0]?.pos1, '助詞');
assert.equal(result.tokens?.[0]?.isNPlusOneTarget, false);
assert.equal(result.tokens?.[0]?.frequencyRank, undefined);
assert.equal(result.tokens?.[0]?.jlptLevel, undefined);
});

test('tokenizeSubtitle fills detailed MeCab POS when Yomitan word class supplies coarse POS', async () => {
const result = await tokenizeSubtitle(
'は',
makeDepsFromYomitanTokens([{ surface: 'は', reading: 'は', headword: 'は', wordClasses: ['prt'] }], {
tokenizeWithMecab: async () => [
{
headword: 'は',
surface: 'は',
reading: 'ハ',
startPos: 0,
endPos: 1,
partOfSpeech: PartOfSpeech.particle,
pos1: '助詞',
pos2: '係助詞',
pos3: '*',
isMerged: false,
isKnown: false,
isNPlusOneTarget: false,
},
],
}),
);

assert.equal(result.tokens?.[0]?.partOfSpeech, PartOfSpeech.particle);
assert.equal(result.tokens?.[0]?.pos1, '助詞');
assert.equal(result.tokens?.[0]?.pos2, '係助詞');
});

test('tokenizeSubtitle keeps frequency enrichment while n+1 is disabled', async () => {
let knownCalls = 0;
let mecabCalls = 0;
@@ -3110,6 +3164,60 @@ test('tokenizeSubtitle excludes default non-independent pos2 from N+1 and freque
assert.equal(result.tokens?.[0]?.isNPlusOneTarget, false);
});

test('tokenizeSubtitle preserves known-word highlight for exact non-independent kanji noun tokens', async () => {
const result = await tokenizeSubtitle(
'その点',
makeDepsFromYomitanTokens(
[
{ surface: 'その', reading: 'その', headword: 'その' },
{ surface: '点', reading: 'てん', headword: '点' },
],
{
isKnownWord: (text) => text === '点' || text === 'てん',
getFrequencyDictionaryEnabled: () => true,
getFrequencyRank: (text) => (text === '点' ? 1384 : null),
getJlptLevel: (text) => (text === '点' ? 'N3' : null),
tokenizeWithMecab: async () => [
{
headword: 'その',
surface: 'その',
reading: 'ソノ',
startPos: 0,
endPos: 2,
partOfSpeech: PartOfSpeech.other,
pos1: '連体詞',
isMerged: false,
isKnown: false,
isNPlusOneTarget: false,
},
{
headword: '点',
surface: '点',
reading: 'テン',
startPos: 2,
endPos: 3,
partOfSpeech: PartOfSpeech.noun,
pos1: '名詞',
pos2: '非自立',
pos3: '一般',
isMerged: false,
isKnown: false,
isNPlusOneTarget: false,
},
],
},
),
);

assert.equal(result.tokens?.length, 2);
assert.equal(result.tokens?.[0]?.isKnown, false);
assert.equal(result.tokens?.[1]?.surface, '点');
assert.equal(result.tokens?.[1]?.isKnown, true);
assert.equal(result.tokens?.[1]?.isNPlusOneTarget, false);
assert.equal(result.tokens?.[1]?.frequencyRank, undefined);
assert.equal(result.tokens?.[1]?.jlptLevel, undefined);
});

test('tokenizeSubtitle keeps mecab-tagged interjections tokenized while clearing annotation metadata', async () => {
const result = await tokenizeSubtitle(
'ぐはっ',

@@ -96,6 +96,7 @@ interface TokenizerAnnotationOptions {
minSentenceWordsForNPlusOne: number | undefined;
pos1Exclusions: ReadonlySet<string>;
pos2Exclusions: ReadonlySet<string>;
sourceText?: string;
}

let parserEnrichmentWorkerRuntimeModulePromise: Promise<
@@ -333,6 +334,66 @@ function normalizeSelectedYomitanTokens(tokens: MergedToken[]): MergedToken[] {
}));
}

function normalizeYomitanWordClasses(wordClasses: unknown): string[] {
if (!Array.isArray(wordClasses)) {
return [];
}

const normalized: string[] = [];
for (const wordClass of wordClasses) {
if (typeof wordClass !== 'string') {
continue;
}
const trimmed = wordClass.trim();
if (trimmed && !normalized.includes(trimmed)) {
normalized.push(trimmed);
}
}
return normalized;
}

function resolvePartOfSpeechFromYomitanWordClasses(wordClasses: string[]): {
partOfSpeech: PartOfSpeech;
pos1?: string;
} {
if (wordClasses.includes('prt')) {
return { partOfSpeech: PartOfSpeech.particle, pos1: '助詞' };
}
if (wordClasses.includes('aux')) {
return { partOfSpeech: PartOfSpeech.bound_auxiliary, pos1: '助動詞' };
}
if (wordClasses.some((wordClass) => wordClass.startsWith('v'))) {
return { partOfSpeech: PartOfSpeech.verb, pos1: '動詞' };
}
if (wordClasses.includes('adj-i') || wordClasses.includes('adj-ix')) {
return { partOfSpeech: PartOfSpeech.i_adjective, pos1: '形容詞' };
}
if (wordClasses.includes('adj-na')) {
return { partOfSpeech: PartOfSpeech.na_adjective, pos1: '名詞' };
}
if (
wordClasses.some(
(wordClass) =>
wordClass === 'n' ||
wordClass === 'num' ||
wordClass === 'ctr' ||
wordClass === 'pn' ||
wordClass.startsWith('n-'),
)
) {
return { partOfSpeech: PartOfSpeech.noun, pos1: '名詞' };
}

return { partOfSpeech: PartOfSpeech.other };
}

function getYomitanWordClassPosMetadata(wordClasses: unknown): {
partOfSpeech: PartOfSpeech;
pos1?: string;
} {
return resolvePartOfSpeechFromYomitanWordClasses(normalizeYomitanWordClasses(wordClasses));
}

function resolveFrequencyLookupText(
token: MergedToken,
matchMode: FrequencyDictionaryMatchMode,
@@ -623,19 +684,23 @@ async function parseWithYomitanInternalParser(
}
const normalizedSelectedTokens = normalizeSelectedYomitanTokens(
selectedTokens.map(
(token): MergedToken => ({
surface: token.surface,
reading: token.reading,
headword: token.headword,
startPos: token.startPos,
endPos: token.endPos,
partOfSpeech: PartOfSpeech.other,
isMerged: true,
isKnown: false,
isNPlusOneTarget: false,
isNameMatch: token.isNameMatch ?? false,
frequencyRank: token.frequencyRank,
}),
(token): MergedToken => {
const posMetadata = getYomitanWordClassPosMetadata(token.wordClasses);
return {
surface: token.surface,
reading: token.reading,
headword: token.headword,
startPos: token.startPos,
endPos: token.endPos,
partOfSpeech: posMetadata.partOfSpeech,
pos1: posMetadata.pos1,
isMerged: true,
isKnown: false,
isNPlusOneTarget: false,
isNameMatch: token.isNameMatch ?? false,
frequencyRank: token.frequencyRank,
};
},
),
);

@@ -716,12 +781,11 @@ export async function tokenizeSubtitle(
.replace(/\s+/g, ' ')
.trim();
const annotationOptions = getAnnotationOptions(deps);
annotationOptions.sourceText = tokenizeText;

const yomitanTokens = await parseWithYomitanInternalParser(tokenizeText, deps, annotationOptions);
if (yomitanTokens && yomitanTokens.length > 0) {
const annotatedTokens = await stripSubtitleAnnotationMetadata(
await applyAnnotationStage(yomitanTokens, deps, annotationOptions),
);
const annotatedTokens = await applyAnnotationStage(yomitanTokens, deps, annotationOptions);
return {
text: displayText,
tokens: annotatedTokens.length > 0 ? annotatedTokens : null,

@@ -366,6 +366,132 @@ test('shouldExcludeTokenFromSubtitleAnnotations excludes kana-only non-independe
assert.equal(shouldExcludeTokenFromSubtitleAnnotations(token), true);
});

test('shouldExcludeTokenFromSubtitleAnnotations excludes standalone して grammar helper fragments', () => {
const token = makeToken({
surface: 'して',
headword: 'する',
reading: 'シテ',
partOfSpeech: PartOfSpeech.verb,
pos1: '動詞|助詞',
pos2: '自立|接続助詞',
});

assert.equal(shouldExcludeTokenFromSubtitleAnnotations(token), true);
});

test('shouldExcludeTokenFromSubtitleAnnotations excludes inflected standalone して grammar helper fragments', () => {
const token = makeToken({
surface: 'してる',
headword: 'する',
reading: 'シテル',
partOfSpeech: PartOfSpeech.verb,
pos1: '動詞|助動詞',
pos2: '自立|非自立',
});

assert.equal(shouldExcludeTokenFromSubtitleAnnotations(token), true);
});

test('shouldExcludeTokenFromSubtitleAnnotations excludes standalone particle fragments without POS tags', () => {
const token = makeToken({
surface: 'と',
headword: 'と',
reading: 'ト',
partOfSpeech: PartOfSpeech.other,
pos1: '',
pos2: '',
});

assert.equal(shouldExcludeTokenFromSubtitleAnnotations(token), true);
});

test('shouldExcludeTokenFromSubtitleAnnotations excludes standalone connective particle fragments without POS tags', () => {
const token = makeToken({
surface: 'たって',
headword: 'たって',
reading: 'タッテ',
partOfSpeech: PartOfSpeech.other,
pos1: '',
pos2: '',
});

assert.equal(shouldExcludeTokenFromSubtitleAnnotations(token), true);
});

test('shouldExcludeTokenFromSubtitleAnnotations excludes rhetorical もんか grammar particle phrases', () => {
for (const surface of ['もんか', 'ものか']) {
const token = makeToken({
surface,
headword: surface,
reading: surface === 'もんか' ? 'モンカ' : 'モノカ',
partOfSpeech: PartOfSpeech.noun,
pos1: '名詞|助詞',
pos2: '非自立|副助詞/並立助詞/終助詞',
});

assert.equal(shouldExcludeTokenFromSubtitleAnnotations(token), true, surface);
}
});

test('shouldExcludeTokenFromSubtitleAnnotations excludes bare くれ auxiliary fragments', () => {
const token = makeToken({
surface: 'くれ',
headword: '暮れ',
reading: 'クレ',
partOfSpeech: PartOfSpeech.noun,
pos1: '名詞',
pos2: '一般',
});

assert.equal(shouldExcludeTokenFromSubtitleAnnotations(token), true);
});

test('shouldExcludeTokenFromSubtitleAnnotations excludes standalone quote particle and auxiliary grammar terms', () => {
for (const token of [
makeToken({
surface: 'って',
headword: 'って',
reading: 'ッテ',
partOfSpeech: PartOfSpeech.other,
pos1: '',
pos2: '',
}),
makeToken({
surface: 'べき',
headword: 'べき',
reading: 'ベキ',
partOfSpeech: PartOfSpeech.other,
pos1: '',
pos2: '',
}),
]) {
assert.equal(shouldExcludeTokenFromSubtitleAnnotations(token), true, token.surface);
}
});

test('shouldExcludeTokenFromSubtitleAnnotations excludes single-kana surface fragments', () => {
for (const token of [
makeToken({
surface: 'ふ',
headword: '不',
reading: 'フ',
partOfSpeech: PartOfSpeech.other,
pos1: '接頭詞',
pos2: '',
}),
makeToken({
surface: 'フ',
headword: '負',
reading: 'フ',
partOfSpeech: PartOfSpeech.noun,
pos1: '名詞',
pos2: '一般',
}),
]) {
assert.equal(shouldExcludeTokenFromSubtitleAnnotations(token), true, token.surface);
}
});

test('stripSubtitleAnnotationMetadata keeps token hover data while clearing annotation fields', () => {
const token = makeToken({
surface: 'は',
@@ -536,6 +662,57 @@ test('annotateTokens N+1 minimum sentence words counts only eligible word tokens
assert.equal(result[0]?.isNPlusOneTarget, false);
});

test('annotateTokens N+1 sentence word count respects source punctuation gaps omitted by Yomitan', () => {
const tokens = [
makeToken({
surface: '私',
headword: '私',
pos1: '名詞',
startPos: 0,
endPos: 1,
}),
makeToken({
surface: '猫',
headword: '猫',
pos1: '名詞',
startPos: 1,
endPos: 2,
}),
makeToken({
surface: '犬',
headword: '犬',
pos1: '名詞',
startPos: 2,
endPos: 3,
}),
makeToken({
surface: 'ふざけん',
headword: 'ふざける',
partOfSpeech: PartOfSpeech.verb,
pos1: '動詞',
pos2: '自立',
startPos: 4,
endPos: 8,
}),
];

const result = annotateTokens(
tokens,
makeDeps({
isKnownWord: (text) => text === '私' || text === '猫' || text === '犬',
}),
{
minSentenceWordsForNPlusOne: 3,
sourceText: '私猫犬!ふざけんなよ!',
},
);

assert.equal(result[0]?.isNPlusOneTarget, false);
assert.equal(result[1]?.isNPlusOneTarget, false);
assert.equal(result[2]?.isNPlusOneTarget, false);
assert.equal(result[3]?.isNPlusOneTarget, false);
});

test('annotateTokens applies configured pos1 exclusions to both frequency and N+1', () => {
const tokens = [
makeToken({
@@ -610,14 +787,52 @@ test('annotateTokens excludes default non-independent pos2 from frequency and N+
}),
];

const result = annotateTokens(tokens, makeDeps(), {
minSentenceWordsForNPlusOne: 1,
});
const result = annotateTokens(
tokens,
makeDeps({
isKnownWord: (text) => text === 'た' || text === '負',
getJlptLevel: (text) => (text === 'た' || text === '負' ? 'N3' : null),
}),
{
minSentenceWordsForNPlusOne: 1,
},
);

assert.equal(result[0]?.frequencyRank, undefined);
assert.equal(result[0]?.isNPlusOneTarget, false);
});

test('annotateTokens preserves exact known-word status for non-independent kanji noun tokens', () => {
const tokens = [
makeToken({
surface: '点',
reading: 'てん',
headword: '点',
partOfSpeech: PartOfSpeech.other,
pos1: '名詞',
pos2: '非自立',
pos3: '一般',
startPos: 2,
endPos: 3,
frequencyRank: 1384,
}),
];

const result = annotateTokens(
tokens,
makeDeps({
isKnownWord: (text) => text === '点' || text === 'てん',
getJlptLevel: (text) => (text === '点' ? 'N3' : null),
}),
{ minSentenceWordsForNPlusOne: 1 },
);

assert.equal(result[0]?.isKnown, true);
assert.equal(result[0]?.isNPlusOneTarget, false);
assert.equal(result[0]?.frequencyRank, undefined);
assert.equal(result[0]?.jlptLevel, undefined);
});

test('annotateTokens clears all annotations for non-independent kanji noun tokens under unified gate', () => {
const tokens = [
makeToken({
@@ -665,7 +880,7 @@ test('annotateTokens excludes likely kana SFX tokens from frequency when POS tag
assert.equal(result[0]?.frequencyRank, undefined);
});

test('annotateTokens excludes single hiragana and katakana tokens from frequency when POS tags are missing', () => {
test('annotateTokens clears all annotations from single hiragana and katakana surface fragments', () => {
const tokens = [
makeToken({
surface: 'た',
@@ -679,12 +894,12 @@ test('annotateTokens excludes single hiragana and katakana tokens from frequency
endPos: 1,
}),
makeToken({
surface: 'ア',
reading: 'ア',
headword: 'ア',
pos1: '',
surface: 'フ',
reading: 'フ',
headword: '負',
pos1: '名詞',
pos2: '',
partOfSpeech: PartOfSpeech.other,
partOfSpeech: PartOfSpeech.noun,
frequencyRank: 22,
startPos: 1,
endPos: 2,
@@ -706,8 +921,14 @@ test('annotateTokens excludes single hiragana and katakana tokens from frequency
minSentenceWordsForNPlusOne: 1,
});

assert.equal(result[0]?.isKnown, false);
assert.equal(result[0]?.isNPlusOneTarget, false);
assert.equal(result[0]?.frequencyRank, undefined);
assert.equal(result[0]?.jlptLevel, undefined);
assert.equal(result[1]?.isKnown, false);
assert.equal(result[1]?.isNPlusOneTarget, false);
assert.equal(result[1]?.frequencyRank, undefined);
assert.equal(result[1]?.jlptLevel, undefined);
assert.equal(result[2]?.frequencyRank, 23);
});

@@ -856,6 +1077,219 @@ test('annotateTokens clears all annotations for kana-only non-independent noun h
assert.equal(result[0]?.jlptLevel, undefined);
});

test('annotateTokens clears all annotations for standalone して helper fragments', () => {
const tokens = [
makeToken({
surface: 'してる',
headword: 'する',
reading: 'シテル',
partOfSpeech: PartOfSpeech.verb,
pos1: '動詞|助動詞',
pos2: '自立|非自立',
startPos: 0,
endPos: 3,
frequencyRank: 22,
}),
];

const result = annotateTokens(
tokens,
makeDeps({
isKnownWord: (text) => text === 'する',
getJlptLevel: (text) => (text === 'する' ? 'N5' : null),
}),
{ minSentenceWordsForNPlusOne: 1 },
);

assert.equal(result[0]?.isKnown, false);
assert.equal(result[0]?.isNPlusOneTarget, false);
assert.equal(result[0]?.frequencyRank, undefined);
assert.equal(result[0]?.jlptLevel, undefined);
});

test('annotateTokens clears all annotations for standalone particle fragments without POS tags', () => {
const tokens = [
makeToken({
surface: 'と',
headword: 'と',
reading: 'ト',
partOfSpeech: PartOfSpeech.other,
pos1: '',
pos2: '',
startPos: 0,
endPos: 1,
frequencyRank: 4,
}),
];

const result = annotateTokens(
tokens,
makeDeps({
isKnownWord: (text) => text === 'と',
getJlptLevel: (text) => (text === 'と' ? 'N5' : null),
}),
{ minSentenceWordsForNPlusOne: 1 },
);

assert.equal(result[0]?.isKnown, false);
assert.equal(result[0]?.isNPlusOneTarget, false);
assert.equal(result[0]?.frequencyRank, undefined);
assert.equal(result[0]?.jlptLevel, undefined);
});

test('annotateTokens does not mark standalone connective particles as N+1', () => {
const tokens = [
makeToken({
surface: '逃げる',
headword: '逃げる',
reading: 'ニゲル',
partOfSpeech: PartOfSpeech.verb,
pos1: '動詞',
pos2: '自立',
startPos: 0,
endPos: 3,
}),
makeToken({
surface: 'たって',
headword: 'たって',
reading: 'タッテ',
partOfSpeech: PartOfSpeech.other,
pos1: '',
pos2: '',
startPos: 3,
endPos: 6,
frequencyRank: 28,
}),
makeToken({
surface: '無駄',
headword: '無駄',
reading: 'ムダ',
partOfSpeech: PartOfSpeech.noun,
pos1: '名詞',
pos2: '形容動詞語幹',
startPos: 6,
endPos: 8,
}),
];

const result = annotateTokens(
tokens,
makeDeps({
isKnownWord: (text) => text === '逃げる' || text === '無駄',
getJlptLevel: (text) => (text === 'たって' ? 'N3' : null),
}),
{ minSentenceWordsForNPlusOne: 1 },
);

assert.equal(result[1]?.isKnown, false);
assert.equal(result[1]?.isNPlusOneTarget, false);
assert.equal(result[1]?.frequencyRank, undefined);
assert.equal(result[1]?.jlptLevel, undefined);
});

test('annotateTokens clears all annotations for rhetorical もんか grammar particle phrases', () => {
const tokens = [
makeToken({
surface: 'もんか',
headword: 'もんか',
reading: 'モンカ',
partOfSpeech: PartOfSpeech.noun,
pos1: '名詞|助詞',
pos2: '非自立|副助詞/並立助詞/終助詞',
startPos: 0,
endPos: 3,
frequencyRank: 69629,
}),
];

const result = annotateTokens(
tokens,
makeDeps({
isKnownWord: (text) => text === 'もんか',
getJlptLevel: (text) => (text === 'もんか' ? 'N2' : null),
}),
{ minSentenceWordsForNPlusOne: 1 },
);

assert.equal(result[0]?.isKnown, false);
assert.equal(result[0]?.isNPlusOneTarget, false);
assert.equal(result[0]?.frequencyRank, undefined);
assert.equal(result[0]?.jlptLevel, undefined);
});

test('annotateTokens clears all annotations for bare くれ auxiliary fragments', () => {
const tokens = [
makeToken({
surface: 'くれ',
headword: '暮れ',
reading: 'クレ',
partOfSpeech: PartOfSpeech.noun,
pos1: '名詞',
pos2: '一般',
startPos: 0,
endPos: 2,
frequencyRank: 12877,
}),
];

const result = annotateTokens(
tokens,
makeDeps({
isKnownWord: (text) => text === '暮れ',
getJlptLevel: (text) => (text === '暮れ' ? 'N3' : null),
}),
{ minSentenceWordsForNPlusOne: 1 },
);

assert.equal(result[0]?.isKnown, false);
assert.equal(result[0]?.isNPlusOneTarget, false);
assert.equal(result[0]?.frequencyRank, undefined);
assert.equal(result[0]?.jlptLevel, undefined);
});

test('annotateTokens clears all annotations for standalone quote particle and auxiliary grammar terms', () => {
const tokens = [
makeToken({
surface: 'って',
headword: 'って',
reading: 'ッテ',
partOfSpeech: PartOfSpeech.other,
pos1: '',
pos2: '',
startPos: 0,
endPos: 2,
frequencyRank: 28,
}),
makeToken({
surface: 'べき',
headword: 'べき',
reading: 'ベキ',
partOfSpeech: PartOfSpeech.other,
pos1: '',
pos2: '',
startPos: 2,
endPos: 4,
frequencyRank: 268,
}),
];

const result = annotateTokens(
tokens,
makeDeps({
isKnownWord: (text) => text === 'って' || text === 'べき',
getJlptLevel: (text) => (text === 'って' || text === 'べき' ? 'N3' : null),
}),
{ minSentenceWordsForNPlusOne: 1 },
);

for (const token of result) {
assert.equal(token.isKnown, false, token.surface);
assert.equal(token.isNPlusOneTarget, false, token.surface);
assert.equal(token.frequencyRank, undefined, token.surface);
assert.equal(token.jlptLevel, undefined, token.surface);
}
});

test('annotateTokens clears all annotations from standalone あ interjections without POS tags', () => {
const tokens = [
makeToken({

@@ -89,6 +89,7 @@ export interface AnnotationStageOptions {
minSentenceWordsForNPlusOne?: number;
pos1Exclusions?: ReadonlySet<string>;
pos2Exclusions?: ReadonlySet<string>;
sourceText?: string;
}

function resolveKnownWordText(
@@ -670,6 +671,36 @@ function computeTokenKnownStatus(
return normalizedReading !== matchText.trim() && isKnownWord(normalizedReading);
}

function computeExcludedTokenKnownStatus(
token: MergedToken,
isKnownWord: (text: string) => boolean,
): boolean {
const normalizedSurface = token.surface.trim();
if (!hasKanjiChar(normalizedSurface)) {
return false;
}

if (normalizedSurface && isKnownWord(normalizedSurface)) {
return true;
}

const normalizedReading = token.reading.trim();
if (
normalizedReading &&
normalizedReading !== normalizedSurface &&
isKnownWord(normalizedReading)
) {
return true;
}

const normalizedHeadword = token.headword.trim();
return (
normalizedHeadword.length > 0 &&
normalizedHeadword === normalizedSurface &&
isKnownWord(normalizedHeadword)
);
}

function filterTokenFrequencyRank(
token: MergedToken,
pos1Exclusions: ReadonlySet<string>,
@@ -732,10 +763,16 @@ export function annotateTokens(
pos2Exclusions,
})
) {
return sharedStripSubtitleAnnotationMetadata(token, {
const strippedToken = sharedStripSubtitleAnnotationMetadata(token, {
pos1Exclusions,
pos2Exclusions,
});
return {
...strippedToken,
isKnown:
nPlusOneEnabled &&
computeExcludedTokenKnownStatus(token, deps.isKnownWord),
};
}

const prioritizedNameMatch = nameMatchEnabled && token.isNameMatch === true;
@@ -779,6 +816,7 @@ export function annotateTokens(
sanitizedMinSentenceWordsForNPlusOne,
pos1Exclusions,
pos2Exclusions,
options.sourceText,
);

if (!nameMatchEnabled) {

@@ -303,7 +303,9 @@ function fillMissingPos1BySurfaceSequence(

let cursor = 0;
return tokens.map((token) => {
if (token.pos1 && token.pos1.trim().length > 0) {
const hasCompletePosMetadata =
token.pos1?.trim() && token.pos2?.trim() && token.pos3?.trim();
if (hasCompletePosMetadata) {
return token;
}

@@ -327,9 +329,9 @@ function fillMissingPos1BySurfaceSequence(
cursor = best.index + 1;
return {
...token,
pos1: best.pos1,
pos2: best.pos2,
pos3: best.pos3,
pos1: token.pos1 ?? best.pos1,
pos2: token.pos2 ?? best.pos2,
pos3: token.pos3 ?? best.pos3,
};
});
}
@@ -382,7 +384,7 @@ export function enrichTokensWithMecabPos1(
const metadataByTokenIndex = new Map<number, MecabPosMetadata>();

for (const [index, token] of tokens.entries()) {
if (token.pos1) {
if (token.pos1?.trim() && token.pos2?.trim() && token.pos3?.trim()) {
continue;
}

@@ -410,9 +412,9 @@ export function enrichTokensWithMecabPos1(

return {
...token,
pos1: metadata.pos1,
pos2: metadata.pos2,
pos3: metadata.pos3,
pos1: token.pos1 ?? metadata.pos1,
pos2: token.pos2 ?? metadata.pos2,
pos3: token.pos3 ?? metadata.pos3,
};
});

@@ -19,11 +19,18 @@ const SUBTITLE_ANNOTATION_EXCLUDED_TERMS = new Set([
'ええ',
'うう',
'おお',
'くれ',
'たって',
'って',
'だって',
'はあ',
'はは',
'べき',
'へえ',
'ふう',
'ほう',
'もんか',
'ものか',
]);
const SUBTITLE_ANNOTATION_EXCLUDED_EXPLANATORY_ENDING_PREFIXES = ['ん', 'の', 'なん', 'なの'];
const SUBTITLE_ANNOTATION_EXCLUDED_EXPLANATORY_ENDING_CORES = [
@@ -72,6 +79,26 @@ const SUBTITLE_ANNOTATION_EXCLUDED_TRAILING_PARTICLE_SUFFIXES = new Set([
]);
const AUXILIARY_STEM_GRAMMAR_TAIL_POS1 = new Set(['名詞', '助動詞', '助詞']);
const NON_INDEPENDENT_NOUN_HELPER_TAIL_POS1 = new Set(['助詞', '助動詞']);
const STANDALONE_GRAMMAR_PARTICLE_SURFACES = new Set([
'か',
'が',
'さ',
'し',
'ぞ',
'ぜ',
'と',
'な',
'に',
'ね',
'の',
'は',
'へ',
'も',
'や',
'よ',
'を',
]);
const STANDALONE_GRAMMAR_PARTICLE_PHRASES = new Set(['たって', 'だって']);

export interface SubtitleAnnotationFilterOptions {
pos1Exclusions?: ReadonlySet<string>;
@@ -278,6 +305,38 @@ function isKanaOnlyNonIndependentNounHelperMerge(token: MergedToken): boolean {
return pos1Parts.slice(1).every((part) => NON_INDEPENDENT_NOUN_HELPER_TAIL_POS1.has(part));
}

function isKanaOnlyText(text: string): boolean {
const normalized = normalizeKana(text);
return normalized.length > 0 && [...normalized].every(isKanaChar);
}

function isStandaloneSuruTeGrammarHelper(token: MergedToken): boolean {
const normalizedSurface = normalizeKana(token.surface);
const normalizedHeadword = normalizeKana(token.headword);
if (!normalizedSurface.startsWith('して') || normalizedHeadword !== 'する') {
return false;
}

const pos1Parts = splitNormalizedTagParts(normalizePosTag(token.pos1));
return isKanaOnlyText(normalizedSurface) && (pos1Parts.length === 0 || pos1Parts.includes('動詞'));
}

function isStandaloneGrammarParticle(token: MergedToken): boolean {
const normalizedSurface = normalizeKana(token.surface);
const normalizedHeadword = normalizeKana(token.headword);
return (
normalizedSurface === normalizedHeadword &&
(STANDALONE_GRAMMAR_PARTICLE_SURFACES.has(normalizedSurface) ||
STANDALONE_GRAMMAR_PARTICLE_PHRASES.has(normalizedSurface))
);
}

function isSingleKanaSurfaceFragment(token: MergedToken): boolean {
const normalizedSurface = normalizeKana(token.surface);
const chars = [...normalizedSurface];
return chars.length === 1 && chars.every(isKanaChar);
}

function isExcludedByTerm(token: MergedToken): boolean {
const candidates = [token.surface, token.reading, token.headword].filter(
(candidate): candidate is string => typeof candidate === 'string' && candidate.length > 0,
@@ -365,6 +424,18 @@ export function shouldExcludeTokenFromSubtitleAnnotations(
return true;
}

if (isStandaloneSuruTeGrammarHelper(token)) {
return true;
}

if (isStandaloneGrammarParticle(token)) {
return true;
}

if (isSingleKanaSurfaceFragment(token)) {
return true;
}

if (isExcludedTrailingParticleMergedToken(token)) {
return true;
}

@@ -1049,6 +1049,60 @@ test('requestYomitanScanTokens marks grouped entries when SubMiner dictionary al
assert.equal((result as Array<{ isNameMatch?: boolean }>)[0]?.isNameMatch, true);
});

test('requestYomitanScanTokens preserves matched headword word classes', async () => {
let scannerScript = '';
const deps = createDeps(async (script) => {
if (script.includes('termsFind')) {
scannerScript = script;
return [];
}
if (script.includes('optionsGetFull')) {
return {
profileCurrent: 0,
profiles: [
{
options: {
scanning: { length: 40 },
},
},
],
};
}
return null;
});

await requestYomitanScanTokens('は', deps, { error: () => undefined });

const result = await runInjectedYomitanScript(scannerScript, (action, params) => {
if (action !== 'termsFind') {
throw new Error(`unexpected action: ${action}`);
}

const text = (params as { text?: string } | undefined)?.text;
if (text !== 'は') {
return { originalTextLength: 0, dictionaryEntries: [] };
}

return {
originalTextLength: 1,
dictionaryEntries: [
{
headwords: [
{
term: 'は',
reading: 'は',
wordClasses: ['prt'],
sources: [{ originalText: 'は', isPrimary: true, matchType: 'exact' }],
},
],
},
],
};
});

assert.deepEqual((result as Array<{ wordClasses?: string[] }>)[0]?.wordClasses, ['prt']);
});

test('requestYomitanScanTokens skips fallback fragments without exact primary source matches', async () => {
const deps = createDeps(async (script) => {
if (script.includes('optionsGetFull')) {

@@ -53,6 +53,7 @@ export interface YomitanScanToken {
endPos: number;
isNameMatch?: boolean;
frequencyRank?: number;
wordClasses?: string[];
}

interface YomitanProfileMetadata {
@@ -91,7 +92,10 @@ function isScanTokenArray(value: unknown): value is YomitanScanToken[] {
typeof entry.startPos === 'number' &&
typeof entry.endPos === 'number' &&
(entry.isNameMatch === undefined || typeof entry.isNameMatch === 'boolean') &&
(entry.frequencyRank === undefined || typeof entry.frequencyRank === 'number'),
(entry.frequencyRank === undefined || typeof entry.frequencyRank === 'number') &&
(entry.wordClasses === undefined ||
(Array.isArray(entry.wordClasses) &&
entry.wordClasses.every((wordClass) => typeof wordClass === 'string'))),
)
);
}
@@ -975,6 +979,11 @@ const YOMITAN_SCANNING_HELPERS = String.raw`
return best;
}
function getPreferredHeadword(dictionaryEntries, token, dictionaryPriorityByName, dictionaryFrequencyModeByName) {
function normalizeWordClasses(headword) {
if (!Array.isArray(headword?.wordClasses)) { return undefined; }
const classes = headword.wordClasses.filter((wordClass) => typeof wordClass === "string" && wordClass.trim().length > 0);
return classes.length > 0 ? classes : undefined;
}
function appendDictionaryNames(target, value) {
if (!value || typeof value !== 'object') {
return;
@@ -1033,6 +1042,7 @@ const YOMITAN_SCANNING_HELPERS = String.raw`
return {
term: preferredMatch.headword.term,
reading: preferredMatch.headword.reading,
wordClasses: normalizeWordClasses(preferredMatch.headword),
isNameMatch: matchedNameDictionary || isNameDictionaryEntry(preferredMatch.dictionaryEntry),
frequencyRank: getBestFrequencyRankForMatches(
exactFrequencyMatches.length > 0 ? exactFrequencyMatches : exactPrimaryMatches,
@@ -1099,7 +1109,7 @@ ${YOMITAN_SCANNING_HELPERS}
if (preferredHeadword && typeof preferredHeadword.term === "string") {
const reading = typeof preferredHeadword.reading === "string" ? preferredHeadword.reading : "";
const segments = distributeFuriganaInflected(preferredHeadword.term, reading, source);
tokens.push({
const tokenPayload = {
surface: segments.map((segment) => segment.text).join("") || source,
reading: segments.map((segment) => typeof segment.reading === "string" ? segment.reading : "").join(""),
headword: preferredHeadword.term,
@@ -1110,7 +1120,11 @@ ${YOMITAN_SCANNING_HELPERS}
typeof preferredHeadword.frequencyRank === "number" && Number.isFinite(preferredHeadword.frequencyRank)
? Math.max(1, Math.floor(preferredHeadword.frequencyRank))
: undefined,
});
};
if (Array.isArray(preferredHeadword.wordClasses) && preferredHeadword.wordClasses.length > 0) {
tokenPayload.wordClasses = preferredHeadword.wordClasses;
}
tokens.push(tokenPayload);
i += originalTextLength;
continue;
}

@@ -347,11 +347,25 @@ function isSentenceBoundaryToken(token: MergedToken): boolean {
return SENTENCE_BOUNDARY_SURFACES.has(token.surface);
}

function hasSentenceBoundaryInSourceGap(
sourceText: string | undefined,
previousEnd: number | null,
nextStart: number,
): boolean {
if (typeof sourceText !== 'string' || previousEnd === null || nextStart <= previousEnd) {
return false;
}

const gap = sourceText.slice(previousEnd, nextStart);
return [...gap].some((char) => SENTENCE_BOUNDARY_SURFACES.has(char));
}

export function markNPlusOneTargets(
tokens: MergedToken[],
minSentenceWords = 3,
pos1Exclusions: ReadonlySet<string> = N_PLUS_ONE_IGNORED_POS1,
pos2Exclusions: ReadonlySet<string> = N_PLUS_ONE_IGNORED_POS2,
sourceText?: string,
): MergedToken[] {
if (tokens.length === 0) {
return [];
@@ -363,6 +377,7 @@ export function markNPlusOneTargets(
}));

let sentenceStart = 0;
let previousTokenEnd: number | null = null;
const minimumSentenceWords = Number.isInteger(minSentenceWords)
? Math.max(1, minSentenceWords)
: 3;
@@ -393,10 +408,15 @@ export function markNPlusOneTargets(
for (let i = 0; i < markedTokens.length; i++) {
const token = markedTokens[i];
if (!token) continue;
if (hasSentenceBoundaryInSourceGap(sourceText, previousTokenEnd, token.startPos)) {
markSentence(sentenceStart, i);
sentenceStart = i;
}
if (isSentenceBoundaryToken(token)) {
markSentence(sentenceStart, i);
sentenceStart = i + 1;
}
previousTokenEnd = token.endPos;
}

if (sentenceStart < markedTokens.length) {