mirror of
https://github.com/ksyasuda/SubMiner.git
synced 2026-03-01 18:22:41 -08:00
fix(tokenizer): tighten n+1 eligibility using mecab pos overlaps
This commit is contained in:
@@ -2038,3 +2038,125 @@ test('tokenizeSubtitle keeps frequency enrichment while n+1 is disabled', async
|
|||||||
assert.equal(mecabCalls, 1);
|
assert.equal(mecabCalls, 1);
|
||||||
assert.equal(frequencyCalls, 1);
|
assert.equal(frequencyCalls, 1);
|
||||||
});
|
});
|
||||||
|
|
||||||
|
|
||||||
|
test('tokenizeSubtitle excludes default non-independent pos2 from N+1 and frequency annotations', async () => {
|
||||||
|
const result = await tokenizeSubtitle(
|
||||||
|
'になれば',
|
||||||
|
makeDepsFromYomitanTokens([{ surface: 'になれば', reading: 'になれば', headword: 'なる' }], {
|
||||||
|
getFrequencyDictionaryEnabled: () => true,
|
||||||
|
getFrequencyRank: (text) => (text === 'なる' ? 11 : null),
|
||||||
|
tokenizeWithMecab: async () => [
|
||||||
|
{
|
||||||
|
headword: 'なる',
|
||||||
|
surface: 'になれば',
|
||||||
|
reading: 'ニナレバ',
|
||||||
|
startPos: 0,
|
||||||
|
endPos: 4,
|
||||||
|
partOfSpeech: PartOfSpeech.verb,
|
||||||
|
pos1: '動詞',
|
||||||
|
pos2: '非自立',
|
||||||
|
isMerged: true,
|
||||||
|
isKnown: false,
|
||||||
|
isNPlusOneTarget: false,
|
||||||
|
},
|
||||||
|
],
|
||||||
|
getMinSentenceWordsForNPlusOne: () => 1,
|
||||||
|
}),
|
||||||
|
);
|
||||||
|
|
||||||
|
assert.equal(result.tokens?.length, 1);
|
||||||
|
assert.equal(result.tokens?.[0]?.frequencyRank, undefined);
|
||||||
|
assert.equal(result.tokens?.[0]?.isNPlusOneTarget, false);
|
||||||
|
});
|
||||||
|
|
||||||
|
test('tokenizeSubtitle keeps merged token when overlap contains at least one content pos1 tag', async () => {
|
||||||
|
const result = await tokenizeSubtitle(
|
||||||
|
'になれば',
|
||||||
|
makeDepsFromYomitanTokens([{ surface: 'になれば', reading: 'になれば', headword: 'なる' }], {
|
||||||
|
getFrequencyDictionaryEnabled: () => true,
|
||||||
|
getFrequencyRank: (text) => (text === 'なる' ? 13 : null),
|
||||||
|
tokenizeWithMecab: async () => [
|
||||||
|
{
|
||||||
|
headword: 'に',
|
||||||
|
surface: 'に',
|
||||||
|
reading: 'ニ',
|
||||||
|
startPos: 0,
|
||||||
|
endPos: 1,
|
||||||
|
partOfSpeech: PartOfSpeech.particle,
|
||||||
|
pos1: '助詞',
|
||||||
|
pos2: '格助詞',
|
||||||
|
isMerged: false,
|
||||||
|
isKnown: false,
|
||||||
|
isNPlusOneTarget: false,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
headword: 'なる',
|
||||||
|
surface: 'なれ',
|
||||||
|
reading: 'ナレ',
|
||||||
|
startPos: 1,
|
||||||
|
endPos: 3,
|
||||||
|
partOfSpeech: PartOfSpeech.verb,
|
||||||
|
pos1: '動詞',
|
||||||
|
pos2: '自立',
|
||||||
|
isMerged: false,
|
||||||
|
isKnown: false,
|
||||||
|
isNPlusOneTarget: false,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
headword: 'ば',
|
||||||
|
surface: 'ば',
|
||||||
|
reading: 'バ',
|
||||||
|
startPos: 3,
|
||||||
|
endPos: 4,
|
||||||
|
partOfSpeech: PartOfSpeech.particle,
|
||||||
|
pos1: '助詞',
|
||||||
|
pos2: '接続助詞',
|
||||||
|
isMerged: false,
|
||||||
|
isKnown: false,
|
||||||
|
isNPlusOneTarget: false,
|
||||||
|
},
|
||||||
|
],
|
||||||
|
getMinSentenceWordsForNPlusOne: () => 1,
|
||||||
|
}),
|
||||||
|
);
|
||||||
|
|
||||||
|
assert.equal(result.tokens?.length, 1);
|
||||||
|
assert.equal(result.tokens?.[0]?.pos1, '助詞|動詞');
|
||||||
|
assert.equal(result.tokens?.[0]?.frequencyRank, 13);
|
||||||
|
assert.equal(result.tokens?.[0]?.isNPlusOneTarget, true);
|
||||||
|
});
|
||||||
|
|
||||||
|
test('tokenizeSubtitle excludes default non-independent pos2 from N+1 when JLPT/frequency are disabled', async () => {
|
||||||
|
let mecabCalls = 0;
|
||||||
|
const result = await tokenizeSubtitle(
|
||||||
|
'になれば',
|
||||||
|
makeDepsFromYomitanTokens([{ surface: 'になれば', reading: 'になれば', headword: 'なる' }], {
|
||||||
|
getJlptEnabled: () => false,
|
||||||
|
getFrequencyDictionaryEnabled: () => false,
|
||||||
|
getMinSentenceWordsForNPlusOne: () => 1,
|
||||||
|
tokenizeWithMecab: async () => {
|
||||||
|
mecabCalls += 1;
|
||||||
|
return [
|
||||||
|
{
|
||||||
|
headword: 'なる',
|
||||||
|
surface: 'になれば',
|
||||||
|
reading: 'ニナレバ',
|
||||||
|
startPos: 0,
|
||||||
|
endPos: 4,
|
||||||
|
partOfSpeech: PartOfSpeech.verb,
|
||||||
|
pos1: '動詞',
|
||||||
|
pos2: '非自立',
|
||||||
|
isMerged: true,
|
||||||
|
isKnown: false,
|
||||||
|
isNPlusOneTarget: false,
|
||||||
|
},
|
||||||
|
];
|
||||||
|
},
|
||||||
|
}),
|
||||||
|
);
|
||||||
|
|
||||||
|
assert.equal(mecabCalls, 1);
|
||||||
|
assert.equal(result.tokens?.length, 1);
|
||||||
|
assert.equal(result.tokens?.[0]?.isNPlusOneTarget, false);
|
||||||
|
});
|
||||||
|
|||||||
@@ -10,6 +10,14 @@ import {
|
|||||||
FrequencyDictionaryLookup,
|
FrequencyDictionaryLookup,
|
||||||
JlptLevel,
|
JlptLevel,
|
||||||
} from '../../types';
|
} from '../../types';
|
||||||
|
import {
|
||||||
|
DEFAULT_ANNOTATION_POS1_EXCLUSION_CONFIG,
|
||||||
|
resolveAnnotationPos1ExclusionSet,
|
||||||
|
} from '../../token-pos1-exclusions';
|
||||||
|
import {
|
||||||
|
DEFAULT_ANNOTATION_POS2_EXCLUSION_CONFIG,
|
||||||
|
resolveAnnotationPos2ExclusionSet,
|
||||||
|
} from '../../token-pos2-exclusions';
|
||||||
import { selectYomitanParseTokens } from './tokenizer/parser-selection-stage';
|
import { selectYomitanParseTokens } from './tokenizer/parser-selection-stage';
|
||||||
import {
|
import {
|
||||||
requestYomitanParseResults,
|
requestYomitanParseResults,
|
||||||
@@ -78,6 +86,8 @@ interface TokenizerAnnotationOptions {
|
|||||||
frequencyEnabled: boolean;
|
frequencyEnabled: boolean;
|
||||||
frequencyMatchMode: FrequencyDictionaryMatchMode;
|
frequencyMatchMode: FrequencyDictionaryMatchMode;
|
||||||
minSentenceWordsForNPlusOne: number | undefined;
|
minSentenceWordsForNPlusOne: number | undefined;
|
||||||
|
pos1Exclusions: ReadonlySet<string>;
|
||||||
|
pos2Exclusions: ReadonlySet<string>;
|
||||||
}
|
}
|
||||||
|
|
||||||
let parserEnrichmentWorkerRuntimeModulePromise:
|
let parserEnrichmentWorkerRuntimeModulePromise:
|
||||||
@@ -87,6 +97,12 @@ let annotationStageModulePromise: Promise<typeof import('./tokenizer/annotation-
|
|||||||
let parserEnrichmentFallbackModulePromise:
|
let parserEnrichmentFallbackModulePromise:
|
||||||
| Promise<typeof import('./tokenizer/parser-enrichment-stage')>
|
| Promise<typeof import('./tokenizer/parser-enrichment-stage')>
|
||||||
| null = null;
|
| null = null;
|
||||||
|
const DEFAULT_ANNOTATION_POS1_EXCLUSIONS = resolveAnnotationPos1ExclusionSet(
|
||||||
|
DEFAULT_ANNOTATION_POS1_EXCLUSION_CONFIG,
|
||||||
|
);
|
||||||
|
const DEFAULT_ANNOTATION_POS2_EXCLUSIONS = resolveAnnotationPos2ExclusionSet(
|
||||||
|
DEFAULT_ANNOTATION_POS2_EXCLUSION_CONFIG,
|
||||||
|
);
|
||||||
|
|
||||||
function getKnownWordLookup(deps: TokenizerServiceDeps, options: TokenizerAnnotationOptions): (text: string) => boolean {
|
function getKnownWordLookup(deps: TokenizerServiceDeps, options: TokenizerAnnotationOptions): (text: string) => boolean {
|
||||||
if (!options.nPlusOneEnabled) {
|
if (!options.nPlusOneEnabled) {
|
||||||
@@ -96,7 +112,7 @@ function getKnownWordLookup(deps: TokenizerServiceDeps, options: TokenizerAnnota
|
|||||||
}
|
}
|
||||||
|
|
||||||
function needsMecabPosEnrichment(options: TokenizerAnnotationOptions): boolean {
|
function needsMecabPosEnrichment(options: TokenizerAnnotationOptions): boolean {
|
||||||
return options.jlptEnabled || options.frequencyEnabled;
|
return options.nPlusOneEnabled || options.jlptEnabled || options.frequencyEnabled;
|
||||||
}
|
}
|
||||||
|
|
||||||
function hasAnyAnnotationEnabled(options: TokenizerAnnotationOptions): boolean {
|
function hasAnyAnnotationEnabled(options: TokenizerAnnotationOptions): boolean {
|
||||||
@@ -389,6 +405,8 @@ function getAnnotationOptions(deps: TokenizerServiceDeps): TokenizerAnnotationOp
|
|||||||
frequencyEnabled: deps.getFrequencyDictionaryEnabled?.() !== false,
|
frequencyEnabled: deps.getFrequencyDictionaryEnabled?.() !== false,
|
||||||
frequencyMatchMode: deps.getFrequencyDictionaryMatchMode?.() ?? 'headword',
|
frequencyMatchMode: deps.getFrequencyDictionaryMatchMode?.() ?? 'headword',
|
||||||
minSentenceWordsForNPlusOne: deps.getMinSentenceWordsForNPlusOne?.(),
|
minSentenceWordsForNPlusOne: deps.getMinSentenceWordsForNPlusOne?.(),
|
||||||
|
pos1Exclusions: DEFAULT_ANNOTATION_POS1_EXCLUSIONS,
|
||||||
|
pos2Exclusions: DEFAULT_ANNOTATION_POS2_EXCLUSIONS,
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -205,3 +205,171 @@ test('annotateTokens N+1 minimum sentence words counts only eligible word tokens
|
|||||||
assert.equal(result[2]?.isKnown, true);
|
assert.equal(result[2]?.isKnown, true);
|
||||||
assert.equal(result[0]?.isNPlusOneTarget, false);
|
assert.equal(result[0]?.isNPlusOneTarget, false);
|
||||||
});
|
});
|
||||||
|
|
||||||
|
test('annotateTokens applies configured pos1 exclusions to both frequency and N+1', () => {
|
||||||
|
const tokens = [
|
||||||
|
makeToken({
|
||||||
|
surface: '猫',
|
||||||
|
headword: '猫',
|
||||||
|
pos1: '名詞',
|
||||||
|
frequencyRank: 21,
|
||||||
|
startPos: 0,
|
||||||
|
endPos: 1,
|
||||||
|
}),
|
||||||
|
makeToken({
|
||||||
|
surface: '走る',
|
||||||
|
headword: '走る',
|
||||||
|
pos1: '動詞',
|
||||||
|
partOfSpeech: PartOfSpeech.verb,
|
||||||
|
startPos: 1,
|
||||||
|
endPos: 3,
|
||||||
|
frequencyRank: 22,
|
||||||
|
}),
|
||||||
|
];
|
||||||
|
|
||||||
|
const result = annotateTokens(
|
||||||
|
tokens,
|
||||||
|
makeDeps({
|
||||||
|
isKnownWord: (text) => text === '走る',
|
||||||
|
}),
|
||||||
|
{
|
||||||
|
minSentenceWordsForNPlusOne: 1,
|
||||||
|
pos1Exclusions: new Set(['名詞']),
|
||||||
|
},
|
||||||
|
);
|
||||||
|
|
||||||
|
assert.equal(result[0]?.frequencyRank, undefined);
|
||||||
|
assert.equal(result[1]?.frequencyRank, 22);
|
||||||
|
assert.equal(result[0]?.isNPlusOneTarget, false);
|
||||||
|
assert.equal(result[1]?.isNPlusOneTarget, false);
|
||||||
|
});
|
||||||
|
|
||||||
|
test('annotateTokens allows previously default-excluded pos1 when removed from effective set', () => {
|
||||||
|
const tokens = [
|
||||||
|
makeToken({
|
||||||
|
surface: 'は',
|
||||||
|
headword: 'は',
|
||||||
|
partOfSpeech: PartOfSpeech.other,
|
||||||
|
pos1: '助詞',
|
||||||
|
startPos: 0,
|
||||||
|
endPos: 1,
|
||||||
|
frequencyRank: 8,
|
||||||
|
}),
|
||||||
|
];
|
||||||
|
|
||||||
|
const result = annotateTokens(tokens, makeDeps(), {
|
||||||
|
minSentenceWordsForNPlusOne: 1,
|
||||||
|
pos1Exclusions: new Set(),
|
||||||
|
});
|
||||||
|
|
||||||
|
assert.equal(result[0]?.frequencyRank, 8);
|
||||||
|
assert.equal(result[0]?.isNPlusOneTarget, true);
|
||||||
|
});
|
||||||
|
|
||||||
|
test('annotateTokens excludes default non-independent pos2 from frequency and N+1', () => {
|
||||||
|
const tokens = [
|
||||||
|
makeToken({
|
||||||
|
surface: 'になれば',
|
||||||
|
headword: 'なる',
|
||||||
|
partOfSpeech: PartOfSpeech.verb,
|
||||||
|
pos1: '動詞',
|
||||||
|
pos2: '非自立',
|
||||||
|
startPos: 0,
|
||||||
|
endPos: 4,
|
||||||
|
frequencyRank: 7,
|
||||||
|
}),
|
||||||
|
];
|
||||||
|
|
||||||
|
const result = annotateTokens(tokens, makeDeps(), {
|
||||||
|
minSentenceWordsForNPlusOne: 1,
|
||||||
|
});
|
||||||
|
|
||||||
|
assert.equal(result[0]?.frequencyRank, undefined);
|
||||||
|
assert.equal(result[0]?.isNPlusOneTarget, false);
|
||||||
|
});
|
||||||
|
|
||||||
|
test('annotateTokens excludes likely kana SFX tokens from frequency when POS tags are missing', () => {
|
||||||
|
const tokens = [
|
||||||
|
makeToken({
|
||||||
|
surface: 'ぐわっ',
|
||||||
|
reading: 'ぐわっ',
|
||||||
|
headword: 'ぐわっ',
|
||||||
|
pos1: '',
|
||||||
|
pos2: '',
|
||||||
|
frequencyRank: 12,
|
||||||
|
startPos: 0,
|
||||||
|
endPos: 3,
|
||||||
|
}),
|
||||||
|
];
|
||||||
|
|
||||||
|
const result = annotateTokens(tokens, makeDeps(), {
|
||||||
|
minSentenceWordsForNPlusOne: 1,
|
||||||
|
});
|
||||||
|
|
||||||
|
assert.equal(result[0]?.frequencyRank, undefined);
|
||||||
|
});
|
||||||
|
|
||||||
|
test('annotateTokens allows previously default-excluded pos2 when removed from effective set', () => {
|
||||||
|
const tokens = [
|
||||||
|
makeToken({
|
||||||
|
surface: 'になれば',
|
||||||
|
headword: 'なる',
|
||||||
|
partOfSpeech: PartOfSpeech.verb,
|
||||||
|
pos1: '動詞',
|
||||||
|
pos2: '非自立',
|
||||||
|
startPos: 0,
|
||||||
|
endPos: 4,
|
||||||
|
frequencyRank: 9,
|
||||||
|
}),
|
||||||
|
];
|
||||||
|
|
||||||
|
const result = annotateTokens(tokens, makeDeps(), {
|
||||||
|
minSentenceWordsForNPlusOne: 1,
|
||||||
|
pos2Exclusions: new Set(),
|
||||||
|
});
|
||||||
|
|
||||||
|
assert.equal(result[0]?.frequencyRank, 9);
|
||||||
|
assert.equal(result[0]?.isNPlusOneTarget, true);
|
||||||
|
});
|
||||||
|
|
||||||
|
test('annotateTokens keeps composite tokens when any component pos tag is content-bearing', () => {
|
||||||
|
const tokens = [
|
||||||
|
makeToken({
|
||||||
|
surface: 'になれば',
|
||||||
|
headword: 'なる',
|
||||||
|
pos1: '助詞|動詞',
|
||||||
|
pos2: '格助詞|自立|接続助詞',
|
||||||
|
startPos: 0,
|
||||||
|
endPos: 4,
|
||||||
|
frequencyRank: 5,
|
||||||
|
}),
|
||||||
|
];
|
||||||
|
|
||||||
|
const result = annotateTokens(tokens, makeDeps(), {
|
||||||
|
minSentenceWordsForNPlusOne: 1,
|
||||||
|
});
|
||||||
|
|
||||||
|
assert.equal(result[0]?.frequencyRank, 5);
|
||||||
|
assert.equal(result[0]?.isNPlusOneTarget, true);
|
||||||
|
});
|
||||||
|
|
||||||
|
test('annotateTokens excludes composite tokens when all component pos tags are excluded', () => {
|
||||||
|
const tokens = [
|
||||||
|
makeToken({
|
||||||
|
surface: 'けど',
|
||||||
|
headword: 'けど',
|
||||||
|
pos1: '助詞|助詞',
|
||||||
|
pos2: '接続助詞|終助詞',
|
||||||
|
startPos: 0,
|
||||||
|
endPos: 2,
|
||||||
|
frequencyRank: 6,
|
||||||
|
}),
|
||||||
|
];
|
||||||
|
|
||||||
|
const result = annotateTokens(tokens, makeDeps(), {
|
||||||
|
minSentenceWordsForNPlusOne: 1,
|
||||||
|
});
|
||||||
|
|
||||||
|
assert.equal(result[0]?.frequencyRank, undefined);
|
||||||
|
assert.equal(result[0]?.isNPlusOneTarget, false);
|
||||||
|
});
|
||||||
|
|||||||
@@ -1,4 +1,12 @@
|
|||||||
import { markNPlusOneTargets } from '../../../token-merger';
|
import { markNPlusOneTargets } from '../../../token-merger';
|
||||||
|
import {
|
||||||
|
DEFAULT_ANNOTATION_POS1_EXCLUSION_CONFIG,
|
||||||
|
resolveAnnotationPos1ExclusionSet,
|
||||||
|
} from '../../../token-pos1-exclusions';
|
||||||
|
import {
|
||||||
|
DEFAULT_ANNOTATION_POS2_EXCLUSION_CONFIG,
|
||||||
|
resolveAnnotationPos2ExclusionSet,
|
||||||
|
} from '../../../token-pos2-exclusions';
|
||||||
import {
|
import {
|
||||||
JlptLevel,
|
JlptLevel,
|
||||||
MergedToken,
|
MergedToken,
|
||||||
@@ -28,6 +36,8 @@ export interface AnnotationStageOptions {
|
|||||||
jlptEnabled?: boolean;
|
jlptEnabled?: boolean;
|
||||||
frequencyEnabled?: boolean;
|
frequencyEnabled?: boolean;
|
||||||
minSentenceWordsForNPlusOne?: number;
|
minSentenceWordsForNPlusOne?: number;
|
||||||
|
pos1Exclusions?: ReadonlySet<string>;
|
||||||
|
pos2Exclusions?: ReadonlySet<string>;
|
||||||
}
|
}
|
||||||
|
|
||||||
function resolveKnownWordText(
|
function resolveKnownWordText(
|
||||||
@@ -53,22 +63,85 @@ function applyKnownWordMarking(
|
|||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
function isFrequencyExcludedByPos(token: MergedToken): boolean {
|
function normalizePos1Tag(pos1: string | undefined): string {
|
||||||
if (
|
return typeof pos1 === 'string' ? pos1.trim() : '';
|
||||||
token.partOfSpeech === PartOfSpeech.particle ||
|
}
|
||||||
token.partOfSpeech === PartOfSpeech.bound_auxiliary
|
|
||||||
) {
|
function isExcludedByTagSet(
|
||||||
|
normalizedTag: string,
|
||||||
|
exclusions: ReadonlySet<string>,
|
||||||
|
): boolean {
|
||||||
|
if (!normalizedTag) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
const parts = normalizedTag
|
||||||
|
.split('|')
|
||||||
|
.map((part) => part.trim())
|
||||||
|
.filter((part) => part.length > 0);
|
||||||
|
if (parts.length === 0) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
return parts.every((part) => exclusions.has(part));
|
||||||
|
}
|
||||||
|
|
||||||
|
function resolvePos1Exclusions(options: AnnotationStageOptions): ReadonlySet<string> {
|
||||||
|
if (options.pos1Exclusions) {
|
||||||
|
return options.pos1Exclusions;
|
||||||
|
}
|
||||||
|
|
||||||
|
return resolveAnnotationPos1ExclusionSet(DEFAULT_ANNOTATION_POS1_EXCLUSION_CONFIG);
|
||||||
|
}
|
||||||
|
|
||||||
|
function resolvePos2Exclusions(options: AnnotationStageOptions): ReadonlySet<string> {
|
||||||
|
if (options.pos2Exclusions) {
|
||||||
|
return options.pos2Exclusions;
|
||||||
|
}
|
||||||
|
|
||||||
|
return resolveAnnotationPos2ExclusionSet(DEFAULT_ANNOTATION_POS2_EXCLUSION_CONFIG);
|
||||||
|
}
|
||||||
|
|
||||||
|
function normalizePos2Tag(pos2: string | undefined): string {
|
||||||
|
return typeof pos2 === 'string' ? pos2.trim() : '';
|
||||||
|
}
|
||||||
|
|
||||||
|
function isFrequencyExcludedByPos(
|
||||||
|
token: MergedToken,
|
||||||
|
pos1Exclusions: ReadonlySet<string>,
|
||||||
|
pos2Exclusions: ReadonlySet<string>,
|
||||||
|
): boolean {
|
||||||
|
const normalizedPos1 = normalizePos1Tag(token.pos1);
|
||||||
|
const hasPos1 = normalizedPos1.length > 0;
|
||||||
|
if (isExcludedByTagSet(normalizedPos1, pos1Exclusions)) {
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
return token.pos1 === '助詞' || token.pos1 === '助動詞';
|
const normalizedPos2 = normalizePos2Tag(token.pos2);
|
||||||
|
const hasPos2 = normalizedPos2.length > 0;
|
||||||
|
if (isExcludedByTagSet(normalizedPos2, pos2Exclusions)) {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (hasPos1 || hasPos2) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (isLikelyFrequencyNoiseToken(token)) {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
return (
|
||||||
|
token.partOfSpeech === PartOfSpeech.particle ||
|
||||||
|
token.partOfSpeech === PartOfSpeech.bound_auxiliary
|
||||||
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
function applyFrequencyMarking(
|
function applyFrequencyMarking(
|
||||||
tokens: MergedToken[],
|
tokens: MergedToken[],
|
||||||
|
pos1Exclusions: ReadonlySet<string>,
|
||||||
|
pos2Exclusions: ReadonlySet<string>,
|
||||||
): MergedToken[] {
|
): MergedToken[] {
|
||||||
return tokens.map((token) => {
|
return tokens.map((token) => {
|
||||||
if (isFrequencyExcludedByPos(token)) {
|
if (isFrequencyExcludedByPos(token, pos1Exclusions, pos2Exclusions)) {
|
||||||
return { ...token, frequencyRank: undefined };
|
return { ...token, frequencyRank: undefined };
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -203,6 +276,101 @@ function isRepeatedKanaSfx(text: string): boolean {
|
|||||||
return topCount >= Math.ceil(chars.length / 2);
|
return topCount >= Math.ceil(chars.length / 2);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
function isTrailingSmallTsuKanaSfx(text: string): boolean {
|
||||||
|
const normalized = normalizeJlptTextForExclusion(text);
|
||||||
|
if (!normalized) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
const chars = [...normalized];
|
||||||
|
if (chars.length < 2 || chars.length > 4) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (!chars.every(isKanaChar)) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
return chars[chars.length - 1] === 'っ';
|
||||||
|
}
|
||||||
|
|
||||||
|
function isReduplicatedKanaSfx(text: string): boolean {
|
||||||
|
const normalized = normalizeJlptTextForExclusion(text);
|
||||||
|
if (!normalized) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
const chars = [...normalized];
|
||||||
|
if (chars.length < 4 || chars.length % 2 !== 0) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (!chars.every(isKanaChar)) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
const half = chars.length / 2;
|
||||||
|
return chars.slice(0, half).join('') === chars.slice(half).join('');
|
||||||
|
}
|
||||||
|
|
||||||
|
function hasAdjacentKanaRepeat(text: string): boolean {
|
||||||
|
const normalized = normalizeJlptTextForExclusion(text);
|
||||||
|
if (!normalized) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
const chars = [...normalized];
|
||||||
|
if (!chars.every(isKanaChar)) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
for (let i = 1; i < chars.length; i += 1) {
|
||||||
|
if (chars[i] === chars[i - 1]) {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
function isLikelyFrequencyNoiseToken(token: MergedToken): boolean {
|
||||||
|
const candidates = [token.headword, token.surface].filter(
|
||||||
|
(candidate): candidate is string => typeof candidate === 'string' && candidate.length > 0,
|
||||||
|
);
|
||||||
|
|
||||||
|
for (const candidate of candidates) {
|
||||||
|
const trimmedCandidate = candidate.trim();
|
||||||
|
if (!trimmedCandidate) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
const normalizedCandidate = normalizeJlptTextForExclusion(trimmedCandidate);
|
||||||
|
if (!normalizedCandidate) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (
|
||||||
|
shouldIgnoreJlptByTerm(trimmedCandidate) ||
|
||||||
|
shouldIgnoreJlptByTerm(normalizedCandidate)
|
||||||
|
) {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (
|
||||||
|
hasAdjacentKanaRepeat(trimmedCandidate) ||
|
||||||
|
hasAdjacentKanaRepeat(normalizedCandidate) ||
|
||||||
|
isReduplicatedKanaSfx(trimmedCandidate) ||
|
||||||
|
isReduplicatedKanaSfx(normalizedCandidate) ||
|
||||||
|
isTrailingSmallTsuKanaSfx(trimmedCandidate) ||
|
||||||
|
isTrailingSmallTsuKanaSfx(normalizedCandidate)
|
||||||
|
) {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
function isJlptEligibleToken(token: MergedToken): boolean {
|
function isJlptEligibleToken(token: MergedToken): boolean {
|
||||||
if (token.pos1 && shouldIgnoreJlptForMecabPos1(token.pos1)) {
|
if (token.pos1 && shouldIgnoreJlptForMecabPos1(token.pos1)) {
|
||||||
return false;
|
return false;
|
||||||
@@ -261,6 +429,8 @@ export function annotateTokens(
|
|||||||
deps: AnnotationStageDeps,
|
deps: AnnotationStageDeps,
|
||||||
options: AnnotationStageOptions = {},
|
options: AnnotationStageOptions = {},
|
||||||
): MergedToken[] {
|
): MergedToken[] {
|
||||||
|
const pos1Exclusions = resolvePos1Exclusions(options);
|
||||||
|
const pos2Exclusions = resolvePos2Exclusions(options);
|
||||||
const nPlusOneEnabled = options.nPlusOneEnabled !== false;
|
const nPlusOneEnabled = options.nPlusOneEnabled !== false;
|
||||||
const knownMarkedTokens = nPlusOneEnabled
|
const knownMarkedTokens = nPlusOneEnabled
|
||||||
? applyKnownWordMarking(tokens, deps.isKnownWord, deps.knownWordMatchMode)
|
? applyKnownWordMarking(tokens, deps.isKnownWord, deps.knownWordMatchMode)
|
||||||
@@ -273,7 +443,7 @@ export function annotateTokens(
|
|||||||
const frequencyEnabled = options.frequencyEnabled !== false;
|
const frequencyEnabled = options.frequencyEnabled !== false;
|
||||||
const frequencyMarkedTokens =
|
const frequencyMarkedTokens =
|
||||||
frequencyEnabled
|
frequencyEnabled
|
||||||
? applyFrequencyMarking(knownMarkedTokens)
|
? applyFrequencyMarking(knownMarkedTokens, pos1Exclusions, pos2Exclusions)
|
||||||
: knownMarkedTokens.map((token) => ({
|
: knownMarkedTokens.map((token) => ({
|
||||||
...token,
|
...token,
|
||||||
frequencyRank: undefined,
|
frequencyRank: undefined,
|
||||||
@@ -303,5 +473,10 @@ export function annotateTokens(
|
|||||||
? minSentenceWordsForNPlusOne
|
? minSentenceWordsForNPlusOne
|
||||||
: 3;
|
: 3;
|
||||||
|
|
||||||
return markNPlusOneTargets(jlptMarkedTokens, sanitizedMinSentenceWordsForNPlusOne);
|
return markNPlusOneTargets(
|
||||||
|
jlptMarkedTokens,
|
||||||
|
sanitizedMinSentenceWordsForNPlusOne,
|
||||||
|
pos1Exclusions,
|
||||||
|
pos2Exclusions,
|
||||||
|
);
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -22,12 +22,13 @@ function makeToken(overrides: Partial<MergedToken>): MergedToken {
|
|||||||
test('enrichTokensWithMecabPos1 picks pos1 by best overlap when no surface match exists', () => {
|
test('enrichTokensWithMecabPos1 picks pos1 by best overlap when no surface match exists', () => {
|
||||||
const tokens = [makeToken({ surface: 'grouped', startPos: 2, endPos: 7 })];
|
const tokens = [makeToken({ surface: 'grouped', startPos: 2, endPos: 7 })];
|
||||||
const mecabTokens = [
|
const mecabTokens = [
|
||||||
makeToken({ surface: 'left', startPos: 0, endPos: 4, pos1: 'A' }),
|
makeToken({ surface: 'left', startPos: 0, endPos: 4, pos1: 'A', pos2: 'L2' }),
|
||||||
makeToken({ surface: 'right', startPos: 2, endPos: 6, pos1: 'B' }),
|
makeToken({ surface: 'right', startPos: 2, endPos: 6, pos1: 'B', pos2: '非自立' }),
|
||||||
];
|
];
|
||||||
|
|
||||||
const enriched = enrichTokensWithMecabPos1(tokens, mecabTokens);
|
const enriched = enrichTokensWithMecabPos1(tokens, mecabTokens);
|
||||||
assert.equal(enriched[0]?.pos1, 'B');
|
assert.equal(enriched[0]?.pos1, 'A|B');
|
||||||
|
assert.equal(enriched[0]?.pos2, 'L2|非自立');
|
||||||
});
|
});
|
||||||
|
|
||||||
test('enrichTokensWithMecabPos1 fills missing pos1 using surface-sequence fallback', () => {
|
test('enrichTokensWithMecabPos1 fills missing pos1 using surface-sequence fallback', () => {
|
||||||
|
|||||||
@@ -1,13 +1,45 @@
|
|||||||
import { MergedToken } from '../../../types';
|
import { MergedToken } from '../../../types';
|
||||||
|
|
||||||
function pickClosestMecabPos1(token: MergedToken, mecabTokens: MergedToken[]): string | undefined {
|
type MecabPosMetadata = {
|
||||||
if (mecabTokens.length === 0) {
|
pos1: string;
|
||||||
|
pos2?: string;
|
||||||
|
pos3?: string;
|
||||||
|
};
|
||||||
|
|
||||||
|
function joinUniqueTags(values: Array<string | undefined>): string | undefined {
|
||||||
|
const unique: string[] = [];
|
||||||
|
for (const value of values) {
|
||||||
|
if (!value) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
const trimmed = value.trim();
|
||||||
|
if (!trimmed) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
if (!unique.includes(trimmed)) {
|
||||||
|
unique.push(trimmed);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (unique.length === 0) {
|
||||||
return undefined;
|
return undefined;
|
||||||
}
|
}
|
||||||
|
if (unique.length === 1) {
|
||||||
|
return unique[0];
|
||||||
|
}
|
||||||
|
return unique.join('|');
|
||||||
|
}
|
||||||
|
|
||||||
|
function pickClosestMecabPosMetadata(
|
||||||
|
token: MergedToken,
|
||||||
|
mecabTokens: MergedToken[],
|
||||||
|
): MecabPosMetadata | null {
|
||||||
|
if (mecabTokens.length === 0) {
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
|
||||||
const tokenStart = token.startPos ?? 0;
|
const tokenStart = token.startPos ?? 0;
|
||||||
const tokenEnd = token.endPos ?? tokenStart + token.surface.length;
|
const tokenEnd = token.endPos ?? tokenStart + token.surface.length;
|
||||||
let bestSurfaceMatchPos1: string | undefined;
|
let bestSurfaceMatchToken: MergedToken | null = null;
|
||||||
let bestSurfaceMatchDistance = Number.MAX_SAFE_INTEGER;
|
let bestSurfaceMatchDistance = Number.MAX_SAFE_INTEGER;
|
||||||
let bestSurfaceMatchEndDistance = Number.MAX_SAFE_INTEGER;
|
let bestSurfaceMatchEndDistance = Number.MAX_SAFE_INTEGER;
|
||||||
|
|
||||||
@@ -31,19 +63,24 @@ function pickClosestMecabPos1(token: MergedToken, mecabTokens: MergedToken[]): s
|
|||||||
) {
|
) {
|
||||||
bestSurfaceMatchDistance = startDistance;
|
bestSurfaceMatchDistance = startDistance;
|
||||||
bestSurfaceMatchEndDistance = endDistance;
|
bestSurfaceMatchEndDistance = endDistance;
|
||||||
bestSurfaceMatchPos1 = mecabToken.pos1;
|
bestSurfaceMatchToken = mecabToken;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if (bestSurfaceMatchPos1) {
|
if (bestSurfaceMatchToken) {
|
||||||
return bestSurfaceMatchPos1;
|
return {
|
||||||
|
pos1: bestSurfaceMatchToken.pos1 as string,
|
||||||
|
pos2: bestSurfaceMatchToken.pos2,
|
||||||
|
pos3: bestSurfaceMatchToken.pos3,
|
||||||
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
let bestPos1: string | undefined;
|
let bestToken: MergedToken | null = null;
|
||||||
let bestOverlap = 0;
|
let bestOverlap = 0;
|
||||||
let bestSpan = 0;
|
let bestSpan = 0;
|
||||||
let bestStartDistance = Number.MAX_SAFE_INTEGER;
|
let bestStartDistance = Number.MAX_SAFE_INTEGER;
|
||||||
let bestStart = Number.MAX_SAFE_INTEGER;
|
let bestStart = Number.MAX_SAFE_INTEGER;
|
||||||
|
const overlappingTokens: MergedToken[] = [];
|
||||||
|
|
||||||
for (const mecabToken of mecabTokens) {
|
for (const mecabToken of mecabTokens) {
|
||||||
if (!mecabToken.pos1) {
|
if (!mecabToken.pos1) {
|
||||||
@@ -58,6 +95,7 @@ function pickClosestMecabPos1(token: MergedToken, mecabTokens: MergedToken[]): s
|
|||||||
if (overlap === 0) {
|
if (overlap === 0) {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
overlappingTokens.push(mecabToken);
|
||||||
|
|
||||||
const span = mecabEnd - mecabStart;
|
const span = mecabEnd - mecabStart;
|
||||||
if (
|
if (
|
||||||
@@ -71,11 +109,23 @@ function pickClosestMecabPos1(token: MergedToken, mecabTokens: MergedToken[]): s
|
|||||||
bestSpan = span;
|
bestSpan = span;
|
||||||
bestStartDistance = Math.abs(mecabStart - tokenStart);
|
bestStartDistance = Math.abs(mecabStart - tokenStart);
|
||||||
bestStart = mecabStart;
|
bestStart = mecabStart;
|
||||||
bestPos1 = mecabToken.pos1;
|
bestToken = mecabToken;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
return bestOverlap > 0 ? bestPos1 : undefined;
|
if (bestOverlap === 0 || !bestToken) {
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
|
||||||
|
const overlapPos1 = joinUniqueTags(overlappingTokens.map((token) => token.pos1));
|
||||||
|
const overlapPos2 = joinUniqueTags(overlappingTokens.map((token) => token.pos2));
|
||||||
|
const overlapPos3 = joinUniqueTags(overlappingTokens.map((token) => token.pos3));
|
||||||
|
|
||||||
|
return {
|
||||||
|
pos1: overlapPos1 ?? (bestToken.pos1 as string),
|
||||||
|
pos2: overlapPos2 ?? bestToken.pos2,
|
||||||
|
pos3: overlapPos3 ?? bestToken.pos3,
|
||||||
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
function fillMissingPos1BySurfaceSequence(
|
function fillMissingPos1BySurfaceSequence(
|
||||||
@@ -101,7 +151,7 @@ function fillMissingPos1BySurfaceSequence(
|
|||||||
return token;
|
return token;
|
||||||
}
|
}
|
||||||
|
|
||||||
let best: { pos1: string; index: number } | null = null;
|
let best: { token: MergedToken; index: number } | null = null;
|
||||||
for (const candidate of indexedMecabTokens) {
|
for (const candidate of indexedMecabTokens) {
|
||||||
if (candidate.token.surface !== surface) {
|
if (candidate.token.surface !== surface) {
|
||||||
continue;
|
continue;
|
||||||
@@ -109,7 +159,7 @@ function fillMissingPos1BySurfaceSequence(
|
|||||||
if (candidate.index < cursor) {
|
if (candidate.index < cursor) {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
best = { pos1: candidate.token.pos1 as string, index: candidate.index };
|
best = { token: candidate.token, index: candidate.index };
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -118,7 +168,7 @@ function fillMissingPos1BySurfaceSequence(
|
|||||||
if (candidate.token.surface !== surface) {
|
if (candidate.token.surface !== surface) {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
best = { pos1: candidate.token.pos1 as string, index: candidate.index };
|
best = { token: candidate.token, index: candidate.index };
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -130,7 +180,9 @@ function fillMissingPos1BySurfaceSequence(
|
|||||||
cursor = best.index + 1;
|
cursor = best.index + 1;
|
||||||
return {
|
return {
|
||||||
...token,
|
...token,
|
||||||
pos1: best.pos1,
|
pos1: best.token.pos1,
|
||||||
|
pos2: best.token.pos2,
|
||||||
|
pos3: best.token.pos3,
|
||||||
};
|
};
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
@@ -152,14 +204,16 @@ export function enrichTokensWithMecabPos1(
|
|||||||
return token;
|
return token;
|
||||||
}
|
}
|
||||||
|
|
||||||
const pos1 = pickClosestMecabPos1(token, mecabTokens);
|
const metadata = pickClosestMecabPosMetadata(token, mecabTokens);
|
||||||
if (!pos1) {
|
if (!metadata) {
|
||||||
return token;
|
return token;
|
||||||
}
|
}
|
||||||
|
|
||||||
return {
|
return {
|
||||||
...token,
|
...token,
|
||||||
pos1,
|
pos1: metadata.pos1,
|
||||||
|
pos2: metadata.pos2,
|
||||||
|
pos3: metadata.pos3,
|
||||||
};
|
};
|
||||||
});
|
});
|
||||||
|
|
||||||
|
|||||||
@@ -17,6 +17,8 @@
|
|||||||
*/
|
*/
|
||||||
|
|
||||||
import { PartOfSpeech, Token, MergedToken } from './types';
|
import { PartOfSpeech, Token, MergedToken } from './types';
|
||||||
|
import { DEFAULT_ANNOTATION_POS1_EXCLUSION_CONFIG } from './token-pos1-exclusions';
|
||||||
|
import { DEFAULT_ANNOTATION_POS2_EXCLUSION_CONFIG } from './token-pos2-exclusions';
|
||||||
|
|
||||||
export function isNoun(tok: Token): boolean {
|
export function isNoun(tok: Token): boolean {
|
||||||
return tok.partOfSpeech === PartOfSpeech.noun;
|
return tok.partOfSpeech === PartOfSpeech.noun;
|
||||||
@@ -241,25 +243,71 @@ export function mergeTokens(
|
|||||||
}
|
}
|
||||||
|
|
||||||
const SENTENCE_BOUNDARY_SURFACES = new Set(['。', '?', '!', '?', '!', '…', '\u2026']);
|
const SENTENCE_BOUNDARY_SURFACES = new Set(['。', '?', '!', '?', '!', '…', '\u2026']);
|
||||||
const N_PLUS_ONE_IGNORED_POS1 = new Set(['助詞', '助動詞', '記号', '補助記号']);
|
const N_PLUS_ONE_IGNORED_POS1 = new Set(
|
||||||
|
DEFAULT_ANNOTATION_POS1_EXCLUSION_CONFIG.defaults,
|
||||||
|
);
|
||||||
|
const N_PLUS_ONE_IGNORED_POS2 = new Set(
|
||||||
|
DEFAULT_ANNOTATION_POS2_EXCLUSION_CONFIG.defaults,
|
||||||
|
);
|
||||||
|
|
||||||
export function isNPlusOneCandidateToken(token: MergedToken): boolean {
|
/**
 * Normalizes a token's pos1 tag before it is matched against an exclusion set.
 *
 * @param pos1 - Raw pos1 value from a token; may be undefined.
 * @returns The tag with surrounding whitespace removed, or `''` when the value
 *   is not a string.
 */
function normalizePos1Tag(pos1: string | undefined): string {
  return typeof pos1 === 'string' ? pos1.trim() : '';
}
|
||||||
|
|
||||||
|
/**
 * Normalizes a token's pos2 tag before it is matched against an exclusion set.
 *
 * @param pos2 - Raw pos2 value from a token; may be undefined.
 * @returns The tag with surrounding whitespace removed, or `''` when the value
 *   is not a string.
 */
function normalizePos2Tag(pos2: string | undefined): string {
  return typeof pos2 === 'string' ? pos2.trim() : '';
}
|
||||||
|
|
||||||
|
/**
 * Decides whether a (possibly compound) part-of-speech tag is fully covered by
 * an exclusion set.
 *
 * The tag may hold several `|`-separated parts. Each part is trimmed and empty
 * parts are dropped. The tag counts as excluded only when at least one part
 * remains and EVERY remaining part is in `exclusions`; a single non-excluded
 * part keeps the tag eligible.
 *
 * @param normalizedTag - Tag string, already trimmed by the caller; may be `''`.
 * @param exclusions - Set of tag parts that should be excluded.
 * @returns `true` when all parts are excluded; `false` for an empty tag or
 *   when any part is not in the set.
 */
function isExcludedByTagSet(
  normalizedTag: string,
  exclusions: ReadonlySet<string>,
): boolean {
  const parts = normalizedTag
    .split('|')
    .map((part) => part.trim())
    .filter((part) => part.length > 0);

  // An empty tag yields no parts; without this guard `every` would be
  // vacuously true and untagged tokens would be treated as excluded.
  if (parts.length === 0) {
    return false;
  }

  return parts.every((part) => exclusions.has(part));
}
|
||||||
|
|
||||||
|
export function isNPlusOneCandidateToken(
|
||||||
|
token: MergedToken,
|
||||||
|
pos1Exclusions: ReadonlySet<string> = N_PLUS_ONE_IGNORED_POS1,
|
||||||
|
pos2Exclusions: ReadonlySet<string> = N_PLUS_ONE_IGNORED_POS2,
|
||||||
|
): boolean {
|
||||||
if (token.isKnown) {
|
if (token.isKnown) {
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
return isNPlusOneWordCountToken(token);
|
return isNPlusOneWordCountToken(token, pos1Exclusions, pos2Exclusions);
|
||||||
}
|
}
|
||||||
|
|
||||||
function isNPlusOneWordCountToken(token: MergedToken): boolean {
|
function isNPlusOneWordCountToken(
|
||||||
if (token.partOfSpeech === PartOfSpeech.particle) {
|
token: MergedToken,
|
||||||
|
pos1Exclusions: ReadonlySet<string> = N_PLUS_ONE_IGNORED_POS1,
|
||||||
|
pos2Exclusions: ReadonlySet<string> = N_PLUS_ONE_IGNORED_POS2,
|
||||||
|
): boolean {
|
||||||
|
const normalizedPos1 = normalizePos1Tag(token.pos1);
|
||||||
|
const hasPos1 = normalizedPos1.length > 0;
|
||||||
|
if (isExcludedByTagSet(normalizedPos1, pos1Exclusions)) {
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (token.partOfSpeech === PartOfSpeech.bound_auxiliary) {
|
const normalizedPos2 = normalizePos2Tag(token.pos2);
|
||||||
|
const hasPos2 = normalizedPos2.length > 0;
|
||||||
|
if (isExcludedByTagSet(normalizedPos2, pos2Exclusions)) {
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (token.partOfSpeech === PartOfSpeech.symbol) {
|
if (!hasPos1 && !hasPos2 && (
|
||||||
|
token.partOfSpeech === PartOfSpeech.particle ||
|
||||||
|
token.partOfSpeech === PartOfSpeech.bound_auxiliary ||
|
||||||
|
token.partOfSpeech === PartOfSpeech.symbol
|
||||||
|
)) {
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -271,10 +319,6 @@ function isNPlusOneWordCountToken(token: MergedToken): boolean {
|
|||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (token.pos1 && N_PLUS_ONE_IGNORED_POS1.has(token.pos1)) {
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (token.surface.trim().length === 0) {
|
if (token.surface.trim().length === 0) {
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
@@ -290,7 +334,12 @@ function isSentenceBoundaryToken(token: MergedToken): boolean {
|
|||||||
return SENTENCE_BOUNDARY_SURFACES.has(token.surface);
|
return SENTENCE_BOUNDARY_SURFACES.has(token.surface);
|
||||||
}
|
}
|
||||||
|
|
||||||
export function markNPlusOneTargets(tokens: MergedToken[], minSentenceWords = 3): MergedToken[] {
|
export function markNPlusOneTargets(
|
||||||
|
tokens: MergedToken[],
|
||||||
|
minSentenceWords = 3,
|
||||||
|
pos1Exclusions: ReadonlySet<string> = N_PLUS_ONE_IGNORED_POS1,
|
||||||
|
pos2Exclusions: ReadonlySet<string> = N_PLUS_ONE_IGNORED_POS2,
|
||||||
|
): MergedToken[] {
|
||||||
if (tokens.length === 0) {
|
if (tokens.length === 0) {
|
||||||
return [];
|
return [];
|
||||||
}
|
}
|
||||||
@@ -311,11 +360,11 @@ export function markNPlusOneTargets(tokens: MergedToken[], minSentenceWords = 3)
|
|||||||
for (let i = start; i < endExclusive; i++) {
|
for (let i = start; i < endExclusive; i++) {
|
||||||
const token = markedTokens[i];
|
const token = markedTokens[i];
|
||||||
if (!token) continue;
|
if (!token) continue;
|
||||||
if (isNPlusOneWordCountToken(token)) {
|
if (isNPlusOneWordCountToken(token, pos1Exclusions, pos2Exclusions)) {
|
||||||
sentenceWordCount += 1;
|
sentenceWordCount += 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (isNPlusOneCandidateToken(token)) {
|
if (isNPlusOneCandidateToken(token, pos1Exclusions, pos2Exclusions)) {
|
||||||
sentenceCandidates.push(i);
|
sentenceCandidates.push(i);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
53
src/token-pos1-exclusions.ts
Normal file
53
src/token-pos1-exclusions.ts
Normal file
@@ -0,0 +1,53 @@
|
|||||||
|
import type { ResolvedTokenPos1ExclusionConfig, TokenPos1ExclusionConfig } from './types';
|
||||||
|
|
||||||
|
/**
 * Built-in pos1 tags that are excluded by default.
 *
 * The array is frozen so the shared default list cannot be mutated at runtime;
 * copy it (e.g. with spread) before modifying.
 */
export const DEFAULT_ANNOTATION_POS1_EXCLUSION_DEFAULTS: readonly string[] = Object.freeze([
  '助詞', // particle
  '助動詞', // auxiliary verb
  '記号', // symbol
  '補助記号', // supplementary symbol
  '連体詞', // adnominal
  '感動詞', // interjection
  '接続詞', // conjunction
  '接頭詞', // prefix
]);
|
||||||
|
|
||||||
|
/**
 * Default pos1 exclusion config: the built-in default tags with no additions
 * and no removals.
 *
 * `defaults` is a fresh mutable copy of the frozen default list, so this
 * object satisfies the `string[]` field type without exposing the frozen array.
 */
export const DEFAULT_ANNOTATION_POS1_EXCLUSION_CONFIG: ResolvedTokenPos1ExclusionConfig = {
  defaults: [...DEFAULT_ANNOTATION_POS1_EXCLUSION_DEFAULTS],
  add: [],
  remove: [],
};
|
||||||
|
|
||||||
|
/** Trims surrounding whitespace from a single part-of-speech tag. */
function normalizePosTag(value: string): string {
  return value.trim();
}

/**
 * Normalizes a list of part-of-speech tags: trims each entry, drops entries
 * that are empty after trimming, and removes duplicates while keeping
 * first-seen order.
 *
 * Nothing in the normalization is specific to pos1 despite the name; it only
 * trims and de-duplicates strings.
 *
 * @param values - Raw tag list.
 * @returns A new array of unique, trimmed, non-empty tags.
 */
export function normalizePos1ExclusionList(values: readonly string[]): string[] {
  const deduped = new Set<string>();
  for (const value of values) {
    // Skip non-string entries instead of throwing on `.trim()`.
    // NOTE(review): presumably these lists can originate from untyped user
    // config — confirm against callers.
    if (typeof value !== 'string') {
      continue;
    }
    const normalized = normalizePosTag(value);
    if (normalized) {
      deduped.add(normalized);
    }
  }
  return [...deduped];
}
|
||||||
|
|
||||||
|
/**
 * Resolves a pos1 exclusion config into the effective set of excluded tags.
 *
 * The result is `defaults ∪ add`, minus every tag listed in `remove`. Each list
 * is normalized (trimmed, empties dropped, de-duplicated) first, and a missing
 * list is treated as empty. Removal is applied last, so a tag present in both
 * `add` and `remove` ends up removed.
 *
 * @param config - Partial or fully-resolved exclusion config.
 * @returns A new set containing the effective excluded pos1 tags.
 */
export function resolveAnnotationPos1ExclusionSet(
  config: TokenPos1ExclusionConfig | ResolvedTokenPos1ExclusionConfig,
): ReadonlySet<string> {
  const resolved = new Set<string>([
    ...normalizePos1ExclusionList(config.defaults ?? []),
    ...normalizePos1ExclusionList(config.add ?? []),
  ]);
  for (const tag of normalizePos1ExclusionList(config.remove ?? [])) {
    resolved.delete(tag);
  }
  return resolved;
}
|
||||||
29
src/token-pos2-exclusions.ts
Normal file
29
src/token-pos2-exclusions.ts
Normal file
@@ -0,0 +1,29 @@
|
|||||||
|
import type { ResolvedTokenPos2ExclusionConfig, TokenPos2ExclusionConfig } from './types';
|
||||||
|
import { normalizePos1ExclusionList } from './token-pos1-exclusions';
|
||||||
|
|
||||||
|
/**
 * Built-in pos2 tags that are excluded by default: currently only
 * '非自立' (non-independent). Frozen so the shared list cannot be mutated.
 */
export const DEFAULT_ANNOTATION_POS2_EXCLUSION_DEFAULTS: readonly string[] = Object.freeze([
  '非自立',
]);
|
||||||
|
|
||||||
|
/**
 * Default pos2 exclusion config: the built-in default tags with no additions
 * and no removals.
 *
 * `defaults` is a fresh mutable copy of the frozen default list.
 */
export const DEFAULT_ANNOTATION_POS2_EXCLUSION_CONFIG: ResolvedTokenPos2ExclusionConfig = {
  defaults: [...DEFAULT_ANNOTATION_POS2_EXCLUSION_DEFAULTS],
  add: [],
  remove: [],
};
|
||||||
|
|
||||||
|
/**
 * Resolves a pos2 exclusion config into the effective set of excluded tags.
 *
 * The result is `defaults ∪ add`, minus every tag listed in `remove`. Each list
 * is normalized first and a missing list is treated as empty. Removal is
 * applied last, so a tag present in both `add` and `remove` ends up removed.
 *
 * `normalizePos1ExclusionList` is reused here on purpose: it only trims and
 * de-duplicates strings, so it applies equally to pos2 tags.
 *
 * @param config - Partial or fully-resolved exclusion config.
 * @returns A new set containing the effective excluded pos2 tags.
 */
export function resolveAnnotationPos2ExclusionSet(
  config: TokenPos2ExclusionConfig | ResolvedTokenPos2ExclusionConfig,
): ReadonlySet<string> {
  const resolved = new Set<string>([
    ...normalizePos1ExclusionList(config.defaults ?? []),
    ...normalizePos1ExclusionList(config.add ?? []),
  ]);
  for (const tag of normalizePos1ExclusionList(config.remove ?? [])) {
    resolved.delete(tag);
  }
  return resolved;
}
|
||||||
24
src/types.ts
24
src/types.ts
@@ -334,6 +334,30 @@ export interface SubtitleStyleConfig {
|
|||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Partial pos1 exclusion config; every list is optional and a missing list is
 * treated as empty when the config is resolved.
 */
export interface TokenPos1ExclusionConfig {
  /** Base list of excluded pos1 tags. */
  defaults?: string[];
  /** Extra pos1 tags to exclude on top of `defaults`. */
  add?: string[];
  /** Pos1 tags to drop from the combined `defaults` + `add` list. */
  remove?: string[];
}
|
||||||
|
|
||||||
|
/** Pos1 exclusion config in which every list is present. */
export interface ResolvedTokenPos1ExclusionConfig {
  /** Base list of excluded pos1 tags. */
  defaults: string[];
  /** Extra pos1 tags to exclude on top of `defaults`. */
  add: string[];
  /** Pos1 tags to drop from the combined `defaults` + `add` list. */
  remove: string[];
}
|
||||||
|
|
||||||
|
/**
 * Partial pos2 exclusion config; every list is optional and a missing list is
 * treated as empty when the config is resolved.
 */
export interface TokenPos2ExclusionConfig {
  /** Base list of excluded pos2 tags. */
  defaults?: string[];
  /** Extra pos2 tags to exclude on top of `defaults`. */
  add?: string[];
  /** Pos2 tags to drop from the combined `defaults` + `add` list. */
  remove?: string[];
}
|
||||||
|
|
||||||
|
/** Pos2 exclusion config in which every list is present. */
export interface ResolvedTokenPos2ExclusionConfig {
  /** Base list of excluded pos2 tags. */
  defaults: string[];
  /** Extra pos2 tags to exclude on top of `defaults`. */
  add: string[];
  /** Pos2 tags to drop from the combined `defaults` + `add` list. */
  remove: string[];
}
|
||||||
|
|
||||||
export type FrequencyDictionaryMode = 'single' | 'banded';
|
export type FrequencyDictionaryMode = 'single' | 'banded';
|
||||||
|
|
||||||
export interface ShortcutsConfig {
|
export interface ShortcutsConfig {
|
||||||
|
|||||||
Reference in New Issue
Block a user