mirror of
https://github.com/ksyasuda/SubMiner.git
synced 2026-03-01 18:22:41 -08:00
fix(tokenizer): tighten n+1 eligibility using mecab pos overlaps
This commit is contained in:
53
src/token-pos1-exclusions.ts
Normal file
53
src/token-pos1-exclusions.ts
Normal file
@@ -0,0 +1,53 @@
|
||||
import type { ResolvedTokenPos1ExclusionConfig, TokenPos1ExclusionConfig } from './types';
|
||||
|
||||
/**
 * Built-in list of pos1 (top-level part-of-speech) tags that make up the
 * default annotation exclusion list.
 *
 * The array is frozen, so consumers that need a mutable list must copy it.
 */
export const DEFAULT_ANNOTATION_POS1_EXCLUSION_DEFAULTS: readonly string[] = Object.freeze([
  '助詞', // particle
  '助動詞', // auxiliary verb
  '記号', // symbol
  '補助記号', // supplementary symbol
  '連体詞', // adnominal
  '感動詞', // interjection
  '接続詞', // conjunction
  '接頭詞', // prefix
]);
|
||||
|
||||
/**
 * Default exclusion config: a mutable copy of the built-in default tags with
 * no user additions and no user removals.
 */
export const DEFAULT_ANNOTATION_POS1_EXCLUSION_CONFIG: ResolvedTokenPos1ExclusionConfig = {
  // Copy so the frozen source list is never shared with config consumers.
  defaults: Array.from(DEFAULT_ANNOTATION_POS1_EXCLUSION_DEFAULTS),
  add: [],
  remove: [],
};
|
||||
|
||||
/**
 * Normalizes a single part-of-speech tag by stripping leading and trailing
 * whitespace.
 *
 * @param value - Raw tag text.
 * @returns The trimmed tag; an empty string when `value` is only whitespace.
 */
function normalizePosTag(value: string): string {
  const trimmed = value.trim();
  return trimmed;
}
|
||||
|
||||
/**
 * Normalizes a list of pos1 tags: each entry is passed through
 * `normalizePosTag`, entries that end up empty are dropped, and duplicates
 * are collapsed.
 *
 * @param values - Raw tag list; it is not mutated.
 * @returns A new array of unique, non-empty tags in first-occurrence order.
 */
export function normalizePos1ExclusionList(values: readonly string[]): string[] {
  const nonEmptyTags = values
    .map((raw) => normalizePosTag(raw))
    .filter((tag) => tag.length > 0);

  // A Set iterates in insertion order, so the first occurrence of each tag wins.
  return Array.from(new Set<string>(nonEmptyTags));
}
|
||||
|
||||
/**
 * Resolves an exclusion config into the effective set of excluded pos1 tags.
 *
 * The result is the normalized `defaults` plus the normalized `add` entries,
 * minus every normalized `remove` entry. Removal is applied last, so a tag
 * listed in `remove` is excluded from the result even if it also appears in
 * `defaults` or `add`. Any list that is null or undefined is treated as empty.
 *
 * @param config - Partial or fully resolved exclusion config.
 * @returns A newly created set of the tags that remain excluded.
 */
export function resolveAnnotationPos1ExclusionSet(
  config: TokenPos1ExclusionConfig | ResolvedTokenPos1ExclusionConfig,
): ReadonlySet<string> {
  const excluded = new Set<string>([
    ...normalizePos1ExclusionList(config.defaults ?? []),
    ...normalizePos1ExclusionList(config.add ?? []),
  ]);

  for (const tag of normalizePos1ExclusionList(config.remove ?? [])) {
    excluded.delete(tag);
  }

  return excluded;
}
|
||||
Reference in New Issue
Block a user