From a7d220e1820384addd06be08ce4243397402bbb2 Mon Sep 17 00:00:00 2001 From: sudacode Date: Sat, 28 Feb 2026 19:07:43 -0800 Subject: [PATCH] fix(tokenizer): tighten n+1 eligibility using mecab pos overlaps --- src/core/services/tokenizer.test.ts | 122 +++++++++++ src/core/services/tokenizer.ts | 20 +- .../tokenizer/annotation-stage.test.ts | 168 +++++++++++++++ .../services/tokenizer/annotation-stage.ts | 193 +++++++++++++++++- .../tokenizer/parser-enrichment-stage.test.ts | 7 +- .../tokenizer/parser-enrichment-stage.ts | 86 ++++++-- src/token-merger.ts | 77 +++++-- src/token-pos1-exclusions.ts | 53 +++++ src/token-pos2-exclusions.ts | 29 +++ src/types.ts | 24 +++ 10 files changed, 736 insertions(+), 43 deletions(-) create mode 100644 src/token-pos1-exclusions.ts create mode 100644 src/token-pos2-exclusions.ts diff --git a/src/core/services/tokenizer.test.ts b/src/core/services/tokenizer.test.ts index c3a0989..1561bf1 100644 --- a/src/core/services/tokenizer.test.ts +++ b/src/core/services/tokenizer.test.ts @@ -2038,3 +2038,125 @@ test('tokenizeSubtitle keeps frequency enrichment while n+1 is disabled', async assert.equal(mecabCalls, 1); assert.equal(frequencyCalls, 1); }); + + +test('tokenizeSubtitle excludes default non-independent pos2 from N+1 and frequency annotations', async () => { + const result = await tokenizeSubtitle( + 'になれば', + makeDepsFromYomitanTokens([{ surface: 'になれば', reading: 'になれば', headword: 'なる' }], { + getFrequencyDictionaryEnabled: () => true, + getFrequencyRank: (text) => (text === 'なる' ? 11 : null), + tokenizeWithMecab: async () => [ + { + headword: 'なる', + surface: 'になれば', + reading: 'ニナレバ', + startPos: 0, + endPos: 4, + partOfSpeech: PartOfSpeech.verb, + pos1: '動詞', + pos2: '非自立', + isMerged: true, + isKnown: false, + isNPlusOneTarget: false, + }, + ], + getMinSentenceWordsForNPlusOne: () => 1, + }), + ); + + assert.equal(result.tokens?.length, 1); + assert.equal(result.tokens?.[0]?.frequencyRank, undefined); + assert.equal(result.tokens?.[0]?.isNPlusOneTarget, false); +}); + +test('tokenizeSubtitle keeps merged token when overlap contains at least one content pos1 tag', async () => { + const result = await tokenizeSubtitle( + 'になれば', + makeDepsFromYomitanTokens([{ surface: 'になれば', reading: 'になれば', headword: 'なる' }], { + getFrequencyDictionaryEnabled: () => true, + getFrequencyRank: (text) => (text === 'なる' ? 13 : null), + tokenizeWithMecab: async () => [ + { + headword: 'に', + surface: 'に', + reading: 'ニ', + startPos: 0, + endPos: 1, + partOfSpeech: PartOfSpeech.particle, + pos1: '助詞', + pos2: '格助詞', + isMerged: false, + isKnown: false, + isNPlusOneTarget: false, + }, + { + headword: 'なる', + surface: 'なれ', + reading: 'ナレ', + startPos: 1, + endPos: 3, + partOfSpeech: PartOfSpeech.verb, + pos1: '動詞', + pos2: '自立', + isMerged: false, + isKnown: false, + isNPlusOneTarget: false, + }, + { + headword: 'ば', + surface: 'ば', + reading: 'バ', + startPos: 3, + endPos: 4, + partOfSpeech: PartOfSpeech.particle, + pos1: '助詞', + pos2: '接続助詞', + isMerged: false, + isKnown: false, + isNPlusOneTarget: false, + }, + ], + getMinSentenceWordsForNPlusOne: () => 1, + }), + ); + + assert.equal(result.tokens?.length, 1); + assert.equal(result.tokens?.[0]?.pos1, '助詞|動詞'); + assert.equal(result.tokens?.[0]?.frequencyRank, 13); + assert.equal(result.tokens?.[0]?.isNPlusOneTarget, true); +}); + +test('tokenizeSubtitle excludes default non-independent pos2 from N+1 when JLPT/frequency are disabled', async () => { + let mecabCalls = 0; + const result = await tokenizeSubtitle( + 'になれば', + makeDepsFromYomitanTokens([{ surface: 'になれば', reading: 'になれば', headword: 'なる' }], { + getJlptEnabled: () => false, + getFrequencyDictionaryEnabled: () => false, + getMinSentenceWordsForNPlusOne: () => 1, + tokenizeWithMecab: async () => { + mecabCalls += 1; + return [ + { + headword: 'なる', + surface: 'になれば', + reading: 'ニナレバ', + startPos: 0, + endPos: 4, + partOfSpeech: PartOfSpeech.verb, + pos1: '動詞', + pos2: '非自立', + isMerged: true, + isKnown: false, + isNPlusOneTarget: false, + }, + ]; + }, + }), + ); + + assert.equal(mecabCalls, 1); + assert.equal(result.tokens?.length, 1); + assert.equal(result.tokens?.[0]?.isNPlusOneTarget, false); +}); diff --git a/src/core/services/tokenizer.ts b/src/core/services/tokenizer.ts index 764cbaa..ee218b1 100644 --- a/src/core/services/tokenizer.ts +++ b/src/core/services/tokenizer.ts @@ -10,6 +10,14 @@ import { FrequencyDictionaryLookup, JlptLevel, } from '../../types'; +import { + DEFAULT_ANNOTATION_POS1_EXCLUSION_CONFIG, + resolveAnnotationPos1ExclusionSet, +} from '../../token-pos1-exclusions'; +import { + DEFAULT_ANNOTATION_POS2_EXCLUSION_CONFIG, + resolveAnnotationPos2ExclusionSet, +} from '../../token-pos2-exclusions'; import { selectYomitanParseTokens } from './tokenizer/parser-selection-stage'; import { requestYomitanParseResults, @@ -78,6 +86,8 @@ interface TokenizerAnnotationOptions { frequencyEnabled: boolean; frequencyMatchMode: FrequencyDictionaryMatchMode; minSentenceWordsForNPlusOne: number | undefined; + pos1Exclusions: ReadonlySet; + pos2Exclusions: ReadonlySet; } let parserEnrichmentWorkerRuntimeModulePromise: @@ -87,6 +97,12 @@ let annotationStageModulePromise: Promise | null = null; +const DEFAULT_ANNOTATION_POS1_EXCLUSIONS = resolveAnnotationPos1ExclusionSet( + DEFAULT_ANNOTATION_POS1_EXCLUSION_CONFIG, +); +const DEFAULT_ANNOTATION_POS2_EXCLUSIONS = resolveAnnotationPos2ExclusionSet( + DEFAULT_ANNOTATION_POS2_EXCLUSION_CONFIG, +); function getKnownWordLookup(deps: TokenizerServiceDeps, options: TokenizerAnnotationOptions): (text: string) => boolean { if (!options.nPlusOneEnabled) { @@ -96,7 +112,7 @@ function getKnownWordLookup(deps: TokenizerServiceDeps, options: TokenizerAnnota } function needsMecabPosEnrichment(options: TokenizerAnnotationOptions): boolean { - return options.jlptEnabled || options.frequencyEnabled; + return options.nPlusOneEnabled || options.jlptEnabled || options.frequencyEnabled; } function hasAnyAnnotationEnabled(options: TokenizerAnnotationOptions): boolean { @@ -389,6 +405,8 @@ function getAnnotationOptions(deps: TokenizerServiceDeps): TokenizerAnnotationOp frequencyEnabled: deps.getFrequencyDictionaryEnabled?.() !== false, frequencyMatchMode: deps.getFrequencyDictionaryMatchMode?.() ?? 'headword', minSentenceWordsForNPlusOne: deps.getMinSentenceWordsForNPlusOne?.(), + pos1Exclusions: DEFAULT_ANNOTATION_POS1_EXCLUSIONS, + pos2Exclusions: DEFAULT_ANNOTATION_POS2_EXCLUSIONS, }; } diff --git a/src/core/services/tokenizer/annotation-stage.test.ts b/src/core/services/tokenizer/annotation-stage.test.ts index fd4541b..50d2cbd 100644 --- a/src/core/services/tokenizer/annotation-stage.test.ts +++ b/src/core/services/tokenizer/annotation-stage.test.ts @@ -205,3 +205,171 @@ test('annotateTokens N+1 minimum sentence words counts only eligible word tokens assert.equal(result[2]?.isKnown, true); assert.equal(result[0]?.isNPlusOneTarget, false); }); + +test('annotateTokens applies configured pos1 exclusions to both frequency and N+1', () => { + const tokens = [ + makeToken({ + surface: '猫', + headword: '猫', + pos1: '名詞', + frequencyRank: 21, + startPos: 0, + endPos: 1, + }), + makeToken({ + surface: '走る', + headword: '走る', + pos1: '動詞', + partOfSpeech: PartOfSpeech.verb, + startPos: 1, + endPos: 3, + frequencyRank: 22, + }), + ]; + + const result = annotateTokens( + tokens, + makeDeps({ + isKnownWord: (text) => text === '走る', + }), + { + minSentenceWordsForNPlusOne: 1, + pos1Exclusions: new Set(['名詞']), + }, + ); + + assert.equal(result[0]?.frequencyRank, undefined); + assert.equal(result[1]?.frequencyRank, 22); + assert.equal(result[0]?.isNPlusOneTarget, false); + assert.equal(result[1]?.isNPlusOneTarget, false); +}); + +test('annotateTokens allows previously default-excluded pos1 when removed from effective set', () => { + const tokens = [ + makeToken({ + surface: 'は', + headword: 'は', + partOfSpeech: PartOfSpeech.other, + pos1: '助詞', + startPos: 0, + endPos: 1, + frequencyRank: 8, + }), + ]; + + const result = annotateTokens(tokens, makeDeps(), { + minSentenceWordsForNPlusOne: 1, + pos1Exclusions: new Set(), + }); + + assert.equal(result[0]?.frequencyRank, 8); + assert.equal(result[0]?.isNPlusOneTarget, true); +}); + +test('annotateTokens excludes default non-independent pos2 from frequency and N+1', () => { + const tokens = [ + makeToken({ + surface: 'になれば', + headword: 'なる', + partOfSpeech: PartOfSpeech.verb, + pos1: '動詞', + pos2: '非自立', + startPos: 0, + endPos: 4, + frequencyRank: 7, + }), + ]; + + const result = annotateTokens(tokens, makeDeps(), { + minSentenceWordsForNPlusOne: 1, + }); + + assert.equal(result[0]?.frequencyRank, undefined); + assert.equal(result[0]?.isNPlusOneTarget, false); +}); + +test('annotateTokens excludes likely kana SFX tokens from frequency when POS tags are missing', () => { + const tokens = [ + makeToken({ + surface: 'ぐわっ', + reading: 'ぐわっ', + headword: 'ぐわっ', + pos1: '', + pos2: '', + frequencyRank: 12, + startPos: 0, + endPos: 3, + }), + ]; + + const result = annotateTokens(tokens, makeDeps(), { + minSentenceWordsForNPlusOne: 1, + }); + + assert.equal(result[0]?.frequencyRank, undefined); +}); + +test('annotateTokens allows previously default-excluded pos2 when removed from effective set', () => { + const tokens = [ + makeToken({ + surface: 'になれば', + headword: 'なる', + partOfSpeech: PartOfSpeech.verb, + pos1: '動詞', + pos2: '非自立', + startPos: 0, + endPos: 4, + frequencyRank: 9, + }), + ]; + + const result = annotateTokens(tokens, makeDeps(), { + minSentenceWordsForNPlusOne: 1, + pos2Exclusions: new Set(), + }); + + assert.equal(result[0]?.frequencyRank, 9); + assert.equal(result[0]?.isNPlusOneTarget, true); +}); + +test('annotateTokens keeps composite tokens when any component pos tag is content-bearing', () => { + const tokens = [ + makeToken({ + surface: 'になれば', + headword: 'なる', + pos1: '助詞|動詞', + pos2: '格助詞|自立|接続助詞', + startPos: 0, + endPos: 4, + frequencyRank: 5, + }), + ]; + + const result = annotateTokens(tokens, makeDeps(), { + minSentenceWordsForNPlusOne: 1, + }); + + assert.equal(result[0]?.frequencyRank, 5); + assert.equal(result[0]?.isNPlusOneTarget, true); +}); + +test('annotateTokens excludes composite tokens when all component pos tags are excluded', () => { + const tokens = [ + makeToken({ + surface: 'けど', + headword: 'けど', + pos1: '助詞|助詞', + pos2: '接続助詞|終助詞', + startPos: 0, + endPos: 2, + frequencyRank: 6, + }), + ]; + + const result = annotateTokens(tokens, makeDeps(), { + minSentenceWordsForNPlusOne: 1, + }); + + assert.equal(result[0]?.frequencyRank, undefined); + assert.equal(result[0]?.isNPlusOneTarget, false); +}); diff --git a/src/core/services/tokenizer/annotation-stage.ts b/src/core/services/tokenizer/annotation-stage.ts index af409e4..922f873 100644 --- a/src/core/services/tokenizer/annotation-stage.ts +++ b/src/core/services/tokenizer/annotation-stage.ts @@ -1,4 +1,12 @@ import { markNPlusOneTargets } from '../../../token-merger'; +import { + DEFAULT_ANNOTATION_POS1_EXCLUSION_CONFIG, + resolveAnnotationPos1ExclusionSet, +} from '../../../token-pos1-exclusions'; +import { + DEFAULT_ANNOTATION_POS2_EXCLUSION_CONFIG, + resolveAnnotationPos2ExclusionSet, +} from '../../../token-pos2-exclusions'; import { JlptLevel, MergedToken, @@ -28,6 +36,8 @@ export interface AnnotationStageOptions { jlptEnabled?: boolean; frequencyEnabled?: boolean; minSentenceWordsForNPlusOne?: number; + pos1Exclusions?: ReadonlySet; + pos2Exclusions?: ReadonlySet; } function resolveKnownWordText( @@ -53,22 +63,85 @@ function applyKnownWordMarking( }); } -function isFrequencyExcludedByPos(token: MergedToken): boolean { - if ( - token.partOfSpeech === PartOfSpeech.particle || - token.partOfSpeech === PartOfSpeech.bound_auxiliary - ) { +function normalizePos1Tag(pos1: string | undefined): string { + return typeof pos1 === 'string' ? pos1.trim() : ''; +} + +function isExcludedByTagSet( + normalizedTag: string, + exclusions: ReadonlySet, +): boolean { + if (!normalizedTag) { + return false; + } + const parts = normalizedTag + .split('|') + .map((part) => part.trim()) + .filter((part) => part.length > 0); + if (parts.length === 0) { + return false; + } + return parts.every((part) => exclusions.has(part)); +} + +function resolvePos1Exclusions(options: AnnotationStageOptions): ReadonlySet { + if (options.pos1Exclusions) { + return options.pos1Exclusions; + } + + return resolveAnnotationPos1ExclusionSet(DEFAULT_ANNOTATION_POS1_EXCLUSION_CONFIG); +} + +function resolvePos2Exclusions(options: AnnotationStageOptions): ReadonlySet { + if (options.pos2Exclusions) { + return options.pos2Exclusions; + } + + return resolveAnnotationPos2ExclusionSet(DEFAULT_ANNOTATION_POS2_EXCLUSION_CONFIG); +} + +function normalizePos2Tag(pos2: string | undefined): string { + return typeof pos2 === 'string' ? pos2.trim() : ''; +} + +function isFrequencyExcludedByPos( + token: MergedToken, + pos1Exclusions: ReadonlySet, + pos2Exclusions: ReadonlySet, +): boolean { + const normalizedPos1 = normalizePos1Tag(token.pos1); + const hasPos1 = normalizedPos1.length > 0; + if (isExcludedByTagSet(normalizedPos1, pos1Exclusions)) { return true; } - return token.pos1 === '助詞' || token.pos1 === '助動詞'; + const normalizedPos2 = normalizePos2Tag(token.pos2); + const hasPos2 = normalizedPos2.length > 0; + if (isExcludedByTagSet(normalizedPos2, pos2Exclusions)) { + return true; + } + + if (hasPos1 || hasPos2) { + return false; + } + + if (isLikelyFrequencyNoiseToken(token)) { + return true; + } + + return ( + token.partOfSpeech === PartOfSpeech.particle || + token.partOfSpeech === PartOfSpeech.bound_auxiliary + ); } function applyFrequencyMarking( tokens: MergedToken[], + pos1Exclusions: ReadonlySet, + pos2Exclusions: ReadonlySet, ): MergedToken[] { return tokens.map((token) => { - if (isFrequencyExcludedByPos(token)) { + if (isFrequencyExcludedByPos(token, pos1Exclusions, pos2Exclusions)) { return { ...token, frequencyRank: undefined }; } @@ -203,6 +276,101 @@ function isRepeatedKanaSfx(text: string): boolean { return topCount >= Math.ceil(chars.length / 2); } +function isTrailingSmallTsuKanaSfx(text: string): boolean { + const normalized = normalizeJlptTextForExclusion(text); + if (!normalized) { + return false; + } + + const chars = [...normalized]; + if (chars.length < 2 || chars.length > 4) { + return false; + } + + if (!chars.every(isKanaChar)) { + return false; + } + + return chars[chars.length - 1] === 'っ'; +} + +function isReduplicatedKanaSfx(text: string): boolean { + const normalized = normalizeJlptTextForExclusion(text); + if (!normalized) { + return false; + } + + const chars = [...normalized]; + if (chars.length < 4 || chars.length % 2 !== 0) { + return false; + } + + if (!chars.every(isKanaChar)) { + return false; + } + + const half = chars.length / 2; + return chars.slice(0, half).join('') === chars.slice(half).join(''); +} + +function hasAdjacentKanaRepeat(text: string): boolean { + const normalized = normalizeJlptTextForExclusion(text); + if (!normalized) { + return false; + } + + const chars = [...normalized]; + if (!chars.every(isKanaChar)) { + return false; + } + + for (let i = 1; i < chars.length; i += 1) { + if (chars[i] === chars[i - 1]) { + return true; + } + } + + return false; +} + +function isLikelyFrequencyNoiseToken(token: MergedToken): boolean { + const candidates = [token.headword, token.surface].filter( + (candidate): candidate is string => typeof candidate === 'string' && candidate.length > 0, + ); + + for (const candidate of candidates) { + const trimmedCandidate = candidate.trim(); + if (!trimmedCandidate) { + continue; + } + + const normalizedCandidate = normalizeJlptTextForExclusion(trimmedCandidate); + if (!normalizedCandidate) { + continue; + } + + if ( + shouldIgnoreJlptByTerm(trimmedCandidate) || + shouldIgnoreJlptByTerm(normalizedCandidate) + ) { + return true; + } + + if ( + hasAdjacentKanaRepeat(trimmedCandidate) || + hasAdjacentKanaRepeat(normalizedCandidate) || + isReduplicatedKanaSfx(trimmedCandidate) || + isReduplicatedKanaSfx(normalizedCandidate) || + isTrailingSmallTsuKanaSfx(trimmedCandidate) || + isTrailingSmallTsuKanaSfx(normalizedCandidate) + ) { + return true; + } + } + + return false; +} + function isJlptEligibleToken(token: MergedToken): boolean { if (token.pos1 && shouldIgnoreJlptForMecabPos1(token.pos1)) { return false; @@ -261,6 +429,8 @@ export function annotateTokens( deps: AnnotationStageDeps, options: AnnotationStageOptions = {}, ): MergedToken[] { + const pos1Exclusions = resolvePos1Exclusions(options); + const pos2Exclusions = resolvePos2Exclusions(options); const nPlusOneEnabled = options.nPlusOneEnabled !== false; const knownMarkedTokens = nPlusOneEnabled ? applyKnownWordMarking(tokens, deps.isKnownWord, deps.knownWordMatchMode) @@ -273,7 +443,7 @@ export function annotateTokens( const frequencyEnabled = options.frequencyEnabled !== false; const frequencyMarkedTokens = frequencyEnabled - ? applyFrequencyMarking(knownMarkedTokens) + ? applyFrequencyMarking(knownMarkedTokens, pos1Exclusions, pos2Exclusions) : knownMarkedTokens.map((token) => ({ ...token, frequencyRank: undefined, @@ -303,5 +473,10 @@ export function annotateTokens( ? minSentenceWordsForNPlusOne : 3; - return markNPlusOneTargets(jlptMarkedTokens, sanitizedMinSentenceWordsForNPlusOne); + return markNPlusOneTargets( + jlptMarkedTokens, + sanitizedMinSentenceWordsForNPlusOne, + pos1Exclusions, + pos2Exclusions, + ); } diff --git a/src/core/services/tokenizer/parser-enrichment-stage.test.ts b/src/core/services/tokenizer/parser-enrichment-stage.test.ts index a00f82c..86178a4 100644 --- a/src/core/services/tokenizer/parser-enrichment-stage.test.ts +++ b/src/core/services/tokenizer/parser-enrichment-stage.test.ts @@ -22,12 +22,13 @@ function makeToken(overrides: Partial): MergedToken { test('enrichTokensWithMecabPos1 picks pos1 by best overlap when no surface match exists', () => { const tokens = [makeToken({ surface: 'grouped', startPos: 2, endPos: 7 })]; const mecabTokens = [ - makeToken({ surface: 'left', startPos: 0, endPos: 4, pos1: 'A' }), - makeToken({ surface: 'right', startPos: 2, endPos: 6, pos1: 'B' }), + makeToken({ surface: 'left', startPos: 0, endPos: 4, pos1: 'A', pos2: 'L2' }), + makeToken({ surface: 'right', startPos: 2, endPos: 6, pos1: 'B', pos2: '非自立' }), ]; const enriched = enrichTokensWithMecabPos1(tokens, mecabTokens); - assert.equal(enriched[0]?.pos1, 'B'); + assert.equal(enriched[0]?.pos1, 'A|B'); + assert.equal(enriched[0]?.pos2, 'L2|非自立'); }); test('enrichTokensWithMecabPos1 fills missing pos1 using surface-sequence fallback', () => { diff --git a/src/core/services/tokenizer/parser-enrichment-stage.ts b/src/core/services/tokenizer/parser-enrichment-stage.ts index 3c3aeb2..857d255 100644 --- a/src/core/services/tokenizer/parser-enrichment-stage.ts +++ b/src/core/services/tokenizer/parser-enrichment-stage.ts @@ -1,13 +1,45 @@ import { MergedToken } from '../../../types'; -function pickClosestMecabPos1(token: MergedToken, mecabTokens: MergedToken[]): string | undefined { - if (mecabTokens.length === 0) { +type MecabPosMetadata = { + pos1: string; + pos2?: string; + pos3?: string; +}; + +function joinUniqueTags(values: Array): string | undefined { + const unique: string[] = []; + for (const value of values) { + if (!value) { + continue; + } + const trimmed = value.trim(); + if (!trimmed) { + continue; + } + if (!unique.includes(trimmed)) { + unique.push(trimmed); + } + } + if (unique.length === 0) { return undefined; } + if (unique.length === 1) { + return unique[0]; + } + return unique.join('|'); +} + +function pickClosestMecabPosMetadata( + token: MergedToken, + mecabTokens: MergedToken[], +): MecabPosMetadata | null { + if (mecabTokens.length === 0) { + return null; + } const tokenStart = token.startPos ?? 0; const tokenEnd = token.endPos ?? tokenStart + token.surface.length; - let bestSurfaceMatchPos1: string | undefined; + let bestSurfaceMatchToken: MergedToken | null = null; let bestSurfaceMatchDistance = Number.MAX_SAFE_INTEGER; let bestSurfaceMatchEndDistance = Number.MAX_SAFE_INTEGER; @@ -31,19 +63,24 @@ function pickClosestMecabPos1(token: MergedToken, mecabTokens: MergedToken[]): s ) { bestSurfaceMatchDistance = startDistance; bestSurfaceMatchEndDistance = endDistance; - bestSurfaceMatchPos1 = mecabToken.pos1; + bestSurfaceMatchToken = mecabToken; } } - if (bestSurfaceMatchPos1) { - return bestSurfaceMatchPos1; + if (bestSurfaceMatchToken) { + return { + pos1: bestSurfaceMatchToken.pos1 as string, + pos2: bestSurfaceMatchToken.pos2, + pos3: bestSurfaceMatchToken.pos3, + }; } - let bestPos1: string | undefined; + let bestToken: MergedToken | null = null; let bestOverlap = 0; let bestSpan = 0; let bestStartDistance = Number.MAX_SAFE_INTEGER; let bestStart = Number.MAX_SAFE_INTEGER; + const overlappingTokens: MergedToken[] = []; for (const mecabToken of mecabTokens) { if (!mecabToken.pos1) { @@ -58,6 +95,7 @@ function pickClosestMecabPos1(token: MergedToken, mecabTokens: MergedToken[]): s if (overlap === 0) { continue; } + overlappingTokens.push(mecabToken); const span = mecabEnd - mecabStart; if ( @@ -71,11 +109,23 @@ function pickClosestMecabPos1(token: MergedToken, mecabTokens: MergedToken[]): s bestSpan = span; bestStartDistance = Math.abs(mecabStart - tokenStart); bestStart = mecabStart; - bestPos1 = mecabToken.pos1; + bestToken = mecabToken; } } - return bestOverlap > 0 ? bestPos1 : undefined; + if (bestOverlap === 0 || !bestToken) { + return null; + } + + const overlapPos1 = joinUniqueTags(overlappingTokens.map((token) => token.pos1)); + const overlapPos2 = joinUniqueTags(overlappingTokens.map((token) => token.pos2)); + const overlapPos3 = joinUniqueTags(overlappingTokens.map((token) => token.pos3)); + + return { + pos1: overlapPos1 ?? (bestToken.pos1 as string), + pos2: overlapPos2 ?? bestToken.pos2, + pos3: overlapPos3 ?? bestToken.pos3, + }; } function fillMissingPos1BySurfaceSequence( @@ -101,7 +151,7 @@ function fillMissingPos1BySurfaceSequence( return token; } - let best: { pos1: string; index: number } | null = null; + let best: { token: MergedToken; index: number } | null = null; for (const candidate of indexedMecabTokens) { if (candidate.token.surface !== surface) { continue; @@ -109,7 +159,7 @@ function fillMissingPos1BySurfaceSequence( if (candidate.index < cursor) { continue; } - best = { pos1: candidate.token.pos1 as string, index: candidate.index }; + best = { token: candidate.token, index: candidate.index }; break; } @@ -118,7 +168,7 @@ function fillMissingPos1BySurfaceSequence( if (candidate.token.surface !== surface) { continue; } - best = { pos1: candidate.token.pos1 as string, index: candidate.index }; + best = { token: candidate.token, index: candidate.index }; break; } } @@ -130,7 +180,9 @@ function fillMissingPos1BySurfaceSequence( cursor = best.index + 1; return { ...token, - pos1: best.pos1, + pos1: best.token.pos1, + pos2: best.token.pos2, + pos3: best.token.pos3, }; }); } @@ -152,14 +204,16 @@ export function enrichTokensWithMecabPos1( return token; } - const pos1 = pickClosestMecabPos1(token, mecabTokens); - if (!pos1) { + const metadata = pickClosestMecabPosMetadata(token, mecabTokens); + if (!metadata) { return token; } return { ...token, - pos1, + pos1: metadata.pos1, + pos2: metadata.pos2, + pos3: metadata.pos3, }; }); diff --git a/src/token-merger.ts b/src/token-merger.ts index 55dfb66..c30d986 100644 --- a/src/token-merger.ts +++ b/src/token-merger.ts @@ -17,6 +17,8 @@ */ import { PartOfSpeech, Token, MergedToken } from './types'; +import { DEFAULT_ANNOTATION_POS1_EXCLUSION_CONFIG } from './token-pos1-exclusions'; +import { DEFAULT_ANNOTATION_POS2_EXCLUSION_CONFIG } from './token-pos2-exclusions'; export function isNoun(tok: Token): boolean { return tok.partOfSpeech === PartOfSpeech.noun; @@ -241,25 +243,71 @@ export function mergeTokens( } const SENTENCE_BOUNDARY_SURFACES = new Set(['。', '?', '!', '?', '!', '…', '\u2026']); -const N_PLUS_ONE_IGNORED_POS1 = new Set(['助詞', '助動詞', '記号', '補助記号']); +const N_PLUS_ONE_IGNORED_POS1 = new Set( + DEFAULT_ANNOTATION_POS1_EXCLUSION_CONFIG.defaults, +); +const N_PLUS_ONE_IGNORED_POS2 = new Set( + DEFAULT_ANNOTATION_POS2_EXCLUSION_CONFIG.defaults, +); -export function isNPlusOneCandidateToken(token: MergedToken): boolean { +function normalizePos1Tag(pos1: string | undefined): string { + return typeof pos1 === 'string' ? pos1.trim() : ''; +} + +function normalizePos2Tag(pos2: string | undefined): string { + return typeof pos2 === 'string' ? pos2.trim() : ''; +} + +function isExcludedByTagSet( + normalizedTag: string, + exclusions: ReadonlySet, +): boolean { + if (!normalizedTag) { + return false; + } + const parts = normalizedTag + .split('|') + .map((part) => part.trim()) + .filter((part) => part.length > 0); + if (parts.length === 0) { + return false; + } + return parts.every((part) => exclusions.has(part)); +} + +export function isNPlusOneCandidateToken( + token: MergedToken, + pos1Exclusions: ReadonlySet = N_PLUS_ONE_IGNORED_POS1, + pos2Exclusions: ReadonlySet = N_PLUS_ONE_IGNORED_POS2, +): boolean { if (token.isKnown) { return false; } - return isNPlusOneWordCountToken(token); + return isNPlusOneWordCountToken(token, pos1Exclusions, pos2Exclusions); } -function isNPlusOneWordCountToken(token: MergedToken): boolean { - if (token.partOfSpeech === PartOfSpeech.particle) { +function isNPlusOneWordCountToken( + token: MergedToken, + pos1Exclusions: ReadonlySet = N_PLUS_ONE_IGNORED_POS1, + pos2Exclusions: ReadonlySet = N_PLUS_ONE_IGNORED_POS2, +): boolean { + const normalizedPos1 = normalizePos1Tag(token.pos1); + const hasPos1 = normalizedPos1.length > 0; + if (isExcludedByTagSet(normalizedPos1, pos1Exclusions)) { return false; } - if (token.partOfSpeech === PartOfSpeech.bound_auxiliary) { + const normalizedPos2 = normalizePos2Tag(token.pos2); + const hasPos2 = normalizedPos2.length > 0; + if (isExcludedByTagSet(normalizedPos2, pos2Exclusions)) { return false; } - if (token.partOfSpeech === PartOfSpeech.symbol) { + if (!hasPos1 && !hasPos2 && ( + token.partOfSpeech === PartOfSpeech.particle || + token.partOfSpeech === PartOfSpeech.bound_auxiliary || + token.partOfSpeech === PartOfSpeech.symbol + )) { return false; } @@ -271,10 +319,6 @@ function isNPlusOneWordCountToken(token: MergedToken): boolean { return false; } - if (token.pos1 && N_PLUS_ONE_IGNORED_POS1.has(token.pos1)) { - return false; - } - if (token.surface.trim().length === 0) { return false; } @@ -290,7 +334,12 @@ function isSentenceBoundaryToken(token: MergedToken): boolean { return SENTENCE_BOUNDARY_SURFACES.has(token.surface); } -export function markNPlusOneTargets(tokens: MergedToken[], minSentenceWords = 3): MergedToken[] { +export function markNPlusOneTargets( + tokens: MergedToken[], + minSentenceWords = 3, + pos1Exclusions: ReadonlySet = N_PLUS_ONE_IGNORED_POS1, + pos2Exclusions: ReadonlySet = N_PLUS_ONE_IGNORED_POS2, +): MergedToken[] { if (tokens.length === 0) { return []; } @@ -311,11 +360,11 @@ export function markNPlusOneTargets(tokens: MergedToken[], minSentenceWords = 3) for (let i = start; i < endExclusive; i++) { const token = markedTokens[i]; if (!token) continue; - if (isNPlusOneWordCountToken(token)) { + if (isNPlusOneWordCountToken(token, pos1Exclusions, pos2Exclusions)) { sentenceWordCount += 1; } - if (isNPlusOneCandidateToken(token)) { + if (isNPlusOneCandidateToken(token, pos1Exclusions, pos2Exclusions)) { sentenceCandidates.push(i); } } diff --git a/src/token-pos1-exclusions.ts b/src/token-pos1-exclusions.ts new file mode 100644 index 0000000..f0fb258 --- /dev/null +++ b/src/token-pos1-exclusions.ts @@ -0,0 +1,53 @@ +import type { ResolvedTokenPos1ExclusionConfig, TokenPos1ExclusionConfig } from './types'; + +export const DEFAULT_ANNOTATION_POS1_EXCLUSION_DEFAULTS = Object.freeze([ + '助詞', + '助動詞', + '記号', + '補助記号', + '連体詞', + '感動詞', + '接続詞', + '接頭詞', +]) as readonly string[]; + +export const DEFAULT_ANNOTATION_POS1_EXCLUSION_CONFIG: ResolvedTokenPos1ExclusionConfig = { + defaults: [...DEFAULT_ANNOTATION_POS1_EXCLUSION_DEFAULTS], + add: [], + remove: [], +}; + +function normalizePosTag(value: string): string { + return value.trim(); +} + +export function normalizePos1ExclusionList(values: readonly string[]): string[] { + const deduped = new Set(); + for (const value of values) { + const normalized = normalizePosTag(value); + if (!normalized) { + continue; + } + deduped.add(normalized); + } + return [...deduped]; +} + +export function resolveAnnotationPos1ExclusionSet( + config: TokenPos1ExclusionConfig | ResolvedTokenPos1ExclusionConfig, +): ReadonlySet { + const defaults = normalizePos1ExclusionList(config.defaults ?? []); + const added = normalizePos1ExclusionList(config.add ?? []); + const removed = new Set(normalizePos1ExclusionList(config.remove ?? [])); + const resolved = new Set(); + for (const value of defaults) { + resolved.add(value); + } + for (const value of added) { + resolved.add(value); + } + for (const value of removed) { + resolved.delete(value); + } + return resolved; +} diff --git a/src/token-pos2-exclusions.ts b/src/token-pos2-exclusions.ts new file mode 100644 index 0000000..a6eeef2 --- /dev/null +++ b/src/token-pos2-exclusions.ts @@ -0,0 +1,29 @@ +import type { ResolvedTokenPos2ExclusionConfig, TokenPos2ExclusionConfig } from './types'; +import { normalizePos1ExclusionList } from './token-pos1-exclusions'; + +export const DEFAULT_ANNOTATION_POS2_EXCLUSION_DEFAULTS = Object.freeze(['非自立']) as readonly string[]; + +export const DEFAULT_ANNOTATION_POS2_EXCLUSION_CONFIG: ResolvedTokenPos2ExclusionConfig = { + defaults: [...DEFAULT_ANNOTATION_POS2_EXCLUSION_DEFAULTS], + add: [], + remove: [], +}; + +export function resolveAnnotationPos2ExclusionSet( + config: TokenPos2ExclusionConfig | ResolvedTokenPos2ExclusionConfig, +): ReadonlySet { + const defaults = normalizePos1ExclusionList(config.defaults ?? []); + const added = normalizePos1ExclusionList(config.add ?? []); + const removed = new Set(normalizePos1ExclusionList(config.remove ?? [])); + const resolved = new Set(); + for (const value of defaults) { + resolved.add(value); + } + for (const value of added) { + resolved.add(value); + } + for (const value of removed) { + resolved.delete(value); + } + return resolved; +} diff --git a/src/types.ts b/src/types.ts index 4915f2c..8830a27 100644 --- a/src/types.ts +++ b/src/types.ts @@ -334,6 +334,30 @@ export interface SubtitleStyleConfig { }; } +export interface TokenPos1ExclusionConfig { + defaults?: string[]; + add?: string[]; + remove?: string[]; +} + +export interface ResolvedTokenPos1ExclusionConfig { + defaults: string[]; + add: string[]; + remove: string[]; +} + +export interface TokenPos2ExclusionConfig { + defaults?: string[]; + add?: string[]; + remove?: string[]; +} + +export interface ResolvedTokenPos2ExclusionConfig { + defaults: string[]; + add: string[]; + remove: string[]; +} + export type FrequencyDictionaryMode = 'single' | 'banded'; export interface ShortcutsConfig {