Replace grammar-ending permutations with shared matcher; preserve word audio

- Extract `grammar-ending.ts` with `isStandaloneGrammarEndingText` / `isSubtitleGrammarEndingText` pattern matchers
- Replace `STANDALONE_GRAMMAR_ENDINGS` set in parser-selection-stage with shared matcher
- Replace generated phrase sets in subtitle-annotation-filter with shared matcher
- Remove stale duplicate subtitle-exclusion constants and helpers from annotation-stage
- Manual clipboard card updates now write only to the sentence audio field, leaving word/expression audio untouched
This commit is contained in:
2026-05-02 23:25:33 -07:00
parent f83005bf70
commit a9625f8777
15 changed files with 285 additions and 265 deletions
@@ -126,7 +126,7 @@ function createManualUpdateService(overrides: Partial<CardCreationDeps> = {}): {
};
}
test('manual clipboard subtitle update replaces expression and sentence audio even when overwriteAudio is disabled', async () => {
test('manual clipboard subtitle update replaces sentence audio without touching expression audio', async () => {
const { service, updatedFields, mergeCalls, storedMedia } = createManualUpdateService();
await service.updateLastAddedFromClipboard('字幕');
@@ -134,10 +134,10 @@ test('manual clipboard subtitle update replaces expression and sentence audio ev
assert.equal(updatedFields.length, 1);
assert.equal(storedMedia.length, 1);
const audioValue = `[sound:${storedMedia[0]}]`;
assert.equal(updatedFields[0]?.ExpressionAudio, audioValue);
assert.equal(updatedFields[0]?.SentenceAudio, audioValue);
assert.equal('ExpressionAudio' in updatedFields[0]!, false);
assert.deepEqual(
mergeCalls.map((call) => call.overwrite),
[true, true],
[true],
);
});
+7 -18
View File
@@ -219,10 +219,6 @@ export class CardCreationService {
this.deps.getConfig(),
);
const sentenceAudioField = this.getResolvedSentenceAudioFieldName(noteInfo);
const expressionAudioField = this.deps.resolveConfiguredFieldName(
noteInfo,
this.deps.getConfig().fields?.audio || 'ExpressionAudio',
);
const sentenceField = this.deps.getEffectiveSentenceCardConfig().sentenceField;
const sentence = blocks.join(' ');
@@ -252,22 +248,15 @@ export class CardCreationService {
if (audioBuffer) {
await this.deps.client.storeMediaFile(audioFilename, audioBuffer);
if (sentenceAudioField || expressionAudioField) {
if (sentenceAudioField) {
const audioValue = `[sound:${audioFilename}]`;
const audioFields = new Set(
[sentenceAudioField, expressionAudioField].filter(
(fieldName): fieldName is string => Boolean(fieldName),
),
const existingAudio = noteInfo.fields[sentenceAudioField]?.value || '';
// Manual clipboard updates intentionally replace old captured sentence audio.
updatedFields[sentenceAudioField] = this.deps.mergeFieldValue(
existingAudio,
audioValue,
true,
);
for (const audioField of audioFields) {
const existingAudio = noteInfo.fields[audioField]?.value || '';
// Manual clipboard updates intentionally replace old captured audio.
updatedFields[audioField] = this.deps.mergeFieldValue(
existingAudio,
audioValue,
true,
);
}
}
miscInfoFilename = audioFilename;
updatePerformed = true;
@@ -301,6 +301,31 @@ test('shouldExcludeTokenFromSubtitleAnnotations excludes standalone polite copul
}
});
// Both surfaces are absent from the enumerated phrase lists this commit removes,
// so they only pass if the shared pattern matcher recognises them.
test('shouldExcludeTokenFromSubtitleAnnotations excludes grammar-ending patterns without enumerating variants', () => {
  const grammarEndings = [
    { surface: 'ですわ', headword: 'です', reading: 'デスワ' },
    { surface: 'ではないですか', headword: 'ない', reading: 'デハナイデスカ' },
  ];
  const tokens = grammarEndings.map((grammarEnding) =>
    makeToken({
      ...grammarEnding,
      partOfSpeech: PartOfSpeech.other,
      pos1: '',
      pos2: '',
    }),
  );

  tokens.forEach((token) => {
    assert.equal(shouldExcludeTokenFromSubtitleAnnotations(token), true, token.surface);
  });
});
test('shouldExcludeTokenFromSubtitleAnnotations excludes auxiliary-stem そうだ grammar tails', () => {
const token = makeToken({
surface: 'そうだ',
+1 -147
View File
@@ -18,57 +18,6 @@ const KATAKANA_TO_HIRAGANA_OFFSET = 0x60;
const KATAKANA_CODEPOINT_START = 0x30a1;
const KATAKANA_CODEPOINT_END = 0x30f6;
const JLPT_LEVEL_LOOKUP_CACHE_LIMIT = 2048;
const SUBTITLE_ANNOTATION_EXCLUDED_TERMS = new Set([
'ああ',
'ええ',
'うう',
'おお',
'はあ',
'はは',
'へえ',
'ふう',
'ほう',
]);
const SUBTITLE_ANNOTATION_EXCLUDED_EXPLANATORY_ENDING_PREFIXES = ['ん', 'の', 'なん', 'なの'];
const SUBTITLE_ANNOTATION_EXCLUDED_EXPLANATORY_ENDING_CORES = [
'だ',
'です',
'でした',
'だった',
'では',
'じゃ',
'でしょう',
'だろう',
] as const;
const SUBTITLE_ANNOTATION_EXCLUDED_EXPLANATORY_ENDING_TRAILING_PARTICLES = [
'',
'か',
'ね',
'よ',
'な',
'けど',
'よね',
'かな',
'かね',
] as const;
const SUBTITLE_ANNOTATION_EXCLUDED_EXPLANATORY_ENDINGS = new Set(
SUBTITLE_ANNOTATION_EXCLUDED_EXPLANATORY_ENDING_PREFIXES.flatMap((prefix) =>
SUBTITLE_ANNOTATION_EXCLUDED_EXPLANATORY_ENDING_CORES.flatMap((core) =>
SUBTITLE_ANNOTATION_EXCLUDED_EXPLANATORY_ENDING_TRAILING_PARTICLES.map(
(particle) => `${prefix}${core}${particle}`,
),
),
),
);
const SUBTITLE_ANNOTATION_EXCLUDED_TRAILING_PARTICLE_SUFFIXES = new Set([
'って',
'ってよ',
'ってね',
'ってな',
'ってさ',
'ってか',
'ってば',
]);
const jlptLevelLookupCaches = new WeakMap<
(text: string) => JlptLevel | null,
@@ -104,10 +53,6 @@ function normalizePos1Tag(pos1: string | undefined): string {
return typeof pos1 === 'string' ? pos1.trim() : '';
}
const SUBTITLE_ANNOTATION_EXCLUDED_POS1 = new Set(['感動詞']);
const SUBTITLE_ANNOTATION_GRAMMAR_ONLY_POS1 = new Set(['助詞', '助動詞', '連体詞']);
const AUXILIARY_STEM_GRAMMAR_TAIL_POS1 = new Set(['名詞', '助動詞', '助詞']);
function splitNormalizedTagParts(normalizedTag: string): string[] {
if (!normalizedTag) {
return [];
@@ -129,57 +74,6 @@ function isExcludedByTagSet(normalizedTag: string, exclusions: ReadonlySet<strin
return parts.some((part) => exclusions.has(part));
}
function isExcludedFromSubtitleAnnotationsByPos1(normalizedPos1: string): boolean {
const parts = splitNormalizedTagParts(normalizedPos1);
if (parts.some((part) => SUBTITLE_ANNOTATION_EXCLUDED_POS1.has(part))) {
return true;
}
return parts.length > 0 && parts.every((part) => SUBTITLE_ANNOTATION_GRAMMAR_ONLY_POS1.has(part));
}
function isExcludedTrailingParticleMergedToken(token: MergedToken): boolean {
const normalizedSurface = normalizeJlptTextForExclusion(token.surface);
const normalizedHeadword = normalizeJlptTextForExclusion(token.headword);
if (
!normalizedSurface ||
!normalizedHeadword ||
!normalizedSurface.startsWith(normalizedHeadword)
) {
return false;
}
const suffix = normalizedSurface.slice(normalizedHeadword.length);
if (!SUBTITLE_ANNOTATION_EXCLUDED_TRAILING_PARTICLE_SUFFIXES.has(suffix)) {
return false;
}
const pos1Parts = splitNormalizedTagParts(normalizePos1Tag(token.pos1));
if (pos1Parts.length < 2) {
return false;
}
const [leadingPos1, ...trailingPos1] = pos1Parts;
if (!leadingPos1 || SUBTITLE_ANNOTATION_GRAMMAR_ONLY_POS1.has(leadingPos1)) {
return false;
}
return trailingPos1.length > 0 && trailingPos1.every((part) => part === '助詞');
}
function isAuxiliaryStemGrammarTailToken(token: MergedToken): boolean {
const pos1Parts = splitNormalizedTagParts(normalizePos1Tag(token.pos1));
if (
pos1Parts.length === 0 ||
!pos1Parts.every((part) => AUXILIARY_STEM_GRAMMAR_TAIL_POS1.has(part))
) {
return false;
}
const pos3Parts = splitNormalizedTagParts(normalizePos2Tag(token.pos3));
return pos3Parts.includes('助動詞語幹');
}
function resolvePos1Exclusions(options: AnnotationStageOptions): ReadonlySet<string> {
if (options.pos1Exclusions) {
return options.pos1Exclusions;
@@ -609,44 +503,6 @@ function isJlptEligibleToken(token: MergedToken): boolean {
return true;
}
function isExcludedFromSubtitleAnnotationsByTerm(token: MergedToken): boolean {
const candidates = [token.surface, token.reading, resolveJlptLookupText(token)].filter(
(candidate): candidate is string => typeof candidate === 'string' && candidate.length > 0,
);
for (const candidate of candidates) {
const trimmedCandidate = candidate.trim();
if (!trimmedCandidate) {
continue;
}
const normalizedCandidate = normalizeJlptTextForExclusion(trimmedCandidate);
if (!normalizedCandidate) {
continue;
}
if (
SUBTITLE_ANNOTATION_EXCLUDED_TERMS.has(trimmedCandidate) ||
SUBTITLE_ANNOTATION_EXCLUDED_TERMS.has(normalizedCandidate) ||
SUBTITLE_ANNOTATION_EXCLUDED_EXPLANATORY_ENDINGS.has(trimmedCandidate) ||
SUBTITLE_ANNOTATION_EXCLUDED_EXPLANATORY_ENDINGS.has(normalizedCandidate)
) {
return true;
}
if (
isTrailingSmallTsuKanaSfx(trimmedCandidate) ||
isTrailingSmallTsuKanaSfx(normalizedCandidate) ||
isReduplicatedKanaSfxWithOptionalTrailingTo(trimmedCandidate) ||
isReduplicatedKanaSfxWithOptionalTrailingTo(normalizedCandidate)
) {
return true;
}
}
return false;
}
/**
 * Reports whether `token` should be left out of subtitle annotations.
 *
 * Pure delegate: returns whatever `sharedShouldExcludeTokenFromSubtitleAnnotations`
 * returns for the same token, with no logic of its own.
 */
export function shouldExcludeTokenFromSubtitleAnnotations(token: MergedToken): boolean {
return sharedShouldExcludeTokenFromSubtitleAnnotations(token);
}
@@ -771,9 +627,7 @@ export function annotateTokens(
});
return {
...strippedToken,
isKnown:
nPlusOneEnabled &&
computeExcludedTokenKnownStatus(token, deps.isKnownWord),
isKnown: nPlusOneEnabled && computeExcludedTokenKnownStatus(token, deps.isKnownWord),
};
}
@@ -0,0 +1,124 @@
// Katakana folding parameters used by normalizeGrammarEndingText: code points in the
// inclusive range [KATAKANA_CODEPOINT_START, KATAKANA_CODEPOINT_END] are shifted down
// by KATAKANA_TO_HIRAGANA_OFFSET.
const KATAKANA_TO_HIRAGANA_OFFSET = 0x60;
const KATAKANA_CODEPOINT_START = 0x30a1;
const KATAKANA_CODEPOINT_END = 0x30f6;
// Tails accepted after 「です」 and after a negated copula stem; '' means "no tail".
const SENTENCE_FINAL_PARTICLE_SUFFIXES = ['', 'か', 'ね', 'よ', 'な', 'わ'] as const;
// Explanatory endings are matched as prefix + core + trailing particle.
const EXPLANATORY_ENDING_PREFIXES = ['ん', 'の', 'なん', 'なの'] as const;
const EXPLANATORY_ENDING_CORES = [
'だ',
'です',
'でした',
'だった',
'では',
'じゃ',
'でしょう',
'だろう',
] as const;
// '' allows an explanatory ending that stops right after its core.
const EXPLANATORY_ENDING_TRAILING_PARTICLES = [
'',
'か',
'ね',
'よ',
'な',
'けど',
'よね',
'かな',
'かね',
] as const;
// Tails that may follow an explanatory prefix directly, with no core in between.
const EXPLANATORY_ENDING_THOUGHT_SUFFIXES = ['か', 'かな', 'かね'] as const;
// Stems that form a negated copula when followed by 「ない」.
const NEGATIVE_COPULA_PREFIXES = ['じゃ', 'では'] as const;
/**
 * Prepares text for grammar-ending comparison.
 *
 * Surrounding whitespace is trimmed and every code point inside the katakana range is
 * shifted to its hiragana counterpart; all other characters pass through unchanged.
 *
 * @param text Raw surface, reading or headword text.
 * @returns The folded text, or `''` when nothing is left after trimming.
 */
export function normalizeGrammarEndingText(text: string): string {
  const trimmed = text.trim();

  // Array.from walks the string by code point, matching a `for…of` traversal.
  return Array.from(trimmed, (char) => {
    const codePoint = char.codePointAt(0);
    if (codePoint === undefined) {
      return '';
    }
    const isKatakana =
      codePoint >= KATAKANA_CODEPOINT_START && codePoint <= KATAKANA_CODEPOINT_END;
    return isKatakana ? String.fromCodePoint(codePoint - KATAKANA_TO_HIRAGANA_OFFSET) : char;
  }).join('');
}
/**
 * Whole-string membership test: true only when `text` is exactly one of `suffixes`.
 * An empty `text` matches only if `''` is itself listed.
 */
function matchesSuffix(text: string, suffixes: readonly string[]): boolean {
  return suffixes.includes(text);
}
/**
 * True when `text` is 「です」 followed by exactly one of the allowed sentence-final
 * tails (including no tail at all).
 */
function matchesPoliteCopulaEnding(text: string): boolean {
  const copula = 'です';
  return (
    text.startsWith(copula) &&
    matchesSuffix(text.slice(copula.length), SENTENCE_FINAL_PARTICLE_SUFFIXES)
  );
}
/**
 * True when `text` is a negated copula stem (a negative-copula prefix plus 「ない」)
 * followed by either a sentence-final tail or a polite copula ending.
 *
 * Only the first stem that `text` starts with is considered.
 */
function matchesNegativeCopulaEnding(text: string): boolean {
  const matchedStem = NEGATIVE_COPULA_PREFIXES.map((prefix) => `${prefix}ない`).find((stem) =>
    text.startsWith(stem),
  );
  if (matchedStem === undefined) {
    return false;
  }

  const tail = text.slice(matchedStem.length);
  if (matchesSuffix(tail, SENTENCE_FINAL_PARTICLE_SUFFIXES)) {
    return true;
  }
  return matchesPoliteCopulaEnding(tail);
}
/**
 * True when `text` is an explanatory ending in one of two shapes:
 *   - prefix + thought suffix, with nothing else, or
 *   - prefix + core + trailing particle (the particle may be empty).
 *
 * Every prefix and every core is tried, so a shorter core that fails (e.g. 「だ」 in
 * 「のだった」) does not stop a longer one from matching.
 */
function matchesExplanatoryEnding(text: string): boolean {
  return EXPLANATORY_ENDING_PREFIXES.some((prefix) => {
    if (!text.startsWith(prefix)) {
      return false;
    }

    const afterPrefix = text.slice(prefix.length);
    if (matchesSuffix(afterPrefix, EXPLANATORY_ENDING_THOUGHT_SUFFIXES)) {
      return true;
    }

    return EXPLANATORY_ENDING_CORES.some(
      (core) =>
        afterPrefix.startsWith(core) &&
        matchesSuffix(afterPrefix.slice(core.length), EXPLANATORY_ENDING_TRAILING_PARTICLES),
    );
  });
}
/**
 * Reports whether `text`, once normalized, is a polite copula ending or a negated
 * copula ending. Text that normalizes to the empty string is never a match.
 */
export function isStandaloneGrammarEndingText(text: string): boolean {
  const candidate = normalizeGrammarEndingText(text);
  return (
    candidate.length > 0 &&
    (matchesPoliteCopulaEnding(candidate) || matchesNegativeCopulaEnding(candidate))
  );
}
/**
 * Reports whether `text`, once normalized, is any grammar ending recognised for
 * subtitles: a standalone ending or an explanatory ending. Text that normalizes to
 * the empty string is never a match.
 */
export function isSubtitleGrammarEndingText(text: string): boolean {
  const candidate = normalizeGrammarEndingText(text);
  if (candidate === '') {
    return false;
  }
  if (isStandaloneGrammarEndingText(candidate)) {
    return true;
  }
  return matchesExplanatoryEnding(candidate);
}
@@ -219,6 +219,38 @@ test('splits trailing ja-nai grammar endings from preceding content', () => {
);
});
// 「ではないですか」 was not in the removed enumerated set (only the 「じゃない…」 forms
// were), so this checks that the pattern matcher keeps it as its own token.
test('splits trailing negative-copula grammar endings by pattern', () => {
  const scanningParserResult = makeParseItem('scanning-parser', [
    [
      { text: '問題', reading: 'もんだい', headword: '問題' },
      { text: 'ではないですか', reading: 'ではないですか', headword: 'ない' },
    ],
  ]);
  const expectedTokens = [
    { surface: '問題', reading: 'もんだい', headword: '問題' },
    { surface: 'ではないですか', reading: 'ではないですか', headword: 'ない' },
  ];

  const tokens = selectYomitanParseTokens([scanningParserResult], () => false, 'headword');
  const summarized = tokens?.map(({ surface, reading, headword }) => ({
    surface,
    reading,
    headword,
  }));

  assert.deepEqual(summarized, expectedTokens);
});
test('merges trailing katakana continuation without headword into previous token', () => {
const parseResults = [
makeParseItem('scanning-parser', [
@@ -1,4 +1,5 @@
import { MergedToken, NPlusOneMatchMode, PartOfSpeech } from '../../../types';
import { isStandaloneGrammarEndingText } from './grammar-ending';
interface YomitanParseHeadword {
term?: unknown;
@@ -24,24 +25,6 @@ export interface YomitanParseCandidate {
tokens: MergedToken[];
}
const STANDALONE_GRAMMAR_ENDINGS = new Set([
'です',
'ですか',
'ですね',
'ですよ',
'ですな',
'じゃない',
'じゃないか',
'じゃないね',
'じゃないよ',
'じゃないな',
'じゃないです',
'じゃないですか',
'じゃないですね',
'じゃないですよ',
'じゃないですな',
]);
/**
 * Type guard for non-null values whose `typeof` is `'object'`.
 * Arrays pass; `null`, primitives and functions do not.
 */
function isObject(value: unknown): value is Record<string, unknown> {
  return typeof value === 'object' && value !== null;
}
@@ -164,7 +147,7 @@ function isStandaloneGrammarEndingSegment(segment: YomitanParseSegment): boolean
const headword = extractYomitanHeadword(segment).trim();
return (
headword.length > 0 &&
(STANDALONE_GRAMMAR_ENDINGS.has(surface) || STANDALONE_GRAMMAR_ENDINGS.has(headword))
(isStandaloneGrammarEndingText(surface) || isStandaloneGrammarEndingText(headword))
);
}
@@ -8,6 +8,7 @@ import {
} from '../../../token-pos2-exclusions';
import { MergedToken, PartOfSpeech } from '../../../types';
import { shouldIgnoreJlptByTerm } from '../jlpt-token-filter';
import { isSubtitleGrammarEndingText } from './grammar-ending';
const KATAKANA_TO_HIRAGANA_OFFSET = 0x60;
const KATAKANA_CODEPOINT_START = 0x30a1;
@@ -58,61 +59,6 @@ export const SUBTITLE_ANNOTATION_EXCLUDED_TERMS = new Set([
'ものか',
...STANDALONE_GRAMMAR_PARTICLE_PHRASES,
]);
const SUBTITLE_ANNOTATION_EXCLUDED_EXPLANATORY_ENDING_PREFIXES = ['ん', 'の', 'なん', 'なの'];
const SUBTITLE_ANNOTATION_EXCLUDED_EXPLANATORY_ENDING_CORES = [
'だ',
'です',
'でした',
'だった',
'では',
'じゃ',
'でしょう',
'だろう',
] as const;
const SUBTITLE_ANNOTATION_EXCLUDED_EXPLANATORY_ENDING_TRAILING_PARTICLES = [
'',
'か',
'ね',
'よ',
'な',
'けど',
'よね',
'かな',
'かね',
] as const;
const SUBTITLE_ANNOTATION_EXCLUDED_EXPLANATORY_ENDING_THOUGHT_SUFFIXES = [
'か',
'かな',
'かね',
] as const;
const SUBTITLE_ANNOTATION_EXCLUDED_POLITE_COPULA_SUFFIXES = ['', 'か', 'ね', 'よ', 'な'] as const;
const SUBTITLE_ANNOTATION_EXCLUDED_JA_NAI_SUFFIXES = [
'',
'か',
'ね',
'よ',
'な',
'です',
'ですか',
'ですよ',
'ですね',
'ですな',
] as const;
const SUBTITLE_ANNOTATION_EXCLUDED_EXPLANATORY_ENDINGS = new Set(
SUBTITLE_ANNOTATION_EXCLUDED_EXPLANATORY_ENDING_PREFIXES.flatMap((prefix) =>
SUBTITLE_ANNOTATION_EXCLUDED_EXPLANATORY_ENDING_CORES.flatMap((core) =>
SUBTITLE_ANNOTATION_EXCLUDED_EXPLANATORY_ENDING_TRAILING_PARTICLES.map(
(particle) => `${prefix}${core}${particle}`,
),
),
),
);
const SUBTITLE_ANNOTATION_EXCLUDED_POLITE_COPULA_ENDINGS = new Set(
SUBTITLE_ANNOTATION_EXCLUDED_POLITE_COPULA_SUFFIXES.map((suffix) => `です${suffix}`),
);
const SUBTITLE_ANNOTATION_EXCLUDED_JA_NAI_ENDINGS = new Set(
SUBTITLE_ANNOTATION_EXCLUDED_JA_NAI_SUFFIXES.map((suffix) => `じゃない${suffix}`),
);
const SUBTITLE_ANNOTATION_EXCLUDED_TRAILING_PARTICLE_SUFFIXES = new Set([
'って',
'ってよ',
@@ -460,25 +406,11 @@ function isExcludedByTerm(token: MergedToken): boolean {
continue;
}
if (
SUBTITLE_ANNOTATION_EXCLUDED_EXPLANATORY_ENDING_PREFIXES.some((prefix) =>
SUBTITLE_ANNOTATION_EXCLUDED_EXPLANATORY_ENDING_THOUGHT_SUFFIXES.some(
(suffix) => normalized === `${prefix}${suffix}`,
),
)
) {
return true;
}
if (
SUBTITLE_ANNOTATION_EXCLUDED_TERMS.has(trimmed) ||
SUBTITLE_ANNOTATION_EXCLUDED_TERMS.has(normalized) ||
SUBTITLE_ANNOTATION_EXCLUDED_EXPLANATORY_ENDINGS.has(trimmed) ||
SUBTITLE_ANNOTATION_EXCLUDED_EXPLANATORY_ENDINGS.has(normalized) ||
SUBTITLE_ANNOTATION_EXCLUDED_POLITE_COPULA_ENDINGS.has(trimmed) ||
SUBTITLE_ANNOTATION_EXCLUDED_POLITE_COPULA_ENDINGS.has(normalized) ||
SUBTITLE_ANNOTATION_EXCLUDED_JA_NAI_ENDINGS.has(trimmed) ||
SUBTITLE_ANNOTATION_EXCLUDED_JA_NAI_ENDINGS.has(normalized) ||
isSubtitleGrammarEndingText(trimmed) ||
isSubtitleGrammarEndingText(normalized) ||
shouldIgnoreJlptByTerm(trimmed) ||
shouldIgnoreJlptByTerm(normalized)
) {