Replace grammar-ending permutations with shared matcher; preserve word audio

- Extract `grammar-ending.ts` with `isStandaloneGrammarEndingText` / `isSubtitleGrammarEndingText` pattern matchers
- Replace `STANDALONE_GRAMMAR_ENDINGS` set in parser-selection-stage with shared matcher
- Replace generated phrase sets in subtitle-annotation-filter with shared matcher
- Remove stale duplicate subtitle-exclusion constants and helpers from annotation-stage
- Manual clipboard card updates now write only to the sentence audio field, leaving word/expression audio untouched
2026-05-04 00:41:33 -07:00 · 2026-05-02 23:25:33 -07:00
parent f83005bf70
commit a9625f8777
15 changed files with 285 additions and 265 deletions
@@ -126,7 +126,7 @@ function createManualUpdateService(overrides: Partial<CardCreationDeps> = {}): {
  };
 }

-test('manual clipboard subtitle update replaces expression and sentence audio even when overwriteAudio is disabled', async () => {
+test('manual clipboard subtitle update replaces sentence audio without touching expression audio', async () => {
  const { service, updatedFields, mergeCalls, storedMedia } = createManualUpdateService();

  await service.updateLastAddedFromClipboard('字幕');
@@ -134,10 +134,10 @@ test('manual clipboard subtitle update replaces expression and sentence audio ev
  assert.equal(updatedFields.length, 1);
  assert.equal(storedMedia.length, 1);
  const audioValue = `[sound:${storedMedia[0]}]`;
-  assert.equal(updatedFields[0]?.ExpressionAudio, audioValue);
  assert.equal(updatedFields[0]?.SentenceAudio, audioValue);
+  assert.equal('ExpressionAudio' in updatedFields[0]!, false);
  assert.deepEqual(
    mergeCalls.map((call) => call.overwrite),
-    [true, true],
+    [true],
  );
 });
@@ -219,10 +219,6 @@ export class CardCreationService {
          this.deps.getConfig(),
        );
        const sentenceAudioField = this.getResolvedSentenceAudioFieldName(noteInfo);
-        const expressionAudioField = this.deps.resolveConfiguredFieldName(
-          noteInfo,
-          this.deps.getConfig().fields?.audio || 'ExpressionAudio',
-        );
        const sentenceField = this.deps.getEffectiveSentenceCardConfig().sentenceField;

        const sentence = blocks.join(' ');
@@ -252,22 +248,15 @@ export class CardCreationService {

            if (audioBuffer) {
              await this.deps.client.storeMediaFile(audioFilename, audioBuffer);
-              if (sentenceAudioField || expressionAudioField) {
+              if (sentenceAudioField) {
                const audioValue = `[sound:${audioFilename}]`;
-                const audioFields = new Set(
-                  [sentenceAudioField, expressionAudioField].filter(
-                    (fieldName): fieldName is string => Boolean(fieldName),
-                  ),
+                const existingAudio = noteInfo.fields[sentenceAudioField]?.value || '';
+                // Manual clipboard updates intentionally replace old captured sentence audio.
+                updatedFields[sentenceAudioField] = this.deps.mergeFieldValue(
+                  existingAudio,
+                  audioValue,
+                  true,
                );
-                for (const audioField of audioFields) {
-                  const existingAudio = noteInfo.fields[audioField]?.value || '';
-                  // Manual clipboard updates intentionally replace old captured audio.
-                  updatedFields[audioField] = this.deps.mergeFieldValue(
-                    existingAudio,
-                    audioValue,
-                    true,
-                  );
-                }
              }
              miscInfoFilename = audioFilename;
              updatePerformed = true;
@@ -301,6 +301,31 @@ test('shouldExcludeTokenFromSubtitleAnnotations excludes standalone polite copul
  }
 });

+test('shouldExcludeTokenFromSubtitleAnnotations excludes grammar-ending patterns without enumerating variants', () => {
+  const tokens = [
+    makeToken({
+      surface: 'ですわ',
+      headword: 'です',
+      reading: 'デスワ',
+      partOfSpeech: PartOfSpeech.other,
+      pos1: '',
+      pos2: '',
+    }),
+    makeToken({
+      surface: 'ではないですか',
+      headword: 'ない',
+      reading: 'デハナイデスカ',
+      partOfSpeech: PartOfSpeech.other,
+      pos1: '',
+      pos2: '',
+    }),
+  ];
+
+  for (const token of tokens) {
+    assert.equal(shouldExcludeTokenFromSubtitleAnnotations(token), true, token.surface);
+  }
+});
+
 test('shouldExcludeTokenFromSubtitleAnnotations excludes auxiliary-stem そうだ grammar tails', () => {
  const token = makeToken({
    surface: 'そうだ',
@@ -18,57 +18,6 @@ const KATAKANA_TO_HIRAGANA_OFFSET = 0x60;
 const KATAKANA_CODEPOINT_START = 0x30a1;
 const KATAKANA_CODEPOINT_END = 0x30f6;
 const JLPT_LEVEL_LOOKUP_CACHE_LIMIT = 2048;
-const SUBTITLE_ANNOTATION_EXCLUDED_TERMS = new Set([
-  'ああ',
-  'ええ',
-  'うう',
-  'おお',
-  'はあ',
-  'はは',
-  'へえ',
-  'ふう',
-  'ほう',
-]);
-const SUBTITLE_ANNOTATION_EXCLUDED_EXPLANATORY_ENDING_PREFIXES = ['ん', 'の', 'なん', 'なの'];
-const SUBTITLE_ANNOTATION_EXCLUDED_EXPLANATORY_ENDING_CORES = [
-  'だ',
-  'です',
-  'でした',
-  'だった',
-  'では',
-  'じゃ',
-  'でしょう',
-  'だろう',
-] as const;
-const SUBTITLE_ANNOTATION_EXCLUDED_EXPLANATORY_ENDING_TRAILING_PARTICLES = [
-  '',
-  'か',
-  'ね',
-  'よ',
-  'な',
-  'けど',
-  'よね',
-  'かな',
-  'かね',
-] as const;
-const SUBTITLE_ANNOTATION_EXCLUDED_EXPLANATORY_ENDINGS = new Set(
-  SUBTITLE_ANNOTATION_EXCLUDED_EXPLANATORY_ENDING_PREFIXES.flatMap((prefix) =>
-    SUBTITLE_ANNOTATION_EXCLUDED_EXPLANATORY_ENDING_CORES.flatMap((core) =>
-      SUBTITLE_ANNOTATION_EXCLUDED_EXPLANATORY_ENDING_TRAILING_PARTICLES.map(
-        (particle) => `${prefix}${core}${particle}`,
-      ),
-    ),
-  ),
-);
-const SUBTITLE_ANNOTATION_EXCLUDED_TRAILING_PARTICLE_SUFFIXES = new Set([
-  'って',
-  'ってよ',
-  'ってね',
-  'ってな',
-  'ってさ',
-  'ってか',
-  'ってば',
-]);

 const jlptLevelLookupCaches = new WeakMap<
  (text: string) => JlptLevel | null,
@@ -104,10 +53,6 @@ function normalizePos1Tag(pos1: string | undefined): string {
  return typeof pos1 === 'string' ? pos1.trim() : '';
 }

-const SUBTITLE_ANNOTATION_EXCLUDED_POS1 = new Set(['感動詞']);
-const SUBTITLE_ANNOTATION_GRAMMAR_ONLY_POS1 = new Set(['助詞', '助動詞', '連体詞']);
-const AUXILIARY_STEM_GRAMMAR_TAIL_POS1 = new Set(['名詞', '助動詞', '助詞']);
-
 function splitNormalizedTagParts(normalizedTag: string): string[] {
  if (!normalizedTag) {
    return [];
@@ -129,57 +74,6 @@ function isExcludedByTagSet(normalizedTag: string, exclusions: ReadonlySet<strin
  return parts.some((part) => exclusions.has(part));
 }

-function isExcludedFromSubtitleAnnotationsByPos1(normalizedPos1: string): boolean {
-  const parts = splitNormalizedTagParts(normalizedPos1);
-  if (parts.some((part) => SUBTITLE_ANNOTATION_EXCLUDED_POS1.has(part))) {
-    return true;
-  }
-
-  return parts.length > 0 && parts.every((part) => SUBTITLE_ANNOTATION_GRAMMAR_ONLY_POS1.has(part));
-}
-
-function isExcludedTrailingParticleMergedToken(token: MergedToken): boolean {
-  const normalizedSurface = normalizeJlptTextForExclusion(token.surface);
-  const normalizedHeadword = normalizeJlptTextForExclusion(token.headword);
-  if (
-    !normalizedSurface ||
-    !normalizedHeadword ||
-    !normalizedSurface.startsWith(normalizedHeadword)
-  ) {
-    return false;
-  }
-
-  const suffix = normalizedSurface.slice(normalizedHeadword.length);
-  if (!SUBTITLE_ANNOTATION_EXCLUDED_TRAILING_PARTICLE_SUFFIXES.has(suffix)) {
-    return false;
-  }
-
-  const pos1Parts = splitNormalizedTagParts(normalizePos1Tag(token.pos1));
-  if (pos1Parts.length < 2) {
-    return false;
-  }
-
-  const [leadingPos1, ...trailingPos1] = pos1Parts;
-  if (!leadingPos1 || SUBTITLE_ANNOTATION_GRAMMAR_ONLY_POS1.has(leadingPos1)) {
-    return false;
-  }
-
-  return trailingPos1.length > 0 && trailingPos1.every((part) => part === '助詞');
-}
-
-function isAuxiliaryStemGrammarTailToken(token: MergedToken): boolean {
-  const pos1Parts = splitNormalizedTagParts(normalizePos1Tag(token.pos1));
-  if (
-    pos1Parts.length === 0 ||
-    !pos1Parts.every((part) => AUXILIARY_STEM_GRAMMAR_TAIL_POS1.has(part))
-  ) {
-    return false;
-  }
-
-  const pos3Parts = splitNormalizedTagParts(normalizePos2Tag(token.pos3));
-  return pos3Parts.includes('助動詞語幹');
-}
-
 function resolvePos1Exclusions(options: AnnotationStageOptions): ReadonlySet<string> {
  if (options.pos1Exclusions) {
    return options.pos1Exclusions;
@@ -609,44 +503,6 @@ function isJlptEligibleToken(token: MergedToken): boolean {
  return true;
 }

-function isExcludedFromSubtitleAnnotationsByTerm(token: MergedToken): boolean {
-  const candidates = [token.surface, token.reading, resolveJlptLookupText(token)].filter(
-    (candidate): candidate is string => typeof candidate === 'string' && candidate.length > 0,
-  );
-
-  for (const candidate of candidates) {
-    const trimmedCandidate = candidate.trim();
-    if (!trimmedCandidate) {
-      continue;
-    }
-
-    const normalizedCandidate = normalizeJlptTextForExclusion(trimmedCandidate);
-    if (!normalizedCandidate) {
-      continue;
-    }
-
-    if (
-      SUBTITLE_ANNOTATION_EXCLUDED_TERMS.has(trimmedCandidate) ||
-      SUBTITLE_ANNOTATION_EXCLUDED_TERMS.has(normalizedCandidate) ||
-      SUBTITLE_ANNOTATION_EXCLUDED_EXPLANATORY_ENDINGS.has(trimmedCandidate) ||
-      SUBTITLE_ANNOTATION_EXCLUDED_EXPLANATORY_ENDINGS.has(normalizedCandidate)
-    ) {
-      return true;
-    }
-
-    if (
-      isTrailingSmallTsuKanaSfx(trimmedCandidate) ||
-      isTrailingSmallTsuKanaSfx(normalizedCandidate) ||
-      isReduplicatedKanaSfxWithOptionalTrailingTo(trimmedCandidate) ||
-      isReduplicatedKanaSfxWithOptionalTrailingTo(normalizedCandidate)
-    ) {
-      return true;
-    }
-  }
-
-  return false;
-}
-
 export function shouldExcludeTokenFromSubtitleAnnotations(token: MergedToken): boolean {
  return sharedShouldExcludeTokenFromSubtitleAnnotations(token);
 }
@@ -771,9 +627,7 @@ export function annotateTokens(
      });
      return {
        ...strippedToken,
-        isKnown:
-          nPlusOneEnabled &&
-          computeExcludedTokenKnownStatus(token, deps.isKnownWord),
+        isKnown: nPlusOneEnabled && computeExcludedTokenKnownStatus(token, deps.isKnownWord),
      };
    }

@@ -0,0 +1,124 @@
+const KATAKANA_TO_HIRAGANA_OFFSET = 0x60;
+const KATAKANA_CODEPOINT_START = 0x30a1;
+const KATAKANA_CODEPOINT_END = 0x30f6;
+
+const SENTENCE_FINAL_PARTICLE_SUFFIXES = ['', 'か', 'ね', 'よ', 'な', 'わ'] as const;
+const EXPLANATORY_ENDING_PREFIXES = ['ん', 'の', 'なん', 'なの'] as const;
+const EXPLANATORY_ENDING_CORES = [
+  'だ',
+  'です',
+  'でした',
+  'だった',
+  'では',
+  'じゃ',
+  'でしょう',
+  'だろう',
+] as const;
+const EXPLANATORY_ENDING_TRAILING_PARTICLES = [
+  '',
+  'か',
+  'ね',
+  'よ',
+  'な',
+  'けど',
+  'よね',
+  'かな',
+  'かね',
+] as const;
+const EXPLANATORY_ENDING_THOUGHT_SUFFIXES = ['か', 'かな', 'かね'] as const;
+const NEGATIVE_COPULA_PREFIXES = ['じゃ', 'では'] as const;
+
+export function normalizeGrammarEndingText(text: string): string {
+  const raw = text.trim();
+  if (!raw) {
+    return '';
+  }
+
+  let normalized = '';
+  for (const char of raw) {
+    const code = char.codePointAt(0);
+    if (code === undefined) {
+      continue;
+    }
+
+    if (code >= KATAKANA_CODEPOINT_START && code <= KATAKANA_CODEPOINT_END) {
+      normalized += String.fromCodePoint(code - KATAKANA_TO_HIRAGANA_OFFSET);
+      continue;
+    }
+
+    normalized += char;
+  }
+
+  return normalized;
+}
+
+function matchesSuffix(text: string, suffixes: readonly string[]): boolean {
+  return suffixes.some((suffix) => text === suffix);
+}
+
+function matchesPoliteCopulaEnding(text: string): boolean {
+  if (!text.startsWith('です')) {
+    return false;
+  }
+
+  return matchesSuffix(text.slice('です'.length), SENTENCE_FINAL_PARTICLE_SUFFIXES);
+}
+
+function matchesNegativeCopulaEnding(text: string): boolean {
+  for (const prefix of NEGATIVE_COPULA_PREFIXES) {
+    const negativeStem = `${prefix}ない`;
+    if (!text.startsWith(negativeStem)) {
+      continue;
+    }
+
+    const suffix = text.slice(negativeStem.length);
+    return (
+      matchesSuffix(suffix, SENTENCE_FINAL_PARTICLE_SUFFIXES) || matchesPoliteCopulaEnding(suffix)
+    );
+  }
+
+  return false;
+}
+
+function matchesExplanatoryEnding(text: string): boolean {
+  for (const prefix of EXPLANATORY_ENDING_PREFIXES) {
+    if (EXPLANATORY_ENDING_THOUGHT_SUFFIXES.some((suffix) => text === `${prefix}${suffix}`)) {
+      return true;
+    }
+
+    if (!text.startsWith(prefix)) {
+      continue;
+    }
+
+    const suffix = text.slice(prefix.length);
+    for (const core of EXPLANATORY_ENDING_CORES) {
+      if (!suffix.startsWith(core)) {
+        continue;
+      }
+
+      if (matchesSuffix(suffix.slice(core.length), EXPLANATORY_ENDING_TRAILING_PARTICLES)) {
+        return true;
+      }
+    }
+  }
+
+  return false;
+}
+
+export function isStandaloneGrammarEndingText(text: string): boolean {
+  const normalized = normalizeGrammarEndingText(text);
+  if (!normalized) {
+    return false;
+  }
+
+  return matchesPoliteCopulaEnding(normalized) || matchesNegativeCopulaEnding(normalized);
+}
+
+export function isSubtitleGrammarEndingText(text: string): boolean {
+  const normalized = normalizeGrammarEndingText(text);
+  if (!normalized) {
+    return false;
+  }
+
+  return isStandaloneGrammarEndingText(normalized) || matchesExplanatoryEnding(normalized);
+}
@@ -219,6 +219,38 @@ test('splits trailing ja-nai grammar endings from preceding content', () => {
  );
 });

+test('splits trailing negative-copula grammar endings by pattern', () => {
+  const parseResults = [
+    makeParseItem('scanning-parser', [
+      [
+        { text: '問題', reading: 'もんだい', headword: '問題' },
+        { text: 'ではないですか', reading: 'ではないですか', headword: 'ない' },
+      ],
+    ]),
+  ];
+
+  const tokens = selectYomitanParseTokens(parseResults, () => false, 'headword');
+  assert.deepEqual(
+    tokens?.map((token) => ({
+      surface: token.surface,
+      reading: token.reading,
+      headword: token.headword,
+    })),
+    [
+      {
+        surface: '問題',
+        reading: 'もんだい',
+        headword: '問題',
+      },
+      {
+        surface: 'ではないですか',
+        reading: 'ではないですか',
+        headword: 'ない',
+      },
+    ],
+  );
+});
+
 test('merges trailing katakana continuation without headword into previous token', () => {
  const parseResults = [
    makeParseItem('scanning-parser', [
@@ -1,4 +1,5 @@
 import { MergedToken, NPlusOneMatchMode, PartOfSpeech } from '../../../types';
+import { isStandaloneGrammarEndingText } from './grammar-ending';

 interface YomitanParseHeadword {
  term?: unknown;
@@ -24,24 +25,6 @@ export interface YomitanParseCandidate {
  tokens: MergedToken[];
 }

-const STANDALONE_GRAMMAR_ENDINGS = new Set([
-  'です',
-  'ですか',
-  'ですね',
-  'ですよ',
-  'ですな',
-  'じゃない',
-  'じゃないか',
-  'じゃないね',
-  'じゃないよ',
-  'じゃないな',
-  'じゃないです',
-  'じゃないですか',
-  'じゃないですね',
-  'じゃないですよ',
-  'じゃないですな',
-]);
-
 function isObject(value: unknown): value is Record<string, unknown> {
  return Boolean(value && typeof value === 'object');
 }
@@ -164,7 +147,7 @@ function isStandaloneGrammarEndingSegment(segment: YomitanParseSegment): boolean
  const headword = extractYomitanHeadword(segment).trim();
  return (
    headword.length > 0 &&
-    (STANDALONE_GRAMMAR_ENDINGS.has(surface) || STANDALONE_GRAMMAR_ENDINGS.has(headword))
+    (isStandaloneGrammarEndingText(surface) || isStandaloneGrammarEndingText(headword))
  );
 }

@@ -8,6 +8,7 @@ import {
 } from '../../../token-pos2-exclusions';
 import { MergedToken, PartOfSpeech } from '../../../types';
 import { shouldIgnoreJlptByTerm } from '../jlpt-token-filter';
+import { isSubtitleGrammarEndingText } from './grammar-ending';

 const KATAKANA_TO_HIRAGANA_OFFSET = 0x60;
 const KATAKANA_CODEPOINT_START = 0x30a1;
@@ -58,61 +59,6 @@ export const SUBTITLE_ANNOTATION_EXCLUDED_TERMS = new Set([
  'ものか',
  ...STANDALONE_GRAMMAR_PARTICLE_PHRASES,
 ]);
-const SUBTITLE_ANNOTATION_EXCLUDED_EXPLANATORY_ENDING_PREFIXES = ['ん', 'の', 'なん', 'なの'];
-const SUBTITLE_ANNOTATION_EXCLUDED_EXPLANATORY_ENDING_CORES = [
-  'だ',
-  'です',
-  'でした',
-  'だった',
-  'では',
-  'じゃ',
-  'でしょう',
-  'だろう',
-] as const;
-const SUBTITLE_ANNOTATION_EXCLUDED_EXPLANATORY_ENDING_TRAILING_PARTICLES = [
-  '',
-  'か',
-  'ね',
-  'よ',
-  'な',
-  'けど',
-  'よね',
-  'かな',
-  'かね',
-] as const;
-const SUBTITLE_ANNOTATION_EXCLUDED_EXPLANATORY_ENDING_THOUGHT_SUFFIXES = [
-  'か',
-  'かな',
-  'かね',
-] as const;
-const SUBTITLE_ANNOTATION_EXCLUDED_POLITE_COPULA_SUFFIXES = ['', 'か', 'ね', 'よ', 'な'] as const;
-const SUBTITLE_ANNOTATION_EXCLUDED_JA_NAI_SUFFIXES = [
-  '',
-  'か',
-  'ね',
-  'よ',
-  'な',
-  'です',
-  'ですか',
-  'ですよ',
-  'ですね',
-  'ですな',
-] as const;
-const SUBTITLE_ANNOTATION_EXCLUDED_EXPLANATORY_ENDINGS = new Set(
-  SUBTITLE_ANNOTATION_EXCLUDED_EXPLANATORY_ENDING_PREFIXES.flatMap((prefix) =>
-    SUBTITLE_ANNOTATION_EXCLUDED_EXPLANATORY_ENDING_CORES.flatMap((core) =>
-      SUBTITLE_ANNOTATION_EXCLUDED_EXPLANATORY_ENDING_TRAILING_PARTICLES.map(
-        (particle) => `${prefix}${core}${particle}`,
-      ),
-    ),
-  ),
-);
-const SUBTITLE_ANNOTATION_EXCLUDED_POLITE_COPULA_ENDINGS = new Set(
-  SUBTITLE_ANNOTATION_EXCLUDED_POLITE_COPULA_SUFFIXES.map((suffix) => `です${suffix}`),
-);
-const SUBTITLE_ANNOTATION_EXCLUDED_JA_NAI_ENDINGS = new Set(
-  SUBTITLE_ANNOTATION_EXCLUDED_JA_NAI_SUFFIXES.map((suffix) => `じゃない${suffix}`),
-);
 const SUBTITLE_ANNOTATION_EXCLUDED_TRAILING_PARTICLE_SUFFIXES = new Set([
  'って',
  'ってよ',
@@ -460,25 +406,11 @@ function isExcludedByTerm(token: MergedToken): boolean {
      continue;
    }

-    if (
-      SUBTITLE_ANNOTATION_EXCLUDED_EXPLANATORY_ENDING_PREFIXES.some((prefix) =>
-        SUBTITLE_ANNOTATION_EXCLUDED_EXPLANATORY_ENDING_THOUGHT_SUFFIXES.some(
-          (suffix) => normalized === `${prefix}${suffix}`,
-        ),
-      )
-    ) {
-      return true;
-    }
-
    if (
      SUBTITLE_ANNOTATION_EXCLUDED_TERMS.has(trimmed) ||
      SUBTITLE_ANNOTATION_EXCLUDED_TERMS.has(normalized) ||
-      SUBTITLE_ANNOTATION_EXCLUDED_EXPLANATORY_ENDINGS.has(trimmed) ||
-      SUBTITLE_ANNOTATION_EXCLUDED_EXPLANATORY_ENDINGS.has(normalized) ||
-      SUBTITLE_ANNOTATION_EXCLUDED_POLITE_COPULA_ENDINGS.has(trimmed) ||
-      SUBTITLE_ANNOTATION_EXCLUDED_POLITE_COPULA_ENDINGS.has(normalized) ||
-      SUBTITLE_ANNOTATION_EXCLUDED_JA_NAI_ENDINGS.has(trimmed) ||
-      SUBTITLE_ANNOTATION_EXCLUDED_JA_NAI_ENDINGS.has(normalized) ||
+      isSubtitleGrammarEndingText(trimmed) ||
+      isSubtitleGrammarEndingText(normalized) ||
      shouldIgnoreJlptByTerm(trimmed) ||
      shouldIgnoreJlptByTerm(normalized)
    ) {