mirror of
https://github.com/ksyasuda/SubMiner.git
synced 2026-03-20 12:11:28 -07:00
feat: sync animated anki images to sentence audio
This commit is contained in:
@@ -340,6 +340,7 @@
|
||||
"animatedFps": 10, // Animated fps setting.
|
||||
"animatedMaxWidth": 640, // Animated max width setting.
|
||||
"animatedCrf": 35, // Animated crf setting.
|
||||
"syncAnimatedImageToWordAudio": true, // For animated AVIF images, prepend a frozen first frame matching the existing word-audio duration so motion starts with sentence audio. Values: true | false
|
||||
"audioPadding": 0.5, // Audio padding setting.
|
||||
"fallbackDuration": 3, // Fallback duration setting.
|
||||
"maxMediaDuration": 30 // Max media duration setting.
|
||||
|
||||
@@ -340,6 +340,7 @@
|
||||
"animatedFps": 10, // Animated fps setting.
|
||||
"animatedMaxWidth": 640, // Animated max width setting.
|
||||
"animatedCrf": 35, // Animated crf setting.
|
||||
"syncAnimatedImageToWordAudio": true, // For animated AVIF images, prepend a frozen first frame matching the existing word-audio duration so motion starts with sentence audio. Values: true | false
|
||||
"audioPadding": 0.5, // Audio padding setting.
|
||||
"fallbackDuration": 3, // Fallback duration setting.
|
||||
"maxMediaDuration": 30 // Max media duration setting.
|
||||
|
||||
@@ -54,6 +54,7 @@ import { FieldGroupingService } from './anki-integration/field-grouping';
|
||||
import { FieldGroupingMergeCollaborator } from './anki-integration/field-grouping-merge';
|
||||
import { NoteUpdateWorkflow } from './anki-integration/note-update-workflow';
|
||||
import { FieldGroupingWorkflow } from './anki-integration/field-grouping-workflow';
|
||||
import { resolveAnimatedImageLeadInSeconds } from './anki-integration/animated-image-sync';
|
||||
import { AnkiIntegrationRuntime, normalizeAnkiIntegrationConfig } from './anki-integration/runtime';
|
||||
|
||||
const log = createLogger('anki').child('integration');
|
||||
@@ -190,7 +191,7 @@ export class AnkiIntegration {
|
||||
this.resolveNoteFieldName(noteInfo, preferredName),
|
||||
extractFields: (fields) => this.extractFields(fields),
|
||||
processSentence: (mpvSentence, noteFields) => this.processSentence(mpvSentence, noteFields),
|
||||
generateMediaForMerge: () => this.generateMediaForMerge(),
|
||||
generateMediaForMerge: (noteInfo) => this.generateMediaForMerge(noteInfo),
|
||||
warnFieldParseOnce: (fieldName, reason, detail) =>
|
||||
this.warnFieldParseOnce(fieldName, reason, detail),
|
||||
});
|
||||
@@ -286,6 +287,7 @@ export class AnkiIntegration {
|
||||
storeMediaFile: (filename, data) => this.client.storeMediaFile(filename, data),
|
||||
findNotes: async (query, options) =>
|
||||
(await this.client.findNotes(query, options)) as number[],
|
||||
retrieveMediaFile: (filename) => this.client.retrieveMediaFile(filename),
|
||||
},
|
||||
mediaGenerator: {
|
||||
generateAudio: (videoPath, startTime, endTime, audioPadding, audioStreamIndex) =>
|
||||
@@ -319,6 +321,7 @@ export class AnkiIntegration {
|
||||
this.resolveConfiguredFieldName(noteInfo, ...preferredNames),
|
||||
resolveNoteFieldName: (noteInfo, preferredName) =>
|
||||
this.resolveNoteFieldName(noteInfo, preferredName),
|
||||
getAnimatedImageLeadInSeconds: (noteInfo) => this.getAnimatedImageLeadInSeconds(noteInfo),
|
||||
extractFields: (fields) => this.extractFields(fields),
|
||||
processSentence: (mpvSentence, noteFields) => this.processSentence(mpvSentence, noteFields),
|
||||
setCardTypeFields: (updatedFields, availableFieldNames, cardKind) =>
|
||||
@@ -407,12 +410,13 @@ export class AnkiIntegration {
|
||||
this.resolveConfiguredFieldName(noteInfo, ...preferredNames),
|
||||
getResolvedSentenceAudioFieldName: (noteInfo) =>
|
||||
this.getResolvedSentenceAudioFieldName(noteInfo),
|
||||
getAnimatedImageLeadInSeconds: (noteInfo) => this.getAnimatedImageLeadInSeconds(noteInfo),
|
||||
mergeFieldValue: (existing, newValue, overwrite) =>
|
||||
this.mergeFieldValue(existing, newValue, overwrite),
|
||||
generateAudioFilename: () => this.generateAudioFilename(),
|
||||
generateAudio: () => this.generateAudio(),
|
||||
generateImageFilename: () => this.generateImageFilename(),
|
||||
generateImage: () => this.generateImage(),
|
||||
generateImage: (animatedLeadInSeconds) => this.generateImage(animatedLeadInSeconds),
|
||||
formatMiscInfoPattern: (fallbackFilename, startTimeSeconds) =>
|
||||
this.formatMiscInfoPattern(fallbackFilename, startTimeSeconds),
|
||||
addConfiguredTagsToNote: (noteId) => this.addConfiguredTagsToNote(noteId),
|
||||
@@ -637,7 +641,7 @@ export class AnkiIntegration {
|
||||
);
|
||||
}
|
||||
|
||||
private async generateImage(): Promise<Buffer | null> {
|
||||
private async generateImage(animatedLeadInSeconds = 0): Promise<Buffer | null> {
|
||||
if (!this.mpvClient || !this.mpvClient.currentVideoPath) {
|
||||
return null;
|
||||
}
|
||||
@@ -665,6 +669,7 @@ export class AnkiIntegration {
|
||||
maxWidth: this.config.media?.animatedMaxWidth,
|
||||
maxHeight: this.config.media?.animatedMaxHeight,
|
||||
crf: this.config.media?.animatedCrf,
|
||||
leadingStillDuration: animatedLeadInSeconds,
|
||||
},
|
||||
);
|
||||
} else {
|
||||
@@ -1020,7 +1025,18 @@ export class AnkiIntegration {
|
||||
return getPreferredWordValueFromExtractedFields(fields, this.config);
|
||||
}
|
||||
|
||||
private async generateMediaForMerge(): Promise<{
|
||||
private async getAnimatedImageLeadInSeconds(noteInfo: NoteInfo): Promise<number> {
|
||||
return resolveAnimatedImageLeadInSeconds({
|
||||
config: this.config,
|
||||
noteInfo,
|
||||
resolveConfiguredFieldName: (candidateNoteInfo, ...preferredNames) =>
|
||||
this.resolveConfiguredFieldName(candidateNoteInfo, ...preferredNames),
|
||||
retrieveMediaFileBase64: (filename) => this.client.retrieveMediaFile(filename),
|
||||
logWarn: (message, ...args) => log.warn(message, ...args),
|
||||
});
|
||||
}
|
||||
|
||||
private async generateMediaForMerge(noteInfo?: NoteInfo): Promise<{
|
||||
audioField?: string;
|
||||
audioValue?: string;
|
||||
imageField?: string;
|
||||
@@ -1057,8 +1073,11 @@ export class AnkiIntegration {
|
||||
|
||||
if (this.config.media?.generateImage && this.mpvClient?.currentVideoPath) {
|
||||
try {
|
||||
const animatedLeadInSeconds = noteInfo
|
||||
? await this.getAnimatedImageLeadInSeconds(noteInfo)
|
||||
: 0;
|
||||
const imageFilename = this.generateImageFilename();
|
||||
const imageBuffer = await this.generateImage();
|
||||
const imageBuffer = await this.generateImage(animatedLeadInSeconds);
|
||||
if (imageBuffer) {
|
||||
await this.client.storeMediaFile(imageFilename, imageBuffer);
|
||||
result.imageField = this.config.fields?.image || DEFAULT_ANKI_CONNECT_CONFIG.fields.image;
|
||||
|
||||
82
src/anki-integration/animated-image-sync.test.ts
Normal file
82
src/anki-integration/animated-image-sync.test.ts
Normal file
@@ -0,0 +1,82 @@
|
||||
import assert from 'node:assert/strict';
|
||||
import test from 'node:test';
|
||||
|
||||
import { resolveAnimatedImageLeadInSeconds, extractSoundFilenames } from './animated-image-sync';
|
||||
|
||||
// Pure parsing: [sound:...] tags are extracted in document order.
test('extractSoundFilenames returns ordered sound filenames from an Anki field value', () => {
  assert.deepEqual(
    extractSoundFilenames('before [sound:word.mp3] middle [sound:alt.ogg] after'),
    ['word.mp3', 'alt.ogg'],
  );
});

// Happy path: avif + sync enabled; both clips are retrieved and probed, and
// the lead-in is their summed duration (0.41 + 0.84 = 1.25).
test('resolveAnimatedImageLeadInSeconds sums configured word audio durations for animated images', async () => {
  const leadInSeconds = await resolveAnimatedImageLeadInSeconds({
    config: {
      fields: {
        audio: 'ExpressionAudio',
      },
      media: {
        imageType: 'avif',
        syncAnimatedImageToWordAudio: true,
      },
    },
    noteInfo: {
      noteId: 42,
      fields: {
        ExpressionAudio: {
          value: '[sound:word.mp3][sound:alt.ogg]',
        },
      },
    },
    // Case-insensitive field-name resolution, mirroring the integration's helper.
    resolveConfiguredFieldName: (noteInfo, ...preferredNames) => {
      for (const preferredName of preferredNames) {
        if (!preferredName) continue;
        const resolved = Object.keys(noteInfo.fields).find(
          (fieldName) => fieldName.toLowerCase() === preferredName.toLowerCase(),
        );
        if (resolved) return resolved;
      }
      return null;
    },
    // Base64 payload content is irrelevant here; durations come from the probe stub.
    retrieveMediaFileBase64: async (filename) =>
      filename === 'word.mp3' ? 'd29yZA==' : filename === 'alt.ogg' ? 'YWx0' : '',
    probeAudioDurationSeconds: async (_buffer, filename) =>
      filename === 'word.mp3' ? 0.41 : filename === 'alt.ogg' ? 0.84 : null,
    logWarn: () => undefined,
  });

  assert.equal(leadInSeconds, 1.25);
});

// Disabled flag short-circuits before any media retrieval or probing happens;
// the throwing stubs guard against accidental I/O.
test('resolveAnimatedImageLeadInSeconds falls back to zero when sync is disabled', async () => {
  const leadInSeconds = await resolveAnimatedImageLeadInSeconds({
    config: {
      fields: {
        audio: 'ExpressionAudio',
      },
      media: {
        imageType: 'avif',
        syncAnimatedImageToWordAudio: false,
      },
    },
    noteInfo: {
      noteId: 42,
      fields: {
        ExpressionAudio: {
          value: '[sound:word.mp3]',
        },
      },
    },
    resolveConfiguredFieldName: () => 'ExpressionAudio',
    retrieveMediaFileBase64: async () => {
      throw new Error('should not be called');
    },
    probeAudioDurationSeconds: async () => {
      throw new Error('should not be called');
    },
    logWarn: () => undefined,
  });

  assert.equal(leadInSeconds, 0);
});
|
||||
133
src/anki-integration/animated-image-sync.ts
Normal file
133
src/anki-integration/animated-image-sync.ts
Normal file
@@ -0,0 +1,133 @@
|
||||
import { execFile as nodeExecFile } from 'node:child_process';
|
||||
import * as fs from 'node:fs';
|
||||
import * as os from 'node:os';
|
||||
import * as path from 'node:path';
|
||||
|
||||
import { DEFAULT_ANKI_CONNECT_CONFIG } from '../config';
|
||||
import type { AnkiConnectConfig } from '../types';
|
||||
|
||||
/** Minimal structural shape of an Anki note as returned by AnkiConnect notesInfo. */
type NoteInfoLike = {
  noteId: number;
  fields: Record<string, { value: string }>;
};

/** Dependencies for resolveAnimatedImageLeadInSeconds, injected so callers/tests can stub I/O. */
interface ResolveAnimatedImageLeadInSecondsArgs<TNoteInfo extends NoteInfoLike> {
  // Only the field mapping and media settings are consulted.
  config: Pick<AnkiConnectConfig, 'fields' | 'media'>;
  noteInfo: TNoteInfo;
  // Resolves a configured field name against the note's actual field names;
  // returns null when no candidate matches.
  resolveConfiguredFieldName: (
    noteInfo: TNoteInfo,
    ...preferredNames: (string | undefined)[]
  ) => string | null;
  // Fetches a stored media file as base64 (an empty string is treated as failure).
  retrieveMediaFileBase64: (filename: string) => Promise<string>;
  // Optional override for duration probing; defaults to the ffprobe-based helper below.
  probeAudioDurationSeconds?: (buffer: Buffer, filename: string) => Promise<number | null>;
  logWarn?: (message: string, ...args: unknown[]) => void;
}

/** Injectable fs/child_process hooks for probeAudioDurationSeconds (used by tests). */
interface ProbeAudioDurationDeps {
  execFile?: typeof nodeExecFile;
  mkdtempSync?: typeof fs.mkdtempSync;
  writeFileSync?: typeof fs.writeFileSync;
  rmSync?: typeof fs.rmSync;
}
|
||||
|
||||
export function extractSoundFilenames(value: string): string[] {
|
||||
const matches = value.matchAll(/\[sound:([^\]]+)\]/gi);
|
||||
return Array.from(matches, (match) => match[1]?.trim() || '').filter((value) => value.length > 0);
|
||||
}
|
||||
|
||||
function shouldSyncAnimatedImageToWordAudio(config: Pick<AnkiConnectConfig, 'media'>): boolean {
|
||||
return (
|
||||
config.media?.imageType === 'avif' && config.media?.syncAnimatedImageToWordAudio !== false
|
||||
);
|
||||
}
|
||||
|
||||
export async function probeAudioDurationSeconds(
|
||||
buffer: Buffer,
|
||||
filename: string,
|
||||
deps: ProbeAudioDurationDeps = {},
|
||||
): Promise<number | null> {
|
||||
const execFile = deps.execFile ?? nodeExecFile;
|
||||
const mkdtempSync = deps.mkdtempSync ?? fs.mkdtempSync;
|
||||
const writeFileSync = deps.writeFileSync ?? fs.writeFileSync;
|
||||
const rmSync = deps.rmSync ?? fs.rmSync;
|
||||
|
||||
const tempDir = mkdtempSync(path.join(os.tmpdir(), 'subminer-audio-probe-'));
|
||||
const ext = path.extname(filename) || '.bin';
|
||||
const tempPath = path.join(tempDir, `probe${ext}`);
|
||||
writeFileSync(tempPath, buffer);
|
||||
|
||||
return new Promise((resolve) => {
|
||||
execFile(
|
||||
'ffprobe',
|
||||
[
|
||||
'-v',
|
||||
'error',
|
||||
'-show_entries',
|
||||
'format=duration',
|
||||
'-of',
|
||||
'default=noprint_wrappers=1:nokey=1',
|
||||
tempPath,
|
||||
],
|
||||
(error, stdout) => {
|
||||
try {
|
||||
if (error) {
|
||||
resolve(null);
|
||||
return;
|
||||
}
|
||||
|
||||
const durationSeconds = Number.parseFloat((stdout || '').trim());
|
||||
resolve(Number.isFinite(durationSeconds) && durationSeconds > 0 ? durationSeconds : null);
|
||||
} finally {
|
||||
rmSync(tempDir, { recursive: true, force: true });
|
||||
}
|
||||
},
|
||||
);
|
||||
});
|
||||
}
|
||||
|
||||
export async function resolveAnimatedImageLeadInSeconds<TNoteInfo extends NoteInfoLike>({
|
||||
config,
|
||||
noteInfo,
|
||||
resolveConfiguredFieldName,
|
||||
retrieveMediaFileBase64,
|
||||
probeAudioDurationSeconds: probeDuration = probeAudioDurationSeconds,
|
||||
logWarn,
|
||||
}: ResolveAnimatedImageLeadInSecondsArgs<TNoteInfo>): Promise<number> {
|
||||
if (!shouldSyncAnimatedImageToWordAudio(config)) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
const wordAudioFieldName = resolveConfiguredFieldName(
|
||||
noteInfo,
|
||||
config.fields?.audio,
|
||||
DEFAULT_ANKI_CONNECT_CONFIG.fields.audio,
|
||||
);
|
||||
if (!wordAudioFieldName) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
const wordAudioValue = noteInfo.fields[wordAudioFieldName]?.value || '';
|
||||
const filenames = extractSoundFilenames(wordAudioValue);
|
||||
if (filenames.length === 0) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
let totalLeadInSeconds = 0;
|
||||
for (const filename of filenames) {
|
||||
const encoded = await retrieveMediaFileBase64(filename);
|
||||
if (!encoded) {
|
||||
logWarn?.('Animated image sync skipped: failed to retrieve word audio', filename);
|
||||
return 0;
|
||||
}
|
||||
|
||||
const durationSeconds = await probeDuration(Buffer.from(encoded, 'base64'), filename);
|
||||
if (!(typeof durationSeconds === 'number' && Number.isFinite(durationSeconds))) {
|
||||
logWarn?.('Animated image sync skipped: failed to probe word audio duration', filename);
|
||||
return 0;
|
||||
}
|
||||
|
||||
totalLeadInSeconds += durationSeconds;
|
||||
}
|
||||
|
||||
return totalLeadInSeconds;
|
||||
}
|
||||
@@ -30,6 +30,7 @@ interface CardCreationClient {
|
||||
updateNoteFields(noteId: number, fields: Record<string, string>): Promise<void>;
|
||||
storeMediaFile(filename: string, data: Buffer): Promise<void>;
|
||||
findNotes(query: string, options?: { maxRetries?: number }): Promise<number[]>;
|
||||
retrieveMediaFile(filename: string): Promise<string>;
|
||||
}
|
||||
|
||||
interface CardCreationMediaGenerator {
|
||||
@@ -60,6 +61,7 @@ interface CardCreationMediaGenerator {
|
||||
maxWidth?: number;
|
||||
maxHeight?: number;
|
||||
crf?: number;
|
||||
leadingStillDuration?: number;
|
||||
},
|
||||
): Promise<Buffer | null>;
|
||||
}
|
||||
@@ -83,6 +85,7 @@ interface CardCreationDeps {
|
||||
...preferredNames: (string | undefined)[]
|
||||
) => string | null;
|
||||
resolveNoteFieldName: (noteInfo: CardCreationNoteInfo, preferredName?: string) => string | null;
|
||||
getAnimatedImageLeadInSeconds: (noteInfo: CardCreationNoteInfo) => Promise<number>;
|
||||
extractFields: (fields: Record<string, { value: string }>) => Record<string, string>;
|
||||
processSentence: (mpvSentence: string, noteFields: Record<string, string>) => string;
|
||||
setCardTypeFields: (
|
||||
@@ -258,11 +261,14 @@ export class CardCreationService {
|
||||
|
||||
if (this.deps.getConfig().media?.generateImage) {
|
||||
try {
|
||||
const animatedLeadInSeconds =
|
||||
await this.deps.getAnimatedImageLeadInSeconds(noteInfo);
|
||||
const imageFilename = this.generateImageFilename();
|
||||
const imageBuffer = await this.generateImageBuffer(
|
||||
mpvClient.currentVideoPath,
|
||||
rangeStart,
|
||||
rangeEnd,
|
||||
animatedLeadInSeconds,
|
||||
);
|
||||
|
||||
if (imageBuffer) {
|
||||
@@ -414,11 +420,14 @@ export class CardCreationService {
|
||||
|
||||
if (this.deps.getConfig().media?.generateImage) {
|
||||
try {
|
||||
const animatedLeadInSeconds =
|
||||
await this.deps.getAnimatedImageLeadInSeconds(noteInfo);
|
||||
const imageFilename = this.generateImageFilename();
|
||||
const imageBuffer = await this.generateImageBuffer(
|
||||
mpvClient.currentVideoPath,
|
||||
startTime,
|
||||
endTime,
|
||||
animatedLeadInSeconds,
|
||||
);
|
||||
|
||||
const imageField = this.deps.getConfig().fields?.image;
|
||||
@@ -679,6 +688,7 @@ export class CardCreationService {
|
||||
videoPath: string,
|
||||
startTime: number,
|
||||
endTime: number,
|
||||
animatedLeadInSeconds = 0,
|
||||
): Promise<Buffer | null> {
|
||||
const mpvClient = this.deps.getMpvClient();
|
||||
if (!mpvClient) {
|
||||
@@ -707,6 +717,7 @@ export class CardCreationService {
|
||||
maxWidth: this.deps.getConfig().media?.animatedMaxWidth,
|
||||
maxHeight: this.deps.getConfig().media?.animatedMaxHeight,
|
||||
crf: this.deps.getConfig().media?.animatedCrf,
|
||||
leadingStillDuration: animatedLeadInSeconds,
|
||||
},
|
||||
);
|
||||
}
|
||||
|
||||
@@ -28,7 +28,7 @@ interface FieldGroupingMergeDeps {
|
||||
) => string | null;
|
||||
extractFields: (fields: Record<string, { value: string }>) => Record<string, string>;
|
||||
processSentence: (mpvSentence: string, noteFields: Record<string, string>) => string;
|
||||
generateMediaForMerge: () => Promise<FieldGroupingMergeMedia>;
|
||||
generateMediaForMerge: (noteInfo: FieldGroupingMergeNoteInfo) => Promise<FieldGroupingMergeMedia>;
|
||||
warnFieldParseOnce: (fieldName: string, reason: string, detail?: string) => void;
|
||||
}
|
||||
|
||||
@@ -132,7 +132,7 @@ export class FieldGroupingMergeCollaborator {
|
||||
}
|
||||
|
||||
if (includeGeneratedMedia) {
|
||||
const media = await this.deps.generateMediaForMerge();
|
||||
const media = await this.deps.generateMediaForMerge(keepNoteInfo);
|
||||
if (media.audioField && media.audioValue && !sourceFields[media.audioField]) {
|
||||
sourceFields[media.audioField] = media.audioValue;
|
||||
}
|
||||
|
||||
@@ -62,6 +62,7 @@ function createWorkflowHarness() {
|
||||
return names.find((name) => name.toLowerCase() === preferred.toLowerCase()) ?? null;
|
||||
},
|
||||
getResolvedSentenceAudioFieldName: () => null,
|
||||
getAnimatedImageLeadInSeconds: async () => 0,
|
||||
mergeFieldValue: (_existing: string, next: string, _overwrite: boolean) => next,
|
||||
generateAudioFilename: () => 'audio_1.mp3',
|
||||
generateAudio: async () => null,
|
||||
@@ -163,3 +164,42 @@ test('NoteUpdateWorkflow updates note before auto field grouping merge', async (
|
||||
assert.deepEqual(callOrder, ['update', 'auto']);
|
||||
assert.equal(harness.updates.length, 1);
|
||||
});
|
||||
|
||||
test('NoteUpdateWorkflow passes animated image lead-in when syncing avif to word audio', async () => {
|
||||
const harness = createWorkflowHarness();
|
||||
let receivedLeadInSeconds = 0;
|
||||
|
||||
harness.deps.client.notesInfo = async () =>
|
||||
[
|
||||
{
|
||||
noteId: 42,
|
||||
fields: {
|
||||
Expression: { value: 'taberu' },
|
||||
ExpressionAudio: { value: '[sound:word.mp3]' },
|
||||
Sentence: { value: '' },
|
||||
Picture: { value: '' },
|
||||
},
|
||||
},
|
||||
] satisfies NoteUpdateWorkflowNoteInfo[];
|
||||
harness.deps.getConfig = () => ({
|
||||
fields: {
|
||||
sentence: 'Sentence',
|
||||
image: 'Picture',
|
||||
},
|
||||
media: {
|
||||
generateImage: true,
|
||||
imageType: 'avif',
|
||||
syncAnimatedImageToWordAudio: true,
|
||||
},
|
||||
behavior: {},
|
||||
});
|
||||
harness.deps.getAnimatedImageLeadInSeconds = async () => 1.25;
|
||||
harness.deps.generateImage = async (leadInSeconds?: number) => {
|
||||
receivedLeadInSeconds = leadInSeconds ?? 0;
|
||||
return Buffer.from('image');
|
||||
};
|
||||
|
||||
await harness.workflow.execute(42);
|
||||
|
||||
assert.equal(receivedLeadInSeconds, 1.25);
|
||||
});
|
||||
|
||||
@@ -22,6 +22,8 @@ export interface NoteUpdateWorkflowDeps {
|
||||
media?: {
|
||||
generateAudio?: boolean;
|
||||
generateImage?: boolean;
|
||||
imageType?: 'static' | 'avif';
|
||||
syncAnimatedImageToWordAudio?: boolean;
|
||||
};
|
||||
behavior?: {
|
||||
overwriteAudio?: boolean;
|
||||
@@ -60,11 +62,12 @@ export interface NoteUpdateWorkflowDeps {
|
||||
...preferredNames: (string | undefined)[]
|
||||
) => string | null;
|
||||
getResolvedSentenceAudioFieldName: (noteInfo: NoteUpdateWorkflowNoteInfo) => string | null;
|
||||
getAnimatedImageLeadInSeconds: (noteInfo: NoteUpdateWorkflowNoteInfo) => Promise<number>;
|
||||
mergeFieldValue: (existing: string, newValue: string, overwrite: boolean) => string;
|
||||
generateAudioFilename: () => string;
|
||||
generateAudio: () => Promise<Buffer | null>;
|
||||
generateImageFilename: () => string;
|
||||
generateImage: () => Promise<Buffer | null>;
|
||||
generateImage: (animatedLeadInSeconds?: number) => Promise<Buffer | null>;
|
||||
formatMiscInfoPattern: (fallbackFilename: string, startTimeSeconds?: number) => string;
|
||||
addConfiguredTagsToNote: (noteId: number) => Promise<void>;
|
||||
showNotification: (noteId: number, label: string | number) => Promise<void>;
|
||||
@@ -153,8 +156,9 @@ export class NoteUpdateWorkflow {
|
||||
|
||||
if (config.media?.generateImage) {
|
||||
try {
|
||||
const animatedLeadInSeconds = await this.deps.getAnimatedImageLeadInSeconds(noteInfo);
|
||||
const imageFilename = this.deps.generateImageFilename();
|
||||
const imageBuffer = await this.deps.generateImage();
|
||||
const imageBuffer = await this.deps.generateImage(animatedLeadInSeconds);
|
||||
|
||||
if (imageBuffer) {
|
||||
await this.deps.client.storeMediaFile(imageFilename, imageBuffer);
|
||||
|
||||
@@ -59,6 +59,10 @@ test('AnkiIntegrationRuntime normalizes url and proxy defaults', () => {
|
||||
normalized.media?.fallbackDuration,
|
||||
DEFAULT_ANKI_CONNECT_CONFIG.media.fallbackDuration,
|
||||
);
|
||||
assert.equal(
|
||||
normalized.media?.syncAnimatedImageToWordAudio,
|
||||
DEFAULT_ANKI_CONNECT_CONFIG.media.syncAnimatedImageToWordAudio,
|
||||
);
|
||||
});
|
||||
|
||||
test('AnkiIntegrationRuntime starts proxy transport when proxy mode is enabled', () => {
|
||||
|
||||
@@ -47,6 +47,7 @@ export const INTEGRATIONS_DEFAULT_CONFIG: Pick<
|
||||
animatedMaxWidth: 640,
|
||||
animatedMaxHeight: undefined,
|
||||
animatedCrf: 35,
|
||||
syncAnimatedImageToWordAudio: true,
|
||||
audioPadding: 0.5,
|
||||
fallbackDuration: 3.0,
|
||||
maxMediaDuration: 30,
|
||||
|
||||
@@ -82,6 +82,13 @@ export function buildIntegrationConfigOptionRegistry(
|
||||
description: 'Automatically update newly added cards.',
|
||||
runtime: runtimeOptionById.get('anki.autoUpdateNewCards'),
|
||||
},
|
||||
{
|
||||
path: 'ankiConnect.media.syncAnimatedImageToWordAudio',
|
||||
kind: 'boolean',
|
||||
defaultValue: defaultConfig.ankiConnect.media.syncAnimatedImageToWordAudio,
|
||||
description:
|
||||
'For animated AVIF images, prepend a frozen first frame matching the existing word-audio duration so motion starts with sentence audio.',
|
||||
},
|
||||
{
|
||||
path: 'ankiConnect.knownWords.matchMode',
|
||||
kind: 'enum',
|
||||
|
||||
@@ -121,6 +121,22 @@ test('accepts configured ankiConnect.fields.word override', () => {
|
||||
);
|
||||
});
|
||||
|
||||
test('accepts ankiConnect.media.syncAnimatedImageToWordAudio override', () => {
|
||||
const { context, warnings } = makeContext({
|
||||
media: {
|
||||
syncAnimatedImageToWordAudio: false,
|
||||
},
|
||||
});
|
||||
|
||||
applyAnkiConnectResolution(context);
|
||||
|
||||
assert.equal(context.resolved.ankiConnect.media.syncAnimatedImageToWordAudio, false);
|
||||
assert.equal(
|
||||
warnings.some((warning) => warning.path === 'ankiConnect.media.syncAnimatedImageToWordAudio'),
|
||||
false,
|
||||
);
|
||||
});
|
||||
|
||||
test('maps legacy ankiConnect.wordField to modern ankiConnect.fields.word', () => {
|
||||
const { context, warnings } = makeContext({
|
||||
wordField: 'TargetWordLegacy',
|
||||
|
||||
@@ -31,6 +31,7 @@ export function applyAnkiConnectResolution(context: ResolveContext): void {
|
||||
'animatedMaxWidth',
|
||||
'animatedMaxHeight',
|
||||
'animatedCrf',
|
||||
'syncAnimatedImageToWordAudio',
|
||||
'audioPadding',
|
||||
'fallbackDuration',
|
||||
'maxMediaDuration',
|
||||
@@ -536,6 +537,17 @@ export function applyAnkiConnectResolution(context: ResolveContext): void {
|
||||
'Expected integer between 0 and 63.',
|
||||
);
|
||||
}
|
||||
if (!hasOwn(media, 'syncAnimatedImageToWordAudio')) {
|
||||
mapLegacy(
|
||||
'syncAnimatedImageToWordAudio',
|
||||
asBoolean,
|
||||
(value) => {
|
||||
context.resolved.ankiConnect.media.syncAnimatedImageToWordAudio = value;
|
||||
},
|
||||
context.resolved.ankiConnect.media.syncAnimatedImageToWordAudio,
|
||||
'Expected boolean.',
|
||||
);
|
||||
}
|
||||
if (!hasOwn(media, 'audioPadding')) {
|
||||
mapLegacy(
|
||||
'audioPadding',
|
||||
|
||||
@@ -12,6 +12,12 @@ import {
|
||||
getConfiguredWordFieldName,
|
||||
getPreferredNoteFieldValue,
|
||||
} from '../../anki-field-config.js';
|
||||
import { resolveAnimatedImageLeadInSeconds } from '../../anki-integration/animated-image-sync.js';
|
||||
|
||||
// Minimal notesInfo shape consumed by the stats-server helpers below.
type StatsServerNoteInfo = {
  noteId: number;
  fields: Record<string, { value: string }>;
};
|
||||
|
||||
function parseIntQuery(raw: string | undefined, fallback: number, maxLimit?: number): number {
|
||||
if (raw === undefined) return fallback;
|
||||
@@ -40,6 +46,20 @@ function parseEventTypesQuery(raw: string | undefined): number[] | undefined {
|
||||
return parsed.length > 0 ? parsed : undefined;
|
||||
}
|
||||
|
||||
function resolveStatsNoteFieldName(
|
||||
noteInfo: StatsServerNoteInfo,
|
||||
...preferredNames: (string | undefined)[]
|
||||
): string | null {
|
||||
for (const preferredName of preferredNames) {
|
||||
if (!preferredName) continue;
|
||||
const resolved = Object.keys(noteInfo.fields).find(
|
||||
(fieldName) => fieldName.toLowerCase() === preferredName.toLowerCase(),
|
||||
);
|
||||
if (resolved) return resolved;
|
||||
}
|
||||
return null;
|
||||
}
|
||||
|
||||
/** Load known words cache from disk into a Set. Returns null if unavailable. */
|
||||
function loadKnownWordsSet(cachePath: string | undefined): Set<string> | null {
|
||||
if (!cachePath || !existsSync(cachePath)) return null;
|
||||
@@ -621,36 +641,41 @@ export function createStatsApp(
|
||||
const generateAudio = ankiConfig.media?.generateAudio !== false;
|
||||
const generateImage = ankiConfig.media?.generateImage !== false && mode !== 'audio';
|
||||
const imageType = ankiConfig.media?.imageType ?? 'static';
|
||||
const syncAnimatedImageToWordAudio =
|
||||
imageType === 'avif' && ankiConfig.media?.syncAnimatedImageToWordAudio !== false;
|
||||
|
||||
const audioPromise = generateAudio
|
||||
? mediaGen.generateAudio(sourcePath, startSec, clampedEndSec, audioPadding)
|
||||
: Promise.resolve(null);
|
||||
|
||||
let imagePromise: Promise<Buffer | null>;
|
||||
if (!generateImage) {
|
||||
imagePromise = Promise.resolve(null);
|
||||
} else if (imageType === 'avif') {
|
||||
imagePromise = mediaGen.generateAnimatedImage(
|
||||
sourcePath,
|
||||
startSec,
|
||||
clampedEndSec,
|
||||
audioPadding,
|
||||
{
|
||||
const createImagePromise = (animatedLeadInSeconds = 0): Promise<Buffer | null> => {
|
||||
if (!generateImage) {
|
||||
return Promise.resolve(null);
|
||||
}
|
||||
|
||||
if (imageType === 'avif') {
|
||||
return mediaGen.generateAnimatedImage(sourcePath, startSec, clampedEndSec, audioPadding, {
|
||||
fps: ankiConfig.media?.animatedFps ?? 10,
|
||||
maxWidth: ankiConfig.media?.animatedMaxWidth ?? 640,
|
||||
maxHeight: ankiConfig.media?.animatedMaxHeight,
|
||||
crf: ankiConfig.media?.animatedCrf ?? 35,
|
||||
},
|
||||
);
|
||||
} else {
|
||||
leadingStillDuration: animatedLeadInSeconds,
|
||||
});
|
||||
}
|
||||
|
||||
const midpointSec = (startSec + clampedEndSec) / 2;
|
||||
imagePromise = mediaGen.generateScreenshot(sourcePath, midpointSec, {
|
||||
return mediaGen.generateScreenshot(sourcePath, midpointSec, {
|
||||
format: ankiConfig.media?.imageFormat ?? 'jpg',
|
||||
quality: ankiConfig.media?.imageQuality ?? 92,
|
||||
maxWidth: ankiConfig.media?.imageMaxWidth,
|
||||
maxHeight: ankiConfig.media?.imageMaxHeight,
|
||||
});
|
||||
}
|
||||
};
|
||||
|
||||
const imagePromise =
|
||||
mode === 'word' && syncAnimatedImageToWordAudio
|
||||
? Promise.resolve<Buffer | null>(null)
|
||||
: createImagePromise();
|
||||
|
||||
const errors: string[] = [];
|
||||
let noteId: number;
|
||||
@@ -677,12 +702,31 @@ export function createStatsApp(
|
||||
|
||||
noteId = yomitanResult.value;
|
||||
const audioBuffer = audioResult.status === 'fulfilled' ? audioResult.value : null;
|
||||
const imageBuffer = imageResult.status === 'fulfilled' ? imageResult.value : null;
|
||||
if (audioResult.status === 'rejected')
|
||||
errors.push(`audio: ${(audioResult.reason as Error).message}`);
|
||||
if (imageResult.status === 'rejected')
|
||||
errors.push(`image: ${(imageResult.reason as Error).message}`);
|
||||
|
||||
let imageBuffer = imageResult.status === 'fulfilled' ? imageResult.value : null;
|
||||
if (syncAnimatedImageToWordAudio && generateImage) {
|
||||
try {
|
||||
const noteInfoResult = (await client.notesInfo([noteId])) as StatsServerNoteInfo[];
|
||||
const noteInfo = noteInfoResult[0] ?? null;
|
||||
const animatedLeadInSeconds = noteInfo
|
||||
? await resolveAnimatedImageLeadInSeconds({
|
||||
config: ankiConfig,
|
||||
noteInfo,
|
||||
resolveConfiguredFieldName: (candidateNoteInfo, ...preferredNames) =>
|
||||
resolveStatsNoteFieldName(candidateNoteInfo, ...preferredNames),
|
||||
retrieveMediaFileBase64: (filename) => client.retrieveMediaFile(filename),
|
||||
})
|
||||
: 0;
|
||||
imageBuffer = await createImagePromise(animatedLeadInSeconds);
|
||||
} catch (err) {
|
||||
errors.push(`image: ${(err as Error).message}`);
|
||||
}
|
||||
}
|
||||
|
||||
const mediaFields: Record<string, string> = {};
|
||||
const timestamp = Date.now();
|
||||
const sentenceFieldName = ankiConfig.fields?.sentence ?? 'Sentence';
|
||||
|
||||
15
src/media-generator.test.ts
Normal file
15
src/media-generator.test.ts
Normal file
@@ -0,0 +1,15 @@
|
||||
import assert from 'node:assert/strict';
|
||||
import test from 'node:test';
|
||||
|
||||
import { buildAnimatedImageVideoFilter } from './media-generator';
|
||||
|
||||
// The tpad (frozen first frame) segment must come before the fps and scale
// filters so the clone happens at the source frame rate.
test('buildAnimatedImageVideoFilter prepends a cloned first frame when lead-in is provided', () => {
  assert.equal(
    buildAnimatedImageVideoFilter({
      fps: 10,
      maxWidth: 640,
      leadingStillDuration: 1.25,
    }),
    'tpad=start_duration=1.25:start_mode=clone,fps=10,scale=w=640:h=-2',
  );
});
|
||||
@@ -24,6 +24,33 @@ import { createLogger } from './logger';
|
||||
|
||||
const log = createLogger('media');
|
||||
|
||||
export function buildAnimatedImageVideoFilter(options: {
|
||||
fps?: number;
|
||||
maxWidth?: number;
|
||||
maxHeight?: number;
|
||||
leadingStillDuration?: number;
|
||||
}): string {
|
||||
const { fps = 10, maxWidth = 640, maxHeight, leadingStillDuration = 0 } = options;
|
||||
const clampedFps = Math.max(1, Math.min(60, fps));
|
||||
const vfParts: string[] = [];
|
||||
|
||||
if (leadingStillDuration > 0) {
|
||||
vfParts.push(`tpad=start_duration=${leadingStillDuration}:start_mode=clone`);
|
||||
}
|
||||
|
||||
vfParts.push(`fps=${clampedFps}`);
|
||||
|
||||
if (maxWidth && maxWidth > 0 && maxHeight && maxHeight > 0) {
|
||||
vfParts.push(`scale=w=${maxWidth}:h=${maxHeight}:force_original_aspect_ratio=decrease`);
|
||||
} else if (maxWidth && maxWidth > 0) {
|
||||
vfParts.push(`scale=w=${maxWidth}:h=-2`);
|
||||
} else if (maxHeight && maxHeight > 0) {
|
||||
vfParts.push(`scale=w=-2:h=${maxHeight}`);
|
||||
}
|
||||
|
||||
return vfParts.join(',');
|
||||
}
|
||||
|
||||
export class MediaGenerator {
|
||||
private tempDir: string;
|
||||
private notifyIconDir: string;
|
||||
@@ -289,25 +316,15 @@ export class MediaGenerator {
|
||||
maxWidth?: number;
|
||||
maxHeight?: number;
|
||||
crf?: number;
|
||||
leadingStillDuration?: number;
|
||||
} = {},
|
||||
): Promise<Buffer> {
|
||||
const start = Math.max(0, startTime - padding);
|
||||
const duration = endTime - startTime + 2 * padding;
|
||||
const { fps = 10, maxWidth = 640, maxHeight, crf = 35 } = options;
|
||||
const { fps = 10, maxWidth = 640, maxHeight, crf = 35, leadingStillDuration = 0 } = options;
|
||||
|
||||
const clampedFps = Math.max(1, Math.min(60, fps));
|
||||
const clampedCrf = Math.max(0, Math.min(63, crf));
|
||||
|
||||
const vfParts: string[] = [];
|
||||
vfParts.push(`fps=${clampedFps}`);
|
||||
if (maxWidth && maxWidth > 0 && maxHeight && maxHeight > 0) {
|
||||
vfParts.push(`scale=w=${maxWidth}:h=${maxHeight}:force_original_aspect_ratio=decrease`);
|
||||
} else if (maxWidth && maxWidth > 0) {
|
||||
vfParts.push(`scale=w=${maxWidth}:h=-2`);
|
||||
} else if (maxHeight && maxHeight > 0) {
|
||||
vfParts.push(`scale=w=-2:h=${maxHeight}`);
|
||||
}
|
||||
|
||||
const av1Encoder = await this.detectAv1Encoder();
|
||||
if (!av1Encoder) {
|
||||
throw new Error(
|
||||
@@ -338,7 +355,12 @@ export class MediaGenerator {
|
||||
'-i',
|
||||
videoPath,
|
||||
'-vf',
|
||||
vfParts.join(','),
|
||||
buildAnimatedImageVideoFilter({
|
||||
fps,
|
||||
maxWidth,
|
||||
maxHeight,
|
||||
leadingStillDuration,
|
||||
}),
|
||||
...encoderArgs,
|
||||
'-y',
|
||||
outputPath,
|
||||
|
||||
@@ -241,6 +241,7 @@ export interface AnkiConnectConfig {
|
||||
animatedMaxWidth?: number;
|
||||
animatedMaxHeight?: number;
|
||||
animatedCrf?: number;
|
||||
syncAnimatedImageToWordAudio?: boolean;
|
||||
audioPadding?: number;
|
||||
fallbackDuration?: number;
|
||||
maxMediaDuration?: number;
|
||||
@@ -745,6 +746,7 @@ export interface ResolvedConfig {
|
||||
animatedMaxWidth: number;
|
||||
animatedMaxHeight?: number;
|
||||
animatedCrf: number;
|
||||
syncAnimatedImageToWordAudio: boolean;
|
||||
audioPadding: number;
|
||||
fallbackDuration: number;
|
||||
maxMediaDuration: number;
|
||||
|
||||
Reference in New Issue
Block a user