feat: use cached annotations on subtitle change and skip pre-warmed cues (#97)

2026-07-29 19:21:33 -07:00 · 2026-05-28 00:50:41 -07:00
parent d33009d4a3
commit eed0a6a243
10 changed files with 239 additions and 7 deletions
@@ -242,3 +242,59 @@ test('prefetch service pause/resume halts and continues tokenization', async ()

  assert.ok(tokenizeCalls > callsWhenPaused + 1, 'Should resume tokenizing after unpause');
 });
+
+// Cue texts that the cache lookup already reports as present must never reach the tokenizer.
+test('prefetch service skips cues already present in tokenization cache', async () => {
+  const alreadyCached = new Set(['line-0', 'line-1']);
+  const seenByTokenizer: string[] = [];
+
+  const prefetcher = createSubtitlePrefetchService({
+    cues: makeCues(5),
+    tokenizeSubtitle: async (text) => {
+      seenByTokenizer.push(text);
+      return { text, tokens: [] };
+    },
+    preCacheTokenization: () => {},
+    hasCachedTokenization: (text) => alreadyCached.has(text),
+    isCacheFull: () => false,
+    priorityWindowSize: 3,
+  });
+
+  prefetcher.start(0);
+  // Yield to the microtask queue ten times so the async prefetch work can progress.
+  let remainingFlushes = 10;
+  while (remainingFlushes > 0) {
+    await flushMicrotasks();
+    remainingFlushes -= 1;
+  }
+  prefetcher.stop();
+
+  for (const skippedText of ['line-0', 'line-1']) {
+    assert.ok(!seenByTokenizer.includes(skippedText));
+  }
+  assert.ok(seenByTokenizer.includes('line-2'));
+});
+
+// Two cues sharing the same text should produce exactly one tokenizer call within a single run.
+test('prefetch service deduplicates repeated cue text within a run', async () => {
+  const seenByTokenizer: string[] = [];
+  const repeatedCues: SubtitleCue[] = [
+    { startTime: 0, endTime: 1, text: 'same' },
+    { startTime: 1, endTime: 2, text: 'same' },
+    { startTime: 2, endTime: 3, text: 'other' },
+  ];
+
+  const prefetcher = createSubtitlePrefetchService({
+    cues: repeatedCues,
+    tokenizeSubtitle: async (text) => {
+      seenByTokenizer.push(text);
+      return { text, tokens: [] };
+    },
+    preCacheTokenization: () => {},
+    isCacheFull: () => false,
+    priorityWindowSize: 3,
+  });
+
+  prefetcher.start(0);
+  // Yield to the microtask queue ten times so the async prefetch work can progress.
+  let remainingFlushes = 10;
+  while (remainingFlushes > 0) {
+    await flushMicrotasks();
+    remainingFlushes -= 1;
+  }
+  prefetcher.stop();
+
+  const callsForRepeatedText = seenByTokenizer.filter((text) => text === 'same');
+  assert.deepEqual(callsForRepeatedText, ['same']);
+  assert.ok(seenByTokenizer.includes('other'));
+});
@@ -1,10 +1,12 @@
 import type { SubtitleData } from '../../types';
 import type { SubtitleCue } from '../../types';
+import { normalizeSubtitleCacheKey } from './subtitle-processing-controller';

 export interface SubtitlePrefetchServiceDeps {
  /** Cues the prefetch run works through (priority window first, then later cues, then earlier ones). */
  cues: SubtitleCue[];
  /** Async tokenizer invoked with a cue's raw text; may resolve to null. */
  tokenizeSubtitle: (text: string) => Promise<SubtitleData | null>;
  /** Receives a text and its tokenized data. NOTE(review): call site is outside this view — presumably stores the prefetched result; confirm. */
  preCacheTokenization: (text: string, data: SubtitleData) => void;
  /** Optional. When provided and it returns true for a cue's text, that cue is skipped instead of being tokenized. */
+  hasCachedTokenization?: (text: string) => boolean;
  /** Reports whether the cache is full. NOTE(review): presumably gates prefetching unless `allowWhenCacheFull` is set — the check is not visible here; confirm. */
  isCacheFull: () => boolean;
  /** Optional size of the priority window. NOTE(review): the default is defined outside this view. */
  priorityWindowSize?: number;
 }
@@ -58,6 +60,7 @@ export function createSubtitlePrefetchService(
  async function tokenizeCueList(
    cuesToProcess: SubtitleCue[],
    runId: number,
+    warmedKeys: Set<string>,
    options: { allowWhenCacheFull?: boolean } = {},
  ): Promise<void> {
    for (const cue of cuesToProcess) {
@@ -78,6 +81,15 @@ export function createSubtitlePrefetchService(
        return;
      }

+      const cacheKey = normalizeSubtitleCacheKey(cue.text);
+      if (!cacheKey || warmedKeys.has(cacheKey) || deps.hasCachedTokenization?.(cue.text)) {
+        if (cacheKey) {
+          warmedKeys.add(cacheKey);
+        }
+        continue;
+      }
+      warmedKeys.add(cacheKey);
+
      try {
        const result = await deps.tokenizeSubtitle(cue.text);
        if (result && !stopped && runId === currentRunId) {
@@ -94,10 +106,11 @@ export function createSubtitlePrefetchService(

  async function startPrefetching(currentTimeSeconds: number, runId: number): Promise<void> {
    const cues = deps.cues;
+    const warmedKeys = new Set<string>();

    // Phase 1: Priority window
    const priorityCues = computePriorityWindow(cues, currentTimeSeconds, windowSize);
-    await tokenizeCueList(priorityCues, runId, { allowWhenCacheFull: true });
+    await tokenizeCueList(priorityCues, runId, warmedKeys, { allowWhenCacheFull: true });

    if (stopped || runId !== currentRunId) {
      return;
@@ -108,7 +121,7 @@ export function createSubtitlePrefetchService(
    const remainingCues = cues.filter(
      (cue) => cue.startTime > currentTimeSeconds && !priorityTexts.has(cue.text),
    );
-    await tokenizeCueList(remainingCues, runId);
+    await tokenizeCueList(remainingCues, runId, warmedKeys);

    if (stopped || runId !== currentRunId) {
      return;
@@ -118,7 +131,7 @@ export function createSubtitlePrefetchService(
    const earlierCues = cues.filter(
      (cue) => cue.startTime <= currentTimeSeconds && !priorityTexts.has(cue.text),
    );
-    await tokenizeCueList(earlierCues, runId);
+    await tokenizeCueList(earlierCues, runId, warmedKeys);
  }

  return {
@@ -236,6 +236,31 @@ test('consumeCachedSubtitle returns prefetched payload and prevents reprocessing
  assert.deepEqual(emitted, []);
 });

+// A pre-cached entry must be visible through hasCachedSubtitle, still be served on the next
+// subtitle change without re-tokenizing, and disappear once the cache is invalidated.
+test('hasCachedSubtitle checks prefetched entries without consuming them', async () => {
+  const subtitleText = '猫\nです';
+  // Build a fresh payload each time so expectations never alias the object handed to the cache.
+  const buildPayload = (): SubtitleData => ({ text: subtitleText, tokens: [] });
+
+  let tokenizerInvocations = 0;
+  const emitted: SubtitleData[] = [];
+  const controller = createSubtitleProcessingController({
+    tokenizeSubtitle: async (text) => {
+      tokenizerInvocations += 1;
+      return { text, tokens: [] };
+    },
+    emitSubtitle: (payload) => emitted.push(payload),
+  });
+
+  // Seed the cache using the escaped "\N" spelling; lookups below use a real newline.
+  controller.preCacheTokenization('猫\\Nです', buildPayload());
+  assert.equal(controller.hasCachedSubtitle(subtitleText), true);
+
+  controller.onSubtitleChange(subtitleText);
+  await flushMicrotasks();
+  assert.equal(tokenizerInvocations, 0);
+  assert.deepEqual(emitted, [buildPayload()]);
+
+  controller.invalidateTokenizationCache();
+  assert.equal(controller.hasCachedSubtitle(subtitleText), false);
+});
+
 test('isCacheFull returns false when cache is below limit', () => {
  const controller = createSubtitleProcessingController({
    tokenizeSubtitle: async (text) => ({ text, tokens: null }),
@@ -13,10 +13,11 @@ export interface SubtitleProcessingController {
  invalidateTokenizationCache: () => void;
  preCacheTokenization: (text: string, data: SubtitleData) => void;
  consumeCachedSubtitle: (text: string) => SubtitleData | null;
+  hasCachedSubtitle: (text: string) => boolean;
  isCacheFull: () => boolean;
 }

-function normalizeSubtitleCacheKey(text: string): string {
+export function normalizeSubtitleCacheKey(text: string): string {
  // Builds the canonical cache key: CRLF pairs and the literal two-character sequences
  // backslash+N and backslash+n each become a single newline, then the result is trimmed.
  return text.replace(/\r\n/g, '\n').replace(/\\N/g, '\n').replace(/\\n/g, '\n').trim();
 }

@@ -152,6 +153,9 @@ export function createSubtitleProcessingController(
      refreshRequested = false;
      return cached;
    },
+    // Membership check on the normalized key only — nothing is removed from the cache here.
+    hasCachedSubtitle: (text: string) => {
+      const cacheKey = normalizeSubtitleCacheKey(text);
+      return tokenizationCache.has(cacheKey);
+    },
    isCacheFull: () => {
      return tokenizationCache.size >= SUBTITLE_TOKENIZATION_CACHE_LIMIT;
    },