mirror of
https://github.com/ksyasuda/SubMiner.git
synced 2026-03-23 00:11:28 -07:00
feat(stats): add v1 immersion stats dashboard (#19)
This commit is contained in:
@@ -19,6 +19,7 @@
 import { PartOfSpeech, Token, MergedToken } from './types';
 import { DEFAULT_ANNOTATION_POS1_EXCLUSION_CONFIG } from './token-pos1-exclusions';
 import { DEFAULT_ANNOTATION_POS2_EXCLUSION_CONFIG } from './token-pos2-exclusions';
+import { shouldExcludeTokenFromSubtitleAnnotations } from './core/services/tokenizer/subtitle-annotation-filter';
 
 export function isNoun(tok: Token): boolean {
   return tok.partOfSpeech === PartOfSpeech.noun;
@@ -169,13 +170,17 @@ export function mergeTokens(
   isKnownWord: (text: string) => boolean = () => false,
   knownWordMatchMode: 'headword' | 'surface' = 'headword',
   shouldLookupKnownWords = true,
+  sourceText?: string,
 ): MergedToken[] {
   if (!tokens || tokens.length === 0) {
     return [];
   }
 
   const result: MergedToken[] = [];
+  const normalizedSourceText =
+    typeof sourceText === 'string' ? sourceText.replace(/\r?\n/g, ' ').trim() : null;
   let charOffset = 0;
+  let sourceCursor = 0;
   let lastStandaloneToken: Token | null = null;
   const resolveKnownMatch = (text: string | undefined): boolean => {
     if (!shouldLookupKnownWords || !text) {
@@ -185,9 +190,12 @@ export function mergeTokens(
   };
 
   for (const token of tokens) {
-    const start = charOffset;
-    const end = charOffset + token.word.length;
+    const matchedStart =
+      normalizedSourceText !== null ? normalizedSourceText.indexOf(token.word, sourceCursor) : -1;
+    const start = matchedStart >= sourceCursor ? matchedStart : charOffset;
+    const end = start + token.word.length;
     charOffset = end;
+    sourceCursor = end;
 
     let shouldMergeToken = false;
 
@@ -290,6 +298,10 @@ function isNPlusOneWordCountToken(
   pos1Exclusions: ReadonlySet<string> = N_PLUS_ONE_IGNORED_POS1,
   pos2Exclusions: ReadonlySet<string> = N_PLUS_ONE_IGNORED_POS2,
 ): boolean {
+  if (shouldExcludeTokenFromSubtitleAnnotations(token, { pos1Exclusions, pos2Exclusions })) {
+    return false;
+  }
+
   const normalizedPos1 = normalizePos1Tag(token.pos1);
   const hasPos1 = normalizedPos1.length > 0;
   if (isExcludedByTagSet(normalizedPos1, pos1Exclusions)) {
Reference in New Issue
Block a user