feat(immersion): add anime metadata, occurrence tracking, and schema upgrades

- Add imm_anime table with AniList integration
- Add imm_subtitle_lines, imm_word_line_occurrences, imm_kanji_line_occurrences
- Add POS fields (part_of_speech, pos1, pos2, pos3) to imm_words
- Add anime metadata parsing with guessit fallback
- Add video duration tracking and watched status
- Add episode, streak, trend, and word/kanji detail queries
- Deduplicate subtitle line recording within sessions
- Pass Anki note IDs through card mining callback chain
2026-04-12 04:19:25 -07:00 · 2026-03-14 22:13:42 -07:00
parent cc5d270b8e
commit fe8bb167c4
19 changed files with 5231 additions and 122 deletions
--- a/src/core/services/tokenizer/annotation-stage.ts
+++ b/src/core/services/tokenizer/annotation-stage.ts
@@ -133,6 +133,17 @@ function isFrequencyExcludedByPos(
  );
 }

+/**
+ * Reports whether a token is rejected by the part-of-speech exclusion check
+ * (`isFrequencyExcludedByPos`), for use when deciding what vocabulary to persist.
+ *
+ * @param token - The merged token to test.
+ * @param options - Optional `pos1Exclusions` / `pos2Exclusions` overrides; an
+ *   empty object is used when omitted and handed to the resolver helpers.
+ * @returns The result of `isFrequencyExcludedByPos` for the resolved exclusion sets.
+ */
+export function shouldExcludeTokenFromVocabularyPersistence(
+  token: MergedToken,
+  options: Pick<AnnotationStageOptions, 'pos1Exclusions' | 'pos2Exclusions'> = {},
+): boolean {
+  // Resolve both exclusion sets up front, in the same order as before.
+  const pos1Exclusions = resolvePos1Exclusions(options);
+  const pos2Exclusions = resolvePos2Exclusions(options);
+
+  return isFrequencyExcludedByPos(token, pos1Exclusions, pos2Exclusions);
+}
+
 function applyFrequencyMarking(
  tokens: MergedToken[],
  pos1Exclusions: ReadonlySet<string>,
--- a/src/core/services/tokenizer/part-of-speech.ts
+++ b/src/core/services/tokenizer/part-of-speech.ts
@@ -0,0 +1,56 @@
+import { PartOfSpeech } from '../../../types';
+
+/**
+ * Normalizes a raw part-of-speech tag for comparison.
+ *
+ * @param value - The tag as received; may be `null` or `undefined`.
+ * @returns The tag with surrounding whitespace removed, or `''` when the
+ *   input is not a string.
+ */
+function normalizePosTag(value: string | null | undefined): string {
+  if (typeof value !== 'string') {
+    return '';
+  }
+
+  return value.trim();
+}
+
+/**
+ * Type guard: checks that `value` is a string found among the values of the
+ * `PartOfSpeech` enum.
+ *
+ * @param value - Any value, typically untyped data read back from storage or input.
+ * @returns `true` (narrowing `value` to `PartOfSpeech`) when it is a string and
+ *   `Object.values(PartOfSpeech)` contains it; `false` otherwise.
+ */
+export function isPartOfSpeechValue(value: unknown): value is PartOfSpeech {
+  if (typeof value !== 'string') {
+    return false;
+  }
+
+  const knownValues: readonly unknown[] = Object.values(PartOfSpeech);
+  return knownValues.includes(value);
+}
+
+/**
+ * Maps a single MeCab `pos1` tag to the corresponding `PartOfSpeech` member.
+ *
+ * The tag is normalized with `normalizePosTag` before comparison. Recognized tags:
+ * 名詞 → noun, 動詞 → verb, 形容詞 → i_adjective, 形状詞 / 形容動詞 → na_adjective,
+ * 助詞 → particle, 助動詞 → bound_auxiliary, 記号 / 補助記号 → symbol.
+ *
+ * @param pos1 - The raw `pos1` tag; may be `null` or `undefined`.
+ * @returns The mapped member, or `PartOfSpeech.other` for any tag not listed
+ *   above (including missing or blank input).
+ */
+export function mapMecabPos1ToPartOfSpeech(pos1: string | null | undefined): PartOfSpeech {
+  const tag = normalizePosTag(pos1);
+
+  if (tag === '名詞') {
+    return PartOfSpeech.noun;
+  }
+  if (tag === '動詞') {
+    return PartOfSpeech.verb;
+  }
+  if (tag === '形容詞') {
+    return PartOfSpeech.i_adjective;
+  }
+  // Two different tag spellings are both stored as na-adjectives.
+  if (tag === '形状詞' || tag === '形容動詞') {
+    return PartOfSpeech.na_adjective;
+  }
+  if (tag === '助詞') {
+    return PartOfSpeech.particle;
+  }
+  if (tag === '助動詞') {
+    return PartOfSpeech.bound_auxiliary;
+  }
+  // Likewise, both symbol tags collapse to a single member.
+  if (tag === '記号' || tag === '補助記号') {
+    return PartOfSpeech.symbol;
+  }
+
+  return PartOfSpeech.other;
+}
+
+/**
+ * Chooses the `PartOfSpeech` to store for a word.
+ *
+ * `pos1` takes precedence. It is split on `|`, blank segments are dropped, and
+ * each remaining tag is mapped with `mapMecabPos1ToPartOfSpeech`:
+ * - if every tag maps to the same member, that member is returned;
+ * - if the tags map to different members, `PartOfSpeech.other` is returned.
+ *
+ * Only when `pos1` yields no tags at all is `partOfSpeech` consulted: it is
+ * returned if it is a valid `PartOfSpeech` value, otherwise the result is
+ * `PartOfSpeech.other`.
+ *
+ * @param input - Candidate `partOfSpeech` value and `|`-separated `pos1` tags.
+ * @returns The derived part of speech; never throws for missing fields.
+ */
+export function deriveStoredPartOfSpeech(input: {
+  partOfSpeech?: string | null;
+  pos1?: string | null;
+}): PartOfSpeech {
+  let derived: PartOfSpeech | undefined;
+  let hasConflict = false;
+
+  for (const segment of normalizePosTag(input.pos1).split('|')) {
+    const tag = segment.trim();
+    if (tag.length === 0) {
+      continue;
+    }
+
+    const mapped = mapMecabPos1ToPartOfSpeech(tag);
+    if (derived === undefined) {
+      derived = mapped;
+    } else if (mapped !== derived) {
+      // Tags disagree, so no single member describes the word.
+      hasConflict = true;
+    }
+  }
+
+  // Any usable pos1 tag decides the outcome; partOfSpeech is not consulted.
+  if (derived !== undefined) {
+    return hasConflict ? PartOfSpeech.other : derived;
+  }
+
+  return isPartOfSpeechValue(input.partOfSpeech) ? input.partOfSpeech : PartOfSpeech.other;
+}