Persist stats exclusions in DB and fix word metrics filtering

- Stats vocabulary exclusions stored in `imm_stats_excluded_words` (schema v18); seeded from localStorage on first load
- Session, overview, trends, and library word metrics use filtered persisted occurrences with raw fallback
- Session known-word % chart uses filtered persisted totals as denominator for both known and total
- JLPT subtitle styling changed to underline-only; no longer overrides text color
This commit is contained in:
2026-05-03 19:40:54 -07:00
parent db30c61327
commit 25d0aa47db
32 changed files with 1541 additions and 211 deletions
@@ -1,4 +1,6 @@
import type { DatabaseSync } from './sqlite';
import { PartOfSpeech, type MergedToken } from '../../../types';
import { shouldExcludeTokenFromVocabularyPersistence } from '../tokenizer/annotation-stage';
import type {
KanjiAnimeAppearanceRow,
KanjiDetailRow,
@@ -7,18 +9,55 @@ import type {
KanjiWordRow,
SessionEventRow,
SimilarWordRow,
StatsExcludedWordRow,
VocabularyStatsRow,
WordAnimeAppearanceRow,
WordDetailRow,
WordOccurrenceRow,
} from './types';
import { fromDbTimestamp } from './query-shared';
import { fromDbTimestamp, toDbTimestamp } from './query-shared';
import { nowMs } from './time';
// Oversampling knobs for getVocabularyStats: rows are filtered in JS after the
// SQL query (excluded tokens are dropped post-hoc), so the query fetches extra
// rows up front to still be able to return `limit` visible rows. The effective
// query limit is max(limit, limit * FACTOR, limit + MIN).
const VOCABULARY_STATS_FILTER_OVERSAMPLE_FACTOR = 4;
const VOCABULARY_STATS_FILTER_OVERSAMPLE_MIN = 100;
/**
 * Builds a MergedToken view of a persisted vocabulary-stats row so the row can
 * be run through the shared token-exclusion predicate.
 *
 * Fields that only exist for live tokenizer output (positions, merge/known
 * flags) are filled with neutral defaults.
 */
function toVocabularyToken(row: VocabularyStatsRow): MergedToken {
  // Trust the stored part-of-speech only when it is a valid enum member;
  // anything else falls back to `other`.
  const validPartsOfSpeech = Object.values(PartOfSpeech);
  let partOfSpeech = PartOfSpeech.other;
  if (row.partOfSpeech && validPartsOfSpeech.includes(row.partOfSpeech as PartOfSpeech)) {
    partOfSpeech = row.partOfSpeech as PartOfSpeech;
  }
  return {
    surface: row.word,
    reading: row.reading ?? '',
    headword: row.headword,
    startPos: 0,
    endPos: row.word.length,
    partOfSpeech,
    pos1: row.pos1 ?? '',
    pos2: row.pos2 ?? '',
    pos3: row.pos3 ?? '',
    frequencyRank: row.frequencyRank ?? undefined,
    isMerged: false,
    isKnown: false,
    isNPlusOneTarget: false,
  };
}
/**
 * A stats row is visible unless the shared vocabulary-persistence filter
 * excludes its token form.
 */
function isVocabularyStatsRowVisible(row: VocabularyStatsRow): boolean {
  const token = toVocabularyToken(row);
  const excluded = shouldExcludeTokenFromVocabularyPersistence(token);
  return !excluded;
}
export function getVocabularyStats(
db: DatabaseSync,
limit = 100,
excludePos?: string[],
): VocabularyStatsRow[] {
const queryLimit = Math.max(
limit,
limit * VOCABULARY_STATS_FILTER_OVERSAMPLE_FACTOR,
limit + VOCABULARY_STATS_FILTER_OVERSAMPLE_MIN,
);
const hasExclude = excludePos && excludePos.length > 0;
const placeholders = hasExclude ? excludePos.map(() => '?').join(', ') : '';
const whereClause = hasExclude
@@ -37,8 +76,48 @@ export function getVocabularyStats(
GROUP BY w.id
ORDER BY w.frequency DESC LIMIT ?
`);
const params = hasExclude ? [...excludePos, limit] : [limit];
return stmt.all(...params) as VocabularyStatsRow[];
const params = hasExclude ? [...excludePos, queryLimit] : [queryLimit];
return (stmt.all(...params) as VocabularyStatsRow[])
.filter(isVocabularyStatsRowVisible)
.slice(0, limit);
}
/**
 * Loads every persisted stats-vocabulary exclusion, ordered case-insensitively
 * by headword, then word, then reading.
 */
export function getStatsExcludedWords(db: DatabaseSync): StatsExcludedWordRow[] {
  const selectStmt = db.prepare(`
    SELECT headword, word, reading
    FROM imm_stats_excluded_words
    ORDER BY headword COLLATE NOCASE, word COLLATE NOCASE, reading COLLATE NOCASE
  `);
  return selectStmt.all() as StatsExcludedWordRow[];
}
export function replaceStatsExcludedWords(db: DatabaseSync, words: StatsExcludedWordRow[]): void {
const now = toDbTimestamp(nowMs());
const insertStmt = db.prepare(`
INSERT OR IGNORE INTO imm_stats_excluded_words(
headword,
word,
reading,
CREATED_DATE,
LAST_UPDATE_DATE
)
VALUES (?, ?, ?, ?, ?)
`);
db.exec('BEGIN IMMEDIATE');
try {
db.prepare('DELETE FROM imm_stats_excluded_words').run();
for (const word of words) {
insertStmt.run(word.headword, word.word, word.reading, now, now);
}
db.exec('COMMIT');
} catch (error) {
db.exec('ROLLBACK');
throw error;
}
}
export function getKanjiStats(db: DatabaseSync, limit = 100): KanjiStatsRow[] {