import * as fs from 'node:fs/promises';
import * as path from 'node:path';

/** Options accepted by {@link createFrequencyDictionaryLookup}. */
export interface FrequencyDictionaryLookupOptions {
  /** Candidate dictionary directories, tried in order; the first one that yields entries wins. */
  searchPaths: string[];
  /** Sink for diagnostic messages (skipped files, parse failures, load summary). */
  log: (message: string) => void;
}

type FrequencyDictionaryMode = 'occurrence-based' | 'rank-based';

interface FrequencyDictionaryEntry {
  rank: number;
  term: string;
}

/** Matches the bank file names that are scanned inside a dictionary directory. */
const FREQUENCY_BANK_FILE_GLOB = /^term_meta_bank_.*\.json$/;
/** Lookup returned when no dictionary could be loaded; always answers `null`. */
const NOOP_LOOKUP = (): null => null;
/** Number of processed entries between voluntary yields to the event loop. */
const ENTRY_YIELD_INTERVAL = 5000;

/** Returns true when `error` is an object whose `code` property equals `code`. */
function isErrorCode(error: unknown, code: string): boolean {
  return Boolean(
    error && typeof error === 'object' && (error as { code?: unknown }).code === code,
  );
}

/** Resolves on the next `setImmediate` tick so long-running loops do not starve other work. */
async function yieldToEventLoop(): Promise<void> {
  await new Promise<void>((resolve) => {
    setImmediate(resolve);
  });
}

/** Canonical key form for terms: surrounding whitespace removed, lower-cased. */
function normalizeFrequencyTerm(value: string): string {
  return value.trim().toLowerCase();
}

/**
 * Reads `index.json` from a dictionary directory and extracts its title and frequency mode.
 *
 * A missing index file is not reported; any other read failure and invalid JSON are logged.
 * In every failure case both fields come back as `null`.
 *
 * @param dictionaryPath Directory expected to contain `index.json`.
 * @param log Diagnostic sink.
 * @returns The trimmed non-empty title (or `null`) and the recognized frequency mode (or `null`).
 */
async function readDictionaryMetadata(
  dictionaryPath: string,
  log: (message: string) => void,
): Promise<{ title: string | null; frequencyMode: FrequencyDictionaryMode | null }> {
  const indexPath = path.join(dictionaryPath, 'index.json');

  let rawText: string;
  try {
    rawText = await fs.readFile(indexPath, 'utf-8');
  } catch (error) {
    if (isErrorCode(error, 'ENOENT')) {
      return { title: null, frequencyMode: null };
    }
    log(`Failed to read frequency dictionary index ${indexPath}: ${String(error)}`);
    return { title: null, frequencyMode: null };
  }

  let rawIndex: unknown;
  try {
    rawIndex = JSON.parse(rawText) as unknown;
  } catch {
    log(`Failed to parse frequency dictionary index as JSON: ${indexPath}`);
    return { title: null, frequencyMode: null };
  }

  if (!rawIndex || typeof rawIndex !== 'object') {
    return { title: null, frequencyMode: null };
  }

  const titleRaw = (rawIndex as { title?: unknown }).title;
  const frequencyModeRaw = (rawIndex as { frequencyMode?: unknown }).frequencyMode;

  return {
    title: typeof titleRaw === 'string' && titleRaw.trim().length > 0 ? titleRaw.trim() : null,
    frequencyMode:
      frequencyModeRaw === 'occurrence-based' || frequencyModeRaw === 'rank-based'
        ? frequencyModeRaw
        : null,
  };
}

/**
 * Floors a number and returns it only when the result is a finite integer greater than zero.
 *
 * Shared by the numeric and string parsing paths so that both reject values in (0, 1),
 * which would otherwise floor to a rank of 0.
 */
function floorToPositiveInteger(value: number): number | null {
  if (!Number.isFinite(value) || value <= 0) {
    return null;
  }
  const floored = Math.floor(value);
  return Number.isFinite(floored) && floored > 0 ? floored : null;
}

/**
 * Extracts the first numeric token found anywhere in `value` and returns it floored.
 *
 * @returns A positive integer, or `null` when no number is present or it floors to <= 0.
 */
function parsePositiveFrequencyString(value: string): number | null {
  const trimmed = value.trim();
  if (!trimmed) {
    return null;
  }

  // Unanchored on purpose: picks up the first number even when surrounded by other text.
  const numericMatch = trimmed.match(/[+-]?(\d+(\.\d*)?|\.\d+)([eE][+-]?\d+)?/)?.[0];
  if (!numericMatch) {
    return null;
  }

  return floorToPositiveInteger(Number.parseFloat(numericMatch));
}

/**
 * Converts a number or numeric string into a positive integer frequency.
 *
 * @returns The floored value when it is a positive integer; `null` for any other input,
 * including numbers between 0 and 1 (which floor to 0).
 */
function parsePositiveFrequencyNumber(value: unknown): number | null {
  if (typeof value === 'number') {
    return floorToPositiveInteger(value);
  }
  if (typeof value === 'string') {
    return parsePositiveFrequencyString(value);
  }
  return null;
}

/**
 * Parses a display value. Strings must start (after trimming) with decimal digits and only
 * that leading digit run is used; non-string values fall back to
 * {@link parsePositiveFrequencyNumber}.
 */
function parseDisplayFrequencyNumber(value: unknown): number | null {
  if (typeof value === 'string') {
    const leadingDigits = value.trim().match(/^\d+/)?.[0];
    if (!leadingDigits) {
      return null;
    }
    const parsed = Number.parseInt(leadingDigits, 10);
    return Number.isFinite(parsed) && parsed > 0 ? parsed : null;
  }
  return parsePositiveFrequencyNumber(value);
}

/**
 * Reads `meta.frequency.displayValue` / `meta.frequency.value` and returns the parsed
 * display value when available, otherwise the parsed raw value.
 *
 * @returns A positive integer, or `null` when `meta.frequency` is not an object or neither
 * field parses.
 */
function extractFrequencyDisplayValue(meta: unknown): number | null {
  if (!meta || typeof meta !== 'object') return null;

  const frequency = (meta as { frequency?: unknown }).frequency;
  if (!frequency || typeof frequency !== 'object') return null;

  const parsedDisplayValue = parseDisplayFrequencyNumber(
    (frequency as { displayValue?: unknown }).displayValue,
  );
  if (parsedDisplayValue !== null) {
    return parsedDisplayValue;
  }

  return parsePositiveFrequencyNumber((frequency as { value?: unknown }).value);
}

/**
 * Validates one raw bank entry (an array of at least three items: term, id, meta) and
 * converts it to a normalized `{ term, rank }` pair.
 *
 * @returns `null` when the entry is malformed, has no usable frequency, or its term is blank.
 */
function asFrequencyDictionaryEntry(entry: unknown): FrequencyDictionaryEntry | null {
  if (!Array.isArray(entry) || entry.length < 3) {
    return null;
  }

  const [term, _id, meta] = entry as [unknown, unknown, unknown];
  if (typeof term !== 'string') {
    return null;
  }

  const frequency = extractFrequencyDisplayValue(meta);
  if (frequency === null) return null;

  const normalizedTerm = normalizeFrequencyTerm(term);
  if (!normalizedTerm) return null;

  return {
    term: normalizedTerm,
    rank: frequency,
  };
}

/**
 * Merges parsed bank entries into `terms`, keeping the numerically lowest rank per term.
 * Yields to the event loop every {@link ENTRY_YIELD_INTERVAL} processed entries.
 *
 * @param rawEntries Parsed JSON content of a bank file; anything but an array is ignored.
 * @param terms Map of normalized term to rank, mutated in place.
 * @returns How many valid entries were dropped because an equal or lower rank already existed.
 */
async function addEntriesToMap(
  rawEntries: unknown,
  terms: Map<string, number>,
): Promise<{ duplicateCount: number }> {
  if (!Array.isArray(rawEntries)) {
    return { duplicateCount: 0 };
  }

  let duplicateCount = 0;
  let processedCount = 0;

  for (const rawEntry of rawEntries) {
    processedCount += 1;
    if (processedCount % ENTRY_YIELD_INTERVAL === 0) {
      await yieldToEventLoop();
    }

    const entry = asFrequencyDictionaryEntry(rawEntry);
    if (!entry) {
      continue;
    }

    const currentRank = terms.get(entry.term);
    if (currentRank === undefined || entry.rank < currentRank) {
      terms.set(entry.term, entry.rank);
      continue;
    }

    duplicateCount += 1;
  }

  return { duplicateCount };
}

/**
 * Loads every `term_meta_bank_*.json` file in a directory into a term-to-rank map.
 *
 * Dictionaries whose index declares `occurrence-based` mode are skipped entirely. Bank files
 * are processed in sorted name order; unreadable or unparsable files are logged and skipped.
 *
 * @param dictionaryPath Directory to scan.
 * @param log Diagnostic sink.
 * @returns The collected map; empty when nothing usable was found.
 */
async function collectDictionaryFromPath(
  dictionaryPath: string,
  log: (message: string) => void,
): Promise<Map<string, number>> {
  const terms = new Map<string, number>();

  const metadata = await readDictionaryMetadata(dictionaryPath, log);
  if (metadata.frequencyMode === 'occurrence-based') {
    log(
      `Skipping occurrence-based frequency dictionary ${
        metadata.title ?? dictionaryPath
      }; SubMiner frequency tags require rank-based values.`,
    );
    return terms;
  }

  let fileNames: string[];
  try {
    fileNames = await fs.readdir(dictionaryPath);
  } catch (error) {
    log(`Failed to read frequency dictionary directory ${dictionaryPath}: ${String(error)}`);
    return terms;
  }

  const bankFiles = fileNames.filter((name) => FREQUENCY_BANK_FILE_GLOB.test(name)).sort();
  if (bankFiles.length === 0) {
    return terms;
  }

  for (const bankFile of bankFiles) {
    const bankPath = path.join(dictionaryPath, bankFile);

    let rawText: string;
    try {
      rawText = await fs.readFile(bankPath, 'utf-8');
    } catch {
      log(`Failed to read frequency dictionary file ${bankPath}`);
      continue;
    }

    let rawEntries: unknown;
    try {
      // JSON.parse is synchronous; yield first so pending work can run before it.
      await yieldToEventLoop();
      rawEntries = JSON.parse(rawText) as unknown;
    } catch {
      log(`Failed to parse frequency dictionary file as JSON: ${bankPath}`);
      continue;
    }

    const beforeSize = terms.size;
    const { duplicateCount } = await addEntriesToMap(rawEntries, terms);

    if (duplicateCount > 0) {
      log(
        `Frequency dictionary ignored ${duplicateCount} duplicate term entr${
          duplicateCount === 1 ? 'y' : 'ies'
        } in ${bankPath} (kept strongest rank per term).`,
      );
    }

    if (terms.size === beforeSize) {
      log(`Frequency dictionary file contained no extractable entries: ${bankPath}`);
    }
  }

  return terms;
}

/**
 * Builds a term-to-rank lookup from the first search path that yields any entries.
 *
 * Paths that do not exist or are not directories are skipped (inspection errors other than
 * `ENOENT` are logged). When no path produces entries, a lookup that always returns `null`
 * is returned instead.
 *
 * @param options Search paths and logger.
 * @returns A function mapping a term (trimmed and lower-cased before lookup) to its rank,
 * or `null` when the term is blank or unknown.
 */
export async function createFrequencyDictionaryLookup(
  options: FrequencyDictionaryLookupOptions,
): Promise<(term: string) => number | null> {
  const attemptedPaths: string[] = [];
  let foundDictionaryPathCount = 0;

  for (const dictionaryPath of options.searchPaths) {
    attemptedPaths.push(dictionaryPath);

    let isDirectory = false;
    try {
      isDirectory = (await fs.stat(dictionaryPath)).isDirectory();
    } catch (error) {
      if (isErrorCode(error, 'ENOENT')) {
        continue;
      }
      options.log(
        `Failed to inspect frequency dictionary path ${dictionaryPath}: ${String(error)}`,
      );
      continue;
    }

    if (!isDirectory) {
      continue;
    }

    foundDictionaryPathCount += 1;

    const terms = await collectDictionaryFromPath(dictionaryPath, options.log);
    if (terms.size > 0) {
      options.log(`Frequency dictionary loaded from ${dictionaryPath} (${terms.size} entries)`);
      return (term: string): number | null => {
        const normalized = normalizeFrequencyTerm(term);
        if (!normalized) return null;
        return terms.get(normalized) ?? null;
      };
    }

    options.log(
      `Frequency dictionary directory exists but contains no readable term_meta_bank_*.json files: ${dictionaryPath}`,
    );
  }

  options.log(
    `Frequency dictionary not found. Searched ${attemptedPaths.length} candidate path(s): ${attemptedPaths.join(', ')}`,
  );
  if (foundDictionaryPathCount > 0) {
    options.log(
      'Frequency dictionary directories found, but no usable term_meta_bank_*.json files were loaded.',
    );
  }

  return NOOP_LOOKUP;
}