import logging
from urllib.parse import urlparse

import bleach
from yt_dlp import YoutubeDL


def fetch_video_info(video_url):
    """Fetch video metadata with yt-dlp without downloading the media.

    Args:
        video_url: URL passed to ``YoutubeDL.extract_info``.

    Returns:
        A dict with the keys ``video_url``, ``video_name``, ``channel_url``,
        ``channel_name``, ``category``, ``view_count``, ``subscriber_count``,
        ``thumbnail_url`` and ``upload_date``; or ``None`` when extraction
        (or reading the extracted info) raises. The failure is logged.
    """
    ydl_opts = {
        "format": "best",
        "quiet": True,
        "noplaylist": True,
    }

    with YoutubeDL(ydl_opts) as ydl:
        try:
            info = ydl.extract_info(video_url, download=False)

            # Use the first category when there is one, else "Unknown".
            categories = info.get("categories")
            category = categories[0] if categories else "Unknown"

            # NOTE: the .get() defaults below only apply when a key is
            # absent; a key that is present with a None value is passed
            # through unchanged.
            return {
                "video_url": video_url,
                "video_name": info.get("title", "Unknown Title"),
                "channel_url": info.get("channel_url", ""),
                "channel_name": info.get("uploader", "Unknown Channel"),
                "category": category,
                "view_count": info.get("view_count", 0),
                "subscriber_count": info.get("channel_follower_count", 0),
                "thumbnail_url": info.get("thumbnail", ""),
                "upload_date": info.get("upload_date", None),
            }
        except Exception as e:
            # Deliberate best-effort boundary: any failure is reported to
            # the caller as None rather than propagated.
            logging.error("Error fetching info for %s: %s", video_url, e)
            return None


# Helper functions for validation and sanitization


def is_valid_url(url, allowed_domains=None):
    """Return True if *url* is an http(s) URL with a host.

    Args:
        url: Candidate URL. Anything that is not a non-empty ``str`` is
            rejected.
        allowed_domains: Optional iterable of domain names (a single string
            is treated as one domain). When given and non-empty, the URL's
            hostname must equal one of the domains or be a subdomain of it.
            Matching is case-insensitive and is done on the parsed hostname,
            not by substring, so ``youtube.com.evil.net``, ``evilyoutube.com``
            and ``youtube.com@evil.net`` do not match ``youtube.com``.

    Returns:
        bool: ``True`` when the URL passes every check, ``False`` otherwise.
    """
    if not url or not isinstance(url, str):
        return False

    try:
        parsed = urlparse(url)
        # .hostname drops userinfo and port and is already lower-cased.
        hostname = parsed.hostname
    except ValueError:
        # urlparse raises ValueError for malformed netlocs (e.g. bad IPv6).
        return False

    if parsed.scheme not in ("http", "https") or not hostname:
        return False

    if not allowed_domains:
        return True

    if isinstance(allowed_domains, str):
        # Avoid iterating a bare string character by character.
        allowed_domains = [allowed_domains]

    host = hostname.rstrip(".")
    for domain in allowed_domains:
        if not isinstance(domain, str):
            continue
        wanted = domain.strip().lower().strip(".")
        if wanted and (host == wanted or host.endswith("." + wanted)):
            return True
    return False


# (field, maximum length, error message) for the free-text fields.
_MAX_LENGTHS = (
    ("video_name", 500, "Video name too long (max 500 characters)"),
    ("channel_name", 200, "Channel name too long (max 200 characters)"),
    ("category", 100, "Category too long (max 100 characters)"),
)

# (field, error message) for fields that must be convertible with int().
# "subscriber_count" is the key produced by fetch_video_info; "subscriber"
# is still checked so callers that send that key keep getting its error.
_INTEGER_FIELDS = (
    ("view_count", "View count must be a valid integer"),
    ("subscriber_count", "Subscriber count must be a valid integer"),
    ("subscriber", "Subscriber count must be a valid integer"),
)


def validate_video_data(data):
    """Validate the fields of a video-data mapping.

    Args:
        data: Mapping with (some of) the keys produced by
            ``fetch_video_info``.

    Returns:
        dict: Maps each invalid field name to an error message. An empty
        dict means no problems were found.
    """
    errors = {}

    # URL validation: video and channel URLs are required, the thumbnail
    # URL is only checked when one is supplied.
    if not is_valid_url(data.get("video_url")):
        errors["video_url"] = "Invalid video URL format"

    if not is_valid_url(data.get("channel_url")):
        errors["channel_url"] = "Invalid channel URL format"

    thumbnail_url = data.get("thumbnail_url")
    if thumbnail_url and not is_valid_url(thumbnail_url):
        errors["thumbnail_url"] = "Invalid thumbnail URL format"

    # String length validation. Missing or None values are skipped instead
    # of raising TypeError from len(None).
    for field, limit, message in _MAX_LENGTHS:
        value = data.get(field)
        if isinstance(value, str) and len(value) > limit:
            errors[field] = message

    # Type validation for numeric fields (None means "not provided").
    for field, message in _INTEGER_FIELDS:
        value = data.get(field)
        if value is None:
            continue
        try:
            int(value)
        except (ValueError, TypeError, OverflowError):
            # OverflowError covers float("inf").
            errors[field] = message

    # TODO: "upload_date" is not validated yet; add a check once the
    # expected date format is confirmed.

    return errors


def sanitize_video_data(data):
    """Return a copy of *data* with every string value cleaned by bleach.

    Each top-level ``str`` value is passed through
    ``bleach.clean(value, strip=True)``; all other values are copied as-is.
    Nested containers are not traversed.

    Args:
        data: Mapping of field names to values.

    Returns:
        dict: New dict with the same keys and sanitized string values.
    """
    return {
        key: bleach.clean(value, strip=True) if isinstance(value, str) else value
        for key, value in data.items()
    }