#!/usr/bin/env python
"""Download "Nihongo con Teppei for Beginners" episodes (audio + subtitles).

Audio URLs are scraped from the episode page with headless Chrome via
Selenium; subtitle (.vtt) URLs are built directly from the episode number.
"""

import logging
import os
import time
from argparse import ArgumentParser, RawDescriptionHelpFormatter
from pathlib import Path
from typing import Optional, Tuple

import requests
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)

# Constants
AUDIO_BASE_URL = (
    "https://www.heypera.com/listen/nihongo-con-teppei-for-beginners/{}/next"
)
SUB_BASE_URL = "https://storage.googleapis.com/pera-transcripts/nihongo-con-teppei-for-beginners/transcripts/{}.vtt"
DEFAULT_DELAY = 5  # seconds between requests
DEFAULT_TIMEOUT = 30  # seconds for HTTP requests
MAX_RETRIES = 3


class TeppeiDownloader:
    """Fetch episode audio and subtitle files, with retries and verification."""

    def __init__(self, delay: float = DEFAULT_DELAY, timeout: int = DEFAULT_TIMEOUT):
        """Create a downloader.

        Args:
            delay: Seconds to sleep between requests and between retries.
            timeout: Timeout in seconds passed to every HTTP GET.
        """
        self.delay = delay
        self.timeout = timeout
        self.session = requests.Session()
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36'
        })

    def get_audio_url(self, episode_num: int) -> Optional[str]:
        """Get audio URL using Selenium scraping.

        Opens the episode page in headless Chrome, waits up to 15 seconds
        for an ``<audio>`` element and returns its ``src`` attribute.

        Returns:
            The audio URL, or ``None`` if it is missing or any error occurs
            (the error is logged, never raised).
        """
        chrome_options = Options()
        chrome_options.add_argument("--headless")
        chrome_options.add_argument("--disable-gpu")
        chrome_options.add_argument("--no-sandbox")
        chrome_options.add_argument("--disable-dev-shm-usage")
        chrome_options.add_argument("--window-size=1920,1080")

        driver = None
        try:
            driver = webdriver.Chrome(options=chrome_options)
            driver.get(AUDIO_BASE_URL.format(episode_num))

            # Wait for the audio element to be present
            audio_element = WebDriverWait(driver, 15).until(
                EC.presence_of_element_located((By.TAG_NAME, "audio"))
            )
            audio_url = audio_element.get_attribute("src")

            if audio_url:
                logger.info(f"Found audio URL for episode {episode_num}")
                return audio_url

            logger.error(f"No audio URL found for episode {episode_num}")
            return None
        except Exception as e:
            # Boundary around the browser session: any driver failure is
            # reported as "no URL" so callers only deal with Optional[str].
            logger.error(f"Error getting audio URL for episode {episode_num}: {e}")
            return None
        finally:
            # Always shut the browser down, even on failure.
            if driver:
                driver.quit()

    def get_sub_url(self, episode_num: int) -> str:
        """Get subtitle URL (direct URL construction)."""
        return SUB_BASE_URL.format(episode_num)

    def verify_download(self, filename: str, expected_size: Optional[int] = None) -> bool:
        """Verify that a downloaded file exists and has reasonable content.

        Args:
            filename: Path of the file to check.
            expected_size: Expected size in bytes. When given (and non-zero),
                the file fails verification if its size differs from this
                value by more than 10%.

        Returns:
            ``True`` if the file exists, is non-empty, (for ``.vtt`` files)
            holds at least 10 characters of UTF-8 text, and matches
            ``expected_size`` within tolerance; ``False`` otherwise.
        """
        if not os.path.exists(filename):
            logger.warning(f"File verification failed: {filename} does not exist")
            return False

        try:
            file_size = os.path.getsize(filename)
            if file_size == 0:
                logger.warning(f"File verification failed: {filename} is empty (0 bytes)")
                return False

            # For subtitle files, check if they have at least some basic content
            if filename.endswith('.vtt'):
                with open(filename, 'r', encoding='utf-8') as f:
                    content = f.read().strip()
                if len(content) < 10:  # Very basic check for minimal content
                    logger.warning(f"File verification failed: {filename} appears to have insufficient content")
                    return False

            # If expected size is provided, check if it's reasonable
            if expected_size and abs(file_size - expected_size) > expected_size * 0.1:
                logger.warning(f"File verification failed: {filename} size ({file_size} bytes) differs significantly from expected ({expected_size} bytes)")
                # A size mismatch is a failure; previously this fell through
                # and reported the file as verified.
                return False

            logger.debug(f"File verification passed: {filename} ({file_size} bytes)")
            return True
        except (OSError, UnicodeDecodeError) as e:
            # UnicodeDecodeError: a .vtt file that is not valid UTF-8 is
            # treated as a failed verification rather than a crash.
            logger.warning(f"File verification failed: {filename} - {e}")
            return False

    @staticmethod
    def _remove_quietly(filename: str) -> None:
        """Best-effort removal of a corrupted or partial file."""
        try:
            os.remove(filename)
        except OSError:
            pass  # Ignore if file doesn't exist or can't be removed

    def download_file(self, url: str, filename: str, retries: int = MAX_RETRIES) -> bool:
        """Download a file with retry logic and proper error handling.

        Args:
            url: URL to fetch with the shared session.
            filename: Destination path; overwritten if it exists.
            retries: Maximum number of attempts.

        Returns:
            ``True`` once a download has been written and verified; ``False``
            if every attempt failed or the file could not be written locally.
        """
        for attempt in range(retries):
            try:
                logger.info(f"Downloading {filename} (attempt {attempt + 1}/{retries})")
                response = self.session.get(url, timeout=self.timeout)

                if response.status_code == 200:
                    try:
                        with open(filename, "wb") as file:
                            file.write(response.content)
                    except OSError as e:
                        # A local filesystem problem will not be cured by
                        # hitting the server again, so give up immediately.
                        logger.error(f"Could not write {filename}: {e}")
                        self._remove_quietly(filename)
                        return False

                    # Verify the download was successful
                    if self.verify_download(filename, len(response.content)):
                        logger.info(f"Successfully downloaded and verified {filename}")
                        return True

                    logger.warning(f"Download verification failed for {filename}, retrying...")
                    # Remove the corrupted/incomplete file
                    self._remove_quietly(filename)
                else:
                    logger.warning(f"HTTP {response.status_code} for {filename}")
            except requests.exceptions.RequestException as e:
                logger.warning(f"Download attempt {attempt + 1} failed for {filename}: {e}")

            # Back off before the next attempt, but not after the last one.
            if attempt < retries - 1:
                time.sleep(self.delay)

        logger.error(f"Failed to download {filename} after {retries} attempts")
        return False

    def file_exists(self, filepath: str) -> bool:
        """Check if file exists and has content."""
        if not os.path.exists(filepath):
            return False
        # Check if file has content (not empty)
        try:
            return os.path.getsize(filepath) > 0
        except OSError:
            return False

    def download_episode(self, episode_num: int, output_dir: str, force: bool = False) -> Tuple[bool, bool]:
        """Download both audio and subtitle files for an episode.

        Args:
            episode_num: Episode number, used in URLs and output file names.
            output_dir: Directory to write into; created if missing.
            force: Re-download even if a non-empty file already exists.

        Returns:
            ``(audio_success, sub_success)``. A file that already exists and
            is skipped counts as a success.
        """
        output_path = Path(output_dir)
        output_path.mkdir(parents=True, exist_ok=True)

        audio_filename = output_path / f"Nihongo-Con-Teppei-E{episode_num:02d}.mp3"
        sub_filename = output_path / f"Nihongo-Con-Teppei-E{episode_num:02d}.vtt"

        audio_success = True
        sub_success = True

        # Download audio file
        if force or not self.file_exists(str(audio_filename)):
            audio_url = self.get_audio_url(episode_num)
            if audio_url:
                audio_success = self.download_file(audio_url, str(audio_filename))
            else:
                audio_success = False
                logger.error(f"Could not get audio URL for episode {episode_num}")

            # Add delay between audio and subtitle downloads
            if audio_success:
                time.sleep(self.delay)
        else:
            logger.info(f"Audio file already exists: {audio_filename}")

        # Download subtitle file
        if force or not self.file_exists(str(sub_filename)):
            sub_url = self.get_sub_url(episode_num)
            sub_success = self.download_file(sub_url, str(sub_filename))
        else:
            logger.info(f"Subtitle file already exists: {sub_filename}")

        return audio_success, sub_success

    def download_range(self, start: int, end: int, output_dir: str, force: bool = False) -> None:
        """Download a range of episodes with progress tracking.

        Args:
            start: First episode number (inclusive).
            end: Last episode number (inclusive).
            output_dir: Directory to write into.
            force: Re-download even if files already exist.
        """
        if start > end:
            logger.error("Start episode must be less than or equal to end episode")
            return

        total_episodes = end - start + 1
        successful_downloads = 0

        logger.info(f"Starting download of episodes {start} to {end} ({total_episodes} episodes)")

        for episode in range(start, end + 1):
            logger.info(f"Processing episode {episode}/{end} ({episode-start+1}/{total_episodes})")

            audio_success, sub_success = self.download_episode(episode, output_dir, force)
            # An episode only counts when both of its files succeeded.
            if audio_success and sub_success:
                successful_downloads += 1

            # Add delay between episodes to be respectful to the server
            if episode < end:  # Don't delay after the last episode
                logger.info(f"Waiting {self.delay} seconds before next episode...")
                time.sleep(self.delay)

        logger.info(f"Download complete! Successfully downloaded {successful_downloads}/{total_episodes} episodes")


def parse_args():
    """Build the command-line parser and return the parsed arguments."""
    parser = ArgumentParser(
        description="Download Nihongo Con Teppei episodes with audio and subtitles",
        formatter_class=RawDescriptionHelpFormatter,
        epilog="""
Examples:
  %(prog)s 11 --download                    # Download episode 11
  %(prog)s --start 11 --end 15 --download   # Download episodes 11-15
  %(prog)s 11 --download --force            # Force re-download episode 11
  %(prog)s --start 1 --end 20 --download --output ./teppei_episodes
        """
    )

    # Single episode mode
    parser.add_argument(
        "episode_num",
        type=int,
        nargs='?',
        help="Episode number to download (for single episode mode)"
    )

    # Range mode
    parser.add_argument(
        "--start",
        type=int,
        help="Starting episode number for range download"
    )
    parser.add_argument(
        "--end",
        type=int,
        help="Ending episode number for range download"
    )

    # Download options
    parser.add_argument(
        "-d", "--download",
        action="store_true",
        help="Download the files (if not specified, only show URLs)"
    )
    parser.add_argument(
        "-o", "--output",
        default=".",
        help="Output directory (default: current directory)"
    )
    parser.add_argument(
        "--force",
        action="store_true",
        help="Force re-download even if files already exist"
    )
    parser.add_argument(
        "--delay",
        type=float,
        default=DEFAULT_DELAY,
        help=f"Delay between requests in seconds (default: {DEFAULT_DELAY})"
    )
    parser.add_argument(
        "--timeout",
        type=int,
        default=DEFAULT_TIMEOUT,
        help=f"HTTP request timeout in seconds (default: {DEFAULT_TIMEOUT})"
    )

    return parser.parse_args()


def main():
    """Validate the CLI arguments and run single-episode or range mode."""
    args = parse_args()

    range_mode = args.start is not None and args.end is not None

    # Validate arguments
    if range_mode:
        if args.start < 1 or args.end < 1:
            logger.error("Episode numbers must be greater than 0")
            return
        if args.start > args.end:
            logger.error("Start episode must be less than or equal to end episode")
            return
    elif args.episode_num is not None:
        # Single episode mode
        if args.episode_num < 1:
            logger.error("Episode number must be greater than 0")
            return
    else:
        logger.error("Must specify either a single episode number or a range (--start and --end)")
        return

    # Create downloader instance
    downloader = TeppeiDownloader(delay=args.delay, timeout=args.timeout)

    if range_mode:
        # Range download mode.
        # NOTE: range mode always downloads; --download is only consulted
        # in single-episode mode below.
        downloader.download_range(args.start, args.end, args.output, args.force)
        return

    # Single episode mode
    episode = args.episode_num

    if not args.download:
        # Just show URLs
        audio_url = downloader.get_audio_url(episode)
        sub_url = downloader.get_sub_url(episode)

        if audio_url:
            print(f"Audio URL: {audio_url}")
        else:
            print("Could not retrieve audio URL")
        print(f"Subtitle URL: {sub_url}")
    else:
        # Download files
        audio_success, sub_success = downloader.download_episode(episode, args.output, args.force)
        if audio_success and sub_success:
            logger.info(f"Successfully downloaded episode {episode}")
        else:
            logger.error(f"Failed to download episode {episode}")


if __name__ == "__main__":
    main()