#!/usr/bin/env python3
"""
Fetch movie data for Hugo posts based on IMDB ID in frontmatter.

Scans all posts with an `imdb` field and fetches missing data:
- Poster (downloaded locally)
- Runtime
- Year
- Director
- Genres

Usage:
    python scripts/fetch_movie_data.py             # Process all movie posts
    python scripts/fetch_movie_data.py --dry-run   # Show what would be updated
    python scripts/fetch_movie_data.py --force     # Re-fetch even if data exists
    python scripts/fetch_movie_data.py FILE        # Process a single post
                                                   # (relative paths are resolved
                                                   # against the project root)

Note: updated posts have their frontmatter re-serialised with PyYAML, so YAML
comments and custom formatting inside the frontmatter are not preserved.
"""

import argparse
import os
import re
import sys
from pathlib import Path

import requests
import yaml

# Configuration
try:
    from config import TMDB_API_KEY
except ImportError:
    raise SystemExit(
        "Error: scripts/config.py not found. Copy config.example.py to config.py and add your API key."
    )

# Paths
# resolve() makes the paths independent of the current working directory, even
# when __file__ is relative (a relative "script.py" has parent ".", whose
# parent is again ".").
SCRIPT_DIR = Path(__file__).resolve().parent
PROJECT_ROOT = SCRIPT_DIR.parent
CONTENT_DIR = PROJECT_ROOT / "content" / "posts"
IMAGES_DIR = PROJECT_ROOT / "static" / "images" / "posters"

# Regex to split frontmatter from content.
# Only horizontal whitespace is allowed after each `---` fence so that blank
# lines between the frontmatter and the body stay part of the content and
# survive a rewrite. The closing fence may also be the very end of the file.
FRONTMATTER_RE = re.compile(
    r'^---[ \t]*\r?\n(.*?)\r?\n---[ \t]*(?:\r?\n|\Z)',
    re.DOTALL,
)

# Frontmatter fields whose absence triggers a fetch (genres is a bonus field
# that is filled in whenever a post is processed anyway).
TRACKED_FIELDS = ("poster", "runtime", "year", "director")


def find_movie_by_imdb(imdb_id):
    """Find TMDB movie by IMDB ID.

    Returns the first entry of ``movie_results`` from TMDB's ``/find``
    endpoint, or ``None`` when there is no match.
    Raises ``requests.RequestException`` on network or HTTP errors.
    """
    url = f"https://api.themoviedb.org/3/find/{imdb_id}"
    params = {
        "api_key": TMDB_API_KEY,
        "external_source": "imdb_id",
    }
    resp = requests.get(url, params=params, timeout=10)
    resp.raise_for_status()
    results = resp.json().get("movie_results", [])
    return results[0] if results else None


def get_movie_details(tmdb_id):
    """Get full movie details (including credits) from TMDB.

    Raises ``requests.RequestException`` on network or HTTP errors.
    """
    url = f"https://api.themoviedb.org/3/movie/{tmdb_id}"
    params = {
        "api_key": TMDB_API_KEY,
        "append_to_response": "credits",
    }
    resp = requests.get(url, params=params, timeout=10)
    resp.raise_for_status()
    return resp.json()


def get_directors(credits):
    """Extract director names from a TMDB credits dict.

    Crew entries without a name are skipped.
    """
    crew = credits.get("crew") or []
    return [
        person["name"]
        for person in crew
        if person.get("job") == "Director" and person.get("name")
    ]


def slugify(title):
    """Convert title to URL-friendly slug.

    Only ASCII letters, digits, whitespace and hyphens are kept, so the
    result may be an empty string (e.g. for a title written entirely in a
    non-Latin script).
    """
    slug = str(title).lower()
    slug = re.sub(r"[^a-z0-9\s-]", "", slug)
    slug = re.sub(r"[\s_]+", "-", slug)
    slug = re.sub(r"-+", "-", slug)
    return slug.strip("-")


def download_poster(poster_path, filename):
    """Download poster from TMDB into IMAGES_DIR.

    Returns the site-relative URL of the saved image, or ``None`` when
    ``poster_path`` is empty.
    Raises ``requests.RequestException`` on network or HTTP errors.
    """
    if not poster_path:
        return None

    url = f"https://image.tmdb.org/t/p/w500{poster_path}"
    resp = requests.get(url, timeout=10)
    resp.raise_for_status()

    IMAGES_DIR.mkdir(parents=True, exist_ok=True)
    (IMAGES_DIR / filename).write_bytes(resp.content)
    return f"/images/posters/{filename}"


def parse_post(filepath):
    """Parse a markdown post into frontmatter dict and content string.

    Returns ``(None, text)`` when the file has no frontmatter, the
    frontmatter is not valid YAML, or it is not a YAML mapping.
    """
    # Posts are read as UTF-8 regardless of the platform's locale encoding.
    text = filepath.read_text(encoding="utf-8")
    match = FRONTMATTER_RE.match(text)
    if not match:
        return None, text

    try:
        frontmatter = yaml.safe_load(match.group(1))
    except yaml.YAMLError:
        return None, text

    # An empty block loads as None and a scalar/list is not usable either;
    # callers rely on dict methods.
    if not isinstance(frontmatter, dict):
        return None, text

    return frontmatter, text[match.end():]


def write_post(filepath, frontmatter, content):
    """Write frontmatter and content back to markdown file."""
    # default_flow_style=False keeps the YAML readable (block style);
    # allow_unicode=True emits non-ASCII characters verbatim, which is why
    # the file must be written explicitly as UTF-8.
    fm_text = yaml.dump(
        frontmatter,
        default_flow_style=False,
        allow_unicode=True,
        sort_keys=False,
    )
    filepath.write_text(f"---\n{fm_text}---\n{content}", encoding="utf-8")


def _poster_filename(frontmatter, imdb_id):
    """Build the poster file name from the post title, falling back to the IMDB ID."""
    slug = slugify(frontmatter.get("title") or "movie")
    if not slug:
        # The title had no ASCII alphanumerics; avoid writing a bare ".jpg".
        slug = slugify(imdb_id) or "movie"
    return f"{slug}.jpg"


def process_post(filepath, dry_run=False, force=False):
    """Process a single post, fetching missing movie data.

    Args:
        filepath: Path of the markdown post.
        dry_run: Only report which fields would be fetched; no network
            access and no file changes.
        force: Re-fetch every field even when it is already present.

    Returns:
        True if the post was updated (or would be, in dry-run mode),
        False otherwise.

    Raises:
        requests.RequestException: on TMDB network or HTTP errors.
    """
    frontmatter, content = parse_post(filepath)
    if frontmatter is None:
        return False

    imdb_id = frontmatter.get("imdb")
    if not imdb_id:
        return False
    imdb_id = str(imdb_id).strip()

    # Check what's missing
    missing = [field for field in TRACKED_FIELDS if not frontmatter.get(field)]
    if not missing and not force:
        return False

    print(f"\nProcessing: {filepath.name}")
    print(f" IMDB: {imdb_id}")

    if dry_run:
        # Mirror what a real run would touch: everything under --force,
        # otherwise the missing tracked fields plus genres when absent.
        to_fetch = list(TRACKED_FIELDS) if force else list(missing)
        if force or "genres" not in frontmatter:
            to_fetch.append("genres")
        print(f" Would fetch: {', '.join(to_fetch)}")
        return True

    # Find movie on TMDB
    print(" Finding movie on TMDB...")
    movie = find_movie_by_imdb(imdb_id)
    if not movie:
        print(f" ERROR: Movie not found for IMDB ID: {imdb_id}")
        return False

    tmdb_id = movie["id"]
    print(f" Found: {movie.get('title')} (TMDB: {tmdb_id})")

    # Get full details
    print(" Fetching details...")
    details = get_movie_details(tmdb_id)

    updated = False

    # Update poster
    if force or "poster" in missing:
        poster_path = details.get("poster_path")
        if poster_path:
            print(" Downloading poster...")
            poster_url = download_poster(
                poster_path, _poster_filename(frontmatter, imdb_id)
            )
            if poster_url:
                frontmatter["poster"] = poster_url
                print(f" Poster saved: {poster_url}")
                updated = True

    # Update runtime
    if force or "runtime" in missing:
        runtime = details.get("runtime")
        if runtime:
            frontmatter["runtime"] = runtime
            print(f" Runtime: {runtime} minutes")
            updated = True

    # Update year
    if force or "year" in missing:
        release_date = details.get("release_date") or ""
        year = release_date.split("-")[0]
        # Skip silently if TMDB returns something that is not YYYY-MM-DD.
        if year.isdigit():
            frontmatter["year"] = int(year)
            print(f" Year: {year}")
            updated = True

    # Update director
    if force or "director" in missing:
        directors = get_directors(details.get("credits") or {})
        if directors:
            # Store as string if single, list if multiple
            frontmatter["director"] = (
                directors[0] if len(directors) == 1 else directors
            )
            print(f" Director: {', '.join(directors)}")
            updated = True

    # Update genres (bonus)
    if force or "genres" not in frontmatter:
        genres = [
            genre["name"]
            for genre in details.get("genres") or []
            if genre.get("name")
        ]
        if genres:
            frontmatter["genres"] = genres
            updated = True

    if updated:
        write_post(filepath, frontmatter, content)
        print(" Updated!")

    return updated


def main():
    """Command-line entry point.

    Returns the process exit status: 0 on success, 1 if any post failed.
    """
    parser = argparse.ArgumentParser(description="Fetch movie data for Hugo posts")
    parser.add_argument("--dry-run", action="store_true", help="Show what would be updated")
    parser.add_argument("--force", action="store_true", help="Re-fetch even if data exists")
    parser.add_argument("file", nargs="?", help="Specific file to process")
    args = parser.parse_args()

    if args.file:
        filepath = Path(args.file)
        if not filepath.is_absolute():
            filepath = PROJECT_ROOT / filepath
        if not filepath.exists():
            print(f"File not found: {filepath}")
            sys.exit(1)
        files = [filepath]
    else:
        files = list(CONTENT_DIR.glob("**/*.md"))

    print(f"Scanning {len(files)} posts for movie data...")

    updated = 0
    failed = 0
    for filepath in files:
        # One unreachable movie or unreadable file must not abort the batch.
        try:
            if process_post(filepath, dry_run=args.dry_run, force=args.force):
                updated += 1
        except (requests.RequestException, OSError, UnicodeDecodeError) as exc:
            failed += 1
            print(f" ERROR: {filepath.name}: {exc}")

    print(f"\n{'Would update' if args.dry_run else 'Updated'}: {updated} posts")
    if failed:
        print(f"Failed: {failed} posts")
        return 1
    return 0


if __name__ == "__main__":
    sys.exit(main())