#!/usr/bin/env python3
"""
Setup NFR (National Film Registry) data for a specific year.

This script fetches the Library of Congress announcement for a given year,
extracts film titles and descriptions, and generates a Python dictionary
that can be added to new_nfr.py.

Usage:
    python3 scripts/setup_nfr.py 2024
    python3 scripts/setup_nfr.py 2015 --output scripts/nfr_data/nfr_2015.py
    python3 scripts/setup_nfr.py 2023 --no-ollama  # Don't use ollama for extraction

Requirements:
    - requests library
    - access to ollama server (optional, for better extraction)

The script will:
    1. Search for the LOC announcement URL for the given year
    2. Fetch the announcement page
    3. Use ollama (if available) or basic parsing to extract film data
    4. Generate a Python dictionary with film titles, years, and descriptions
    5. Save to a file or print to stdout
"""

import argparse
import json
import os
import re
import sys
from pathlib import Path
from urllib.parse import urljoin  # noqa: F401 -- kept for future URL resolution

import requests

# Configuration
SCRIPT_DIR = Path(__file__).parent
PROJECT_ROOT = SCRIPT_DIR.parent
NFR_DATA_DIR = SCRIPT_DIR / "nfr_data"

# Ollama configuration (overridable via environment variables or CLI flags)
OLLAMA_HOST = os.environ.get("OLLAMA_HOST", "http://192.168.0.109:11434")
OLLAMA_MODEL = os.environ.get("OLLAMA_MODEL", "llama3.2")  # or whatever model you have


def search_for_nfr_announcement(year):
    """
    Search for the LOC NFR announcement URL for a given year.

    Currently this only knows a hard-coded URL for 2024 and otherwise
    prompts the user interactively; a real web search could be added later.

    Args:
        year: NFR induction year (int)

    Returns:
        dict with keys:
            - newsroom_url: Main press release (or None)
            - blog_url: Blog announcement (or None)
            - webcast_url: Webcast page (or None)
    """
    print(f"Searching for {year} NFR announcement...")

    results = {
        "newsroom_url": None,
        "blog_url": None,
        "webcast_url": None,
    }

    # Point the user at the usual announcement locations so they can find
    # the URL manually when we don't have it on file.
    print(f"\nPlease find the LOC announcement URL for {year}.")
    print("\nCommon places to look:")
    print("  - https://newsroom.loc.gov/")
    print("  - https://blogs.loc.gov/now-see-hear/")
    print("  - https://www.loc.gov/programs/national-film-preservation-board/film-registry/")

    # For 2024, we know the URL
    if year == 2024:
        results["newsroom_url"] = "https://newsroom.loc.gov/news/25-films-named-to-national-film-registry-for-preservation/s/55d5285d-916f-4105-b7d4-7fc3ba8664e3"
        results["blog_url"] = "https://blogs.loc.gov/now-see-hear/2024/12/announcing-the-2024-national-film-registry/"
        return results

    # Prompt user for URL
    url = input(f"\nEnter the LOC announcement URL for {year} (or press Enter to skip): ").strip()
    if url:
        results["newsroom_url"] = url

    return results


def fetch_url_content(url):
    """Fetch content from a URL and return the response body as text.

    Raises:
        requests.exceptions.RequestException on network errors, and
        requests.exceptions.HTTPError for non-2xx responses.
    """
    print(f"Fetching {url}...")
    resp = requests.get(url, timeout=30)
    resp.raise_for_status()
    return resp.text


def call_ollama(prompt, model=None, system_prompt=None):
    """
    Call ollama API to process text.

    Args:
        prompt: The user prompt
        model: Model name; defaults to the current OLLAMA_MODEL setting.
               NOTE: resolved at call time, not def time, so that the
               --ollama-model CLI flag (which rebinds the global) actually
               takes effect.
        system_prompt: Optional system prompt

    Returns:
        The model's response text, or None if the request failed.
    """
    # Resolve the default lazily: a def-time default would capture the
    # module-import value and silently ignore --ollama-model.
    if model is None:
        model = OLLAMA_MODEL

    url = f"{OLLAMA_HOST}/api/generate"
    payload = {
        "model": model,
        "prompt": prompt,
        "stream": False,
    }
    if system_prompt:
        payload["system"] = system_prompt

    print(f"Calling ollama at {OLLAMA_HOST} with model {model}...")
    try:
        resp = requests.post(url, json=payload, timeout=300)  # 5 min timeout
        resp.raise_for_status()
        data = resp.json()
        return data.get("response", "")
    except requests.exceptions.RequestException as e:
        print(f"Error calling ollama: {e}")
        return None


def extract_films_with_ollama(html_content, year):
    """
    Use ollama to extract film data from HTML content.

    Args:
        html_content: Raw HTML of the announcement page.
        year: NFR induction year (interpolated into the prompt).

    Returns:
        A list of dicts with keys title, year, description — or None if
        the model call failed or its output was not parseable JSON.
    """
    system_prompt = """You are a helpful assistant that extracts structured data from web pages.
Your task is to extract information about films from National Film Registry announcements.
Output ONLY valid JSON, nothing else. No markdown formatting, no code blocks, just raw JSON."""

    user_prompt = f"""From the following HTML content, extract ALL films that were added to the National Film Registry in {year}.

For each film, extract:
1. The exact title
2. The release year of the film
3. The description/reason why it was selected for preservation

Format your response as a JSON array of objects with this structure:
[
  {{
    "title": "Film Title",
    "year": 1999,
    "description": "The reason it was selected..."
  }}
]

IMPORTANT:
- Extract ALL {year} films, typically 25 films
- Keep descriptions concise but complete
- Use the exact text from the announcement
- Output ONLY the JSON array, no other text
- Do not include markdown code blocks

HTML Content:
{html_content[:50000]}
"""  # Limit to first 50k chars to avoid token limits

    response = call_ollama(user_prompt, system_prompt=system_prompt)
    if not response:
        return None

    # Try to parse JSON from response.  Some models wrap the array in prose
    # or code fences, so grab the outermost [...] span first.
    try:
        json_match = re.search(r'(\[.*\])', response, re.DOTALL)
        if json_match:
            response = json_match.group(1)
        films = json.loads(response)
        return films
    except json.JSONDecodeError as e:
        print(f"Failed to parse JSON from ollama response: {e}")
        print(f"Response was: {response[:500]}...")
        return None


def extract_films_basic(html_content, year):
    """
    Basic extraction without ollama - looks for common patterns.

    This is a fallback method and may not work for all years.

    Args:
        html_content: Raw HTML of the announcement page.
        year: NFR induction year (upper bound for plausible release years).

    Returns:
        List of film dicts, or None when nothing plausible was found.
    """
    print("Using basic extraction (without ollama)...")
    print("Note: This may not capture all details. Consider using --ollama for better results.")

    films = []

    # Heuristic: a capitalized run followed by a year in parentheses,
    # e.g. "Film Title (1999)".  May need adjustment for some pages.
    pattern = r'([A-Z][^(]{3,50})\s*\((\d{4})\)'
    matches = re.findall(pattern, html_content)

    seen_titles = set()
    for title, film_year in matches:
        title = title.strip()
        # Filter out obviously wrong matches (too short, or duplicates)
        if title and len(title) > 3 and title not in seen_titles:
            try:
                y = int(film_year)
                if 1890 <= y <= year:  # Reasonable film year range
                    films.append({
                        "title": title,
                        "year": y,
                        "description": "[Description not extracted - please add manually]"
                    })
                    seen_titles.add(title)
            except ValueError:
                pass

    return films if films else None


def generate_python_dict(films, year):
    """
    Generate Python source code for the NFR dictionary.

    Args:
        films: List of film dicts (title, year, description)
        year: NFR induction year

    Returns:
        String containing Python code defining NFR_<year>.
    """
    output = f'''# {year} National Film Registry inductees with LOC descriptions
# Source: [Add URL here]

NFR_{year} = {{'''

    for film in films:
        # Escape backslashes first, then the quote character actually used
        # around each field: titles are emitted in DOUBLE quotes (so a `"`
        # in a title must be escaped, not `'`), descriptions in single
        # quotes.  Without this, a quote in the LOC text breaks the
        # generated module.
        title = film["title"].replace("\\", "\\\\").replace('"', '\\"')
        desc = film["description"].replace("\\", "\\\\").replace("'", "\\'").replace("\n", " ")
        output += f'''
    "{title}": {{
        "year": {film["year"]},
        "description": '{desc}'
    }},'''

    output += "\n}\n"
    return output


def save_nfr_data(films, year, output_path=None):
    """
    Save NFR data to a file.

    Args:
        films: List of film dicts
        year: NFR induction year
        output_path: Optional path to save to (default: nfr_data/nfr_YEAR.py)

    Returns:
        Path the generated code was written to.
    """
    if output_path is None:
        NFR_DATA_DIR.mkdir(exist_ok=True)
        output_path = NFR_DATA_DIR / f"nfr_{year}.py"
    else:
        output_path = Path(output_path)

    code = generate_python_dict(films, year)
    # Explicit UTF-8: descriptions may contain non-ASCII characters and the
    # platform default encoding is not guaranteed to handle them.
    output_path.write_text(code, encoding="utf-8")

    print(f"\n✓ Saved to {output_path}")
    print("\nTo use this data:")
    print(f"  1. Review and edit {output_path} if needed")
    print(f"  2. Copy the NFR_{year} dictionary into scripts/new_nfr.py")
    print("  3. Update the script to handle multiple years")

    return output_path


def main():
    """CLI entry point: fetch, extract, preview, and save NFR data."""
    # Must precede every use of these names in this scope: referencing a
    # global before the `global` statement is a SyntaxError in CPython.
    global OLLAMA_HOST, OLLAMA_MODEL

    parser = argparse.ArgumentParser(
        description="Setup NFR data for a specific year"
    )
    parser.add_argument("year", type=int, help="NFR induction year (e.g., 2024)")
    parser.add_argument(
        "--url",
        help="Direct URL to LOC announcement (skip search)"
    )
    parser.add_argument(
        "--output",
        help="Output file path (default: scripts/nfr_data/nfr_YEAR.py)"
    )
    parser.add_argument(
        "--no-ollama",
        action="store_true",
        help="Don't use ollama for extraction (use basic parsing)"
    )
    parser.add_argument(
        "--ollama-host",
        default=OLLAMA_HOST,
        help=f"Ollama server URL (default: {OLLAMA_HOST})"
    )
    parser.add_argument(
        "--ollama-model",
        default=OLLAMA_MODEL,
        help=f"Ollama model to use (default: {OLLAMA_MODEL})"
    )
    args = parser.parse_args()

    # Update ollama config from args
    OLLAMA_HOST = args.ollama_host
    OLLAMA_MODEL = args.ollama_model

    print(f"\n{'=' * 60}")
    print(f"Setting up NFR data for {args.year}")
    print(f"{'=' * 60}\n")

    # Get announcement URL
    if args.url:
        urls = {"newsroom_url": args.url}
    else:
        urls = search_for_nfr_announcement(args.year)

    if not urls.get("newsroom_url"):
        print("\nError: No announcement URL found.")
        print("Please provide a URL with --url")
        sys.exit(1)

    # Fetch content
    try:
        html_content = fetch_url_content(urls["newsroom_url"])
    except Exception as e:
        print(f"Error fetching URL: {e}")
        sys.exit(1)

    # Extract films: prefer ollama, fall back to the regex heuristic
    films = None
    if not args.no_ollama:
        try:
            films = extract_films_with_ollama(html_content, args.year)
        except Exception as e:
            print(f"Error using ollama: {e}")
            print("Falling back to basic extraction...")

    if not films:
        films = extract_films_basic(html_content, args.year)

    if not films:
        print("\nError: Could not extract films from announcement.")
        print("Try:")
        print("  1. Running without --no-ollama for better extraction")
        print("  2. Manually creating the dictionary")
        sys.exit(1)

    print(f"\n✓ Extracted {len(films)} films")

    # Show preview of the first few films before committing to disk
    print("\nPreview of extracted films:")
    for i, film in enumerate(films[:5], 1):
        print(f"  {i}. {film['title']} ({film['year']})")
        if len(film['description']) > 100:
            print(f"     {film['description'][:100]}...")
        else:
            print(f"     {film['description']}")
    if len(films) > 5:
        print(f"  ... and {len(films) - 5} more")

    # Confirm (default is yes; only an explicit 'n' cancels)
    confirm = input("\nSave this data? (Y/n): ").strip().lower()
    if confirm == 'n':
        print("Cancelled")
        sys.exit(0)

    # Save
    output_path = save_nfr_data(films, args.year, args.output)
    print("\n✓ Done!")


if __name__ == "__main__":
    main()