# File: marcus-web/scripts/setup_nfr.py (398 lines, 12 KiB, Python, executable)
#!/usr/bin/env python3
"""
Setup NFR (National Film Registry) data for a specific year.
This script fetches the Library of Congress announcement for a given year,
extracts film titles and descriptions, and generates a Python dictionary
that can be added to new_nfr.py.
Usage:
python3 scripts/setup_nfr.py 2024
python3 scripts/setup_nfr.py 2015 --output scripts/nfr_data/nfr_2015.py
python3 scripts/setup_nfr.py 2023 --no-ollama # Don't use ollama for extraction
Requirements:
- requests library
- access to ollama server (optional, for better extraction)
The script will:
1. Search for the LOC announcement URL for the given year
2. Fetch the announcement page
3. Use ollama (if available) or basic parsing to extract film data
4. Generate a Python dictionary with film titles, years, and descriptions
5. Save to a file or print to stdout
"""
import argparse
import json
import os
import re
import sys
from pathlib import Path
from urllib.parse import urljoin
import requests
# Configuration -- paths resolved relative to this script's own location.
SCRIPT_DIR = Path(__file__).parent
PROJECT_ROOT = SCRIPT_DIR.parent  # repo root (scripts/ is one level below it)
NFR_DATA_DIR = SCRIPT_DIR / "nfr_data"  # default output dir for generated nfr_<year>.py files

# Ollama configuration -- both overridable via environment variables or CLI flags.
# NOTE(review): default host is a LAN address; confirm it matches your setup.
OLLAMA_HOST = os.environ.get("OLLAMA_HOST", "http://192.168.0.109:11434")
OLLAMA_MODEL = os.environ.get("OLLAMA_MODEL", "llama3.2") # or whatever model you have
def search_for_nfr_announcement(year):
    """
    Locate the LOC NFR announcement URL(s) for a given year.

    Only 2024 is known ahead of time; for any other year the user is
    prompted interactively for a URL (Enter to skip).

    Args:
        year: NFR induction year (int).

    Returns:
        dict with keys "newsroom_url", "blog_url", "webcast_url";
        values are URL strings or None.
    """
    print(f"Searching for {year} NFR announcement...")

    results = {
        "newsroom_url": None,
        "blog_url": None,
        "webcast_url": None,
    }

    # No automated web search is implemented yet (the previous version
    # built unused search-query/URL lists -- removed as dead code).
    # Point the user at the usual LOC announcement locations instead.
    print(f"\nPlease find the LOC announcement URL for {year}.")
    print(f"\nCommon places to look:")
    print(f" - https://newsroom.loc.gov/")
    print(f" - https://blogs.loc.gov/now-see-hear/")
    print(f" - https://www.loc.gov/programs/national-film-preservation-board/film-registry/")

    # Hard-coded URLs for years we already know.
    if year == 2024:
        results["newsroom_url"] = "https://newsroom.loc.gov/news/25-films-named-to-national-film-registry-for-preservation/s/55d5285d-916f-4105-b7d4-7fc3ba8664e3"
        results["blog_url"] = "https://blogs.loc.gov/now-see-hear/2024/12/announcing-the-2024-national-film-registry/"
        return results

    # Fall back to asking the user directly.
    url = input(f"\nEnter the LOC announcement URL for {year} (or press Enter to skip): ").strip()
    if url:
        results["newsroom_url"] = url
    return results
def fetch_url_content(url):
    """Download *url* and return the response body as text.

    Raises an HTTP error (via raise_for_status) for non-2xx responses.
    """
    print(f"Fetching {url}...")
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    return response.text
def call_ollama(prompt, model=OLLAMA_MODEL, system_prompt=None):
    """
    Send a prompt to the ollama generate API and return the response text.

    Args:
        prompt: The user prompt.
        model: Model name (defaults to the module-level OLLAMA_MODEL).
        system_prompt: Optional system prompt to include in the payload.

    Returns:
        The model's response text, or None when the request fails.
    """
    endpoint = f"{OLLAMA_HOST}/api/generate"
    request_body = {
        "model": model,
        "prompt": prompt,
        "stream": False,
    }
    if system_prompt:
        request_body["system"] = system_prompt

    print(f"Calling ollama at {OLLAMA_HOST} with model {model}...")
    try:
        reply = requests.post(endpoint, json=request_body, timeout=300)  # 5 min timeout
        reply.raise_for_status()
        payload = reply.json()
    except requests.exceptions.RequestException as e:
        print(f"Error calling ollama: {e}")
        return None
    return payload.get("response", "")
def extract_films_with_ollama(html_content, year):
    """
    Use ollama to extract film data from HTML content.

    Returns a list of dicts with keys title/year/description, or None
    when the model call fails or its output cannot be parsed as JSON.
    """
    system_prompt = """You are a helpful assistant that extracts structured data from web pages.
Your task is to extract information about films from National Film Registry announcements.
Output ONLY valid JSON, nothing else. No markdown formatting, no code blocks, just raw JSON."""

    user_prompt = f"""From the following HTML content, extract ALL films that were added to the National Film Registry in {year}.
For each film, extract:
1. The exact title
2. The release year of the film
3. The description/reason why it was selected for preservation
Format your response as a JSON array of objects with this structure:
[
{{
"title": "Film Title",
"year": 1999,
"description": "The reason it was selected..."
}}
]
IMPORTANT:
- Extract ALL {year} films, typically 25 films
- Keep descriptions concise but complete
- Use the exact text from the announcement
- Output ONLY the JSON array, no other text
- Do not include markdown code blocks
HTML Content:
{html_content[:50000]}
"""  # limit to the first 50k chars to stay within token limits

    raw = call_ollama(user_prompt, system_prompt=system_prompt)
    if not raw:
        return None

    # Models sometimes wrap the JSON in prose or code fences; pull out the
    # outermost [...] span before parsing.
    try:
        bracketed = re.search(r'(\[.*\])', raw, re.DOTALL)
        if bracketed:
            raw = bracketed.group(1)
        return json.loads(raw)
    except json.JSONDecodeError as e:
        print(f"Failed to parse JSON from ollama response: {e}")
        print(f"Response was: {raw[:500]}...")
        return None
def extract_films_basic(html_content, year):
    """
    Fallback extraction without ollama: scan for "Title (YYYY)" patterns.

    This is a simple heuristic and may not work for all years; descriptions
    cannot be recovered this way and are filled with a placeholder.

    Returns a list of film dicts ({"title", "year", "description"}), or
    None when nothing usable was found.
    """
    print("Using basic extraction (without ollama)...")
    print("Note: This may not capture all details. Consider using --ollama for better results.")

    # Heuristic: a capitalized run of text followed by a 4-digit year in
    # parentheses, e.g. "Film Title (1999)".
    title_year_re = re.compile(r'([A-Z][^(]{3,50})\s*\((\d{4})\)')

    films = []
    seen = set()
    for raw_title, raw_year in title_year_re.findall(html_content):
        name = raw_title.strip()
        # Drop empty/too-short matches and duplicates.
        if not name or len(name) <= 3 or name in seen:
            continue
        release = int(raw_year)
        # Keep only plausible film years: cinema era through the induction year.
        if 1890 <= release <= year:
            films.append({
                "title": name,
                "year": release,
                "description": "[Description not extracted - please add manually]"
            })
            seen.add(name)
    return films or None
def generate_python_dict(films, year):
    """
    Generate Python source code for the NFR dictionary.

    Args:
        films: List of film dicts with "title", "year", "description" keys.
        year: NFR induction year.

    Returns:
        String containing Python code that defines NFR_<year>.
    """
    output = f'''# {year} National Film Registry inductees with LOC descriptions
# Source: [Add URL here]
NFR_{year} = {{'''
    for film in films:
        # Escape for the quoting context each value is emitted in: titles
        # are wrapped in double quotes, descriptions in single quotes.
        # Backslashes must be escaped first in both cases.  (The previous
        # version escaped ' inside the "-quoted title, which produced
        # invalid code for any title containing a double quote, and never
        # escaped backslashes at all.)
        title = film["title"].replace("\\", "\\\\").replace('"', '\\"')
        desc = (
            film["description"]
            .replace("\\", "\\\\")
            .replace("'", "\\'")
            .replace("\n", " ")
        )
        output += f'''
    "{title}": {{
        "year": {film["year"]},
        "description": '{desc}'
    }},'''
    output += "\n}\n"
    return output
def save_nfr_data(films, year, output_path=None):
    """
    Write the generated NFR dictionary to a file.

    Args:
        films: List of film dicts.
        year: NFR induction year.
        output_path: Optional destination path; defaults to
            nfr_data/nfr_<year>.py (directory created if missing).

    Returns:
        The Path the data was written to.
    """
    if output_path is None:
        NFR_DATA_DIR.mkdir(exist_ok=True)
        destination = NFR_DATA_DIR / f"nfr_{year}.py"
    else:
        destination = Path(output_path)

    destination.write_text(generate_python_dict(films, year))

    print(f"\n✓ Saved to {destination}")
    print(f"\nTo use this data:")
    print(f" 1. Review and edit {destination} if needed")
    print(f" 2. Copy the NFR_{year} dictionary into scripts/new_nfr.py")
    print(f" 3. Update the script to handle multiple years")
    return destination
def main():
    """CLI entry point: locate, fetch, extract, preview, and save NFR data."""
    # The global declaration must precede every use of these names in this
    # function.  The previous version declared it AFTER argparse read
    # OLLAMA_HOST/OLLAMA_MODEL as defaults, which raises
    # "SyntaxError: name ... is used prior to global declaration" at
    # import time -- the script could not even be loaded.
    global OLLAMA_HOST, OLLAMA_MODEL

    parser = argparse.ArgumentParser(
        description="Setup NFR data for a specific year"
    )
    parser.add_argument("year", type=int, help="NFR induction year (e.g., 2024)")
    parser.add_argument(
        "--url",
        help="Direct URL to LOC announcement (skip search)"
    )
    parser.add_argument(
        "--output",
        help="Output file path (default: scripts/nfr_data/nfr_YEAR.py)"
    )
    parser.add_argument(
        "--no-ollama",
        action="store_true",
        help="Don't use ollama for extraction (use basic parsing)"
    )
    parser.add_argument(
        "--ollama-host",
        default=OLLAMA_HOST,
        help=f"Ollama server URL (default: {OLLAMA_HOST})"
    )
    parser.add_argument(
        "--ollama-model",
        default=OLLAMA_MODEL,
        help=f"Ollama model to use (default: {OLLAMA_MODEL})"
    )
    args = parser.parse_args()

    # Update ollama config from args
    OLLAMA_HOST = args.ollama_host
    OLLAMA_MODEL = args.ollama_model

    print(f"\n{'='*60}")
    print(f"Setting up NFR data for {args.year}")
    print(f"{'='*60}\n")

    # Get announcement URL (direct flag wins over interactive search)
    if args.url:
        urls = {"newsroom_url": args.url}
    else:
        urls = search_for_nfr_announcement(args.year)
    if not urls.get("newsroom_url"):
        print("\nError: No announcement URL found.")
        print("Please provide a URL with --url")
        sys.exit(1)

    # Fetch content
    try:
        html_content = fetch_url_content(urls["newsroom_url"])
    except Exception as e:
        print(f"Error fetching URL: {e}")
        sys.exit(1)

    # Extract films: prefer ollama, fall back to basic regex parsing
    films = None
    if not args.no_ollama:
        try:
            films = extract_films_with_ollama(html_content, args.year)
        except Exception as e:
            print(f"Error using ollama: {e}")
            print("Falling back to basic extraction...")
    if not films:
        films = extract_films_basic(html_content, args.year)
    if not films:
        print("\nError: Could not extract films from announcement.")
        print("Try:")
        print(" 1. Using --ollama if you skipped it")
        print(" 2. Manually creating the dictionary")
        sys.exit(1)

    print(f"\n✓ Extracted {len(films)} films")

    # Short preview so the user can sanity-check before saving
    print("\nPreview of extracted films:")
    for i, film in enumerate(films[:5], 1):
        print(f" {i}. {film['title']} ({film['year']})")
        if len(film['description']) > 100:
            print(f" {film['description'][:100]}...")
        else:
            print(f" {film['description']}")
    if len(films) > 5:
        print(f" ... and {len(films) - 5} more")

    # Confirm before touching disk; anything other than 'n' saves.
    confirm = input("\nSave this data? (Y/n): ").strip().lower()
    if confirm == 'n':
        print("Cancelled")
        sys.exit(0)

    # Save (return value was previously bound to an unused local -- dropped)
    save_nfr_data(films, args.year, args.output)
    print("\n✓ Done!")
# Script entry point: only run when executed directly, not when imported.
if __name__ == "__main__":
    main()