marcus-web/scripts/batch_generate_nfr.py

#!/usr/bin/env python3
"""
Batch generate NFR dictionaries for multiple years.

This script automates the process of generating NFR data dictionaries
for years 1989-2023 by searching for and fetching LOC announcement pages.

Usage:
    python3 scripts/batch_generate_nfr.py --years 2020-2023
    python3 scripts/batch_generate_nfr.py --years 2015,2016,2017
    python3 scripts/batch_generate_nfr.py --all  # Process all years 1989-2023
"""

import argparse
import subprocess
import sys
from pathlib import Path

# NFR started in 1989; this is the first year that --all processes.
FIRST_NFR_YEAR = 1989
# Matches the newest year keyed in KNOWN_URLS below.
CURRENT_YEAR = 2024  # Update this as needed

# Known announcement URLs, keyed by registry year (add more as we find them).
# When a year is present here, run_setup_for_year() passes the URL to
# setup_nfr.py via --url; any other year is run without a --url argument.
KNOWN_URLS = {
    2024: "https://newsroom.loc.gov/news/25-films-named-to-national-film-registry-for-preservation/s/55d5285d-916f-4105-b7d4-7fc3ba8664e3",
    2023: "https://newsroom.loc.gov/news/25-films-selected-for-preservation-in-national-film-registry/s/aa4bef48-95f6-486f-882d-110613633b1e",
    2022: "https://newsroom.loc.gov/news/25-eclectic-films-chosen-for-national-film-registry/s/8c41f7a1-b9d9-4f9e-b252-4795b73a4aaf",
}


def parse_year_range(year_spec):
    """
    Parse a year specification into a list of years.

    The specification is a comma-separated list of tokens. Each token is
    either a single year or an inclusive ``start-end`` range, and the two
    forms may be mixed. Whitespace around tokens and range bounds is
    ignored, and empty tokens (e.g. from a trailing comma) are skipped.
    Years are returned in the order written; duplicates are not removed.

    Examples:
        "2020-2023" -> [2020, 2021, 2022, 2023]
        "2015,2016,2017" -> [2015, 2016, 2017]
        "2020" -> [2020]
        "2015,2018-2020" -> [2015, 2018, 2019, 2020]

    Raises:
        ValueError: if a token is not an integer year or range, if a range
            runs backwards (start > end), or if no years are found at all.
    """
    years = []

    for token in year_spec.split(','):
        token = token.strip()
        if not token:
            # Tolerate stray separators such as "2015,2016,".
            continue

        if '-' in token:
            # Split on the first dash only; anything malformed after it
            # (e.g. "2020-2021-2022") is rejected by int() below.
            start_str, _, end_str = token.partition('-')
            try:
                start, end = int(start_str), int(end_str)
            except ValueError:
                raise ValueError(f"Invalid year range: {token!r}") from None
            if start > end:
                # range() would silently yield nothing; fail loudly instead.
                raise ValueError(f"Year range runs backwards: {token!r}")
            years.extend(range(start, end + 1))
        else:
            try:
                years.append(int(token))
            except ValueError:
                raise ValueError(f"Invalid year: {token!r}") from None

    if not years:
        raise ValueError(f"No years found in {year_spec!r}")

    return years


def run_setup_for_year(year, use_ollama=True, ollama_host=None, ollama_model=None):
    """
    Run setup_nfr.py for a specific year.

    Args:
        year: Registry year; passed to setup_nfr.py as its positional argument.
        use_ollama: When False, ``--no-ollama`` is passed and the host/model
            options are not forwarded.
        ollama_host: Forwarded as ``--ollama-host`` when set and ollama is enabled.
        ollama_model: Forwarded as ``--ollama-model`` when set and ollama is enabled.

    Returns:
        True if setup_nfr.py exits with status 0. False if it exits non-zero,
        cannot be started, or the user declines to overwrite an existing
        ``scripts/nfr_data/nfr_<year>.py`` (so False also covers "skipped").

    Paths are relative to the current working directory. A Ctrl-C while the
    subprocess is running terminates the whole batch with exit status 1.
    """
    print(f"\n{'='*60}")
    print(f"Processing NFR {year}")
    print(f"{'='*60}\n")

    # Use the interpreter running this script so the child sees the same
    # environment (e.g. an active virtualenv); fall back to "python3" in the
    # rare case sys.executable is unavailable.
    cmd = [sys.executable or "python3", "scripts/setup_nfr.py", str(year)]

    # Add URL if we know it
    known_url = KNOWN_URLS.get(year)
    if known_url is not None:
        cmd.extend(["--url", known_url])
        print(f"Using known URL for {year}")

    # Add ollama options
    if not use_ollama:
        cmd.append("--no-ollama")
    else:
        if ollama_host:
            cmd.extend(["--ollama-host", ollama_host])
        if ollama_model:
            cmd.extend(["--ollama-model", ollama_model])

    # Check if output file already exists
    output_file = Path(f"scripts/nfr_data/nfr_{year}.py")
    if output_file.exists():
        print(f"⚠️  {output_file} already exists")
        try:
            response = input("Overwrite? (y/N): ").strip().lower()
        except EOFError:
            # stdin is closed (non-interactive run): take the default "N"
            # rather than crashing the whole batch.
            print()
            response = ''
        if response != 'y':
            print(f"Skipping {year}")
            return False

    try:
        # Run the command - this may be interactive, so stdio is inherited.
        # check=False: the exit status is inspected below instead of raising.
        result = subprocess.run(cmd, check=False)
    except KeyboardInterrupt:
        print(f"\n\nInterrupted while processing {year}")
        sys.exit(1)
    except Exception as e:
        # Deliberately broad: one year failing to launch must not abort the
        # rest of the batch; the error is reported and the year marked failed.
        print(f"Error processing {year}: {e}")
        return False

    if result.returncode == 0:
        print(f"✓ Successfully processed {year}")
        return True

    print(f"✗ Failed to process {year}")
    return False


def main():
    """
    Command-line entry point: run setup for each requested year and summarize.

    Exactly one of ``--years`` (a spec understood by parse_year_range) or
    ``--all`` (FIRST_NFR_YEAR through CURRENT_YEAR - 1) is required. The
    ``--no-ollama``, ``--ollama-host`` and ``--ollama-model`` options are
    forwarded to run_setup_for_year. Years are de-duplicated and processed
    in ascending order; selections of more than five years ask for
    confirmation first. Invalid or empty year selections are reported as
    argparse usage errors (exit status 2).
    """
    # --all stops at the year before CURRENT_YEAR, so bumping the constant
    # extends both the range and the help text together.
    last_year = CURRENT_YEAR - 1

    parser = argparse.ArgumentParser(
        description="Batch generate NFR dictionaries"
    )

    group = parser.add_mutually_exclusive_group(required=True)
    group.add_argument(
        "--years",
        help="Years to process (e.g., '2020-2023' or '2015,2016,2017')"
    )
    group.add_argument(
        "--all",
        action="store_true",
        help=f"Process all years from {FIRST_NFR_YEAR} to {last_year}"
    )

    parser.add_argument(
        "--no-ollama",
        action="store_true",
        help="Don't use ollama (use basic extraction)"
    )
    parser.add_argument(
        "--ollama-host",
        help="Ollama server URL"
    )
    parser.add_argument(
        "--ollama-model",
        help="Ollama model to use"
    )

    args = parser.parse_args()

    # Determine which years to process
    if args.all:
        years = list(range(FIRST_NFR_YEAR, CURRENT_YEAR))
    else:
        try:
            years = parse_year_range(args.years)
        except ValueError as exc:
            # Report bad input as a usage error instead of a traceback.
            parser.error(f"invalid --years value {args.years!r}: {exc}")

    # Sort and de-duplicate so no year is processed twice.
    years = sorted(set(years))
    if not years:
        parser.error("no years to process")

    # Only describe the selection as "start-end" when it really is contiguous.
    if len(years) == 1:
        description = str(years[0])
    elif years[-1] - years[0] + 1 == len(years):
        description = f"{years[0]}-{years[-1]}"
    else:
        description = ", ".join(str(year) for year in years)

    print(f"\nWill process {len(years)} years: {description}")
    print(f"Ollama: {'disabled' if args.no_ollama else 'enabled'}")

    if len(years) > 5:
        try:
            response = input("\nThis will process many years. Continue? (Y/n): ").strip().lower()
        except EOFError:
            # stdin is closed: fall back to the prompt's stated default (Y).
            print()
            response = ''
        if response == 'n':
            print("Cancelled")
            sys.exit(0)

    # Process each year
    successful = []
    failed = []

    for year in years:
        success = run_setup_for_year(
            year,
            use_ollama=not args.no_ollama,
            ollama_host=args.ollama_host,
            ollama_model=args.ollama_model
        )

        if success:
            successful.append(year)
        else:
            failed.append(year)

    # Summary
    print(f"\n{'='*60}")
    print("SUMMARY")
    print(f"{'='*60}\n")
    print(f"✓ Successfully processed: {len(successful)} years")
    if successful:
        print(f"  {successful}")

    if failed:
        # run_setup_for_year also returns False when the user declines to
        # overwrite an existing file, so this bucket includes skipped years.
        print(f"\n✗ Failed or skipped: {len(failed)} years")
        print(f"  {failed}")
        print("\nYou can retry failed years individually:")
        for year in failed:
            print(f"  python3 scripts/setup_nfr.py {year}")

    print("\nGenerated files are in: scripts/nfr_data/")


# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()