Files
marcus-web/scripts/batch_generate_nfr.py

198 lines
5.5 KiB
Python

#!/usr/bin/env python3
"""
Batch generate NFR (National Film Registry) dictionaries for multiple years.

This script automates the process of generating NFR data dictionaries
for years 1989-2023 by searching for and fetching LOC (Library of Congress)
announcement pages. The per-year work is delegated to scripts/setup_nfr.py,
which is run once for each requested year.

Usage:
    python3 scripts/batch_generate_nfr.py --years 2020-2023
    python3 scripts/batch_generate_nfr.py --years 2015,2016,2017
    python3 scripts/batch_generate_nfr.py --all  # Process all years 1989-2023
"""
import argparse
import subprocess
import sys
from pathlib import Path
# First year for which a registry list exists (NFR started in 1989).
FIRST_NFR_YEAR = 1989

# Update this as needed.
CURRENT_YEAR = 2024

# Announcement pages whose URLs are already known, keyed by registry year.
# Add more entries here as they are found.
KNOWN_URLS = {
    2024: "https://newsroom.loc.gov/news/25-films-named-to-national-film-registry-for-preservation/s/55d5285d-916f-4105-b7d4-7fc3ba8664e3",
    2023: "https://newsroom.loc.gov/news/25-films-selected-for-preservation-in-national-film-registry/s/aa4bef48-95f6-486f-882d-110613633b1e",
    2022: "https://newsroom.loc.gov/news/25-eclectic-films-chosen-for-national-film-registry/s/8c41f7a1-b9d9-4f9e-b252-4795b73a4aaf",
}
def parse_year_range(year_spec):
    """
    Parse year specification into a list of years.

    The specification is a comma-separated list of items. Each item is either
    a single year or an inclusive "start-end" range, and the two forms may be
    mixed. Whitespace around items and range bounds is ignored, and empty
    items (for example from a trailing comma) are skipped.

    Examples:
        "2020-2023" -> [2020, 2021, 2022, 2023]
        "2015,2016,2017" -> [2015, 2016, 2017]
        "2015,2018-2020" -> [2015, 2018, 2019, 2020]
        "2020" -> [2020]

    Args:
        year_spec: The specification string.

    Returns:
        The years in the order they appear in the specification. Duplicates
        are kept; de-duplication is left to the caller.

    Raises:
        ValueError: If an item is not an integer, a range is malformed or
            descending, or the specification contains no years at all.
    """
    years = []
    for item in year_spec.split(','):
        item = item.strip()
        if not item:
            # Tolerate stray commas such as "2020," or "2020,,2021".
            continue
        if '-' in item:
            # partition() leaves any extra '-' in end_text, so a malformed
            # range like "2020-2021-2022" fails in int() with ValueError.
            start_text, _, end_text = item.partition('-')
            start, end = int(start_text), int(end_text)
            if start > end:
                # A descending range would silently expand to nothing.
                raise ValueError(
                    f"invalid year range {item!r}: start is after end"
                )
            years.extend(range(start, end + 1))
        else:
            years.append(int(item))
    if not years:
        raise ValueError(f"no years found in {year_spec!r}")
    return years
def run_setup_for_year(year, use_ollama=True, ollama_host=None, ollama_model=None):
    """
    Run setup_nfr.py for a specific year.

    If scripts/nfr_data/nfr_<year>.py already exists the user is asked
    whether to overwrite it; anything other than "y" skips the year.

    Args:
        year: Registry year to process.
        use_ollama: When False, "--no-ollama" is passed to setup_nfr.py.
        ollama_host: Optional value forwarded as "--ollama-host"
            (only when use_ollama is True).
        ollama_model: Optional value forwarded as "--ollama-model"
            (only when use_ollama is True).

    Returns:
        True if setup_nfr.py exited with status 0. False if it exited with a
        non-zero status, could not be started, or the year was skipped.

    Note:
        A KeyboardInterrupt while the child process runs terminates the
        whole program with exit status 1.
    """
    print(f"\n{'='*60}")
    print(f"Processing NFR {year}")
    print(f"{'='*60}\n")

    # Ask about overwriting first, so a skipped year does no further work.
    output_file = Path(f"scripts/nfr_data/nfr_{year}.py")
    if output_file.exists():
        print(f"⚠️ {output_file} already exists")
        try:
            response = input("Overwrite? (y/N): ").strip().lower()
        except EOFError:
            # No interactive stdin: fall back to the prompt's default (No).
            response = ''
        if response != 'y':
            print(f"Skipping {year}")
            return False

    # Use the interpreter running this script so the child sees the same
    # environment (e.g. an active virtualenv); fall back to "python3" in the
    # rare case sys.executable is empty.
    cmd = [sys.executable or "python3", "scripts/setup_nfr.py", str(year)]

    # Add URL if we know it
    if year in KNOWN_URLS:
        cmd.extend(["--url", KNOWN_URLS[year]])
        print(f"Using known URL for {year}")

    # Add ollama options
    if not use_ollama:
        cmd.append("--no-ollama")
    else:
        if ollama_host:
            cmd.extend(["--ollama-host", ollama_host])
        if ollama_model:
            cmd.extend(["--ollama-model", ollama_model])

    try:
        # Run the command - this may be interactive, so stdio is inherited.
        result = subprocess.run(cmd, check=False)
    except KeyboardInterrupt:
        print(f"\n\nInterrupted while processing {year}")
        sys.exit(1)
    except (OSError, ValueError) as e:
        # OSError: the interpreter/script could not be executed.
        # ValueError: subprocess rejected the arguments.
        print(f"Error processing {year}: {e}")
        return False

    if result.returncode == 0:
        print(f"✓ Successfully processed {year}")
        return True
    print(f"✗ Failed to process {year}")
    return False
def main():
    """
    Command-line entry point: parse arguments, process each year, summarize.

    Exactly one of --years or --all must be given. The selected years are
    de-duplicated and processed in ascending order by calling
    run_setup_for_year() once per year. When more than five years are
    selected the user is asked for confirmation first. A summary of
    successful and unsuccessful years is printed at the end.
    """
    # --all covers FIRST_NFR_YEAR up to, but not including, CURRENT_YEAR.
    last_all_year = CURRENT_YEAR - 1

    parser = argparse.ArgumentParser(
        description="Batch generate NFR dictionaries"
    )
    group = parser.add_mutually_exclusive_group(required=True)
    group.add_argument(
        "--years",
        help="Years to process (e.g., '2020-2023' or '2015,2016,2017')"
    )
    group.add_argument(
        "--all",
        action="store_true",
        help=f"Process all years from {FIRST_NFR_YEAR} to {last_all_year}"
    )
    parser.add_argument(
        "--no-ollama",
        action="store_true",
        help="Don't use ollama (use basic extraction)"
    )
    parser.add_argument(
        "--ollama-host",
        help="Ollama server URL"
    )
    parser.add_argument(
        "--ollama-model",
        help="Ollama model to use"
    )
    args = parser.parse_args()

    # Determine which years to process
    if args.all:
        years = list(range(FIRST_NFR_YEAR, CURRENT_YEAR))
    else:
        try:
            years = parse_year_range(args.years)
        except ValueError as e:
            # Report bad input as a usage error instead of a traceback.
            parser.error(f"invalid --years value {args.years!r}: {e}")

    # Sort and drop duplicates so no year is processed twice.
    years = sorted(set(years))
    if not years:
        # E.g. a descending range such as "2023-2020" expands to nothing.
        parser.error("no years to process")

    print(f"\nWill process {len(years)} years: {years[0]}-{years[-1]}")
    print(f"Ollama: {'disabled' if args.no_ollama else 'enabled'}")
    if len(years) > 5:
        response = input("\nThis will process many years. Continue? (Y/n): ").strip().lower()
        if response == 'n':
            print("Cancelled")
            sys.exit(0)

    # Process each year
    successful = []
    failed = []
    for year in years:
        success = run_setup_for_year(
            year,
            use_ollama=not args.no_ollama,
            ollama_host=args.ollama_host,
            ollama_model=args.ollama_model
        )
        if success:
            successful.append(year)
        else:
            failed.append(year)

    # Summary
    print(f"\n{'='*60}")
    print("SUMMARY")
    print(f"{'='*60}\n")
    print(f"✓ Successfully processed: {len(successful)} years")
    if successful:
        print(f" {successful}")
    if failed:
        # run_setup_for_year() also returns False for years the user chose
        # not to overwrite, so this list is not failures only.
        print(f"\n✗ Failed or skipped: {len(failed)} years")
        print(f" {failed}")
        print("\nYou can retry failed years individually:")
        for year in failed:
            print(f" python3 scripts/setup_nfr.py {year}")
    print("\nGenerated files are in: scripts/nfr_data/")


if __name__ == "__main__":
    main()