198 lines
5.5 KiB
Python
198 lines
5.5 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Batch generate NFR dictionaries for multiple years.

This script automates the process of generating National Film Registry (NFR)
data dictionaries for years 1989-2023 by searching for and fetching Library
of Congress (LOC) announcement pages.

Usage:
    python3 scripts/batch_generate_nfr.py --years 2020-2023
    python3 scripts/batch_generate_nfr.py --years 2015,2016,2017
    python3 scripts/batch_generate_nfr.py --all  # Process all years 1989-2023
|
|
"""
|
|
|
|
import argparse
|
|
import subprocess
|
|
import sys
|
|
from pathlib import Path
|
|
|
|
# First year covered by this script: the NFR started in 1989.
FIRST_NFR_YEAR = 1989
CURRENT_YEAR = 2024  # Update this as needed

# Announcement pages whose URLs are already known, keyed by year.
# Add more entries as they are found.
KNOWN_URLS = {
    2024: (
        "https://newsroom.loc.gov/news/"
        "25-films-named-to-national-film-registry-for-preservation/s/"
        "55d5285d-916f-4105-b7d4-7fc3ba8664e3"
    ),
    2023: (
        "https://newsroom.loc.gov/news/"
        "25-films-selected-for-preservation-in-national-film-registry/s/"
        "aa4bef48-95f6-486f-882d-110613633b1e"
    ),
    2022: (
        "https://newsroom.loc.gov/news/"
        "25-eclectic-films-chosen-for-national-film-registry/s/"
        "8c41f7a1-b9d9-4f9e-b252-4795b73a4aaf"
    ),
}
|
|
|
|
|
|
def parse_year_range(year_spec):
    """
    Parse year specification into a list of years.

    The specification is a comma-separated list whose items are either a
    single year or an inclusive ``start-end`` range; the two forms may be
    mixed. Whitespace around items and range endpoints is ignored, and empty
    items (for example from a trailing comma) are skipped. Years are returned
    in the order written; duplicates are not removed.

    Examples:
        "2020-2023" -> [2020, 2021, 2022, 2023]
        "2015,2016,2017" -> [2015, 2016, 2017]
        "2020" -> [2020]
        "2015,2017-2019" -> [2015, 2017, 2018, 2019]

    Raises:
        ValueError: If an item is not an integer year or a ``start-end``
            range, if a range's start year is after its end year, or if the
            specification contains no years at all.
    """
    years = []

    for item in year_spec.split(','):
        item = item.strip()
        if not item:
            # Tolerate stray commas such as "2015,".
            continue

        if '-' in item:
            # partition() splits on the first '-' only, so a malformed item
            # like "2020-2021-2022" fails in int() below instead of being
            # silently truncated.
            start_str, _, end_str = item.partition('-')
            start = int(start_str.strip())
            end = int(end_str.strip())
            if start > end:
                # range() would yield nothing here; report it rather than
                # returning an unexpectedly empty result.
                raise ValueError(
                    f"Invalid year range '{item}': start year is after end year"
                )
            years.extend(range(start, end + 1))
        else:
            years.append(int(item))

    if not years:
        raise ValueError(f"No years found in {year_spec!r}")

    return years
|
|
|
|
|
|
def run_setup_for_year(year, use_ollama=True, ollama_host=None, ollama_model=None):
    """
    Run setup_nfr.py for a specific year.

    The script and output paths are relative, so this is expected to be run
    from the directory that contains ``scripts/``.

    Args:
        year: Year to process; passed to setup_nfr.py as its positional
            argument.
        use_ollama: When False, ``--no-ollama`` is passed to setup_nfr.py.
        ollama_host: Optional value forwarded as ``--ollama-host`` (only
            when ollama is enabled).
        ollama_model: Optional value forwarded as ``--ollama-model`` (only
            when ollama is enabled).

    Returns True if successful, False otherwise. False is also returned when
    the output file already exists and the user declines to overwrite it.
    """
    print(f"\n{'='*60}")
    print(f"Processing NFR {year}")
    print(f"{'='*60}\n")

    # Launch the child with the interpreter that is running this script so it
    # uses the same environment; fall back to "python3" if it is unknown.
    cmd = [sys.executable or "python3", "scripts/setup_nfr.py", str(year)]

    # Add URL if we know it
    if year in KNOWN_URLS:
        cmd.extend(["--url", KNOWN_URLS[year]])
        print(f"Using known URL for {year}")

    # Add ollama options
    if not use_ollama:
        cmd.append("--no-ollama")
    else:
        if ollama_host:
            cmd.extend(["--ollama-host", ollama_host])
        if ollama_model:
            cmd.extend(["--ollama-model", ollama_model])

    # Check if output file already exists
    output_file = Path(f"scripts/nfr_data/nfr_{year}.py")
    if output_file.exists():
        print(f"⚠️ {output_file} already exists")
        try:
            response = input("Overwrite? (y/N): ").strip().lower()
        except EOFError:
            # No interactive stdin (e.g. piped or cron): use the prompt's
            # default answer "N" instead of aborting the whole batch.
            print()
            response = ''
        if response != 'y':
            print(f"Skipping {year}")
            return False

    try:
        # Run the command - this may be interactive
        result = subprocess.run(cmd, check=False)
    except KeyboardInterrupt:
        print(f"\n\nInterrupted while processing {year}")
        sys.exit(1)
    except Exception as e:
        # Deliberately broad: one year failing to launch must not stop the
        # rest of the batch; the caller records it as failed.
        print(f"Error processing {year}: {e}")
        return False

    if result.returncode == 0:
        print(f"✓ Successfully processed {year}")
        return True

    print(f"✗ Failed to process {year}")
    return False
|
|
|
|
|
|
def main():
    """
    Command-line entry point.

    Parses the options, works out which years to process, runs
    run_setup_for_year() for each year in ascending order, and prints a
    summary of the years that succeeded and failed.

    Exits via ``parser.error`` (status 2) when ``--years`` cannot be parsed
    or yields no years, and with status 0 when the user cancels at the
    confirmation prompt.
    """
    # --all stops one year before CURRENT_YEAR, which matches the previously
    # hard-coded 1989-2023 span.
    # NOTE(review): confirm whether CURRENT_YEAR itself should be included.
    last_all_year = CURRENT_YEAR - 1

    parser = argparse.ArgumentParser(
        description="Batch generate NFR dictionaries"
    )

    group = parser.add_mutually_exclusive_group(required=True)
    group.add_argument(
        "--years",
        help="Years to process (e.g., '2020-2023' or '2015,2016,2017')"
    )
    group.add_argument(
        "--all",
        action="store_true",
        help=f"Process all years from {FIRST_NFR_YEAR} to {last_all_year}"
    )

    parser.add_argument(
        "--no-ollama",
        action="store_true",
        help="Don't use ollama (use basic extraction)"
    )
    parser.add_argument(
        "--ollama-host",
        help="Ollama server URL"
    )
    parser.add_argument(
        "--ollama-model",
        help="Ollama model to use"
    )

    args = parser.parse_args()

    # Determine which years to process
    if args.all:
        years = list(range(FIRST_NFR_YEAR, last_all_year + 1))
    else:
        try:
            years = parse_year_range(args.years)
        except ValueError as exc:
            # Report a bad specification as a usage error, not a traceback.
            parser.error(f"invalid --years value {args.years!r}: {exc}")

    # Sort years and drop duplicates so no year is processed twice
    years = sorted(set(years))

    if not years:
        # e.g. a descending range produced no years; years[0] below would fail
        parser.error(f"--years value {args.years!r} does not select any years")

    print(f"\nWill process {len(years)} years: {years[0]}-{years[-1]}")
    print(f"Ollama: {'disabled' if args.no_ollama else 'enabled'}")

    if len(years) > 5:
        response = input("\nThis will process many years. Continue? (Y/n): ").strip().lower()
        if response == 'n':
            print("Cancelled")
            sys.exit(0)

    # Process each year
    successful = []
    failed = []

    for year in years:
        success = run_setup_for_year(
            year,
            use_ollama=not args.no_ollama,
            ollama_host=args.ollama_host,
            ollama_model=args.ollama_model
        )

        # run_setup_for_year() also returns False for a year the user chose
        # to skip, so skipped years are listed under "Failed" as well.
        if success:
            successful.append(year)
        else:
            failed.append(year)

    # Summary
    print(f"\n{'='*60}")
    print("SUMMARY")
    print(f"{'='*60}\n")
    print(f"✓ Successfully processed: {len(successful)} years")
    if successful:
        print(f" {successful}")

    if failed:
        print(f"\n✗ Failed: {len(failed)} years")
        print(f" {failed}")
        print("\nYou can retry failed years individually:")
        for year in failed:
            print(f" python3 scripts/setup_nfr.py {year}")

    print("\nGenerated files are in: scripts/nfr_data/")


if __name__ == "__main__":
    main()
|