# File: marcus-web/scripts/setup_nfr.py (398 lines, 12 KiB, Python, executable)
#!/usr/bin/env python3
"""
Setup NFR (National Film Registry) data for a specific year.
This script fetches the Library of Congress announcement for a given year,
extracts film titles and descriptions, and generates a Python dictionary
that can be added to new_nfr.py.
Usage:
python3 scripts/setup_nfr.py 2024
python3 scripts/setup_nfr.py 2015 --output scripts/nfr_data/nfr_2015.py
python3 scripts/setup_nfr.py 2023 --no-ollama # Don't use ollama for extraction
Requirements:
- requests library
- access to ollama server (optional, for better extraction)
The script will:
1. Search for the LOC announcement URL for the given year
2. Fetch the announcement page
3. Use ollama (if available) or basic parsing to extract film data
4. Generate a Python dictionary with film titles, years, and descriptions
5. Save to a file or print to stdout
"""
import argparse
import json
import os
import re
import sys
from pathlib import Path
from urllib.parse import urljoin
import requests
# Configuration -- paths resolved relative to this script's own location.
SCRIPT_DIR = Path(__file__).parent
PROJECT_ROOT = SCRIPT_DIR.parent  # repo root (scripts/ is one level below it)
NFR_DATA_DIR = SCRIPT_DIR / "nfr_data"  # default output dir for generated nfr_<year>.py files

# Ollama configuration -- both overridable via environment variables or CLI flags.
# NOTE(review): default host is a LAN address; confirm it matches your setup.
OLLAMA_HOST = os.environ.get("OLLAMA_HOST", "http://192.168.0.109:11434")
OLLAMA_MODEL = os.environ.get("OLLAMA_MODEL", "llama3.2") # or whatever model you have
def search_for_nfr_announcement(year):
    """
    Locate the LOC NFR announcement URL(s) for a given year.

    Only 2024 is known ahead of time; for any other year the user is
    prompted interactively for a URL (Enter to skip).

    Args:
        year: NFR induction year (int).

    Returns:
        dict with keys "newsroom_url", "blog_url", "webcast_url";
        values are URL strings or None.
    """
    print(f"Searching for {year} NFR announcement...")

    results = {
        "newsroom_url": None,
        "blog_url": None,
        "webcast_url": None,
    }

    # No automated web search is implemented yet (the previous version
    # built unused search-query/URL lists -- removed as dead code).
    # Point the user at the usual LOC announcement locations instead.
    print(f"\nPlease find the LOC announcement URL for {year}.")
    print(f"\nCommon places to look:")
    print(f" - https://newsroom.loc.gov/")
    print(f" - https://blogs.loc.gov/now-see-hear/")
    print(f" - https://www.loc.gov/programs/national-film-preservation-board/film-registry/")

    # Hard-coded URLs for years we already know.
    if year == 2024:
        results["newsroom_url"] = "https://newsroom.loc.gov/news/25-films-named-to-national-film-registry-for-preservation/s/55d5285d-916f-4105-b7d4-7fc3ba8664e3"
        results["blog_url"] = "https://blogs.loc.gov/now-see-hear/2024/12/announcing-the-2024-national-film-registry/"
        return results

    # Fall back to asking the user directly.
    url = input(f"\nEnter the LOC announcement URL for {year} (or press Enter to skip): ").strip()
    if url:
        results["newsroom_url"] = url
    return results
def fetch_url_content(url):
    """Download *url* and return the response body as text.

    Raises an HTTP error (via raise_for_status) for non-2xx responses.
    """
    print(f"Fetching {url}...")
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    return response.text
def call_ollama(prompt, model=OLLAMA_MODEL, system_prompt=None):
    """
    Send a prompt to the ollama generate API and return the response text.

    Args:
        prompt: The user prompt.
        model: Model name (defaults to the module-level OLLAMA_MODEL).
        system_prompt: Optional system prompt to include in the payload.

    Returns:
        The model's response text, or None when the request fails.
    """
    endpoint = f"{OLLAMA_HOST}/api/generate"
    request_body = {
        "model": model,
        "prompt": prompt,
        "stream": False,
    }
    if system_prompt:
        request_body["system"] = system_prompt

    print(f"Calling ollama at {OLLAMA_HOST} with model {model}...")
    try:
        reply = requests.post(endpoint, json=request_body, timeout=300)  # 5 min timeout
        reply.raise_for_status()
        payload = reply.json()
    except requests.exceptions.RequestException as e:
        print(f"Error calling ollama: {e}")
        return None
    return payload.get("response", "")
def extract_films_with_ollama(html_content, year):
    """
    Use ollama to extract film data from HTML content.

    Returns a list of dicts with keys title/year/description, or None
    when the model call fails or its output cannot be parsed as JSON.
    """
    system_prompt = """You are a helpful assistant that extracts structured data from web pages.
Your task is to extract information about films from National Film Registry announcements.
Output ONLY valid JSON, nothing else. No markdown formatting, no code blocks, just raw JSON."""

    user_prompt = f"""From the following HTML content, extract ALL films that were added to the National Film Registry in {year}.
For each film, extract:
1. The exact title
2. The release year of the film
3. The description/reason why it was selected for preservation
Format your response as a JSON array of objects with this structure:
[
{{
"title": "Film Title",
"year": 1999,
"description": "The reason it was selected..."
}}
]
IMPORTANT:
- Extract ALL {year} films, typically 25 films
- Keep descriptions concise but complete
- Use the exact text from the announcement
- Output ONLY the JSON array, no other text
- Do not include markdown code blocks
HTML Content:
{html_content[:50000]}
"""  # limit to the first 50k chars to stay within token limits

    raw = call_ollama(user_prompt, system_prompt=system_prompt)
    if not raw:
        return None

    # Models sometimes wrap the JSON in prose or code fences; pull out the
    # outermost [...] span before parsing.
    try:
        bracketed = re.search(r'(\[.*\])', raw, re.DOTALL)
        if bracketed:
            raw = bracketed.group(1)
        return json.loads(raw)
    except json.JSONDecodeError as e:
        print(f"Failed to parse JSON from ollama response: {e}")
        print(f"Response was: {raw[:500]}...")
        return None
def extract_films_basic(html_content, year):
    """
    Fallback extraction without ollama: scan for "Title (YYYY)" patterns.

    This is a simple heuristic and may not work for all years; descriptions
    cannot be recovered this way and are filled with a placeholder.

    Returns a list of film dicts ({"title", "year", "description"}), or
    None when nothing usable was found.
    """
    print("Using basic extraction (without ollama)...")
    print("Note: This may not capture all details. Consider using --ollama for better results.")

    # Heuristic: a capitalized run of text followed by a 4-digit year in
    # parentheses, e.g. "Film Title (1999)".
    title_year_re = re.compile(r'([A-Z][^(]{3,50})\s*\((\d{4})\)')

    films = []
    seen = set()
    for raw_title, raw_year in title_year_re.findall(html_content):
        name = raw_title.strip()
        # Drop empty/too-short matches and duplicates.
        if not name or len(name) <= 3 or name in seen:
            continue
        release = int(raw_year)
        # Keep only plausible film years: cinema era through the induction year.
        if 1890 <= release <= year:
            films.append({
                "title": name,
                "year": release,
                "description": "[Description not extracted - please add manually]"
            })
            seen.add(name)
    return films or None
def generate_python_dict(films, year):
    """
    Generate Python source code for the NFR dictionary.

    Args:
        films: List of film dicts with "title", "year", "description" keys.
        year: NFR induction year.

    Returns:
        String containing Python code that defines NFR_<year>.
    """
    output = f'''# {year} National Film Registry inductees with LOC descriptions
# Source: [Add URL here]
NFR_{year} = {{'''
    for film in films:
        # Escape for the quoting context each value is emitted in: titles
        # are wrapped in double quotes, descriptions in single quotes.
        # Backslashes must be escaped first in both cases.  (The previous
        # version escaped ' inside the "-quoted title, which produced
        # invalid code for any title containing a double quote, and never
        # escaped backslashes at all.)
        title = film["title"].replace("\\", "\\\\").replace('"', '\\"')
        desc = (
            film["description"]
            .replace("\\", "\\\\")
            .replace("'", "\\'")
            .replace("\n", " ")
        )
        output += f'''
    "{title}": {{
        "year": {film["year"]},
        "description": '{desc}'
    }},'''
    output += "\n}\n"
    return output
def save_nfr_data(films, year, output_path=None):
    """
    Write the generated NFR dictionary to a file.

    Args:
        films: List of film dicts.
        year: NFR induction year.
        output_path: Optional destination path; defaults to
            nfr_data/nfr_<year>.py (directory created if missing).

    Returns:
        The Path the data was written to.
    """
    if output_path is None:
        NFR_DATA_DIR.mkdir(exist_ok=True)
        destination = NFR_DATA_DIR / f"nfr_{year}.py"
    else:
        destination = Path(output_path)

    destination.write_text(generate_python_dict(films, year))

    print(f"\n✓ Saved to {destination}")
    print(f"\nTo use this data:")
    print(f" 1. Review and edit {destination} if needed")
    print(f" 2. Copy the NFR_{year} dictionary into scripts/new_nfr.py")
    print(f" 3. Update the script to handle multiple years")
    return destination
def main():
    """CLI entry point: locate, fetch, extract, preview, and save NFR data."""
    # The global declaration must precede every use of these names in this
    # function.  The previous version declared it AFTER argparse read
    # OLLAMA_HOST/OLLAMA_MODEL as defaults, which raises
    # "SyntaxError: name ... is used prior to global declaration" at
    # import time -- the script could not even be loaded.
    global OLLAMA_HOST, OLLAMA_MODEL

    parser = argparse.ArgumentParser(
        description="Setup NFR data for a specific year"
    )
    parser.add_argument("year", type=int, help="NFR induction year (e.g., 2024)")
    parser.add_argument(
        "--url",
        help="Direct URL to LOC announcement (skip search)"
    )
    parser.add_argument(
        "--output",
        help="Output file path (default: scripts/nfr_data/nfr_YEAR.py)"
    )
    parser.add_argument(
        "--no-ollama",
        action="store_true",
        help="Don't use ollama for extraction (use basic parsing)"
    )
    parser.add_argument(
        "--ollama-host",
        default=OLLAMA_HOST,
        help=f"Ollama server URL (default: {OLLAMA_HOST})"
    )
    parser.add_argument(
        "--ollama-model",
        default=OLLAMA_MODEL,
        help=f"Ollama model to use (default: {OLLAMA_MODEL})"
    )
    args = parser.parse_args()

    # Update ollama config from args
    OLLAMA_HOST = args.ollama_host
    OLLAMA_MODEL = args.ollama_model

    print(f"\n{'='*60}")
    print(f"Setting up NFR data for {args.year}")
    print(f"{'='*60}\n")

    # Get announcement URL (direct flag wins over interactive search)
    if args.url:
        urls = {"newsroom_url": args.url}
    else:
        urls = search_for_nfr_announcement(args.year)
    if not urls.get("newsroom_url"):
        print("\nError: No announcement URL found.")
        print("Please provide a URL with --url")
        sys.exit(1)

    # Fetch content
    try:
        html_content = fetch_url_content(urls["newsroom_url"])
    except Exception as e:
        print(f"Error fetching URL: {e}")
        sys.exit(1)

    # Extract films: prefer ollama, fall back to basic regex parsing
    films = None
    if not args.no_ollama:
        try:
            films = extract_films_with_ollama(html_content, args.year)
        except Exception as e:
            print(f"Error using ollama: {e}")
            print("Falling back to basic extraction...")
    if not films:
        films = extract_films_basic(html_content, args.year)
    if not films:
        print("\nError: Could not extract films from announcement.")
        print("Try:")
        print(" 1. Using --ollama if you skipped it")
        print(" 2. Manually creating the dictionary")
        sys.exit(1)

    print(f"\n✓ Extracted {len(films)} films")

    # Short preview so the user can sanity-check before saving
    print("\nPreview of extracted films:")
    for i, film in enumerate(films[:5], 1):
        print(f" {i}. {film['title']} ({film['year']})")
        if len(film['description']) > 100:
            print(f" {film['description'][:100]}...")
        else:
            print(f" {film['description']}")
    if len(films) > 5:
        print(f" ... and {len(films) - 5} more")

    # Confirm before touching disk; anything other than 'n' saves.
    confirm = input("\nSave this data? (Y/n): ").strip().lower()
    if confirm == 'n':
        print("Cancelled")
        sys.exit(0)

    # Save (return value was previously bound to an unused local -- dropped)
    save_nfr_data(films, args.year, args.output)
    print("\n✓ Done!")
# Script entry point: only run when executed directly, not when imported.
if __name__ == "__main__":
    main()