#!/usr/bin/env python3
"""
Setup NFR (National Film Registry) data for a specific year.

This script fetches the Library of Congress announcement for a given year,
extracts film titles and descriptions, and generates a Python dictionary
that can be added to new_nfr.py.

Usage:
    python3 scripts/setup_nfr.py 2024
    python3 scripts/setup_nfr.py 2015 --output scripts/nfr_data/nfr_2015.py
    python3 scripts/setup_nfr.py 2023 --no-ollama  # Don't use ollama for extraction

Requirements:
- requests library
- access to ollama server (optional, for better extraction)

The script will:
1. Search for the LOC announcement URL for the given year
2. Fetch the announcement page
3. Use ollama (if available) or basic parsing to extract film data
4. Generate a Python dictionary with film titles, years, and descriptions
5. Save to a file or print to stdout
"""
# Standard library
import argparse
import json
import os
import re
import sys
from pathlib import Path
from urllib.parse import urljoin  # NOTE(review): unused in this file — confirm before removing

# Third-party
import requests
# Configuration
SCRIPT_DIR = Path(__file__).parent        # directory this script lives in (scripts/)
PROJECT_ROOT = SCRIPT_DIR.parent          # repository root, one level up
NFR_DATA_DIR = SCRIPT_DIR / "nfr_data"    # default output directory for generated data files

# Ollama configuration — overridable via environment or CLI flags (see main()).
# NOTE(review): the fallback host is a private LAN address, presumably the
# author's local server — confirm reachability before relying on the default.
OLLAMA_HOST = os.environ.get("OLLAMA_HOST", "http://192.168.0.109:11434")
OLLAMA_MODEL = os.environ.get("OLLAMA_MODEL", "llama3.2")  # or whatever model you have
def search_for_nfr_announcement(year):
    """
    Locate the LOC NFR announcement URL(s) for a given year.

    Automated web search is not implemented: for 2024 the URLs are known and
    returned directly; for any other year the user is shown the usual
    announcement locations and prompted to paste a URL.

    Args:
        year: NFR induction year (int).

    Returns dict with:
    - newsroom_url: Main press release (or None if skipped)
    - blog_url: Blog announcement (if known)
    - webcast_url: reserved for future use, currently always None
    """
    print(f"Searching for {year} NFR announcement...")

    results = {
        "newsroom_url": None,
        "blog_url": None,
        "webcast_url": None,
    }

    # Point the user at the usual announcement locations so they can find
    # the URL manually.  (A previous revision built unused search-query and
    # URL-pattern lists here; that dead code has been removed.)
    print(f"\nPlease find the LOC announcement URL for {year}.")
    print(f"\nCommon places to look:")
    print(f" - https://newsroom.loc.gov/")
    print(f" - https://blogs.loc.gov/now-see-hear/")
    print(f" - https://www.loc.gov/programs/national-film-preservation-board/film-registry/")

    # For 2024, we know the URL
    if year == 2024:
        results["newsroom_url"] = "https://newsroom.loc.gov/news/25-films-named-to-national-film-registry-for-preservation/s/55d5285d-916f-4105-b7d4-7fc3ba8664e3"
        results["blog_url"] = "https://blogs.loc.gov/now-see-hear/2024/12/announcing-the-2024-national-film-registry/"
        return results

    # Prompt user for URL
    url = input(f"\nEnter the LOC announcement URL for {year} (or press Enter to skip): ").strip()
    if url:
        results["newsroom_url"] = url

    return results
def fetch_url_content(url):
    """Download *url* and return the response body as text.

    Raises requests.HTTPError on a non-2xx status.
    """
    print(f"Fetching {url}...")
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    return response.text
def call_ollama(prompt, model=None, system_prompt=None):
    """
    Call ollama's /api/generate endpoint to process text.

    Args:
        prompt: The user prompt.
        model: Model name.  Defaults to the *current* OLLAMA_MODEL setting.
            (Previously the default was bound at def time, so --ollama-model
            and the OLLAMA_MODEL reassignment in main() were silently
            ignored; resolving it at call time fixes that.)
        system_prompt: Optional system prompt.

    Returns:
        The model's response text, or None if the request failed.
    """
    if model is None:
        model = OLLAMA_MODEL  # late-bind so CLI overrides take effect

    url = f"{OLLAMA_HOST}/api/generate"

    payload = {
        "model": model,
        "prompt": prompt,
        "stream": False,  # single JSON response instead of a token stream
    }

    if system_prompt:
        payload["system"] = system_prompt

    print(f"Calling ollama at {OLLAMA_HOST} with model {model}...")
    try:
        resp = requests.post(url, json=payload, timeout=300)  # 5 min timeout
        resp.raise_for_status()
        data = resp.json()
        return data.get("response", "")
    except requests.exceptions.RequestException as e:
        # Best-effort: callers treat None as "ollama unavailable".
        print(f"Error calling ollama: {e}")
        return None
def extract_films_with_ollama(html_content, year):
    """
    Ask ollama to pull structured film entries out of an announcement page.

    Returns a list of dicts with: title, year, description — or None when
    the model is unreachable or its reply cannot be parsed as JSON.
    """
    system_prompt = """You are a helpful assistant that extracts structured data from web pages.
Your task is to extract information about films from National Film Registry announcements.
Output ONLY valid JSON, nothing else. No markdown formatting, no code blocks, just raw JSON."""

    user_prompt = f"""From the following HTML content, extract ALL films that were added to the National Film Registry in {year}.

For each film, extract:
1. The exact title
2. The release year of the film
3. The description/reason why it was selected for preservation

Format your response as a JSON array of objects with this structure:
[
  {{
    "title": "Film Title",
    "year": 1999,
    "description": "The reason it was selected..."
  }}
]

IMPORTANT:
- Extract ALL {year} films, typically 25 films
- Keep descriptions concise but complete
- Use the exact text from the announcement
- Output ONLY the JSON array, no other text
- Do not include markdown code blocks

HTML Content:
{html_content[:50000]}
"""  # Limit to first 50k chars to avoid token limits

    reply = call_ollama(user_prompt, system_prompt=system_prompt)
    if not reply:
        return None

    # Models sometimes wrap the JSON in prose or code fences; keep only the
    # outermost [...] span if one is present.
    bracketed = re.search(r'(\[.*\])', reply, re.DOTALL)
    if bracketed:
        reply = bracketed.group(1)

    try:
        return json.loads(reply)
    except json.JSONDecodeError as e:
        print(f"Failed to parse JSON from ollama response: {e}")
        print(f"Response was: {reply[:500]}...")
        return None
def extract_films_basic(html_content, year):
    """
    Basic extraction without ollama - looks for common patterns.

    Scans for "Title (YYYY)" patterns and keeps plausible film years.
    This is a fallback method and may not work for all years; descriptions
    are not extracted and must be filled in manually.

    Args:
        html_content: Raw HTML/text of the announcement page.
        year: NFR induction year; films dated after it are rejected.

    Returns:
        List of film dicts (title/year/description), or None if nothing
        plausible was found.
    """
    print("Using basic extraction (without ollama)...")
    # Original message suggested a nonexistent "--ollama" flag; ollama is the
    # default and --no-ollama disables it.
    print("Note: This may not capture all details. Ollama extraction (the default) usually gives better results.")

    films = []
    seen_titles = set()

    # Heuristic: a capitalized run followed by a 4-digit year in parentheses,
    # e.g. "Film Title (1999)".  May need adjustment for some pages.
    pattern = r'([A-Z][^(]{3,50})\s*\((\d{4})\)'

    for raw_title, film_year in re.findall(pattern, html_content):
        title = raw_title.strip()
        # Filter out obviously wrong matches and duplicates.
        if not title or len(title) <= 3 or title in seen_titles:
            continue
        # int() cannot fail here: the regex guarantees exactly four digits,
        # so the old try/except ValueError was dead code.
        y = int(film_year)
        if 1890 <= y <= year:  # reasonable film year range
            films.append({
                "title": title,
                "year": y,
                "description": "[Description not extracted - please add manually]"
            })
            seen_titles.add(title)

    return films if films else None
def generate_python_dict(films, year):
    """
    Generate Python source code for the NFR dictionary.

    Args:
        films: List of film dicts (title/year/description).
        year: NFR induction year.

    Returns:
        String containing Python code defining NFR_<year>.
    """
    output = f'''# {year} National Film Registry inductees with LOC descriptions
# Source: [Add URL here]
NFR_{year} = {{'''

    for film in films:
        # Titles are emitted inside double quotes and descriptions inside
        # single quotes, so each needs its matching quote escaped.  Escape
        # backslashes FIRST so the quote escapes aren't double-escaped.
        # (The old code escaped only single quotes for both, so a title
        # containing `"` produced syntactically invalid output.)
        title = film["title"].replace("\\", "\\\\").replace('"', '\\"')
        desc = (
            film["description"]
            .replace("\\", "\\\\")
            .replace("'", "\\'")
            .replace("\n", " ")  # keep each description on one line
        )

        output += f'''
    "{title}": {{
        "year": {film["year"]},
        "description": '{desc}'
    }},'''

    output += "\n}\n"

    return output
def save_nfr_data(films, year, output_path=None):
    """
    Render the films to Python source and write them to disk.

    Args:
        films: List of film dicts (title/year/description).
        year: NFR induction year.
        output_path: Optional path to save to (default: nfr_data/nfr_YEAR.py).

    Returns:
        The Path the data was written to.
    """
    if output_path is None:
        NFR_DATA_DIR.mkdir(exist_ok=True)
        output_path = NFR_DATA_DIR / f"nfr_{year}.py"
    else:
        output_path = Path(output_path)

    code = generate_python_dict(films, year)

    # Explicit UTF-8: descriptions may contain non-ASCII characters, and
    # write_text's default encoding is platform-dependent.
    output_path.write_text(code, encoding="utf-8")
    print(f"\n✓ Saved to {output_path}")
    print(f"\nTo use this data:")
    print(f" 1. Review and edit {output_path} if needed")
    print(f" 2. Copy the NFR_{year} dictionary into scripts/new_nfr.py")
    print(f" 3. Update the script to handle multiple years")

    return output_path
def main():
    """Command-line entry point: locate, fetch, extract, preview, and save NFR data."""
    # The global declaration must precede ANY use of these names in this
    # scope.  The original placed it after parse_args(), while the parser
    # defaults above it already read OLLAMA_HOST/OLLAMA_MODEL — that is a
    # SyntaxError ("name is used prior to global declaration").
    global OLLAMA_HOST, OLLAMA_MODEL

    parser = argparse.ArgumentParser(
        description="Setup NFR data for a specific year"
    )
    parser.add_argument("year", type=int, help="NFR induction year (e.g., 2024)")
    parser.add_argument(
        "--url",
        help="Direct URL to LOC announcement (skip search)"
    )
    parser.add_argument(
        "--output",
        help="Output file path (default: scripts/nfr_data/nfr_YEAR.py)"
    )
    parser.add_argument(
        "--no-ollama",
        action="store_true",
        help="Don't use ollama for extraction (use basic parsing)"
    )
    parser.add_argument(
        "--ollama-host",
        default=OLLAMA_HOST,
        help=f"Ollama server URL (default: {OLLAMA_HOST})"
    )
    parser.add_argument(
        "--ollama-model",
        default=OLLAMA_MODEL,
        help=f"Ollama model to use (default: {OLLAMA_MODEL})"
    )
    args = parser.parse_args()

    # Update ollama config from args so downstream helpers pick them up.
    OLLAMA_HOST = args.ollama_host
    OLLAMA_MODEL = args.ollama_model

    print(f"\n{'='*60}")
    print(f"Setting up NFR data for {args.year}")
    print(f"{'='*60}\n")

    # Get announcement URL
    if args.url:
        urls = {"newsroom_url": args.url}
    else:
        urls = search_for_nfr_announcement(args.year)

    if not urls.get("newsroom_url"):
        print("\nError: No announcement URL found.")
        print("Please provide a URL with --url")
        sys.exit(1)

    # Fetch content
    try:
        html_content = fetch_url_content(urls["newsroom_url"])
    except Exception as e:
        print(f"Error fetching URL: {e}")
        sys.exit(1)

    # Extract films: ollama first (unless disabled), basic parsing as fallback.
    films = None

    if not args.no_ollama:
        try:
            films = extract_films_with_ollama(html_content, args.year)
        except Exception as e:
            print(f"Error using ollama: {e}")
            print("Falling back to basic extraction...")

    if not films:
        films = extract_films_basic(html_content, args.year)

    if not films:
        print("\nError: Could not extract films from announcement.")
        print("Try:")
        # Original hint suggested a nonexistent "--ollama" flag.
        print(" 1. Running again without --no-ollama if you used it")
        print(" 2. Manually creating the dictionary")
        sys.exit(1)

    print(f"\n✓ Extracted {len(films)} films")

    # Show a preview of up to five entries before committing to disk.
    print("\nPreview of extracted films:")
    for i, film in enumerate(films[:5], 1):
        print(f" {i}. {film['title']} ({film['year']})")
        if len(film['description']) > 100:
            print(f" {film['description'][:100]}...")
        else:
            print(f" {film['description']}")

    if len(films) > 5:
        print(f" ... and {len(films) - 5} more")

    # Confirm before writing (default answer is yes; only exact "n" cancels).
    confirm = input("\nSave this data? (Y/n): ").strip().lower()
    if confirm == 'n':
        print("Cancelled")
        sys.exit(0)

    # Save
    output_path = save_nfr_data(films, args.year, args.output)

    print("\n✓ Done!")
if __name__ == "__main__":
|
|
main()
|