Add script directory to sys.path so imports work when scripts are called from remote_publish.sh. Also remove unused variable and no-op regex.
#!/usr/bin/env python3
"""
Convert Hugo markdown posts to gopher-formatted text files.

Usage:
    python scripts/gopher/convert_to_gopher.py content/posts/blog-posting.md
    python scripts/gopher/convert_to_gopher.py --all
    python scripts/gopher/convert_to_gopher.py --all --output gopher_build/blog/
"""

import argparse
import re
import sys
import textwrap
from pathlib import Path

import yaml

# Add script directory to path so the ascii_art import below resolves even
# when this script is invoked from elsewhere (e.g. remote_publish.sh)
sys.path.insert(0, str(Path(__file__).parent))

from ascii_art import (
    LINE_WIDTH,
    POST_HEADER,
    POST_FOOTER,
    SERIES_TO_DIR,
    get_post_meta_header,
    get_title_block,
    get_section_header,
    get_subheading,
    generate_movie_table,
    format_links_section,
)

# Paths
SCRIPT_DIR = Path(__file__).parent
PROJECT_ROOT = SCRIPT_DIR.parent.parent
CONTENT_DIR = PROJECT_ROOT / "content" / "posts"
DEFAULT_OUTPUT = PROJECT_ROOT / "gopher_build" / "blog"


def parse_frontmatter(content: str) -> tuple[dict, str]:
    """Parse YAML frontmatter and return (metadata, body)."""
    if not content.startswith("---"):
        return {}, content

    # Find the closing ---
    end_match = re.search(r"\n---\n", content[3:])
    if not end_match:
        return {}, content

    yaml_end = end_match.start() + 3
    yaml_content = content[3:yaml_end]
    body = content[yaml_end + 4 :]  # Skip past "\n---"; body keeps the newline after the closing delimiter

    try:
        metadata = yaml.safe_load(yaml_content)
    except yaml.YAMLError:
        metadata = {}

    return metadata or {}, body
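
# A quick sketch of the split this performs (hypothetical input, not taken
# from the repo): given "---\ntitle: Hi\n---\nBody", parse_frontmatter
# returns ({"title": "Hi"}, "\nBody") -- note the body keeps the newline
# that follows the closing delimiter.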


def extract_links(text: str) -> tuple[str, list]:
    """Extract markdown links and replace with numbered references."""
    links = []
    link_pattern = re.compile(r"\[([^\]]+)\]\(([^)]+)\)")

    def replace_link(match):
        label = match.group(1)
        url = match.group(2)
        links.append(url)
        return f"{label} [{len(links)}]"

    converted = link_pattern.sub(replace_link, text)
    return converted, links
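
# Sketch of the rewrite this produces (made-up example):
#   extract_links("see [my post](https://example.com) here")
#   -> ("see my post [1] here", ["https://example.com"])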


def convert_headings(text: str) -> str:
    """Convert markdown headings to gopher-style text."""

    # H1: Double line with centered text
    def h1_replace(match):
        title = match.group(1).strip()
        return get_section_header(title)

    # H2: Dashed subheading
    def h2_replace(match):
        title = match.group(1).strip()
        return get_subheading(title)

    # H3-H6: Just bold-style text
    def h3_replace(match):
        title = match.group(1).strip()
        return f"\n*{title}*\n"

    text = re.sub(r"^# (.+)$", h1_replace, text, flags=re.MULTILINE)
    text = re.sub(r"^## (.+)$", h2_replace, text, flags=re.MULTILINE)
    text = re.sub(r"^###+ (.+)$", h3_replace, text, flags=re.MULTILINE)

    return text
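
# Only the H3+ case is rendered inline above; H1/H2 delegate to the
# ascii_art helpers, whose exact banners live in that module. As a
# hypothetical example, a "### Notes" line becomes "\n*Notes*\n".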


def convert_formatting(text: str) -> str:
    """Convert markdown formatting to gopher-style text."""
    # Italic first: *text* -> _text_ (the lookarounds skip the doubled
    # asterisks of bold spans)
    text = re.sub(r"(?<!\*)\*([^*]+)\*(?!\*)", r"_\1_", text)

    # Bold: **text** -> *text*. This must run after the italic pass;
    # otherwise the single asterisks it produces would themselves be
    # re-matched as italics.
    text = re.sub(r"\*\*([^*]+)\*\*", r"*\1*", text)

    # Horizontal rules
    text = re.sub(r"^---+$", "-" * LINE_WIDTH, text, flags=re.MULTILINE)

    return text
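
# Ordering sketch (hypothetical input): in "**bold** and *ital*" the
# italic rule rewrites only "*ital*" (the doubled asterisks of the bold
# span fail its lookarounds), then the bold rule fires, giving
# "*bold* and _ital_".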


def convert_code_blocks(text: str) -> str:
    """Convert code blocks to indented text."""

    # Fenced code blocks
    def indent_code(match):
        code = match.group(2).rstrip("\n")  # drop the newline before the closing fence
        # Indent each line by 4 spaces
        indented = "\n".join("    " + line for line in code.split("\n"))
        return f"\n{indented}\n"

    text = re.sub(r"```(\w*)\n(.*?)```", indent_code, text, flags=re.DOTALL)

    # Inline code: `code` -> code (just remove backticks)
    text = re.sub(r"`([^`]+)`", r"\1", text)

    return text
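
# Sketch (made-up snippet): "```sh\necho hi\n```" is rewritten to
# "\n    echo hi\n", and an inline span like `ls -la` simply loses its
# backticks.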


def handle_imdbposter(text: str, metadata: dict, slug: str) -> str:
    """Replace imdbposter shortcode with ASCII movie table."""
    # Check if there's an imdbposter shortcode
    pattern = r"\{\{<\s*imdbposter\s*>\}\}(.*?)\{\{<\s*/imdbposter\s*>\}\}"
    match = re.search(pattern, text, flags=re.DOTALL)

    if not match:
        return text

    # Extract movie info from frontmatter
    title = metadata.get("title", "Unknown")
    year = metadata.get("year", "")
    director = metadata.get("director", "")
    runtime = metadata.get("runtime", 0)
    genres = metadata.get("genres", [])
    web_url = f"https://mnw.sdf.org/posts/{slug}/"

    # If year not in frontmatter, try to parse from date
    if not year and metadata.get("date"):
        date_str = str(metadata.get("date"))
        year_match = re.match(r"(\d{4})", date_str)
        if year_match:
            year = int(year_match.group(1))

    # Generate the ASCII table
    table = generate_movie_table(
        title=title,
        year=year,
        director=director,
        runtime=runtime,
        genres=genres,
        web_url=web_url,
    )

    # Also extract and preserve the viewing info table if present
    inner_content = match.group(1).strip()
    if inner_content:
        # Convert markdown table to plain text
        table_lines = []
        for line in inner_content.split("\n"):
            line = line.strip()
            if line and not line.startswith("|--"):
                # Remove leading/trailing pipes and clean up
                line = re.sub(r"^\||\|$", "", line)
                cells = [c.strip() for c in line.split("|")]
                if len(cells) >= 2:
                    table_lines.append(f"  {cells[0]}: {cells[1]}")
        if table_lines:
            table += "\n\n" + "\n".join(table_lines)

    return text[: match.start()] + table + text[match.end() :]
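
# Sketch of the inner-table handling (hypothetical shortcode body): a row
# such as "| Watched | 2026-01-01 |" survives as "  Watched: 2026-01-01",
# while "|---|---|" separator rows are dropped.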


def wrap_paragraphs(text: str) -> str:
    """Wrap text to LINE_WIDTH, preserving code blocks and structure."""
    lines = text.split("\n")
    result = []
    paragraph = []

    def flush_paragraph():
        if paragraph:
            para_text = " ".join(paragraph)
            wrapped = textwrap.fill(para_text, width=LINE_WIDTH)
            result.append(wrapped)
            paragraph.clear()

    for line in lines:
        # Detect code blocks (indented by 4 spaces)
        if line.startswith("    "):
            flush_paragraph()
            result.append(line)
            continue

        # Empty line = paragraph break
        if not line.strip():
            flush_paragraph()
            result.append("")
            continue

        # Lines that look like headers or dividers - don't wrap
        if (
            line.startswith("=")
            or line.startswith("-" * 10)
            or line.startswith("*")
            or line.startswith(" " * 10)
        ):
            flush_paragraph()
            result.append(line)
            continue

        # Accumulate paragraph text
        paragraph.append(line.strip())

    flush_paragraph()
    return "\n".join(result)


def convert_post(filepath: Path, output_dir: Path | None = None) -> Path | None:
    """Convert a single markdown post to gopher format."""
    content = filepath.read_text()
    metadata, body = parse_frontmatter(content)

    # Check if phlog is enabled
    if not metadata.get("phlog", False):
        return None

    # Check if draft
    if metadata.get("draft", False):
        return None

    # Determine output directory from series
    series = metadata.get("series", "Fun Center")
    gopher_dir = SERIES_TO_DIR.get(series, "fun-center")

    # Get slug from filename
    slug = filepath.stem

    # Output path
    output_dir = output_dir or DEFAULT_OUTPUT
    category_dir = output_dir / gopher_dir
    category_dir.mkdir(parents=True, exist_ok=True)
    output_path = category_dir / f"{slug}.txt"

    # Handle imdbposter shortcode first
    body = handle_imdbposter(body, metadata, slug)

    # Extract links before other conversions
    body, links = extract_links(body)

    # Convert markdown to gopher text
    body = convert_headings(body)
    body = convert_formatting(body)
    body = convert_code_blocks(body)

    # Wrap paragraphs
    body = wrap_paragraphs(body)

    # Build the final document
    date_str = ""
    if metadata.get("date"):
        date_obj = metadata["date"]
        if hasattr(date_obj, "strftime"):
            date_str = date_obj.strftime("%Y-%m-%d")
        else:
            date_str = str(date_obj)[:10]

    title = metadata.get("title", slug)
    summary = metadata.get("summary", "")

    parts = [
        POST_HEADER,
        get_post_meta_header(date_str, series),
        get_title_block(title, summary),
        "",
        "-" * LINE_WIDTH,
        "",
        body.strip(),
        "",
    ]

    if links:
        parts.append(format_links_section(links))
        parts.append("")

    parts.append(POST_FOOTER)
    parts.append("")
    parts.append(f"Web version: https://mnw.sdf.org/posts/{slug}/")

    output_content = "\n".join(parts)
    output_path.write_text(output_content)

    return output_path


def convert_all(output_dir: Path | None = None) -> list[Path]:
    """Convert all posts with phlog: true."""
    output_dir = output_dir or DEFAULT_OUTPUT
    converted = []

    for post_path in CONTENT_DIR.glob("*.md"):
        result = convert_post(post_path, output_dir)
        if result:
            converted.append(result)
            print(f"Converted: {post_path.name} -> {result.relative_to(output_dir)}")

    return converted


def main():
    parser = argparse.ArgumentParser(description="Convert Hugo posts to gopher format")
    parser.add_argument("file", nargs="?", help="Single file to convert")
    parser.add_argument("--all", action="store_true", help="Convert all phlog posts")
    parser.add_argument("--output", "-o", type=Path, help="Output directory")
    args = parser.parse_args()

    output = args.output or DEFAULT_OUTPUT

    if args.all:
        converted = convert_all(output)
        print(f"\nConverted {len(converted)} posts to {output}")
    elif args.file:
        filepath = Path(args.file)
        if not filepath.exists():
            print(f"File not found: {filepath}")
            return 1
        result = convert_post(filepath, output)
        if result:
            print(f"Converted: {result}")
        else:
            print("Post skipped (no phlog: true, or it is a draft)")
    else:
        parser.print_help()
        return 1

    return 0


if __name__ == "__main__":
    sys.exit(main())
|