# feeds/rst2atom.py  (135 lines, 3.6 KiB, Python)
# -*- coding: utf-8 -*-
"""Convert a reStructuredText document into an Atom feed on stdout."""
import sys

# The document to convert is the single command-line argument.
filename = sys.argv[1]

# Advertised in the feed's <generator> element.
generator_uri = "https://git.sdf.org/baruchel/feeds/src/branch/master/rst2atom.py"
generator_name = "rst2atom"

import docutils.core
import dateutil.parser
import datetime
import bs4

# Render the RsT source once; the resulting parts dictionary is mined
# below for docinfo fields, <meta> tags, the title and the HTML body.
# A context manager closes the file (the original leaked the handle),
# and the encoding is pinned instead of depending on the locale.
with open(filename, mode="r", encoding="utf-8") as source_file:
    html = docutils.core.publish_parts(source_file.read(),
                                       source_path=filename,
                                       writer_name='html')
def extract_docinfos(html):
    """
    Parse the publish_parts dictionary and return a dictionary containing
    the docinfos from the RsT document.

    Keys are the <th> labels (e.g. "Author:"), values the matching <td>
    text, taken from the rows of the rendered docinfo table.
    """
    soup = bs4.BeautifulSoup(html["docinfo"], 'html.parser')
    infos = {}
    # Each docinfo name sits in a <th class="docinfo-name"> inside a <tr>;
    # walk up to the row and pair its header text with its cell text.
    for header in soup.find_all("th", {"class": "docinfo-name"}):
        row = header.find_parent("tr")
        infos[row.find("th").text] = row.find("td").text
    return infos
def extract_meta(html):
    """
    Parse the publish_parts dictionary and return a dictionary containing
    the metadata from the RsT document.

    Only <meta> tags carrying both a "name" and a "content" attribute are
    kept; the result maps each name to its content.
    """
    soup = bs4.BeautifulSoup(html["meta"], 'html.parser')
    meta = {}
    for tag in soup.find_all("meta", {"name": True, "content": True}):
        meta[tag.attrs["name"]] = tag.attrs["content"]
    return meta
def find_date(d):
    """
    Parse the comments in a <div> section and return an RFC 3339 formatted
    string built from the first recognized date string.

    Each comment is removed (extracted) from the tree as it is examined,
    so date comments never leak into the generated feed; comments after
    the first recognized date are left in place.

    Raise dateutil.parser.ParserError if no comment contains a date.

    In the RestructuredText, the date would be typically written as:

        This is a new entry
        ~~~~~~~~~~~~~~~~~~~

        .. Note: created on 2022/11/05

        Blah...
    """
    comments = d.find_all(string=lambda text: isinstance(text, bs4.Comment))
    for c in comments:
        # extract() drops the comment node from the tree (side effect) and
        # returns its text; try each whitespace-separated token as a date.
        for token in c.extract().split(" "):
            try:
                t = dateutil.parser.parse(token)
                # NOTE(review): the parsed datetime is naive; appending "Z"
                # assumes the author wrote the date in UTC.
                return t.isoformat() + "Z"
            except (dateutil.parser.ParserError, OverflowError):
                # parse() raises OverflowError for huge numeric tokens
                # (documented dateutil behavior); treat it like "no date".
                pass
    raise dateutil.parser.ParserError("no date found in comments")
def build_entry(d, base_url):
    """
    Print on stdout an Atom <entry> section built from the <div>.

    d        -- a BeautifulSoup <div> holding one entry; its first <h1>
                is the title, an optional comment holds the date.
    base_url -- the feed's base URL, used to build the entry <id>.
    """
    from xml.sax.saxutils import escape
    mytitle = d.find("h1").text
    # An entry without a recognizable date keeps an empty <updated> field.
    mydate = ""
    try:
        mydate = find_date(d)
    except dateutil.parser.ParserError:
        pass
    # The title comes from bs4 .text (entities decoded), so it must be
    # XML-escaped or characters like '&'/'<' break the feed; the <div>
    # content is already HTML and is embedded verbatim.
    print(""" <entry>
<id>%s</id>
<title>%s</title>
<updated>%s</updated>
<content type="xhtml">
<div xmlns="http://www.w3.org/1999/xhtml">%s</div>
</content>
</entry>
""" % (
        escape(base_url + "#" + mytitle.lower().replace(" ", "-")
               + "_(" + mydate + ")"),
        escape(mytitle), mydate, d
    ))
# ---- Main script: emit the feed header, one <entry> per top-level
# ---- <div>, then close the feed.
from xml.sax.saxutils import escape

docinfos = extract_docinfos(html)
meta = extract_meta(html)
soup2 = bs4.BeautifulSoup(html["body"], 'html.parser')
divs = soup2.select("div")

# datetime.utcnow() is deprecated; an aware "now" stripped of its offset
# yields the identical RFC 3339 string once "Z" is appended.
now_utc = datetime.datetime.now(datetime.timezone.utc)
updated = now_utc.replace(tzinfo=None).isoformat() + "Z"

# Docinfo/meta values were entity-decoded by bs4, so they must be
# XML-escaped before interpolation.  html["title"] is left as-is:
# NOTE(review): docutils parts appear to be already HTML-escaped —
# escaping again would double-encode entities; confirm against docutils.
print("""<?xml version="1.0" encoding="utf-8"?>
<feed xmlns="http://www.w3.org/2005/Atom">
<id>%s</id>
<title>%s</title>
<subtitle>%s</subtitle>
<author>
<name>%s</name>
<email>%s</email>
</author>
<rights>%s</rights>
<updated>%s</updated>
<link href="%s" />
<link href="%s" rel="self" />
<generator uri="%s">%s</generator>
""" % (
    escape(meta["original-source"]),
    html["title"],
    escape(meta["description"]),
    escape(docinfos["Author:"]),
    escape(docinfos["Contact:"]),
    escape(meta["copyright"]),
    updated,
    escape(meta["original-source"], {'"': "&quot;"}),
    escape(meta["syndication-source"], {'"': "&quot;"}),
    generator_uri, generator_name
))

for d in divs:
    # Subsections are nested <div>s; only top-level divs become entries.
    if d.find_parent("div"):
        continue
    build_entry(d, meta["original-source"])
print("</feed>")