feeds/rst2atom.py

# -*- coding: utf-8 -*-
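#
# rst2atom: read a reStructuredText document and print an Atom feed for it
# on stdout.  The input file name is hard-coded below; a typical run
# (assuming test.rst sits next to the script) would be something like
#
#     python3 rst2atom.py > feed.atom
#
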
filename = "test.rst"
generator_uri = "http://git.sdf.org/rst2atom.py"
generator_name = "rst2atom"
# https://docutils.sourceforge.io/docs/api/publisher.html
import docutils.core
import dateutil.parser
import datetime
import bs4
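
# publish_parts() renders the document once and returns a dictionary of
# HTML fragments; the pieces used below are "docinfo", "meta", "title"
# and "body".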
with open(filename, mode="r") as f:
    html = docutils.core.publish_parts(f.read(),
                                       source_path=filename,
                                       writer_name='html')

def extract_docinfos(html):
    """
    Parse the publish_parts dictionary and return a dictionary containing
    the docinfo fields from the reStructuredText document.
    """
    soup1 = bs4.BeautifulSoup(html["docinfo"], 'html.parser')
    return {tr.find("th").text: tr.find("td").text
            for tr in [e.find_parent("tr") for e in
                       soup1.find_all("th", {"class": "docinfo-name"})]}
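# For a document with ``:Author:`` and ``:Contact:`` bibliographic fields,
# the result looks roughly like (values are illustrative):
#     {"Author:": "Jane Doe", "Contact:": "jane@example.org"}
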
def extract_meta(html):
    """
    Parse the publish_parts dictionary and return a dictionary containing
    the metadata from the reStructuredText document.
    """
    soup1 = bs4.BeautifulSoup(html["meta"], 'html.parser')
    return {m.attrs["name"]: m.attrs["content"]
            for m in soup1.find_all("meta", {"name": True,
                                             "content": True})}
def find_date(d):
    """
    Parse the comments in a <div> section and return an ISO 8601 formatted
    string for the first recognized date.
    Raise dateutil.parser.ParserError if no such comment contains a date.
    In the reStructuredText source, the date would typically be written as:

        This is a new entry
        ~~~~~~~~~~~~~~~~~~~

        .. Note: created on 2022/11/05

        Blah...
    """
    comments = d.find_all(string=lambda text: isinstance(text, bs4.Comment))
    for c in comments:
        # extract() also removes the comment from the tree, so it will not
        # leak into the feed content later on
        for k in c.extract().split(" "):
            try:
                t = dateutil.parser.parse(k)
                return t.isoformat()
            except dateutil.parser.ParserError:
                pass
    raise dateutil.parser.ParserError("no date found in comments")
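# For the example comment above, find_date() should return
# "2022-11-05T00:00:00" (dateutil fills in midnight for a bare date).
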
def build_entry(d):
    """
    Print on stdout an Atom <entry> section built from the <div>.
    For now this is a stub: it prints a hard-coded sample entry and does
    not use `d` yet.
    """
print(""" <entry>
<title>Atom-Powered Robots Run Amok</title>
<link href="http://example.org/2003/12/13/atom03" />
<link rel="alternate" type="text/html" href="http://example.org/2003/12/13/atom03.html"/>
<link rel="edit" href="http://example.org/2003/12/13/atom03/edit"/>
<id>urn:uuid:1225c695-cfb8-4ebb-aaaa-80da344efa6a</id>
<published>2003-11-09T17:23:02Z</published>
<content type="xhtml">
<div xmlns="http://www.w3.org/1999/xhtml">
<p>This is the entry content.</p>
</div>
</content>
<author>
<name>John Doe</name>
<email>johndoe@example.com</email>
</author>
</entry>
""")
docinfos = extract_docinfos(html)
meta = extract_meta(html)
print("""<?xml version="1.0" encoding="utf-8"?>
<feed xmlns="http://www.w3.org/2005/Atom">
<id>%s</id>
<title>%s</title>
<author>
<name>%s</name>
<email>%s</email>
</author>
<rights>%s</rights>
<updated>%s</updated>
<link href="%s" />
<link href="%s" rel="self" />
<generator uri="%s">%s</generator>
""" % (
meta["original-source"],
html["title"],
docinfos["Author:"],
docinfos["Contact:"],
meta["copyright"],
datetime.datetime.now().isoformat(),
meta["original-source"],
meta["syndication-source"],
generator_uri, generator_name
))
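
# Each top-level <div> in the rendered body corresponds to one section of
# the source document, i.e. one feed entry.  For now the loop below only
# dumps each section and its date for inspection; build_entry() is not
# wired in yet.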
soup2 = bs4.BeautifulSoup(html["body"], 'html.parser')
divs = soup2.select("div")
for d in divs:
    # don't handle subsections
    if d.find_parent("div"):
        continue
    print("=" * 40)
    print(d)
    try:
        print(find_date(d))
    except dateutil.parser.ParserError:
        pass
print("</feed>")