# feeds/rst2atom.py

# -*- coding: utf-8 -*-

# Path of the RsT document that is read and converted into the Atom feed.
filename = "test.rst"

# Name and URI of this script, advertised in the feed's <generator> element.
generator_name = "rst2atom"
generator_uri = "http://git.sdf.org/rst2atom.py"

import datetime
import xml.sax.saxutils

import bs4
import dateutil.parser
# https://docutils.sourceforge.io/docs/api/publisher.html
import docutils.core

# Read the RsT source inside a context manager so the file handle is closed
# as soon as the text is in memory (the previous bare open() was never
# closed).  The encoding is stated explicitly instead of depending on the
# locale of the machine running the script.
with open(filename, mode="r", encoding="utf-8") as source_file:
    source_text = source_file.read()

# Render the document with the docutils HTML writer.  The result is a
# dictionary of document parts; this script reads its "docinfo", "meta",
# "title" and "body" items.
html = docutils.core.publish_parts(source_text,
                                   source_path=filename,
                                   writer_name='html')


def extract_docinfos(html):
    """
    Collect the docinfo fields of the RsT document from the publish_parts
    dictionary.

    Every <th class="docinfo-name"> cell found in html["docinfo"] selects
    its enclosing <tr> row.  Each row yields one item of the returned
    dictionary: the text of the row's first <th> is the key (kept verbatim,
    e.g. "Author:") and the text of its first <td> is the value.  When two
    rows share the same key, the last one wins.
    """
    docinfo_soup = bs4.BeautifulSoup(html["docinfo"], 'html.parser')
    name_cells = docinfo_soup.find_all("th", {"class": "docinfo-name"})

    # Resolve all the rows first, then read the name/value pair of each one.
    rows = [cell.find_parent("tr") for cell in name_cells]

    fields = {}
    for row in rows:
        name = row.find("th").text
        fields[name] = row.find("td").text
    return fields

def extract_meta(html):
    """
    Collect the metadata of the RsT document from the publish_parts
    dictionary.

    Only the <meta> tags of html["meta"] carrying both a "name" and a
    "content" attribute are considered.  The returned dictionary maps each
    "name" to the matching "content"; when a name appears several times,
    the last occurrence wins.
    """
    meta_soup = bs4.BeautifulSoup(html["meta"], 'html.parser')
    wanted = {"name": True, "content": True}

    metadata = {}
    for tag in meta_soup.find_all("meta", wanted):
        name = tag.attrs["name"]
        metadata[name] = tag.attrs["content"]
    return metadata


def find_date(d):
    """
    Parse the comments in a <div> section and return an ISO8601 formatted
    string being the first recognized date string.
    Raise dateutil.parser.ParserError if no such comment contains a date.

    Each comment is split on single spaces and the resulting words are
    handed one by one to dateutil.parser.parse(); the first word accepted
    by the parser gives the result.  Words the parser rejects, including
    words it cannot represent (OverflowError, e.g. a very long run of
    digits), are skipped.

    Side effect: every comment inspected before the function returns is
    detached from the tree with extract().

    In the RestructuredText, the date would be typically written as:

        This is a new entry
        ~~~~~~~~~~~~~~~~~~~
        .. Note: created on 2022/11/05

        Blah...
    """
    comments = d.find_all(string=lambda text: isinstance(text, bs4.Comment))
    for c in comments:
        for k in c.extract().split(" "):
            try:
                t = dateutil.parser.parse(k)
            except (dateutil.parser.ParserError, OverflowError):
                # Not a date: try the next word.
                continue
            return t.isoformat()
    raise dateutil.parser.ParserError("no date found in the section comments")


def build_entry(d):
    """
    Print on stdout an Atom <entry> section built from the <div>.

    The entry title is the text of the first <h1> found in the <div>,
    escaped so that characters such as "&" or "<" cannot break the XML
    output.  The <updated> element receives the date returned by
    find_date(), or stays empty when no comment of the <div> holds a date.

    A <div> without any <h1> is not a titled section: it is skipped and
    nothing is printed.

    Returns None.
    """
    heading = d.find("h1")
    if heading is None:
        # Nothing to use as a title, so no entry is emitted for this <div>.
        return

    # Get the date
    mydate = ""
    try:
        mydate = find_date(d)
    except dateutil.parser.ParserError:
        pass

    # NOTE(review): the <id> and the <content> below are hard-coded, so
    # every entry is printed with the same identifier and the same
    # placeholder text; <updated> is empty when no date was found.
    print("""    <entry>
        <title>%s</title>
        <id>urn:uuid:1225c695-cfb8-4ebb-aaaa-80da344efa6a</id>
        <updated>%s</updated>
        <content type="xhtml">
            <div xmlns="http://www.w3.org/1999/xhtml">
                    <p>This is the entry content.</p>
            </div>
        </content>
    </entry>
        """ % (
            xml.sax.saxutils.escape(heading.text),
            mydate
        ))


# Document-level fields used to fill in the feed header.
docinfos = extract_docinfos(html)
meta = extract_meta(html)

# Parse the rendered body and collect every <div> it contains, nested ones
# included; the entry loop filters out the nested ones.
soup2 = bs4.BeautifulSoup(html["body"], "html.parser")
divs = soup2.select("div")

# Print the XML declaration and the feed-level elements.  The <feed> element
# opened here is left open for the entries that follow.
#
# The docinfo and meta values are plain text, so they are escaped before
# being inserted: "&", "<" and ">" everywhere, plus the double quote for
# the values placed inside href="...".
# NOTE(review): html["title"] is inserted as produced by docutils;
# presumably the HTML writer has already encoded it. Confirm before
# escaping it a second time.
# The <updated> timestamp is taken in UTC so that it carries an explicit
# offset, as Atom date constructs require.
print("""<?xml version="1.0" encoding="utf-8"?>
<feed xmlns="http://www.w3.org/2005/Atom">
    <id>%s</id>
    <title>%s</title>
    <author>
        <name>%s</name>
        <email>%s</email>
    </author>
    <rights>%s</rights>
    <updated>%s</updated>
    <link href="%s" />
    <link href="%s" rel="self" />
    <generator uri="%s">%s</generator>
""" % (
         xml.sax.saxutils.escape(meta["original-source"]),
         html["title"],
         xml.sax.saxutils.escape(docinfos["Author:"]),
         xml.sax.saxutils.escape(docinfos["Contact:"]),
         xml.sax.saxutils.escape(meta["copyright"]),
         datetime.datetime.now(datetime.timezone.utc).isoformat(),
         xml.sax.saxutils.escape(meta["original-source"], {'"': "&quot;"}),
         xml.sax.saxutils.escape(meta["syndication-source"], {'"': "&quot;"}),
         generator_uri, generator_name
))

# Emit one <entry> per top-level <div>.  A <div> nested inside another
# <div> is a subsection and gets no entry of its own.
for d in divs:
    if d.find_parent("div") is None:
        build_entry(d)

# Close the <feed> element opened by the header.
print("</feed>")
Initial commit 2022-11-04 23:19:53 +00:00			`# -- coding: utf-8 --`

			`filename="test.rst"`
Update 2022-11-05 11:13:14 +00:00			`generator_uri = "http://git.sdf.org/rst2atom.py"`
			`generator_name = "rst2atom"`
Initial commit 2022-11-04 23:19:53 +00:00
			`# https://docutils.sourceforge.io/docs/api/publisher.html`

			`import docutils.core`
			`import dateutil.parser`
Update 2022-11-05 10:35:34 +00:00			`import datetime`
Initial commit 2022-11-04 23:19:53 +00:00			`import bs4`

			`html = docutils.core.publish_parts(open(filename, mode="r").read(),`
			`source_path=filename,`
			`writer_name='html')`

Update 2022-11-05 10:35:34 +00:00
			`def extract_docinfos(html):`
Update 2022-11-05 10:37:18 +00:00			`"""`
			`Parse the publish_parts dictionary and return a dictionary containing`
			`the docinfos from the RsT document.`
			`"""`
Update 2022-11-05 10:35:34 +00:00			`soup1 = bs4.BeautifulSoup(html["docinfo"], 'html.parser')`
			`return { tr.find("th").text:`
			`tr.find("td").text`
			`for tr in [`
			`e.find_parent("tr") for e in`
			`soup1.find_all("th", {"class": "docinfo-name"})`
			`] }`

Update 2022-11-05 11:13:14 +00:00			`def extract_meta(html):`
			`"""`
			`Parse the publish_parts dictionary and return a dictionary containing`
			`the metadata from the RsT document.`
			`"""`
			`soup1 = bs4.BeautifulSoup(html["meta"], 'html.parser')`
			`return { m.attrs["name"]: m.attrs["content"]`
			`for m in soup1.find_all("meta", {"name": True,`
			`"content": True}) }`

Update 2022-11-05 10:35:34 +00:00
Initial commit 2022-11-04 23:19:53 +00:00			`def find_date(d):`
Update 2022-11-05 10:35:34 +00:00			`"""`
			`Parse the comments in a <div> section and return a an ISO8601 formatted`
			`string being the first recognized date string.`
			`Raise dateutil.parser.ParserError if no such comment contains a date.`

			`In the RestructuredText, the date would be typically written as:`

			`This is a new entry`
			`~~~~~~~~~~~~~~~~~~~`
			`.. Note: created on 2022/11/05`

			`Blah...`
			`"""`
Initial commit 2022-11-04 23:19:53 +00:00			`comments = d.find_all(string=lambda text: isinstance(text, bs4.Comment))`
			`for c in comments:`
			`for k in c.extract().split(" "):`
			`try:`
			`t = dateutil.parser.parse(k)`
Update 2022-11-05 10:35:34 +00:00			`return t.isoformat()`
Initial commit 2022-11-04 23:19:53 +00:00			`except dateutil.parser.ParserError:`
			`pass`
Update 2022-11-05 10:35:34 +00:00			`raise dateutil.parser.ParserError`
Initial commit 2022-11-04 23:19:53 +00:00

Update 2022-11-05 10:35:34 +00:00			`def build_entry(d):`
			`"""`
			`Print on stdout an Atom <entry> section built from the <div>.`
			`"""`
Update 2022-11-05 11:22:32 +00:00			`# Get the date`
			`mydate = ""`
			`try:`
			`mydate = find_date(d)`
			`except dateutil.parser.ParserError:`
			`pass`
Update 2022-11-05 10:35:34 +00:00			`print(""" <entry>`
Update 2022-11-05 11:22:32 +00:00			`<title>%s</title>`
Update 2022-11-05 10:35:34 +00:00			`<id>urn:uuid:1225c695-cfb8-4ebb-aaaa-80da344efa6a</id>`
Update 2022-11-05 11:22:32 +00:00			`<updated>%s</updated>`
			`<content type="xhtml">`
			`<div xmlns="http://www.w3.org/1999/xhtml">`
			`<p>This is the entry content.</p>`
			`</div>`
			`</content>`
			`</entry>`
			`""" % (`
			`d.find("h1").text,`
			`mydate`
			`))`
Initial commit 2022-11-04 23:19:53 +00:00
Update 2022-11-05 11:13:14 +00:00

			`docinfos = extract_docinfos(html)`
			`meta = extract_meta(html)`

Update 2022-11-05 11:22:32 +00:00			`soup2 = bs4.BeautifulSoup(html["body"], 'html.parser')`
			`divs = soup2.select("div")`

Update 2022-11-05 10:35:34 +00:00			`print("""<?xml version="1.0" encoding="utf-8"?>`
			`<feed xmlns="http://www.w3.org/2005/Atom">`
Update 2022-11-05 11:13:14 +00:00			`<id>%s</id>`
Update 2022-11-05 10:35:34 +00:00			`<title>%s</title>`
Update 2022-11-05 11:13:14 +00:00			`<author>`
			`<name>%s</name>`
			`<email>%s</email>`
			`</author>`
			`<rights>%s</rights>`
Update 2022-11-05 10:35:34 +00:00			`<updated>%s</updated>`
Update 2022-11-05 11:13:14 +00:00			`<link href="%s" />`
			`<link href="%s" rel="self" />`
			`<generator uri="%s">%s</generator>`
Update 2022-11-05 10:35:34 +00:00			`""" % (`
Update 2022-11-05 11:13:14 +00:00			`meta["original-source"],`
Update 2022-11-05 10:35:34 +00:00			`html["title"],`
Update 2022-11-05 11:13:14 +00:00			`docinfos["Author:"],`
			`docinfos["Contact:"],`
			`meta["copyright"],`
			`datetime.datetime.now().isoformat(),`
			`meta["original-source"],`
			`meta["syndication-source"],`
			`generator_uri, generator_name`
Update 2022-11-05 10:35:34 +00:00			`))`
Initial commit 2022-11-04 23:19:53 +00:00
			`for d in divs:`
			`# don't handle subsections`
			`if d.find_parent("div"): continue`
Update 2022-11-05 11:22:32 +00:00			`build_entry(d)`
Update 2022-11-05 10:35:34 +00:00
			`print("</feed>")`