feeds/rst2atom.py

# -*- coding: utf-8 -*-

import sys

# Path of the reStructuredText source; it is the first (and only expected)
# command-line argument.  Exit with a usage message rather than an
# IndexError traceback when it is missing.
if len(sys.argv) < 2:
    sys.exit("usage: %s FILE.rst" % sys.argv[0])
filename = sys.argv[1]

# Values written into the <generator> element of the feed.
generator_uri = "https://git.sdf.org/baruchel/feeds/src/branch/master/rst2atom.py"
generator_name = "rst2atom"

import docutils.core
import dateutil.parser
import datetime
import bs4

# Render the RsT source to HTML parts.  The file is read inside a ``with``
# block so the handle is closed deterministically; the encoding is left to
# the platform default, exactly as before.
with open(filename, mode="r") as source_file:
    html = docutils.core.publish_parts(source_file.read(),
                                       source_path=filename,
                                       writer_name='html')


def extract_docinfos(html):
    """
    Build a dictionary of the docinfo fields of the RsT document.

    The "docinfo" entry of the publish_parts dictionary is parsed; for each
    <th class="docinfo-name"> cell, the enclosing <tr> row is looked up and
    the text of its first <th> is mapped to the text of its first <td>.
    """
    docinfo_soup = bs4.BeautifulSoup(html["docinfo"], 'html.parser')
    name_cells = docinfo_soup.find_all("th", {"class": "docinfo-name"})
    # Resolve every row first, then read the cells, as a two-step process.
    rows = [cell.find_parent("tr") for cell in name_cells]
    fields = {}
    for row in rows:
        field_name = row.find("th").text
        field_value = row.find("td").text
        fields[field_name] = field_value
    return fields

def extract_meta(html):
    """
    Build a dictionary of the metadata of the RsT document.

    The "meta" entry of the publish_parts dictionary is parsed; every <meta>
    tag carrying both a "name" and a "content" attribute contributes one
    name -> content pair.
    """
    meta_soup = bs4.BeautifulSoup(html["meta"], 'html.parser')
    required_attrs = {"name": True, "content": True}
    metadata = {}
    for tag in meta_soup.find_all("meta", required_attrs):
        key = tag.attrs["name"]
        metadata[key] = tag.attrs["content"]
    return metadata


def find_date(d):
    """
    Parse the comments in a <div> section and return an RFC 3339 formatted
    string being the first recognized date string.
    Raise dateutil.parser.ParserError if no such comment contains a date.

    Each comment is split on whitespace and every word is handed to
    dateutil.parser.parse; the first word that parses wins.  A result that
    carries a timezone is converted to UTC before formatting, so the trailing
    "Z" is always truthful; a naive result is taken to be UTC already.

    Side effect: every comment that is examined is detached from the tree
    (via extract()), so it no longer appears when the <div> is rendered.

    In the RestructuredText, the date would be typically written as:

        This is a new entry
        ~~~~~~~~~~~~~~~~~~~
        .. Note: created on 2022/11/05

        Blah...
    """
    comments = d.find_all(string=lambda text: isinstance(text, bs4.Comment))
    for c in comments:
        # split() without argument also separates on newlines and tabs, so a
        # date sitting on its own line inside the comment is still found.
        for k in c.extract().split():
            try:
                t = dateutil.parser.parse(k)
            except (dateutil.parser.ParserError, OverflowError):
                # Not a date (or a number too large to be one): try next word.
                continue
            if t.tzinfo is not None:
                # isoformat() of an aware datetime ends with an offset;
                # appending "Z" to that would give an invalid timestamp.
                t = t.astimezone(datetime.timezone.utc).replace(tzinfo=None)
            return t.isoformat() + "Z"
    raise dateutil.parser.ParserError("no date found in the comments")


def build_entry(d, base_url):
    """
    Print on stdout an Atom <entry> section built from the <div>.

    d is the parsed <div> of one section; its first <h1> provides the title.
    base_url is the URL the entry id is derived from: the id is
    base_url + "#" + the lowercased title with spaces turned into dashes,
    followed by "_(" + date + ")".

    The date comes from find_date(); when no date is found the <updated>
    element is left empty.  A <div> without any <h1> is skipped (nothing is
    printed) since there is no title to build an entry from.

    The title and the id are XML-escaped; the <div> itself is emitted as
    markup inside the xhtml content.
    """
    # Local import: keeps this function self-contained.
    from xml.sax.saxutils import escape

    heading = d.find("h1")
    if heading is None:
        # Not a titled section (d.find returned nothing): do not emit a
        # broken entry and do not abort the feed half-way through.
        return
    mytitle = heading.text
    # Get the date
    mydate = ""
    try:
        mydate = find_date(d)
    except dateutil.parser.ParserError:
        pass
    entry_id = (base_url + "#" + mytitle.lower().replace(" ", "-")
                + "_(" + mydate + ")")
    print("""    <entry>
        <id>%s</id>
        <title>%s</title>
        <updated>%s</updated>
        <content type="xhtml">
            <div xmlns="http://www.w3.org/1999/xhtml">%s</div>
        </content>
    </entry>
        """ % (
            # Heading text is plain text at this point, so "&", "<" and ">"
            # must be escaped to keep the feed well-formed.
            escape(entry_id),
            escape(mytitle), mydate, d
        ))


# Main part of the script: print the Atom feed header, one <entry> per
# top-level <div> of the rendered body, then the closing tag.
from xml.sax.saxutils import escape

docinfos = extract_docinfos(html)
meta = extract_meta(html)

soup2 = bs4.BeautifulSoup(html["body"], 'html.parser')
divs = soup2.select("div")

# Extra entity for values placed inside double-quoted XML attributes.
_attr_entities = {'"': "&quot;"}

# The values read back through BeautifulSoup (meta attributes, docinfo cell
# text) are plain text, so they are escaped before being put into the XML.
# NOTE(review): html["title"] is passed through untouched on the assumption
# that docutils already delivers it HTML-encoded; confirm against the
# docutils publish_parts documentation.
print("""<?xml version="1.0" encoding="utf-8"?>
<feed xmlns="http://www.w3.org/2005/Atom">
    <id>%s</id>
    <title>%s</title>
    <subtitle>%s</subtitle>
    <author>
        <name>%s</name>
        <email>%s</email>
    </author>
    <rights>%s</rights>
    <updated>%s</updated>
    <link href="%s" />
    <link href="%s" rel="self" />
    <generator uri="%s">%s</generator>
""" % (
         escape(meta["original-source"]),
         html["title"],
         escape(meta["description"]),
         escape(docinfos["Author:"]),
         escape(docinfos["Contact:"]),
         escape(meta["copyright"]),
         # Same text as utcnow().isoformat() + "Z", without relying on the
         # deprecated datetime.utcnow().
         datetime.datetime.now(datetime.timezone.utc)
                          .replace(tzinfo=None).isoformat() + "Z",
         escape(meta["original-source"], _attr_entities),
         escape(meta["syndication-source"], _attr_entities),
         generator_uri, generator_name
))

for d in divs:
    # don't handle subsections
    if d.find_parent("div"):
        continue
    build_entry(d, meta["original-source"])

print("</feed>")
Initial commit 2022-11-04 23:19:53 +00:00			`# -- coding: utf-8 --`

Update 2022-11-05 12:23:14 +00:00			`import sys`

			`filename=sys.argv[1]`

Update 2022-11-05 12:31:27 +00:00			`generator_uri = "https://git.sdf.org/baruchel/feeds/src/branch/master/rst2atom.py"`
Update 2022-11-05 11:13:14 +00:00			`generator_name = "rst2atom"`
Initial commit 2022-11-04 23:19:53 +00:00
			`import docutils.core`
			`import dateutil.parser`
Update 2022-11-05 10:35:34 +00:00			`import datetime`
Initial commit 2022-11-04 23:19:53 +00:00			`import bs4`

			`html = docutils.core.publish_parts(open(filename, mode="r").read(),`
			`source_path=filename,`
			`writer_name='html')`

Update 2022-11-05 10:35:34 +00:00
			`def extract_docinfos(html):`
Update 2022-11-05 10:37:18 +00:00			`"""`
			`Parse the publish_parts dictionary and return a dictionary containing`
			`the docinfos from the RsT document.`
			`"""`
Update 2022-11-05 10:35:34 +00:00			`soup1 = bs4.BeautifulSoup(html["docinfo"], 'html.parser')`
			`return { tr.find("th").text:`
			`tr.find("td").text`
			`for tr in [`
			`e.find_parent("tr") for e in`
			`soup1.find_all("th", {"class": "docinfo-name"})`
			`] }`

Update 2022-11-05 11:13:14 +00:00			`def extract_meta(html):`
			`"""`
			`Parse the publish_parts dictionary and return a dictionary containing`
			`the metadata from the RsT document.`
			`"""`
			`soup1 = bs4.BeautifulSoup(html["meta"], 'html.parser')`
			`return { m.attrs["name"]: m.attrs["content"]`
			`for m in soup1.find_all("meta", {"name": True,`
			`"content": True}) }`

Update 2022-11-05 10:35:34 +00:00
Initial commit 2022-11-04 23:19:53 +00:00			`def find_date(d):`
Update 2022-11-05 10:35:34 +00:00			`"""`
Fixed stuff 2022-11-05 14:38:35 +00:00			`Parse the comments in a <div> section and return a an RFC 3339 formatted`
Update 2022-11-05 10:35:34 +00:00			`string being the first recognized date string.`
			`Raise dateutil.parser.ParserError if no such comment contains a date.`

			`In the RestructuredText, the date would be typically written as:`

			`This is a new entry`
			`~~~~~~~~~~~~~~~~~~~`
			`.. Note: created on 2022/11/05`

			`Blah...`
			`"""`
Initial commit 2022-11-04 23:19:53 +00:00			`comments = d.find_all(string=lambda text: isinstance(text, bs4.Comment))`
			`for c in comments:`
			`for k in c.extract().split(" "):`
			`try:`
			`t = dateutil.parser.parse(k)`
Fixed stuff 2022-11-05 14:42:35 +00:00			`return t.isoformat() + "Z"`
Initial commit 2022-11-04 23:19:53 +00:00			`except dateutil.parser.ParserError:`
			`pass`
Update 2022-11-05 10:35:34 +00:00			`raise dateutil.parser.ParserError`
Initial commit 2022-11-04 23:19:53 +00:00

Fixed stuff 2022-11-05 14:53:31 +00:00			`def build_entry(d, base_url):`
Update 2022-11-05 10:35:34 +00:00			`"""`
			`Print on stdout an Atom <entry> section built from the <div>.`
			`"""`
Update 2022-11-05 11:22:32 +00:00			`# Get the date`
			`mydate = ""`
Fixed stuff 2022-11-05 14:53:31 +00:00			`mytitle = d.find("h1").text`
Update 2022-11-05 11:22:32 +00:00			`try:`
			`mydate = find_date(d)`
			`except dateutil.parser.ParserError:`
			`pass`
Update 2022-11-05 10:35:34 +00:00			`print(""" <entry>`
Fixed stuff 2022-11-05 14:53:31 +00:00			`<id>%s</id>`
Update 2022-11-05 11:22:32 +00:00			`<title>%s</title>`
			`<updated>%s</updated>`
			`<content type="xhtml">`
Update 2022-11-05 11:28:02 +00:00			`<div xmlns="http://www.w3.org/1999/xhtml">%s</div>`
Update 2022-11-05 11:22:32 +00:00			`</content>`
			`</entry>`
			`""" % (`
Fixed stuff 2022-11-05 14:56:20 +00:00			`base_url + "#" + mytitle.lower().replace(" ", "-")`
			`+ "_(" + mydate + ")",`
Fixed stuff 2022-11-05 14:53:31 +00:00			`mytitle, mydate, d`
Update 2022-11-05 11:22:32 +00:00			`))`
Initial commit 2022-11-04 23:19:53 +00:00
Update 2022-11-05 11:13:14 +00:00

			`docinfos = extract_docinfos(html)`
			`meta = extract_meta(html)`

Update 2022-11-05 11:22:32 +00:00			`soup2 = bs4.BeautifulSoup(html["body"], 'html.parser')`
			`divs = soup2.select("div")`

Update 2022-11-05 10:35:34 +00:00			`print("""<?xml version="1.0" encoding="utf-8"?>`
			`<feed xmlns="http://www.w3.org/2005/Atom">`
Update 2022-11-05 11:13:14 +00:00			`<id>%s</id>`
Update 2022-11-05 10:35:34 +00:00			`<title>%s</title>`
Fixed stuff 2022-11-05 14:33:01 +00:00			`<subtitle>%s</subtitle>`
Update 2022-11-05 11:13:14 +00:00			`<author>`
			`<name>%s</name>`
			`<email>%s</email>`
			`</author>`
			`<rights>%s</rights>`
Update 2022-11-05 10:35:34 +00:00			`<updated>%s</updated>`
Update 2022-11-05 11:13:14 +00:00			`<link href="%s" />`
			`<link href="%s" rel="self" />`
			`<generator uri="%s">%s</generator>`
Update 2022-11-05 10:35:34 +00:00			`""" % (`
Update 2022-11-05 11:13:14 +00:00			`meta["original-source"],`
Update 2022-11-05 10:35:34 +00:00			`html["title"],`
Fixed stuff 2022-11-05 14:33:01 +00:00			`meta["description"],`
Update 2022-11-05 11:13:14 +00:00			`docinfos["Author:"],`
			`docinfos["Contact:"],`
			`meta["copyright"],`
Fixed stuff 2022-11-05 14:46:35 +00:00			`datetime.datetime.utcnow().isoformat() + "Z",`
Update 2022-11-05 11:13:14 +00:00			`meta["original-source"],`
			`meta["syndication-source"],`
			`generator_uri, generator_name`
Update 2022-11-05 10:35:34 +00:00			`))`
Initial commit 2022-11-04 23:19:53 +00:00
			`for d in divs:`
			`# don't handle subsections`
			`if d.find_parent("div"): continue`
Fixed stuff 2022-11-05 14:53:31 +00:00			`build_entry(d, meta["original-source"])`
Update 2022-11-05 10:35:34 +00:00
			`print("</feed>")`