# -*- coding: utf-8 -*-
# Path of the reStructuredText document that this script reads and publishes.
filename = "test.rst"
# https://docutils.sourceforge.io/docs/api/publisher.html
import docutils.core
import dateutil.parser
import datetime
import bs4
# Read the reStructuredText source and render it into HTML parts.
# The file is read inside a ``with`` block so that its handle is closed as
# soon as the content is in memory instead of being left to the garbage
# collector.
with open(filename, mode="r") as rst_file:
    rst_source = rst_file.read()

# ``html`` is the parts dictionary; the rest of the script reads its
# "docinfo", "title" and "body" entries.
html = docutils.core.publish_parts(rst_source,
                                   source_path=filename,
                                   writer_name='html')
def extract_docinfos(html):
    """
    Collect the docinfo fields of the RsT document.

    ``html`` is the publish_parts dictionary; its "docinfo" part is parsed
    as HTML and every table row that holds a ``th`` cell of class
    ``docinfo-name`` contributes one item to the result.

    Return a dictionary mapping the text of the row's first ``th`` cell to
    the text of its first ``td`` cell.
    """
    docinfo_tree = bs4.BeautifulSoup(html["docinfo"], 'html.parser')
    fields = {}
    for name_cell in docinfo_tree.find_all("th", {"class": "docinfo-name"}):
        # Work from the enclosing row so the name and the value come from
        # the same <tr>.
        row = name_cell.find_parent("tr")
        field_name = row.find("th").text
        field_value = row.find("td").text
        fields[field_name] = field_value
    return fields
def find_date(d):
    """
    Return the first date found in the HTML comments of a section.

    Every comment node below ``d`` is split on whitespace and each token is
    handed to ``dateutil.parser.parse``; the first token that parses is
    returned as an ISO8601 formatted string. The tree passed in is only
    read, never modified.

    In the RestructuredText, the date would be typically written as a
    comment following the section title:

        This is a new entry
        ~~~~~~~~~~~~~~~~~~~

        .. Note: created on 2022/11/05

        Blah...

    :param d: parsed HTML element whose comments are searched.
    :returns: ``isoformat()`` string of the first recognized date.
    :raises dateutil.parser.ParserError: if no comment contains a date.
    """
    comments = d.find_all(string=lambda text: isinstance(text, bs4.Comment))
    for comment in comments:
        # Read the text with str() rather than extract(): extract() would
        # detach the comment from the caller's tree as a side effect.
        # split() without argument also breaks on newlines and tabs, so
        # dates inside multi-line comments are seen as separate tokens.
        for token in str(comment).split():
            try:
                return dateutil.parser.parse(token).isoformat()
            except (dateutil.parser.ParserError, OverflowError):
                # Not a date, or a number too large to be one (parse()
                # reports the latter with OverflowError): try the next token.
                continue
    raise dateutil.parser.ParserError("no date found in the section comments")
def build_entry(d):
    """
    Print a sample Atom entry on stdout.

    The printed text is a fixed placeholder for now: ``d`` is accepted but
    not read yet, so every call emits the same entry.
    """
    sample_entry = """ Atom-Powered Robots Run Amokurn:uuid:1225c695-cfb8-4ebb-aaaa-80da344efa6a2003-11-09T17:23:02Z
This is the entry content.
John Doejohndoe@example.com
"""
    print(sample_entry)
# Feed header: the document title followed by the current local timestamp.
feed_header = """
%s%surn:uuid:60a76c80-d399-11d9-b91C-0003939e0af6
""" % (
    html["title"],
    datetime.datetime.now().isoformat()
)
print(feed_header)

docinfos = extract_docinfos(html)

# Walk the <div> elements of the rendered body and dump each top-level one,
# followed by its date when a comment inside it carries one.
soup2 = bs4.BeautifulSoup(html["body"], 'html.parser')
divs = soup2.select("div")
for d in divs:
    # don't handle subsections: a div nested in another div is skipped
    is_subsection = d.find_parent("div") is not None
    if is_subsection:
        continue
    print("=" * 40)
    print(d)
    try:
        section_date = find_date(d)
    except dateutil.parser.ParserError:
        # Sections without a date comment are printed without a date line.
        pass
    else:
        print(section_date)
    print("")