# -*- coding: utf-8 -*-
"""Publish a reStructuredText file with docutils and scan its sections.

When run, the script renders ``filename`` to HTML parts, prints a header
built from the document title and the current time, then prints every
top-level ``div`` of the rendered body followed by the first date found in
that div's HTML comments (when there is one).
"""

# https://docutils.sourceforge.io/docs/api/publisher.html
import datetime

import bs4
import dateutil.parser
import docutils.core

filename = "test.rst"

# Read the source inside a context manager so the file handle is closed
# deterministically instead of being left to the garbage collector.
with open(filename, mode="r") as rst_file:
    html = docutils.core.publish_parts(
        rst_file.read(), source_path=filename, writer_name='html'
    )


def extract_docinfos(html):
    """
    Parse the publish_parts dictionary and return a dictionary containing
    the docinfos from the RsT document.

    :param html: mapping returned by ``docutils.core.publish_parts``; only
        its ``"docinfo"`` entry is read.
    :return: dict mapping the text of the first ``th`` of each docinfo row
        to the text of the first ``td`` of the same row.
    """
    soup1 = bs4.BeautifulSoup(html["docinfo"], 'html.parser')
    # Each docinfo field name is a <th class="docinfo-name">; its value is
    # looked up in the enclosing table row.
    rows = [
        header.find_parent("tr")
        for header in soup1.find_all("th", {"class": "docinfo-name"})
    ]
    return {tr.find("th").text: tr.find("td").text for tr in rows}


def find_date(d):
    """
    Parse the HTML comments found in a div section and return an ISO8601
    formatted string for the first recognized date string.

    Every comment below ``d`` is split on whitespace and each word is tried
    in turn; the first word that dateutil can parse wins. The document tree
    is left untouched.

    In the RestructuredText, the date would be typically written as:

        This is a new entry
        ~~~~~~~~~~~~~~~~~~~

        .. Note: created on 2022/11/05

        Blah...

    :param d: a BeautifulSoup element (the script passes top-level divs).
    :return: ``isoformat()`` of the first date that could be parsed.
    :raises dateutil.parser.ParserError: if no comment contains a date.
    """
    comments = d.find_all(string=lambda text: isinstance(text, bs4.Comment))
    for comment in comments:
        # A bs4 Comment is a str subclass, so it can be split directly; there
        # is no need to extract() it, which would remove it from the tree.
        # split() without a separator also copes with dates that sit on a
        # continuation line of a multi-line comment.
        for word in comment.split():
            try:
                parsed = dateutil.parser.parse(word)
            except (dateutil.parser.ParserError, OverflowError):
                # Not a date (or a number too large to be one): try the
                # next word.
                continue
            return parsed.isoformat()
    raise dateutil.parser.ParserError("no date found in the section comments")


def build_entry(d):
    """
    Print on stdout an Atom section built from the div.

    The current implementation prints a fixed sample entry and does not
    read ``d`` yet.

    :param d: the section element the entry is meant to be built from
        (currently unused).
    """
    # NOTE(review): this literal looks like an Atom <entry> sample whose XML
    # markup may have been lost -- confirm against the intended output.
    print(""" Atom-Powered Robots Run Amok urn:uuid:1225c695-cfb8-4ebb-aaaa-80da344efa6a 2003-11-09T17:23:02Z

This is the entry content.

John Doe johndoe@example.com
""")


# Header: document title followed by the generation time.
# NOTE(review): the timestamp is naive local time (no UTC offset), and this
# literal may also have lost its XML markup -- confirm both against the
# intended feed format.
print(""" %s %s urn:uuid:60a76c80-d399-11d9-b91C-0003939e0af6 """ % (
    html["title"],
    datetime.datetime.now().isoformat()
))

docinfos = extract_docinfos(html)
soup2 = bs4.BeautifulSoup(html["body"], 'html.parser')
divs = soup2.select("div")
for d in divs:
    # don't handle subsections
    if d.find_parent("div"):
        continue
    print("="*40)
    print(d)
    try:
        print(find_date(d))
    except dateutil.parser.ParserError:
        # A section without a dated comment is expected; print no date.
        pass
    print("")