# -*- coding: utf-8 -*-
"""Convert a reStructuredText document into an Atom feed printed on stdout.

Usage: rst2atom.py FILE.rst

The document is rendered to HTML with docutils.  Feed-level metadata is read
from the docinfo block ("Author:", "Contact:") and from the meta tags
("original-source", "copyright", "syndication-source").  Every top-level
``<div>`` of the rendered body that has an ``<h1>`` becomes one Atom entry.
"""

import datetime
import sys
import uuid
from xml.sax.saxutils import escape, quoteattr

import bs4
import dateutil.parser
import docutils.core

if len(sys.argv) < 2:
    sys.exit("usage: %s FILE.rst" % sys.argv[0])

filename = sys.argv[1]
generator_uri = "http://git.sdf.org/rst2atom.py"
generator_name = "rst2atom"

# Namespace used to derive one stable, distinct UUID per entry.  This is the
# value that used to be emitted verbatim as the id of every entry.
_ENTRY_NAMESPACE = uuid.UUID("1225c695-cfb8-4ebb-aaaa-80da344efa6a")


def extract_docinfos(html):
    """
    Parse the publish_parts dictionary and return a dictionary
    containing the docinfos from the RsT document.

    Keys are the text of the ``th.docinfo-name`` cells (e.g. "Author:"),
    values are the text of the ``td`` cell found in the same table row.
    Rows without a ``td`` cell are skipped.
    """
    soup1 = bs4.BeautifulSoup(html["docinfo"], 'html.parser')
    docinfos = {}
    for name_cell in soup1.find_all("th", {"class": "docinfo-name"}):
        tr = name_cell.find_parent("tr")
        if tr is None:
            continue
        td = tr.find("td")
        if td is None:
            continue
        docinfos[tr.find("th").text] = td.text
    return docinfos


def extract_meta(html):
    """
    Parse the publish_parts dictionary and return a dictionary
    containing the metadata from the RsT document.

    Only ``<meta>`` tags carrying both a ``name`` and a ``content``
    attribute are returned, as a ``name -> content`` mapping.
    """
    soup1 = bs4.BeautifulSoup(html["meta"], 'html.parser')
    return {
        m.attrs["name"]: m.attrs["content"]
        for m in soup1.find_all("meta", {"name": True, "content": True})
    }


def find_date(d):
    """
    Parse the comments in a <div> section and return an ISO8601 formatted
    string being the first recognized date string.

    Raise dateutil.parser.ParserError if no such comment contains a date.

    A date parsed without timezone information is treated as UTC so that
    the returned string always carries an offset.

    Comments are detached from ``d`` as they are examined, so they no longer
    appear when ``d`` is serialized afterwards.

    In the RestructuredText, the date would be typically written as:

        This is a new entry
        ~~~~~~~~~~~~~~~~~~~

        .. Note: created on 2022/11/05

        Blah...
    """
    comments = d.find_all(string=lambda text: isinstance(text, bs4.Comment))
    for c in comments:
        # split() with no argument also breaks on tabs and newlines and
        # drops empty tokens.
        for k in c.extract().split():
            try:
                t = dateutil.parser.parse(k)
            except (dateutil.parser.ParserError, OverflowError):
                continue
            if t.tzinfo is None:
                t = t.replace(tzinfo=datetime.timezone.utc)
            return t.isoformat()
    raise dateutil.parser.ParserError("no date found in section comments")


def build_entry(d, feed_id="", default_date=""):
    """
    Print on stdout an Atom <entry> section built from the <div>.

    d            -- bs4 tag of the section; its first <h1> gives the title.
    feed_id      -- identifier of the enclosing feed, mixed into the entry
                    id so that entries of different feeds do not collide.
    default_date -- value used for <updated> when no date is found in the
                    comments of the section.
    """
    heading = d.find("h1")
    title = heading.text if heading is not None else ""

    # Get the date
    try:
        mydate = find_date(d)
    except dateutil.parser.ParserError:
        mydate = default_date

    # Prefer the element's id attribute when present; fall back to the title.
    anchor = d.get("id") or title
    entry_id = uuid.uuid5(_ENTRY_NAMESPACE, "%s#%s" % (feed_id, anchor))

    print("""<entry>
  <title>%s</title>
  <id>urn:uuid:%s</id>
  <updated>%s</updated>
  <content type="xhtml">
    <div xmlns="http://www.w3.org/1999/xhtml">
%s
    </div>
  </content>
</entry>""" % (
        escape(title),
        entry_id,
        mydate,
        d
    ))


def _missing(mapping, keys, label):
    """Return a description of every key of ``keys`` absent from ``mapping``."""
    return ["%s %r" % (label, k) for k in keys if k not in mapping]


with open(filename, mode="r") as source:
    html = docutils.core.publish_parts(source.read(),
                                       source_path=filename,
                                       writer_name='html')

docinfos = extract_docinfos(html)
meta = extract_meta(html)

# Fail with a readable message instead of a KeyError traceback.
missing_fields = (
    _missing(meta, ("original-source", "copyright", "syndication-source"),
             "meta") +
    _missing(docinfos, ("Author:", "Contact:"), "docinfo")
)
if missing_fields:
    sys.exit("%s: %s: missing required field(s): %s"
             % (generator_name, filename, ", ".join(missing_fields)))

soup2 = bs4.BeautifulSoup(html["body"], 'html.parser')
divs = soup2.select("div")

# Atom timestamps need an explicit UTC offset.
feed_updated = datetime.datetime.now(datetime.timezone.utc).isoformat()

# NOTE(review): html["title"] is inserted as-is on the assumption that
# docutils already HTML-escapes it -- confirm before escaping it again.
print("""<?xml version="1.0" encoding="utf-8"?>
<feed xmlns="http://www.w3.org/2005/Atom">
  <id>%s</id>
  <title>%s</title>
  <author>
    <name>%s</name>
    <email>%s</email>
  </author>
  <rights>%s</rights>
  <updated>%s</updated>
  <link href=%s/>
  <link rel="self" href=%s/>
  <generator uri=%s>%s</generator>""" % (
    escape(meta["original-source"]),
    html["title"],
    escape(docinfos["Author:"]),
    escape(docinfos["Contact:"]),
    escape(meta["copyright"]),
    feed_updated,
    quoteattr(meta["original-source"]),
    quoteattr(meta["syndication-source"]),
    quoteattr(generator_uri),
    escape(generator_name)
))

for d in divs:
    # don't handle subsections
    if d.find_parent("div"):
        continue
    # a top-level div without a heading cannot provide an entry title
    if d.find("h1") is None:
        continue
    build_entry(d, feed_id=meta["original-source"], default_date=feed_updated)

print("</feed>")