#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""Convert a reStructuredText document into an Atom feed printed on stdout.

Usage: rst2atom.py document.rst

The RsT document is expected to carry:
  * docinfo fields ``Author:`` and ``Contact:``;
  * ``meta`` directives ``original-source``, ``syndication-source``,
    ``description`` and ``copyright``;
  * one top-level section per feed entry, optionally containing a comment
    with the entry creation date (e.g. ``.. Note: created on 2022/11/05``).
"""

import sys
import datetime
from xml.sax.saxutils import escape

import docutils.core
import dateutil.parser
import bs4

# Advertised in the feed's <generator> element.
generator_uri = "https://git.sdf.org/baruchel/feeds/src/branch/master/rst2atom.py"
generator_name = "rst2atom"

filename = sys.argv[1]

# Render the whole document once; the parts dictionary is consumed by the
# helpers below.  Encoding is pinned so behavior is platform-independent,
# and the context manager guarantees the file handle is closed.
with open(filename, mode="r", encoding="utf-8") as f:
    html = docutils.core.publish_parts(f.read(),
                                       source_path=filename,
                                       writer_name='html')


def extract_docinfos(html):
    """
    Parse the publish_parts dictionary and return a dictionary containing
    the docinfos from the RsT document, keyed by field label
    (e.g. ``{"Author:": "...", "Contact:": "..."}``).
    """
    soup = bs4.BeautifulSoup(html["docinfo"], 'html.parser')
    return {tr.find("th").text: tr.find("td").text
            for tr in [e.find_parent("tr")
                       for e in soup.find_all("th",
                                              {"class": "docinfo-name"})]}


def extract_meta(html):
    """
    Parse the publish_parts dictionary and return a dictionary containing
    the metadata (``<meta name=... content=...>`` elements) from the RsT
    document.
    """
    soup = bs4.BeautifulSoup(html["meta"], 'html.parser')
    return {m.attrs["name"]: m.attrs["content"]
            for m in soup.find_all("meta", {"name": True, "content": True})}


def find_date(d):
    """
    Parse the comments in a <div> section and return an RFC 3339 formatted
    string being the first recognized date string.

    Raise dateutil.parser.ParserError if no such comment contains a date.

    In the RestructuredText, the date would be typically written as:

        This is a new entry
        ~~~~~~~~~~~~~~~~~~~

        .. Note: created on 2022/11/05

        Blah...
    """
    comments = d.find_all(string=lambda text: isinstance(text, bs4.Comment))
    for c in comments:
        # .extract() also removes the comment node from the tree, so it does
        # not leak into the entry's HTML content later on.
        for token in c.extract().split(" "):
            try:
                return dateutil.parser.parse(token).isoformat() + "Z"
            except dateutil.parser.ParserError:
                pass
    raise dateutil.parser.ParserError("no date found in section comments")


def build_entry(d, base_url):
    """
    Print on stdout an Atom <entry> section built from the <div> of a
    top-level document section.

    The entry id is derived from base_url, the slugified section title and
    the entry date.
    """
    mydate = ""
    # NOTE(review): d.find("h1") returns None for a section without a title,
    # which would raise AttributeError here — assumed not to happen with
    # docutils-generated sections; confirm against real documents.
    mytitle = d.find("h1").text
    try:
        mydate = find_date(d)
    except dateutil.parser.ParserError:
        # Undated entry: keep an empty <updated> element rather than abort.
        pass
    entry_id = (base_url + "#" + mytitle.lower().replace(" ", "-")
                + "_(" + mydate + ")")
    # content type="html" requires the embedded markup to be XML-escaped.
    print("""<entry>
<id>%s</id>
<title>%s</title>
<updated>%s</updated>
<content type="html">
%s
</content>
</entry>""" % (escape(entry_id), escape(mytitle), mydate, escape(str(d))))


docinfos = extract_docinfos(html)
meta = extract_meta(html)

soup2 = bs4.BeautifulSoup(html["body"], 'html.parser')
divs = soup2.select("div")

# utcnow() is deprecated since Python 3.12; an aware "now" stripped of its
# offset yields the same naive ISO string, to which the explicit "Z" suffix
# is appended.
now = datetime.datetime.now(datetime.timezone.utc).replace(tzinfo=None)

# Feed header: 11 interpolated values, in the order id, title, subtitle,
# author name, author email, rights, updated, link, self link, generator
# uri, generator name.
print("""<?xml version="1.0" encoding="utf-8"?>
<feed xmlns="http://www.w3.org/2005/Atom">
<id>%s</id>
<title>%s</title>
<subtitle>%s</subtitle>
<author><name>%s</name><email>%s</email></author>
<rights>%s</rights>
<updated>%s</updated>
<link href="%s"/>
<link rel="self" href="%s"/>
<generator uri="%s">%s</generator>""" % (
    escape(meta["original-source"]),
    html["title"],
    escape(meta["description"]),
    escape(docinfos["Author:"]),
    escape(docinfos["Contact:"]),
    escape(meta["copyright"]),
    now.isoformat() + "Z",
    escape(meta["original-source"]),
    escape(meta["syndication-source"]),
    escape(generator_uri),
    generator_name))

for d in divs:
    # don't handle subsections: only top-level <div> elements become entries
    if d.find_parent("div"):
        continue
    build_entry(d, meta["original-source"])

print("</feed>")