# -*- coding: utf-8 -*-
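# Usage sketch (hypothetical file names): convert a reStructuredText page into
# an Atom feed printed on stdout, e.g.
#     python rst2atom.py blog.rst > feed.atom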
import sys

filename = sys.argv[1]

generator_uri = "https://git.sdf.org/baruchel/feeds/src/branch/master/rst2atom.py"
generator_name = "rst2atom"

import docutils.core
import dateutil.parser
import datetime
import bs4

html = docutils.core.publish_parts(open(filename, mode="r").read(),
                                   source_path=filename,
                                   writer_name='html')

def extract_docinfos(html):
    """
    Parse the publish_parts dictionary and return a dictionary containing
    the docinfo fields of the reST document.
    """
    soup1 = bs4.BeautifulSoup(html["docinfo"], 'html.parser')
    return { tr.find("th").text:
             tr.find("td").text
             for tr in [
                 e.find_parent("tr") for e in
                 soup1.find_all("th", {"class": "docinfo-name"})
             ] }
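# For illustration (hypothetical values): given a docinfo row such as
#   <tr><th class="docinfo-name">Author:</th><td>Jane Doe</td></tr>
# extract_docinfos() would return {"Author:": "Jane Doe"}.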

def extract_meta(html):
    """
    Parse the publish_parts dictionary and return a dictionary containing
    the metadata of the reST document.
    """
    soup1 = bs4.BeautifulSoup(html["meta"], 'html.parser')
    return { m.attrs["name"]: m.attrs["content"]
             for m in soup1.find_all("meta", {"name": True,
                                              "content": True}) }

def find_date(d):
    """
    Parse the comments in a <div> section and return an RFC 3339 formatted
    string built from the first recognized date.
    Raise dateutil.parser.ParserError if no such comment contains a date.

    In the reStructuredText source, the date would typically be written as:

        This is a new entry
        ~~~~~~~~~~~~~~~~~~~
        .. Note: created on 2022/11/05

        Blah...
    """
    comments = d.find_all(string=lambda text: isinstance(text, bs4.Comment))
    for c in comments:
        # try every space-separated token of the comment until one of them
        # can be parsed as a date
        for k in c.extract().split(" "):
            try:
                t = dateutil.parser.parse(k)
                return t.isoformat() + "Z"
            except dateutil.parser.ParserError:
                pass
    raise dateutil.parser.ParserError
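# For illustration: docutils renders the reST comment from the docstring above
# as <!-- Note: created on 2022/11/05 -->; find_date() would then return
# "2022-11-05T00:00:00Z", the first token dateutil recognizes as a date.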

def build_entry(d, base_url):
    """
    Print to stdout an Atom <entry> section built from the <div>.
    """
    # Get the date
    mydate = ""
    mytitle = d.find("h1").text
    try:
        mydate = find_date(d)
    except dateutil.parser.ParserError:
        pass
    print("""  <entry>
    <id>%s</id>
    <title>%s</title>
    <updated>%s</updated>
    <content type="xhtml">
      <div xmlns="http://www.w3.org/1999/xhtml">%s</div>
    </content>
  </entry>
""" % (
        base_url + "#" + mytitle.lower().replace(" ", "-")
        + "_(" + mydate + ")",
        mytitle, mydate, d
    ))
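# For illustration (hypothetical URL): a section titled "This is a new entry"
# dated 2022-11-05 under base_url "https://example.org/blog.html" gets the id
# "https://example.org/blog.html#this-is-a-new-entry_(2022-11-05T00:00:00Z)".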

docinfos = extract_docinfos(html)
meta = extract_meta(html)

soup2 = bs4.BeautifulSoup(html["body"], 'html.parser')
divs = soup2.select("div")
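# The feed header below expects the document to provide the docinfo fields
# "Author:" and "Contact:" and the meta fields "original-source",
# "description", "copyright" and "syndication-source".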

print("""<?xml version="1.0" encoding="utf-8"?>
<feed xmlns="http://www.w3.org/2005/Atom">
  <id>%s</id>
  <title>%s</title>
  <subtitle>%s</subtitle>
  <author>
    <name>%s</name>
    <email>%s</email>
  </author>
  <rights>%s</rights>
  <updated>%s</updated>
  <link href="%s" />
  <link href="%s" rel="self" />
  <generator uri="%s">%s</generator>
""" % (
    meta["original-source"],
    html["title"],
    meta["description"],
    docinfos["Author:"],
    docinfos["Contact:"],
    meta["copyright"],
    datetime.datetime.utcnow().isoformat() + "Z",
    meta["original-source"],
    meta["syndication-source"],
    generator_uri, generator_name
))

for d in divs:
    # don't handle subsections
    if d.find_parent("div"): continue
    build_entry(d, meta["original-source"])

print("</feed>")