feeds/rst2atom.py

135 lines
3.6 KiB
Python
Raw Normal View History

2022-11-04 23:19:53 +00:00
# -*- coding: utf-8 -*-
2022-11-05 12:23:14 +00:00
import datetime
import sys

import bs4
import dateutil.parser
import docutils.core

# Name and home page of this script, advertised in the feed's <generator>.
generator_uri = "https://git.sdf.org/baruchel/feeds/src/branch/master/rst2atom.py"
generator_name = "rst2atom"

# The RsT source file is the first (and only) command-line argument.
if len(sys.argv) < 2:
    sys.exit("usage: rst2atom.py FILE.rst")
filename = sys.argv[1]

# Render the RsT document with the docutils HTML writer; the result is a
# dictionary of named parts.  The context manager closes the source file
# as soon as it has been read.
with open(filename, mode="r") as source_file:
    html = docutils.core.publish_parts(source_file.read(),
                                       source_path=filename,
                                       writer_name='html')
2022-11-05 10:35:34 +00:00
def extract_docinfos(html):
    """
    Collect the docinfo fields of the RsT document.

    `html` is the publish_parts dictionary.  Its "docinfo" part is parsed
    and, for every table row holding a <th class="docinfo-name"> cell, the
    text of the row's first <th> is mapped to the text of its first <td>.
    Return the resulting dictionary.
    """
    docinfo_soup = bs4.BeautifulSoup(html["docinfo"], 'html.parser')
    # First gather the rows, then read the name/value cells of each row.
    rows = []
    for name_cell in docinfo_soup.find_all("th", {"class": "docinfo-name"}):
        rows.append(name_cell.find_parent("tr"))
    fields = {}
    for row in rows:
        label = row.find("th").text
        value = row.find("td").text
        fields[label] = value
    return fields
2022-11-05 11:13:14 +00:00
def extract_meta(html):
    """
    Collect the metadata of the RsT document.

    `html` is the publish_parts dictionary.  Its "meta" part is parsed and
    every <meta> tag having both a "name" and a "content" attribute yields
    one item mapping the former to the latter.  Return the resulting
    dictionary.
    """
    meta_soup = bs4.BeautifulSoup(html["meta"], 'html.parser')
    wanted = {"name": True, "content": True}
    metadata = {}
    for tag in meta_soup.find_all("meta", wanted):
        key = tag.attrs["name"]
        metadata[key] = tag.attrs["content"]
    return metadata
2022-11-05 10:35:34 +00:00
2022-11-04 23:19:53 +00:00
def find_date(d):
    """
    Parse the comments in a <div> section and return an RFC 3339 formatted
    string being the first recognized date string.

    Each comment is detached from the <div> as it is examined, then split
    on spaces; every piece is tried in turn as a date.  A date carrying a
    UTC offset is converted to UTC; a date without one is taken to be UTC
    already.

    Raise dateutil.parser.ParserError if no such comment contains a date.

    In the RestructuredText, the date would be typically written as:

        This is a new entry
        ~~~~~~~~~~~~~~~~~~~
        .. Note: created on 2022/11/05
        Blah...
    """
    comments = d.find_all(string=lambda text: isinstance(text, bs4.Comment))
    for c in comments:
        # extract() removes the comment from the tree and returns it, so the
        # comments examined here no longer appear when the <div> is printed.
        for k in c.extract().split(" "):
            try:
                t = dateutil.parser.parse(k)
            except (dateutil.parser.ParserError, OverflowError):
                # Not a date (or a number too large to be one): next piece.
                continue
            if t.utcoffset() is not None:
                # Appending "Z" to an offset-bearing stamp ("...+01:00Z")
                # would be invalid; normalise to naive UTC first.
                t = t.astimezone(datetime.timezone.utc).replace(tzinfo=None)
            return t.isoformat() + "Z"
    raise dateutil.parser.ParserError("no date found in the section comments")
2022-11-04 23:19:53 +00:00
2022-11-05 14:53:31 +00:00
def build_entry(d, base_url):
    """
    Print on stdout an Atom <entry> section built from the <div>.

    `d` is the parsed <div> of one section: the text of its first <h1> is
    the entry title, its comments provide the date (see find_date) and its
    markup becomes the XHTML content.  `base_url` is the URL from which the
    entry identifier is derived (URL, "#", lower-cased title with spaces
    turned into dashes, then the date between "_(" and ")").

    The title and the identifier are XML-escaped because they come from
    decoded text and may contain "&", "<" or ">".  When no date is found in
    the section, the date is left empty.
    """
    # Local import: the module-level name `html` is already taken by the
    # publish_parts dictionary, so the stdlib `html` module is not used.
    from xml.sax.saxutils import escape

    mytitle = d.find("h1").text
    # Get the date
    try:
        mydate = find_date(d)
    except dateutil.parser.ParserError:
        mydate = ""

    entry_id = (base_url + "#" + mytitle.lower().replace(" ", "-")
                + "_(" + mydate + ")")
    print(" <entry>\n"
          "<id>%s</id>\n"
          "<title>%s</title>\n"
          "<updated>%s</updated>\n"
          "<content type=\"xhtml\">\n"
          "<div xmlns=\"http://www.w3.org/1999/xhtml\">%s</div>\n"
          "</content>\n"
          "</entry>\n" % (
              escape(entry_id),
              escape(mytitle), mydate, d
          ))
2022-11-04 23:19:53 +00:00
2022-11-05 11:13:14 +00:00
from xml.sax.saxutils import escape

# Gather everything needed from the rendered document.
docinfos = extract_docinfos(html)
meta = extract_meta(html)
soup2 = bs4.BeautifulSoup(html["body"], 'html.parser')
divs = soup2.select("div")

# The metadata and docinfo values were read back through BeautifulSoup, i.e.
# as decoded text, so they are XML-escaped again before being written out.
# Inside the double-quoted href attributes the quote character is escaped too.
# NOTE(review): html["title"] comes straight from the docutils HTML writer and
# is left untouched -- presumably it is already HTML-encoded; confirm.
attr_entities = {'"': "&quot;"}

# Feed header.  <updated> is the current UTC time; the timezone-aware call
# replaces the deprecated datetime.utcnow() and yields the same format.
print("""<?xml version="1.0" encoding="utf-8"?>
<feed xmlns="http://www.w3.org/2005/Atom">
<id>%s</id>
<title>%s</title>
<subtitle>%s</subtitle>
<author>
<name>%s</name>
<email>%s</email>
</author>
<rights>%s</rights>
<updated>%s</updated>
<link href="%s" />
<link href="%s" rel="self" />
<generator uri="%s">%s</generator>
""" % (
    escape(meta["original-source"]),
    html["title"],
    escape(meta["description"]),
    escape(docinfos["Author:"]),
    escape(docinfos["Contact:"]),
    escape(meta["copyright"]),
    datetime.datetime.now(datetime.timezone.utc)
            .replace(tzinfo=None).isoformat() + "Z",
    escape(meta["original-source"], attr_entities),
    escape(meta["syndication-source"], attr_entities),
    generator_uri, generator_name
))

# One <entry> per outermost <div>.
for d in divs:
    # don't handle subsections
    if d.find_parent("div"):
        continue
    build_entry(d, meta["original-source"])

print("</feed>")