Compare commits
2 Commits
30e2b29161
...
5c09e23172
Author | SHA1 | Date | |
---|---|---|---|
|
5c09e23172 | ||
|
d16cbdbffc |
84
rst2atom.py
84
rst2atom.py
@ -6,31 +6,93 @@ filename="test.rst"
|
|||||||
|
|
||||||
import docutils.core
|
import docutils.core
|
||||||
import dateutil.parser
|
import dateutil.parser
|
||||||
|
import datetime
|
||||||
import bs4
|
import bs4
|
||||||
|
|
||||||
html = docutils.core.publish_parts(open(filename, mode="r").read(),
|
html = docutils.core.publish_parts(open(filename, mode="r").read(),
|
||||||
source_path=filename,
|
source_path=filename,
|
||||||
writer_name='html')
|
writer_name='html')
|
||||||
|
|
||||||
|
|
||||||
|
def extract_docinfos(html):
|
||||||
|
"""
|
||||||
|
Parse the publish_parts dictionary and return a dictionary containing
|
||||||
|
the docinfos from the reStructuredText document.
|
||||||
|
"""
|
||||||
|
soup1 = bs4.BeautifulSoup(html["docinfo"], 'html.parser')
|
||||||
|
return { tr.find("th").text:
|
||||||
|
tr.find("td").text
|
||||||
|
for tr in [
|
||||||
|
e.find_parent("tr") for e in
|
||||||
|
soup1.find_all("th", {"class": "docinfo-name"})
|
||||||
|
] }
|
||||||
|
|
||||||
|
|
||||||
def find_date(d):
|
def find_date(d):
|
||||||
|
"""
|
||||||
|
Parse the comments in a <div> section and return an ISO8601 formatted
|
||||||
|
string being the first recognized date string.
|
||||||
|
Raise dateutil.parser.ParserError if no such comment contains a date.
|
||||||
|
|
||||||
|
In the reStructuredText source, the date would typically be written as:
|
||||||
|
|
||||||
|
This is a new entry
|
||||||
|
~~~~~~~~~~~~~~~~~~~
|
||||||
|
.. Note: created on 2022/11/05
|
||||||
|
|
||||||
|
Blah...
|
||||||
|
"""
|
||||||
comments = d.find_all(string=lambda text: isinstance(text, bs4.Comment))
|
comments = d.find_all(string=lambda text: isinstance(text, bs4.Comment))
|
||||||
for c in comments:
|
for c in comments:
|
||||||
for k in c.extract().split(" "):
|
for k in c.extract().split(" "):
|
||||||
try:
|
try:
|
||||||
t = dateutil.parser.parse(k)
|
t = dateutil.parser.parse(k)
|
||||||
return t
|
return t.isoformat()
|
||||||
except dateutil.parser.ParserError:
|
except dateutil.parser.ParserError:
|
||||||
pass
|
pass
|
||||||
|
raise dateutil.parser.ParserError
|
||||||
|
|
||||||
|
|
||||||
|
def build_entry(d):
|
||||||
|
"""
|
||||||
|
Print on stdout an Atom <entry> section built from the <div>.
|
||||||
|
"""
|
||||||
|
print(""" <entry>
|
||||||
|
<title>Atom-Powered Robots Run Amok</title>
|
||||||
|
<link href="http://example.org/2003/12/13/atom03" />
|
||||||
|
<link rel="alternate" type="text/html" href="http://example.org/2003/12/13/atom03.html"/>
|
||||||
|
<link rel="edit" href="http://example.org/2003/12/13/atom03/edit"/>
|
||||||
|
<id>urn:uuid:1225c695-cfb8-4ebb-aaaa-80da344efa6a</id>
|
||||||
|
<published>2003-11-09T17:23:02Z</published>
|
||||||
|
<content type="xhtml">
|
||||||
|
<div xmlns="http://www.w3.org/1999/xhtml">
|
||||||
|
<p>This is the entry content.</p>
|
||||||
|
</div>
|
||||||
|
</content>
|
||||||
|
<author>
|
||||||
|
<name>John Doe</name>
|
||||||
|
<email>johndoe@example.com</email>
|
||||||
|
</author>
|
||||||
|
</entry>
|
||||||
|
""")
|
||||||
|
|
||||||
|
print("""<?xml version="1.0" encoding="utf-8"?>
|
||||||
|
<feed xmlns="http://www.w3.org/2005/Atom">
|
||||||
|
<title>%s</title>
|
||||||
|
<updated>%s</updated>
|
||||||
|
<link href="http://example.org/feed/" rel="self" />
|
||||||
|
<link href="http://example.org/" />
|
||||||
|
<id>urn:uuid:60a76c80-d399-11d9-b91C-0003939e0af6</id>
|
||||||
|
""" % (
|
||||||
|
html["title"],
|
||||||
|
datetime.datetime.now().isoformat()
|
||||||
|
))
|
||||||
|
|
||||||
|
|
||||||
|
docinfos = extract_docinfos(html)
|
||||||
|
|
||||||
|
soup2 = bs4.BeautifulSoup(html["body"], 'html.parser')
|
||||||
print("TITRE:", html["title"])
|
divs = soup2.select("div")
|
||||||
|
|
||||||
soup = bs4.BeautifulSoup(html["body"], 'html.parser')
|
|
||||||
divs = soup.select("div")
|
|
||||||
|
|
||||||
for d in divs:
|
for d in divs:
|
||||||
# don't handle subsections
|
# don't handle subsections
|
||||||
@ -38,4 +100,12 @@ for d in divs:
|
|||||||
|
|
||||||
print("="*40)
|
print("="*40)
|
||||||
print(d)
|
print(d)
|
||||||
print(find_date(d))
|
try:
|
||||||
|
print(find_date(d))
|
||||||
|
except dateutil.parser.ParserError:
|
||||||
|
pass
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
print("</feed>")
|
||||||
|
Loading…
Reference in New Issue
Block a user