This commit is contained in:
Thomas Baruchel 2022-11-05 11:35:34 +01:00
parent 30e2b29161
commit d16cbdbffc
1 changed files with 73 additions and 7 deletions

View File

@ -6,31 +6,89 @@ filename="test.rst"
import docutils.core
import dateutil.parser
import datetime
import bs4
# Render the reStructuredText source into the dictionary of HTML parts that
# the rest of the script reads ("title", "docinfo", "body").
# The context manager closes the source file as soon as it has been read
# instead of leaving the handle to the garbage collector.
with open(filename, mode="r") as source_file:
    html = docutils.core.publish_parts(source_file.read(),
                                       source_path=filename,
                                       writer_name='html')
def extract_docinfos(html):
    """
    Collect the docinfo fields of the rendered document into a dictionary.

    html is a mapping whose "docinfo" item holds an HTML fragment. For every
    <th class="docinfo-name"> cell of that fragment, the enclosing <tr> is
    looked up; the text of its first <th> becomes the key and the text of
    its first <td> becomes the value.
    """
    docinfo_tree = bs4.BeautifulSoup(html["docinfo"], 'html.parser')
    fields = {}
    for header in docinfo_tree.find_all("th", {"class": "docinfo-name"}):
        row = header.find_parent("tr")
        fields[row.find("th").text] = row.find("td").text
    return fields
def find_date(d):
    """
    Return the first date found in the comments of a <div> section.

    Every comment node below d is split on whitespace and each word is
    handed to dateutil; the first word recognized as a date is returned as
    an ISO 8601 formatted string.

    Raise dateutil.parser.ParserError if no comment contains a date.

    In the reStructuredText source, the date would typically be written as:

        This is a new entry
        ~~~~~~~~~~~~~~~~~~~

        .. Note: created on 2022/11/05

        Blah...
    """
    comments = d.find_all(string=lambda text: isinstance(text, bs4.Comment))
    for c in comments:
        # extract() detaches the comment node from the tree and returns it,
        # so a scanned comment no longer appears when d is rendered later.
        # split() without argument also breaks on newlines and tabs, which
        # matters for comments spanning several lines.
        for k in c.extract().split():
            try:
                t = dateutil.parser.parse(k)
            except (dateutil.parser.ParserError, OverflowError):
                # Not a date (or a number too large to be one): next word.
                continue
            return t.isoformat()
    raise dateutil.parser.ParserError("no date found in the comments")
def build_entry(d):
    """
    Write an Atom <entry> element for the <div> to stdout.

    NOTE(review): the element is currently a fixed placeholder; d is not
    read yet.
    """
    entry_xml = """ <entry>
<title>Atom-Powered Robots Run Amok</title>
<link href="http://example.org/2003/12/13/atom03" />
<link rel="alternate" type="text/html" href="http://example.org/2003/12/13/atom03.html"/>
<link rel="edit" href="http://example.org/2003/12/13/atom03/edit"/>
<id>urn:uuid:1225c695-cfb8-4ebb-aaaa-80da344efa6a</id>
<published>2003-11-09T17:23:02Z</published>
<content type="xhtml">
<div xmlns="http://www.w3.org/1999/xhtml">
<p>This is the entry content.</p>
</div>
</content>
<author>
<name>John Doe</name>
<email>johndoe@example.com</email>
</author>
</entry>
"""
    print(entry_xml)
# Open the Atom document: XML declaration, <feed> element and feed-level
# metadata. The matching </feed> is printed at the very end of the script.
print("""<?xml version="1.0" encoding="utf-8"?>
<feed xmlns="http://www.w3.org/2005/Atom">
<title>%s</title>
<updated>%s</updated>
<link href="http://example.org/feed/" rel="self" />
<link href="http://example.org/" />
<id>urn:uuid:60a76c80-d399-11d9-b91C-0003939e0af6</id>
""" % (
    html["title"],
    # Atom date constructs must be RFC 3339 timestamps carrying a time
    # offset; a naive now() has none, so use an aware UTC timestamp.
    datetime.datetime.now(datetime.timezone.utc).isoformat(),
))
import sys

docinfos = extract_docinfos(html)

# Diagnostics go to stderr so that stdout carries nothing but the Atom
# document being generated.
print("TITRE:", html["title"], file=sys.stderr)

# Parse the rendered body once; the previous first parse was discarded as
# soon as divs was rebound from the second one.
soup2 = bs4.BeautifulSoup(html["body"], 'html.parser')
divs = soup2.select("div")
for d in divs:
# don't handle subsections
@ -38,4 +96,12 @@ for d in divs:
print("="*40)
print(d)
print(find_date(d))
try:
print(find_date(d))
except dateutil.parser.ParserError:
pass
print("</feed>")