Update

2022-11-05 11:37:18 +01:00 · 2022-11-05 11:35:34 +01:00
1 changed file with 77 additions and 7 deletions
--- a/rst2atom.py
+++ b/rst2atom.py
@ -6,31 +6,93 @@ filename="test.rst"
 import docutils.core
 import dateutil.parser
 import datetime
 import bs4
 html = docutils.core.publish_parts(open(filename, mode="r").read(),
                                   source_path=filename,
                                   writer_name='html')
 def extract_docinfos(html):
    """
    Build a dictionary of the docinfo fields of the RsT document.

    `html` is the dictionary produced by publish_parts; only its "docinfo"
    entry is read.  For every <th class="docinfo-name"> cell, the enclosing
    <tr> row is looked up; the text of the row's first <th> is used as the
    key and the text of its first <td> as the value.
    """
    docinfo_soup = bs4.BeautifulSoup(html["docinfo"], 'html.parser')
    name_cells = docinfo_soup.find_all("th", {"class": "docinfo-name"})
    fields = {}
    for name_cell in name_cells:
        row = name_cell.find_parent("tr")
        field_name = row.find("th").text
        field_value = row.find("td").text
        # A later row with the same name overrides an earlier one.
        fields[field_name] = field_value
    return fields
 def find_date(d):
    """
    Parse the comments in a <div> section and return an ISO 8601 formatted
    string for the first recognized date string.
    Raise dateutil.parser.ParserError if no such comment contains a date.
    In the reStructuredText source, the date would typically be written as:
        This is a new entry
        ~~~~~~~~~~~~~~~~~~~
        .. Note: created on 2022/11/05
        Blah...
    """
    # Collect every comment node found under d.
    comments = d.find_all(string=lambda text: isinstance(text, bs4.Comment))
    for c in comments:
        # NOTE(review): extract() presumably also detaches the comment from
        # the parsed tree -- confirm that this side effect is intended.
        # Each space-separated token is tried on its own; the first token
        # that parses as a date ends the search.
        for k in c.extract().split(" "):
            try:
                t = dateutil.parser.parse(k)
-                return t
+                return t.isoformat()
            except dateutil.parser.ParserError:
                # This token is not a date; try the next one.
                pass
    # No token of any comment could be parsed as a date.
    raise dateutil.parser.ParserError
 def build_entry(d):
    """
    Print on stdout an Atom <entry> section built from the <div>.
    """
    print("""    <entry>
        <title>Atom-Powered Robots Run Amok</title>
        <link href="http://example.org/2003/12/13/atom03" />
        <link rel="alternate" type="text/html" href="http://example.org/2003/12/13/atom03.html"/>
        <link rel="edit" href="http://example.org/2003/12/13/atom03/edit"/>
        <id>urn:uuid:1225c695-cfb8-4ebb-aaaa-80da344efa6a</id>
        <published>2003-11-09T17:23:02Z</published>
                <content type="xhtml">
                        <div xmlns="http://www.w3.org/1999/xhtml">
                                <p>This is the entry content.</p>
                        </div>
                </content>
                <author>
                        <name>John Doe</name>
                        <email>johndoe@example.com</email>
                </author>
        </entry>
        """)
 # Emit the XML prolog and the opening of the Atom <feed> element.  The feed
 # title is the "title" part returned by publish_parts.
 # RFC 4287 date constructs must carry a time-zone designator, so <updated>
 # is filled from an aware UTC timestamp instead of a naive local one.
 print("""<?xml version="1.0" encoding="utf-8"?>
 <feed xmlns="http://www.w3.org/2005/Atom">
    <title>%s</title>
    <updated>%s</updated>
    <link href="http://example.org/feed/" rel="self" />
    <link href="http://example.org/" />
    <id>urn:uuid:60a76c80-d399-11d9-b91C-0003939e0af6</id>
 """ % (
         html["title"],
         datetime.datetime.now(datetime.timezone.utc).isoformat()
 ))
 docinfos = extract_docinfos(html)
-
+soup2 = bs4.BeautifulSoup(html["body"], 'html.parser')
-print("TITRE:", html["title"])
+divs = soup2.select("div")
 soup = bs4.BeautifulSoup(html["body"], 'html.parser')
 divs = soup.select("div")
 for d in divs:
    # don't handle subsections
@ -38,4 +100,12 @@ for d in divs:
    print("="*40)
    print(d)
-    print(find_date(d))
+    try:
        print(find_date(d))
    except dateutil.parser.ParserError:
        pass
 print("</feed>")
Author	SHA1	Message	Date
Thomas Baruchel	5c09e23172	Update	2022-11-05 11:37:18 +01:00
Thomas Baruchel	d16cbdbffc	Update	2022-11-05 11:35:34 +01:00