Update

2022-11-05 11:35:34 +01:00 · 2022-11-05 11:35:34 +01:00 · d16cbdbffc
parent 30e2b29161
commit d16cbdbffc
1 changed file with 73 additions and 7 deletions
--- a/rst2atom.py
+++ b/rst2atom.py
@@ -6,31 +6,89 @@ filename="test.rst"

 import docutils.core
 import dateutil.parser
+import datetime
 import bs4

 html = docutils.core.publish_parts(open(filename, mode="r").read(),
                                   source_path=filename,
                                   writer_name='html')

+
+def extract_docinfos(html):
+    soup1 = bs4.BeautifulSoup(html["docinfo"], 'html.parser')
+    return { tr.find("th").text:
+             tr.find("td").text
+               for tr in [
+                 e.find_parent("tr") for e in
+                 soup1.find_all("th", {"class": "docinfo-name"})
+               ] }
+
+
 def find_date(d):
+    """
+    Parse the comments in a <div> section and return an ISO 8601 formatted
+    string for the first recognized date.
+    Raise dateutil.parser.ParserError if no such comment contains a date.
+
+    In reStructuredText, the date would typically be written as:
+
+        This is a new entry
+        ~~~~~~~~~~~~~~~~~~~
+        .. Note: created on 2022/11/05
+
+        Blah...
+    """
    comments = d.find_all(string=lambda text: isinstance(text, bs4.Comment))
    for c in comments:
        for k in c.extract().split(" "):
            try:
                t = dateutil.parser.parse(k)
-                return t
+                return t.isoformat()
            except dateutil.parser.ParserError:
                pass
+    raise dateutil.parser.ParserError


+def build_entry(d):
+    """
+    Print a placeholder Atom <entry> on stdout; the <div> is not used yet.
+    """
+    print("""    <entry>
+        <title>Atom-Powered Robots Run Amok</title>
+        <link href="http://example.org/2003/12/13/atom03" />
+        <link rel="alternate" type="text/html" href="http://example.org/2003/12/13/atom03.html"/>
+        <link rel="edit" href="http://example.org/2003/12/13/atom03/edit"/>
+        <id>urn:uuid:1225c695-cfb8-4ebb-aaaa-80da344efa6a</id>
+        <published>2003-11-09T17:23:02Z</published>
+                <content type="xhtml">
+                        <div xmlns="http://www.w3.org/1999/xhtml">
+                                <p>This is the entry content.</p>
+                        </div>
+                </content>
+                <author>
+                        <name>John Doe</name>
+                        <email>johndoe@example.com</email>
+                </author>
+        </entry>
+        """)
+
+print("""<?xml version="1.0" encoding="utf-8"?>
+<feed xmlns="http://www.w3.org/2005/Atom">
+    <title>%s</title>
+    <updated>%s</updated>
+    <link href="http://example.org/feed/" rel="self" />
+    <link href="http://example.org/" />
+    <id>urn:uuid:60a76c80-d399-11d9-b91C-0003939e0af6</id>
+""" % (
+         html["title"],
+         datetime.datetime.now().isoformat()
+))


+docinfos = extract_docinfos(html)

-
-print("TITRE:", html["title"])
-
-soup = bs4.BeautifulSoup(html["body"], 'html.parser')
-divs = soup.select("div")
+soup2 = bs4.BeautifulSoup(html["body"], 'html.parser')
+divs = soup2.select("div")

 for d in divs:
    # don't handle subsections
@@ -38,4 +96,12 @@ for d in divs:

    print("="*40)
    print(d)
-    print(find_date(d))
+    try:
+        print(find_date(d))
+    except dateutil.parser.ParserError:
+        pass
+
+
+
+
+print("</feed>")