Thomas Baruchel 2022-11-05 12:22:32 +01:00
parent c21562a04a
commit e2258b08fa
1 changed file with 22 additions and 30 deletions

@@ -69,30 +69,35 @@ def build_entry(d):
     """
     Print on stdout an Atom <entry> section built from the <div>.
     """
+    # Get the date
+    mydate = ""
+    try:
+        mydate = find_date(d)
+    except dateutil.parser.ParserError:
+        pass
     print(""" <entry>
-    <title>Atom-Powered Robots Run Amok</title>
-    <link href="http://example.org/2003/12/13/atom03" />
-    <link rel="alternate" type="text/html" href="http://example.org/2003/12/13/atom03.html"/>
-    <link rel="edit" href="http://example.org/2003/12/13/atom03/edit"/>
+    <title>%s</title>
     <id>urn:uuid:1225c695-cfb8-4ebb-aaaa-80da344efa6a</id>
-    <published>2003-11-09T17:23:02Z</published>
+    <updated>%s</updated>
     <content type="xhtml">
       <div xmlns="http://www.w3.org/1999/xhtml">
         <p>This is the entry content.</p>
       </div>
     </content>
-    <author>
-      <name>John Doe</name>
-      <email>johndoe@example.com</email>
-    </author>
-  </entry>
-""")
+  </entry>
+""" % (
+    d.find("h1").text,
+    mydate
+    ))
 
 docinfos = extract_docinfos(html)
 meta = extract_meta(html)
+
+soup2 = bs4.BeautifulSoup(html["body"], 'html.parser')
+divs = soup2.select("div")
 
 print("""<?xml version="1.0" encoding="utf-8"?>
 <feed xmlns="http://www.w3.org/2005/Atom">
   <id>%s</id>
@@ -118,22 +123,9 @@ print("""<?xml version="1.0" encoding="utf-8"?>
     generator_uri, generator_name
 ))
-
-soup2 = bs4.BeautifulSoup(html["body"], 'html.parser')
-divs = soup2.select("div")
 
 for d in divs:
     # don't handle subsections
     if d.find_parent("div"): continue
-    print("="*40)
-    print(d)
-    try:
-        print(find_date(d))
-    except dateutil.parser.ParserError:
-        pass
+    build_entry(d)
 
 print("</feed>")