Lots of improvements (XML output, database layout, etc.)

This commit is contained in:
Michael Clemens 2020-08-13 00:44:45 +02:00
parent 34ba5b79f0
commit 6c8aca866a

View File

@ -1,8 +1,8 @@
import urllib3 import urllib3
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
import textwrap
import hashlib import hashlib
import sqlite3 import sqlite3
import time
from sqlite3 import Error from sqlite3 import Error
def create_conn(db_file): def create_conn(db_file):
@ -24,86 +24,99 @@ def create_table(conn, create_table_SQL):
print(e) print(e)
def insert_site(conn, site):
    """Insert one scraped page into the ``sites`` table.

    Args:
        conn: Open sqlite3 connection.
        site: 5-tuple ``(unixtime, hash, tafel, description, title)`` bound
            to the placeholders of the INSERT statement, in that order.

    Returns:
        The rowid of the inserted row, or ``None`` when nothing was stored
        (duplicate hash or database error).
    """
    import sqlite3

    sql = ''' INSERT INTO sites(unixtime,hash,tafel,description,title)
              VALUES(?,?,?,?,?) '''
    try:
        c = conn.cursor()
        c.execute(sql, site)
        conn.commit()
        return c.lastrowid
    except sqlite3.IntegrityError:
        # ``hash`` is the table's primary key, so re-inserting content that
        # is already stored lands here. That is the normal "nothing new"
        # case and is deliberately not reported.
        return None
    except sqlite3.Error as e:
        # Anything else (missing table, wrong tuple size, locked database)
        # is a real problem and must not be swallowed silently.
        print(e)
        return None
def get_sites(conn, limit=3):
    """Return the most recently stored pages, newest first.

    Args:
        conn: Open sqlite3 connection.
        limit: Maximum number of rows to return (defaults to 3).

    Returns:
        A list of ``(description, title)`` tuples ordered by ``unixtime``
        descending. An empty list is returned when the query fails, so
        callers can always iterate over the result.
    """
    import sqlite3

    sql = ''' SELECT description,title from sites order by unixtime desc limit ? '''
    try:
        c = conn.cursor()
        c.execute(sql, (limit,))
        return c.fetchall()
    except sqlite3.Error as e:
        print(e)
        return []
def store_site(conn, tafel):
    """Download one teletext page and store its text in the database.

    The page is fetched from the ARD mobile teletext site, the first
    ``<div class="std">`` is taken as the description and the first
    ``<h1>`` as the title. Pages without such a div are ignored.

    Args:
        conn: Open sqlite3 connection, passed through to ``insert_site``.
        tafel: Page number; appended to the base URL and stored in the
            ``tafel`` column.
    """
    link = "http://www.ard-text.de/mobil/" + str(tafel)
    http = urllib3.PoolManager()
    try:
        r = http.request('GET', link)
    except urllib3.exceptions.HTTPError as e:
        # One unreachable page must not abort the processing of the others.
        print(e)
        return
    if r.status >= 400:
        # Do not parse (and possibly store) an HTTP error page as content.
        return

    soup = BeautifulSoup(r.data, 'html.parser')
    desc = soup.find('div', class_='std')
    if desc is None:
        return

    heading = soup.find('h1')
    if heading is None:
        title = "N/A"
    else:
        title = heading.text
        # Remove any literal tag text left in the heading.
        for tag in ("<h1>", "<b>", "</h1>", "</b>"):
            title = title.replace(tag, "")

    unixtime = time.time()
    desc = desc.text
    # The MD5 of the page text is the row's hash value, which
    # identifies content that was stored before.
    desc_hash = hashlib.md5(desc.encode('utf-8')).hexdigest()
    insert_site(conn, (unixtime, desc_hash, tafel, desc, title))
def _build_rss(rows):
    """Render ``(description, title)`` rows into an RSS 2.0 document string."""
    from xml.sax.saxutils import escape

    parts = ["""<?xml version="1.0" encoding="UTF-8" ?>
<rss version="2.0">

<channel>
  <title>ARD Teletext RSS Feed (inofficial)</title>
  <link>https://www.exitnode.net</link>
  <description>bla</description>"""]

    for r in rows or ():
        cont, title = r[0], r[1]
        if cont is None:
            continue
        # Scraped text may contain '&', '<' or '>'; unescaped they would
        # make the feed malformed XML.
        cont = escape(cont.replace("\n", ""))
        title = escape(title) if title is not None else "N/A"
        parts.append("""
  <item>
    <title>""" + title + """</title>
    <description>
      """ + cont + """
    </description>
  </item>""")

    parts.append("""
</channel>
</rss>""")
    return "".join(parts)


def gen_rss(rows):
    """Print an RSS 2.0 feed built from the given rows to stdout.

    Args:
        rows: Iterable of ``(description, title)`` sequences. Rows whose
            description is ``None`` are skipped; a ``None`` title is
            rendered as "N/A". ``None`` or an empty iterable yields a feed
            without items.
    """
    print(_build_rss(rows))
def main(db=r"/home/micha/bla.db"):
    """Scrape teletext pages 104-115, store them and print an RSS feed.

    Args:
        db: Path of the SQLite database file handed to ``create_conn``.
            Defaults to the location used so far.
    """
    sql_create_sites_table = """CREATE TABLE IF NOT EXISTS sites (
                                    unixtime int NOT NULL,
                                    hash text PRIMARY KEY,
                                    tafel int,
                                    description text,
                                    title text
                                ); """
    conn = create_conn(db)

    if conn is None:
        print("Error: No db conn")
        return

    try:
        create_table(conn, sql_create_sites_table)
        for s in range(104, 116):
            store_site(conn, s)
        # Guard against a failed query handing back None instead of rows.
        rows = get_sites(conn) or []
        gen_rss(rows)
    finally:
        # NOTE(review): assumes create_conn returns a sqlite3 connection
        # (its body is not visible here) - confirm it exposes close().
        conn.close()


if __name__ == "__main__":
    main()