1
0
mirror of https://github.com/ihabunek/toot.git synced 2024-11-03 04:17:21 -05:00

Normalize unicode

This commit is contained in:
Ivan Habunek 2018-01-21 16:39:40 +01:00
parent cb1f7b4e61
commit 2ecc6a28c6
No known key found for this signature in database
GPG Key ID: CDBD63C43A30BB95

View File

@@ -2,6 +2,7 @@
import re
import socket
import unicodedata
from bs4 import BeautifulSoup
@@ -10,7 +11,9 @@ from toot.exceptions import ConsoleError
def get_text(html):
"""Converts html to text, strips all tags."""
return BeautifulSoup(html, "html.parser").get_text().replace('&apos;', "'")
text = BeautifulSoup(html, "html.parser").get_text().replace('&apos;', "'")
return unicodedata.normalize('NFKC', text)
def parse_html(html):