From 2ecc6a28c6b1cd2efd4bd94d801954e87ab1b320 Mon Sep 17 00:00:00 2001 From: Ivan Habunek Date: Sun, 21 Jan 2018 16:39:40 +0100 Subject: [PATCH] Normalize unicode --- toot/utils.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/toot/utils.py b/toot/utils.py index b7ae649..dc22bfb 100644 --- a/toot/utils.py +++ b/toot/utils.py @@ -2,6 +2,7 @@ import re import socket +import unicodedata from bs4 import BeautifulSoup @@ -10,7 +11,9 @@ from toot.exceptions import ConsoleError def get_text(html): """Converts html to text, strips all tags.""" - return BeautifulSoup(html, "html.parser").get_text().replace('&apos;', "'") + text = BeautifulSoup(html, "html.parser").get_text().replace('&apos;', "'") + + return unicodedata.normalize('NFKC', text) def parse_html(html):