mirror of
https://github.com/ihabunek/toot.git
synced 2024-09-22 04:25:55 -04:00
Extract beautiful soup parsing code
This commit is contained in:
parent
2298357480
commit
b99a193704
@ -4,8 +4,9 @@ richtext
|
||||
from typing import List, Tuple
|
||||
import urwid
|
||||
import unicodedata
|
||||
|
||||
from toot.utils import bs4_parse
|
||||
from .constants import PALETTE
|
||||
from bs4 import BeautifulSoup
|
||||
from bs4.element import NavigableString, Tag
|
||||
|
||||
|
||||
@ -21,7 +22,7 @@ class ContentParser:
|
||||
"""Convert html to urwid widgets"""
|
||||
widgets: List[urwid.Widget] = []
|
||||
html = unicodedata.normalize("NFKC", html)
|
||||
soup = BeautifulSoup(html.replace("&apos;", "'"), "html.parser")
|
||||
soup = bs4_parse(html)
|
||||
for e in soup.body or soup:
|
||||
if isinstance(e, NavigableString):
|
||||
continue
|
||||
|
@ -24,15 +24,22 @@ def str_bool_nullable(b):
|
||||
|
||||
def get_text(html):
|
||||
"""Converts html to text, strips all tags."""
|
||||
text = bs4_parse(html).get_text()
|
||||
return unicodedata.normalize("NFKC", text)
|
||||
|
||||
|
||||
def bs4_parse(html: str) -> BeautifulSoup:
|
||||
# Versions of BeautifulSoup before 4.8.0 do not convert &apos; to '
|
||||
# correctly so replace it before decoding. Required in case someone still
|
||||
# uses an older version.
|
||||
html = html.replace("&apos;", "'")
|
||||
|
||||
# Ignore warnings made by BeautifulSoup, if passed something that looks like
|
||||
# a file (e.g. a dot which matches current directory), it will warn that the file
|
||||
# should be opened instead of passing a filename.
|
||||
with warnings.catch_warnings():
|
||||
warnings.simplefilter("ignore")
|
||||
text = BeautifulSoup(html.replace('&apos;', "'"), "html.parser").get_text()
|
||||
|
||||
return unicodedata.normalize('NFKC', text)
|
||||
return BeautifulSoup(html, "html.parser")
|
||||
|
||||
|
||||
def parse_html(html):
|
||||
|
Loading…
Reference in New Issue
Block a user