mirror of
https://github.com/ihabunek/toot.git
synced 2025-02-02 15:07:51 -05:00
Extract parsing html
This commit is contained in:
parent
d91c73520e
commit
199a96625b
@ -2,11 +2,10 @@ import re
|
||||
import urwid
|
||||
import unicodedata
|
||||
|
||||
from bs4 import BeautifulSoup
|
||||
from bs4.element import NavigableString, Tag
|
||||
from toot.tui.constants import PALETTE
|
||||
from toot.tui.stubs.urwidgets import TextEmbed, Hyperlink, parse_text, has_urwidgets
|
||||
from toot.utils import urlencode_url
|
||||
from toot.utils import parse_html, urlencode_url
|
||||
from typing import List, Tuple
|
||||
from urwid.util import decompose_tagmarkup
|
||||
|
||||
@ -23,7 +22,7 @@ class ContentParser:
|
||||
"""Convert html to urwid widgets"""
|
||||
widgets: List[urwid.Widget] = []
|
||||
html = unicodedata.normalize("NFKC", html)
|
||||
soup = BeautifulSoup(html.replace("&apos;", "'"), "html.parser")
|
||||
soup = parse_html(html)
|
||||
first_tag = True
|
||||
for e in soup.body or soup:
|
||||
if isinstance(e, NavigableString):
|
||||
|
@ -23,17 +23,19 @@ def str_bool_nullable(b):
|
||||
return None if b is None else str_bool(b)
|
||||
|
||||
|
||||
def get_text(html):
|
||||
"""Converts html to text, strips all tags."""
|
||||
|
||||
def parse_html(html: str) -> BeautifulSoup:
|
||||
# Ignore warnings made by BeautifulSoup, if passed something that looks like
|
||||
# a file (e.g. a dot, which matches the current directory), it will warn that the file
|
||||
# should be opened instead of passing a filename.
|
||||
with warnings.catch_warnings():
|
||||
warnings.simplefilter("ignore")
|
||||
text = BeautifulSoup(html.replace('&apos;', "'"), "html.parser").get_text()
|
||||
return BeautifulSoup(html.replace("&apos;", "'"), "html.parser")
|
||||
|
||||
return unicodedata.normalize('NFKC', text)
|
||||
|
||||
def get_text(html):
|
||||
"""Converts html to text, strips all tags."""
|
||||
text = parse_html(html).get_text()
|
||||
return unicodedata.normalize("NFKC", text)
|
||||
|
||||
|
||||
def html_to_paragraphs(html):
|
||||
|
Loading…
x
Reference in New Issue
Block a user