1
0
mirror of https://github.com/ihabunek/toot.git synced 2024-06-30 06:35:24 +00:00

Replace `&apos;` by "'" before parsing HTML

BeautifulSoup does not parse HTML entities like `&apos;` as we expect,
and the previous logic of replacing this *after* HTML parsing occurred
did not produce the expected results.

To illustrate this, we change data in "test_timeline" to include a
literal `&apos;` as it sometimes occurs in data returned by the Mastodon API.
New HTML content is:

    <p>The computer can&apos;t tell you the emotional story [...] </p>

BeautifulSoup will parse this as:

    <p>The computer can&amp;apost tell you the emotional story [...] </p>

which is not what we expect.

We fix this by replacing `&apos;` *before* HTML parsing by BeautifulSoup.
Since test data in "test_timeline" got updated we also add an extra
assertion checking that part of the content with a literal "'" is
(still) properly rendered.
This commit is contained in:
Denis Laxalde 2019-01-01 22:55:49 +01:00
parent 91fc273af7
commit 0f6bd920c3
3 changed files with 5 additions and 4 deletions

View File

@ -126,7 +126,7 @@ def test_timeline(mock_get, monkeypatch, capsys):
'username': 'fz' 'username': 'fz'
}, },
'created_at': '2017-04-12T15:53:18.174Z', 'created_at': '2017-04-12T15:53:18.174Z',
'content': "<p>The computer can't tell you the emotional story. It can give you the exact mathematical design, but what's missing is the eyebrows.</p>", 'content': "<p>The computer can&apos;t tell you the emotional story. It can give you the exact mathematical design, but what's missing is the eyebrows.</p>",
'reblog': None, 'reblog': None,
}]) }])
@ -136,6 +136,7 @@ def test_timeline(mock_get, monkeypatch, capsys):
out, err = capsys.readouterr() out, err = capsys.readouterr()
assert "The computer can't tell you the emotional story." in out assert "The computer can't tell you the emotional story." in out
assert "but what's missing is the eyebrows." in out
assert "Frank Zappa" in out assert "Frank Zappa" in out
assert "@fz" in out assert "@fz" in out

View File

@ -148,8 +148,8 @@ def print_timeline(items):
content = item['reblog']['content'] if item['reblog'] else item['content'] content = item['reblog']['content'] if item['reblog'] else item['content']
reblogged = item['reblog']['account']['username'] if item['reblog'] else None reblogged = item['reblog']['account']['username'] if item['reblog'] else None
soup = BeautifulSoup(content, "html.parser") soup = BeautifulSoup(content.replace('&apos;', "'"), "html.parser")
text = soup.get_text().replace('&apos;', "'") text = soup.get_text()
time = datetime.strptime(item['created_at'], "%Y-%m-%dT%H:%M:%S.%fZ") time = datetime.strptime(item['created_at'], "%Y-%m-%dT%H:%M:%S.%fZ")
return { return {

View File

@ -11,7 +11,7 @@ from toot.exceptions import ConsoleError
def get_text(html): def get_text(html):
"""Converts html to text, strips all tags.""" """Converts html to text, strips all tags."""
text = BeautifulSoup(html, "html.parser").get_text().replace('&apos;', "'") text = BeautifulSoup(html.replace('&apos;', "'"), "html.parser").get_text()
return unicodedata.normalize('NFKC', text) return unicodedata.normalize('NFKC', text)