1
0
mirror of https://github.com/ihabunek/toot.git synced 2024-06-30 06:35:24 +00:00

Merge pull request #83 from dlax/apos

Replace &apos; by "'" before parsing HTML
This commit is contained in:
Ivan Habunek 2019-01-02 11:39:21 +01:00 committed by GitHub
commit fc57d2695a
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 5 additions and 4 deletions

View File

@ -126,7 +126,7 @@ def test_timeline(mock_get, monkeypatch, capsys):
'username': 'fz' 'username': 'fz'
}, },
'created_at': '2017-04-12T15:53:18.174Z', 'created_at': '2017-04-12T15:53:18.174Z',
'content': "<p>The computer can't tell you the emotional story. It can give you the exact mathematical design, but what's missing is the eyebrows.</p>", 'content': "<p>The computer can&apos;t tell you the emotional story. It can give you the exact mathematical design, but what's missing is the eyebrows.</p>",
'reblog': None, 'reblog': None,
}]) }])
@ -136,6 +136,7 @@ def test_timeline(mock_get, monkeypatch, capsys):
out, err = capsys.readouterr() out, err = capsys.readouterr()
assert "The computer can't tell you the emotional story." in out assert "The computer can't tell you the emotional story." in out
assert "but what's missing is the eyebrows." in out
assert "Frank Zappa" in out assert "Frank Zappa" in out
assert "@fz" in out assert "@fz" in out

View File

@ -148,8 +148,8 @@ def print_timeline(items):
content = item['reblog']['content'] if item['reblog'] else item['content'] content = item['reblog']['content'] if item['reblog'] else item['content']
reblogged = item['reblog']['account']['username'] if item['reblog'] else None reblogged = item['reblog']['account']['username'] if item['reblog'] else None
soup = BeautifulSoup(content, "html.parser") soup = BeautifulSoup(content.replace('&apos;', "'"), "html.parser")
text = soup.get_text().replace('&apos;', "'") text = soup.get_text()
time = datetime.strptime(item['created_at'], "%Y-%m-%dT%H:%M:%S.%fZ") time = datetime.strptime(item['created_at'], "%Y-%m-%dT%H:%M:%S.%fZ")
return { return {

View File

@ -12,7 +12,7 @@ from toot.exceptions import ConsoleError
def get_text(html): def get_text(html):
"""Converts html to text, strips all tags.""" """Converts html to text, strips all tags."""
text = BeautifulSoup(html, "html.parser").get_text().replace('&apos;', "'") text = BeautifulSoup(html.replace('&apos;', "'"), "html.parser").get_text()
return unicodedata.normalize('NFKC', text) return unicodedata.normalize('NFKC', text)