From 8cb294f3c866e276edb89b048718b7e5673e097f Mon Sep 17 00:00:00 2001 From: Daniel Schwarz Date: Thu, 23 Nov 2023 11:10:02 -0500 Subject: [PATCH] Use Pandoc to render markdown, fallback to plaintext Also used for markdown rendering in console and copy-to-clipboard --- setup.py | 7 +- tests/test_console.py | 204 ---------------------------------- toot/output.py | 17 +-- toot/richtext/__init__.py | 25 +++++ toot/richtext/markdown.py | 11 ++ toot/tui/app.py | 12 +- toot/tui/richtext/__init__.py | 35 +++--- toot/tui/richtext/markdown.py | 21 ++++ 8 files changed, 85 insertions(+), 247 deletions(-) create mode 100644 toot/richtext/__init__.py create mode 100644 toot/richtext/markdown.py create mode 100644 toot/tui/richtext/markdown.py diff --git a/setup.py b/setup.py index 5946faf..7ed6c15 100644 --- a/setup.py +++ b/setup.py @@ -39,12 +39,15 @@ setup( "wcwidth>=0.1.7", "urwid>=2.0.0,<3.0", "tomlkit>=0.10.0,<1.0", - "html2text>=2020.1.16" ], extras_require={ # Required to display rich text in the TUI "richtext": [ - "urwidgets>=0.1,<0.2" + "urwidgets>=0.1,<0.2", + ], + "markdown": [ + "pypandoc>=1.12.0,<2.0", + "pypandoc-binary>=1.12.0,<2.0", ], "dev": [ "coverage", diff --git a/tests/test_console.py b/tests/test_console.py index 5eeb171..1d321df 100644 --- a/tests/test_console.py +++ b/tests/test_console.py @@ -152,210 +152,6 @@ def test_timeline(mock_get, monkeypatch, capsys): assert err == "" -@mock.patch('toot.http.get') -def test_timeline_html_content(mock_get, monkeypatch, capsys): - mock_get.return_value = MockResponse([{ - 'id': '111111111111111111', - 'account': { - 'display_name': 'Frank Zappa 🎸', - 'acct': 'fz' - }, - 'created_at': '2017-04-12T15:53:18.174Z', - 'content': "

HTML Render Test

emphasized
underlined
bold
bold and italic
strikethrough
regular text

Code block:

10 PRINT \"HELLO WORLD\"
20 GOTO 10

Something blockquoted here. The indentation is maintained as the text line wraps.

  1. List item
    • Nested item
    • Another nested
  2. Another list item.
    1. Something else nested
    2. And a last nested

Blockquote

  1. List in BQ
  2. List item 2 in BQ

#hashtag #test
https://a.com text after link

", - 'reblog': None, - 'in_reply_to_id': None, - 'media_attachments': [], - }]) - - console.run_command(app, user, 'timeline', ['--once']) - - mock_get.assert_called_once_with(app, user, '/api/v1/timelines/home', {'limit': 10}) - - out, err = capsys.readouterr() - lines = out.split("\n") - reference = [ - "────────────────────────────────────────────────────────────────────────────────────────────────────", - "Frank Zappa 🎸 @fz 2017-04-12 15:53 UTC", - "", - "## HTML Render Test", - "", - " _emphasized_ ", - " _underlined_ ", - " **bold** ", - " ** _bold and italic_** ", - " ~~strikethrough~~ ", - "regular text", - "", - "Code block:", - "", - " ", - " 10 PRINT \"HELLO WORLD\" ", - " 20 GOTO 10 ", - " ", - "> Something blockquoted here. The indentation is maintained as the text line wraps.", - " 1. List item", - " • Nested item", - " • Another nested ", - " 2. Another list item. ", - " 1. Something else nested", - " 2. And a last nested", - "", - "> Blockquote", - "> 1. List in BQ", - "> 2. List item 2 in BQ", - ">", - "", - "#hashtag #test ", - "https://a.com text after link", - "", - "ID 111111111111111111 ", - "────────────────────────────────────────────────────────────────────────────────────────────────────", - "", - ] - - assert len(lines) == len(reference) - for index, line in enumerate(lines): - assert line == reference[index], f"Line #{index}: Expected:\n{reference[index]}\nGot:\n{line}" - - assert err == "" - - -@mock.patch('toot.http.get') -def test_timeline_html_content(mock_get, monkeypatch, capsys): - mock_get.return_value = MockResponse([{ - 'id': '111111111111111111', - 'account': { - 'display_name': 'Frank Zappa 🎸', - 'acct': 'fz' - }, - 'created_at': '2017-04-12T15:53:18.174Z', - 'content': "

HTML Render Test

emphasized
underlined
bold
bold and italic
strikethrough
regular text

Code block:

10 PRINT \"HELLO WORLD\"
20 GOTO 10

Something blockquoted here. The indentation is maintained as the text line wraps.

  1. List item
    • Nested item
    • Another nested
  2. Another list item.
    1. Something else nested
    2. And a last nested

Blockquote

  1. List in BQ
  2. List item 2 in BQ

#hashtag #test
https://a.com text after link

", - 'reblog': None, - 'in_reply_to_id': None, - 'media_attachments': [], - }]) - - console.run_command(app, user, 'timeline', ['--once']) - - mock_get.assert_called_once_with(app, user, '/api/v1/timelines/home', {'limit': 10}) - - out, err = capsys.readouterr() - lines = out.split("\n") - reference = [ - "────────────────────────────────────────────────────────────────────────────────────────────────────", - "Frank Zappa 🎸 @fz 2017-04-12 15:53 UTC", - "", - "## HTML Render Test", - "", - " _emphasized_ ", - " _underlined_ ", - " **bold** ", - " ** _bold and italic_** ", - " ~~strikethrough~~ ", - "regular text", - "", - "Code block:", - "", - " ", - " 10 PRINT \"HELLO WORLD\" ", - " 20 GOTO 10 ", - " ", - "> Something blockquoted here. The indentation is maintained as the text line wraps.", - " 1. List item", - " • Nested item", - " • Another nested ", - " 2. Another list item. ", - " 1. Something else nested", - " 2. And a last nested", - "", - "> Blockquote", - "> 1. List in BQ", - "> 2. List item 2 in BQ", - ">", - "", - "#hashtag #test ", - "https://a.com text after link", - "", - "ID 111111111111111111 ", - "────────────────────────────────────────────────────────────────────────────────────────────────────", - "", - ] - - assert len(lines) == len(reference) - for index, line in enumerate(lines): - assert line == reference[index], f"Line #{index}: Expected:\n{reference[index]}\nGot:\n{line}" - - assert err == "" - - -@mock.patch('toot.http.get') -def test_timeline_html_content(mock_get, monkeypatch, capsys): - mock_get.return_value = MockResponse([{ - 'id': '111111111111111111', - 'account': { - 'display_name': 'Frank Zappa 🎸', - 'acct': 'fz' - }, - 'created_at': '2017-04-12T15:53:18.174Z', - 'content': "

HTML Render Test

emphasized
underlined
bold
bold and italic
strikethrough
regular text

Code block:

10 PRINT \"HELLO WORLD\"
20 GOTO 10

Something blockquoted here. The indentation is maintained as the text line wraps.

  1. List item
    • Nested item
    • Another nested
  2. Another list item.
    1. Something else nested
    2. And a last nested

Blockquote

  1. List in BQ
  2. List item 2 in BQ

#hashtag #test
https://a.com text after link

", - 'reblog': None, - 'in_reply_to_id': None, - 'media_attachments': [], - }]) - - console.run_command(app, user, 'timeline', ['--once']) - - mock_get.assert_called_once_with(app, user, '/api/v1/timelines/home', {'limit': 10}) - - out, err = capsys.readouterr() - lines = out.split("\n") - reference = [ - "────────────────────────────────────────────────────────────────────────────────────────────────────", - "Frank Zappa 🎸 @fz 2017-04-12 15:53 UTC", - "", - "## HTML Render Test", - "", - " _emphasized_ ", - " _underlined_ ", - " **bold** ", - " ** _bold and italic_** ", - " ~~strikethrough~~ ", - "regular text", - "", - "Code block:", - "", - " ", - " 10 PRINT \"HELLO WORLD\" ", - " 20 GOTO 10 ", - " ", - "> Something blockquoted here. The indentation is maintained as the text line wraps.", - " 1. List item", - " • Nested item", - " • Another nested ", - " 2. Another list item. ", - " 1. Something else nested", - " 2. And a last nested", - "", - "> Blockquote", - "> 1. List in BQ", - "> 2. List item 2 in BQ", - ">", - "", - "#hashtag #test ", - "https://a.com text after link", - "", - "ID 111111111111111111 ", - "────────────────────────────────────────────────────────────────────────────────────────────────────", - "", - ] - - assert len(lines) == len(reference) - for index, line in enumerate(lines): - assert line == reference[index], f"Line #{index}: Expected:\n{reference[index]}\nGot:\n{line}" - - assert err == "" - - @mock.patch('toot.http.get') def test_timeline_with_re(mock_get, monkeypatch, capsys): mock_get.return_value = MockResponse([{ diff --git a/toot/output.py b/toot/output.py index 7bdc333..9bf7d91 100644 --- a/toot/output.py +++ b/toot/output.py @@ -2,11 +2,11 @@ import os import re import sys import textwrap -import html2text from functools import lru_cache from toot import settings from toot.utils import get_text +from toot.richtext import html_to_text from toot.entities import Account, Instance, Notification, Poll, Status from toot.wcstring import wc_wrap from typing import List @@ -321,20 +321,9 @@ def print_status(status: Status, width: int = 80): def print_html(text, width=80): - h2t = html2text.HTML2Text() - - h2t.body_width = width - h2t.single_line_break = True - h2t.ignore_links = True - h2t.wrap_links = True - h2t.wrap_list_items = True - h2t.wrap_tables = True - h2t.unicode_snob = True - h2t.ul_item_mark = "\N{bullet}" - markdown = h2t.handle(text).strip() - + markdown = "\n".join(html_to_text(text, columns=width, highlight_tags=False)) print_out("") - print_out(highlight_hashtags(markdown)) + print_out(markdown) def print_poll(poll: Poll): diff --git a/toot/richtext/__init__.py b/toot/richtext/__init__.py new file mode 100644 index 0000000..9888a5d --- /dev/null +++ b/toot/richtext/__init__.py @@ -0,0 +1,25 @@ +from toot.tui.utils import highlight_hashtags +from toot.utils import html_to_paragraphs +from toot.wcstring import wc_wrap +from typing import List + +try: + # first preference, render markup with pypandoc + from .markdown import html_to_text + +except ImportError: + # Fallback to render in plaintext + def html_to_text(html: str, columns=80, highlight_tags=False) -> List: + output = [] + first = True + for paragraph in html_to_paragraphs(html): + if not first: + output.append("") + for line in paragraph: + for subline in wc_wrap(line, columns): + if highlight_tags: + output.append(highlight_hashtags(subline)) + else: + output.append(subline) + first = False + return output diff --git a/toot/richtext/markdown.py b/toot/richtext/markdown.py new file mode 100644 index 0000000..a3ea03c --- /dev/null +++ b/toot/richtext/markdown.py @@ -0,0 +1,11 @@ +from pypandoc import convert_text +from typing import List + + +def html_to_text(html: str, columns=80, highlight_tags=False) -> List: + return [convert_text( + html, + format="html", + to="gfm-raw_html", + extra_args=["--wrap=auto", f"--columns={columns}"], + )] diff --git a/toot/tui/app.py b/toot/tui/app.py index d90428d..838b7b3 100644 --- a/toot/tui/app.py +++ b/toot/tui/app.py @@ -1,13 +1,13 @@ import logging import subprocess import urwid -import html2text from concurrent.futures import ThreadPoolExecutor from toot import api, config, __version__, settings from toot.console import get_default_visibility from toot.exceptions import ApiError +from toot.richtext import html_to_text from toot.utils.datetime import parse_datetime from .compose import StatusComposer @@ -656,12 +656,8 @@ class TUI(urwid.Frame): return self.run_in_thread(_delete, done_callback=_done) def copy_status(self, status): - h2t = html2text.HTML2Text() - h2t.body_width = 0 # nowrap - h2t.single_line_break = True - h2t.ignore_links = True - h2t.unicode_snob = True - h2t.ul_item_mark = "\N{bullet}" + + markdown = "\n".join(html_to_text(status.original.data["content"], columns=1024, highlight_tags=False)) time = parse_datetime(status.original.data['created_at']) time = time.strftime('%Y-%m-%d %H:%M %Z') @@ -671,7 +667,7 @@ class TUI(urwid.Frame): + "\n" + (status.original.author.account or "") + "\n\n" - + h2t.handle(status.original.data["content"]).strip() + + markdown + "\n\n" + f"Created at: {time}") diff --git a/toot/tui/richtext/__init__.py b/toot/tui/richtext/__init__.py index 2793493..e0e43dc 100644 --- a/toot/tui/richtext/__init__.py +++ b/toot/tui/richtext/__init__.py @@ -1,27 +1,24 @@ import urwid -import html2text - +from toot.tui.utils import highlight_hashtags +from toot.utils import format_content from typing import List try: + # our first preference is to render using urwidgets from .richtext import html_to_widgets, url_to_widget + except ImportError: - # Fallback if urwidgets are not available - def html_to_widgets(html: str) -> List[urwid.Widget]: - return [ - urwid.Text(_format_markdown(html)) - ] + try: + # second preference, render markup with pypandoc + from .markdown import html_to_widgets, url_to_widget - def url_to_widget(url: str): - return urwid.Text(("link", url)) + except ImportError: + # Fallback to render in plaintext - def _format_markdown(html) -> str: - h2t = html2text.HTML2Text() - h2t.single_line_break = True - h2t.ignore_links = True - h2t.wrap_links = False - h2t.wrap_list_items = False - h2t.wrap_tables = False - h2t.unicode_snob = True - h2t.ul_item_mark = "\N{bullet}" - return h2t.handle(html).strip() + def url_to_widget(url: str): + return urwid.Text(("link", url)) + + def html_to_widgets(html: str) -> List[urwid.Widget]: + return [ + urwid.Text(highlight_hashtags(line)) for line in format_content(html) + ] diff --git a/toot/tui/richtext/markdown.py b/toot/tui/richtext/markdown.py new file mode 100644 index 0000000..dcc5e7a --- /dev/null +++ b/toot/tui/richtext/markdown.py @@ -0,0 +1,21 @@ +import urwid +from pypandoc import convert_text + +from typing import List + + +def url_to_widget(url: str): + return urwid.Text(("link", url)) + + +def html_to_widgets(html: str) -> List[urwid.Widget]: + return [ + urwid.Text( + convert_text( + html, + format="html", + to="gfm-raw_html", + extra_args=["--wrap=none"], + ) + ) + ]