From e224f8b85ce5447f6f0aaae362753276f9108f14 Mon Sep 17 00:00:00 2001 From: Mike Lang Date: Sun, 6 Aug 2023 16:58:38 +1000 Subject: [PATCH] tootbot: html converter --- zulip/tootbot.py | 141 ++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 133 insertions(+), 8 deletions(-) diff --git a/zulip/tootbot.py b/zulip/tootbot.py index 687f46e..442c5f9 100644 --- a/zulip/tootbot.py +++ b/zulip/tootbot.py @@ -1,7 +1,8 @@ import argh import yaml -from mastodon import Mastodon +import mastodon +from bs4 import BeautifulSoup import zulip @@ -36,13 +37,137 @@ def format_status(status): if reblog is not None: boostee = format_account(reblog["account"]) boost_url = reblog["url"] - content = format_content(reblog["content"]) + content = format_content(reblog) return f"{sender} reblogged {boostee}'s [post]({boost_url})\n{content}" - return f"{" + content = format_content(status) + if reply: + return f"{sender} [replied]({url}) to message {reply}:\n{content}" -class Listener(Mastodon.StreamListener): + return f"{sender} [posted]({url}):\n{content}" + + +def md_wrap(kind, inner): + """ + Wrap inner in a markdown block with given kind, eg. "quote" or "spoiler TEXT HERE" + """ + return "\n".join([ + f"```{kind}", + inner, + "```", + ]) + + +def format_content(status): + # Main status text should be rendered into text and quoted + html = BeautifulSoup(status["content"], "html.parser") + text = html_to_md(html) + parts = [md_wrap("quote", text)] + # Append attachments, with the link name being the alt text. + # For images at least, zulip will auto-preview them. + for attachment in status["media_attachments"]: + type = attachment["type"] + description = attachment["description"] or "(no description provided)" + url = attachment["url"] + parts.append(f"Attached {type}: [{description}]({url})") + output = "\n".join(parts) + # If content warnings present, wrap in a spoiler block. 
+ warning = status["spoiler_text"] + if warning: + output = md_wrap(f"spoiler {warning}", output) + return output + + +def html_to_md(html): + """ + Take a status HTML as a BeautifulSoup object + and make a best-effort attempt to render it in markdown: + * Convert each

<p> section to text ending in a newline + * Convert <a> tags to links + * Convert formatting tags to the markdown equivalent + * Ignore other tags and just pass through their inner content + + Note that zulip has no way of escaping markdown syntax, so if a message contains + characters that happen to be valid markdown, there's not much we can do about it. + The only thing that could cause problems is a malicious input that breaks out of + our surrounding quote block. And you know what? Fine, they can have that. We can always + just read the link to see the real deal. + """ + if html.name is None: + # Raw string, return it without surrounding whitespace + return html.strip() + + if html.name == "br": + # <br>
should never have any contents, and just become a newline + return "\n" + + # Lists need to be handled specially as they should only contain

<li> elements + # and the <li> elements should be rendered differently depending on the outer element. + if html.name in ("ul", "ol"): + prefix = {"ul": "*", "ol": "1."}[html.name] + items = [] + for item in html.children: + if item.name != "li": + logging.warning(f"Ignoring non-<li> inside <{html.name}> element") + continue + inner = "".join(html_to_md(child) for child in item.children) + # Prepend two spaces to every line, to create a "nesting" effect + # for sub-lists. This may break things if mixed with blockquotes but oh well. + inner = "\n".join(" " + line for line in inner.split("\n")) + items.append(prefix + inner) + return "\n".join(items) + + # Get contents recursively. Bail if empty as this might cause weird things if rendered naively, + # eg. <i></i> would be rendered ** which would actually make other text bold. + inner = "".join(html_to_md(child) for child in html.children) + if not inner: + return "" + + #

<p> should insert a newline at the end + if html.name == "p": + return f"{inner}\n" + + # <a> should become a link + href = html.get("href") + if html.name == "a" and href: + return f"[{inner}]({href})" + + #

<blockquote> creates a nested quote block which is its own paragraph. + if html.name == "blockquote": + return md_wrap("quote", inner) + + # The following tag types are simple character-based format wrappers. + # Note we don't handle <u> due to lack of underline formatting in markdown. + # We treat all headings as bold. + CHAR_FORMAT = { + "b": "**", + "strong": "**", + "h1": "**", + "h2": "**", + "h3": "**", + "h4": "**", + "h5": "**", + "h6": "**", + "i": "*", + "em": "*", + "del": "~~", + "pre": "`", + "code": "`", + } + if html.name in CHAR_FORMAT: + char = CHAR_FORMAT[html.name] + return f"{char}{inner}{char}" + + # For any other types, most notably <span> but also anything that we don't recognize, + # just pass the inner text through unchanged. + return inner + + +LINE = "\n---" + + +class Listener(mastodon.StreamListener): def __init__(self, zulip_client, stream, post_topic, notification_topic): self.zulip_client = zulip_client self.stream = stream @@ -55,7 +180,7 @@ class Listener(Mastodon.StreamListener): def on_update(self, status): logging.info(f"Got update: {status!r}") - self.send(self.post_topic, format_status(status)) + self.send(self.post_topic, format_status(status) + LINE) def on_delete(self, status_id): logging.info(f"Got delete: {status_id}") @@ -63,13 +188,13 @@ class Listener(Mastodon.StreamListener): def on_status_update(self, status): logging.info(f"Got status update: {status!r}") - self.send(self.post_topic, f"*The following status has been updated*\n{format_status(status)}") + self.send(self.post_topic, f"*The following status has been updated*\n{format_status(status)}" + LINE) def on_notification(self, notification): logging.info(f"Got {notification['type']} notification: {notification!r}") if notification["type"] != "mention": return - self.send(self.notification_topic, format_status(status)) + self.send(self.notification_topic, format_status(status) + LINE) @cli @@ -95,7 +220,7 @@ def main(conf_file, stream="bot-spam", post_topic="Toots from Desert Bus", 
notif mc = conf["mastodon"] zulip_client = zulip.Client(zc["url"], zc["email"], zc["api_key"]) - mastodon = Mastodon(api_base_url=mc["url"], access_token=mc["access_token"]) + mastodon = mastodon.Mastodon(api_base_url=mc["url"], access_token=mc["access_token"]) listener = Listener(zulip_client, stream, post_topic, notification_topic) logging.info("Starting")