tootbot: html converter

pull/315/merge
Mike Lang 1 year ago committed by Mike Lang
parent e12191686f
commit e224f8b85c

@ -1,7 +1,8 @@
import argh
import yaml
from mastodon import Mastodon
import mastodon
from bs4 import BeautifulSoup
import zulip
@ -36,13 +37,137 @@ def format_status(status):
if reblog is not None:
boostee = format_account(reblog["account"])
boost_url = reblog["url"]
content = format_content(reblog["content"])
content = format_content(reblog)
return f"{sender} reblogged {boostee}'s [post]({boost_url})\n{content}"
return f"{"
content = format_content(status)
if reply:
return f"{sender} [replied]({url}) to message {reply}:\n{content}"
class Listener(Mastodon.StreamListener):
return f"{sender} [posted]({url}):\n{content}"
def md_wrap(kind, inner):
    """
    Surround inner with a fenced markdown block.

    kind is the text placed directly after the opening fence,
    eg. "quote" or "spoiler TEXT HERE".
    """
    opening = f"```{kind}"
    closing = "```"
    return "\n".join((opening, inner, closing))
def format_content(status):
    """
    Render the content of a status as markdown.

    The status HTML is converted to markdown and placed in a quote block, followed by
    one line per media attachment. If the status has spoiler text (a content warning),
    the whole result is wrapped in a spoiler block titled with that text.
    """
    # Main status text is rendered to markdown and quoted
    soup = BeautifulSoup(status["content"], "html.parser")
    lines = [md_wrap("quote", html_to_md(soup))]

    # One line per attachment, using the alt text as the link name.
    # For images at least, zulip will auto-preview them.
    lines.extend(
        f"Attached {media['type']}: "
        f"[{media['description'] or '(no description provided)'}]({media['url']})"
        for media in status["media_attachments"]
    )
    rendered = "\n".join(lines)

    # A non-empty content warning means everything goes inside a spoiler block
    spoiler = status["spoiler_text"]
    return md_wrap(f"spoiler {spoiler}", rendered) if spoiler else rendered
# Tags that are simple character-based format wrappers, mapped to their markdown delimiter.
# Note we don't handle <u> due to lack of underline formatting in markdown.
# We treat all headings as bold.
_CHAR_FORMAT = {
    "b": "**",
    "strong": "**",
    "h1": "**",
    "h2": "**",
    "h3": "**",
    "h4": "**",
    "h5": "**",
    "h6": "**",
    "i": "*",
    "em": "*",
    "del": "~~",
    "pre": "`",
    "code": "`",
}


def _join_pieces(pieces):
    """
    Concatenate rendered pieces, tidying the whitespace where two pieces meet:
    spaces next to a newline are dropped and two adjacent spaces become one.
    """
    joined = []
    for piece in pieces:
        if joined and joined[-1].endswith(("\n", " ")):
            piece = piece.lstrip(" ")
        if not piece:
            continue
        if joined and piece.startswith("\n"):
            joined[-1] = joined[-1].rstrip(" ")
        joined.append(piece)
    return "".join(joined)


def html_to_md(html):
    """
    Take a status HTML as a BeautifulSoup object
    and make a best-effort attempt to render it in markdown:
    * Convert each <p> section to text ending in a newline
    * Convert <a> tags to links
    * Convert formatting tags to the markdown equivalent
    * Ignore other tags and just pass through their inner content

    Whitespace around a piece of text is reduced to a single space rather than removed,
    so that words on either side of an inline tag stay separated. For links and
    formatting tags that space is placed outside the markdown delimiters.

    Note that zulip has no way of escaping markdown syntax, so if a message contains
    characters that happen to be valid markdown, there's not much we can do about it.
    The only thing that could cause problems is a malicious input that breaks out of
    our surrounding quote block. And you know what? Fine, they can have that. We can always
    just read the link to see the real deal.

    Returns the rendered markdown as a string, which is empty if the element
    has no content.
    """
    if html.name is None:
        # Raw string. Keep at most one space in place of any surrounding whitespace,
        # otherwise "Hello <b>world</b>" would be rendered as "Hello**world**".
        core = html.strip()
        if not core:
            return " " if html else ""
        lead = " " if html[:1].isspace() else ""
        trail = " " if html[-1:].isspace() else ""
        return f"{lead}{core}{trail}"

    if html.name == "br":
        # <br> should never have any contents, and just become a newline
        return "\n"

    # Lists need to be handled specially as they should only contain <li> elements
    # and the <li> elements should be rendered differently depending on the outer element.
    if html.name in ("ul", "ol"):
        prefix = {"ul": "*", "ol": "1."}[html.name]
        items = []
        for item in html.children:
            if item.name is None and not item.strip():
                # Whitespace between <li> elements is layout, not content
                continue
            if item.name != "li":
                logging.warning(f"Ignoring non-<li> inside <{html.name}> element")
                continue
            inner = _join_pieces(html_to_md(child) for child in item.children).strip(" ")
            # Prepend two spaces to every line, to create a "nesting" effect
            # for sub-lists. This may break things if mixed with blockquotes but oh well.
            inner = "\n".join("  " + line for line in inner.split("\n"))
            items.append(prefix + inner)
        return "\n".join(items)

    # Get contents recursively, and separate any surrounding space from the actual content.
    inner = _join_pieces(html_to_md(child) for child in html.children)
    lead = " " if inner.startswith(" ") else ""
    trail = " " if inner.endswith(" ") else ""
    core = inner.strip(" ")

    # Bail if empty as this might cause weird things if rendered naively,
    # eg. <i></i> would be rendered ** which would actually make other text bold.
    # An inline element holding only whitespace still separates its neighbours.
    if not core:
        return "" if html.name in ("p", "blockquote") else lead

    # <p> should insert a newline at the end
    if html.name == "p":
        return f"{core}\n"

    # <a> should become a link
    href = html.get("href")
    if html.name == "a" and href:
        return f"{lead}[{core}]({href}){trail}"

    # <blockquote> creates a nested quote block which is its own paragraph.
    if html.name == "blockquote":
        return md_wrap("quote", core)

    if html.name in _CHAR_FORMAT:
        # Delimiters must hug the text, so surrounding space goes outside them
        char = _CHAR_FORMAT[html.name]
        return f"{lead}{char}{core}{char}{trail}"

    # For any other types, most notably <span> but also anything that we don't recognize,
    # just pass the inner text though unchanged.
    return inner
# Separator appended to the formatted status text in the listener's send calls.
# NOTE(review): presumably renders as a horizontal rule in zulip's markdown — confirm.
LINE = "\n---"
class Listener(mastodon.StreamListener):
def __init__(self, zulip_client, stream, post_topic, notification_topic):
self.zulip_client = zulip_client
self.stream = stream
@ -55,7 +180,7 @@ class Listener(Mastodon.StreamListener):
def on_update(self, status):
    """
    Log the given status, then send its formatted form followed by the LINE
    separator to the post topic.
    """
    logging.info(f"Got update: {status!r}")
    # Send exactly once, with the trailing separator
    self.send(self.post_topic, format_status(status) + LINE)
def on_delete(self, status_id):
logging.info(f"Got delete: {status_id}")
@ -63,13 +188,13 @@ class Listener(Mastodon.StreamListener):
def on_status_update(self, status):
    """
    Log the given updated status, then send a notice that it was updated along with
    its formatted form and the LINE separator to the post topic.
    """
    logging.info(f"Got status update: {status!r}")
    # Send exactly once, with the trailing separator
    self.send(self.post_topic, f"*The following status has been updated*\n{format_status(status)}" + LINE)
def on_notification(self, notification):
    """
    Log the given notification and, if it is a mention, send the formatted status
    that mentioned us followed by the LINE separator to the notification topic.

    Notifications of any other type are logged and otherwise ignored.
    """
    logging.info(f"Got {notification['type']} notification: {notification!r}")
    if notification["type"] != "mention":
        return
    # The status is carried inside the notification; there is no
    # separate "status" variable in scope here.
    self.send(self.notification_topic, format_status(notification["status"]) + LINE)
@cli
@ -95,7 +220,7 @@ def main(conf_file, stream="bot-spam", post_topic="Toots from Desert Bus", notif
mc = conf["mastodon"]
zulip_client = zulip.Client(zc["url"], zc["email"], zc["api_key"])
mastodon = Mastodon(api_base_url=mc["url"], access_token=mc["access_token"])
mastodon = mastodon.Mastodon(api_base_url=mc["url"], access_token=mc["access_token"])
listener = Listener(zulip_client, stream, post_topic, notification_topic)
logging.info("Starting")

Loading…
Cancel
Save