tootbot: html converter

2 years ago · e224f8b85c
parent e12191686f
commit e224f8b85c
1 changed files with 133 additions and 8 deletions
--- a/zulip/tootbot.py
+++ b/zulip/tootbot.py
@ -1,7 +1,8 @@

 import argh
 import yaml
-from mastodon import Mastodon
+import mastodon
+from bs4 import BeautifulSoup

 import zulip

@ -36,13 +37,137 @@ def format_status(status):
 	if reblog is not None:
 		boostee = format_account(reblog["account"])
 		boost_url = reblog["url"]
-		content = format_content(reblog["content"])
+		content = format_content(reblog)
 		return f"{sender} reblogged {boostee}'s [post]({boost_url})\n{content}"

-	return f"{"
+	content = format_content(status)

+	if reply:
+		return f"{sender} [replied]({url}) to message {reply}:\n{content}"

-class Listener(Mastodon.StreamListener):
+	return f"{sender} [posted]({url}):\n{content}"
+
+
+def md_wrap(kind, inner):
+	"""
+		Wrap inner in a fenced markdown block of the given kind, e.g. "quote" or "spoiler TEXT HERE".
+	"""
+	return "\n".join([
+		f"```{kind}",
+		inner,
+		"```",
+	])
+
+
+def format_content(status):
+	# Main status text should be rendered into text and quoted
+	html = BeautifulSoup(status["content"], "html.parser")
+	text = html_to_md(html)
+	parts = [md_wrap("quote", text)]
+	# Append attachments, with the link name being the alt text.
+	# For images at least, zulip will auto-preview them.
+	for attachment in status["media_attachments"]:
+		type = attachment["type"]
+		description = attachment["description"] or "(no description provided)"
+		url = attachment["url"]
+		parts.append(f"Attached {type}: [{description}]({url})")
+	output = "\n".join(parts)
+	# If content warnings present, wrap in a spoiler block.
+	warning = status["spoiler_text"]
+	if warning:
+		output = md_wrap(f"spoiler {warning}", output)
+	return output
+
+
+def html_to_md(html):
+	"""
+	Take a status HTML as a BeautifulSoup object
+	and make a best-effort attempt to render it in markdown:
+	* Convert each <p> section to text ending in a newline
+	* Convert <a> tags to links
+	* Convert formatting tags to the markdown equivalent
+	* Ignore other tags and just pass through their inner content
+
+	Note that zulip has no way of escaping markdown syntax, so if a message contains
+	characters that happen to be valid markdown, there's not much we can do about it.
+	The only thing that could cause problems is a malicious input that breaks out of
+	our surrounding quote block. And you know what? Fine, they can have that. We can always
+	just read the link to see the real deal.
+	"""
+	if html.name is None:
+		# Raw string, return it without surrounding whitespace
+		return html.strip()
+
+	if html.name == "br":
+		# <br> should never have any contents, and just become a newline
+		return "\n"
+
+	# Lists need to be handled specially as they should only contain <li> elements
+	# and the <li> elements should be rendered differently depending on the outer element.
+	if html.name in ("ul", "ol"):
+		prefix = {"ul": "*", "ol": "1."}[html.name]
+		items = []
+		for item in html.children:
+			if item.name != "li":
+				logging.warning(f"Ignoring non-<li> inside <{html.name}> element")
+				continue
+			inner = "".join(html_to_md(child) for child in item.children)
+			# Prepend two spaces to every line, to create a "nesting" effect
+			# for sub-lists. This may break things if mixed with blockquotes but oh well.
+			inner = "\n".join("  " + line for line in inner.split("\n"))
+			items.append(prefix + inner)
+		return "\n".join(items)
+
+	# Get contents recursively. Bail out if empty, as rendering an empty element naively can misfire,
+	# e.g. <i></i> would be rendered as ** which would actually make other text bold.
+	inner = "".join(html_to_md(child) for child in html.children)
+	if not inner:
+		return ""
+
+	# <p> should insert a newline at the end
+	if html.name == "p":
+		return f"{inner}\n"
+
+	# <a> should become a link
+	href = html.get("href")
+	if html.name == "a" and href:
+		return f"[{inner}]({href})"
+
+	# <blockquote> creates a nested quote block which is its own paragraph.
+	if html.name == "blockquote":
+		return md_wrap("quote", inner)
+
+	# The following tag types are simple character-based format wrappers.
+	# Note we don't handle <u> due to lack of underline formatting in markdown.
+	# We treat all headings as bold.
+	CHAR_FORMAT = {
+		"b": "**",
+		"strong": "**",
+		"h1": "**",
+		"h2": "**",
+		"h3": "**",
+		"h4": "**",
+		"h5": "**",
+		"h6": "**",
+		"i": "*",
+		"em": "*",
+		"del": "~~",
+		"pre": "`",
+		"code": "`",
+	}
+	if html.name in CHAR_FORMAT:
+		char = CHAR_FORMAT[html.name]
+		return f"{char}{inner}{char}"
+
+	# For any other types, most notably <span> but also anything that we don't recognize,
+	# just pass the inner text through unchanged.
+	return inner
+
+
+LINE = "\n---"
+
+
+class Listener(mastodon.StreamListener):
 	def __init__(self, zulip_client, stream, post_topic, notification_topic):
 		self.zulip_client = zulip_client
 		self.stream = stream
@ -55,7 +180,7 @@ class Listener(Mastodon.StreamListener):

 	def on_update(self, status):
 		logging.info(f"Got update: {status!r}")
-		self.send(self.post_topic, format_status(status))
+		self.send(self.post_topic, format_status(status) + LINE)

 	def on_delete(self, status_id):
 		logging.info(f"Got delete: {status_id}")
@ -63,13 +188,13 @@ class Listener(Mastodon.StreamListener):

 	def on_status_update(self, status):
 		logging.info(f"Got status update: {status!r}")
-		self.send(self.post_topic, f"*The following status has been updated*\n{format_status(status)}")
+		self.send(self.post_topic, f"*The following status has been updated*\n{format_status(status)}" + LINE)

 	def on_notification(self, notification):
 		logging.info(f"Got {notification['type']} notification: {notification!r}")
 		if notification["type"] != "mention":
 			return
-		self.send(self.notification_topic, format_status(status))
+		self.send(self.notification_topic, format_status(status) + LINE)


@cli
@ -95,7 +220,7 @@ def main(conf_file, stream="bot-spam", post_topic="Toots from Desert Bus", notif
 	mc = conf["mastodon"]

 	zulip_client = zulip.Client(zc["url"], zc["email"], zc["api_key"])
-	mastodon = Mastodon(api_base_url=mc["url"], access_token=mc["access_token"])
+	mastodon = mastodon.Mastodon(api_base_url=mc["url"], access_token=mc["access_token"])
 	listener = Listener(zulip_client, stream, post_topic, notification_topic)

 	logging.info("Starting")