diff --git a/zulip_bots/zulip_bots/blogbot.py b/zulip_bots/zulip_bots/blogbot.py
index 3f3289f..0172fb2 100644
--- a/zulip_bots/zulip_bots/blogbot.py
+++ b/zulip_bots/zulip_bots/blogbot.py
@@ -7,18 +7,30 @@ import time
 from base64 import b64encode
 from datetime import datetime
 from hashlib import sha256
-from uuid import uuid4
 
 import argh
 import requests
 import bs4
 from bs4 import BeautifulSoup
 
+from common import atomic_write
+from common import media
+
 from .zulip import Client
 from .config import get_config
 
 logging.basicConfig(level='INFO')
 
+def try_save_image(media_dir, url):
+	"""Save the image at url via media.download_media and return {"path": ...},
+	or {"error": ...} if no media_dir was given or media.Rejected was raised."""
+	if media_dir is None:
+		return {"error": "no media dir given"}
+	try:
+		return {"path": media.download_media(url, media_dir)}
+	except media.Rejected as e:
+		return {"error": str(e)}
+
 def html_to_md(html):
 	"""Lossy attempt to convert html to markdown"""
 	if isinstance(html, bs4.Comment):
@@ -35,7 +47,8 @@ def html_to_md(html):
 		return "---"
 
 	if html.name == "img":
-		return html.get("src") + "\n"
+		src = html.get("src")
+		return "[{}]({})\n".format(html.get("alt") or src, src)
 
 	inner = "".join(html_to_md(child) for child in html.children)
 
@@ -101,6 +114,16 @@ def blog_to_md(id, html):
 		"```",
 	])
 
+def find_images(html):
+	"""Yield the src of every <img> in html, skipping tags with no src.
+	html may be a parsed bs4 tree or a raw HTML string (save_post passes a string)."""
+	if isinstance(html, str):
+		html = BeautifulSoup(html, "html.parser")
+	for img in html.find_all("img"):
+		src = img.get("src")
+		if src:
+			yield src
+
 def get_posts():
 	"""Get all blog posts on the front page as (id, html)"""
 	# Need to clear UA or we get blocked due to "python" in UA
@@ -123,36 +146,25 @@ def send_post(client, stream, topic, id, html):
 		content=content,
 	)
 
-def save_post(save_dir, id, html):
+def save_post(save_dir, media_dir, id, html):
+	"""Write the post and the result of saving each of its images to
+	{save_dir}/{id}-{hash}.json, unless that file already exists."""
 	hash = b64encode(sha256(html.encode()).digest(), b"-_").decode().rstrip("=")
 	filename = f"{id}-{hash}.json"
 	filepath = os.path.join(save_dir, filename)
 	if os.path.exists(filepath):
 		return
+	images = set(find_images(html))
 	content = {
 		"id": id,
 		"hash": hash,
 		"retrieved_at": datetime.utcnow().isoformat() + "Z",
 		"html": html,
+		"images": {image: try_save_image(media_dir, image) for image in images},
 	}
 	atomic_write(filepath, json.dumps(content) + "\n")
 
-# This is copied from common, which isn't currently installed in zulip_bots.
-# Fix that later.
-def atomic_write(filepath, content):
-	if isinstance(content, str):
-		content = content.encode("utf-8")
-	temp_path = "{}.{}.temp".format(filepath, uuid4())
-	dir_path = os.path.dirname(filepath)
-	os.makedirs(dir_path, exist_ok=True)
-	with open(temp_path, 'wb') as f:
-		f.write(content)
-	try:
-		os.rename(temp_path, filepath)
-	except FileExistsError:
-		os.remove(temp_path)
-
-def main(config_file, interval=60, test=False, stream='bot-spam', topic='Blog Posts', save_dir=None):
+def main(config_file, interval=60, test=False, stream='bot-spam', topic='Blog Posts', save_dir=None, media_dir=None):
 	"""Post to zulip each new blog post, checking every INTERVAL seconds.
 	Will not post any posts that already exist, unless --test is given
 	in which case it will print the most recent on startup."""
@@ -169,7 +181,7 @@ def main(config_file, interval=60, test=False, stream='bot-spam', topic='Blog Po
 		else:
 			if save_dir is not None:
 				for id, html in posts:
-					save_post(save_dir, id, str(html))
+					save_post(save_dir, media_dir, id, str(html))
 			if first:
 				seen = set(id for id, html in posts)
 				if test: