blog bot: Save images linked in blog posts

Fixes #531
master
Mike Lang 12 hours ago
parent ba0299f362
commit c0166eb3ce

@ -7,18 +7,28 @@ import time
from base64 import b64encode from base64 import b64encode
from datetime import datetime from datetime import datetime
from hashlib import sha256 from hashlib import sha256
from uuid import uuid4
import argh import argh
import requests import requests
import bs4 import bs4
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
from common import atomic_write
from common import media
from .zulip import Client from .zulip import Client
from .config import get_config from .config import get_config
logging.basicConfig(level='INFO') logging.basicConfig(level='INFO')
def try_save_image(media_dir, url):
    """Try to download the image at *url* into *media_dir*.

    Returns a dict describing the outcome, suitable for embedding in the
    saved post's JSON:

    - ``{"path": ...}`` holding whatever ``media.download_media`` returned,
      when the download succeeded.
    - ``{"error": ...}`` holding a message, when no media directory was
      configured or the download was refused with ``media.Rejected``.

    Any other exception raised by ``media.download_media`` is not caught
    here and propagates to the caller.
    """
    if media_dir is None:
        return {"error": "no media dir given"}
    try:
        return {"path": media.download_media(url, media_dir)}
    except media.Rejected as e:
        # A rejection is recorded alongside the post rather than aborting the save.
        return {"error": str(e)}
def html_to_md(html): def html_to_md(html):
"""Lossy attempt to convert html to markdown""" """Lossy attempt to convert html to markdown"""
if isinstance(html, bs4.Comment): if isinstance(html, bs4.Comment):
@ -35,7 +45,8 @@ def html_to_md(html):
return "---" return "---"
if html.name == "img": if html.name == "img":
return html.get("src") + "\n" src = html.get("src")
return "[{}]({})\n".format(html.get("alt") or src, src)
inner = "".join(html_to_md(child) for child in html.children) inner = "".join(html_to_md(child) for child in html.children)
@ -101,6 +112,10 @@ def blog_to_md(id, html):
"```", "```",
]) ])
def find_images(html):
    """Yield the ``src`` of each ``<img>`` element found under *html*.

    *html* is anything exposing ``find_all`` (presumably a BeautifulSoup
    tag -- confirm against the caller). Images whose ``src`` attribute is
    missing or empty are skipped, since there is nothing to download for
    them. Duplicates are not removed; values are yielded in document order.
    """
    for img in html.find_all("img"):
        src = img.get("src")
        # .get() returns None for an <img> without a src attribute; passing
        # that on would produce a None "URL" downstream.
        if src:
            yield src
def get_posts(): def get_posts():
"""Get all blog posts on the front page as (id, html)""" """Get all blog posts on the front page as (id, html)"""
# Need to clear UA or we get blocked due to "python" in UA # Need to clear UA or we get blocked due to "python" in UA
@ -123,36 +138,23 @@ def send_post(client, stream, topic, id, html):
content=content, content=content,
) )
def save_post(save_dir, id, html): def save_post(save_dir, media_dir, id, html):
hash = b64encode(sha256(html.encode()).digest(), b"-_").decode().rstrip("=") hash = b64encode(sha256(html.encode()).digest(), b"-_").decode().rstrip("=")
filename = f"{id}-{hash}.json" filename = f"{id}-{hash}.json"
filepath = os.path.join(save_dir, filename) filepath = os.path.join(save_dir, filename)
if os.path.exists(filepath): if os.path.exists(filepath):
return return
images = set(find_images(html))
content = { content = {
"id": id, "id": id,
"hash": hash, "hash": hash,
"retrieved_at": datetime.utcnow().isoformat() + "Z", "retrieved_at": datetime.utcnow().isoformat() + "Z",
"html": html, "html": html,
"images": {image: try_save_image(media_dir, image) for image in images},
} }
atomic_write(filepath, json.dumps(content) + "\n") atomic_write(filepath, json.dumps(content) + "\n")
# This is copied from common, which isn't currently installed in zulip_bots. def main(config_file, interval=60, test=False, stream='bot-spam', topic='Blog Posts', save_dir=None, media_dir=None):
# Fix that later.
def atomic_write(filepath, content):
    """Write *content* to *filepath* via a uniquely named temp file and a rename.

    *content* may be ``bytes`` or ``str``; a ``str`` is encoded as UTF-8.
    The parent directory of *filepath* is created if it does not exist.
    The data is first written to ``<filepath>.<uuid4>.temp`` and then
    renamed into place, so readers never observe a partially written file.

    If the rename fails with ``FileExistsError`` (``os.rename`` does this on
    Windows when the destination exists), the existing file is kept and the
    temp file is discarded.
    """
    if isinstance(content, str):
        content = content.encode("utf-8")
    temp_path = "{}.{}.temp".format(filepath, uuid4())
    dir_path = os.path.dirname(filepath)
    # A bare filename has an empty dirname, and os.makedirs("") raises
    # FileNotFoundError, so only create a directory when one is named.
    if dir_path:
        os.makedirs(dir_path, exist_ok=True)
    try:
        with open(temp_path, 'wb') as f:
            f.write(content)
    except BaseException:
        # Don't leave a partial temp file behind if the write failed.
        try:
            os.remove(temp_path)
        except FileNotFoundError:
            pass
        raise
    try:
        os.rename(temp_path, filepath)
    except FileExistsError:
        os.remove(temp_path)
def main(config_file, interval=60, test=False, stream='bot-spam', topic='Blog Posts', save_dir=None):
"""Post to zulip each new blog post, checking every INTERVAL seconds. """Post to zulip each new blog post, checking every INTERVAL seconds.
Will not post any posts that already exist, unless --test is given Will not post any posts that already exist, unless --test is given
in which case it will print the most recent on startup.""" in which case it will print the most recent on startup."""
@ -169,7 +171,7 @@ def main(config_file, interval=60, test=False, stream='bot-spam', topic='Blog Po
else: else:
if save_dir is not None: if save_dir is not None:
for id, html in posts: for id, html in posts:
save_post(save_dir, id, str(html)) save_post(save_dir, media_dir, id, str(html))
if first: if first:
seen = set(id for id, html in posts) seen = set(id for id, html in posts)
if test: if test:

Loading…
Cancel
Save