blogbot: Save blog contents by hash when seen

For every unique hash of a blog post's HTML, blogbot saves a copy.
Blogs are saved as JSON files in {segments_dir}/blogs
mike/chat-no-end-time
Mike Lang 3 weeks ago
parent 3b737f8155
commit fbbe37ca9e

@ -757,7 +757,9 @@
[if $.enabled.blogbot then "blogbot"]: [if $.enabled.blogbot then "blogbot"]:
bot_service("blogbot", $.blogbot + { bot_service("blogbot", $.blogbot + {
zulip_url: $.zulip_url, zulip_url: $.zulip_url,
}), }, ["--save-dir", "/mnt/blogs"]) + {
volumes: ["%s:/mnt" % $.segments_path],
},
}, },
} }

@ -1,8 +1,13 @@
import json import json
import logging import logging
import time import os
import re import re
import time
from base64 import b64encode
from datetime import datetime
from hashlib import sha256
from uuid import uuid4
import argh import argh
import requests import requests
@ -118,7 +123,36 @@ def send_post(client, stream, topic, id, html):
content=content, content=content,
) )
def save_post(save_dir, id, html):
    """Save one blog post as a JSON file in save_dir, unless already saved.

    The file is named "{id}-{hash}.json", where hash is the URL-safe base64
    SHA-256 digest of the post's html with the "=" padding stripped. A post
    whose html changes therefore gets a new file per distinct version, while
    an unchanged post is written only once.

    save_dir: directory to write into.
    id: the post's identifier, used as the filename prefix and stored in
        the JSON.
    html: the post's html as a str (it is stored verbatim in the JSON).
    """
    # hashlib only accepts bytes, so encode text before hashing. The html
    # stored in the JSON below is left untouched.
    data = html.encode("utf-8") if isinstance(html, str) else html
    hash = b64encode(sha256(data).digest(), b"-_").decode().rstrip("=")
    filename = f"{id}-{hash}.json"
    filepath = os.path.join(save_dir, filename)
    # This exact version of the post has already been saved.
    if os.path.exists(filepath):
        return
    content = {
        "id": id,
        "hash": hash,
        "retrieved_at": datetime.utcnow().isoformat() + "Z",
        "html": html,
    }
    atomic_write(filepath, json.dumps(content) + "\n")
# This is copied from common, which isn't currently installed in zulip_bots.
# Fix that later.
def atomic_write(filepath, content):
    """Write content to filepath so that readers never see a partial file.

    The content is first written to a uniquely named temporary file next to
    the destination, then renamed into place.

    filepath: destination path. Missing parent directories are created.
    content: bytes to write, or a str which is encoded as UTF-8.

    If the rename raises FileExistsError the temporary file is removed and
    the existing destination is left as it is. Any other failure is re-raised
    after the temporary file has been cleaned up.
    """
    if isinstance(content, str):
        content = content.encode("utf-8")
    temp_path = "{}.{}.temp".format(filepath, uuid4())
    dir_path = os.path.dirname(filepath)
    # A bare filename has an empty dirname, and os.makedirs("") raises
    # FileNotFoundError, so only create directories when there is one.
    if dir_path:
        os.makedirs(dir_path, exist_ok=True)
    try:
        with open(temp_path, "wb") as f:
            f.write(content)
        os.rename(temp_path, filepath)
    except FileExistsError:
        # The destination already exists and rename refused to replace it.
        os.remove(temp_path)
    except BaseException:
        # Don't leave a stray temp file behind on a failed write or rename.
        try:
            os.remove(temp_path)
        except OSError:
            pass
        raise
def main(config_file, interval=60, test=False, stream='bot-spam', topic='Blog Posts', save_dir=None):
"""Post to zulip each new blog post, checking every INTERVAL seconds. """Post to zulip each new blog post, checking every INTERVAL seconds.
Will not post any posts that already exist, unless --test is given Will not post any posts that already exist, unless --test is given
in which case it will print the most recent on startup.""" in which case it will print the most recent on startup."""
@ -133,6 +167,9 @@ def main(config_file, interval=60, test=False, stream='bot-spam', topic='Blog Po
except Exception: except Exception:
logging.exception("Failed to get posts") logging.exception("Failed to get posts")
else: else:
if save_dir is not None:
for id, html in posts:
save_post(id, html)
if first: if first:
seen = set(id for id, html in posts) seen = set(id for id, html in posts)
if test: if test:

Loading…
Cancel
Save