Compare commits

..

4 Commits

Author SHA1 Message Date
Mike Lang 7ce736b82e blogbot: save parsed info 3 days ago
Mike Lang 4567775b60 move project documentation to its own directory
not including code stuff like INSTALL.md
3 days ago
Mike Lang c0166eb3ce blog bot: Save images linked in blog posts
Fixes #531
3 days ago
Mike Lang ba0299f362 tootbot: fix missing imports from adding media downloads 3 days ago

@ -7,18 +7,28 @@ import time
from base64 import b64encode
from datetime import datetime
from hashlib import sha256
from uuid import uuid4
import argh
import requests
import bs4
from bs4 import BeautifulSoup
from common import atomic_write
from common import media
from .zulip import Client
from .config import get_config
logging.basicConfig(level='INFO')
def try_save_image(media_dir, url):
    """Attempt to download the image at url into media_dir.

    Returns a dict describing the outcome:
      {"path": ...} holding whatever media.download_media() returned, or
      {"error": ...} with a message when no media dir was given, no url was
      given, or the download raised media.Rejected.
    Any other exception from media.download_media() propagates to the caller.
    """
    if media_dir is None:
        return {"error": "no media dir given"}
    if not url:
        # find_images() yields img.get("src"), which is None for an <img>
        # without a src attribute. Report that instead of trying to download.
        return {"error": "no url given"}
    try:
        return {"path": media.download_media(url, media_dir)}
    except media.Rejected as e:
        return {"error": str(e)}
def html_to_md(html):
"""Lossy attempt to convert html to markdown"""
if isinstance(html, bs4.Comment):
@ -35,7 +45,8 @@ def html_to_md(html):
return "---"
if html.name == "img":
return html.get("src") + "\n"
src = html.get("src")
return "[{}]({})\n".format(html.get("alt") or src, src)
inner = "".join(html_to_md(child) for child in html.children)
@ -93,7 +104,7 @@ def blog_to_md(id, html):
except Exception as e:
md_content = f"Parsing blog failed, please see logs: {e}"
return "\n".join([
return title, author, date, "\n".join([
f"Blog Post: [{title}](https://desertbus.org/?id={id})",
f"Posted by {author} at {date}",
"```quote",
@ -101,6 +112,10 @@ def blog_to_md(id, html):
"```",
])
def find_images(html):
    """Yield the src of every <img> element in html that has one.

    html may be an already-parsed BeautifulSoup object or a raw HTML string.
    save_post() passes a string, which has no find_all(), so strings are
    parsed first. Images without a (non-empty) src attribute are skipped.
    """
    if isinstance(html, str):
        html = BeautifulSoup(html, "html.parser")
    for img in html.find_all("img"):
        src = img.get("src")
        if src:
            yield src
def get_posts():
"""Get all blog posts on the front page as (id, html)"""
# Need to clear UA or we get blocked due to "python" in UA
@ -115,7 +130,7 @@ def get_posts():
return posts
def send_post(client, stream, topic, id, html):
content = blog_to_md(id, html)
title, author, date, content = blog_to_md(id, html)
client.request("POST", "messages",
type="stream",
to=stream,
@ -123,36 +138,27 @@ def send_post(client, stream, topic, id, html):
content=content,
)
def save_post(save_dir, id, html):
def save_post(save_dir, media_dir, id, html):
    """Save a blog post and its parsed metadata as a JSON file in save_dir.

    The file is named "{id}-{hash}.json", where hash is the URL-safe base64
    SHA-256 of the html with padding stripped. If that file already exists
    nothing is done. Otherwise the record holds the raw html, the time of
    retrieval, the title/author/date reported by blog_to_md(), and for each
    image found by find_images() the result of try_save_image().
    """
    raw_digest = sha256(html.encode()).digest()
    digest = b64encode(raw_digest, b"-_").decode().rstrip("=")
    filepath = os.path.join(save_dir, f"{id}-{digest}.json")
    if os.path.exists(filepath):
        return

    image_urls = set(find_images(html))
    title, author, date, _ = blog_to_md(id, html)

    # Take the timestamp before downloading any images.
    retrieved_at = datetime.utcnow().isoformat() + "Z"
    saved_images = {}
    for image_url in image_urls:
        saved_images[image_url] = try_save_image(media_dir, image_url)

    record = {
        "id": id,
        "hash": digest,
        "retrieved_at": retrieved_at,
        "html": html,
        "images": saved_images,
        "title": title,
        "author": author,
        "date": date,
    }
    atomic_write(filepath, json.dumps(record) + "\n")
# This is copied from common, which isn't currently installed in zulip_bots.
# Fix that later.
def atomic_write(filepath, content):
    """Write content to filepath via a temporary file and a rename.

    content may be bytes or str; str is encoded as UTF-8. The parent
    directory of filepath is created if needed. The data is first written to
    a uniquely named temp file next to filepath and then renamed into place.
    If the rename raises FileExistsError, the temp file is discarded and the
    existing file is left as-is. If writing or renaming fails for any other
    reason, the temp file is removed and the error is re-raised.
    """
    if isinstance(content, str):
        content = content.encode("utf-8")
    temp_path = "{}.{}.temp".format(filepath, uuid4())
    dir_path = os.path.dirname(filepath)
    # dirname is "" for a bare filename, and os.makedirs("") raises.
    if dir_path:
        os.makedirs(dir_path, exist_ok=True)
    try:
        with open(temp_path, 'wb') as f:
            f.write(content)
        os.rename(temp_path, filepath)
    except FileExistsError:
        os.remove(temp_path)
    except BaseException:
        # Don't leave a stray temp file behind on a failed write or rename.
        try:
            os.remove(temp_path)
        except OSError:
            pass
        raise
def main(config_file, interval=60, test=False, stream='bot-spam', topic='Blog Posts', save_dir=None):
def main(config_file, interval=60, test=False, stream='bot-spam', topic='Blog Posts', save_dir=None, media_dir=None):
"""Post to zulip each new blog post, checking every INTERVAL seconds.
Will not post any posts that already exist, unless --test is given
in which case it will print the most recent on startup."""
@ -169,7 +175,7 @@ def main(config_file, interval=60, test=False, stream='bot-spam', topic='Blog Po
else:
if save_dir is not None:
for id, html in posts:
save_post(save_dir, id, str(html))
save_post(save_dir, media_dir, id, str(html))
if first:
seen = set(id for id, html in posts)
if test:

@ -1,12 +1,15 @@
import argh
import json
import logging
import os
import time
import argh
import mastodon
from bs4 import BeautifulSoup
from common import media
from . import zulip
from .config import get_config

Loading…
Cancel
Save