Compare commits

..

No commits in common. '7ce736b82e7924ac0ce14f725bcab96ef20b161d' and '06d58359124138e90a9d5f3660dc5035c1695c83' have entirely different histories.

@ -7,28 +7,18 @@ import time
from base64 import b64encode from base64 import b64encode
from datetime import datetime from datetime import datetime
from hashlib import sha256 from hashlib import sha256
from uuid import uuid4
import argh import argh
import requests import requests
import bs4 import bs4
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
from common import atomic_write
from common import media
from .zulip import Client from .zulip import Client
from .config import get_config from .config import get_config
logging.basicConfig(level='INFO') logging.basicConfig(level='INFO')
def try_save_image(media_dir, url):
    """Attempt to download url into media_dir, reporting the outcome as a dict.

    Returns {"path": <result of media.download_media>} on success, or
    {"error": <message>} if no media_dir was given or the download raised
    media.Rejected. Any other exception propagates to the caller.
    """
    if media_dir is not None:
        try:
            saved_path = media.download_media(url, media_dir)
        except media.Rejected as rejection:
            return {"error": str(rejection)}
        return {"path": saved_path}
    return {"error": "no media dir given"}
def html_to_md(html): def html_to_md(html):
"""Lossy attempt to convert html to markdown""" """Lossy attempt to convert html to markdown"""
if isinstance(html, bs4.Comment): if isinstance(html, bs4.Comment):
@ -45,8 +35,7 @@ def html_to_md(html):
return "---" return "---"
if html.name == "img": if html.name == "img":
src = html.get("src") return html.get("src") + "\n"
return "[{}]({})\n".format(html.get("alt") or src, src)
inner = "".join(html_to_md(child) for child in html.children) inner = "".join(html_to_md(child) for child in html.children)
@ -104,7 +93,7 @@ def blog_to_md(id, html):
except Exception as e: except Exception as e:
md_content = f"Parsing blog failed, please see logs: {e}" md_content = f"Parsing blog failed, please see logs: {e}"
return title, author, date, "\n".join([ return "\n".join([
f"Blog Post: [{title}](https://desertbus.org/?id={id})", f"Blog Post: [{title}](https://desertbus.org/?id={id})",
f"Posted by {author} at {date}", f"Posted by {author} at {date}",
"```quote", "```quote",
@ -112,10 +101,6 @@ def blog_to_md(id, html):
"```", "```",
]) ])
def find_images(html):
    """Lazily yield the "src" attribute of every <img> element found in html.

    html must provide find_all(); each element's get("src") result is yielded
    as-is, so an image without a src yields whatever get() returns for a
    missing key.
    """
    yield from (image.get("src") for image in html.find_all("img"))
def get_posts(): def get_posts():
"""Get all blog posts on the front page as (id, html)""" """Get all blog posts on the front page as (id, html)"""
# Need to clear UA or we get blocked due to "python" in UA # Need to clear UA or we get blocked due to "python" in UA
@ -130,7 +115,7 @@ def get_posts():
return posts return posts
def send_post(client, stream, topic, id, html): def send_post(client, stream, topic, id, html):
title, author, date, content = blog_to_md(id, html) content = blog_to_md(id, html)
client.request("POST", "messages", client.request("POST", "messages",
type="stream", type="stream",
to=stream, to=stream,
@ -138,27 +123,36 @@ def send_post(client, stream, topic, id, html):
content=content, content=content,
) )
def save_post(save_dir, media_dir, id, html): def save_post(save_dir, id, html):
hash = b64encode(sha256(html.encode()).digest(), b"-_").decode().rstrip("=") hash = b64encode(sha256(html.encode()).digest(), b"-_").decode().rstrip("=")
filename = f"{id}-{hash}.json" filename = f"{id}-{hash}.json"
filepath = os.path.join(save_dir, filename) filepath = os.path.join(save_dir, filename)
if os.path.exists(filepath): if os.path.exists(filepath):
return return
images = set(find_images(html))
title, author, date, _ = blog_to_md(id, html)
content = { content = {
"id": id, "id": id,
"hash": hash, "hash": hash,
"retrieved_at": datetime.utcnow().isoformat() + "Z", "retrieved_at": datetime.utcnow().isoformat() + "Z",
"html": html, "html": html,
"images": {image: try_save_image(media_dir, image) for image in images},
"title": title,
"author": author,
"date": date,
} }
atomic_write(filepath, json.dumps(content) + "\n") atomic_write(filepath, json.dumps(content) + "\n")
def main(config_file, interval=60, test=False, stream='bot-spam', topic='Blog Posts', save_dir=None, media_dir=None): # This is copied from common, which isn't currently installed in zulip_bots.
# Fix that later.
def atomic_write(filepath, content):
    """Write content to filepath via a uniquely-named temp file and a rename.

    content may be bytes or str; str is encoded as UTF-8 before writing.
    The parent directory of filepath is created if it does not exist.
    The data is first written to "<filepath>.<uuid4>.temp" and then renamed
    into place, so readers never observe a partially written file.

    If the rename raises FileExistsError (os.rename does this on Windows when
    the destination already exists), the existing file is kept and the temp
    file is discarded.

    Any error raised while writing the temp file propagates to the caller
    after the temp file has been removed.
    """
    if isinstance(content, str):
        content = content.encode("utf-8")
    temp_path = "{}.{}.temp".format(filepath, uuid4())
    dir_path = os.path.dirname(filepath)
    # A bare filename has an empty dirname, and os.makedirs("") raises
    # FileNotFoundError, so only create the directory when there is one.
    if dir_path:
        os.makedirs(dir_path, exist_ok=True)
    try:
        with open(temp_path, 'wb') as f:
            f.write(content)
    except BaseException:
        # Don't leave a stray temp file behind when the write fails.
        if os.path.exists(temp_path):
            os.remove(temp_path)
        raise
    try:
        os.rename(temp_path, filepath)
    except FileExistsError:
        # Someone else already wrote the target; keep theirs, drop ours.
        os.remove(temp_path)
def main(config_file, interval=60, test=False, stream='bot-spam', topic='Blog Posts', save_dir=None):
"""Post to zulip each new blog post, checking every INTERVAL seconds. """Post to zulip each new blog post, checking every INTERVAL seconds.
Will not post any posts that already exist, unless --test is given Will not post any posts that already exist, unless --test is given
in which case it will print the most recent on startup.""" in which case it will print the most recent on startup."""
@ -175,7 +169,7 @@ def main(config_file, interval=60, test=False, stream='bot-spam', topic='Blog Po
else: else:
if save_dir is not None: if save_dir is not None:
for id, html in posts: for id, html in posts:
save_post(save_dir, media_dir, id, str(html)) save_post(save_dir, id, str(html))
if first: if first:
seen = set(id for id, html in posts) seen = set(id for id, html in posts)
if test: if test:

@ -1,15 +1,12 @@
import json
import argh
import logging import logging
import os
import time import time
import argh
import mastodon import mastodon
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
from common import media
from . import zulip from . import zulip
from .config import get_config from .config import get_config

Loading…
Cancel
Save