import json
import logging
import os
import re
import time
from base64 import b64encode
from datetime import datetime
from hashlib import sha256
from uuid import uuid4

import argh
import requests
import bs4
from bs4 import BeautifulSoup

from .zulip import Client
from .config import get_config

logging.basicConfig(level='INFO')

def html_to_md(html):
	"""Lossy attempt to convert html to markdown"""
	if isinstance(html, bs4.Comment):
		return ""

	if html.name is None:
		# Raw string, return as-is
		return html

	if html.name == "br":
		return "\n"

	if html.name == "hr":
		return "---"

	if html.name == "img":
		return html.get("src") + "\n"

	# Convert children first, then wrap the result according to the tag type
	inner = "".join(html_to_md(child) for child in html.children)

	if html.name == "a":
		return "[{}]({})".format(inner, html.get("href"))

	if html.name == "p":
		return inner + "\n"

	if html.name == "li":
		return "\n* " + inner

	# Tags that map to a simple markdown delimiter wrapped around their content
	CHAR_FORMAT = {
		"b": "**",
		"strong": "**",
		"h1": "**",
		"h2": "**",
		"h3": "**",
		"h4": "**",
		"h5": "**",
		"h6": "**",
		"i": "*",
		"em": "*",
		"del": "~~",
		"pre": "`",
		"code": "`",
	}
	if html.name in CHAR_FORMAT:
		char = CHAR_FORMAT[html.name]
		return f"{char}{inner}{char}"

	# Unknown or purely structural tag: pass the converted children through
	return inner
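
# A quick illustration (not from the original file) of what html_to_md produces:
#   html_to_md(BeautifulSoup("<p>Hi <b>there</b></p>", "html.parser").p)
# returns "Hi **there**\n".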

def blog_to_md(id, html):
	title = "UNKNOWN"
	author = "UNKNOWN"
	date = "UNKNOWN"
	try:
		a = html.a
		authoring, content = html.find_all("div", recursive=False)

		title = a.string
		author_candidates = authoring.find_all(string=lambda v: v.startswith("Posted by"))
		if len(author_candidates) == 1:
			author = author_candidates[0]
		# The post date is rendered client-side; the timestamp lives in the
		# astro-island element's props JSON.
		date_element = authoring.find("astro-island")
		if date_element is not None:
			try:
				props = json.loads(date_element["props"])
				timestamp = props["time"][1]
				# <time:N> is Zulip's syntax for a timestamp shown in the reader's timezone
				date = f"<time:{timestamp}>"
			except Exception:
				pass # error determining date, ignore
		md_content = html_to_md(content)
	except Exception as e:
		logging.exception("Failed to parse blog post")
		md_content = f"Parsing blog failed, please see logs: {e}"

	return "\n".join([
		f"Blog Post: [{title}](https://desertbus.org/?id={id})",
		f"Posted by {author} at {date}",
		"```quote",
		md_content,
		"```",
	])

def get_posts():
	"""Get all blog posts on the front page as (id, html)"""
	# Need to clear UA or we get blocked due to "python" in UA
	resp = requests.get("https://desertbus.org/2024/", headers={"User-Agent": ""})
	resp.raise_for_status()
	# Requests doesn't correctly guess this is utf-8
	html = BeautifulSoup(resp.content.decode(), "html.parser")
	posts = []
	for a in html.find_all("a", href=re.compile(r"^\?id=")):
		id = a["href"].removeprefix("?id=")
		posts.append((id, a.parent))
	return posts
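
# The returned list looks like [("4021", <div>...</div>), ...] (illustrative id),
# in page order with the newest post first; main() relies on this when it treats
# posts[0] as the most recent post in --test mode.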

def send_post(client, stream, topic, id, html):
	content = blog_to_md(id, html)
	client.request("POST", "messages",
		type="stream",
		to=stream,
		topic=topic,
		content=content,
	)

def save_post(save_dir, id, html):
	# URL-safe base64 (unpadded) of the content hash, so a post is re-saved
	# whenever its HTML changes.
	hash = b64encode(sha256(html.encode()).digest(), b"-_").decode().rstrip("=")
	filename = f"{id}-{hash}.json"
	filepath = os.path.join(save_dir, filename)
	if os.path.exists(filepath):
		return
	content = {
		"id": id,
		"hash": hash,
		"retrieved_at": datetime.utcnow().isoformat() + "Z",
		"html": html,
	}
	atomic_write(filepath, json.dumps(content) + "\n")

# This is copied from common, which isn't currently installed in zulip_bots.
# Fix that later.
def atomic_write(filepath, content):
	if isinstance(content, str):
		content = content.encode("utf-8")
	temp_path = "{}.{}.temp".format(filepath, uuid4())
	dir_path = os.path.dirname(filepath)
	os.makedirs(dir_path, exist_ok=True)
	with open(temp_path, 'wb') as f:
		f.write(content)
	try:
		os.rename(temp_path, filepath)
	except FileExistsError:
		# os.rename() only raises this on Windows; on POSIX an existing file
		# is silently replaced. Either way the write stays atomic.
		os.remove(temp_path)

def main(config_file, interval=60, test=False, stream='bot-spam', topic='Blog Posts', save_dir=None):
	"""Post to zulip each new blog post, checking every INTERVAL seconds.
	Will not post any posts that already exist, unless --test is given,
	in which case it will post the most recent one on startup."""
	config = get_config(config_file)
	client = Client(config["zulip_url"], config["zulip_email"], config["zulip_api_key"])
	seen = set()
	first = True
	while True:
		start = time.time()
		try:
			posts = get_posts()
		except Exception:
			logging.exception("Failed to get posts")
		else:
			if save_dir is not None:
				for id, html in posts:
					save_post(save_dir, id, str(html))
			if first:
				# Treat everything present at startup as already posted
				seen = set(id for id, html in posts)
				if test:
					id, html = posts[0]
					send_post(client, stream, topic, id, html)
				first = False
			else:
				# Reversed so new posts are sent oldest-first
				for id, html in posts[::-1]:
					if id not in seen:
						send_post(client, stream, topic, id, html)
						seen.add(id)
		remaining = start + interval - time.time()
		if remaining > 0:
			time.sleep(remaining)


if __name__ == '__main__':
	argh.dispatch_command(main)
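
# Example invocation (the module path here is a guess; adjust to however this
# package is actually laid out):
#   python -m zulip_bots.blogbot config.json --interval 300 --save-dir ./posts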