diff --git a/common/common/media.py b/common/common/media.py
index 7350742..0118158 100644
--- a/common/common/media.py
+++ b/common/common/media.py
@@ -11,7 +11,9 @@ from uuid import uuid4
import gevent
import prometheus_client as prom
+import requests
import urllib3.connection
+from gevent.pool import Pool
from ipaddress import ip_address
from . import atomic_write, ensure_directory, jitter, listdir
@@ -99,6 +101,9 @@ def download_media(
gevent.sleep(jitter(retry_interval))
try:
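+ # imgur links need special handling; if this was one of them, we're done.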
+ if download_imgur_url(output_dir, max_size, url):
+ return
+
resp = _request(urls[-1], max_size, content_types)
new_url = resp.get_redirect_location()
@@ -190,6 +195,29 @@ def _save_response(output_dir, urls, resp, max_size, chunk_size):
atomic_write(metadata_path, json.dumps(metadata, indent=4))
+def _save_content(output_dir, urls, ext, content):
+ """Alternate version of _save_response() for cases where content is explicitly generated
+ instead of coming from a response."""
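+ # For example, download_imgur_url() below stores an album's contents list with:
+ #   _save_content(output_dir, [url], "json", json.dumps(contents_urls))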
+ url_dir = get_url_dir(output_dir, urls[0])
+ if isinstance(content, str):
+ content = content.encode()
+ hash = sha256(content)
+ filename = f"{hash_to_path(hash)}.{ext}"
+ filepath = os.path.join(url_dir, filename)
+ if not os.path.exists(filepath):
+ atomic_write(filepath, content)
+ metadata_path = os.path.join(url_dir, f"{hash_to_path(hash)}.metadata.json")
+ if not os.path.exists(metadata_path):
+ metadata = {
+ "url": urls[0],
+ "filename": filename,
+ "redirects": urls[1:],
+ "fetched_by": socket.gethostname(),
+ "fetch_time": time.time(),
+ }
+ atomic_write(metadata_path, json.dumps(metadata, indent=4))
+
+
def _request(url, max_size, content_types):
"""Do the actual request and return a vetted response object, which is either the content
(status 200) or a redirect.
@@ -245,3 +273,110 @@ def _request(url, max_size, content_types):
raise TooLarge(f"Content length {length} is too large for url {url}")
return resp
+
+
+def download_imgur_url(output_dir, max_size, url):
+ """Links to imgur require special handling to resolve the actual image.
+ Handles URLs like the following:
+ i.stack.imgur.com/ID.png
+ imgur.com/ID
+ i.imgur.com/ID.EXT
+ These map to actual media and are stored in the usual way.
+ imgur.com/a/ID
+ imgur.com/gallery/ID
+ These map to collections of media.
+ Under the original URL we store a json file that lists imgur.com/ID.EXT urls
+ of the contents of the collection. Those urls are then downloaded and stored
+ in the usual way.
+ """
+ parsed = urllib.parse.urlparse(url)
+ if parsed.hostname not in ("imgur.com", "i.imgur.com", "i.stack.imgur.com"):
+ # not an imgur link that needs special handling
+ return False
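+ # A bare path is either "/ID" (a media page) or "/ID.EXT" (a direct media link)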
+ match = re.match(r"^/([^/.]+)(?:\.([a-z]+))?$", parsed.path)
+ if match:
+ id, ext = match.groups()
+ if ext is None:
+ # Try to get a video ("gif", which imgur serves as mp4) first; if that 400s, fall back to a png.
+ try:
+ download_imgur_image(output_dir, max_size, url, id, "mp4")
+ except requests.HTTPError:
+ download_imgur_image(output_dir, max_size, url, id, "png")
+ else:
+ download_imgur_image(output_dir, max_size, url, id, ext)
+ return True
+ elif parsed.path.startswith("/a/"):
+ contents = download_imgur_album(parsed.path.removeprefix("/a/"))
+ elif parsed.path.startswith("/gallery/"):
+ contents = download_imgur_gallery(parsed.path.removeprefix("/gallery/"))
+ else:
+ # no match, treat like non-imgur link
+ return False
+
+ # Common part for albums and galleries
+ pool = Pool(16)
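+ # Download each item in the collection concurrently, at most 16 at a time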
+ jobs = []
+ for id, ext in contents:
+ job = pool.spawn(download_imgur_image, output_dir, max_size, f"https://imgur.com/{id}", id, ext)
+ jobs.append(job)
+ gevent.wait(jobs)
+ failed = [g.exception for g in jobs if g.exception is not None]
+
+ # Save the album contents only after attempting the downloads (so the whole album is
+ # retried until we get this far), but raise any image download errors only after saving,
+ # so we still record everything that succeeded.
+ contents_urls = [f"https://imgur.com/{id}.{ext}" for id, ext in contents]
+ _save_content(output_dir, [url], "json", json.dumps(contents_urls))
+
+ if failed:
+ raise ExceptionGroup("Some imgur downloads failed", failed)
+
+ return True
+
+
+def imgur_request(url):
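+ """Make a GET request to imgur, returning the response or raising if it isn't retrievable."""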
+ resp = requests.get(url, allow_redirects=False, timeout=30)
+ if 300 <= resp.status_code < 400:
+ # imgur redirects you (instead of 404ing) if the resource is gone; treat this as non-retryable
+ raise Rejected(f"imgur returned redirect for {url!r}")
+ # other errors are retryable
+ resp.raise_for_status()
+ return resp
+
+
+def download_imgur_album(id):
+ """Fetch imgur album and return a list of (id, ext) contents"""
+ url = f"https://api.imgur.com/post/v1/albums/{id}?client_id=546c25a59c58ad7&include=media,adconfig,account"
+ data = imgur_request(url).json()
+ result = []
+ for item in data.get("media", []):
+ result.append((item["id"], item["ext"]))
+ return result
+
+
+def download_imgur_gallery(id):
+ """Fetch imgur gallery and return a list of (id, ext) contents"""
+ # The gallery JSON is contained in a <script> tag like:
+ #   <script>window.postDataJSON="..."</script>
+ # where ... is a json string.
+ html = imgur_request(f"https://imgur.com/gallery/{id}").content.decode()
+ regex = r'<script>window\.postDataJSON=("[^<]+")'