From 83f18eda84af575df5bdd3b2226f1abc9070c57a Mon Sep 17 00:00:00 2001
From: Mike Lang
Date: Mon, 28 Oct 2024 10:57:35 +0000
Subject: [PATCH] download_media: Special casing for imgur

---
 common/common/media.py | 135 +++++++++++++++++++++++++++++++++++++++++
 1 file changed, 135 insertions(+)

diff --git a/common/common/media.py b/common/common/media.py
index 7350742..0118158 100644
--- a/common/common/media.py
+++ b/common/common/media.py
@@ -11,7 +11,9 @@ from uuid import uuid4
 
 import gevent
 import prometheus_client as prom
+import requests
 import urllib3.connection
+from gevent.pool import Pool
 from ipaddress import ip_address
 
 from . import atomic_write, ensure_directory, jitter, listdir
@@ -99,6 +101,9 @@ def download_media(
 			gevent.sleep(jitter(retry_interval))
 
 		try:
+			if download_imgur_url(output_dir, max_size, url):
+				return
+
 			resp = _request(urls[-1], max_size, content_types)
 
 			new_url = resp.get_redirect_location()
@@ -190,6 +195,29 @@
 	atomic_write(metadata_path, json.dumps(metadata, indent=4))
 
 
+def _save_content(output_dir, urls, ext, content):
+	"""Alternate version of _save_response() for cases where content is explicitly generated
+	instead of coming from a response."""
+	url_dir = get_url_dir(output_dir, urls[0])
+	if isinstance(content, str):
+		content = content.encode()
+	hash = sha256(content)
+	filename = f"{hash_to_path(hash)}.{ext}"
+	filepath = os.path.join(url_dir, filename)
+	if not os.path.exists(filepath):
+		atomic_write(filepath, content)
+	metadata_path = os.path.join(url_dir, f"{hash_to_path(hash)}.metadata.json")
+	if not os.path.exists(metadata_path):
+		metadata = {
+			"url": urls[0],
+			"filename": filename,
+			"redirects": urls[1:],
+			"fetched_by": socket.gethostname(),
+			"fetch_time": time.time(),
+		}
+		atomic_write(metadata_path, json.dumps(metadata, indent=4))
+
+
 def _request(url, max_size, content_types):
 	"""Do the actual request and return a vetted response object,
 	which is either the content (status 200) or a redirect.
@@ -245,3 +273,110 @@
 		raise TooLarge(f"Content length {length} is too large for url {url}")
 
 	return resp
+
+
+def download_imgur_url(output_dir, max_size, url):
+	"""Links to imgur require special handling to resolve the actual image.
+	Handles URLs like the following:
+		i.stack.imgur.com/ID.png
+		imgur.com/ID
+		i.imgur.com/ID.EXT
+	These map to actual media and are stored in the usual way.
+		imgur.com/a/ID
+		imgur.com/gallery/ID
+	These map to collections of media.
+	Under the original URL we store a json file that lists imgur.com/ID urls
+	of the contents of the collection. Those urls are then downloaded and stored
+	in the usual way.
+	"""
+	parsed = urllib.parse.urlparse(url)
+	if parsed.hostname not in ("imgur.com", "i.imgur.com", "i.stack.imgur.com"):
+		# not an imgur link that needs special handling
+		return False
+	match = re.match(r"^/([^/.]+)(?:\.([a-z]+))?$", parsed.path)
+	if match:
+		id, ext = match.groups()
+		if ext is None:
+			# Try to get a video ("gif") first; if that 400s, then get a png.
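+			# (imgur serves animated "gifs" as mp4 video, so an extensionless
+			# link may resolve to either a video or a still image.)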
+			try:
+				download_imgur_image(output_dir, max_size, url, id, "mp4")
+			except requests.HTTPError:
+				download_imgur_image(output_dir, max_size, url, id, "png")
+		else:
+			download_imgur_image(output_dir, max_size, url, id, ext)
+		return True
+	elif parsed.path.startswith("/a/"):
+		contents = download_imgur_album(parsed.path.removeprefix("/a/"))
+	elif parsed.path.startswith("/gallery/"):
+		contents = download_imgur_gallery(parsed.path.removeprefix("/gallery/"))
+	else:
+		# no match, treat like non-imgur link
+		return False
+
+	# Common part for albums and galleries
+	pool = Pool(16)
+	jobs = []
+	for id, ext in contents:
+		job = pool.spawn(download_imgur_image, output_dir, max_size, f"https://imgur.com/{id}", id, ext)
+		jobs.append(job)
+	gevent.wait(jobs)
+	failed = [g.exception for g in jobs if g.exception is not None]
+
+	# Save the album after trying to download things (so it will be retried until we get this far)
+	# but only raise for image download errors after, so we at least know about everything that
+	# succeeded.
+	contents_urls = [f"https://imgur.com/{id}.{ext}" for id, ext in contents]
+	_save_content(output_dir, [url], "json", json.dumps(contents_urls))
+
+	if failed:
+		raise ExceptionGroup("Failed to download imgur album contents", failed)
+
+	return True
+
+
+def imgur_request(url):
+	resp = requests.get(url, allow_redirects=False, timeout=30)
+	if 300 <= resp.status_code < 400:
+		# imgur redirects you if the resource is gone instead of 404ing, treat this as non-retryable
+		raise Rejected(f"imgur returned redirect for {url!r}")
+	# other errors are retryable
+	resp.raise_for_status()
+	return resp
+
+
+def download_imgur_album(id):
+	"""Fetch imgur album and return a list of (id, ext) contents"""
+	url = f"https://api.imgur.com/post/v1/albums/{id}?client_id=546c25a59c58ad7&include=media,adconfig,account"
+	data = imgur_request(url).json()
+	result = []
+	for item in data.get("media", []):
+		result.append((item["id"], item["ext"]))
+	return result
+
+
+def download_imgur_gallery(id):
+	"""Fetch imgur gallery and return a list of (id, ext) contents"""
+	# The gallery JSON is contained in a <script>...</script> tag,
+	# where ... is a json string.
+	html = imgur_request(f"https://imgur.com/gallery/{id}").content
+	regex = r'