@@ -11,7 +11,9 @@ from uuid import uuid4
import gevent
import prometheus_client as prom
import requests
import urllib3.connection
from gevent.pool import Pool
from ipaddress import ip_address

from . import atomic_write, ensure_directory, jitter, listdir

@@ -99,6 +101,9 @@ def download_media(
        gevent.sleep(jitter(retry_interval))

    try:
        if download_imgur_url(output_dir, max_size, url):
            return

        resp = _request(urls[-1], max_size, content_types)

        new_url = resp.get_redirect_location()

@@ -190,6 +195,29 @@ def _save_response(output_dir, urls, resp, max_size, chunk_size):
        atomic_write(metadata_path, json.dumps(metadata, indent=4))


def _save_content(output_dir, urls, ext, content):
    """Alternate version of _save_response() for cases where content is explicitly generated
    instead of coming from a response."""
    url_dir = get_url_dir(output_dir, urls[0])
    if isinstance(content, str):
        content = content.encode()
    hash = sha256(content)
    filename = f"{hash_to_path(hash)}.{ext}"
    filepath = os.path.join(url_dir, filename)
    if not os.path.exists(filepath):
        atomic_write(filepath, content)
    metadata_path = os.path.join(url_dir, f"{hash_to_path(hash)}.metadata.json")
    if not os.path.exists(metadata_path):
        metadata = {
            "url": urls[0],
            "filename": filename,
            "redirects": urls[1:],
            "fetched_by": socket.gethostname(),
            "fetch_time": time.time(),
        }
        atomic_write(metadata_path, json.dumps(metadata, indent=4))
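
# Illustrative note (not part of the patch): for a given url and content,
# _save_content() writes two files under the URL's directory, both named from
# the sha256 of the content via hash_to_path():
#   <url_dir>/<hash_path>.<ext>            the content itself
#   <url_dir>/<hash_path>.metadata.json    original url, redirects, fetch host/time
# Existing files are left alone, so re-downloads are idempotent.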


def _request(url, max_size, content_types):
    """Do the actual request and return a vetted response object, which is either the content
    (status 200) or a redirect.

@@ -245,3 +273,110 @@ def _request(url, max_size, content_types):
        raise TooLarge(f"Content length {length} is too large for url {url}")

    return resp


def download_imgur_url(output_dir, max_size, url):
    """Links to imgur require special handling to resolve the actual image.
    Handles URLs like the following:
        i.stack.imgur.com/ID.png
        imgur.com/ID
        i.imgur.com/ID.EXT
    These map to actual media and are stored in the usual way.
        imgur.com/a/ID
        imgur.com/gallery/ID
    These map to collections of media.
    Under the original URL we store a json file that lists imgur.com/ID urls
    of the contents of the collection. Those urls are then downloaded and stored
    in the usual way.
    """
    parsed = urllib.parse.urlparse(url)
    if parsed.hostname not in ("imgur.com", "i.stack.imgur.com"):
        # not an imgur link that needs special handling
        return False
    match = re.match(r"^/([^/.]+)(?:\.([a-z]+))?$", parsed.path)
    if match:
        id, ext = match.groups()
        if ext is None:
            # Try to get a video ("gif") first; if that 400s, get a png.
            try:
                download_imgur_image(output_dir, max_size, url, id, "mp4")
            except requests.HTTPError:
                download_imgur_image(output_dir, max_size, url, id, "png")
        else:
            download_imgur_image(output_dir, max_size, url, id, ext)
        return True
    elif parsed.path.startswith("/a/"):
        contents = download_imgur_album(url, parsed.path.removeprefix("/a/"))
    elif parsed.path.startswith("/gallery/"):
        contents = download_imgur_gallery(url, parsed.path.removeprefix("/gallery/"))
    else:
        # no match, treat like non-imgur link
        return False

    # Common part for albums and galleries
    pool = Pool(16)
    jobs = []
    for id, ext in contents:
        job = pool.spawn(download_imgur_image, output_dir, max_size, f"https://imgur.com/{id}", id, ext)
        jobs.append(job)
    gevent.wait(jobs)
    failed = [g.exception for g in jobs if g.exception is not None]

    # Save the album after trying to download things (so it will be retried until we get this far)
    # but only raise for image download errors after, so we at least know about everything that
    # succeeded.
    contents_urls = [f"https://imgur.com/{id}.{ext}" for id, ext in contents]
    _save_content(output_dir, [url], "json", json.dumps(contents_urls))

    if failed:
        raise ExceptionGroup("Some imgur downloads failed", failed)

    return True
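
# Illustrative examples (not part of the patch) of how the path regex above
# classifies imgur URLs:
#   re.match(r"^/([^/.]+)(?:\.([a-z]+))?$", "/abc123").groups()     -> ("abc123", None)
#   re.match(r"^/([^/.]+)(?:\.([a-z]+))?$", "/abc123.png").groups() -> ("abc123", "png")
#   re.match(r"^/([^/.]+)(?:\.([a-z]+))?$", "/a/abc123")            -> None (album path)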


def imgur_request(url):
    resp = requests.get(url, allow_redirects=False, timeout=30)
    if 300 <= resp.status_code < 400:
        # imgur redirects you if the resource is gone instead of 404ing, treat this as non-retryable
        raise Rejected(f"imgur returned redirect for {url!r}")
    # other errors are retryable
    resp.raise_for_status()
    return resp
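
# Note (illustrative, not part of the patch): this gives callers two failure modes.
# Rejected is assumed to be this module's non-retryable error (a redirect means the
# resource is gone), while a requests.HTTPError from raise_for_status() propagates
# up to the retry loop in download_media() and is retried with jittered sleeps.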


def download_imgur_album(url, id):
    """Fetch imgur album and return a list of (id, ext) contents"""
    api_url = f"https://api.imgur.com/post/v1/albums/{id}?client_id=546c25a59c58ad7&include=media,adconfig,account"
    data = imgur_request(api_url).json()
    result = []
    for item in data.get("media", []):
        result.append((item["id"], item["ext"]))
    return result
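
# Illustrative note (not part of the patch): the album endpoint is assumed to
# return JSON shaped roughly like
#   {"media": [{"id": "abc123", "ext": "jpg", ...}, ...]}
# which is all the loop above relies on; any extra fields are ignored.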


def download_imgur_gallery(url, id):
    """Fetch imgur gallery and return a list of (id, ext) contents"""
    # The gallery JSON is contained in a <script> tag like this:
    #   <script>window.postDataJSON=...</script>
    # where ... is a json string.
    html = imgur_request(f"https://imgur.com/gallery/{id}").text
    regex = r'<script>window\.postDataJSON=("(?:[^"\\]|\\.)*")'
    match = re.search(regex, html)
    # If we can't find a match, assume we got served a 404 page instead.
    if not match:
        raise Rejected(f"Could not load gallery for {url!r}")
    # The captured group is a JSON string literal whose contents are themselves JSON:
    # one json.loads() undoes the quoting and escaping, a second parses the payload.
    data = json.loads(json.loads(match.group(1)))
    result = []
    for item in data.get("media", []):
        result.append((item["id"], item["ext"]))
    return result
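
# Illustrative note (not part of the patch): the double json.loads() above works
# because the regex captures a JSON string literal. For example, if the page contains
#   window.postDataJSON="{\"media\": []}"
# then match.group(1) is the Python string '"{\\"media\\": []}"', the first
# json.loads() yields '{"media": []}', and the second yields {"media": []}.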


def download_imgur_image(output_dir, max_size, url, id, ext):
    """Fetch imgur image and save it as per download_media()"""
    image_url = f"https://i.imgur.com/{id}.{ext}"
    resp = imgur_request(image_url)
    _save_response(output_dir, [url, image_url], resp, max_size, 64*1024)
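
# Minimal usage sketch (illustrative, not part of the patch):
#   if download_imgur_url(output_dir, max_size, "https://imgur.com/gallery/abc123"):
#       # handled here: the media and a listing json were stored under output_dir
#       ...
#   else:
#       # not an imgur URL needing special handling; fall back to the normal
#       # download path in download_media()
#       ...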