download_media: Special casing for imgur

pull/428/merge
Mike Lang 4 weeks ago committed by Mike Lang
parent 4dbe1dce5e
commit 83f18eda84

@@ -11,7 +11,9 @@ from uuid import uuid4
import gevent
import prometheus_client as prom
import requests
import urllib3.connection
from gevent.pool import Pool
from ipaddress import ip_address
from . import atomic_write, ensure_directory, jitter, listdir
@@ -99,6 +101,9 @@ def download_media(
            gevent.sleep(jitter(retry_interval))
        try:
            if download_imgur_url(output_dir, max_size, url):
                return
            resp = _request(urls[-1], max_size, content_types)
            new_url = resp.get_redirect_location()
@@ -190,6 +195,29 @@ def _save_response(output_dir, urls, resp, max_size, chunk_size):
    atomic_write(metadata_path, json.dumps(metadata, indent=4))

def _save_content(output_dir, urls, ext, content):
    """Alternate version of _save_response() for cases where the content is explicitly generated
    instead of coming from a response. (See the layout sketch after this function.)"""
    url_dir = get_url_dir(output_dir, urls[0])
    if isinstance(content, str):
        content = content.encode()
    hash = sha256(content)
    filename = f"{hash_to_path(hash)}.{ext}"
    filepath = os.path.join(url_dir, filename)
    if not os.path.exists(filepath):
        atomic_write(filepath, content)
    metadata_path = os.path.join(url_dir, f"{hash_to_path(hash)}.metadata.json")
    if not os.path.exists(metadata_path):
        metadata = {
            "url": urls[0],
            "filename": filename,
            "redirects": urls[1:],
            "fetched_by": socket.gethostname(),
            "fetch_time": time.time(),
        }
        atomic_write(metadata_path, json.dumps(metadata, indent=4))
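
# Illustrative layout of what _save_content() produces (exact paths depend on
# get_url_dir() and hash_to_path(), defined elsewhere in this module):
#   {output_dir}/{url_dir}/{hash_path}.json            <- the generated content
#   {output_dir}/{url_dir}/{hash_path}.metadata.json   <- original url, redirects, fetch time/host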

def _request(url, max_size, content_types):
    """Do the actual request and return a vetted response object, which is either the content
    (status 200) or a redirect.
@@ -245,3 +273,110 @@ def _request(url, max_size, content_types):
        raise TooLarge(f"Content length {length} is too large for url {url}")
    return resp

def download_imgur_url(output_dir, max_size, url):
    """Links to imgur require special handling to resolve the actual image.
    Handles URLs like the following:
        i.stack.imgur.com/ID.png
        imgur.com/ID
        i.imgur.com/ID.EXT
    These map to actual media and are stored in the usual way.
        imgur.com/a/ID
        imgur.com/gallery/ID
    These map to collections of media.
    Under the original URL we store a json file that lists imgur.com/ID urls
    of the contents of the collection. Those urls are then downloaded and stored
    in the usual way. (See the usage sketch after this function.)
    """
    parsed = urllib.parse.urlparse(url)
    if parsed.hostname not in ("imgur.com", "i.imgur.com", "i.stack.imgur.com"):
        # not an imgur link that needs special handling
        return False
    match = re.match(r"^/([^/.]+)(?:\.([a-z]+))?$", parsed.path)
    if match:
        id, ext = match.groups()
        if ext is None:
            # Try to get a video ("gif") first; if that 400s, fall back to a png.
            try:
                download_imgur_image(output_dir, max_size, url, id, "mp4")
            except requests.HTTPError:
                download_imgur_image(output_dir, max_size, url, id, "png")
        else:
            download_imgur_image(output_dir, max_size, url, id, ext)
        return True
    elif parsed.path.startswith("/a/"):
        contents = download_imgur_album(url, parsed.path.removeprefix("/a/"))
    elif parsed.path.startswith("/gallery/"):
        contents = download_imgur_gallery(url, parsed.path.removeprefix("/gallery/"))
    else:
        # no match, treat like non-imgur link
        return False
    # Common part for albums and galleries
    pool = Pool(16)
    jobs = []
    for id, ext in contents:
        job = pool.spawn(download_imgur_image, output_dir, max_size, f"https://imgur.com/{id}", id, ext)
        jobs.append(job)
    gevent.wait(jobs)
    failed = [g.exception for g in jobs if g.exception is not None]
    # Save the album after trying to download things (so it will be retried until we get this far)
    # but only raise for image download errors afterwards, so we still record everything that
    # succeeded.
    contents_urls = [f"https://imgur.com/{id}.{ext}" for id, ext in contents]
    _save_content(output_dir, [url], "json", json.dumps(contents_urls))
    if failed:
        raise ExceptionGroup("Some imgur downloads failed", failed)
    return True
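
# A minimal usage sketch for download_imgur_url() (out_dir and max_size are
# hypothetical values; the behavior follows from the branches above):
#   download_imgur_url(out_dir, max_size, "https://i.imgur.com/abc123.png")  # direct image, saved as usual
#   download_imgur_url(out_dir, max_size, "https://imgur.com/abc123")        # bare id: tries mp4, falls back to png
#   download_imgur_url(out_dir, max_size, "https://imgur.com/a/abc123")      # album: saves a json listing, then each image
#   download_imgur_url(out_dir, max_size, "https://example.com/x.png")       # False: caller falls through to _request()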

def imgur_request(url):
    resp = requests.get(url, allow_redirects=False, timeout=30)
    if 300 <= resp.status_code < 400:
        # imgur redirects you instead of 404ing when the resource is gone,
        # so treat a redirect as non-retryable.
        raise Rejected(f"imgur returned redirect for {url!r}")
    # other errors are retryable
    resp.raise_for_status()
    return resp
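
# Outcome sketch for imgur_request(), as implied by the code above (not from imgur docs):
#   2xx -> response returned to the caller
#   3xx -> Rejected: imgur redirects deleted media to a placeholder rather than 404ing
#   4xx/5xx -> requests.HTTPError from raise_for_status(); download_media()'s retry loop may try again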

def download_imgur_album(url, id):
    """Fetch imgur album and return a list of (id, ext) contents"""
    api_url = f"https://api.imgur.com/post/v1/albums/{id}?client_id=546c25a59c58ad7&include=media,adconfig,account"
    data = imgur_request(api_url).json()
    result = []
    for item in data.get("media", []):
        result.append((item["id"], item["ext"]))
    return result
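
# The album parser assumes an API response shaped like this (illustrative, trimmed):
#   {"media": [{"id": "abc123", "ext": "png"}, {"id": "def456", "ext": "mp4"}]}
# A response with no "media" key simply yields an empty list.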

def download_imgur_gallery(url, id):
    """Fetch imgur gallery and return a list of (id, ext) contents"""
    # The gallery JSON is contained in a <script> tag like this:
    #   <script>window.postDataJSON=...</script>
    # where ... is a json string.
    html = imgur_request(f"https://imgur.com/gallery/{id}").text
    regex = r'<script>window\.postDataJSON=("(?:[^"\\]|\\.)*")'
    match = re.search(regex, html)
    # If we can't find a match, assume we got served a 404 page instead.
    if not match:
        raise Rejected(f"Could not load gallery for {url!r}")
    # The captured group is itself a JSON string literal, so decode it twice:
    # once to strip the quotes and unescape the contents, then again to parse the payload.
    data = json.loads(json.loads(match.group(1)))
    result = []
    for item in data.get("media", []):
        result.append((item["id"], item["ext"]))
    return result
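
# Worked example of the double decode above (values illustrative):
#   captured = '"{\\"media\\": [{\\"id\\": \\"abc123\\", \\"ext\\": \\"png\\"}]}"'
#   json.loads(captured)             -> '{"media": [{"id": "abc123", "ext": "png"}]}'  (still a str)
#   json.loads(json.loads(captured)) -> {'media': [{'id': 'abc123', 'ext': 'png'}]}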

def download_imgur_image(output_dir, max_size, url, id, ext):
    """Fetch imgur image and save it as per download_media()"""
    image_url = f"https://i.imgur.com/{id}.{ext}"
    resp = imgur_request(image_url)
    _save_response(output_dir, [url, image_url], resp, max_size, 64*1024)
