download_media: Get data from potentially malicious URLs and store in the filesystem

This is suitable for taking arbitary URLs from chat, etc and trying to fetch them. It downloads them to a filepath that contains a hash of the URL and content.
10 months ago · 352c9e9081
parent 07055e3605
commit 352c9e9081
1 changed files with 235 additions and 0 deletions
--- a/common/common/media.py
+++ b/common/common/media.py
@ -0,0 +1,235 @@
 import json
 import logging
 import os
 import re
 import socket
 import time
 import urllib.parse
 from base64 import b64encode
 from hashlib import sha256
 from uuid import uuid4
 import gevent
 import prometheus_client as prom
 import urllib3.connection
 from ipaddress import ip_address
 from . import atomic_write, ensure_directory, jitter
 from .stats import timed
 media_bytes_downloaded = prom.Counter(
 	"media_bytes_downloaded",
 	"Number of bytes of media files downloaded. Includes data downloaded then later rejected.",
 )
 media_bytes_saved = prom.Histogram(
 	"media_bytes_saved",
 	"Size in bytes of downloaded media that was successfully saved",
 	["content_type"],
 	buckets = [2**n for n in range(11, 27, 2)],
 )
 media_already_exists = prom.Counter(
 	"media_already_exists",
 	"Count of times we downloaded a file but it already existed",
 )
 class Rejected(Exception):
 	"""Indicates a non-retryable failure due to the url response violating our constraints"""
 class TooLarge(Rejected):
 	"""Response was too large"""
 class ForbiddenDestination(Rejected):
 	"""Hostname resolved to non-global IP"""
 class BadScheme(Rejected):
 	"""Bad url scheme"""
 class WrongContent(Rejected):
 	"""Response was not a video or image"""
@timed()
 def download_media(
 	url,
 	output_dir,
 	max_size=128*2**20, # 128MiB
 	timeout=60,
 	content_types=("image", "video"),
 	max_redirects=5,
 	retries=3,
 	retry_interval=1,
 	chunk_size=64*1024, # 64KiB
 ):
 	"""Make a GET request to a potentially malicious URL and download the content to file.
 	We check the following:
 	- That the host is a public IP
 	- That the response does not exceed given max size (default 128MB)
 	- That the content type is in the given list
 	  (the list may contain exact types like "image/png" or categories like "image")
 	- That the whole thing doesn't take more than a timeout
 	Redirects *will* be followed but the follow-up requests must obey the same rules
 	(and do not reset the timeout).
 	We save the file to OUTPUT_DIR/URL_HASH/FILE_HASH.EXT where EXT is gussed from content-type.
 	We save additional metadata including the url and content type to OUTPUT_DIR/URL_HASH/FILE_HASH.metadata.json
 	Raises on any rule violation or non-200 response.
 	"""
 	# Stores a list of urls redirected to, latest is current.
 	urls = [url]
 	with gevent.Timeout(timeout):
 		for redirect_number in range(max_redirects):
 			errors = []
 			for retry in range(retries):
 				if retry > 0:
 					gevent.sleep(jitter(retry_interval))
 				try:
 					resp = _request(urls[-1], max_size, content_types)
 					new_url = resp.get_redirect_location()
 					if new_url:
 						urls.append(new_url)
 						break # break from retry loop, continuing in the redirect loop
 					_save_response(output_dir, urls, resp, max_size, chunk_size)
 					return
 				except Rejected:
 					raise
 				except Exception as e:
 					errors.append(e)
 					# fall through to next retry loop
 			else:
 				# This block will be reached if range(retries) runs out but not via "break"
 				raise ExceptionGroup(f"All retries failed for url {urls[-1]}", errors)
 		raise Exception("Too many redirects")
 def hash_to_path(hash):
 	return b64encode(hash.digest(), b"-_").decode().rstrip("=")
 def get_url_dir(output_dir, url):
 	return os.path.join(output_dir, hash_to_path(sha256(url.encode())))
 def _save_response(output_dir, urls, resp, max_size, chunk_size):
 	url_dir = get_url_dir(output_dir, urls[0])
 	temp_path = os.path.join(url_dir, f".{uuid4()}.temp")
 	ensure_directory(temp_path)
 	content_type = resp.headers["content-type"]
 	# Content type may have form "TYPE ; PARAMS", strip params if present.
 	# Also normalize for whitespace and case.
 	content_type = content_type.split(";")[0].strip().lower()
 	# We attempt to convert content type to an extension by taking the second part
 	# and stripping anything past the first character not in [a-z0-9-].
 	# So eg. "image/png" -> "png", "image/svg+xml" -> "svg", "image/../../../etc/password" -> ""
 	ext = content_type.split("/")[-1]
 	ext = re.match(r"^[a-z0-9.-]*", ext).group(0)
 	try:
 		length = 0
 		hash = sha256()
 		with open(temp_path, "wb") as f:
 			while True:
 				chunk = resp.read(chunk_size)
 				if not chunk:
 					break
 				hash.update(chunk)
 				length += len(chunk)
 				media_bytes_downloaded.inc(len(chunk))
 				if length > max_size:
 					raise TooLarge(f"Read more than {length} bytes from url {urls[-1]}")
 				f.write(chunk)
 		filename = f"{hash_to_path(hash)}.{ext}"
 		filepath = os.path.join(url_dir, filename)
 		# This is vulnerable to a race where two things create the file at once,
 		# but that's fine since it will always have the same content. This is just an optimization
 		# to avoid replacing the file over and over (and for observability)
 		if os.path.exists(filepath):
 			logging.info(f"Discarding downloaded file for {urls[0]} as it already exists")
 			media_already_exists.inc()
 		else:
 			os.rename(temp_path, filepath)
 			logging.info(f"Downloaded file for {urls[0]}")
 			media_bytes_saved.labels(content_type).observe(length)
 	finally:
 		if os.path.exists(temp_path):
 			os.remove(temp_path)
 	metadata_path = os.path.join(url_dir, f"{hash_to_path(hash)}.metadata.json")
 	# Again, this is racy but we don't care about double-writes.
 	# Note it's entirely possible for the image to already exist but still write the metadata,
 	# this can happen if a previous attempt crashed midway.
 	if not os.path.exists(metadata_path):
 		metadata = {
 			"url": urls[0],
 			"filename": filename,
 			"redirects": urls[1:],
 			"content_type": resp.headers["content-type"],
 			"fetched_by": socket.gethostname(),
 			"fetch_time": time.time(),
 		}
 		atomic_write(metadata_path, json.dumps(metadata, indent=4))
 def _request(url, max_size, content_types):
 	"""Do the actual request and return a vetted response object, which is either the content
 	(status 200) or a redirect.
 	Raises Rejected if content fails checks, anything else should be considered retryable."""
 	parsed = urllib.parse.urlparse(url)
 	hostname = parsed.hostname
 	port = parsed.port
 	ip = socket.gethostbyname(hostname)
 	if not ip_address(ip).is_global:
 		raise ForbiddenDestination(f"Non-global IP {ip} for url {url}")
 	# In order to provide the host/ip to connect to seperately from the URL,
 	# we need to drop to a fairly low-level interface.
 	if parsed.scheme == "http":
 		conn = urllib3.connection.HTTPConnection(ip, port or 80)
 	elif parsed.scheme == "https":
 		conn = urllib3.connection.HTTPSConnection(
 			ip, port or 443,
 			assert_hostname = hostname,
 			server_hostname = hostname,
 		)
 	else:
 		raise BadScheme(f"Bad scheme {parsed.scheme!r} for url {url}")
 	conn.request("GET", url, preload_content=False)
 	resp = conn.getresponse()
 	# Redirects do not require further checks
 	if resp.get_redirect_location():
 		return resp
 	if resp.status != 200:
 		raise Exception(f"Url returned {resp.status} response: {url}")
 	content_type = resp.getheader("content-type")
 	if content_type is None:
 		raise Exception(f"No content-type given for url {url}")
 	if not any(content_type.startswith(target) for target in content_types):
 		raise WrongContent(f"Disallowed content-type {content_type} for url {url}")
 	# If length is known but too large, reject early
 	length = resp.getheader("content-length")
 	if length is not None:
 		try:
 			length = int(length)
 		except ValueError:
 			raise Exception(f"Invalid content length {length!r} for url {url}")
 		if length > max_size:
 			raise TooLarge(f"Content length {length} is too large for url {url}")
 	return resp