wubloader/common/common/media.py


import json
import logging
import os
import re
import socket
import time
import urllib.parse
from base64 import b64encode
from hashlib import sha256
from uuid import uuid4

import gevent
import prometheus_client as prom
import requests
import urllib3.connection
from gevent.pool import Pool
from ipaddress import ip_address

from . import atomic_write, ensure_directory, jitter, listdir
from .stats import timed


# Lots of things will tell you to go away if you don't look like a browser
# (eg. imgur)
USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/130.0.0.0 Safari/537.36"


media_bytes_downloaded = prom.Counter(
	"media_bytes_downloaded",
	"Number of bytes of media files downloaded. Includes data downloaded then later rejected.",
)

media_bytes_saved = prom.Histogram(
	"media_bytes_saved",
	"Size in bytes of downloaded media that was successfully saved",
	["content_type"],
	buckets = [2**n for n in range(11, 27, 2)],
)

media_already_exists = prom.Counter(
	"media_already_exists",
	"Count of times we downloaded a file but it already existed",
)


class Rejected(Exception):
	"""Indicates a non-retryable failure due to the url response violating our constraints"""

class TooLarge(Rejected):
	"""Response was too large"""

class ForbiddenDestination(Rejected):
	"""Hostname resolved to non-global IP"""

class BadScheme(Rejected):
	"""Bad url scheme"""

class WrongContent(Rejected):
	"""Response was not a video or image"""

class FailedResponse(Rejected):
	"""Got a 4xx response, probably a bad link"""


def check_for_media(output_dir, url):
	"""Returns True if we have at least one version of content for the given url already."""
	url_dir = get_url_dir(output_dir, url)
	return any(filename.endswith(".metadata.json") for filename in listdir(url_dir))


@timed()
def download_media(
	url,
	output_dir,
	max_size=128*2**20, # 128MiB
	timeout=60,
	content_types=("image", "video", "application/pdf"),
	max_redirects=5,
	retries=3,
	retry_interval=1,
	chunk_size=64*1024, # 64KiB
):
"""Make a GET request to a potentially malicious URL and download the content to file.
We check the following:
- That the host is a public IP
- That the response does not exceed given max size (default 128MB)
- That the content type is in the given list
(the list may contain exact types like "image/png" or categories like "image")
- That the whole thing doesn't take more than a timeout
Redirects *will* be followed but the follow-up requests must obey the same rules
(and do not reset the timeout).
We save the file to OUTPUT_DIR/URL_HASH/FILE_HASH.EXT where EXT is gussed from content-type.
We save additional metadata including the url and content type to OUTPUT_DIR/URL_HASH/FILE_HASH.metadata.json
Raises on any rule violation or non-200 response.
"""
	# Stores a list of urls redirected to, latest is current.
	urls = [url]

	with gevent.Timeout(timeout):
		for redirect_number in range(max_redirects):
			errors = []
			for retry in range(retries):
				if retry > 0:
					gevent.sleep(jitter(retry_interval))
				try:
					if download_imgur_url(output_dir, max_size, url):
						return
					resp = _request(urls[-1], max_size, content_types)
					new_url = resp.get_redirect_location()
					if new_url:
						urls.append(new_url)
						break # break from retry loop, continuing in the redirect loop
					_save_response(output_dir, urls, resp, max_size, chunk_size)
					return
				except Rejected:
					raise
				except Exception as e:
					errors.append(e)
					# fall through to next retry loop
			else:
				# This block will be reached if range(retries) runs out but not via "break"
				raise Exception(f"All retries failed for url {urls[-1]}: {errors}")
		raise Exception("Too many redirects")


def hash_to_path(hash):
	return b64encode(hash.digest(), b"-_").decode().rstrip("=")

def get_url_dir(output_dir, url):
	return os.path.join(output_dir, hash_to_path(sha256(url.encode())))


def _save_response(output_dir, urls, resp, max_size, chunk_size):
	url_dir = get_url_dir(output_dir, urls[0])
	temp_path = os.path.join(url_dir, f".{uuid4()}.temp")
	ensure_directory(temp_path)

	content_type = resp.headers["content-type"]
	# Content type may have form "TYPE ; PARAMS", strip params if present.
	# Also normalize for whitespace and case.
	content_type = content_type.split(";")[0].strip().lower()
	# We attempt to convert content type to an extension by taking the last part
	# and stripping anything past the first character not in [a-z0-9.-].
	# So eg. "image/png" -> "png", "image/svg+xml" -> "svg", "image/../../../etc/password" -> "password"
	# (never anything containing "/" or other unsafe characters).
	ext = content_type.split("/")[-1]
	ext = re.match(r"^[a-z0-9.-]*", ext).group(0)
	try:
		length = 0
		hash = sha256()
		with open(temp_path, "wb") as f:
			while True:
				chunk = resp.read(chunk_size)
				if not chunk:
					break
				hash.update(chunk)
				length += len(chunk)
				media_bytes_downloaded.inc(len(chunk))
				if length > max_size:
					raise TooLarge(f"Read more than {length} bytes from url {urls[-1]}")
				f.write(chunk)

		filename = f"{hash_to_path(hash)}.{ext}"
		filepath = os.path.join(url_dir, filename)
		# This is vulnerable to a race where two things create the file at once,
		# but that's fine since it will always have the same content. This is just an optimization
		# to avoid replacing the file over and over (and for observability)
		if os.path.exists(filepath):
			logging.info(f"Discarding downloaded file for {urls[0]} as it already exists")
			media_already_exists.inc()
		else:
			os.rename(temp_path, filepath)
			logging.info(f"Downloaded file for {urls[0]}")
			media_bytes_saved.labels(content_type).observe(length)
	finally:
		if os.path.exists(temp_path):
			os.remove(temp_path)

	metadata_path = os.path.join(url_dir, f"{hash_to_path(hash)}.metadata.json")
	# Again, this is racy but we don't care about double-writes.
	# Note it's entirely possible for the image to already exist but still write the metadata,
	# this can happen if a previous attempt crashed midway.
	if not os.path.exists(metadata_path):
		metadata = {
			"url": urls[0],
			"filename": filename,
			"redirects": urls[1:],
			"content_type": resp.headers["content-type"],
			"fetched_by": socket.gethostname(),
			"fetch_time": time.time(),
		}
		atomic_write(metadata_path, json.dumps(metadata, indent=4))
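

# A saved metadata file looks roughly like this (values are illustrative, not from a real download):
# {
#     "url": "https://example.com/cat.png",
#     "filename": "<FILE_HASH>.png",
#     "redirects": [],
#     "content_type": "image/png",
#     "fetched_by": "some-hostname",
#     "fetch_time": 1700000000.0
# }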


def _save_content(output_dir, urls, ext, content):
	"""Alternate version of _save_response() for cases where content is explicitly generated
	instead of coming from a response."""
	url_dir = get_url_dir(output_dir, urls[0])
	if isinstance(content, str):
		content = content.encode()
	hash = sha256(content)
	filename = f"{hash_to_path(hash)}.{ext}"
	filepath = os.path.join(url_dir, filename)
	if not os.path.exists(filepath):
		atomic_write(filepath, content)

	metadata_path = os.path.join(url_dir, f"{hash_to_path(hash)}.metadata.json")
	if not os.path.exists(metadata_path):
		metadata = {
			"url": urls[0],
			"filename": filename,
			"redirects": urls[1:],
			"fetched_by": socket.gethostname(),
			"fetch_time": time.time(),
		}
		atomic_write(metadata_path, json.dumps(metadata, indent=4))


def _request(url, max_size, content_types):
	"""Do the actual request and return a vetted response object, which is either the content
	(status 200) or a redirect.

	Raises Rejected if content fails checks, anything else should be considered retryable."""
	parsed = urllib.parse.urlparse(url)
	hostname = parsed.hostname
	port = parsed.port

	ip = socket.gethostbyname(hostname)
	if not ip_address(ip).is_global:
		raise ForbiddenDestination(f"Non-global IP {ip} for url {url}")

	# In order to provide the host/ip to connect to separately from the URL,
	# we need to drop to a fairly low-level interface.
	if parsed.scheme == "http":
		conn = urllib3.connection.HTTPConnection(ip, port or 80)
	elif parsed.scheme == "https":
		conn = urllib3.connection.HTTPSConnection(
			ip, port or 443,
			assert_hostname = hostname,
			server_hostname = hostname,
		)
	else:
		raise BadScheme(f"Bad scheme {parsed.scheme!r} for url {url}")

	headers = {
		"User-Agent": USER_AGENT,
	}
	conn.request("GET", url, headers=headers, preload_content=False)
	resp = conn.getresponse()

	# Redirects do not require further checks
	if resp.get_redirect_location():
		return resp

	# 4xx errors are non-retryable, anything else is.
	# However 420 and 429 are "rate limit" errors, which should be retried.
	if 400 <= resp.status < 500 and resp.status not in (420, 429):
		raise FailedResponse(f"Url returned {resp.status} response: {url}")
	elif not (200 <= resp.status < 300):
		raise Exception(f"Url returned {resp.status} response: {url}")

	content_type = resp.getheader("content-type")
	if content_type is None:
		raise Exception(f"No content-type given for url {url}")
	if not any(content_type.startswith(target) for target in content_types):
		raise WrongContent(f"Disallowed content-type {content_type} for url {url}")

	# If length is known but too large, reject early
	length = resp.getheader("content-length")
	if length is not None:
		try:
			length = int(length)
		except ValueError:
			raise Exception(f"Invalid content length {length!r} for url {url}")
		if length > max_size:
			raise TooLarge(f"Content length {length} is too large for url {url}")

	return resp
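

# For example (behaviour as implemented above, urls hypothetical): a link whose hostname resolves
# to 192.168.0.1 raises ForbiddenDestination before any request is made; a 301 response is returned
# as-is so download_media() can follow it; a 404 raises FailedResponse (non-retryable); a 503 raises
# a plain Exception and will be retried by the caller.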


def download_imgur_url(output_dir, max_size, url):
	"""Links to imgur require special handling to resolve the actual image.

	Handles URLs like the following:
		i.stack.imgur.com/ID.png
		imgur.com/ID
	These map to actual media and are stored in the usual way.
		imgur.com/a/ID
		imgur.com/gallery/ID
	These map to collections of media.
	Under the original URL we store a json file that lists imgur.com/ID urls
	of the contents of the collection. Those urls are then downloaded and stored
	in the usual way.

	Notably this function doesn't need to handle URLs like:
		i.imgur.com/ID.EXT
	as this is already a direct image link, so we can just use the normal handling.
	"""
	parsed = urllib.parse.urlparse(url)
	if parsed.hostname not in ("imgur.com", "i.stack.imgur.com"):
		# not an imgur link that needs special handling
		return False

	match = re.match(r"^/([^/.]+)(?:\.([a-z]+))?$", parsed.path)
	if match:
		id, ext = match.groups()
		if ext is None:
			# Try to get a video ("gif") first, if that 400s then get a png.
			try:
				download_imgur_image(output_dir, max_size, url, id, "mp4")
			except requests.HTTPError:
				download_imgur_image(output_dir, max_size, url, id, "png")
		else:
			download_imgur_image(output_dir, max_size, url, id, ext)
		return True
	elif parsed.path.startswith("/a/"):
		contents = download_imgur_album(url, parsed.path.removeprefix("/a/"))
	elif parsed.path.startswith("/gallery/"):
		contents = download_imgur_gallery(url, parsed.path.removeprefix("/gallery/"))
	else:
		# no match, treat like non-imgur link
		return False

	# Common part for albums and galleries
	pool = Pool(16)
	jobs = []
	for id, ext in contents:
		job = pool.spawn(download_imgur_image, output_dir, max_size, f"https://imgur.com/{id}", id, ext)
		jobs.append(job)
	gevent.wait(jobs)
	failed = [g.exception for g in jobs if g.exception is not None]

	# Save the album after trying to download things (so it will be retried until we get this far)
	# but only raise for image download errors after, so we at least know about everything that
	# succeeded.
	contents_urls = [f"https://imgur.com/{id}" for id, ext in contents]
	_save_content(output_dir, [url], "json", json.dumps(contents_urls))

	if failed:
		raise Exception(str(failed))

	return True
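

# For an album or gallery, the file saved under the original url's hash is a json list of the
# individual image urls, for example (ids are hypothetical):
#   ["https://imgur.com/abc1234", "https://imgur.com/def5678"]
# Each of those urls is then stored in the usual OUTPUT_DIR/URL_HASH/FILE_HASH.EXT form.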


def imgur_request(url):
	resp = requests.get(url, allow_redirects=False, timeout=30, headers={
		"User-Agent": USER_AGENT,
	})
	if 300 <= resp.status_code < 400:
		# imgur redirects you if the resource is gone instead of 404ing, treat this as non-retryable
		raise Rejected(f"imgur returned redirect for {url!r}")
	# other errors are retryable
	resp.raise_for_status()
	return resp


def download_imgur_album(url, id):
	"""Fetch imgur album and return a list of (id, ext) contents"""
	url = f"https://api.imgur.com/post/v1/albums/{id}?client_id=546c25a59c58ad7&include=media,adconfig,account"
	data = imgur_request(url).json()
	result = []
	for item in data.get("media", []):
		result.append((item["id"], item["ext"]))
	return result


def download_imgur_gallery(url, id):
	"""Fetch imgur gallery and return a list of (id, ext) contents"""
	# The gallery JSON is contained in a <script> tag like this:
	#   <script>window.postDataJSON=...</script>
	# where ... is a json string.
	html = imgur_request(f"https://imgur.com/gallery/{id}").text
	regex = r'<script>window.postDataJSON=("(?:[^"\\]|\\.)*")'
	match = re.search(regex, html)
	# If we can't find a match, assume we got served a 404 page instead.
	if not match:
		raise Rejected(f"Could not load gallery for {url!r}")
	data = match.group(1)
	data = data[1:-1].encode().decode("unicode-escape") # remove quotes and unescape contents
	data = json.loads(data)
	result = []
	for item in data.get("media", []):
		result.append((item["id"], item["ext"]))
	return result


def download_imgur_image(output_dir, max_size, url, id, ext):
	"""Fetch imgur image and save it as per download_media()"""
	image_url = f"https://i.imgur.com/{id}.{ext}"
	resp = imgur_request(image_url)
	_save_content(output_dir, [url, image_url], ext, resp.content)


if __name__ == '__main__':
	import argh

	def main(url, output_dir):
		download_media(url, output_dir)

	argh.dispatch_command(main)
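

# Hypothetical command-line invocation for testing (argh turns main()'s arguments into positional
# CLI args; the module path assumes the "common" package layout of this repo):
#   python -m common.media "https://example.com/cat.png" ./media_output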