diff --git a/common/common/media.py b/common/common/media.py
index 064e459..6bb3a17 100644
--- a/common/common/media.py
+++ b/common/common/media.py
@@ -20,6 +20,11 @@ from . import atomic_write, ensure_directory, jitter, listdir
from .stats import timed
+# Many sites (e.g. imgur) reject requests that don't look like they come from a browser,
+# so outgoing requests identify themselves with a browser-like User-Agent.
+USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/130.0.0.0 Safari/537.36"
+
+
media_bytes_downloaded = prom.Counter(
"media_bytes_downloaded",
"Number of bytes of media files downloaded. Includes data downloaded then later rejected.",
@@ -243,7 +248,10 @@ def _request(url, max_size, content_types):
else:
raise BadScheme(f"Bad scheme {parsed.scheme!r} for url {url}")
- conn.request("GET", url, preload_content=False)
+ headers = {
+ "User-Agent": USER_AGENT,
+ }
+ conn.request("GET", url, headers=headers, preload_content=False)
resp = conn.getresponse()
# Redirects do not require further checks
@@ -251,7 +259,8 @@ def _request(url, max_size, content_types):
return resp
# 4xx errors are non-retryable, anything else is.
- if 400 <= resp.status < 500:
+ # However 420 and 429 are "rate limit" errors, which should be retried.
+ if 400 <= resp.status < 500 and resp.status not in (420, 429):
raise FailedResponse(f"Url returned {resp.status} response: {url}")
elif not (200 <= resp.status < 300):
raise Exception(f"Url returned {resp.status} response: {url}")
@@ -308,9 +317,9 @@ def download_imgur_url(output_dir, max_size, url):
download_imgur_image(output_dir, max_size, url, id, ext)
return True
elif parsed.path.startswith("/a/"):
- contents = download_imgur_album(parsed.path.removeprefix("/a/"))
- elif parsed.path.startwith("/gallery/"):
- contents = download_imgur_gallery(parsed.path.removeprefix("/gallery/"))
+ contents = download_imgur_album(url, parsed.path.removeprefix("/a/"))
+ elif parsed.path.startswith("/gallery/"):
+ contents = download_imgur_gallery(url, parsed.path.removeprefix("/gallery/"))
else:
# no match, treat like non-imgur link
return False
@@ -327,7 +336,7 @@ def download_imgur_url(output_dir, max_size, url):
# Save the album after trying to download things (so it will be retried until we get this far)
# but only raise for image download errors after, so we at least know about everything that
# succeeded.
- contents_urls = [f"https://imgur.com/{id}.{ext}" for id, ext in contents]
+ contents_urls = [f"https://imgur.com/{id}" for id, ext in contents]
_save_content(output_dir, [url], "json", json.dumps(contents_urls))
if failed:
@@ -337,7 +346,9 @@ def download_imgur_url(output_dir, max_size, url):
def imgur_request(url):
- resp = requests.get(url, allow_redirects=False, timeout=30)
+ resp = requests.get(url, allow_redirects=False, timeout=30, headers={
+ "User-Agent": USER_AGENT,
+ })
if 300 <= resp.status_code < 400:
# imgur redirects you if the resource is gone instead of 404ing, treat this as non-retryable
raise Rejected(f"imgur returned redirect for {url!r}")
@@ -361,15 +372,14 @@ def download_imgur_gallery(url, id):
# The gallery JSON is contained in a
# where ... is a json string.
- html = imgur_request(f"https://imgur.com/gallery/{id}").content
+ html = imgur_request(f"https://imgur.com/gallery/{id}").text
regex = r'