download_media: Get imgur links working and add test script

pull/428/merge
Mike Lang 2 weeks ago committed by Mike Lang
parent c2ff2dfbb1
commit a0ca96aff6

@@ -20,6 +20,11 @@ from . import atomic_write, ensure_directory, jitter, listdir
from .stats import timed from .stats import timed
# Lots of things will tell you to go away if you don't look like a browser
# (eg. imgur)
USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/130.0.0.0 Safari/537.36"
media_bytes_downloaded = prom.Counter( media_bytes_downloaded = prom.Counter(
"media_bytes_downloaded", "media_bytes_downloaded",
"Number of bytes of media files downloaded. Includes data downloaded then later rejected.", "Number of bytes of media files downloaded. Includes data downloaded then later rejected.",
@@ -243,7 +248,10 @@ def _request(url, max_size, content_types):
else: else:
raise BadScheme(f"Bad scheme {parsed.scheme!r} for url {url}") raise BadScheme(f"Bad scheme {parsed.scheme!r} for url {url}")
conn.request("GET", url, preload_content=False) headers = {
"User-Agent": USER_AGENT,
}
conn.request("GET", url, headers=headers, preload_content=False)
resp = conn.getresponse() resp = conn.getresponse()
# Redirects do not require further checks # Redirects do not require further checks
@@ -251,7 +259,8 @@ def _request(url, max_size, content_types):
return resp return resp
# 4xx errors are non-retryable, anything else is. # 4xx errors are non-retryable, anything else is.
if 400 <= resp.status < 500: # However 420 and 429 are "rate limit" errors, which should be retried.
if 400 <= resp.status < 500 and resp.status not in (420, 429):
raise FailedResponse(f"Url returned {resp.status} response: {url}") raise FailedResponse(f"Url returned {resp.status} response: {url}")
elif not (200 <= resp.status < 300): elif not (200 <= resp.status < 300):
raise Exception(f"Url returned {resp.status} response: {url}") raise Exception(f"Url returned {resp.status} response: {url}")
@@ -308,9 +317,9 @@ def download_imgur_url(output_dir, max_size, url):
download_imgur_image(output_dir, max_size, url, id, ext) download_imgur_image(output_dir, max_size, url, id, ext)
return True return True
elif parsed.path.startswith("/a/"): elif parsed.path.startswith("/a/"):
contents = download_imgur_album(parsed.path.removeprefix("/a/")) contents = download_imgur_album(url, parsed.path.removeprefix("/a/"))
elif parsed.path.startwith("/gallery/"): elif parsed.path.startswith("/gallery/"):
contents = download_imgur_gallery(parsed.path.removeprefix("/gallery/")) contents = download_imgur_gallery(url, parsed.path.removeprefix("/gallery/"))
else: else:
# no match, treat like non-imgur link # no match, treat like non-imgur link
return False return False
@@ -327,7 +336,7 @@ def download_imgur_url(output_dir, max_size, url):
# Save the album after trying to download things (so it will be retried until we get this far) # Save the album after trying to download things (so it will be retried until we get this far)
# but only raise for image download errors after, so we at least know about everything that # but only raise for image download errors after, so we at least know about everything that
# succeeded. # succeeded.
contents_urls = [f"https://imgur.com/{id}.{ext}" for id, ext in contents] contents_urls = [f"https://imgur.com/{id}" for id, ext in contents]
_save_content(output_dir, [url], "json", json.dumps(contents_urls)) _save_content(output_dir, [url], "json", json.dumps(contents_urls))
if failed: if failed:
@@ -337,7 +346,9 @@ def download_imgur_url(output_dir, max_size, url):
def imgur_request(url): def imgur_request(url):
resp = requests.get(url, allow_redirects=False, timeout=30) resp = requests.get(url, allow_redirects=False, timeout=30, headers={
"User-Agent": USER_AGENT,
})
if 300 <= resp.status_code < 400: if 300 <= resp.status_code < 400:
# imgur redirects you if the resource is gone instead of 404ing, treat this as non-retryable # imgur redirects you if the resource is gone instead of 404ing, treat this as non-retryable
raise Rejected(f"imgur returned redirect for {url!r}") raise Rejected(f"imgur returned redirect for {url!r}")
@@ -361,15 +372,14 @@ def download_imgur_gallery(url, id):
# The gallery JSON is contained in a <script> tag like this: # The gallery JSON is contained in a <script> tag like this:
# <script>window.postDataJSON=...</script> # <script>window.postDataJSON=...</script>
# where ... is a json string. # where ... is a json string.
html = imgur_request(f"https://imgur.com/gallery/{id}").content html = imgur_request(f"https://imgur.com/gallery/{id}").text
regex = r'<script>window.postDataJSON=("(?:[^"\\]|\\.)*")' regex = r'<script>window.postDataJSON=("(?:[^"\\]|\\.)*")'
match = re.search(regex, html) match = re.search(regex, html)
# If we can't find a match, assume we got served a 404 page instead. # If we can't find a match, assume we got served a 404 page instead.
if not match: if not match:
raise Rejected(f"Could not load gallery for {url!r}") raise Rejected(f"Could not load gallery for {url!r}")
data = match.group(1) data = match.group(1)
# TODO python3 equivalent? data = data[1:-1].encode().decode("unicode-escape") # remove quotes and unescape contents
data = data[1:-1].decode("string-escape") # remove quotes and unescape contents
data = json.loads(data) data = json.loads(data)
result = [] result = []
for item in data.get("media", []): for item in data.get("media", []):
@@ -381,4 +391,12 @@ def download_imgur_image(output_dir, max_size, url, id, ext):
"""Fetch imgur image and save it as per download_media()""" """Fetch imgur image and save it as per download_media()"""
image_url = f"https://i.imgur.com/{id}.{ext}" image_url = f"https://i.imgur.com/{id}.{ext}"
resp = imgur_request(image_url) resp = imgur_request(image_url)
_save_response(output_dir, [url, image_url], resp, max_size, 64*1024) _save_content(output_dir, [url, image_url], ext, resp.content)
# Manual test entry point: lets the module be run as a script to fetch a
# single URL, exercising the same download_media() path used in production.
if __name__ == '__main__':
	# Imported lazily so the CLI dependency is only needed when run as a script.
	import argh

	def main(url, output_dir):
		"""Download the media at `url` into `output_dir` (test helper)."""
		download_media(url, output_dir)

	# argh builds the argument parser (positional url/output_dir, --help)
	# from main()'s signature.
	argh.dispatch_command(main)

Loading…
Cancel
Save