download_media: Special casing for imgur

pull/428/merge
Mike Lang 2 weeks ago committed by Mike Lang
parent 4dbe1dce5e
commit 83f18eda84

@@ -11,7 +11,9 @@ from uuid import uuid4
import gevent
import prometheus_client as prom
import requests
import urllib3.connection
from gevent.pool import Pool
from ipaddress import ip_address
from . import atomic_write, ensure_directory, jitter, listdir
@@ -99,6 +101,9 @@ def download_media(
            gevent.sleep(jitter(retry_interval))
        try:
            if download_imgur_url(output_dir, max_size, url):
                return
            resp = _request(urls[-1], max_size, content_types)
            new_url = resp.get_redirect_location()

@@ -190,6 +195,29 @@ def _save_response(output_dir, urls, resp, max_size, chunk_size):
        atomic_write(metadata_path, json.dumps(metadata, indent=4))
def _save_content(output_dir, urls, ext, content):
    """Alternate version of _save_response() for cases where content is explicitly generated
    instead of coming from a response."""
    url_dir = get_url_dir(output_dir, urls[0])
    if isinstance(content, str):
        content = content.encode()
    hash = sha256(content)
    filename = f"{hash_to_path(hash)}.{ext}"
    filepath = os.path.join(url_dir, filename)
    if not os.path.exists(filepath):
        atomic_write(filepath, content)
    metadata_path = os.path.join(url_dir, f"{hash_to_path(hash)}.metadata.json")
    if not os.path.exists(metadata_path):
        metadata = {
            "url": urls[0],
            "filename": filename,
            "redirects": urls[1:],
            "fetched_by": socket.gethostname(),
            "fetch_time": time.time(),
        }
        atomic_write(metadata_path, json.dumps(metadata, indent=4))

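For illustration, this is the kind of call download_imgur_url() below makes to store an album listing. The paths and URLs here are made up:

# Hypothetical call: store a generated album listing under the album's URL.
_save_content(
    "/data/media",                       # output_dir
    ["https://imgur.com/a/abc123"],      # urls (original first, no redirects)
    "json",
    json.dumps(["https://imgur.com/xyz789.png"]),
)
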
def _request(url, max_size, content_types):
    """Do the actual request and return a vetted response object, which is either the content
    (status 200) or a redirect.
@@ -245,3 +273,110 @@ def _request(url, max_size, content_types):
        raise TooLarge(f"Content length {length} is too large for url {url}")
    return resp

def download_imgur_url(output_dir, max_size, url):
    """Links to imgur require special handling to resolve the actual image.

    Handles URLs like the following:
        i.stack.imgur.com/ID.png
        imgur.com/ID
        i.imgur.com/ID.EXT
    These map to actual media and are stored in the usual way.

        imgur.com/a/ID
        imgur.com/gallery/ID
    These map to collections of media. Under the original URL we store a json
    file that lists imgur.com/ID urls of the contents of the collection.
    Those urls are then downloaded and stored in the usual way.
    """
    parsed = urllib.parse.urlparse(url)
    if parsed.hostname not in ("imgur.com", "i.imgur.com", "i.stack.imgur.com"):
        # not an imgur link that needs special handling
        return False
    match = re.match(r"^/([^/.]+)(?:\.([a-z]+))?$", parsed.path)
    if match:
        id, ext = match.groups()
        if ext is None:
            # Try to get a video first (imgur serves "gif" posts as mp4);
            # if that 400s, fall back to a png.
            try:
                download_imgur_image(output_dir, max_size, url, id, "mp4")
            except requests.HTTPError:
                download_imgur_image(output_dir, max_size, url, id, "png")
        else:
            download_imgur_image(output_dir, max_size, url, id, ext)
        return True
    elif parsed.path.startswith("/a/"):
        contents = download_imgur_album(parsed.path.removeprefix("/a/"))
    elif parsed.path.startswith("/gallery/"):
        contents = download_imgur_gallery(parsed.path.removeprefix("/gallery/"))
    else:
        # no match, treat like non-imgur link
        return False

    # Common part for albums and galleries
    pool = Pool(16)
    jobs = []
    for id, ext in contents:
        job = pool.spawn(download_imgur_image, output_dir, max_size, f"https://imgur.com/{id}", id, ext)
        jobs.append(job)
    gevent.wait(jobs)
    failed = [g.exception for g in jobs if g.exception is not None]
    # Save the album after trying to download things (so it will be retried until we get this far)
    # but only raise for image download errors after, so we at least know about everything that
    # succeeded.
    contents_urls = [f"https://imgur.com/{id}.{ext}" for id, ext in contents]
    _save_content(output_dir, [url], "json", json.dumps(contents_urls))
    if failed:
        # ExceptionGroup (Python 3.11+) takes a message plus the list of exceptions.
        raise ExceptionGroup("Some imgur album contents failed to download", failed)
    return True

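For illustration only, the dispatch behaves roughly as follows for the URL shapes listed in the docstring (URLs and the `out` directory are made up):

# Illustrative calls; URLs and "out" are hypothetical.
download_imgur_url(out, max_size, "https://imgur.com/abc123")       # single image, returns True
download_imgur_url(out, max_size, "https://imgur.com/a/abc123")     # album, returns True
download_imgur_url(out, max_size, "https://example.com/cat.png")    # not imgur, returns False
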
def imgur_request(url):
    resp = requests.get(url, allow_redirects=False, timeout=30)
    if 300 <= resp.status_code < 400:
        # imgur redirects you if the resource is gone instead of 404ing;
        # treat this as non-retryable.
        raise Rejected(f"imgur returned redirect for {url!r}")
    # other errors are retryable
    resp.raise_for_status()
    return resp

def download_imgur_album(id):
    """Fetch imgur album and return a list of (id, ext) contents"""
    url = f"https://api.imgur.com/post/v1/albums/{id}?client_id=546c25a59c58ad7&include=media,adconfig,account"
    data = imgur_request(url).json()
    result = []
    for item in data.get("media", []):
        result.append((item["id"], item["ext"]))
    return result

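The parsing above assumes an albums API response shaped roughly like the following (abridged; only the "media" items' "id" and "ext" fields are read):

# Assumed (abridged) shape of the albums API response.
{
    "media": [
        {"id": "abc123", "ext": "png"},
        {"id": "def456", "ext": "mp4"},
    ],
}
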
def download_imgur_gallery(id):
    """Fetch imgur gallery and return a list of (id, ext) contents"""
    # The gallery JSON is contained in a <script> tag like this:
    #   <script>window.postDataJSON=...</script>
    # where ... is a json string.
    url = f"https://imgur.com/gallery/{id}"
    html = imgur_request(url).text
    regex = r'<script>window\.postDataJSON=("(?:[^"\\]|\\.)*")'
    match = re.search(regex, html)
    # If we can't find a match, assume we got served a 404 page instead.
    if not match:
        raise Rejected(f"Could not load gallery for {url!r}")
    # The captured group is itself a JSON string literal: decode it once to
    # unescape it (the python3 replacement for "string-escape"), then parse
    # the resulting string as the actual post data.
    data = json.loads(match.group(1))
    data = json.loads(data)
    result = []
    for item in data.get("media", []):
        result.append((item["id"], item["ext"]))
    return result

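As a sanity check of the unescaping step, a minimal sketch with a made-up page snippet (the real page embeds a much larger document):

# Made-up snippet demonstrating the double json.loads() unescape.
html = '<script>window.postDataJSON="{\\"media\\": [{\\"id\\": \\"abc123\\", \\"ext\\": \\"png\\"}]}"</script>'
match = re.search(r'<script>window\.postDataJSON=("(?:[^"\\]|\\.)*")', html)
data = json.loads(json.loads(match.group(1)))
print(data["media"])  # [{'id': 'abc123', 'ext': 'png'}]
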
def download_imgur_image(output_dir, max_size, url, id, ext):
    """Fetch imgur image and save it as per download_media()"""
    image_url = f"https://i.imgur.com/{id}.{ext}"
    resp = imgur_request(image_url)
    _save_response(output_dir, [url, image_url], resp, max_size, 64*1024)
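
Taken together, for an album URL this change should leave something like the following on disk, assuming get_url_dir() maps each URL to its own directory and hash_to_path() yields a content-hash filename (layout illustrative, not taken from the patch):

output_dir/
    <dir for imgur.com/a/ID>/
        <contenthash>.json              # list of imgur.com/ID.EXT urls in the album
        <contenthash>.metadata.json
    <dir for imgur.com/ID1>/
        <contenthash>.mp4               # the media itself, saved via _save_response()
        <contenthash>.metadata.json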
