sheetsync: Optionally download media linked in image links column

To enable this, you need to: - set --media-dir globally for sheetsync - enable download_media=true for the events sync config To disable for individual rows (eg. because of known issues), put "[nodownload]" in the notes column.
11 months ago · 3e7cb38cf0
parent 7b590cf574
commit 3e7cb38cf0
1 changed files with 45 additions and 2 deletions
--- a/sheetsync/sheetsync/main.py
+++ b/sheetsync/sheetsync/main.py
@ -3,6 +3,7 @@ import json
 import logging
 import signal
 from collections import defaultdict
 from urllib.parse import urlparse
 import argh
 import gevent.backdoor
@ -15,6 +16,7 @@ from requests import HTTPError
 import common
 import common.dateutil
 from common.database import DBManager, query, get_column_placeholder
 from common.media import check_for_media, download_media
 from common.sheets import Sheets as SheetsClient
 from .sheets import SheetsEventsMiddleware, SheetsPlaylistsMiddleware, SheetsArchiveMiddleware
@ -274,6 +276,11 @@ class EventsSync(SheetSync):
 		"category",
 	}
 	def __init__(self, name, middleware, stop, dbmanager, reverse_sync=False, media_dir=None):
 		super().__init__(name, middleware, stop, dbmanager, reverse_sync)
 		self.media_dir = media_dir
 		self.media_downloads = None if media_dir is None else {}
 	def observe_rows(self, rows):
 		counts = defaultdict(lambda: 0)
 		for row in rows:
@ -287,6 +294,25 @@ class EventsSync(SheetSync):
 	def sync_row(self, sheet_row, db_row):
 		# Do some special-case transforms for events before syncing
 		# Attempt to download any URLs in the links column if we don't already have them.
 		# This is done asyncronously. We keep a record of failed attempts for two reasons:
 		# - To avoid retrying
 		# - To populate the errors column asyncronously
 		# This record is just in memory - we're ok retrying after every restart.
 		# You can disable downloads on a per-row basis by putting "[nodownload]" in the notes column.
 		if sheet_row is not None and self.media_dir is not None and "[nodownload]" not in sheet_row["notes"]:
 			for url in sheet_row['image_links']:
 				if url not in self.media_downloads:
 					self.media_downloads[url] = gevent.spawn(self.download_media, url)
 				# Greenlet.exception is populated if the greenlet failed with an exception,
 				# or None otherwise (success or not finished).
 				# We treat a failure to fetch a URL like a parse error.
 				e = self.media_downloads[url].exception
 				if e is not None:
 					sheet_row.setdefault("_parse_errors", []).append(
 						f"Failed to download media link {url:!r}: {e}"
 					)
 		if db_row is not None:
 			# If no database error, but we have parse errors, indicate they should be displayed.
 			if db_row.error is None and sheet_row is not None and sheet_row.get('_parse_errors'):
@ -300,6 +326,20 @@ class EventsSync(SheetSync):
 		super().sync_row(sheet_row, db_row)
 	def download_media(self, url):
 		hostname = urlparse(url).hostname
 		if hostname in ("youtu.be", "youtube.com"):
 			self.logger.info(f"Ignoring url {url:!r}: Blocklisted hostname")
 		if check_for_media(self.media_dir, url):
 			self.logger.info(f"Already have content for url {url:!r}")
 			return
 		try:
 			download_media(url, self.media_dir)
 		except Exception:
 			self.logger.warning(f"Failed to download url {url:!r}", exc_info=True)
 			raise
 		self.logger.info(f"Downloaded media for url {url:!r}")
 class ArchiveSync(EventsSync):
 	# Archive events are a special case of event with less input columns.
@ -372,7 +412,7 @@ class PlaylistsSync(SheetSync):
 			event_id: The id of the streamlog event to sync
 	""",
 )
-def main(dbconnect, sync_configs, metrics_port=8005, backdoor_port=0):
+def main(dbconnect, sync_configs, metrics_port=8005, backdoor_port=0, media_dir="."):
 	"""
 	Sheet sync constantly scans a Google Sheets sheet and a database, copying inputs from the sheet
 	to the DB and outputs from the DB to the sheet.
@ -466,7 +506,10 @@ def main(dbconnect, sync_configs, metrics_port=8005, backdoor_port=0):
 			"playlists": PlaylistsSync,
 			"archive": ArchiveSync,
 		}[config["type"]]
-		sync = sync_class(config["name"], middleware, stop, dbmanager, reverse_sync)
+		sync_class_kwargs = {}
 		if config["type"] == "events" and config.get("download_media", False):
 			sync_class_kwargs["media_dir"] = media_dir
 		sync = sync_class(config["name"], middleware, stop, dbmanager, reverse_sync, **sync_class_kwargs)
 		workers.append(sync)
 	jobs = [gevent.spawn(worker.run) for worker in workers]