From 9413b99b91f81f16a016fb1f79133c1120b9eb67 Mon Sep 17 00:00:00 2001
From: Mike Lang
Date: Mon, 19 Oct 2020 17:52:09 +1100
Subject: [PATCH] Implement playlist manager

---
 playlist_manager/playlist_manager/main.py | 308 +++++++++++++++++++++-
 1 file changed, 306 insertions(+), 2 deletions(-)

diff --git a/playlist_manager/playlist_manager/main.py b/playlist_manager/playlist_manager/main.py
index 5562489..2d8bdcb 100644
--- a/playlist_manager/playlist_manager/main.py
+++ b/playlist_manager/playlist_manager/main.py
@@ -1,4 +1,308 @@
+import logging
+import json
+import signal
-def main():
-    pass
 
+import argh
+import gevent
+import gevent.backdoor
+import gevent.event
+import prometheus_client as prom
+
+import common
+from common.database import DBManager, query
+from common.googleapis import GoogleAPIClient
+
+
+class PlaylistManager(object):
+
+    def __init__(self, dbmanager, api_client, upload_locations, playlist_tags):
+        self.dbmanager = dbmanager
+        self.api = YoutubeAPI(api_client)
+        self.upload_locations = upload_locations
+        self.playlist_tags = playlist_tags
+        self.reset()
+
+    def reset(self, playlist=None):
+        """Called to clear saved state and force a refresh after errors.
+        Either reset a specific playlist, or all if no arg given.
+        """
+        if playlist is None:
+            # playlist_state represents our mirrored view of the list of items in each playlist.
+            # If a playlist is not present, it means we need to refresh our view of it.
+            # {playlist_id: [video_id]}
+            self.playlist_state = {}
+        else:
+            self.playlist_state.pop(playlist, None)
+
+    def run_once(self):
+        """Do one check of all videos and playlists.
+        At a high level:
+            Fetch all eligible videos from the database.
+            Group them into playlists depending on their tags.
+            For each playlist, concurrently:
+                Compare the generated list to our local copy of the real thing.
+                If there's nothing new to add, do nothing (don't even bother checking the real thing).
+                Otherwise check the real thing's total length. If it matches, assume our copy is
+                good; otherwise refresh it.
+                For each video to add:
+                    Determine the correct point in the sort order.
+                    Add it to the playlist.
+                    Update the local mirror with the action we just performed.
+        """
+        logging.info("Checking for new videos")
+        videos = self.get_videos()
+        logging.debug("Found {} eligible videos".format(len(videos)))
+
+        # start all workers
+        workers = {}
+        for playlist, tags in self.playlist_tags.items():
+            workers[playlist] = gevent.spawn(self.update_playlist, playlist, tags, videos)
+
+        # check each one for success, reset on failure
+        for playlist, worker in workers.items():
+            try:
+                worker.get()
+            except Exception:
+                logging.exception("Failed to update playlist {}".format(playlist))
+                self.reset(playlist)
+
+    def get_videos(self):
+        # Most of the time, by getting then re-putting the conn, we'll just use the same
+        # one every time. But if there's an error we won't re-put it, so we'll get a new one
+        # the next time.
+        conn = self.dbmanager.get_conn()
+        videos = query(conn, """
+            SELECT video_id, tags, start_time
+            FROM events
+            WHERE state = 'DONE' AND upload_location = ANY (%s)
+        """, self.upload_locations)
+        self.dbmanager.put_conn(conn)
+        return {video.video_id: video for video in videos}
+
+    def update_playlist(self, playlist, tags, videos):
+        # Filter the video list for videos with matching tags
+        matching = [
+            video for video in videos.values()
+            if all(tag in video.tags for tag in tags)
+        ]
+        logging.debug("Found {} matching videos for playlist {}".format(len(matching), playlist))
+        # If we have nothing to add, short circuit without doing any API calls to save quota.
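+        # Note this check only consults our local mirror (self.playlist_state), so it costs no
+        # API quota; the mirror is validated against the real playlist in refresh_playlist()
+        # below before we actually insert anything.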
+        if not set([v.video_id for v in matching]) - set(self.playlist_state.get(playlist, [])):
+            logging.debug("All videos already in playlist, nothing to do")
+            return
+        # Refresh our playlist state, if necessary.
+        self.refresh_playlist(playlist)
+        # Get an updated list of new videos
+        new_videos = [
+            video for video in matching
+            if video.video_id not in self.playlist_state[playlist]
+        ]
+        # It shouldn't matter, but just for clarity let's sort them by event order
+        new_videos.sort(key=lambda v: v.start_time)
+        # Insert each new video one at a time
+        logging.debug("Inserting new videos for playlist {}: {}".format(playlist, new_videos))
+        for video in new_videos:
+            index = self.find_insert_index(videos, self.playlist_state[playlist], video)
+            self.insert_into_playlist(playlist, video.video_id, index)
+
+    def refresh_playlist(self, playlist):
+        """Check the playlist mirror is in a good state, and fetch it if it isn't.
+        We try to do this with only one page of list output, to save on quota.
+        If the total length does not match (or we don't have a copy at all),
+        then we do a full refresh.
+        """
+        logging.debug("Fetching first page of playlist {}".format(playlist))
+        query = self.api.list_playlist(playlist)
+        # See if we can avoid further page fetches.
+        if playlist not in self.playlist_state:
+            logging.info("Fetching playlist {} because we don't currently have it".format(playlist))
+        elif query.is_complete:
+            logging.debug("First page of {} was entire playlist".format(playlist))
+        elif len(self.playlist_state[playlist]) == query.total_size:
+            logging.debug("Skipping fetching of remainder of playlist {}, size matches".format(playlist))
+            return
+        else:
+            logging.warning("Playlist {} has size mismatch ({} saved vs {} actual), refetching".format(
+                playlist, len(self.playlist_state[playlist]), query.total_size,
+            ))
+        # Fetch remaining pages, if any
+        query.fetch_all()
+        # Update saved copy with video ids
+        self.playlist_state[playlist] = [
+            item['snippet']['resourceId'].get('videoId')  # api implies it's possible that non-videos are added
+            for item in query.items
+        ]
+
+    def find_insert_index(self, videos, playlist, new_video):
+        """Find the index at which to insert new_video into playlist such that
+        playlist remains sorted (it is assumed to already be sorted).
+        videos should be a mapping from video ids to video info.
+        """
+        # To try to behave as best we can in the presence of mis-sorted playlists,
+        # we walk through the list linearly. We insert immediately before the first
+        # item that should be after us in sort order.
+        # Note that we treat unknown items (videos we don't know about) as being before us
+        # in sort order, so that we always insert after them.
+        for n, video_id in enumerate(playlist):
+            if video_id not in videos:
+                # ignore unknowns
+                continue
+            video = videos[video_id]
+            # if this video is after the new video, return this index
+            if new_video.start_time < video.start_time:
+                return n
+        # if we reach here, it means everything in the list was before the new video,
+        # therefore insert at the end
+        return len(playlist)
+
+    def insert_into_playlist(self, playlist, video_id, index):
+        """Insert video into the given playlist at the given index.
+        Makes the API call, then also updates our mirrored copy.
+ """ + logging.info("Inserting {} at index {} of {}".format(video_id, index, playlist)) + self.api.insert_into_playlist(playlist, video_id, index) + # Update our copy + self.playlist_state.setdefault(playlist, []).insert(index, video_id) + + +class YoutubeAPI(object): + def __init__(self, client): + self.client = client + + def insert_into_playlist(self, playlist, video_id, index): + json = { + "snippet": { + "playlistId": playlist, + "resourceId": { + "kind": "youtube#video", + "videoId": video_id, + }, + "position": index, + }, + } + resp = self.client.request("POST", "https://www.googleapis.com/youtube/v3/playlistItems", + params={"part": "snippet"}, + json=json, + metric_name="playlist_insert", + ) + if not resp.ok: + raise Exception("Failed to insert {video_id} at index {index} of {playlist} with {resp.status_code}: {resp.content}".format( + playlist=playlist, video_id=video_id, index=index, resp=resp, + )) + + def list_playlist(self, playlist): + """Fetches the first page of playlist contents and returns a ListQuery object. + You can use this object to look up info and optionally retrieve the whole playlist.""" + data = self._list_playlist(playlist) + return ListQuery(self, playlist, data) + + def _list_playlist(self, playlist, page_token=None): + """Internal method that actually does the list query. + Returns the full response json.""" + params = { + "part": "snippet", + "playlistId": playlist, + "maxResults": 50, + } + if page_token is not None: + params['pageToken'] = page_token + resp = self.client.request("GET", "https://www.googleapis.com/youtube/v3/playlistItems", + params=params, + metric_name="playlist_list", + ) + if not resp.ok: + raise Exception("Failed to list {playlist} (page_token={!r}) with {resp.status_code}: {resp.content}".format( + playlist=playlist, page_token=page_token, resp=resp, + )) + return resp.json() + + +class ListQuery(object): + """Represents a partially-fetched list query for all playlist items. + To save on quota, we avoid fetching the entire playlist until asked.""" + def __init__(self, api, playlist_id, data): + self.api = api + self.playlist_id = playlist_id + self.total_size = data['pageInfo']['totalResults'] + self.is_complete = self.total_size <= data['pageInfo']['resultsPerPage'] + self.items = data['items'] + self.page_token = data.get('nextPageToken') + + def fetch_all(self): + if self.is_complete: + return + page_token = self.page_token + while len(self.items) < self.total_size: + assert page_token is not None, "More items to fetch but no page token" + data = self.api._list_playlist(self.playlist_id, page_token) + self.items += data['items'] + page_token = data.get('nextPageToken') + # I'm just being paranoid here about an infinite loop blowing our quota, + # let's assert we always are making forward progress + assert data['items'], "Got no extra items from new page" + self.is_complete = True + + +def parse_playlist_arg(arg): + playlist, tags = arg.split('=', 1) + tags = tags.split(",") if tags else [] + return playlist, tags + + +@argh.arg("playlists", nargs="*", metavar="PLAYLIST={TAG,}", type=parse_playlist_arg, help= + "Each playlist arg specifies a youtube playlist ID, along with any number of tags. " + "Events will be added to the playlist if that event has all the tags. For example, " + "some_playlist_id=Day 1,Technical would populate that playlist with all Technical events " + "from Day 1. Note that having no tags (ie. 'id=') is allowed and all events will be added to it. 
" + "Note playlist ids must be unique (can't specify the same one twice)." +) +def main( + dbconnect, + creds_file, + playlists, + upload_location_allowlist="youtube", + interval=600, + metrics_port=8007, + backdoor_port=0, +): + """ + dbconnect should be a postgres connection string + + creds_file should contain youtube api creds + + upload_location_allowlist is a comma-seperated list of database upload locations to + consider as eligible to being added to playlists. For these locations, the database video id + must be a youtube video id. + + interval is how often to check for new videos, default every 10min. + """ + common.PromLogCountsHandler.install() + common.install_stacksampler() + prom.start_http_server(metrics_port) + + if backdoor_port: + gevent.backdoor.BackdoorServer(('127.0.0.1', backdoor_port), locals=locals()).start() + + upload_locations = upload_location_allowlist.split(",") if upload_location_allowlist else [] + playlists = dict(playlists) + + stop = gevent.event.Event() + gevent.signal_handler(signal.SIGTERM, stop.set) # shut down on sigterm + + logging.info("Starting up") + + with open(creds_file) as f: + creds = json.load(f) + client = GoogleAPIClient(creds['client_id'], creds['client_secret'], creds['refresh_token']) + + dbmanager = DBManager(dsn=dbconnect) + manager = PlaylistManager(dbmanager, client, upload_locations, playlists) + + while not stop.is_set(): + try: + manager.run_once() + except Exception: + logging.exception("Failed to run playlist manager") + manager.reset() + stop.wait(interval) # wait for interval, or until stopping + + logging.info("Stopped")