cutter: Implement actual cut methods

Each method is fairly complicated, but is self-contained and can be examined independently. cut_jobs in particular contains several extra helpers and directs control flow via some iterators. This is unfortunately nessecary due to the requests interface.
6 years ago · 5b2a1ef6b7
parent ae809c696c
commit 5b2a1ef6b7
1 changed files with 282 additions and 4 deletions
--- a/cutter/cutter/main.py
+++ b/cutter/cutter/main.py
@ -1,6 +1,8 @@

 import json
 import logging
+import os
+import random
 import signal
 import socket
 from collections import namedtuple
@ -8,9 +10,12 @@ from collections import namedtuple
 import gevent.backdoor
 import gevent.event
 import prometheus_client as prom
+import requests
+from psycopg2 import sql

 import common
 from common.database import DBManager, query
+from common.segments import get_best_segments, cut_segments, ContainsHoles

 from .youtube import Youtube

@ -50,6 +55,10 @@ class CandidateGone(Exception):


 class Cutter(object):
+	NO_CANDIDATES_RETRY_INTERVAL = 1
+	ERROR_RETRY_INTERVAL = 5
+	RETRYABLE_UPLOAD_ERROR_WAIT_INTERVAL = 5
+
 	def __init__(self, youtube, conn, stop, name, segments_path):
 		"""youtube is an authenticated and initialized youtube api client.
 		Conn is a database connection.
@ -86,25 +95,294 @@ class Cutter(object):
 		(or for which allow_holes is true), returning a CutJob.
 		Polls until one is available.
 		"""
-		raise NotImplementedError
+		while not self.stop.is_set():
+			try:
+				candidates = self.list_candidates()
+			except Exception:
+				self.logger.exception("Error while listing candidates")
+				self.wait(self.ERROR_RETRY_INTERVAL)
+				continue
+			if candidates:
+				self.logger.info("Found {} job candidates".format(len(candidates)))
+			# Shuffle the list so that (most of the time) we don't try to claim the same one as other nodes
+			random.shuffle(candidates)
+			for candidate in candidates:
+				try:
+					segments = self.check_candidate(candidate)
+				except ContainsHoles:
+					# TODO metric
+					self.logger.info("Ignoring candidate {} due to holes".format(format_job(candidate)))
+					continue # bad candidate, let someone else take it or just try again later
+				except Exception as e:
+					# Unknown error. This is either a problem with us, or a problem with the candidate
+					# (or most likely a problem with us that is only triggered by this candidate).
+					# In this case we would rather stay running so other jobs can continue to work if possible.
+					# But to give at least some feedback, we set the error message on the job
+					# if it isn't already.
+					self.logger.exception("Failed to check candidate {}, setting error on row".format(format_job(candidate)))
+					try:
+						# Since this error message is just for humans, we don't go to too large
+						# a length to prevent it being put on the row if the row has changed.
+						# We just check its state is still EDITING.
+						# Any successful claim will clear its error.
+						result = query(self.conn, """
+							UPDATE events
+							SET error = %s
+							WHERE id = %s AND state = 'EDITED' AND error IS NULL
+						""", id=candidate.id, error='{}: Error while checking candidate: {}'.format(self.name, e))
+					except Exception:
+						self.logger.exception("Failed to set error for candidate {}, ignoring".format(format_job(candidate)))
+					else:
+						if result.rowcount > 0:
+							assert result.rowcount == 1
+							self.logger.info("Set error for candidate {}".format(format_job(candidate)))
+					self.wait(self.ERROR_RETRY_INTERVAL)
+					continue
+				if all(segment is None for segment in segments):
+					self.logger.info("Ignoring candidate {} as we have no segments".format(format_job(candidate)))
+					continue
+				return CutJob(segments=segments, **candidate._asdict())
+			# No candidates
+			self.wait(self.NO_CANDIDATES_RETRY_INTERVAL)
+
+	def list_candidates(self):
+		"""Return a list of all available candidates that we might be able to cut."""
+		built_query = sql.SQL("""
+			SELECT id, {}
+			FROM events
+			WHERE state = 'EDITED'
+			AND (uploader_whitelist IS NULL OR %(name)s = ANY (uploader_whitelist))
+		""").format(
+			sql.SQL(", ").join(sql.Identifier(key) for key in CUT_JOB_PARAMS)
+		)
+		result = query(self.conn, built_query, name=self.name)
+		return result.fetchall()
+
+	def check_candidate(self, candidate):
+		return get_best_segments(
+			os.path.join(self.segments_path, candidate.video_channel, candidate.video_quality),
+			candidate.video_start,
+			candidate.video_end,
+			allow_holes=candidate.allow_holes,
+		)

 	def claim_job(self, job):
 		"""Update event in DB to say we're working on it.
 		If someone beat us to it, or it's changed, raise CandidateGone."""
 		# We need to verify all relevant cut params are unchanged, in case they
 		# were updated between verifying the candidate and now.
-		raise NotImplementedError
+		built_query = sql.SQL("""
+			UPDATE events
+			SET state = 'CLAIMED', uploader = %(name)s, error = NULL
+			WHERE id = %(id)s
+			AND state = 'EDITED'
+			AND {}
+		""").format(
+			# A built AND over all CUT_JOB_PARAMS to check key = %(key)s.
+			# Note the use of IS NOT DISTINCT FROM because key = NULL is false if key is NULL.
+			sql.SQL(' AND ').join(
+				sql.SQL("{} IS NOT DISTINCT FROM {}").format(sql.Identifier(key), sql.Placeholder(key))
+				for key in CUT_JOB_PARAMS
+			)
+		)
+		try:
+			result = query(self.conn, built_query, name=self.name, **job._asdict())
+		except Exception:
+			# Rather than retry on failure here, just assume someone else claimed it in the meantime
+			self.logger.exception("Error while claiming job {}, aborting claim".format(format_job(job)))
+			self.wait(self.ERROR_RETRY_INTERVAL)
+			raise CandidateGone
+		if result.rowcount == 0:
+			self.logger.info("Failed to claim job {}".format(format_job(job)))
+			raise CandidateGone
+		self.logger.info("Claimed job {}".format(format_job(job)))
+		assert result.rowcount == 1

 	def cut_job(self, job):
 		"""Perform the actual cut and upload, taking the job through FINALIZING and into
 		TRANSCODING or DONE.
+
+		Handles various error conditions:
+		* Errors while cutting: Assumed to be non-retryable until cut parameters are changed
+		  by operator. Sets error and rolls back to UNEDITED.
+		* Request error before request body closed: Assumed to be a transient network failure,
+		  immediately retryable. Sets error and rolls back to EDITED.
+		* Request error after request body closed: It's unknown whether the request went through.
+		  Sets error and remains in FINALIZING. Operator intervention is required.
+		* Row has changed (no longer claimed by us) before request body closed:
+		  Assumed an operator has made changes and changed state back. Abort cutting without error.
+		* Row has changed (no longer claimed by us) after request body closed:
+		  Request has already gone through, but we failed to update database with this state.
+		  Causes program crash (JobConsistencyError) and restart,
+		  at which point it will re-sync with DB as best it can.
+		  This situation almost certainly requires operator intervention.
 		"""
-		raise NotImplementedError
+		# TODO handle multiple upload locations. Currently everything's hard-coded to youtube.
+
+		self.logger.info("Cutting and uploading job {}".format(format_job(job)))
+		cut = cut_segments(job.segments, job.video_start, job.video_end)
+
+		# This flag tracks whether we've told requests to finalize the upload,
+		# and serves to detect whether errors from the request call are recoverable.
+		# Wrapping it in a one-element list is a hack that lets us modify it from within
+		# a closure (as py2 lacks the nonlocal keyword).
+		finalize_begun = [False]
+
+		# This dummy exception is used to pass control flow back out of upload_wrapper
+		# if we've already handled the error and do not need to do anything further.
+		class ErrorHandled(Exception):
+			pass
+
+		# This exception indicates a job we thought was ours somehow disappeared
+		# while we were still trying to cut it. This most likely represents a logic error
+		# or that our instance is in a bad state, and will be raised up to run() to terminate
+		# the cutter entirely.
+		class JobConsistencyError(Exception):
+			pass
+
+		def set_row(**kwargs):
+			"""Set columns on the row being cut. Returns True on success,
+			False if row could not be found.
+			Example:
+				if not set_row(state='UNEDITED', error=e):
+					<handle row having gone missing>
+			"""
+			# construct an UPDATE query like "SET key1=%(key1)s, key2=%(key2)s, ..."
+			built_query = sql.SQL("""
+				UPDATE events
+				SET {}
+				WHERE id = %(id)s AND uploader = %(name)s
+			""").format(sql.SQL(", ").join(
+				sql.SQL("{} = {}").format(
+					sql.Identifier(key), sql.Placeholder(key),
+				) for key in kwargs
+			))
+			result = query(self.conn, built_query, id=job.id, name=self.name, **kwargs)
+			return result.rowcount == 1
+
+		def upload_wrapper():
+			# This generator wraps the cut_segments generator so we can
+			# do things in between the data being finished and finalizing the request.
+			# This is also where we do the main error handling.
+
+			try:
+				for chunk in cut:
+					yield chunk
+			except Exception as ex:
+				self.logger.exception("Error occurred while trying to cut job {}".format(format_job(job)))
+				# Assumed error is not retryable, set state back to UNEDITED and set error.
+				if not set_row(state='UNEDITED', error="Error while cutting: {}".format(ex), uploader=None):
+					self.logger.warning("Tried to roll back row {} to unedited but it was already cancelled.".format(job.id))
+				# Abort the cut without further error handling
+				raise ErrorHandled
+
+			# The data is now fully uploaded, but the request is not finalized.
+			# We now set the DB state to finalized so we know about failures during this
+			# critical section.
+
+			self.logger.debug("Setting job to finalizing")
+			if not set_row(state='FINALIZING'):
+				# Abort the cut and crash the program, forcing a state resync
+				raise JobConsistencyError(
+					"No job with id {} and uploader {} when setting FINALIZING"
+					.format(job.id, self.name)
+				)
+			finalize_begun[0] = True
+
+			# Now we return from this generator, and any errors between now and returning
+			# from requests.post() are not recoverable.
+
+		try:
+			video_id = self.youtube.upload_video(
+				title=job.video_title,
+				description=job.video_description,
+				tags=[], # TODO
+				data=upload_wrapper(),
+				hidden=True, # TODO remove when not testing
+			)
+		except JobConsistencyError:
+			raise # this ensures it's not caught in the next except block
+		except ErrorHandled:
+			# we're aborting the cut, error handling has already happened
+			return
+		except Exception as ex:
+			# for HTTPErrors, getting http response body is also useful
+			if isinstance(ex, requests.HTTPError):
+				ex = "{}: {}".format(ex, ex.response.content)
+
+			# if error during finalizing, set it in the database and leave it
+			# stuck in FINALIZING state for operator intervention.
+			if finalize_begun[0]:
+				self.logger.critical((
+					"Error occurred while finalizing upload of job {}. "
+					"You will need to check the state of the video manually."
+				).format(format_job(job)))
+				error = (
+					"An error occurred during FINALIZING, please determine if video was actually "
+					"uploaded or not and either move to TRANSCODING and populate video_id or rollback "
+					"to EDITED and clear uploader. "
+					"Error: {}"
+				).format(ex)
+				if not set_row(error=error):
+					# Not only do we not know if it was uploaded, we also failed to set that in the database!
+					raise JobConsistencyError(
+						"No job with id {} and uploader {} when setting error while finalizing!"
+						.format(job.id, self.name)
+					)
+				return
+
+			# error before finalizing, assume it's a network issue / retryable.
+			# set back to EDITED but still set error
+			self.logger.exception("Retryable error when uploading job {}".format(format_job(job)))
+			if not set_row(state='EDITED', error="Retryable error while uploading: {}".format(ex), uploader=None):
+				raise JobConsistencyError(
+					"No job with id {} and uploader {} when setting error while rolling back for retryable error"
+					.format(job.id, self.name)
+				)
+			# pause briefly so we don't immediately grab the same one again in a rapid retry loop
+			gevent.sleep(self.RETRYABLE_UPLOAD_ERROR_WAIT_INTERVAL)
+			return
+
+		# Success! Set TRANSCODING and clear any previous error.
+		link = "https://youtu.be/{}".format(video_id)
+		if not set_row(state='TRANSCODING', video_id=video_id, video_link=link, error=None):
+			# This will result in it being stuck in FINALIZING, and an operator will need to go
+			# confirm it was really uploaded.
+			raise JobConsistencyError(
+				"No job with id {} and uploader {} when setting to TRANSCODING"
+				.format(job.id, self.name)
+			)
+
+		self.logger.info("Successfully cut and uploaded job {} as {}".format(format_job(job), link))

 	def rollback_all_owned(self):
 		"""Roll back any in-progress jobs that claim to be owned by us,
 		to recover from an unclean shutdown."""
-		raise NotImplementedError
+		result = query(self.conn, """
+			UPDATE events
+			SET state = 'EDITED', uploader = NULL
+			WHERE state = 'CLAIMED' AND uploader = %(name)s
+		""", name=self.name)
+		if result.rowcount > 0:
+			self.logger.warning("Rolled back {} CLAIMED rows for {} - unclean shutdown?".format(
+				result.rowcount, self.name,
+			))
+
+		# Also mark any rows in FINALIZED owned by us as errored, these require manual intervention
+		result = query(self.conn, """
+			UPDATE events
+			SET error = %(error)s
+			WHERE state = 'FINALIZING' AND uploader = %(name)s AND error IS NULL
+		""", name=self.name, error=(
+			"Uploader died during FINALIZING, please determine if video was actually "
+			"uploaded or not and either move to TRANSCODING and populate video_id or rollback "
+			"to EDITED and clear uploader."
+		))
+		if result.rowcount > 0:
+			self.logger.error("Found {} FINALIZING rows for {}, marked as errored".format(
+				result.rowcount, self.name,
+			))
+


 class TranscodeChecker(object):