Merge pull request #133 from ekimekim/mike/more-metrics

Add lots of metrics
Mike Lang 5 years ago committed by GitHub
commit 981b89551e

@ -15,11 +15,14 @@ import argh
import gevent.backdoor
import gevent.pool
import prometheus_client as prom
import requests
import common
from common import dateutil
from common import database
from common.requests import InstrumentedSession
# Wraps all requests in some metric collection
requests = InstrumentedSession()
segments_backfilled = prom.Counter(
'segments_backfilled',
@ -103,7 +106,7 @@ def list_remote_hours(node, channel, quality, timeout=TIMEOUT):
"""Wrapper around a call to restreamer.list_hours."""
uri = '{}/files/{}/{}'.format(node, channel, quality)
logging.debug('Getting list of hours from {}'.format(uri))
resp = requests.get(uri, timeout=timeout)
resp = requests.get(uri, timeout=timeout, metric_name='list_remote_hours')
return common.encode_strings(resp.json())
@ -111,7 +114,7 @@ def list_remote_segments(node, channel, quality, hour, timeout=TIMEOUT):
"""Wrapper around a call to restreamer.list_segments."""
uri = '{}/files/{}/{}/{}'.format(node, channel, quality, hour)
logging.debug('Getting list of segments from {}'.format(uri))
resp = requests.get(uri, timeout=timeout)
resp = requests.get(uri, timeout=timeout, metric_name='list_remote_segments')
return common.encode_strings(resp.json())
@ -140,7 +143,7 @@ def get_remote_segment(base_dir, node, channel, quality, hour, missing_segment,
try:
logging.debug('Fetching segment {} from {}'.format(path, node))
uri = '{}/segments/{}/{}/{}/{}'.format(node, channel, quality, hour, missing_segment)
resp = requests.get(uri, stream=True, timeout=timeout)
resp = requests.get(uri, stream=True, timeout=timeout, metric_name='get_remote_segment')
resp.raise_for_status()

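Not part of the diff: a minimal sketch of the drop-in pattern used above, where the module-level name `requests` is rebound to an InstrumentedSession (defined in the new common/requests.py further down) so existing call sites stay unchanged apart from the optional `metric_name` kwarg. The `fetch_hours` helper and its URL layout are illustrative only.

from common.requests import InstrumentedSession

# Rebinds the module-level name so every existing `requests.get(...)` call
# in this module now goes through the instrumented session.
requests = InstrumentedSession()

def fetch_hours(node, channel, quality, timeout=5):
    # Hypothetical call site: `metric_name` is consumed by InstrumentedSession
    # and recorded as the 'name' label; it is never passed to the HTTP layer.
    uri = '{}/files/{}/{}'.format(node, channel, quality)
    resp = requests.get(uri, timeout=timeout, metric_name='fetch_hours')
    resp.raise_for_status()
    return resp.json()
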
@ -2,7 +2,11 @@
import time
import gevent
import requests
from .requests import InstrumentedSession
# Wraps all requests in some metric collection
requests = InstrumentedSession()
class GoogleAPIClient(object):
@ -40,7 +44,7 @@ class GoogleAPIClient(object):
'client_secret': self.client_secret,
'refresh_token': self.refresh_token,
'grant_type': 'refresh_token',
})
}, metric_name='get_access_token')
resp.raise_for_status()
data = resp.json()
self._access_token = data['access_token']

@ -0,0 +1,51 @@
"""Code for instrumenting requests calls. Requires requests, obviously."""
import urlparse
import requests
import prometheus_client as prom
from monotonic import monotonic
request_latency = prom.Histogram(
'http_client_request_latency',
'Time taken to make an outgoing HTTP request. '
'Status = "error" is used if an error occurs. Measured as time from first byte sent to '
'headers finished being parsed, i.e. it does not include reading a streaming response.',
['name', 'method', 'domain', 'status'],
)
response_size = prom.Histogram(
'http_client_response_size',
"The content length of (non-streaming) responses to outgoing HTTP requests.",
['name', 'method', 'domain', 'status'],
)
request_concurrency = prom.Gauge(
'http_client_request_concurrency',
"The number of outgoing HTTP requests currently ongoing",
['name', 'method', 'domain'],
)
class InstrumentedSession(requests.Session):
"""A requests Session that automatically records metrics on requests made.
Users may optionally pass a 'metric_name' kwarg that will be included as the 'name' label.
"""
def request(self, method, url, *args, **kwargs):
_, domain, _, _, _ = urlparse.urlsplit(url)
name = kwargs.pop('metric_name', '')
start = monotonic() # we only use our own measured latency if an error occurs
try:
with request_concurrency.labels(name, method, domain).track_inprogress():
response = super(InstrumentedSession, self).request(method, url, *args, **kwargs)
except Exception:
latency = monotonic() - start
request_latency.labels(name, method, domain, "error").observe(latency)
raise
request_latency.labels(name, method, domain, response.status_code).observe(response.elapsed.total_seconds())
if 'content-length' in response.headers:
response_size.labels(name, method, domain, response.status_code).observe(int(response.headers['content-length']))
return response
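
A short usage sketch (not part of the diff) of the new session, showing which labels end up on the metrics. The URL, metric name, and port below are made up, and start_http_server is just one way to expose the collected values.

import prometheus_client as prom
from common.requests import InstrumentedSession

prom.start_http_server(8001)  # serve /metrics for scraping; port is arbitrary
session = InstrumentedSession()

# 'metric_name' becomes the 'name' label; method, domain, and status are
# derived from the request and response automatically.
resp = session.get('https://example.com/files/channel/source', metric_name='example_fetch')
resp.raise_for_status()

# http_client_request_latency{name="example_fetch", method="GET",
#   domain="example.com", status="200"} now holds one observation, and
# http_client_response_size records the content-length if the server sent one.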

@ -328,6 +328,7 @@ def read_chunks(fileobj, chunk_size=16*1024):
yield chunk
@timed('cut', type='fast', normalize=lambda _, segments, start, end: (end - start).total_seconds())
def fast_cut_segments(segments, start, end):
"""Yields chunks of a MPEGTS video file covering the exact timestamp range.
segments should be a list of segments as returned by get_best_segments().
@ -400,6 +401,7 @@ def fast_cut_segments(segments, start, end):
yield chunk
@timed('cut', type='full', normalize=lambda _, segments, start, end, encode_args: (end - start).total_seconds())
def full_cut_segments(segments, start, end, encode_args):
# how far into the first segment to begin
cut_start = max(0, (start - segments[0].start).total_seconds())
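
The `timed` decorator comes from common.stats, which is not shown in this diff. Purely as an illustration of the idea (timing a call and normalizing it by units of work, here seconds of video), a comparable decorator might look like the sketch below. The histogram name, the assumption that `normalize` receives the return value followed by the call arguments, and the omission of the extra label kwargs (`type=...`, etc.) are all simplifications, not the real implementation.

import functools
import prometheus_client as prom
from monotonic import monotonic

example_latency = prom.Histogram('timed_example_latency', 'Illustration only', ['name'])

def timed_sketch(name='', normalize=None):
    def decorator(fn):
        @functools.wraps(fn)
        def wrapper(*args, **kwargs):
            start = monotonic()
            ret = fn(*args, **kwargs)
            elapsed = monotonic() - start
            if normalize is not None:
                # e.g. normalize returns seconds of video cut, so we record
                # processing time per second of output rather than raw latency.
                elapsed /= normalize(ret, *args, **kwargs)
            example_latency.labels(name or fn.__name__).observe(elapsed)
            return ret
        return wrapper
    return decorator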

@ -17,22 +17,40 @@ from psycopg2 import sql
import common
from common.database import DBManager, query
from common.segments import get_best_segments, fast_cut_segments, full_cut_segments, ContainsHoles
from common.stats import timed
from .upload_backends import Youtube, Local, UploadError
videos_uploaded = prom.Counter(
'videos_uploaded',
'Number of videos successfully uploaded',
['video_channel', 'video_quality', 'upload_location']
)
upload_errors = prom.Counter(
'upload_errors',
'Number of errors uploading a video',
['video_channel', 'video_quality', 'upload_location', 'final_state']
)
no_candidates = prom.Counter(
'no_candidates',
"Number of times we looked for candidate jobs but didn't find any",
)
videos_transcoding = prom.Gauge(
'videos_transcoding',
"Number of videos currently in transcoding",
['location'],
)
videos_marked_done = prom.Counter(
'videos_marked_done',
"Number of videos we have successfully marked as done",
['location'],
)
# A list of all the DB column names in CutJob
CUT_JOB_PARAMS = [
"sheet_name",
@ -156,7 +174,6 @@ class Cutter(object):
try:
segments = self.check_candidate(candidate)
except ContainsHoles:
# TODO metric
self.logger.info("Ignoring candidate {} due to holes".format(format_job(candidate)))
set_error(
"Node {} does not have all the video needed to cut this row. "
@ -184,8 +201,10 @@ class Cutter(object):
return CutJob(segments=segments, **candidate._asdict())
# No candidates
no_candidates.inc()
self.wait(self.NO_CANDIDATES_RETRY_INTERVAL)
@timed()
def list_candidates(self):
"""Return a list of all available candidates that we might be able to cut."""
# We only accept candidates if they haven't excluded us by whitelist,
@ -202,6 +221,7 @@ class Cutter(object):
result = query(self.conn, built_query, name=self.name, upload_locations=self.upload_locations.keys())
return result.fetchall()
# No need to instrument this function, just use get_best_segments() stats
def check_candidate(self, candidate):
return get_best_segments(
os.path.join(self.segments_path, candidate.video_channel, candidate.video_quality),
@ -210,6 +230,10 @@ class Cutter(object):
allow_holes=candidate.allow_holes,
)
@timed(
video_channel = lambda self, job: job.video_channel,
video_quality = lambda self, job: job.video_quality,
)
def claim_job(self, job):
"""Update event in DB to say we're working on it.
If someone beat us to it, or it's changed, raise CandidateGone."""
@ -445,13 +469,14 @@ class TranscodeChecker(object):
FOUND_VIDEOS_RETRY_INTERVAL = 20
ERROR_RETRY_INTERVAL = 20
def __init__(self, backend, dbmanager, stop):
def __init__(self, location, backend, dbmanager, stop):
"""
backend is an upload backend that supports transcoding
and defines check_status().
Conn is a database connection.
Stop is an Event triggering graceful shutdown when set.
"""
self.location = location
self.backend = backend
self.dbmanager = dbmanager
self.stop = stop
@ -469,11 +494,13 @@ class TranscodeChecker(object):
if not ids:
self.wait(self.NO_VIDEOS_RETRY_INTERVAL)
continue
videos_transcoding.labels(self.location).set(len(ids))
self.logger.info("Found {} videos in TRANSCODING".format(len(ids)))
ids = self.check_ids(ids)
if ids:
self.logger.info("{} videos are done".format(len(ids)))
done = self.mark_done(ids)
videos_marked_done.labels(self.location).inc(done)
self.logger.info("Marked {} videos as done".format(done))
self.wait(self.FOUND_VIDEOS_RETRY_INTERVAL)
except Exception:
@ -585,7 +612,7 @@ def main(
config = json.loads(config)
upload_locations = {}
needs_transcode_check = []
needs_transcode_check = {}
for location, backend_config in config.items():
backend_type = backend_config.pop('type')
no_transcode_check = backend_config.pop('no_transcode_check', False)
@ -604,12 +631,12 @@ def main(
raise ValueError("Unknown cut type: {!r}".format(cut_type))
upload_locations[location] = backend
if backend.needs_transcode and not no_transcode_check:
needs_transcode_check.append(backend)
needs_transcode_check[location] = backend
cutter = Cutter(upload_locations, dbmanager, stop, name, base_dir, tags)
transcode_checkers = [
TranscodeChecker(backend, dbmanager, stop)
for backend in needs_transcode_check
TranscodeChecker(location, backend, dbmanager, stop)
for location, backend in needs_transcode_check.items()
]
jobs = [gevent.spawn(cutter.run)] + [
gevent.spawn(transcode_checker.run)

@ -143,13 +143,14 @@ class Youtube(UploadBackend):
'uploadType': 'resumable',
},
json=json,
metric_name='create_video',
)
if not resp.ok:
# Don't retry, because failed calls still count against our upload quota.
# The risk of repeated failed attempts blowing through our quota is too high.
raise UploadError("Youtube create video call failed with {resp.status_code}: {resp.content}".format(resp=resp))
upload_url = resp.headers['Location']
resp = self.client.request('POST', upload_url, data=data)
resp = self.client.request('POST', upload_url, data=data, metric_name='upload_video')
if 400 <= resp.status_code < 500:
# As above, don't retry. But with 4xx's we know the upload didn't go through.
# On a 5xx, we can't be sure (the server is in an unspecified state).
@ -169,6 +170,7 @@ class Youtube(UploadBackend):
'part': 'id,status',
'id': ','.join(group),
},
metric_name='list_videos',
)
resp.raise_for_status()
for item in resp.json()['items']:

@ -21,6 +21,7 @@ from monotonic import monotonic
import twitch
import common
import common.dateutil
import common.requests
segments_downloaded = prom.Counter(
@ -29,6 +30,13 @@ segments_downloaded = prom.Counter(
["partial", "channel", "quality"],
)
segment_duration_downloaded = prom.Counter(
"segment_duration_downloaded",
"Total duration of all segments partially or fully downloaded. "
"Note partial segments still count the full duration.",
["partial", "stream", "variant"],
)
latest_segment = prom.Gauge(
"latest_segment",
"Timestamp of the time of the newest segment fully downloaded",
@ -74,8 +82,6 @@ def soft_hard_timeout(logger, description, (soft_timeout, hard_timeout), on_soft
finished = True
class StreamsManager(object):
"""Keeps track of what qualities are being downloaded and the workers doing so.
Re-fetches master playlist when needed and starts new stream workers.
@ -280,7 +286,7 @@ class StreamWorker(object):
# with our connection pool.
# This worker's SegmentGetters will use its session by default for performance,
# but will fall back to a new one if something goes wrong.
self.session = requests.Session()
self.session = common.requests.InstrumentedSession()
def __repr__(self):
return "<{} at 0x{:x} for stream {!r}>".format(type(self).__name__, id(self), self.quality)
@ -456,7 +462,7 @@ class SegmentGetter(object):
break
# Create a new session, so we don't reuse a connection from the old session
# which had an error / some other issue. This is mostly just out of paranoia.
self.session = requests.Session()
self.session = common.requests.InstrumentedSession()
# if retry not set, wait for FETCH_RETRY first
self.retry.wait(common.jitter(self.FETCH_RETRY))
self.logger.debug("Getter is done")
@ -517,7 +523,7 @@ class SegmentGetter(object):
self.logger.debug("Downloading segment {} to {}".format(self.segment, temp_path))
with soft_hard_timeout(self.logger, "getting and writing segment", self.FETCH_FULL_TIMEOUTS, retry.set):
with soft_hard_timeout(self.logger, "getting segment headers", self.FETCH_HEADERS_TIMEOUTS, retry.set):
resp = self.session.get(self.segment.uri, stream=True)
resp = self.session.get(self.segment.uri, stream=True, metric_name='get_segment')
# twitch returns 403 for expired segment urls, and 404 for very old urls where the original segment is gone.
# the latter can happen if we have a network issue that cuts us off from twitch for some time.
if resp.status_code in (403, 404):
@ -542,12 +548,14 @@ class SegmentGetter(object):
self.logger.warning("Saving partial segment {} as {}".format(temp_path, partial_path))
common.rename(temp_path, partial_path)
segments_downloaded.labels(partial="True", channel=self.channel, quality=self.quality).inc()
segment_duration_downloaded.labels(partial="True", channel=self.channel, quality=self.quality).inc(self.segment.duration)
raise ex_type, ex, tb
else:
full_path = self.make_path("full", hash)
self.logger.debug("Saving completed segment {} as {}".format(temp_path, full_path))
common.rename(temp_path, full_path)
segments_downloaded.labels(partial="False", channel=self.channel, quality=self.quality).inc()
segment_duration_downloaded.labels(partial="False", channel=self.channel, quality=self.quality).inc(self.segment.duration)
# Prom doesn't provide a way to compare value to gauge's existing value,
# we need to reach into internals
stat = latest_segment.labels(channel=self.channel, quality=self.quality)
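
The 'reach into internals' comment refers to the fact that prometheus_client offers no public compare-and-set on a Gauge. A self-contained sketch of the idea follows; it is not the actual continuation of this function, and `_value` is a private attribute rather than a stable API.

import prometheus_client as prom

latest_example = prom.Gauge('latest_segment_example', 'Illustration only', ['channel', 'quality'])

def set_if_newer(gauge_child, timestamp):
    # Read the gauge's current value via the private _value holder and only
    # move it forward, so out-of-order downloads can't wind the gauge back.
    if timestamp > gauge_child._value.get():
        gauge_child.set(timestamp)

set_if_newer(latest_example.labels(channel='example_channel', quality='source'), 1234567890.0)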

@ -2,16 +2,18 @@
import logging
import random
import requests
import hls_playlist
from common.requests import InstrumentedSession
logger = logging.getLogger(__name__)
def get_master_playlist(channel, session=requests):
def get_master_playlist(channel, session=None):
"""Get the master playlist for given channel from twitch"""
if session is None:
session = InstrumentedSession()
resp = session.get(
"https://api.twitch.tv/api/channels/{}/access_token.json".format(channel),
params={'as3': 't'},
@ -19,6 +21,7 @@ def get_master_playlist(channel, session=requests):
'Accept': 'application/vnd.twitchtv.v3+json',
'Client-ID': 'pwkzresl8kj2rdj6g7bvxl9ys1wly3j',
},
metric_name='get_access_token',
)
resp.raise_for_status() # getting access token
token = resp.json()
@ -42,6 +45,7 @@ def get_master_playlist(channel, session=requests):
# in flux. Better to just blend in with the crowd for now.
# "platform": "_"
},
metric_name='get_master_playlist',
)
resp.raise_for_status() # getting master playlist
playlist = hls_playlist.load(resp.text, base_uri=resp.url)
@ -96,7 +100,9 @@ def get_media_playlist_uris(master_playlist, target_qualities):
return {name: variant.uri for name, variant in variants.items()}
def get_media_playlist(uri, session=requests):
resp = session.get(uri)
def get_media_playlist(uri, session=None):
if session is None:
session = InstrumentedSession()
resp = session.get(uri, metric_name='get_media_playlist')
resp.raise_for_status()
return hls_playlist.load(resp.text, base_uri=resp.url)

@ -28,6 +28,18 @@ sync_errors = prom.Counter(
'Number of errors syncing sheets',
)
rows_found = prom.Counter(
'rows_found',
'Number of rows that sheetsync looked at with an id',
['worksheet'],
)
rows_changed = prom.Counter(
'rows_changed',
'Number of rows that needed changes applied, with type=insert, type=input or type=output',
['type', 'worksheet'],
)
class SheetSync(object):
# Time between syncs
@ -202,8 +214,12 @@ class SheetSync(object):
sql.SQL(", ").join(sql.Placeholder(col) for col in insert_cols),
)
query(self.conn, built_query, sheet_name=worksheet, **row)
rows_found.labels(worksheet).inc()
rows_changed.labels('insert', worksheet).inc()
return
rows_found.labels(worksheet).inc()
# Update database with any changed inputs
changed = [col for col in self.input_columns if row[col] != getattr(event, col)]
if changed:
@ -220,6 +236,7 @@ class SheetSync(object):
) for col in changed
))
query(self.conn, built_query, **row)
rows_changed.labels('input', worksheet).inc()
# Update sheet with any changed outputs
format_output = lambda v: '' if v is None else v # cast nulls to empty string
@ -234,6 +251,7 @@ class SheetSync(object):
row_index, self.column_map[col],
format_output(getattr(event, col)),
)
rows_changed.labels('output', worksheet).inc()
# Set edit link if marked for editing and start/end set.
# This prevents accidents / clicking the wrong row and provides

@ -23,6 +23,7 @@ class Sheets(object):
'https://sheets.googleapis.com/v4/spreadsheets/{}/values/{}'.format(
spreadsheet_id, range,
),
metric_name='get_rows',
)
resp.raise_for_status()
data = resp.json()
@ -46,6 +47,7 @@ class Sheets(object):
"range": range,
"values": [[value]],
},
metric_name='write_value',
)
resp.raise_for_status()
