From c9d02b33187ecab3ffb357802d40a3ce7bd91d78 Mon Sep 17 00:00:00 2001
From: Mike Lang
Date: Sat, 5 Jan 2019 17:56:27 -0800
Subject: [PATCH] restreamer: Prevent prom client blowing up after two different endpoints are hit

Prom client doesn't like you creating two stats with the same name,
even though they have different labels and doing so makes perfect sense.

I feel like I just need to re-write the prom client at some point - it doesn't
actually do all that much except get in your way, apart from the actual text
encoding which I can steal.

Anyway, in the meantime, we get around this by breaking up metrics into two names,
a "foo_all" and a "foo_ENDPOINT". The foo_all lacks the detailed labels, but is
still labelled by endpoint and can be used more easily. The foo_ENDPOINT metrics
have more detailed labels but require messier PromQL, as you need to match on a
name regex if you want to look at more than one specific endpoint.
---
 restreamer/restreamer/main.py  |  1 +
 restreamer/restreamer/stats.py | 69 ++++++++++++++++++++++++----------
 2 files changed, 51 insertions(+), 19 deletions(-)

diff --git a/restreamer/restreamer/main.py b/restreamer/restreamer/main.py
index 9b354ef..429cd9a 100644
--- a/restreamer/restreamer/main.py
+++ b/restreamer/restreamer/main.py
@@ -91,6 +91,7 @@ def metrics():
 
 
 @app.route('/files/<stream>/<variant>')
+@stats
 @has_path_args
 def list_hours(stream, variant):
 	"""Returns a JSON list of hours for the given stream and variant for which
diff --git a/restreamer/restreamer/stats.py b/restreamer/restreamer/stats.py
index 513d71c..e43cf93 100644
--- a/restreamer/restreamer/stats.py
+++ b/restreamer/restreamer/stats.py
@@ -7,6 +7,35 @@
 from flask import g as request_store
 from monotonic import monotonic
 
+# Generic metrics that all requests get logged to (see below for specific metrics per endpoint)
+
+LATENCY_HELP = "Time taken to run the request handler and create a response"
+# buckets: very long playlists / cutting can be quite slow,
+# so we have a wider range of latencies than default, up to 10min.
+LATENCY_BUCKETS = [.001, .005, .01, .05, .1, .5, 1, 5, 10, 30, 60, 120, 300, 600]
+generic_latency = prom.Histogram(
+	'http_request_latency_all', LATENCY_HELP,
+	['endpoint', 'method', 'status'],
+	buckets=LATENCY_BUCKETS,
+)
+
+SIZE_HELP = 'Size in bytes of response body for non-chunked responses'
+# buckets: powers of 4 up to 1GiB (1, 4, 16, 64, 256, 1Ki, 4Ki, ...)
+SIZE_BUCKETS = [4**i for i in range(16)]
+generic_size = prom.Histogram(
+	'http_response_size_all', SIZE_HELP,
+	['endpoint', 'method', 'status'],
+	buckets=SIZE_BUCKETS,
+)
+
+CONCURRENT_HELP = 'Number of requests currently ongoing'
+generic_concurrent = prom.Gauge(
+	'http_request_concurrency_all', CONCURRENT_HELP,
+	['endpoint', 'method'],
+)
+
+
 def stats(fn):
 	"""Decorator that wraps a handler func to collect metrics.
 	Adds handler func args as labels, along with 'endpoint' label using func's name,
@@ -15,6 +44,9 @@ def stats(fn):
 	# we pre-define our label names, but we don't know the names of the handler kwargs
 	# until the first time the function's called. So we delay defining the metrics until
 	# first call.
+	# In addition, it doesn't let us have different sets of labels with the same name.
+	# So we record everything twice: Once under a generic name with only endpoint, method
+	# and status, and once under a name specific to the endpoint with the full set of labels.
 	metrics = {}
 	endpoint = fn.__name__
 
@@ -25,30 +57,24 @@
 			labels_no_status = sorted(kwargs.keys()) + ['endpoint', 'method']
 			labels = labels_no_status + ['status']
 			metrics['latency'] = prom.Histogram(
-				'http_request_latency',
-				'Time taken to run the request handler and create a response',
-				labels,
-				# buckets: very long playlists / cutting can be quite slow,
-				# so we have a wider range of latencies than default, up to 10min.
-				buckets=[.001, .005, .01, .05, .1, .5, 1, 5, 10, 30, 60, 120, 300, 600],
+				'http_request_latency_{}'.format(endpoint), LATENCY_HELP,
+				labels, buckets=LATENCY_BUCKETS,
 			)
 			metrics['size'] = prom.Histogram(
-				'http_response_size',
-				'Size in bytes of response body for non-chunked responses',
-				labels,
-				# buckets: powers of 4 up to 1GiB (1, 4, 16, 64, 256, 1Ki, 4Ki, ...)
-				buckets=[4**i for i in range(16)],
+				'http_response_size_{}'.format(endpoint), SIZE_HELP,
+				labels, buckets=SIZE_BUCKETS,
 			)
 			metrics['concurrent'] = prom.Gauge(
-				'http_request_concurrency',
-				'Number of requests currently ongoing',
+				'http_request_concurrency_{}'.format(endpoint), CONCURRENT_HELP,
 				labels_no_status,
 			)
 
 		request_store.metrics = metrics
+		request_store.endpoint = endpoint
+		request_store.method = request.method
 		request_store.labels = {k: str(v) for k, v in kwargs.items()}
-		request_store.labels.update(endpoint=endpoint, method=request.method)
-		metrics['concurrent'].labels(**request_store.labels).inc()
+		generic_concurrent.labels(endpoint=endpoint, method=request.method).inc()
+		metrics['concurrent'].labels(endpoint=endpoint, method=request.method, **request_store.labels).inc()
 		request_store.start_time = monotonic()
 		return fn(**kwargs)
@@ -66,15 +92,20 @@ def after_request(response):
 
 	end_time = monotonic()
 	metrics = request_store.metrics
+	endpoint = request_store.endpoint
+	method = request_store.method
 	labels = request_store.labels
 	start_time = request_store.start_time
 
-	metrics['concurrent'].labels(**labels).dec()
+	generic_concurrent.labels(endpoint=endpoint, method=method).dec()
+	metrics['concurrent'].labels(endpoint=endpoint, method=method, **labels).dec()
 
-	labels['status'] = str(response.status_code)
-	metrics['latency'].labels(**labels).observe(end_time - start_time)
+	status = str(response.status_code)
+	generic_latency.labels(endpoint=endpoint, method=method, status=status).observe(end_time - start_time)
+	metrics['latency'].labels(endpoint=endpoint, method=method, status=status, **labels).observe(end_time - start_time)
 	size = response.calculate_content_length()
 	if size is not None:
-		metrics['size'].labels(**labels).observe(size)
+		generic_size.labels(endpoint=endpoint, method=method, status=status).observe(size)
+		metrics['size'].labels(endpoint=endpoint, method=method, status=status, **labels).observe(size)
 	return response
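
For context, and not part of the patch itself: a minimal sketch of the prometheus_client
behaviour being worked around here. The metric and label names below are illustrative only;
registering a second collector under an existing metric name raises, even when the label
sets differ.

	# Illustrative only, not part of this patch: prometheus_client refuses to
	# register two collectors under the same metric name, even with different
	# label sets, which is the case the per-endpoint metrics would hit.
	import prometheus_client as prom

	prom.Histogram(
		'http_request_latency', 'Time taken to run the request handler',
		['endpoint', 'method', 'status'],
	)
	try:
		# A second endpoint tries to register the "same" metric with extra
		# labels (e.g. its path args) and prometheus_client raises.
		prom.Histogram(
			'http_request_latency', 'Time taken to run the request handler',
			['endpoint', 'method', 'status', 'stream', 'variant'],
		)
	except ValueError as e:
		print(e)  # e.g. "Duplicated timeseries in CollectorRegistry: ..."

On the query side, selecting the per-endpoint metrics across endpoints then means matching
on the metric name itself, something along the lines of
{__name__=~"http_request_latency_.+_count"}, which is the messier PromQL mentioned above,
while http_request_latency_all can be selected directly by name.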